mirror of
https://we.phorge.it/source/phorge.git
synced 2024-12-26 15:30:58 +01:00
Make Ferret indexing more robust (UTF8, exception handling)
Summary: Ref T12819. Two minor improvements from live data:

- Tokenize in a UTF8-aware way.
- When one document fails to index, kill the transaction explicitly (rather than leaving it hanging) so we don't cause other failures later.

Test Plan: Created some UTF8 documents locally, indexed them, got clean results.

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T12819

Differential Revision: https://secure.phabricator.com/D18487
This commit is contained in:
parent
0609133f45
commit
4005a465f7
2 changed files with 11 additions and 2 deletions
|
@ -55,6 +55,8 @@ final class PhabricatorFerretFulltextEngineExtension
|
|||
->getNgramsFromString($ngrams_source, 'index');
|
||||
|
||||
$ferret_document->openTransaction();
|
||||
|
||||
try {
|
||||
$this->deleteOldDocument($engine, $object, $document);
|
||||
|
||||
$ferret_document->save();
|
||||
|
@ -85,6 +87,11 @@ final class PhabricatorFerretFulltextEngineExtension
|
|||
$ferret_ngrams->getTableName(),
|
||||
$chunk);
|
||||
}
|
||||
} catch (Exception $ex) {
|
||||
$ferret_document->killTransaction();
|
||||
throw $ex;
|
||||
}
|
||||
|
||||
$ferret_document->saveTransaction();
|
||||
}
|
||||
|
||||
|
|
|
@ -26,9 +26,11 @@ final class PhabricatorNgramEngine extends Phobject {
|
|||
break;
|
||||
}
|
||||
|
||||
$len = (strlen($token) - 2);
|
||||
$token_v = phutil_utf8v($token);
|
||||
$len = (count($token_v) - 2);
|
||||
for ($ii = 0; $ii < $len; $ii++) {
|
||||
$ngram = substr($token, $ii, 3);
|
||||
$ngram = array_slice($token_v, $ii, 3);
|
||||
$ngram = implode('', $ngram);
|
||||
$ngrams[$ngram] = $ngram;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue