1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2024-11-10 08:52:39 +01:00

Make Ferret indexing more robust (UTF8, exception handling)

Summary:
Ref T12819. Two minor improvements from live data:

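  - Tokenize in a UTF-8-aware way: build ngrams from whole characters rather than raw bytes, so multibyte characters are never split across ngrams.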
  - When one document fails to index, kill the open transaction explicitly and rethrow the exception (rather than leaving the transaction hanging) so we don't cause other failures later.

Test Plan: Created some UTF8 documents locally, indexed them, got clean results.

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T12819

Differential Revision: https://secure.phabricator.com/D18487
This commit is contained in:
epriestley 2017-08-28 15:37:56 -07:00
parent 0609133f45
commit 4005a465f7
2 changed files with 11 additions and 2 deletions

View file

@ -55,6 +55,8 @@ final class PhabricatorFerretFulltextEngineExtension
->getNgramsFromString($ngrams_source, 'index'); ->getNgramsFromString($ngrams_source, 'index');
$ferret_document->openTransaction(); $ferret_document->openTransaction();
try {
$this->deleteOldDocument($engine, $object, $document); $this->deleteOldDocument($engine, $object, $document);
$ferret_document->save(); $ferret_document->save();
@ -85,6 +87,11 @@ final class PhabricatorFerretFulltextEngineExtension
$ferret_ngrams->getTableName(), $ferret_ngrams->getTableName(),
$chunk); $chunk);
} }
} catch (Exception $ex) {
$ferret_document->killTransaction();
throw $ex;
}
$ferret_document->saveTransaction(); $ferret_document->saveTransaction();
} }

View file

@ -26,9 +26,11 @@ final class PhabricatorNgramEngine extends Phobject {
break; break;
} }
$len = (strlen($token) - 2); $token_v = phutil_utf8v($token);
$len = (count($token_v) - 2);
for ($ii = 0; $ii < $len; $ii++) { for ($ii = 0; $ii < $len; $ii++) {
$ngram = substr($token, $ii, 3); $ngram = array_slice($token_v, $ii, 3);
$ngram = implode('', $ngram);
$ngrams[$ngram] = $ngram; $ngrams[$ngram] = $ngram;
} }
} }