diff --git a/src/applications/search/ferret/PhabricatorFerretEngine.php b/src/applications/search/ferret/PhabricatorFerretEngine.php index 7d1d03a8b1..7cd06ae549 100644 --- a/src/applications/search/ferret/PhabricatorFerretEngine.php +++ b/src/applications/search/ferret/PhabricatorFerretEngine.php @@ -106,11 +106,25 @@ abstract class PhabricatorFerretEngine extends Phobject { $ngrams = array(); foreach ($unique_tokens as $token => $ignored) { $token_v = phutil_utf8v($token); - $len = (count($token_v) - 2); - for ($ii = 0; $ii < $len; $ii++) { - $ngram = array_slice($token_v, $ii, 3); - $ngram = implode('', $ngram); + $length = count($token_v); + + // NOTE: We're being somewhat clever here to micro-optimize performance, + // especially for very long strings. See PHI87. + + $token_l = array(); + for ($ii = 0; $ii < $length; $ii++) { + $token_l[$ii] = strlen($token_v[$ii]); + } + + $ngram_count = $length - 2; + $cursor = 0; + for ($ii = 0; $ii < $ngram_count; $ii++) { + $ngram_l = $token_l[$ii] + $token_l[$ii + 1] + $token_l[$ii + 2]; + + $ngram = substr($token, $cursor, $ngram_l); $ngrams[$ngram] = $ngram; + + $cursor += $token_l[$ii]; } } diff --git a/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php b/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php index a8690f1d8b..ea65a559ff 100644 --- a/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php +++ b/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php @@ -24,4 +24,34 @@ final class PhabricatorFerretEngineTestCase } } + public function testTermNgramExtraction() { + $snowman = "\xE2\x98\x83"; + + $map = array( + 'a' => array(' a '), + 'ab' => array(' ab', 'ab '), + 'abcdef' => array(' ab', 'abc', 'bcd', 'cde', 'def', 'ef '), + "{$snowman}" => array(" {$snowman} "), + "x{$snowman}y" => array( + " x{$snowman}", + "x{$snowman}y", + "{$snowman}y ", + ), + "{$snowman}{$snowman}" => array( + " {$snowman}{$snowman}", + "{$snowman}{$snowman} ", + ), + ); + + $engine = new ManiphestTaskFerretEngine(); + + foreach ($map as $input => $expect) { + $actual = $engine->getTermNgramsFromString($input); + $this->assertEqual( + $actual, + $expect, + pht('Term ngrams for: %s.', $input)); + } + } + }