From c034306320b55eca68dcdd94a1065b7e34db5689 Mon Sep 17 00:00:00 2001 From: epriestley Date: Tue, 26 Sep 2017 09:16:42 -0700 Subject: [PATCH] (stable) Improve performance of Ferret engine ngram extraction, particularly for large input strings Summary: See PHI87. Ref T12974. The `array_slice()` method of splitting the string apart can perform poorly for large input strings. I think this is mostly just the large number of calls plus building and returning an array being not entirely trivial. We can just use `substr()` instead, as long as we're a little bit careful about keeping track of where we're slicing the string if it has UTF8 characters. Test Plan: - Created a task with a single, unbroken blob of base64 encoded data as the description, roughly 100KB long. - Saw indexing performance improve from ~6s to ~1.5s after patch. - Before: https://secure.phabricator.com/xhprof/profile/PHID-FILE-nrxs4lwdvupbve5lhl6u/ - After: https://secure.phabricator.com/xhprof/profile/PHID-FILE-6vs2akgjj5nbqt7yo7ul/ Reviewers: amckinley Reviewed By: amckinley Maniphest Tasks: T12974 Differential Revision: https://secure.phabricator.com/D18649 --- .../search/ferret/PhabricatorFerretEngine.php | 22 +++++++++++--- .../PhabricatorFerretEngineTestCase.php | 30 +++++++++++++++++++ 2 files changed, 48 insertions(+), 4 deletions(-) diff --git a/src/applications/search/ferret/PhabricatorFerretEngine.php b/src/applications/search/ferret/PhabricatorFerretEngine.php index 7d1d03a8b1..7cd06ae549 100644 --- a/src/applications/search/ferret/PhabricatorFerretEngine.php +++ b/src/applications/search/ferret/PhabricatorFerretEngine.php @@ -106,11 +106,25 @@ abstract class PhabricatorFerretEngine extends Phobject { $ngrams = array(); foreach ($unique_tokens as $token => $ignored) { $token_v = phutil_utf8v($token); - $len = (count($token_v) - 2); - for ($ii = 0; $ii < $len; $ii++) { - $ngram = array_slice($token_v, $ii, 3); - $ngram = implode('', $ngram); + $length = count($token_v); + + // NOTE: We're being somewhat clever here to micro-optimize performance, + // especially for very long strings. See PHI87. + + $token_l = array(); + for ($ii = 0; $ii < $length; $ii++) { + $token_l[$ii] = strlen($token_v[$ii]); + } + + $ngram_count = $length - 2; + $cursor = 0; + for ($ii = 0; $ii < $ngram_count; $ii++) { + $ngram_l = $token_l[$ii] + $token_l[$ii + 1] + $token_l[$ii + 2]; + + $ngram = substr($token, $cursor, $ngram_l); $ngrams[$ngram] = $ngram; + + $cursor += $token_l[$ii]; } } diff --git a/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php b/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php index a8690f1d8b..ea65a559ff 100644 --- a/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php +++ b/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php @@ -24,4 +24,34 @@ final class PhabricatorFerretEngineTestCase } } + public function testTermNgramExtraction() { + $snowman = "\xE2\x98\x83"; + + $map = array( + 'a' => array(' a '), + 'ab' => array(' ab', 'ab '), + 'abcdef' => array(' ab', 'abc', 'bcd', 'cde', 'def', 'ef '), + "{$snowman}" => array(" {$snowman} "), + "x{$snowman}y" => array( + " x{$snowman}", + "x{$snowman}y", + "{$snowman}y ", + ), + "{$snowman}{$snowman}" => array( + " {$snowman}{$snowman}", + "{$snowman}{$snowman} ", + ), + ); + + $engine = new ManiphestTaskFerretEngine(); + + foreach ($map as $input => $expect) { + $actual = $engine->getTermNgramsFromString($input); + $this->assertEqual( + $actual, + $expect, + pht('Term ngrams for: %s.', $input)); + } + } + }