Improve performance of Ferret engine ngram extraction, particularly for large input strings

Summary: See PHI87. Ref T12974. The `array_slice()` method of splitting the string apart can perform poorly for large input strings. I think this is mostly just the large number of calls plus building and returning an array being not entirely trivial. We can just use `substr()` instead, as long as we're a little bit careful about keeping track of where we're slicing the string if it has UTF8 characters. Test Plan: - Created a task with a single, unbroken blob of base64 encoded data as the description, roughly 100KB long. - Saw indexing performance improve from ~6s to ~1.5s after patch. - Before: https://secure.phabricator.com/xhprof/profile/PHID-FILE-nrxs4lwdvupbve5lhl6u/ - After: https://secure.phabricator.com/xhprof/profile/PHID-FILE-6vs2akgjj5nbqt7yo7ul/ Reviewers: amckinley Reviewed By: amckinley Maniphest Tasks: T12974 Differential Revision: https://secure.phabricator.com/D18649
2025-03-29 04:28:12 +01:00 · 2017-09-26 09:16:42 -07:00 · 2017-09-26 09:16:42 -07:00 · 086a125ad5
commit 086a125ad5
parent a1d9a2389d
2 changed files with 48 additions and 4 deletions
--- a/src/applications/search/ferret/PhabricatorFerretEngine.php
+++ b/src/applications/search/ferret/PhabricatorFerretEngine.php
@ -106,11 +106,25 @@ abstract class PhabricatorFerretEngine extends Phobject {
    $ngrams = array();
    foreach ($unique_tokens as $token => $ignored) {
      $token_v = phutil_utf8v($token);
-      $len = (count($token_v) - 2);
-      for ($ii = 0; $ii < $len; $ii++) {
-        $ngram = array_slice($token_v, $ii, 3);
-        $ngram = implode('', $ngram);
+      $length = count($token_v);
+
+      // NOTE: We're being somewhat clever here to micro-optimize performance,
+      // especially for very long strings. See PHI87.
+
+      $token_l = array();
+      for ($ii = 0; $ii < $length; $ii++) {
+        $token_l[$ii] = strlen($token_v[$ii]);
+      }
+
+      $ngram_count = $length - 2;
+      $cursor = 0;
+      for ($ii = 0; $ii < $ngram_count; $ii++) {
+        $ngram_l = $token_l[$ii] + $token_l[$ii + 1] + $token_l[$ii + 2];
+
+        $ngram = substr($token, $cursor, $ngram_l);
        $ngrams[$ngram] = $ngram;
+
+        $cursor += $token_l[$ii];
      }
    }

--- a/src/applications/search/ferret/tests/PhabricatorFerretEngineTestCase.php
+++ b/src/applications/search/ferret/tests/PhabricatorFerretEngineTestCase.php
@ -24,4 +24,34 @@ final class PhabricatorFerretEngineTestCase
    }
  }

+  public function testTermNgramExtraction() {
+    $snowman = "\xE2\x98\x83";
+
+    $map = array(
+      'a' => array(' a '),
+      'ab' => array(' ab', 'ab '),
+      'abcdef' => array(' ab', 'abc', 'bcd', 'cde', 'def', 'ef '),
+      "{$snowman}" => array(" {$snowman} "),
+      "x{$snowman}y" => array(
+        " x{$snowman}",
+        "x{$snowman}y",
+        "{$snowman}y ",
+      ),
+      "{$snowman}{$snowman}" => array(
+        " {$snowman}{$snowman}",
+        "{$snowman}{$snowman} ",
+      ),
+    );
+
+    $engine = new ManiphestTaskFerretEngine();
+
+    foreach ($map as $input => $expect) {
+      $actual = $engine->getTermNgramsFromString($input);
+      $this->assertEqual(
+        $actual,
+        $expect,
+        pht('Term ngrams for: %s.', $input));
+    }
+  }
+
 }