Improve performance of Ferret engine ngram extraction, particularly for large input strings

Summary: See PHI87. Ref T12974. The `array_slice()` method of splitting the string apart can perform poorly for large input strings. I think the cost is mostly the large number of calls, plus the nontrivial overhead of building and returning an array on each call. We can just use `substr()` instead, as long as we're a little bit careful about keeping track of where we're slicing the string if it has multibyte UTF-8 characters.

Test Plan:
- Created a task with a single, unbroken blob of base64 encoded data as the description, roughly 100KB long.
- Saw indexing performance improve from ~6s to ~1.5s after patch.
- Before: https://secure.phabricator.com/xhprof/profile/PHID-FILE-nrxs4lwdvupbve5lhl6u/
- After: https://secure.phabricator.com/xhprof/profile/PHID-FILE-6vs2akgjj5nbqt7yo7ul/

Reviewers: amckinley
Reviewed By: amckinley
Maniphest Tasks: T12974
Differential Revision: https://secure.phabricator.com/D18649
2024-11-26 08:42:41 +01:00 · 2017-09-26 09:16:42 -07:00 · 2017-09-26 09:16:42 -07:00 · 086a125ad5
commit 086a125ad5
parent a1d9a2389d
2 changed files with 48 additions and 4 deletions
--- a/src/applications/search/ferret/PhabricatorFerretEngine.php
+++ b/src/applications/search/ferret/PhabricatorFerretEngine.php
@ -106,11 +106,25 @@ abstract class PhabricatorFerretEngine extends Phobject {
    $ngrams = array();
    foreach ($unique_tokens as $token => $ignored) {
      $token_v = phutil_utf8v($token);
-      $len = (count($token_v) - 2);
+      $length = count($token_v);
-      for ($ii = 0; $ii < $len; $ii++) {
+
-        $ngram = array_slice($token_v, $ii, 3);
+      // NOTE: We're being somewhat clever here to micro-optimize performance,
-        $ngram = implode('', $ngram);
+      // especially for very long strings. See PHI87.
      $token_l = array();
      for ($ii = 0; $ii < $length; $ii++) {
        $token_l[$ii] = strlen($token_v[$ii]);
      }
      $ngram_count = $length - 2;
      $cursor = 0;
      for ($ii = 0; $ii < $ngram_count; $ii++) {
        $ngram_l = $token_l[$ii] + $token_l[$ii + 1] + $token_l[$ii + 2];
        $ngram = substr($token, $cursor, $ngram_l);
        $ngrams[$ngram] = $ngram;
        $cursor += $token_l[$ii];
      }
    }
--- a/src/applications/search/ferret/tests/PhabricatorFerretEngineTestCase.php
+++ b/src/applications/search/ferret/tests/PhabricatorFerretEngineTestCase.php
@ -24,4 +24,34 @@ final class PhabricatorFerretEngineTestCase
    }
  }
  /**
   * Verify the ngrams returned by `getTermNgramsFromString()` for ASCII and
   * multibyte UTF-8 inputs.
   *
   * Each key in the map is an input string; each value is the exact list of
   * ngrams expected for that input. The expectations show the term padded
   * with one leading and one trailing space, so even a single-character
   * input yields one three-character ngram.
   */
  public function testTermNgramExtraction() {
    // U+2603 SNOWMAN: a single character which is three bytes long in UTF-8,
    // so byte offsets and character offsets diverge.
    $snowman = "\xE2\x98\x83";

    $map = array(
      'a' => array(' a '),
      'ab' => array(' ab', 'ab '),
      'abcdef' => array(' ab', 'abc', 'bcd', 'cde', 'def', 'ef '),
      "{$snowman}" => array(" {$snowman} "),
      "x{$snowman}y" => array(
        " x{$snowman}",
        "x{$snowman}y",
        "{$snowman}y ",
      ),
      "{$snowman}{$snowman}" => array(
        " {$snowman}{$snowman}",
        "{$snowman}{$snowman} ",
      ),
    );

    $engine = new ManiphestTaskFerretEngine();

    foreach ($map as $input => $expect) {
      $actual = $engine->getTermNgramsFromString($input);

      // NOTE: assertEqual() takes the expected value first and the actual
      // result second; passing them in that order keeps the "expected" and
      // "actual" labels in a failure report correct.
      $this->assertEqual(
        $expect,
        $actual,
        pht('Term ngrams for: %s.', $input));
    }
  }
 }