From c034306320b55eca68dcdd94a1065b7e34db5689 Mon Sep 17 00:00:00 2001
From: epriestley <git@epriestley.com>
Date: Tue, 26 Sep 2017 09:16:42 -0700
Subject: [PATCH] (stable) Improve performance of Ferret engine ngram
 extraction, particularly for large input strings

Summary:
See PHI87. Ref T12974. The `array_slice()` method of splitting the string apart can perform poorly for large input strings. I think this is mostly just the large number of calls plus building and returning an array being not entirely trivial.

We can just use `substr()` instead, as long as we're a little bit careful about keeping track of where we're slicing the string if it has UTF8 characters.

Test Plan:
  - Created a task with a single, unbroken blob of base64 encoded data as the description, roughly 100KB long.
  - Saw indexing performance improve from ~6s to ~1.5s after patch.
  - Before: https://secure.phabricator.com/xhprof/profile/PHID-FILE-nrxs4lwdvupbve5lhl6u/
  - After: https://secure.phabricator.com/xhprof/profile/PHID-FILE-6vs2akgjj5nbqt7yo7ul/

Reviewers: amckinley

Reviewed By: amckinley

Maniphest Tasks: T12974

Differential Revision: https://secure.phabricator.com/D18649
---
 .../search/ferret/PhabricatorFerretEngine.php | 22 +++++++++++---
 .../PhabricatorFerretEngineTestCase.php       | 30 +++++++++++++++++++
 2 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/src/applications/search/ferret/PhabricatorFerretEngine.php b/src/applications/search/ferret/PhabricatorFerretEngine.php
index 7d1d03a8b1..7cd06ae549 100644
--- a/src/applications/search/ferret/PhabricatorFerretEngine.php
+++ b/src/applications/search/ferret/PhabricatorFerretEngine.php
@@ -106,11 +106,25 @@ abstract class PhabricatorFerretEngine extends Phobject {
     $ngrams = array();
     foreach ($unique_tokens as $token => $ignored) {
       $token_v = phutil_utf8v($token);
-      $len = (count($token_v) - 2);
-      for ($ii = 0; $ii < $len; $ii++) {
-        $ngram = array_slice($token_v, $ii, 3);
-        $ngram = implode('', $ngram);
+      $length = count($token_v);
+
+      // NOTE: We're being somewhat clever here to micro-optimize performance,
+      // especially for very long strings. See PHI87.
+
+      $token_l = array();
+      for ($ii = 0; $ii < $length; $ii++) {
+        $token_l[$ii] = strlen($token_v[$ii]);
+      }
+
+      $ngram_count = $length - 2;
+      $cursor = 0;
+      for ($ii = 0; $ii < $ngram_count; $ii++) {
+        $ngram_l = $token_l[$ii] + $token_l[$ii + 1] + $token_l[$ii + 2];
+
+        $ngram = substr($token, $cursor, $ngram_l);
         $ngrams[$ngram] = $ngram;
+
+        $cursor += $token_l[$ii];
       }
     }
 
diff --git a/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php b/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php
index a8690f1d8b..ea65a559ff 100644
--- a/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php
+++ b/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php
@@ -24,4 +24,34 @@ final class PhabricatorFerretEngineTestCase
     }
   }
 
+  public function testTermNgramExtraction() {
+    $snowman = "\xE2\x98\x83";
+
+    $map = array(
+      'a' => array(' a '),
+      'ab' => array(' ab', 'ab '),
+      'abcdef' => array(' ab', 'abc', 'bcd', 'cde', 'def', 'ef '),
+      "{$snowman}" => array(" {$snowman} "),
+      "x{$snowman}y" => array(
+        " x{$snowman}",
+        "x{$snowman}y",
+        "{$snowman}y ",
+      ),
+      "{$snowman}{$snowman}" => array(
+        " {$snowman}{$snowman}",
+        "{$snowman}{$snowman} ",
+      ),
+    );
+
+    $engine = new ManiphestTaskFerretEngine();
+
+    foreach ($map as $input => $expect) {
+      $actual = $engine->getTermNgramsFromString($input);
+      $this->assertEqual(
+        $actual,
+        $expect,
+        pht('Term ngrams for: %s.', $input));
+    }
+  }
+
 }