From 9288cad0edc83bb319a2565a597699a6faa0c714 Mon Sep 17 00:00:00 2001 From: epriestley Date: Mon, 25 Sep 2017 19:11:35 -0700 Subject: [PATCH] (stable) Improve Ferret engine indexing performance for large blocks of text Summary: See PHI87. Ref T12974. Currently, we do a lot more work here than we need to: we call `phutil_utf8_strtolower()` on each token, but can do it once at the beginning on the whole block. Additionally, since ngrams don't care about order, we only need to convert unique tokens into ngrams. This saves us some `phutil_utf8v()`. These calls can be slow for large inputs. Test Plan: - Created a ~4MB task description. - Ran `bin/search index Txxx --profile ...` to profile indexing performance before and after the change. - Saw total runtime drop from 38s to 9s. - Before: - After: Reviewers: amckinley Reviewed By: amckinley Maniphest Tasks: T12974 Differential Revision: https://secure.phabricator.com/D18647 --- .../search/ferret/PhabricatorFerretEngine.php | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/applications/search/ferret/PhabricatorFerretEngine.php b/src/applications/search/ferret/PhabricatorFerretEngine.php index 3c8098c54f..7d1d03a8b1 100644 --- a/src/applications/search/ferret/PhabricatorFerretEngine.php +++ b/src/applications/search/ferret/PhabricatorFerretEngine.php @@ -88,16 +88,23 @@ abstract class PhabricatorFerretEngine extends Phobject { } private function getNgramsFromString($value, $as_term) { + $value = phutil_utf8_strtolower($value); $tokens = $this->tokenizeString($value); - $ngrams = array(); + // First, extract unique tokens from the string. This reduces the number + // of `phutil_utf8v()` calls we need to make if we are indexing a large + // corpus with redundant terms. + $unique_tokens = array(); foreach ($tokens as $token) { - $token = phutil_utf8_strtolower($token); - if ($as_term) { $token = ' '.$token.' 
'; } + $unique_tokens[$token] = true; + } + + $ngrams = array(); + foreach ($unique_tokens as $token => $ignored) { $token_v = phutil_utf8v($token); $len = (count($token_v) - 2); for ($ii = 0; $ii < $len; $ii++) {