1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2024-11-22 06:42:42 +01:00

Improve performance of Ferret engine ngram extraction, particularly for large input strings

Summary:
See PHI87. Ref T12974. The `array_slice()` method of splitting the string apart can perform poorly for large input strings. I think this is mostly because of the large number of calls, combined with the nontrivial cost of building and returning a new array on every call.

We can just use `substr()` instead, as long as we're a little bit careful about keeping track of the byte offset where we're slicing the string when it contains multi-byte UTF-8 characters.

Test Plan:
  - Created a task with a single, unbroken blob of base64 encoded data as the description, roughly 100KB long.
  - Saw indexing performance improve from ~6s to ~1.5s after patch.
  - Before: https://secure.phabricator.com/xhprof/profile/PHID-FILE-nrxs4lwdvupbve5lhl6u/
  - After: https://secure.phabricator.com/xhprof/profile/PHID-FILE-6vs2akgjj5nbqt7yo7ul/

Reviewers: amckinley

Reviewed By: amckinley

Maniphest Tasks: T12974

Differential Revision: https://secure.phabricator.com/D18649
This commit is contained in:
epriestley 2017-09-26 09:16:42 -07:00
parent a1d9a2389d
commit 086a125ad5
2 changed files with 48 additions and 4 deletions

View file

@ -106,11 +106,25 @@ abstract class PhabricatorFerretEngine extends Phobject {
$ngrams = array();
foreach ($unique_tokens as $token => $ignored) {
$token_v = phutil_utf8v($token);
$len = (count($token_v) - 2);
for ($ii = 0; $ii < $len; $ii++) {
$ngram = array_slice($token_v, $ii, 3);
$ngram = implode('', $ngram);
$length = count($token_v);
// NOTE: We're being somewhat clever here to micro-optimize performance,
// especially for very long strings. See PHI87.
$token_l = array();
for ($ii = 0; $ii < $length; $ii++) {
$token_l[$ii] = strlen($token_v[$ii]);
}
$ngram_count = $length - 2;
$cursor = 0;
for ($ii = 0; $ii < $ngram_count; $ii++) {
$ngram_l = $token_l[$ii] + $token_l[$ii + 1] + $token_l[$ii + 2];
$ngram = substr($token, $cursor, $ngram_l);
$ngrams[$ngram] = $ngram;
$cursor += $token_l[$ii];
}
}

View file

@ -24,4 +24,34 @@ final class PhabricatorFerretEngineTestCase
}
}
/**
 * Check the ngrams produced by getTermNgramsFromString() for a set of
 * ASCII and multi-byte UTF-8 inputs.
 *
 * Each key of the map is an input term and each value is the list of
 * ngrams expected for it. The expected values treat the term as if it were
 * padded with one space on either side, and count a multi-byte character as
 * a single position in an ngram.
 */
public function testTermNgramExtraction() {
  // U+2603 SNOWMAN: a single character that occupies three bytes in UTF-8,
  // used to exercise byte-offset handling around multi-byte characters.
  $snowman = "\xE2\x98\x83";

  $map = array(
    'a' => array(' a '),
    'ab' => array(' ab', 'ab '),
    'abcdef' => array(' ab', 'abc', 'bcd', 'cde', 'def', 'ef '),
    "{$snowman}" => array(" {$snowman} "),
    "x{$snowman}y" => array(
      " x{$snowman}",
      "x{$snowman}y",
      "{$snowman}y ",
    ),
    "{$snowman}{$snowman}" => array(
      " {$snowman}{$snowman}",
      "{$snowman}{$snowman} ",
    ),
  );

  $engine = new ManiphestTaskFerretEngine();

  foreach ($map as $input => $expect) {
    $actual = $engine->getTermNgramsFromString($input);

    // assertEqual() takes the expected value first and the actual result
    // second; passing them in that order keeps failure reports labeled
    // correctly.
    $this->assertEqual(
      $expect,
      $actual,
      pht('Term ngrams for: %s.', $input));
  }
}
}