mirror of
https://we.phorge.it/source/phorge.git
synced 2024-11-26 08:42:41 +01:00
Improve performance of Ferret engine ngram extraction, particularly for large input strings
Summary: See PHI87. Ref T12974. The `array_slice()` method of splitting the string apart can perform poorly for large input strings. I think this is mostly just the large number of calls plus building and returning an array being not entirely trivial. We can just use `substr()` instead, as long as we're a little bit careful about keeping track of where we're slicing the string if it has UTF8 characters.
Test Plan:
- Created a task with a single, unbroken blob of base64 encoded data as the description, roughly 100KB long.
- Saw indexing performance improve from ~6s to ~1.5s after patch.
- Before: https://secure.phabricator.com/xhprof/profile/PHID-FILE-nrxs4lwdvupbve5lhl6u/
- After: https://secure.phabricator.com/xhprof/profile/PHID-FILE-6vs2akgjj5nbqt7yo7ul/
Reviewers: amckinley
Reviewed By: amckinley
Maniphest Tasks: T12974
Differential Revision: https://secure.phabricator.com/D18649
This commit is contained in:
parent
a1d9a2389d
commit
086a125ad5
2 changed files with 48 additions and 4 deletions
|
@ -106,11 +106,25 @@ abstract class PhabricatorFerretEngine extends Phobject {
|
||||||
$ngrams = array();
|
$ngrams = array();
|
||||||
foreach ($unique_tokens as $token => $ignored) {
|
foreach ($unique_tokens as $token => $ignored) {
|
||||||
$token_v = phutil_utf8v($token);
|
$token_v = phutil_utf8v($token);
|
||||||
$len = (count($token_v) - 2);
|
$length = count($token_v);
|
||||||
for ($ii = 0; $ii < $len; $ii++) {
|
|
||||||
$ngram = array_slice($token_v, $ii, 3);
|
// NOTE: We're being somewhat clever here to micro-optimize performance,
|
||||||
$ngram = implode('', $ngram);
|
// especially for very long strings. See PHI87.
|
||||||
|
|
||||||
|
$token_l = array();
|
||||||
|
for ($ii = 0; $ii < $length; $ii++) {
|
||||||
|
$token_l[$ii] = strlen($token_v[$ii]);
|
||||||
|
}
|
||||||
|
|
||||||
|
$ngram_count = $length - 2;
|
||||||
|
$cursor = 0;
|
||||||
|
for ($ii = 0; $ii < $ngram_count; $ii++) {
|
||||||
|
$ngram_l = $token_l[$ii] + $token_l[$ii + 1] + $token_l[$ii + 2];
|
||||||
|
|
||||||
|
$ngram = substr($token, $cursor, $ngram_l);
|
||||||
$ngrams[$ngram] = $ngram;
|
$ngrams[$ngram] = $ngram;
|
||||||
|
|
||||||
|
$cursor += $token_l[$ii];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -24,4 +24,34 @@ final class PhabricatorFerretEngineTestCase
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Check the term ngrams extracted from short ASCII and multibyte inputs.
 *
 * Each key of the map is an input string and each value is the exact list
 * of ngrams `getTermNgramsFromString()` is expected to return for it. As
 * the expectations show, a token is treated as if it were wrapped in one
 * leading and one trailing space before three-character windows are taken.
 */
public function testTermNgramExtraction() {
  // U+2603 SNOWMAN: one character encoded as three bytes in UTF-8. These
  // cases make sure ngrams are cut on character boundaries rather than on
  // byte boundaries.
  $snowman = "\xE2\x98\x83";

  $map = array(
    'a' => array(' a '),
    'ab' => array(' ab', 'ab '),
    'abcdef' => array(' ab', 'abc', 'bcd', 'cde', 'def', 'ef '),
    "{$snowman}" => array(" {$snowman} "),
    "x{$snowman}y" => array(
      " x{$snowman}",
      "x{$snowman}y",
      "{$snowman}y ",
    ),
    "{$snowman}{$snowman}" => array(
      " {$snowman}{$snowman}",
      "{$snowman}{$snowman} ",
    ),
  );

  $engine = new ManiphestTaskFerretEngine();

  foreach ($map as $input => $expect) {
    $actual = $engine->getTermNgramsFromString($input);

    // assertEqual() takes the expected value first and the observed value
    // second; keeping that order makes a failure report label them
    // correctly.
    $this->assertEqual(
      $expect,
      $actual,
      pht('Term ngrams for: %s.', $input));
  }
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue