mirror of
https://we.phorge.it/source/phorge.git
synced 2025-01-22 20:51:10 +01:00
Combine the two different ngram-splitting algorithms into a single engine
Summary: Ref T13501. Depends on D21127. With the "prefix" behavior removed in D21127, we now have two virtually identical copies of the same code. The newer one in Ferret is better: it slices utf8 correctly and is slightly more efficient on large inputs. Pull it out and make all callers call into it. Test Plan: - Grepped for all affected symbols. - Ran `bin/search index --force ...` to reindex various objects (tasks, files). - Searched for things in the UI. Maniphest Tasks: T13501 Differential Revision: https://secure.phabricator.com/D21128
This commit is contained in:
parent
fb3f423279
commit
9bdf477f2f
7 changed files with 104 additions and 102 deletions
|
@ -4694,6 +4694,7 @@ phutil_register_library_map(array(
|
||||||
'PhabricatorSearchManagementNgramsWorkflow' => 'applications/search/management/PhabricatorSearchManagementNgramsWorkflow.php',
|
'PhabricatorSearchManagementNgramsWorkflow' => 'applications/search/management/PhabricatorSearchManagementNgramsWorkflow.php',
|
||||||
'PhabricatorSearchManagementQueryWorkflow' => 'applications/search/management/PhabricatorSearchManagementQueryWorkflow.php',
|
'PhabricatorSearchManagementQueryWorkflow' => 'applications/search/management/PhabricatorSearchManagementQueryWorkflow.php',
|
||||||
'PhabricatorSearchManagementWorkflow' => 'applications/search/management/PhabricatorSearchManagementWorkflow.php',
|
'PhabricatorSearchManagementWorkflow' => 'applications/search/management/PhabricatorSearchManagementWorkflow.php',
|
||||||
|
'PhabricatorSearchNgramEngine' => 'applications/search/engine/PhabricatorSearchNgramEngine.php',
|
||||||
'PhabricatorSearchNgrams' => 'applications/search/ngrams/PhabricatorSearchNgrams.php',
|
'PhabricatorSearchNgrams' => 'applications/search/ngrams/PhabricatorSearchNgrams.php',
|
||||||
'PhabricatorSearchNgramsDestructionEngineExtension' => 'applications/search/engineextension/PhabricatorSearchNgramsDestructionEngineExtension.php',
|
'PhabricatorSearchNgramsDestructionEngineExtension' => 'applications/search/engineextension/PhabricatorSearchNgramsDestructionEngineExtension.php',
|
||||||
'PhabricatorSearchOrderController' => 'applications/search/controller/PhabricatorSearchOrderController.php',
|
'PhabricatorSearchOrderController' => 'applications/search/controller/PhabricatorSearchOrderController.php',
|
||||||
|
@ -11417,6 +11418,7 @@ phutil_register_library_map(array(
|
||||||
'PhabricatorSearchManagementNgramsWorkflow' => 'PhabricatorSearchManagementWorkflow',
|
'PhabricatorSearchManagementNgramsWorkflow' => 'PhabricatorSearchManagementWorkflow',
|
||||||
'PhabricatorSearchManagementQueryWorkflow' => 'PhabricatorSearchManagementWorkflow',
|
'PhabricatorSearchManagementQueryWorkflow' => 'PhabricatorSearchManagementWorkflow',
|
||||||
'PhabricatorSearchManagementWorkflow' => 'PhabricatorManagementWorkflow',
|
'PhabricatorSearchManagementWorkflow' => 'PhabricatorManagementWorkflow',
|
||||||
|
'PhabricatorSearchNgramEngine' => 'Phobject',
|
||||||
'PhabricatorSearchNgrams' => 'PhabricatorSearchDAO',
|
'PhabricatorSearchNgrams' => 'PhabricatorSearchDAO',
|
||||||
'PhabricatorSearchNgramsDestructionEngineExtension' => 'PhabricatorDestructionEngineExtension',
|
'PhabricatorSearchNgramsDestructionEngineExtension' => 'PhabricatorDestructionEngineExtension',
|
||||||
'PhabricatorSearchOrderController' => 'PhabricatorSearchBaseController',
|
'PhabricatorSearchOrderController' => 'PhabricatorSearchBaseController',
|
||||||
|
|
|
@ -0,0 +1,66 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
/**
 * Splits strings into whitespace-delimited tokens and into three-character
 * ngrams.
 *
 * Ngrams are sliced on character boundaries as reported by `phutil_utf8v()`,
 * not on raw bytes, so a multibyte character is never cut in half.
 */
final class PhabricatorSearchNgramEngine
  extends Phobject {

  /**
   * Split a string into tokens on runs of whitespace.
   *
   * Only space characters are trimmed from the ends of the value before it
   * is split, so a value which begins or ends with other whitespace (for
   * example, a newline) produces an empty first or last token. An empty
   * value produces a single empty token.
   *
   * @param string $value String to tokenize.
   * @return list<string> Tokens, in order of appearance. This is an empty
   *   list if the string can not be split.
   */
  public function tokenizeNgramString($value) {
    $value = trim($value, ' ');
    $tokens = preg_split('/\s+/u', $value);

    // preg_split() returns false on failure, which the "u" modifier can
    // cause when the subject is not valid UTF-8. Callers iterate over the
    // result, so hand them an empty list instead of a non-array.
    if ($tokens === false) {
      return array();
    }

    return $tokens;
  }

  /**
   * Get the ngrams of a string, treating each token as a complete term.
   *
   * Each token is padded with one space on either side before it is sliced,
   * so the resulting ngrams also cover the leading and trailing edge of the
   * token.
   *
   * @param string $string String to extract ngrams from.
   * @return list<string> Unique ngrams, sorted.
   */
  public function getTermNgramsFromString($string) {
    return $this->getNgramsFromString($string, true);
  }

  /**
   * Get the ngrams of a string, treating each token as a substring.
   *
   * Tokens are sliced as-is, without any padding.
   *
   * @param string $string String to extract ngrams from.
   * @return list<string> Unique ngrams, sorted.
   */
  public function getSubstringNgramsFromString($string) {
    return $this->getNgramsFromString($string, false);
  }

  /**
   * Lowercase and tokenize a string, then slice every unique token into
   * overlapping ngrams of three characters.
   *
   * Tokens with fewer than three characters (after optional padding)
   * contribute no ngrams.
   *
   * @param string $value String to extract ngrams from.
   * @param bool $as_term True to pad each token with a space on both sides
   *   before slicing.
   * @return list<string> Unique ngrams, sorted by `ksort()`.
   */
  private function getNgramsFromString($value, $as_term) {
    $value = phutil_utf8_strtolower($value);
    $tokens = $this->tokenizeNgramString($value);

    // First, extract unique tokens from the string. This reduces the number
    // of `phutil_utf8v()` calls we need to make if we are indexing a large
    // corpus with redundant terms.
    $unique_tokens = array();
    foreach ($tokens as $token) {
      if ($as_term) {
        $token = ' '.$token.' ';
      }

      $unique_tokens[$token] = true;
    }

    $ngrams = array();
    foreach ($unique_tokens as $token => $ignored) {
      // PHP converts integer-like string keys (such as "123") to integers
      // when they are used as array keys. Cast the key back so the token is
      // always handled as a string below.
      $token = (string)$token;

      $token_v = phutil_utf8v($token);
      $length = count($token_v);

      // NOTE: We're being somewhat clever here to micro-optimize performance,
      // especially for very long strings. See PHI87.

      // Record the byte length of each character once, so every ngram can
      // be cut out of the token with a single byte-offset substr() call.
      $token_l = array();
      for ($ii = 0; $ii < $length; $ii++) {
        $token_l[$ii] = strlen($token_v[$ii]);
      }

      $ngram_count = $length - 2;
      $cursor = 0;
      for ($ii = 0; $ii < $ngram_count; $ii++) {
        // Byte length of the three characters starting at character $ii.
        $ngram_l = $token_l[$ii] + $token_l[$ii + 1] + $token_l[$ii + 2];

        $ngram = substr($token, $cursor, $ngram_l);
        $ngrams[$ngram] = $ngram;

        // Advance the byte cursor by one character.
        $cursor += $token_l[$ii];
      }
    }

    ksort($ngrams);

    // Return the values rather than the keys: integer-like ngrams are stored
    // under integer keys, but the values are always the original strings.
    return array_values($ngrams);
  }

}
|
|
@ -131,7 +131,8 @@ final class PhabricatorFerretFulltextEngineExtension
|
||||||
}
|
}
|
||||||
$ngrams_source = implode("\n", $ngrams_source);
|
$ngrams_source = implode("\n", $ngrams_source);
|
||||||
|
|
||||||
$ngrams = $engine->getTermNgramsFromString($ngrams_source);
|
$ngram_engine = new PhabricatorSearchNgramEngine();
|
||||||
|
$ngrams = $ngram_engine->getTermNgramsFromString($ngrams_source);
|
||||||
|
|
||||||
$object->openTransaction();
|
$object->openTransaction();
|
||||||
|
|
||||||
|
|
|
@ -62,66 +62,6 @@ abstract class PhabricatorFerretEngine extends Phobject {
|
||||||
return new PhutilSearchStemmer();
|
return new PhutilSearchStemmer();
|
||||||
}
|
}
|
||||||
|
|
||||||
public function tokenizeString($value) {
|
|
||||||
$value = trim($value, ' ');
|
|
||||||
$value = preg_split('/\s+/u', $value);
|
|
||||||
return $value;
|
|
||||||
}
|
|
||||||
|
|
||||||
public function getTermNgramsFromString($string) {
|
|
||||||
return $this->getNgramsFromString($string, true);
|
|
||||||
}
|
|
||||||
|
|
||||||
public function getSubstringNgramsFromString($string) {
|
|
||||||
return $this->getNgramsFromString($string, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
private function getNgramsFromString($value, $as_term) {
|
|
||||||
$value = phutil_utf8_strtolower($value);
|
|
||||||
$tokens = $this->tokenizeString($value);
|
|
||||||
|
|
||||||
// First, extract unique tokens from the string. This reduces the number
|
|
||||||
// of `phutil_utf8v()` calls we need to make if we are indexing a large
|
|
||||||
// corpus with redundant terms.
|
|
||||||
$unique_tokens = array();
|
|
||||||
foreach ($tokens as $token) {
|
|
||||||
if ($as_term) {
|
|
||||||
$token = ' '.$token.' ';
|
|
||||||
}
|
|
||||||
|
|
||||||
$unique_tokens[$token] = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
$ngrams = array();
|
|
||||||
foreach ($unique_tokens as $token => $ignored) {
|
|
||||||
$token_v = phutil_utf8v($token);
|
|
||||||
$length = count($token_v);
|
|
||||||
|
|
||||||
// NOTE: We're being somewhat clever here to micro-optimize performance,
|
|
||||||
// especially for very long strings. See PHI87.
|
|
||||||
|
|
||||||
$token_l = array();
|
|
||||||
for ($ii = 0; $ii < $length; $ii++) {
|
|
||||||
$token_l[$ii] = strlen($token_v[$ii]);
|
|
||||||
}
|
|
||||||
|
|
||||||
$ngram_count = $length - 2;
|
|
||||||
$cursor = 0;
|
|
||||||
for ($ii = 0; $ii < $ngram_count; $ii++) {
|
|
||||||
$ngram_l = $token_l[$ii] + $token_l[$ii + 1] + $token_l[$ii + 2];
|
|
||||||
|
|
||||||
$ngram = substr($token, $cursor, $ngram_l);
|
|
||||||
$ngrams[$ngram] = $ngram;
|
|
||||||
|
|
||||||
$cursor += $token_l[$ii];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
ksort($ngrams);
|
|
||||||
|
|
||||||
return array_keys($ngrams);
|
|
||||||
}
|
|
||||||
|
|
||||||
public function newTermsCorpus($raw_corpus) {
|
public function newTermsCorpus($raw_corpus) {
|
||||||
$term_corpus = strtr(
|
$term_corpus = strtr(
|
||||||
$raw_corpus,
|
$raw_corpus,
|
||||||
|
|
|
@ -43,10 +43,10 @@ final class PhabricatorFerretEngineTestCase
|
||||||
),
|
),
|
||||||
);
|
);
|
||||||
|
|
||||||
$engine = new ManiphestTaskFerretEngine();
|
$ngram_engine = new PhabricatorSearchNgramEngine();
|
||||||
|
|
||||||
foreach ($map as $input => $expect) {
|
foreach ($map as $input => $expect) {
|
||||||
$actual = $engine->getTermNgramsFromString($input);
|
$actual = $ngram_engine->getTermNgramsFromString($input);
|
||||||
$this->assertEqual(
|
$this->assertEqual(
|
||||||
$actual,
|
$actual,
|
||||||
$expect,
|
$expect,
|
||||||
|
|
|
@ -7,6 +7,7 @@ abstract class PhabricatorSearchNgrams
|
||||||
protected $ngram;
|
protected $ngram;
|
||||||
|
|
||||||
private $value;
|
private $value;
|
||||||
|
private $ngramEngine;
|
||||||
|
|
||||||
abstract public function getNgramKey();
|
abstract public function getNgramKey();
|
||||||
abstract public function getColumnName();
|
abstract public function getColumnName();
|
||||||
|
@ -44,41 +45,10 @@ abstract class PhabricatorSearchNgrams
|
||||||
return "{$application}_{$key}_ngrams";
|
return "{$application}_{$key}_ngrams";
|
||||||
}
|
}
|
||||||
|
|
||||||
final public function tokenizeString($value) {
|
|
||||||
$value = trim($value, ' ');
|
|
||||||
$value = preg_split('/ +/', $value);
|
|
||||||
return $value;
|
|
||||||
}
|
|
||||||
|
|
||||||
final public function getNgramsFromString($value, $mode) {
|
|
||||||
$tokens = $this->tokenizeString($value);
|
|
||||||
|
|
||||||
$ngrams = array();
|
|
||||||
foreach ($tokens as $token) {
|
|
||||||
$token = phutil_utf8_strtolower($token);
|
|
||||||
|
|
||||||
switch ($mode) {
|
|
||||||
case 'query':
|
|
||||||
break;
|
|
||||||
case 'index':
|
|
||||||
$token = ' '.$token.' ';
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
$len = (strlen($token) - 2);
|
|
||||||
for ($ii = 0; $ii < $len; $ii++) {
|
|
||||||
$ngram = substr($token, $ii, 3);
|
|
||||||
$ngrams[$ngram] = $ngram;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
ksort($ngrams);
|
|
||||||
|
|
||||||
return array_keys($ngrams);
|
|
||||||
}
|
|
||||||
|
|
||||||
final public function writeNgram($object_id) {
|
final public function writeNgram($object_id) {
|
||||||
$ngrams = $this->getNgramsFromString($this->getValue(), 'index');
|
$ngram_engine = $this->getNgramEngine();
|
||||||
|
$ngrams = $ngram_engine->getTermNgramsFromString($this->getValue());
|
||||||
|
|
||||||
$conn_w = $this->establishConnection('w');
|
$conn_w = $this->establishConnection('w');
|
||||||
|
|
||||||
$sql = array();
|
$sql = array();
|
||||||
|
@ -107,4 +77,12 @@ abstract class PhabricatorSearchNgrams
|
||||||
return $this;
|
return $this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Return the ngram engine for this object, constructing it on first use
 * and reusing the same instance on later calls.
 *
 * @return PhabricatorSearchNgramEngine Shared engine instance.
 */
private function getNgramEngine() {
  $engine = $this->ngramEngine;
  if ($engine) {
    return $engine;
  }

  $engine = new PhabricatorSearchNgramEngine();
  $this->ngramEngine = $engine;

  return $engine;
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -36,6 +36,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
||||||
private $ferretTables = array();
|
private $ferretTables = array();
|
||||||
private $ferretQuery;
|
private $ferretQuery;
|
||||||
private $ferretMetadata = array();
|
private $ferretMetadata = array();
|
||||||
|
private $ngramEngine;
|
||||||
|
|
||||||
const FULLTEXT_RANK = '_ft_rank';
|
const FULLTEXT_RANK = '_ft_rank';
|
||||||
const FULLTEXT_MODIFIED = '_ft_epochModified';
|
const FULLTEXT_MODIFIED = '_ft_epochModified';
|
||||||
|
@ -1984,6 +1985,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
||||||
$stemmer = $engine->newStemmer();
|
$stemmer = $engine->newStemmer();
|
||||||
|
|
||||||
$ngram_table = $engine->getNgramsTableName();
|
$ngram_table = $engine->getNgramsTableName();
|
||||||
|
$ngram_engine = $this->getNgramEngine();
|
||||||
|
|
||||||
$flat = array();
|
$flat = array();
|
||||||
foreach ($this->ferretTokens as $fulltext_token) {
|
foreach ($this->ferretTokens as $fulltext_token) {
|
||||||
|
@ -2032,10 +2034,10 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($is_substring) {
|
if ($is_substring) {
|
||||||
$ngrams = $engine->getSubstringNgramsFromString($value);
|
$ngrams = $ngram_engine->getSubstringNgramsFromString($value);
|
||||||
} else {
|
} else {
|
||||||
$terms_value = $engine->newTermsCorpus($value);
|
$terms_value = $engine->newTermsCorpus($value);
|
||||||
$ngrams = $engine->getTermNgramsFromString($terms_value);
|
$ngrams = $ngram_engine->getTermNgramsFromString($terms_value);
|
||||||
|
|
||||||
// If this is a stemmed term, only look for ngrams present in both the
|
// If this is a stemmed term, only look for ngrams present in both the
|
||||||
// unstemmed and stemmed variations.
|
// unstemmed and stemmed variations.
|
||||||
|
@ -2044,7 +2046,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
||||||
// is (or, at least, may be) a normal word and activates.
|
// is (or, at least, may be) a normal word and activates.
|
||||||
$terms_value = trim($terms_value, ' ');
|
$terms_value = trim($terms_value, ' ');
|
||||||
$stem_value = $stemmer->stemToken($terms_value);
|
$stem_value = $stemmer->stemToken($terms_value);
|
||||||
$stem_ngrams = $engine->getTermNgramsFromString($stem_value);
|
$stem_ngrams = $ngram_engine->getTermNgramsFromString($stem_value);
|
||||||
$ngrams = array_intersect($ngrams, $stem_ngrams);
|
$ngrams = array_intersect($ngrams, $stem_ngrams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2409,6 +2411,8 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
||||||
|
|
||||||
|
|
||||||
protected function buildNgramsJoinClause(AphrontDatabaseConnection $conn) {
|
protected function buildNgramsJoinClause(AphrontDatabaseConnection $conn) {
|
||||||
|
$ngram_engine = $this->getNgramEngine();
|
||||||
|
|
||||||
$flat = array();
|
$flat = array();
|
||||||
foreach ($this->ngrams as $spec) {
|
foreach ($this->ngrams as $spec) {
|
||||||
$length = $spec['length'];
|
$length = $spec['length'];
|
||||||
|
@ -2420,7 +2424,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
||||||
$index = $spec['index'];
|
$index = $spec['index'];
|
||||||
$value = $spec['value'];
|
$value = $spec['value'];
|
||||||
|
|
||||||
$ngrams = $index->getNgramsFromString($value, 'query');
|
$ngrams = $ngram_engine->getSubstringNgramsFromString($value);
|
||||||
|
|
||||||
foreach ($ngrams as $ngram) {
|
foreach ($ngrams as $ngram) {
|
||||||
$flat[] = array(
|
$flat[] = array(
|
||||||
|
@ -2476,6 +2480,8 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
||||||
protected function buildNgramsWhereClause(AphrontDatabaseConnection $conn) {
|
protected function buildNgramsWhereClause(AphrontDatabaseConnection $conn) {
|
||||||
$where = array();
|
$where = array();
|
||||||
|
|
||||||
|
$ngram_engine = $this->getNgramEngine();
|
||||||
|
|
||||||
foreach ($this->ngrams as $ngram) {
|
foreach ($this->ngrams as $ngram) {
|
||||||
$index = $ngram['index'];
|
$index = $ngram['index'];
|
||||||
$value = $ngram['value'];
|
$value = $ngram['value'];
|
||||||
|
@ -2488,7 +2494,8 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
||||||
$column = qsprintf($conn, '%T', $column);
|
$column = qsprintf($conn, '%T', $column);
|
||||||
}
|
}
|
||||||
|
|
||||||
$tokens = $index->tokenizeString($value);
|
$tokens = $ngram_engine->tokenizeNgramString($value);
|
||||||
|
|
||||||
foreach ($tokens as $token) {
|
foreach ($tokens as $token) {
|
||||||
$where[] = qsprintf(
|
$where[] = qsprintf(
|
||||||
$conn,
|
$conn,
|
||||||
|
@ -2506,6 +2513,14 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
||||||
return (bool)$this->ngrams;
|
return (bool)$this->ngrams;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Return the ngram engine used by this query, constructing it the first
 * time it is requested and reusing the same instance afterwards.
 *
 * @return PhabricatorSearchNgramEngine Shared engine instance.
 */
private function getNgramEngine() {
  $engine = $this->ngramEngine;
  if ($engine) {
    return $engine;
  }

  $engine = new PhabricatorSearchNgramEngine();
  $this->ngramEngine = $engine;

  return $engine;
}
|
||||||
|
|
||||||
|
|
||||||
/* -( Edge Logic )--------------------------------------------------------- */
|
/* -( Edge Logic )--------------------------------------------------------- */
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue