mirror of
https://we.phorge.it/source/phorge.git
synced 2025-01-22 12:41:19 +01:00
Combine the two different ngram-splitting algorithms into a single engine
Summary: Ref T13501. Depends on D21127. With the "prefix" behavior removed in D21127, we now have two virtually identical copies of the same code. The newer one in Ferret is better: it slices utf8 correctly and is slightly more efficient on large inputs. Pull it out and make all callers call into it. Test Plan: - Grepped for all affected symbols. - Ran `bin/search index --force ...` to reindex various objects (tasks, files). - Searched for things in the UI. Maniphest Tasks: T13501 Differential Revision: https://secure.phabricator.com/D21128
This commit is contained in:
parent
fb3f423279
commit
9bdf477f2f
7 changed files with 104 additions and 102 deletions
|
@ -4694,6 +4694,7 @@ phutil_register_library_map(array(
|
|||
'PhabricatorSearchManagementNgramsWorkflow' => 'applications/search/management/PhabricatorSearchManagementNgramsWorkflow.php',
|
||||
'PhabricatorSearchManagementQueryWorkflow' => 'applications/search/management/PhabricatorSearchManagementQueryWorkflow.php',
|
||||
'PhabricatorSearchManagementWorkflow' => 'applications/search/management/PhabricatorSearchManagementWorkflow.php',
|
||||
'PhabricatorSearchNgramEngine' => 'applications/search/engine/PhabricatorSearchNgramEngine.php',
|
||||
'PhabricatorSearchNgrams' => 'applications/search/ngrams/PhabricatorSearchNgrams.php',
|
||||
'PhabricatorSearchNgramsDestructionEngineExtension' => 'applications/search/engineextension/PhabricatorSearchNgramsDestructionEngineExtension.php',
|
||||
'PhabricatorSearchOrderController' => 'applications/search/controller/PhabricatorSearchOrderController.php',
|
||||
|
@ -11417,6 +11418,7 @@ phutil_register_library_map(array(
|
|||
'PhabricatorSearchManagementNgramsWorkflow' => 'PhabricatorSearchManagementWorkflow',
|
||||
'PhabricatorSearchManagementQueryWorkflow' => 'PhabricatorSearchManagementWorkflow',
|
||||
'PhabricatorSearchManagementWorkflow' => 'PhabricatorManagementWorkflow',
|
||||
'PhabricatorSearchNgramEngine' => 'Phobject',
|
||||
'PhabricatorSearchNgrams' => 'PhabricatorSearchDAO',
|
||||
'PhabricatorSearchNgramsDestructionEngineExtension' => 'PhabricatorDestructionEngineExtension',
|
||||
'PhabricatorSearchOrderController' => 'PhabricatorSearchBaseController',
|
||||
|
|
|
@ -0,0 +1,66 @@
|
|||
<?php
|
||||
|
||||
/**
 * Tokenizes strings and extracts the three-character ngrams (trigrams) used
 * to build and query ngram search indexes.
 *
 * Ngrams come in two flavors:
 *
 *   - "term" ngrams, where each token is wrapped in single spaces before it
 *     is sliced, so the resulting ngrams also cover the token boundaries; and
 *   - "substring" ngrams, where tokens are sliced as-is.
 */
final class PhabricatorSearchNgramEngine
  extends Phobject {

  /**
   * Split a string into whitespace-delimited tokens.
   *
   * Only literal space characters are trimmed from the ends of the input
   * before it is split on runs of whitespace (matched in UTF-8 mode). As a
   * result, an empty input, or an input which begins or ends with some other
   * whitespace character such as a newline, yields an empty-string token.
   *
   * @param string String to tokenize.
   * @return list<string> Tokens, in the order they appear in the input.
   */
  public function tokenizeNgramString($value) {
    $value = trim($value, ' ');
    $value = preg_split('/\s+/u', $value);
    return $value;
  }

  /**
   * Extract the unique trigrams of every token in a string, treating each
   * token as a whole term (each token is padded with one leading and one
   * trailing space before slicing).
   *
   * @param string String to extract ngrams from.
   * @return list<string> Sorted list of unique ngrams.
   */
  public function getTermNgramsFromString($string) {
    return $this->getNgramsFromString($string, true);
  }

  /**
   * Extract the unique trigrams of every token in a string without adding
   * any padding around the tokens.
   *
   * @param string String to extract ngrams from.
   * @return list<string> Sorted list of unique ngrams.
   */
  public function getSubstringNgramsFromString($string) {
    return $this->getNgramsFromString($string, false);
  }

  /**
   * Lowercase and tokenize a string, then slice every unique token into
   * overlapping three-character ngrams.
   *
   * Tokens shorter than three characters (after optional padding) produce
   * no ngrams.
   *
   * @param string String to extract ngrams from.
   * @param bool True to wrap each token in single spaces before slicing.
   * @return list<string> Unique ngrams, ordered by `ksort()`.
   */
  private function getNgramsFromString($value, $as_term) {
    $value = phutil_utf8_strtolower($value);
    $tokens = $this->tokenizeNgramString($value);

    // First, extract unique tokens from the string. This reduces the number
    // of `phutil_utf8v()` calls we need to make if we are indexing a large
    // corpus with redundant terms.
    $unique_tokens = array();
    foreach ($tokens as $token) {
      if ($as_term) {
        $token = ' '.$token.' ';
      }

      $unique_tokens[$token] = true;
    }

    $ngrams = array();
    foreach ($unique_tokens as $token => $ignored) {
      // PHP casts array keys which look like decimal integers to ints, so
      // cast back to a string before doing any string operations on the key.
      $token = (string)$token;

      $token_v = phutil_utf8v($token);
      $length = count($token_v);

      // NOTE: We're being somewhat clever here to micro-optimize performance,
      // especially for very long strings. See PHI87.

      // Record the byte length of each character so ngrams can be cut out of
      // the original token with substr() instead of being rebuilt by
      // concatenating characters.
      $token_l = array();
      for ($ii = 0; $ii < $length; $ii++) {
        $token_l[$ii] = strlen($token_v[$ii]);
      }

      // A token of N characters has N - 2 trigrams; this is zero or negative
      // for short tokens, in which case the loop below does not run.
      $ngram_count = $length - 2;

      // Byte offset of the first character of the current ngram.
      $cursor = 0;
      for ($ii = 0; $ii < $ngram_count; $ii++) {
        $ngram_l = $token_l[$ii] + $token_l[$ii + 1] + $token_l[$ii + 2];

        $ngram = substr($token, $cursor, $ngram_l);
        $ngrams[$ngram] = $ngram;

        $cursor += $token_l[$ii];
      }
    }

    ksort($ngrams);

    // Return the stored values rather than the keys: a key made only of
    // digits (like "123") is cast to an integer by PHP, so array_keys()
    // would hand callers a mix of ints and strings. The values are always
    // the original strings and are in the same (key-sorted) order.
    return array_values($ngrams);
  }

}
|
|
@ -131,7 +131,8 @@ final class PhabricatorFerretFulltextEngineExtension
|
|||
}
|
||||
$ngrams_source = implode("\n", $ngrams_source);
|
||||
|
||||
$ngrams = $engine->getTermNgramsFromString($ngrams_source);
|
||||
$ngram_engine = new PhabricatorSearchNgramEngine();
|
||||
$ngrams = $ngram_engine->getTermNgramsFromString($ngrams_source);
|
||||
|
||||
$object->openTransaction();
|
||||
|
||||
|
|
|
@ -62,66 +62,6 @@ abstract class PhabricatorFerretEngine extends Phobject {
|
|||
return new PhutilSearchStemmer();
|
||||
}
|
||||
|
||||
/**
 * Split a string into tokens on runs of whitespace (matched in UTF-8 mode),
 * after trimming literal space characters from both ends.
 *
 * @param string String to tokenize.
 * @return list<string> Tokens, in input order.
 */
public function tokenizeString($value) {
  $trimmed = trim($value, ' ');
  return preg_split('/\s+/u', $trimmed);
}
|
||||
|
||||
/**
 * Extract ngrams from a string in "term" mode, where each token is padded
 * with a leading and trailing space before it is sliced.
 *
 * @param string String to extract ngrams from.
 * @return list<string> Sorted list of unique ngrams.
 */
public function getTermNgramsFromString($string) {
  $as_term = true;
  return $this->getNgramsFromString($string, $as_term);
}
|
||||
|
||||
/**
 * Extract ngrams from a string in "substring" mode, where tokens are sliced
 * without any padding.
 *
 * @param string String to extract ngrams from.
 * @return list<string> Sorted list of unique ngrams.
 */
public function getSubstringNgramsFromString($string) {
  $as_term = false;
  return $this->getNgramsFromString($string, $as_term);
}
|
||||
|
||||
/**
 * Lowercase and tokenize a string, then slice every unique token into
 * overlapping three-character ngrams.
 *
 * Tokens shorter than three characters (after optional padding) produce no
 * ngrams.
 *
 * @param string String to extract ngrams from.
 * @param bool True to wrap each token in single spaces before slicing.
 * @return list<string> Unique ngrams, ordered by `ksort()`.
 */
private function getNgramsFromString($value, $as_term) {
  $value = phutil_utf8_strtolower($value);
  $tokens = $this->tokenizeString($value);

  // First, extract unique tokens from the string. This reduces the number
  // of `phutil_utf8v()` calls we need to make if we are indexing a large
  // corpus with redundant terms.
  $unique_tokens = array();
  foreach ($tokens as $token) {
    if ($as_term) {
      $token = ' '.$token.' ';
    }

    $unique_tokens[$token] = true;
  }

  $ngrams = array();
  foreach ($unique_tokens as $token => $ignored) {
    // Keys which look like decimal integers are cast to ints by PHP; cast
    // back so the string functions below always operate on a string.
    $token = (string)$token;

    $token_v = phutil_utf8v($token);
    $length = count($token_v);

    // NOTE: We're being somewhat clever here to micro-optimize performance,
    // especially for very long strings. See PHI87.

    // Byte length of each character, so ngrams can be cut directly out of
    // the token with substr().
    $token_l = array();
    for ($ii = 0; $ii < $length; $ii++) {
      $token_l[$ii] = strlen($token_v[$ii]);
    }

    // Zero or negative for tokens shorter than three characters, in which
    // case no ngrams are emitted.
    $ngram_count = $length - 2;

    // Byte offset of the first character of the current ngram.
    $cursor = 0;
    for ($ii = 0; $ii < $ngram_count; $ii++) {
      $ngram_l = $token_l[$ii] + $token_l[$ii + 1] + $token_l[$ii + 2];

      $ngram = substr($token, $cursor, $ngram_l);
      $ngrams[$ngram] = $ngram;

      $cursor += $token_l[$ii];
    }
  }

  ksort($ngrams);

  // Return the values, not the keys: an all-digit ngram such as "123" is
  // stored under an integer key, so array_keys() would return it as an int.
  // The values are the original strings, in the same key-sorted order.
  return array_values($ngrams);
}
|
||||
|
||||
public function newTermsCorpus($raw_corpus) {
|
||||
$term_corpus = strtr(
|
||||
$raw_corpus,
|
||||
|
|
|
@ -43,10 +43,10 @@ final class PhabricatorFerretEngineTestCase
|
|||
),
|
||||
);
|
||||
|
||||
$engine = new ManiphestTaskFerretEngine();
|
||||
$ngram_engine = new PhabricatorSearchNgramEngine();
|
||||
|
||||
foreach ($map as $input => $expect) {
|
||||
$actual = $engine->getTermNgramsFromString($input);
|
||||
$actual = $ngram_engine->getTermNgramsFromString($input);
|
||||
$this->assertEqual(
|
||||
$actual,
|
||||
$expect,
|
||||
|
|
|
@ -7,6 +7,7 @@ abstract class PhabricatorSearchNgrams
|
|||
protected $ngram;
|
||||
|
||||
private $value;
|
||||
private $ngramEngine;
|
||||
|
||||
abstract public function getNgramKey();
|
||||
abstract public function getColumnName();
|
||||
|
@ -44,41 +45,10 @@ abstract class PhabricatorSearchNgrams
|
|||
return "{$application}_{$key}_ngrams";
|
||||
}
|
||||
|
||||
/**
 * Split a string into tokens on runs of literal space characters, after
 * trimming spaces from both ends. Other whitespace is not treated as a
 * separator.
 *
 * @param string String to tokenize.
 * @return list<string> Tokens, in input order.
 */
final public function tokenizeString($value) {
  $trimmed = trim($value, ' ');
  return preg_split('/ +/', $trimmed);
}
|
||||
|
||||
/**
 * Tokenize a string and slice each lowercased token into overlapping
 * three-character ngrams.
 *
 * In 'index' mode each token is wrapped in single spaces before slicing; in
 * 'query' mode (or any other mode) tokens are sliced as-is.
 *
 * @param string String to extract ngrams from.
 * @param string Either 'index' or 'query'.
 * @return list<string> Unique ngrams, ordered by `ksort()`.
 */
final public function getNgramsFromString($value, $mode) {
  $tokens = $this->tokenizeString($value);

  $ngrams = array();
  foreach ($tokens as $token) {
    $token = phutil_utf8_strtolower($token);

    switch ($mode) {
      case 'query':
        break;
      case 'index':
        $token = ' '.$token.' ';
        break;
    }

    // Slice by character, not by byte: cutting a multibyte UTF-8 character
    // in half with strlen()/substr() produces ngrams which are not valid
    // UTF-8 and which never line up with the characters a user typed.
    $characters = phutil_utf8v($token);

    // Zero or negative for tokens shorter than three characters, in which
    // case no ngrams are emitted.
    $len = (count($characters) - 2);
    for ($ii = 0; $ii < $len; $ii++) {
      $ngram = $characters[$ii].$characters[$ii + 1].$characters[$ii + 2];
      $ngrams[$ngram] = $ngram;
    }
  }

  ksort($ngrams);

  // Return the values, not the keys: an all-digit ngram such as "123" is
  // stored under an integer key, so array_keys() would return it as an int.
  // The values are the original strings, in the same key-sorted order.
  return array_values($ngrams);
}
|
||||
|
||||
final public function writeNgram($object_id) {
|
||||
$ngrams = $this->getNgramsFromString($this->getValue(), 'index');
|
||||
$ngram_engine = $this->getNgramEngine();
|
||||
$ngrams = $ngram_engine->getTermNgramsFromString($this->getValue());
|
||||
|
||||
$conn_w = $this->establishConnection('w');
|
||||
|
||||
$sql = array();
|
||||
|
@ -107,4 +77,12 @@ abstract class PhabricatorSearchNgrams
|
|||
return $this;
|
||||
}
|
||||
|
||||
/**
 * Return the ngram engine for this object, constructing it on first use and
 * reusing the same instance afterwards.
 *
 * @return PhabricatorSearchNgramEngine Shared engine instance.
 */
private function getNgramEngine() {
  if ($this->ngramEngine) {
    return $this->ngramEngine;
  }

  $this->ngramEngine = new PhabricatorSearchNgramEngine();
  return $this->ngramEngine;
}
|
||||
|
||||
}
|
||||
|
|
|
@ -36,6 +36,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
|||
private $ferretTables = array();
|
||||
private $ferretQuery;
|
||||
private $ferretMetadata = array();
|
||||
private $ngramEngine;
|
||||
|
||||
const FULLTEXT_RANK = '_ft_rank';
|
||||
const FULLTEXT_MODIFIED = '_ft_epochModified';
|
||||
|
@ -1984,6 +1985,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
|||
$stemmer = $engine->newStemmer();
|
||||
|
||||
$ngram_table = $engine->getNgramsTableName();
|
||||
$ngram_engine = $this->getNgramEngine();
|
||||
|
||||
$flat = array();
|
||||
foreach ($this->ferretTokens as $fulltext_token) {
|
||||
|
@ -2032,10 +2034,10 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
|||
}
|
||||
|
||||
if ($is_substring) {
|
||||
$ngrams = $engine->getSubstringNgramsFromString($value);
|
||||
$ngrams = $ngram_engine->getSubstringNgramsFromString($value);
|
||||
} else {
|
||||
$terms_value = $engine->newTermsCorpus($value);
|
||||
$ngrams = $engine->getTermNgramsFromString($terms_value);
|
||||
$ngrams = $ngram_engine->getTermNgramsFromString($terms_value);
|
||||
|
||||
// If this is a stemmed term, only look for ngrams present in both the
|
||||
// unstemmed and stemmed variations.
|
||||
|
@ -2044,7 +2046,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
|||
// is (or, at least, may be) a normal word and activates.
|
||||
$terms_value = trim($terms_value, ' ');
|
||||
$stem_value = $stemmer->stemToken($terms_value);
|
||||
$stem_ngrams = $engine->getTermNgramsFromString($stem_value);
|
||||
$stem_ngrams = $ngram_engine->getTermNgramsFromString($stem_value);
|
||||
$ngrams = array_intersect($ngrams, $stem_ngrams);
|
||||
}
|
||||
}
|
||||
|
@ -2409,6 +2411,8 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
|||
|
||||
|
||||
protected function buildNgramsJoinClause(AphrontDatabaseConnection $conn) {
|
||||
$ngram_engine = $this->getNgramEngine();
|
||||
|
||||
$flat = array();
|
||||
foreach ($this->ngrams as $spec) {
|
||||
$length = $spec['length'];
|
||||
|
@ -2420,7 +2424,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
|||
$index = $spec['index'];
|
||||
$value = $spec['value'];
|
||||
|
||||
$ngrams = $index->getNgramsFromString($value, 'query');
|
||||
$ngrams = $ngram_engine->getSubstringNgramsFromString($value);
|
||||
|
||||
foreach ($ngrams as $ngram) {
|
||||
$flat[] = array(
|
||||
|
@ -2476,6 +2480,8 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
|||
protected function buildNgramsWhereClause(AphrontDatabaseConnection $conn) {
|
||||
$where = array();
|
||||
|
||||
$ngram_engine = $this->getNgramEngine();
|
||||
|
||||
foreach ($this->ngrams as $ngram) {
|
||||
$index = $ngram['index'];
|
||||
$value = $ngram['value'];
|
||||
|
@ -2488,7 +2494,8 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
|||
$column = qsprintf($conn, '%T', $column);
|
||||
}
|
||||
|
||||
$tokens = $index->tokenizeString($value);
|
||||
$tokens = $ngram_engine->tokenizeNgramString($value);
|
||||
|
||||
foreach ($tokens as $token) {
|
||||
$where[] = qsprintf(
|
||||
$conn,
|
||||
|
@ -2506,6 +2513,14 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
|||
return (bool)$this->ngrams;
|
||||
}
|
||||
|
||||
/**
 * Return the ngram engine used by this query, constructing it on first use
 * and reusing the same instance afterwards.
 *
 * @return PhabricatorSearchNgramEngine Shared engine instance.
 */
private function getNgramEngine() {
  if ($this->ngramEngine) {
    return $this->ngramEngine;
  }

  $this->ngramEngine = new PhabricatorSearchNgramEngine();
  return $this->ngramEngine;
}
|
||||
|
||||
|
||||
/* -( Edge Logic )--------------------------------------------------------- */
|
||||
|
||||
|
|
Loading…
Reference in a new issue