1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2025-01-22 20:51:10 +01:00

Combine the two different ngram-splitting algorithms into a single engine

Summary:
Ref T13501. Depends on D21127. With the "prefix" behavior removed in D21127, we now have two virtually identical copies of the same code.

The newer one in Ferret is better: it slices utf8 correctly and is slightly more efficient on large inputs. Pull it out and make all callers call into it.

Test Plan:
  - Grepped for all affected symbols.
  - Ran `bin/search index --force ...` to reindex various objects (tasks, files).
  - Searched for things in the UI.

Maniphest Tasks: T13501

Differential Revision: https://secure.phabricator.com/D21128
This commit is contained in:
epriestley 2020-04-16 06:05:05 -07:00
parent fb3f423279
commit 9bdf477f2f
7 changed files with 104 additions and 102 deletions

View file

@ -4694,6 +4694,7 @@ phutil_register_library_map(array(
'PhabricatorSearchManagementNgramsWorkflow' => 'applications/search/management/PhabricatorSearchManagementNgramsWorkflow.php', 'PhabricatorSearchManagementNgramsWorkflow' => 'applications/search/management/PhabricatorSearchManagementNgramsWorkflow.php',
'PhabricatorSearchManagementQueryWorkflow' => 'applications/search/management/PhabricatorSearchManagementQueryWorkflow.php', 'PhabricatorSearchManagementQueryWorkflow' => 'applications/search/management/PhabricatorSearchManagementQueryWorkflow.php',
'PhabricatorSearchManagementWorkflow' => 'applications/search/management/PhabricatorSearchManagementWorkflow.php', 'PhabricatorSearchManagementWorkflow' => 'applications/search/management/PhabricatorSearchManagementWorkflow.php',
'PhabricatorSearchNgramEngine' => 'applications/search/engine/PhabricatorSearchNgramEngine.php',
'PhabricatorSearchNgrams' => 'applications/search/ngrams/PhabricatorSearchNgrams.php', 'PhabricatorSearchNgrams' => 'applications/search/ngrams/PhabricatorSearchNgrams.php',
'PhabricatorSearchNgramsDestructionEngineExtension' => 'applications/search/engineextension/PhabricatorSearchNgramsDestructionEngineExtension.php', 'PhabricatorSearchNgramsDestructionEngineExtension' => 'applications/search/engineextension/PhabricatorSearchNgramsDestructionEngineExtension.php',
'PhabricatorSearchOrderController' => 'applications/search/controller/PhabricatorSearchOrderController.php', 'PhabricatorSearchOrderController' => 'applications/search/controller/PhabricatorSearchOrderController.php',
@ -11417,6 +11418,7 @@ phutil_register_library_map(array(
'PhabricatorSearchManagementNgramsWorkflow' => 'PhabricatorSearchManagementWorkflow', 'PhabricatorSearchManagementNgramsWorkflow' => 'PhabricatorSearchManagementWorkflow',
'PhabricatorSearchManagementQueryWorkflow' => 'PhabricatorSearchManagementWorkflow', 'PhabricatorSearchManagementQueryWorkflow' => 'PhabricatorSearchManagementWorkflow',
'PhabricatorSearchManagementWorkflow' => 'PhabricatorManagementWorkflow', 'PhabricatorSearchManagementWorkflow' => 'PhabricatorManagementWorkflow',
'PhabricatorSearchNgramEngine' => 'Phobject',
'PhabricatorSearchNgrams' => 'PhabricatorSearchDAO', 'PhabricatorSearchNgrams' => 'PhabricatorSearchDAO',
'PhabricatorSearchNgramsDestructionEngineExtension' => 'PhabricatorDestructionEngineExtension', 'PhabricatorSearchNgramsDestructionEngineExtension' => 'PhabricatorDestructionEngineExtension',
'PhabricatorSearchOrderController' => 'PhabricatorSearchBaseController', 'PhabricatorSearchOrderController' => 'PhabricatorSearchBaseController',

View file

@ -0,0 +1,66 @@
<?php
/**
 * Splits strings into whitespace-delimited tokens and into three-character
 * ngrams ("trigrams") built from those tokens.
 */
final class PhabricatorSearchNgramEngine
  extends Phobject {

  /**
   * Split a string into whitespace-delimited tokens.
   *
   * Only literal space characters are trimmed from the ends of the input
   * before splitting, so an empty input (or input that begins or ends with
   * other whitespace) produces an empty-string token.
   *
   * @param string $value String to tokenize.
   * @return array List of tokens, in input order.
   */
  public function tokenizeNgramString($value) {
    $value = trim($value, ' ');

    $tokens = preg_split('/\s+/u', $value);
    if ($tokens === false) {
      // With the "u" modifier, preg_split() fails and returns false when the
      // subject is not valid UTF-8. Fall back to a byte-oriented split on
      // ASCII whitespace so callers always receive an array they can iterate.
      $tokens = preg_split('/[ \t\n\r\f\x0B]+/', $value);
    }

    return $tokens;
  }

  /**
   * Build ngrams for a string, treating each token as a whole term: every
   * token is padded with one space on each side before it is sliced.
   *
   * @param string $string String to extract ngrams from.
   * @return array Sorted list of unique ngrams.
   */
  public function getTermNgramsFromString($string) {
    return $this->getNgramsFromString($string, true);
  }

  /**
   * Build ngrams for a string, treating each token as a substring: tokens
   * are sliced as-is, with no padding.
   *
   * @param string $string String to extract ngrams from.
   * @return array Sorted list of unique ngrams.
   */
  public function getSubstringNgramsFromString($string) {
    return $this->getNgramsFromString($string, false);
  }

  /**
   * Lowercase a string, tokenize it, and slice every unique token into
   * runs of three consecutive elements of the vector returned by
   * `phutil_utf8v()` (presumably one element per UTF-8 character).
   *
   * @param string $value String to extract ngrams from.
   * @param bool $as_term If true, pad each token with a space on both sides.
   * @return array Unique ngrams as strings, ordered by `ksort()`.
   */
  private function getNgramsFromString($value, $as_term) {
    $value = phutil_utf8_strtolower($value);
    $tokens = $this->tokenizeNgramString($value);

    // First, extract unique tokens from the string. This reduces the number
    // of `phutil_utf8v()` calls we need to make if we are indexing a large
    // corpus with redundant terms.
    $unique_tokens = array();
    foreach ($tokens as $token) {
      if ($as_term) {
        $token = ' '.$token.' ';
      }
      $unique_tokens[$token] = true;
    }

    $ngrams = array();
    foreach ($unique_tokens as $token => $ignored) {
      // PHP casts array keys which look like decimal integers (for example,
      // "2020") to integers. Restore the string type before slicing.
      $token = (string)$token;

      $token_v = phutil_utf8v($token);
      $length = count($token_v);

      // NOTE: We're being somewhat clever here to micro-optimize performance,
      // especially for very long strings. See PHI87.

      // Record the byte length of each element so ngrams can be cut out of
      // the original token with substr() instead of being reassembled.
      $token_l = array();
      for ($ii = 0; $ii < $length; $ii++) {
        $token_l[$ii] = strlen($token_v[$ii]);
      }

      // Tokens with fewer than three elements produce no ngrams.
      $ngram_count = $length - 2;

      // Byte offset of the first element of the current ngram.
      $cursor = 0;
      for ($ii = 0; $ii < $ngram_count; $ii++) {
        $ngram_l = $token_l[$ii] + $token_l[$ii + 1] + $token_l[$ii + 2];

        $ngram = substr($token, $cursor, $ngram_l);
        $ngrams[$ngram] = $ngram;

        $cursor += $token_l[$ii];
      }
    }

    ksort($ngrams);

    // Return the stored values rather than the keys: the order is the same,
    // but all-digit ngrams like "123" stay strings instead of coming back as
    // integers because of array key casting.
    return array_values($ngrams);
  }

}

View file

@ -131,7 +131,8 @@ final class PhabricatorFerretFulltextEngineExtension
} }
$ngrams_source = implode("\n", $ngrams_source); $ngrams_source = implode("\n", $ngrams_source);
$ngrams = $engine->getTermNgramsFromString($ngrams_source); $ngram_engine = new PhabricatorSearchNgramEngine();
$ngrams = $ngram_engine->getTermNgramsFromString($ngrams_source);
$object->openTransaction(); $object->openTransaction();

View file

@ -62,66 +62,6 @@ abstract class PhabricatorFerretEngine extends Phobject {
return new PhutilSearchStemmer(); return new PhutilSearchStemmer();
} }
/**
 * Split a string into whitespace-delimited tokens.
 *
 * @param string $value String to tokenize.
 * @return array Result of `preg_split()` on the trimmed input.
 */
public function tokenizeString($value) {
// Only literal space characters are stripped from the ends of the input.
$value = trim($value, ' ');
// Split on runs of whitespace; the "u" modifier treats the subject as UTF-8.
$value = preg_split('/\s+/u', $value);
return $value;
}
/**
 * Build ngrams for a string in "term" mode: delegates to
 * `getNgramsFromString()` with `$as_term` set to true.
 *
 * @param string $string String to extract ngrams from.
 * @return array Sorted list of unique ngrams.
 */
public function getTermNgramsFromString($string) {
return $this->getNgramsFromString($string, true);
}
/**
 * Build ngrams for a string in "substring" mode: delegates to
 * `getNgramsFromString()` with `$as_term` set to false.
 *
 * @param string $string String to extract ngrams from.
 * @return array Sorted list of unique ngrams.
 */
public function getSubstringNgramsFromString($string) {
return $this->getNgramsFromString($string, false);
}
/**
 * Lowercase a string, tokenize it, and slice each unique token into runs of
 * three consecutive elements of the vector returned by `phutil_utf8v()`
 * (presumably one element per UTF-8 character).
 *
 * @param string $value String to extract ngrams from.
 * @param bool $as_term If true, pad each token with a space on both sides.
 * @return array Unique ngrams, ordered by `ksort()`.
 */
private function getNgramsFromString($value, $as_term) {
$value = phutil_utf8_strtolower($value);
$tokens = $this->tokenizeString($value);
// First, extract unique tokens from the string. This reduces the number
// of `phutil_utf8v()` calls we need to make if we are indexing a large
// corpus with redundant terms.
$unique_tokens = array();
foreach ($tokens as $token) {
if ($as_term) {
$token = ' '.$token.' ';
}
$unique_tokens[$token] = true;
}
$ngrams = array();
foreach ($unique_tokens as $token => $ignored) {
$token_v = phutil_utf8v($token);
$length = count($token_v);
// NOTE: We're being somewhat clever here to micro-optimize performance,
// especially for very long strings. See PHI87.
// Record the byte length of each element so ngrams can be cut directly
// out of the token with substr().
$token_l = array();
for ($ii = 0; $ii < $length; $ii++) {
$token_l[$ii] = strlen($token_v[$ii]);
}
// Tokens with fewer than three elements produce no ngrams.
$ngram_count = $length - 2;
// Byte offset of the first element of the current ngram.
$cursor = 0;
for ($ii = 0; $ii < $ngram_count; $ii++) {
$ngram_l = $token_l[$ii] + $token_l[$ii + 1] + $token_l[$ii + 2];
$ngram = substr($token, $cursor, $ngram_l);
// Keying by the ngram itself discards duplicates.
$ngrams[$ngram] = $ngram;
$cursor += $token_l[$ii];
}
}
ksort($ngrams);
return array_keys($ngrams);
}
public function newTermsCorpus($raw_corpus) { public function newTermsCorpus($raw_corpus) {
$term_corpus = strtr( $term_corpus = strtr(
$raw_corpus, $raw_corpus,

View file

@ -43,10 +43,10 @@ final class PhabricatorFerretEngineTestCase
), ),
); );
$engine = new ManiphestTaskFerretEngine(); $ngram_engine = new PhabricatorSearchNgramEngine();
foreach ($map as $input => $expect) { foreach ($map as $input => $expect) {
$actual = $engine->getTermNgramsFromString($input); $actual = $ngram_engine->getTermNgramsFromString($input);
$this->assertEqual( $this->assertEqual(
$actual, $actual,
$expect, $expect,

View file

@ -7,6 +7,7 @@ abstract class PhabricatorSearchNgrams
protected $ngram; protected $ngram;
private $value; private $value;
private $ngramEngine;
abstract public function getNgramKey(); abstract public function getNgramKey();
abstract public function getColumnName(); abstract public function getColumnName();
@ -44,41 +45,10 @@ abstract class PhabricatorSearchNgrams
return "{$application}_{$key}_ngrams"; return "{$application}_{$key}_ngrams";
} }
/**
 * Split a string into tokens on runs of literal space characters.
 *
 * @param string $value String to tokenize.
 * @return array Result of `preg_split()` on the trimmed input.
 */
final public function tokenizeString($value) {
// Strip leading and trailing spaces so they do not yield empty tokens.
$value = trim($value, ' ');
// Only the space character separates tokens; tabs and newlines do not.
$value = preg_split('/ +/', $value);
return $value;
}
/**
 * Tokenize a string and slice each lowercased token into three-byte ngrams.
 *
 * @param string $value String to extract ngrams from.
 * @param string $mode Either 'query' (tokens are sliced as-is) or 'index'
 *   (each token is padded with a space on both sides before slicing).
 * @return array Unique ngrams, ordered by `ksort()`.
 */
final public function getNgramsFromString($value, $mode) {
$tokens = $this->tokenizeString($value);
$ngrams = array();
foreach ($tokens as $token) {
$token = phutil_utf8_strtolower($token);
switch ($mode) {
case 'query':
break;
case 'index':
$token = ' '.$token.' ';
break;
}
// NOTE: strlen() and substr() count bytes, not characters, so a multibyte
// UTF-8 character can be split across ngram boundaries here.
$len = (strlen($token) - 2);
for ($ii = 0; $ii < $len; $ii++) {
$ngram = substr($token, $ii, 3);
// Keying by the ngram itself discards duplicates.
$ngrams[$ngram] = $ngram;
}
}
ksort($ngrams);
return array_keys($ngrams);
}
final public function writeNgram($object_id) { final public function writeNgram($object_id) {
$ngrams = $this->getNgramsFromString($this->getValue(), 'index'); $ngram_engine = $this->getNgramEngine();
$ngrams = $ngram_engine->getTermNgramsFromString($this->getValue());
$conn_w = $this->establishConnection('w'); $conn_w = $this->establishConnection('w');
$sql = array(); $sql = array();
@ -107,4 +77,12 @@ abstract class PhabricatorSearchNgrams
return $this; return $this;
} }
/**
 * Return the ngram engine for this object, constructing it on first use
 * and reusing the same instance on later calls.
 *
 * @return PhabricatorSearchNgramEngine
 */
private function getNgramEngine() {
  // Reuse the engine if an earlier call already built one.
  if ($this->ngramEngine) {
    return $this->ngramEngine;
  }

  $this->ngramEngine = new PhabricatorSearchNgramEngine();

  return $this->ngramEngine;
}
} }

View file

@ -36,6 +36,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
private $ferretTables = array(); private $ferretTables = array();
private $ferretQuery; private $ferretQuery;
private $ferretMetadata = array(); private $ferretMetadata = array();
private $ngramEngine;
const FULLTEXT_RANK = '_ft_rank'; const FULLTEXT_RANK = '_ft_rank';
const FULLTEXT_MODIFIED = '_ft_epochModified'; const FULLTEXT_MODIFIED = '_ft_epochModified';
@ -1984,6 +1985,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
$stemmer = $engine->newStemmer(); $stemmer = $engine->newStemmer();
$ngram_table = $engine->getNgramsTableName(); $ngram_table = $engine->getNgramsTableName();
$ngram_engine = $this->getNgramEngine();
$flat = array(); $flat = array();
foreach ($this->ferretTokens as $fulltext_token) { foreach ($this->ferretTokens as $fulltext_token) {
@ -2032,10 +2034,10 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
} }
if ($is_substring) { if ($is_substring) {
$ngrams = $engine->getSubstringNgramsFromString($value); $ngrams = $ngram_engine->getSubstringNgramsFromString($value);
} else { } else {
$terms_value = $engine->newTermsCorpus($value); $terms_value = $engine->newTermsCorpus($value);
$ngrams = $engine->getTermNgramsFromString($terms_value); $ngrams = $ngram_engine->getTermNgramsFromString($terms_value);
// If this is a stemmed term, only look for ngrams present in both the // If this is a stemmed term, only look for ngrams present in both the
// unstemmed and stemmed variations. // unstemmed and stemmed variations.
@ -2044,7 +2046,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
// is (or, at least, may be) a normal word and activates. // is (or, at least, may be) a normal word and activates.
$terms_value = trim($terms_value, ' '); $terms_value = trim($terms_value, ' ');
$stem_value = $stemmer->stemToken($terms_value); $stem_value = $stemmer->stemToken($terms_value);
$stem_ngrams = $engine->getTermNgramsFromString($stem_value); $stem_ngrams = $ngram_engine->getTermNgramsFromString($stem_value);
$ngrams = array_intersect($ngrams, $stem_ngrams); $ngrams = array_intersect($ngrams, $stem_ngrams);
} }
} }
@ -2409,6 +2411,8 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
protected function buildNgramsJoinClause(AphrontDatabaseConnection $conn) { protected function buildNgramsJoinClause(AphrontDatabaseConnection $conn) {
$ngram_engine = $this->getNgramEngine();
$flat = array(); $flat = array();
foreach ($this->ngrams as $spec) { foreach ($this->ngrams as $spec) {
$length = $spec['length']; $length = $spec['length'];
@ -2420,7 +2424,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
$index = $spec['index']; $index = $spec['index'];
$value = $spec['value']; $value = $spec['value'];
$ngrams = $index->getNgramsFromString($value, 'query'); $ngrams = $ngram_engine->getSubstringNgramsFromString($value);
foreach ($ngrams as $ngram) { foreach ($ngrams as $ngram) {
$flat[] = array( $flat[] = array(
@ -2476,6 +2480,8 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
protected function buildNgramsWhereClause(AphrontDatabaseConnection $conn) { protected function buildNgramsWhereClause(AphrontDatabaseConnection $conn) {
$where = array(); $where = array();
$ngram_engine = $this->getNgramEngine();
foreach ($this->ngrams as $ngram) { foreach ($this->ngrams as $ngram) {
$index = $ngram['index']; $index = $ngram['index'];
$value = $ngram['value']; $value = $ngram['value'];
@ -2488,7 +2494,8 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
$column = qsprintf($conn, '%T', $column); $column = qsprintf($conn, '%T', $column);
} }
$tokens = $index->tokenizeString($value); $tokens = $ngram_engine->tokenizeNgramString($value);
foreach ($tokens as $token) { foreach ($tokens as $token) {
$where[] = qsprintf( $where[] = qsprintf(
$conn, $conn,
@ -2506,6 +2513,14 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
return (bool)$this->ngrams; return (bool)$this->ngrams;
} }
/**
 * Return the ngram engine used by this query, building it the first time
 * it is requested and caching it on the query object afterwards.
 *
 * @return PhabricatorSearchNgramEngine
 */
private function getNgramEngine() {
  $engine = $this->ngramEngine;

  if (!$engine) {
    // Nothing cached yet: build the engine and remember it.
    $engine = new PhabricatorSearchNgramEngine();
    $this->ngramEngine = $engine;
  }

  return $engine;
}
/* -( Edge Logic )--------------------------------------------------------- */ /* -( Edge Logic )--------------------------------------------------------- */