mirror of
https://we.phorge.it/source/phorge.git
synced 2024-11-26 00:32:42 +01:00
Consolidate more Ferret engine code into FerretEngine
Summary: Ref T12819. Earlier I separated some ngram code into an "ngram engine" hoping to share it across the simple Ngrams stuff and the full Ferret stuff, but they actually use slightly different rules. Just pull more of this stuff into FerretEngine to reduce the number of moving pieces and the amount of code duplication. Test Plan: Searched for terms, rebuilt indexes. Reviewers: chad Reviewed By: chad Maniphest Tasks: T12819 Differential Revision: https://secure.phabricator.com/D18533
This commit is contained in:
parent
d5ba30c777
commit
4a7593f47f
6 changed files with 110 additions and 115 deletions
|
@ -2834,6 +2834,7 @@ phutil_register_library_map(array(
|
|||
'PhabricatorFeedStoryReference' => 'applications/feed/storage/PhabricatorFeedStoryReference.php',
|
||||
'PhabricatorFerretDocument' => 'applications/search/ferret/PhabricatorFerretDocument.php',
|
||||
'PhabricatorFerretEngine' => 'applications/search/ferret/PhabricatorFerretEngine.php',
|
||||
'PhabricatorFerretEngineTestCase' => 'applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php',
|
||||
'PhabricatorFerretField' => 'applications/search/ferret/PhabricatorFerretField.php',
|
||||
'PhabricatorFerretFulltextEngineExtension' => 'applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php',
|
||||
'PhabricatorFerretInterface' => 'applications/search/ferret/PhabricatorFerretInterface.php',
|
||||
|
@ -3205,8 +3206,6 @@ phutil_register_library_map(array(
|
|||
'PhabricatorNamedQueryQuery' => 'applications/search/query/PhabricatorNamedQueryQuery.php',
|
||||
'PhabricatorNavigationRemarkupRule' => 'infrastructure/markup/rule/PhabricatorNavigationRemarkupRule.php',
|
||||
'PhabricatorNeverTriggerClock' => 'infrastructure/daemon/workers/clock/PhabricatorNeverTriggerClock.php',
|
||||
'PhabricatorNgramEngine' => 'applications/search/ngrams/PhabricatorNgramEngine.php',
|
||||
'PhabricatorNgramEngineTestCase' => 'applications/search/ngrams/__tests__/PhabricatorNgramEngineTestCase.php',
|
||||
'PhabricatorNgramsIndexEngineExtension' => 'applications/search/engineextension/PhabricatorNgramsIndexEngineExtension.php',
|
||||
'PhabricatorNgramsInterface' => 'applications/search/interface/PhabricatorNgramsInterface.php',
|
||||
'PhabricatorNotificationBuilder' => 'applications/notification/builder/PhabricatorNotificationBuilder.php',
|
||||
|
@ -8166,6 +8165,7 @@ phutil_register_library_map(array(
|
|||
'PhabricatorFeedStoryReference' => 'PhabricatorFeedDAO',
|
||||
'PhabricatorFerretDocument' => 'PhabricatorSearchDAO',
|
||||
'PhabricatorFerretEngine' => 'Phobject',
|
||||
'PhabricatorFerretEngineTestCase' => 'PhabricatorTestCase',
|
||||
'PhabricatorFerretField' => 'PhabricatorSearchDAO',
|
||||
'PhabricatorFerretFulltextEngineExtension' => 'PhabricatorFulltextEngineExtension',
|
||||
'PhabricatorFerretNgrams' => 'PhabricatorSearchDAO',
|
||||
|
@ -8587,8 +8587,6 @@ phutil_register_library_map(array(
|
|||
'PhabricatorNamedQueryQuery' => 'PhabricatorCursorPagedPolicyAwareQuery',
|
||||
'PhabricatorNavigationRemarkupRule' => 'PhutilRemarkupRule',
|
||||
'PhabricatorNeverTriggerClock' => 'PhabricatorTriggerClock',
|
||||
'PhabricatorNgramEngine' => 'Phobject',
|
||||
'PhabricatorNgramEngineTestCase' => 'PhabricatorTestCase',
|
||||
'PhabricatorNgramsIndexEngineExtension' => 'PhabricatorIndexEngineExtension',
|
||||
'PhabricatorNgramsInterface' => 'PhabricatorIndexableInterface',
|
||||
'PhabricatorNotificationBuilder' => 'Phobject',
|
||||
|
|
|
@ -29,8 +29,7 @@ final class PhabricatorFerretFulltextEngineExtension
|
|||
->setEpochCreated(0)
|
||||
->setEpochModified(0);
|
||||
|
||||
$stemmer = new PhutilSearchStemmer();
|
||||
$ngram_engine = id(new PhabricatorNgramEngine());
|
||||
$stemmer = $engine->newStemmer();
|
||||
|
||||
// Copy all of the "title" and "body" fields to create new "core" fields.
|
||||
// This allows users to search "in title or body" with the "core:" prefix.
|
||||
|
@ -69,10 +68,10 @@ final class PhabricatorFerretFulltextEngineExtension
|
|||
continue;
|
||||
}
|
||||
|
||||
$term_corpus = $ngram_engine->newTermsCorpus($raw_corpus);
|
||||
$term_corpus = $engine->newTermsCorpus($raw_corpus);
|
||||
|
||||
$normal_corpus = $stemmer->stemCorpus($raw_corpus);
|
||||
$normal_coprus = $ngram_engine->newTermsCorpus($normal_corpus);
|
||||
$normal_coprus = $engine->newTermsCorpus($normal_corpus);
|
||||
|
||||
if (!isset($ferret_corpus_map[$key])) {
|
||||
$ferret_corpus_map[$key] = $empty_template;
|
||||
|
@ -116,7 +115,7 @@ final class PhabricatorFerretFulltextEngineExtension
|
|||
}
|
||||
$ngrams_source = implode(' ', $ngrams_source);
|
||||
|
||||
$ngrams = $ngram_engine->getNgramsFromString($ngrams_source, 'index');
|
||||
$ngrams = $engine->getNgramsFromString($ngrams_source, 'index');
|
||||
|
||||
$ferret_document->openTransaction();
|
||||
|
||||
|
|
|
@ -6,4 +6,97 @@ abstract class PhabricatorFerretEngine extends Phobject {
|
|||
abstract public function newDocumentObject();
|
||||
abstract public function newFieldObject();
|
||||
|
||||
public function newStemmer() {
|
||||
return new PhutilSearchStemmer();
|
||||
}
|
||||
|
||||
public function tokenizeString($value) {
|
||||
$value = trim($value, ' ');
|
||||
$value = preg_split('/ +/', $value);
|
||||
return $value;
|
||||
}
|
||||
|
||||
public function getNgramsFromString($value, $mode) {
|
||||
$tokens = $this->tokenizeString($value);
|
||||
|
||||
$ngrams = array();
|
||||
foreach ($tokens as $token) {
|
||||
$token = phutil_utf8_strtolower($token);
|
||||
|
||||
switch ($mode) {
|
||||
case 'query':
|
||||
break;
|
||||
case 'index':
|
||||
$token = ' '.$token.' ';
|
||||
break;
|
||||
case 'prefix':
|
||||
$token = ' '.$token;
|
||||
break;
|
||||
}
|
||||
|
||||
$token_v = phutil_utf8v($token);
|
||||
$len = (count($token_v) - 2);
|
||||
for ($ii = 0; $ii < $len; $ii++) {
|
||||
$ngram = array_slice($token_v, $ii, 3);
|
||||
$ngram = implode('', $ngram);
|
||||
$ngrams[$ngram] = $ngram;
|
||||
}
|
||||
}
|
||||
|
||||
ksort($ngrams);
|
||||
|
||||
return array_keys($ngrams);
|
||||
}
|
||||
|
||||
public function newTermsCorpus($raw_corpus) {
|
||||
$term_corpus = strtr(
|
||||
$raw_corpus,
|
||||
array(
|
||||
'!' => ' ',
|
||||
'"' => ' ',
|
||||
'#' => ' ',
|
||||
'$' => ' ',
|
||||
'%' => ' ',
|
||||
'&' => ' ',
|
||||
'(' => ' ',
|
||||
')' => ' ',
|
||||
'*' => ' ',
|
||||
'+' => ' ',
|
||||
',' => ' ',
|
||||
'-' => ' ',
|
||||
'/' => ' ',
|
||||
':' => ' ',
|
||||
';' => ' ',
|
||||
'<' => ' ',
|
||||
'=' => ' ',
|
||||
'>' => ' ',
|
||||
'?' => ' ',
|
||||
'@' => ' ',
|
||||
'[' => ' ',
|
||||
'\\' => ' ',
|
||||
']' => ' ',
|
||||
'^' => ' ',
|
||||
'`' => ' ',
|
||||
'{' => ' ',
|
||||
'|' => ' ',
|
||||
'}' => ' ',
|
||||
'~' => ' ',
|
||||
'.' => ' ',
|
||||
'_' => ' ',
|
||||
"\n" => ' ',
|
||||
"\r" => ' ',
|
||||
"\t" => ' ',
|
||||
));
|
||||
|
||||
// NOTE: Single quotes divide terms only if they're at a word boundary.
|
||||
// In contractions, like "whom'st've", the entire word is a single term.
|
||||
$term_corpus = preg_replace('/(^| )[\']+/', ' ', $term_corpus);
|
||||
$term_corpus = preg_replace('/[\']+( |$)/', ' ', $term_corpus);
|
||||
|
||||
$term_corpus = preg_replace('/\s+/u', ' ', $term_corpus);
|
||||
$term_corpus = trim($term_corpus, ' ');
|
||||
|
||||
return $term_corpus;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
<?php
|
||||
|
||||
final class PhabricatorNgramEngineTestCase
|
||||
final class PhabricatorFerretEngineTestCase
|
||||
extends PhabricatorTestCase {
|
||||
|
||||
public function testTermsCorpus() {
|
||||
|
@ -12,7 +12,8 @@ final class PhabricatorNgramEngineTestCase
|
|||
'http example org path to file jpg',
|
||||
);
|
||||
|
||||
$engine = new PhabricatorNgramEngine();
|
||||
$engine = new ManiphestTaskFerretEngine();
|
||||
|
||||
foreach ($map as $input => $expect) {
|
||||
$actual = $engine->newTermsCorpus($input);
|
||||
|
|
@ -1,95 +0,0 @@
|
|||
<?php
|
||||
|
||||
final class PhabricatorNgramEngine extends Phobject {
|
||||
|
||||
public function tokenizeString($value) {
|
||||
$value = trim($value, ' ');
|
||||
$value = preg_split('/ +/', $value);
|
||||
return $value;
|
||||
}
|
||||
|
||||
public function getNgramsFromString($value, $mode) {
|
||||
$tokens = $this->tokenizeString($value);
|
||||
|
||||
$ngrams = array();
|
||||
foreach ($tokens as $token) {
|
||||
$token = phutil_utf8_strtolower($token);
|
||||
|
||||
switch ($mode) {
|
||||
case 'query':
|
||||
break;
|
||||
case 'index':
|
||||
$token = ' '.$token.' ';
|
||||
break;
|
||||
case 'prefix':
|
||||
$token = ' '.$token;
|
||||
break;
|
||||
}
|
||||
|
||||
$token_v = phutil_utf8v($token);
|
||||
$len = (count($token_v) - 2);
|
||||
for ($ii = 0; $ii < $len; $ii++) {
|
||||
$ngram = array_slice($token_v, $ii, 3);
|
||||
$ngram = implode('', $ngram);
|
||||
$ngrams[$ngram] = $ngram;
|
||||
}
|
||||
}
|
||||
|
||||
ksort($ngrams);
|
||||
|
||||
return array_keys($ngrams);
|
||||
}
|
||||
|
||||
public function newTermsCorpus($raw_corpus) {
|
||||
$term_corpus = strtr(
|
||||
$raw_corpus,
|
||||
array(
|
||||
'!' => ' ',
|
||||
'"' => ' ',
|
||||
'#' => ' ',
|
||||
'$' => ' ',
|
||||
'%' => ' ',
|
||||
'&' => ' ',
|
||||
'(' => ' ',
|
||||
')' => ' ',
|
||||
'*' => ' ',
|
||||
'+' => ' ',
|
||||
',' => ' ',
|
||||
'-' => ' ',
|
||||
'/' => ' ',
|
||||
':' => ' ',
|
||||
';' => ' ',
|
||||
'<' => ' ',
|
||||
'=' => ' ',
|
||||
'>' => ' ',
|
||||
'?' => ' ',
|
||||
'@' => ' ',
|
||||
'[' => ' ',
|
||||
'\\' => ' ',
|
||||
']' => ' ',
|
||||
'^' => ' ',
|
||||
'`' => ' ',
|
||||
'{' => ' ',
|
||||
'|' => ' ',
|
||||
'}' => ' ',
|
||||
'~' => ' ',
|
||||
'.' => ' ',
|
||||
'_' => ' ',
|
||||
"\n" => ' ',
|
||||
"\r" => ' ',
|
||||
"\t" => ' ',
|
||||
));
|
||||
|
||||
// NOTE: Single quotes divide terms only if they're at a word boundary.
|
||||
// In contractions, like "whom'st've", the entire word is a single term.
|
||||
$term_corpus = preg_replace('/(^| )[\']+/', ' ', $term_corpus);
|
||||
$term_corpus = preg_replace('/[\']+( |$)/', ' ', $term_corpus);
|
||||
|
||||
$term_corpus = preg_replace('/\s+/u', ' ', $term_corpus);
|
||||
$term_corpus = trim($term_corpus, ' ');
|
||||
|
||||
return $term_corpus;
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -1453,8 +1453,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
|||
$op_not = PhutilSearchQueryCompiler::OPERATOR_NOT;
|
||||
|
||||
$engine = $this->ferretEngine;
|
||||
$ngram_engine = new PhabricatorNgramEngine();
|
||||
$stemmer = new PhutilSearchStemmer();
|
||||
$stemmer = $engine->newStemmer();
|
||||
|
||||
$ngram_table = $engine->newNgramsObject();
|
||||
$ngram_table_name = $ngram_table->getTableName();
|
||||
|
@ -1498,15 +1497,15 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
|||
}
|
||||
|
||||
if ($is_substring) {
|
||||
$ngrams = $ngram_engine->getNgramsFromString($value, 'query');
|
||||
$ngrams = $engine->getNgramsFromString($value, 'query');
|
||||
} else {
|
||||
$ngrams = $ngram_engine->getNgramsFromString($value, 'index');
|
||||
$ngrams = $engine->getNgramsFromString($value, 'index');
|
||||
|
||||
// If this is a stemmed term, only look for ngrams present in both the
|
||||
// unstemmed and stemmed variations.
|
||||
if ($is_stemmed) {
|
||||
$stem_value = $stemmer->stemToken($value);
|
||||
$stem_ngrams = $ngram_engine->getNgramsFromString(
|
||||
$stem_ngrams = $engine->getNgramsFromString(
|
||||
$stem_value,
|
||||
'index');
|
||||
|
||||
|
@ -1587,8 +1586,8 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
|||
return array();
|
||||
}
|
||||
|
||||
$ngram_engine = new PhabricatorNgramEngine();
|
||||
$stemmer = new PhutilSearchStemmer();
|
||||
$engine = $this->ferretEngine;
|
||||
$stemmer = $engine->newStemmer();
|
||||
$table_map = $this->ferretTables;
|
||||
|
||||
$op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING;
|
||||
|
@ -1653,7 +1652,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
|||
|
||||
$term_constraints = array();
|
||||
|
||||
$term_value = ' '.$ngram_engine->newTermsCorpus($value).' ';
|
||||
$term_value = ' '.$engine->newTermsCorpus($value).' ';
|
||||
if ($is_not) {
|
||||
$term_constraints[] = qsprintf(
|
||||
$conn,
|
||||
|
@ -1670,7 +1669,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
|||
|
||||
if ($is_stemmed) {
|
||||
$stem_value = $stemmer->stemToken($value);
|
||||
$stem_value = $ngram_engine->newTermsCorpus($stem_value);
|
||||
$stem_value = $engine->newTermsCorpus($stem_value);
|
||||
$stem_value = ' '.$stem_value.' ';
|
||||
|
||||
$term_constraints[] = qsprintf(
|
||||
|
|
Loading…
Reference in a new issue