1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2024-11-10 08:52:39 +01:00

Consolidate more Ferret engine code into FerretEngine

Summary: Ref T12819. Earlier I separated some ngram code into an "ngram engine" hoping to share it across the simple Ngrams stuff and the full Ferret stuff, but they actually use slightly different rules. Just pull more of this stuff into FerretEngine to reduce the number of moving pieces and the amount of code duplication.

Test Plan: Searched for terms, rebuilt indexes.

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T12819

Differential Revision: https://secure.phabricator.com/D18533
This commit is contained in:
epriestley 2017-09-05 07:54:22 -07:00
parent d5ba30c777
commit 4a7593f47f
6 changed files with 110 additions and 115 deletions

View file

@ -2834,6 +2834,7 @@ phutil_register_library_map(array(
'PhabricatorFeedStoryReference' => 'applications/feed/storage/PhabricatorFeedStoryReference.php',
'PhabricatorFerretDocument' => 'applications/search/ferret/PhabricatorFerretDocument.php',
'PhabricatorFerretEngine' => 'applications/search/ferret/PhabricatorFerretEngine.php',
'PhabricatorFerretEngineTestCase' => 'applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php',
'PhabricatorFerretField' => 'applications/search/ferret/PhabricatorFerretField.php',
'PhabricatorFerretFulltextEngineExtension' => 'applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php',
'PhabricatorFerretInterface' => 'applications/search/ferret/PhabricatorFerretInterface.php',
@ -3205,8 +3206,6 @@ phutil_register_library_map(array(
'PhabricatorNamedQueryQuery' => 'applications/search/query/PhabricatorNamedQueryQuery.php',
'PhabricatorNavigationRemarkupRule' => 'infrastructure/markup/rule/PhabricatorNavigationRemarkupRule.php',
'PhabricatorNeverTriggerClock' => 'infrastructure/daemon/workers/clock/PhabricatorNeverTriggerClock.php',
'PhabricatorNgramEngine' => 'applications/search/ngrams/PhabricatorNgramEngine.php',
'PhabricatorNgramEngineTestCase' => 'applications/search/ngrams/__tests__/PhabricatorNgramEngineTestCase.php',
'PhabricatorNgramsIndexEngineExtension' => 'applications/search/engineextension/PhabricatorNgramsIndexEngineExtension.php',
'PhabricatorNgramsInterface' => 'applications/search/interface/PhabricatorNgramsInterface.php',
'PhabricatorNotificationBuilder' => 'applications/notification/builder/PhabricatorNotificationBuilder.php',
@ -8166,6 +8165,7 @@ phutil_register_library_map(array(
'PhabricatorFeedStoryReference' => 'PhabricatorFeedDAO',
'PhabricatorFerretDocument' => 'PhabricatorSearchDAO',
'PhabricatorFerretEngine' => 'Phobject',
'PhabricatorFerretEngineTestCase' => 'PhabricatorTestCase',
'PhabricatorFerretField' => 'PhabricatorSearchDAO',
'PhabricatorFerretFulltextEngineExtension' => 'PhabricatorFulltextEngineExtension',
'PhabricatorFerretNgrams' => 'PhabricatorSearchDAO',
@ -8587,8 +8587,6 @@ phutil_register_library_map(array(
'PhabricatorNamedQueryQuery' => 'PhabricatorCursorPagedPolicyAwareQuery',
'PhabricatorNavigationRemarkupRule' => 'PhutilRemarkupRule',
'PhabricatorNeverTriggerClock' => 'PhabricatorTriggerClock',
'PhabricatorNgramEngine' => 'Phobject',
'PhabricatorNgramEngineTestCase' => 'PhabricatorTestCase',
'PhabricatorNgramsIndexEngineExtension' => 'PhabricatorIndexEngineExtension',
'PhabricatorNgramsInterface' => 'PhabricatorIndexableInterface',
'PhabricatorNotificationBuilder' => 'Phobject',

View file

@ -29,8 +29,7 @@ final class PhabricatorFerretFulltextEngineExtension
->setEpochCreated(0)
->setEpochModified(0);
$stemmer = new PhutilSearchStemmer();
$ngram_engine = id(new PhabricatorNgramEngine());
$stemmer = $engine->newStemmer();
// Copy all of the "title" and "body" fields to create new "core" fields.
// This allows users to search "in title or body" with the "core:" prefix.
@ -69,10 +68,10 @@ final class PhabricatorFerretFulltextEngineExtension
continue;
}
$term_corpus = $ngram_engine->newTermsCorpus($raw_corpus);
$term_corpus = $engine->newTermsCorpus($raw_corpus);
$normal_corpus = $stemmer->stemCorpus($raw_corpus);
$normal_coprus = $ngram_engine->newTermsCorpus($normal_corpus);
$normal_coprus = $engine->newTermsCorpus($normal_corpus);
if (!isset($ferret_corpus_map[$key])) {
$ferret_corpus_map[$key] = $empty_template;
@ -116,7 +115,7 @@ final class PhabricatorFerretFulltextEngineExtension
}
$ngrams_source = implode(' ', $ngrams_source);
$ngrams = $ngram_engine->getNgramsFromString($ngrams_source, 'index');
$ngrams = $engine->getNgramsFromString($ngrams_source, 'index');
$ferret_document->openTransaction();

View file

@ -6,4 +6,97 @@ abstract class PhabricatorFerretEngine extends Phobject {
abstract public function newDocumentObject();
abstract public function newFieldObject();
public function newStemmer() {
return new PhutilSearchStemmer();
}
public function tokenizeString($value) {
$value = trim($value, ' ');
$value = preg_split('/ +/', $value);
return $value;
}
public function getNgramsFromString($value, $mode) {
$tokens = $this->tokenizeString($value);
$ngrams = array();
foreach ($tokens as $token) {
$token = phutil_utf8_strtolower($token);
switch ($mode) {
case 'query':
break;
case 'index':
$token = ' '.$token.' ';
break;
case 'prefix':
$token = ' '.$token;
break;
}
$token_v = phutil_utf8v($token);
$len = (count($token_v) - 2);
for ($ii = 0; $ii < $len; $ii++) {
$ngram = array_slice($token_v, $ii, 3);
$ngram = implode('', $ngram);
$ngrams[$ngram] = $ngram;
}
}
ksort($ngrams);
return array_keys($ngrams);
}
public function newTermsCorpus($raw_corpus) {
$term_corpus = strtr(
$raw_corpus,
array(
'!' => ' ',
'"' => ' ',
'#' => ' ',
'$' => ' ',
'%' => ' ',
'&' => ' ',
'(' => ' ',
')' => ' ',
'*' => ' ',
'+' => ' ',
',' => ' ',
'-' => ' ',
'/' => ' ',
':' => ' ',
';' => ' ',
'<' => ' ',
'=' => ' ',
'>' => ' ',
'?' => ' ',
'@' => ' ',
'[' => ' ',
'\\' => ' ',
']' => ' ',
'^' => ' ',
'`' => ' ',
'{' => ' ',
'|' => ' ',
'}' => ' ',
'~' => ' ',
'.' => ' ',
'_' => ' ',
"\n" => ' ',
"\r" => ' ',
"\t" => ' ',
));
// NOTE: Single quotes divide terms only if they're at a word boundary.
// In contractions, like "whom'st've", the entire word is a single term.
$term_corpus = preg_replace('/(^| )[\']+/', ' ', $term_corpus);
$term_corpus = preg_replace('/[\']+( |$)/', ' ', $term_corpus);
$term_corpus = preg_replace('/\s+/u', ' ', $term_corpus);
$term_corpus = trim($term_corpus, ' ');
return $term_corpus;
}
}

View file

@ -1,6 +1,6 @@
<?php
final class PhabricatorNgramEngineTestCase
final class PhabricatorFerretEngineTestCase
extends PhabricatorTestCase {
public function testTermsCorpus() {
@ -12,7 +12,8 @@ final class PhabricatorNgramEngineTestCase
'http example org path to file jpg',
);
$engine = new PhabricatorNgramEngine();
$engine = new ManiphestTaskFerretEngine();
foreach ($map as $input => $expect) {
$actual = $engine->newTermsCorpus($input);

View file

@ -1,95 +0,0 @@
<?php
final class PhabricatorNgramEngine extends Phobject {
public function tokenizeString($value) {
$value = trim($value, ' ');
$value = preg_split('/ +/', $value);
return $value;
}
public function getNgramsFromString($value, $mode) {
$tokens = $this->tokenizeString($value);
$ngrams = array();
foreach ($tokens as $token) {
$token = phutil_utf8_strtolower($token);
switch ($mode) {
case 'query':
break;
case 'index':
$token = ' '.$token.' ';
break;
case 'prefix':
$token = ' '.$token;
break;
}
$token_v = phutil_utf8v($token);
$len = (count($token_v) - 2);
for ($ii = 0; $ii < $len; $ii++) {
$ngram = array_slice($token_v, $ii, 3);
$ngram = implode('', $ngram);
$ngrams[$ngram] = $ngram;
}
}
ksort($ngrams);
return array_keys($ngrams);
}
public function newTermsCorpus($raw_corpus) {
$term_corpus = strtr(
$raw_corpus,
array(
'!' => ' ',
'"' => ' ',
'#' => ' ',
'$' => ' ',
'%' => ' ',
'&' => ' ',
'(' => ' ',
')' => ' ',
'*' => ' ',
'+' => ' ',
',' => ' ',
'-' => ' ',
'/' => ' ',
':' => ' ',
';' => ' ',
'<' => ' ',
'=' => ' ',
'>' => ' ',
'?' => ' ',
'@' => ' ',
'[' => ' ',
'\\' => ' ',
']' => ' ',
'^' => ' ',
'`' => ' ',
'{' => ' ',
'|' => ' ',
'}' => ' ',
'~' => ' ',
'.' => ' ',
'_' => ' ',
"\n" => ' ',
"\r" => ' ',
"\t" => ' ',
));
// NOTE: Single quotes divide terms only if they're at a word boundary.
// In contractions, like "whom'st've", the entire word is a single term.
$term_corpus = preg_replace('/(^| )[\']+/', ' ', $term_corpus);
$term_corpus = preg_replace('/[\']+( |$)/', ' ', $term_corpus);
$term_corpus = preg_replace('/\s+/u', ' ', $term_corpus);
$term_corpus = trim($term_corpus, ' ');
return $term_corpus;
}
}

View file

@ -1453,8 +1453,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
$op_not = PhutilSearchQueryCompiler::OPERATOR_NOT;
$engine = $this->ferretEngine;
$ngram_engine = new PhabricatorNgramEngine();
$stemmer = new PhutilSearchStemmer();
$stemmer = $engine->newStemmer();
$ngram_table = $engine->newNgramsObject();
$ngram_table_name = $ngram_table->getTableName();
@ -1498,15 +1497,15 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
}
if ($is_substring) {
$ngrams = $ngram_engine->getNgramsFromString($value, 'query');
$ngrams = $engine->getNgramsFromString($value, 'query');
} else {
$ngrams = $ngram_engine->getNgramsFromString($value, 'index');
$ngrams = $engine->getNgramsFromString($value, 'index');
// If this is a stemmed term, only look for ngrams present in both the
// unstemmed and stemmed variations.
if ($is_stemmed) {
$stem_value = $stemmer->stemToken($value);
$stem_ngrams = $ngram_engine->getNgramsFromString(
$stem_ngrams = $engine->getNgramsFromString(
$stem_value,
'index');
@ -1587,8 +1586,8 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
return array();
}
$ngram_engine = new PhabricatorNgramEngine();
$stemmer = new PhutilSearchStemmer();
$engine = $this->ferretEngine;
$stemmer = $engine->newStemmer();
$table_map = $this->ferretTables;
$op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING;
@ -1653,7 +1652,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
$term_constraints = array();
$term_value = ' '.$ngram_engine->newTermsCorpus($value).' ';
$term_value = ' '.$engine->newTermsCorpus($value).' ';
if ($is_not) {
$term_constraints[] = qsprintf(
$conn,
@ -1670,7 +1669,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
if ($is_stemmed) {
$stem_value = $stemmer->stemToken($value);
$stem_value = $ngram_engine->newTermsCorpus($stem_value);
$stem_value = $engine->newTermsCorpus($stem_value);
$stem_value = ' '.$stem_value.' ';
$term_constraints[] = qsprintf(