From 4a7593f47fc76590056928a578be2af4cc7fb994 Mon Sep 17 00:00:00 2001 From: epriestley Date: Tue, 5 Sep 2017 07:54:22 -0700 Subject: [PATCH] Consolidate more Ferret engine code into FerretEngine Summary: Ref T12819. Earlier I separated some ngram code into an "ngram engine" hoping to share it across the simple Ngrams stuff and the full Ferret stuff, but they actually use slightly different rules. Just pull more of this stuff into FerretEngine to reduce the number of moving pieces and the amount of code duplication. Test Plan: Searched for terms, rebuilt indexes. Reviewers: chad Reviewed By: chad Maniphest Tasks: T12819 Differential Revision: https://secure.phabricator.com/D18533 --- src/__phutil_library_map__.php | 6 +- ...abricatorFerretFulltextEngineExtension.php | 9 +- .../search/ferret/PhabricatorFerretEngine.php | 93 ++++++++++++++++++ .../PhabricatorFerretEngineTestCase.php} | 5 +- .../search/ngrams/PhabricatorNgramEngine.php | 95 ------------------- ...PhabricatorCursorPagedPolicyAwareQuery.php | 17 ++-- 6 files changed, 110 insertions(+), 115 deletions(-) rename src/applications/search/{ngrams/__tests__/PhabricatorNgramEngineTestCase.php => ferret/__tests__/PhabricatorFerretEngineTestCase.php} (86%) delete mode 100644 src/applications/search/ngrams/PhabricatorNgramEngine.php diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php index ff30ddae98..4ce066e802 100644 --- a/src/__phutil_library_map__.php +++ b/src/__phutil_library_map__.php @@ -2834,6 +2834,7 @@ phutil_register_library_map(array( 'PhabricatorFeedStoryReference' => 'applications/feed/storage/PhabricatorFeedStoryReference.php', 'PhabricatorFerretDocument' => 'applications/search/ferret/PhabricatorFerretDocument.php', 'PhabricatorFerretEngine' => 'applications/search/ferret/PhabricatorFerretEngine.php', + 'PhabricatorFerretEngineTestCase' => 'applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php', 'PhabricatorFerretField' => 'applications/search/ferret/PhabricatorFerretField.php', 'PhabricatorFerretFulltextEngineExtension' => 'applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php', 'PhabricatorFerretInterface' => 'applications/search/ferret/PhabricatorFerretInterface.php', @@ -3205,8 +3206,6 @@ phutil_register_library_map(array( 'PhabricatorNamedQueryQuery' => 'applications/search/query/PhabricatorNamedQueryQuery.php', 'PhabricatorNavigationRemarkupRule' => 'infrastructure/markup/rule/PhabricatorNavigationRemarkupRule.php', 'PhabricatorNeverTriggerClock' => 'infrastructure/daemon/workers/clock/PhabricatorNeverTriggerClock.php', - 'PhabricatorNgramEngine' => 'applications/search/ngrams/PhabricatorNgramEngine.php', - 'PhabricatorNgramEngineTestCase' => 'applications/search/ngrams/__tests__/PhabricatorNgramEngineTestCase.php', 'PhabricatorNgramsIndexEngineExtension' => 'applications/search/engineextension/PhabricatorNgramsIndexEngineExtension.php', 'PhabricatorNgramsInterface' => 'applications/search/interface/PhabricatorNgramsInterface.php', 'PhabricatorNotificationBuilder' => 'applications/notification/builder/PhabricatorNotificationBuilder.php', @@ -8166,6 +8165,7 @@ phutil_register_library_map(array( 'PhabricatorFeedStoryReference' => 'PhabricatorFeedDAO', 'PhabricatorFerretDocument' => 'PhabricatorSearchDAO', 'PhabricatorFerretEngine' => 'Phobject', + 'PhabricatorFerretEngineTestCase' => 'PhabricatorTestCase', 'PhabricatorFerretField' => 'PhabricatorSearchDAO', 'PhabricatorFerretFulltextEngineExtension' => 'PhabricatorFulltextEngineExtension', 'PhabricatorFerretNgrams' => 'PhabricatorSearchDAO', @@ -8587,8 +8587,6 @@ phutil_register_library_map(array( 'PhabricatorNamedQueryQuery' => 'PhabricatorCursorPagedPolicyAwareQuery', 'PhabricatorNavigationRemarkupRule' => 'PhutilRemarkupRule', 'PhabricatorNeverTriggerClock' => 'PhabricatorTriggerClock', - 'PhabricatorNgramEngine' => 'Phobject', - 'PhabricatorNgramEngineTestCase' => 'PhabricatorTestCase', 'PhabricatorNgramsIndexEngineExtension' => 'PhabricatorIndexEngineExtension', 'PhabricatorNgramsInterface' => 'PhabricatorIndexableInterface', 'PhabricatorNotificationBuilder' => 'Phobject', diff --git a/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php b/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php index 1c9efeba9d..766bc4e4c8 100644 --- a/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php +++ b/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php @@ -29,8 +29,7 @@ final class PhabricatorFerretFulltextEngineExtension ->setEpochCreated(0) ->setEpochModified(0); - $stemmer = new PhutilSearchStemmer(); - $ngram_engine = id(new PhabricatorNgramEngine()); + $stemmer = $engine->newStemmer(); // Copy all of the "title" and "body" fields to create new "core" fields. // This allows users to search "in title or body" with the "core:" prefix. @@ -69,10 +68,10 @@ final class PhabricatorFerretFulltextEngineExtension continue; } - $term_corpus = $ngram_engine->newTermsCorpus($raw_corpus); + $term_corpus = $engine->newTermsCorpus($raw_corpus); $normal_corpus = $stemmer->stemCorpus($raw_corpus); - $normal_coprus = $ngram_engine->newTermsCorpus($normal_corpus); + $normal_coprus = $engine->newTermsCorpus($normal_corpus); if (!isset($ferret_corpus_map[$key])) { $ferret_corpus_map[$key] = $empty_template; @@ -116,7 +115,7 @@ final class PhabricatorFerretFulltextEngineExtension } $ngrams_source = implode(' ', $ngrams_source); - $ngrams = $ngram_engine->getNgramsFromString($ngrams_source, 'index'); + $ngrams = $engine->getNgramsFromString($ngrams_source, 'index'); $ferret_document->openTransaction(); diff --git a/src/applications/search/ferret/PhabricatorFerretEngine.php b/src/applications/search/ferret/PhabricatorFerretEngine.php index e816ef3f59..3811e16289 100644 --- a/src/applications/search/ferret/PhabricatorFerretEngine.php +++ b/src/applications/search/ferret/PhabricatorFerretEngine.php @@ -6,4 +6,97 @@ abstract class PhabricatorFerretEngine extends Phobject { abstract public function newDocumentObject(); abstract public function newFieldObject(); + public function newStemmer() { + return new PhutilSearchStemmer(); + } + + public function tokenizeString($value) { + $value = trim($value, ' '); + $value = preg_split('/ +/', $value); + return $value; + } + + public function getNgramsFromString($value, $mode) { + $tokens = $this->tokenizeString($value); + + $ngrams = array(); + foreach ($tokens as $token) { + $token = phutil_utf8_strtolower($token); + + switch ($mode) { + case 'query': + break; + case 'index': + $token = ' '.$token.' '; + break; + case 'prefix': + $token = ' '.$token; + break; + } + + $token_v = phutil_utf8v($token); + $len = (count($token_v) - 2); + for ($ii = 0; $ii < $len; $ii++) { + $ngram = array_slice($token_v, $ii, 3); + $ngram = implode('', $ngram); + $ngrams[$ngram] = $ngram; + } + } + + ksort($ngrams); + + return array_keys($ngrams); + } + + public function newTermsCorpus($raw_corpus) { + $term_corpus = strtr( + $raw_corpus, + array( + '!' => ' ', + '"' => ' ', + '#' => ' ', + '$' => ' ', + '%' => ' ', + '&' => ' ', + '(' => ' ', + ')' => ' ', + '*' => ' ', + '+' => ' ', + ',' => ' ', + '-' => ' ', + '/' => ' ', + ':' => ' ', + ';' => ' ', + '<' => ' ', + '=' => ' ', + '>' => ' ', + '?' => ' ', + '@' => ' ', + '[' => ' ', + '\\' => ' ', + ']' => ' ', + '^' => ' ', + '`' => ' ', + '{' => ' ', + '|' => ' ', + '}' => ' ', + '~' => ' ', + '.' => ' ', + '_' => ' ', + "\n" => ' ', + "\r" => ' ', + "\t" => ' ', + )); + + // NOTE: Single quotes divide terms only if they're at a word boundary. + // In contractions, like "whom'st've", the entire word is a single term. + $term_corpus = preg_replace('/(^| )[\']+/', ' ', $term_corpus); + $term_corpus = preg_replace('/[\']+( |$)/', ' ', $term_corpus); + + $term_corpus = preg_replace('/\s+/u', ' ', $term_corpus); + $term_corpus = trim($term_corpus, ' '); + + return $term_corpus; + } + } diff --git a/src/applications/search/ngrams/__tests__/PhabricatorNgramEngineTestCase.php b/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php similarity index 86% rename from src/applications/search/ngrams/__tests__/PhabricatorNgramEngineTestCase.php rename to src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php index fccb6fb324..b8f1043ab2 100644 --- a/src/applications/search/ngrams/__tests__/PhabricatorNgramEngineTestCase.php +++ b/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php @@ -1,6 +1,6 @@ $expect) { $actual = $engine->newTermsCorpus($input); diff --git a/src/applications/search/ngrams/PhabricatorNgramEngine.php b/src/applications/search/ngrams/PhabricatorNgramEngine.php deleted file mode 100644 index f8f55d8757..0000000000 --- a/src/applications/search/ngrams/PhabricatorNgramEngine.php +++ /dev/null @@ -1,95 +0,0 @@ -tokenizeString($value); - - $ngrams = array(); - foreach ($tokens as $token) { - $token = phutil_utf8_strtolower($token); - - switch ($mode) { - case 'query': - break; - case 'index': - $token = ' '.$token.' '; - break; - case 'prefix': - $token = ' '.$token; - break; - } - - $token_v = phutil_utf8v($token); - $len = (count($token_v) - 2); - for ($ii = 0; $ii < $len; $ii++) { - $ngram = array_slice($token_v, $ii, 3); - $ngram = implode('', $ngram); - $ngrams[$ngram] = $ngram; - } - } - - ksort($ngrams); - - return array_keys($ngrams); - } - - public function newTermsCorpus($raw_corpus) { - $term_corpus = strtr( - $raw_corpus, - array( - '!' => ' ', - '"' => ' ', - '#' => ' ', - '$' => ' ', - '%' => ' ', - '&' => ' ', - '(' => ' ', - ')' => ' ', - '*' => ' ', - '+' => ' ', - ',' => ' ', - '-' => ' ', - '/' => ' ', - ':' => ' ', - ';' => ' ', - '<' => ' ', - '=' => ' ', - '>' => ' ', - '?' => ' ', - '@' => ' ', - '[' => ' ', - '\\' => ' ', - ']' => ' ', - '^' => ' ', - '`' => ' ', - '{' => ' ', - '|' => ' ', - '}' => ' ', - '~' => ' ', - '.' => ' ', - '_' => ' ', - "\n" => ' ', - "\r" => ' ', - "\t" => ' ', - )); - - // NOTE: Single quotes divide terms only if they're at a word boundary. - // In contractions, like "whom'st've", the entire word is a single term. - $term_corpus = preg_replace('/(^| )[\']+/', ' ', $term_corpus); - $term_corpus = preg_replace('/[\']+( |$)/', ' ', $term_corpus); - - $term_corpus = preg_replace('/\s+/u', ' ', $term_corpus); - $term_corpus = trim($term_corpus, ' '); - - return $term_corpus; - } - - -} diff --git a/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php b/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php index 03d56bc1d2..ccf78de5df 100644 --- a/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php +++ b/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php @@ -1453,8 +1453,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery $op_not = PhutilSearchQueryCompiler::OPERATOR_NOT; $engine = $this->ferretEngine; - $ngram_engine = new PhabricatorNgramEngine(); - $stemmer = new PhutilSearchStemmer(); + $stemmer = $engine->newStemmer(); $ngram_table = $engine->newNgramsObject(); $ngram_table_name = $ngram_table->getTableName(); @@ -1498,15 +1497,15 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery } if ($is_substring) { - $ngrams = $ngram_engine->getNgramsFromString($value, 'query'); + $ngrams = $engine->getNgramsFromString($value, 'query'); } else { - $ngrams = $ngram_engine->getNgramsFromString($value, 'index'); + $ngrams = $engine->getNgramsFromString($value, 'index'); // If this is a stemmed term, only look for ngrams present in both the // unstemmed and stemmed variations. if ($is_stemmed) { $stem_value = $stemmer->stemToken($value); - $stem_ngrams = $ngram_engine->getNgramsFromString( + $stem_ngrams = $engine->getNgramsFromString( $stem_value, 'index'); @@ -1587,8 +1586,8 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery return array(); } - $ngram_engine = new PhabricatorNgramEngine(); - $stemmer = new PhutilSearchStemmer(); + $engine = $this->ferretEngine; + $stemmer = $engine->newStemmer(); $table_map = $this->ferretTables; $op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING; @@ -1653,7 +1652,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery $term_constraints = array(); - $term_value = ' '.$ngram_engine->newTermsCorpus($value).' '; + $term_value = ' '.$engine->newTermsCorpus($value).' '; if ($is_not) { $term_constraints[] = qsprintf( $conn, @@ -1670,7 +1669,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery if ($is_stemmed) { $stem_value = $stemmer->stemToken($value); - $stem_value = $ngram_engine->newTermsCorpus($stem_value); + $stem_value = $engine->newTermsCorpus($stem_value); $stem_value = ' '.$stem_value.' '; $term_constraints[] = qsprintf(