diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php index ff30ddae98..4ce066e802 100644 --- a/src/__phutil_library_map__.php +++ b/src/__phutil_library_map__.php @@ -2834,6 +2834,7 @@ phutil_register_library_map(array( 'PhabricatorFeedStoryReference' => 'applications/feed/storage/PhabricatorFeedStoryReference.php', 'PhabricatorFerretDocument' => 'applications/search/ferret/PhabricatorFerretDocument.php', 'PhabricatorFerretEngine' => 'applications/search/ferret/PhabricatorFerretEngine.php', + 'PhabricatorFerretEngineTestCase' => 'applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php', 'PhabricatorFerretField' => 'applications/search/ferret/PhabricatorFerretField.php', 'PhabricatorFerretFulltextEngineExtension' => 'applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php', 'PhabricatorFerretInterface' => 'applications/search/ferret/PhabricatorFerretInterface.php', @@ -3205,8 +3206,6 @@ phutil_register_library_map(array( 'PhabricatorNamedQueryQuery' => 'applications/search/query/PhabricatorNamedQueryQuery.php', 'PhabricatorNavigationRemarkupRule' => 'infrastructure/markup/rule/PhabricatorNavigationRemarkupRule.php', 'PhabricatorNeverTriggerClock' => 'infrastructure/daemon/workers/clock/PhabricatorNeverTriggerClock.php', - 'PhabricatorNgramEngine' => 'applications/search/ngrams/PhabricatorNgramEngine.php', - 'PhabricatorNgramEngineTestCase' => 'applications/search/ngrams/__tests__/PhabricatorNgramEngineTestCase.php', 'PhabricatorNgramsIndexEngineExtension' => 'applications/search/engineextension/PhabricatorNgramsIndexEngineExtension.php', 'PhabricatorNgramsInterface' => 'applications/search/interface/PhabricatorNgramsInterface.php', 'PhabricatorNotificationBuilder' => 'applications/notification/builder/PhabricatorNotificationBuilder.php', @@ -8166,6 +8165,7 @@ phutil_register_library_map(array( 'PhabricatorFeedStoryReference' => 'PhabricatorFeedDAO', 'PhabricatorFerretDocument' => 'PhabricatorSearchDAO', 'PhabricatorFerretEngine' => 'Phobject', + 'PhabricatorFerretEngineTestCase' => 'PhabricatorTestCase', 'PhabricatorFerretField' => 'PhabricatorSearchDAO', 'PhabricatorFerretFulltextEngineExtension' => 'PhabricatorFulltextEngineExtension', 'PhabricatorFerretNgrams' => 'PhabricatorSearchDAO', @@ -8587,8 +8587,6 @@ phutil_register_library_map(array( 'PhabricatorNamedQueryQuery' => 'PhabricatorCursorPagedPolicyAwareQuery', 'PhabricatorNavigationRemarkupRule' => 'PhutilRemarkupRule', 'PhabricatorNeverTriggerClock' => 'PhabricatorTriggerClock', - 'PhabricatorNgramEngine' => 'Phobject', - 'PhabricatorNgramEngineTestCase' => 'PhabricatorTestCase', 'PhabricatorNgramsIndexEngineExtension' => 'PhabricatorIndexEngineExtension', 'PhabricatorNgramsInterface' => 'PhabricatorIndexableInterface', 'PhabricatorNotificationBuilder' => 'Phobject', diff --git a/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php b/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php index 1c9efeba9d..766bc4e4c8 100644 --- a/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php +++ b/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php @@ -29,8 +29,7 @@ final class PhabricatorFerretFulltextEngineExtension ->setEpochCreated(0) ->setEpochModified(0); - $stemmer = new PhutilSearchStemmer(); - $ngram_engine = id(new PhabricatorNgramEngine()); + $stemmer = $engine->newStemmer(); // Copy all of the "title" and "body" fields to create new "core" fields. // This allows users to search "in title or body" with the "core:" prefix. @@ -69,10 +68,10 @@ final class PhabricatorFerretFulltextEngineExtension continue; } - $term_corpus = $ngram_engine->newTermsCorpus($raw_corpus); + $term_corpus = $engine->newTermsCorpus($raw_corpus); $normal_corpus = $stemmer->stemCorpus($raw_corpus); - $normal_coprus = $ngram_engine->newTermsCorpus($normal_corpus); + $normal_coprus = $engine->newTermsCorpus($normal_corpus); if (!isset($ferret_corpus_map[$key])) { $ferret_corpus_map[$key] = $empty_template; @@ -116,7 +115,7 @@ final class PhabricatorFerretFulltextEngineExtension } $ngrams_source = implode(' ', $ngrams_source); - $ngrams = $ngram_engine->getNgramsFromString($ngrams_source, 'index'); + $ngrams = $engine->getNgramsFromString($ngrams_source, 'index'); $ferret_document->openTransaction(); diff --git a/src/applications/search/ferret/PhabricatorFerretEngine.php b/src/applications/search/ferret/PhabricatorFerretEngine.php index e816ef3f59..3811e16289 100644 --- a/src/applications/search/ferret/PhabricatorFerretEngine.php +++ b/src/applications/search/ferret/PhabricatorFerretEngine.php @@ -6,4 +6,97 @@ abstract class PhabricatorFerretEngine extends Phobject { abstract public function newDocumentObject(); abstract public function newFieldObject(); + public function newStemmer() { + return new PhutilSearchStemmer(); + } + + public function tokenizeString($value) { + $value = trim($value, ' '); + $value = preg_split('/ +/', $value); + return $value; + } + + public function getNgramsFromString($value, $mode) { + $tokens = $this->tokenizeString($value); + + $ngrams = array(); + foreach ($tokens as $token) { + $token = phutil_utf8_strtolower($token); + + switch ($mode) { + case 'query': + break; + case 'index': + $token = ' '.$token.' '; + break; + case 'prefix': + $token = ' '.$token; + break; + } + + $token_v = phutil_utf8v($token); + $len = (count($token_v) - 2); + for ($ii = 0; $ii < $len; $ii++) { + $ngram = array_slice($token_v, $ii, 3); + $ngram = implode('', $ngram); + $ngrams[$ngram] = $ngram; + } + } + + ksort($ngrams); + + return array_keys($ngrams); + } + + public function newTermsCorpus($raw_corpus) { + $term_corpus = strtr( + $raw_corpus, + array( + '!' => ' ', + '"' => ' ', + '#' => ' ', + '$' => ' ', + '%' => ' ', + '&' => ' ', + '(' => ' ', + ')' => ' ', + '*' => ' ', + '+' => ' ', + ',' => ' ', + '-' => ' ', + '/' => ' ', + ':' => ' ', + ';' => ' ', + '<' => ' ', + '=' => ' ', + '>' => ' ', + '?' => ' ', + '@' => ' ', + '[' => ' ', + '\\' => ' ', + ']' => ' ', + '^' => ' ', + '`' => ' ', + '{' => ' ', + '|' => ' ', + '}' => ' ', + '~' => ' ', + '.' => ' ', + '_' => ' ', + "\n" => ' ', + "\r" => ' ', + "\t" => ' ', + )); + + // NOTE: Single quotes divide terms only if they're at a word boundary. + // In contractions, like "whom'st've", the entire word is a single term. + $term_corpus = preg_replace('/(^| )[\']+/', ' ', $term_corpus); + $term_corpus = preg_replace('/[\']+( |$)/', ' ', $term_corpus); + + $term_corpus = preg_replace('/\s+/u', ' ', $term_corpus); + $term_corpus = trim($term_corpus, ' '); + + return $term_corpus; + } + } diff --git a/src/applications/search/ngrams/__tests__/PhabricatorNgramEngineTestCase.php b/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php similarity index 86% rename from src/applications/search/ngrams/__tests__/PhabricatorNgramEngineTestCase.php rename to src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php index fccb6fb324..b8f1043ab2 100644 --- a/src/applications/search/ngrams/__tests__/PhabricatorNgramEngineTestCase.php +++ b/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php @@ -1,6 +1,6 @@ $expect) { $actual = $engine->newTermsCorpus($input); diff --git a/src/applications/search/ngrams/PhabricatorNgramEngine.php b/src/applications/search/ngrams/PhabricatorNgramEngine.php deleted file mode 100644 index f8f55d8757..0000000000 --- a/src/applications/search/ngrams/PhabricatorNgramEngine.php +++ /dev/null @@ -1,95 +0,0 @@ -tokenizeString($value); - - $ngrams = array(); - foreach ($tokens as $token) { - $token = phutil_utf8_strtolower($token); - - switch ($mode) { - case 'query': - break; - case 'index': - $token = ' '.$token.' '; - break; - case 'prefix': - $token = ' '.$token; - break; - } - - $token_v = phutil_utf8v($token); - $len = (count($token_v) - 2); - for ($ii = 0; $ii < $len; $ii++) { - $ngram = array_slice($token_v, $ii, 3); - $ngram = implode('', $ngram); - $ngrams[$ngram] = $ngram; - } - } - - ksort($ngrams); - - return array_keys($ngrams); - } - - public function newTermsCorpus($raw_corpus) { - $term_corpus = strtr( - $raw_corpus, - array( - '!' => ' ', - '"' => ' ', - '#' => ' ', - '$' => ' ', - '%' => ' ', - '&' => ' ', - '(' => ' ', - ')' => ' ', - '*' => ' ', - '+' => ' ', - ',' => ' ', - '-' => ' ', - '/' => ' ', - ':' => ' ', - ';' => ' ', - '<' => ' ', - '=' => ' ', - '>' => ' ', - '?' => ' ', - '@' => ' ', - '[' => ' ', - '\\' => ' ', - ']' => ' ', - '^' => ' ', - '`' => ' ', - '{' => ' ', - '|' => ' ', - '}' => ' ', - '~' => ' ', - '.' => ' ', - '_' => ' ', - "\n" => ' ', - "\r" => ' ', - "\t" => ' ', - )); - - // NOTE: Single quotes divide terms only if they're at a word boundary. - // In contractions, like "whom'st've", the entire word is a single term. - $term_corpus = preg_replace('/(^| )[\']+/', ' ', $term_corpus); - $term_corpus = preg_replace('/[\']+( |$)/', ' ', $term_corpus); - - $term_corpus = preg_replace('/\s+/u', ' ', $term_corpus); - $term_corpus = trim($term_corpus, ' '); - - return $term_corpus; - } - - -} diff --git a/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php b/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php index 03d56bc1d2..ccf78de5df 100644 --- a/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php +++ b/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php @@ -1453,8 +1453,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery $op_not = PhutilSearchQueryCompiler::OPERATOR_NOT; $engine = $this->ferretEngine; - $ngram_engine = new PhabricatorNgramEngine(); - $stemmer = new PhutilSearchStemmer(); + $stemmer = $engine->newStemmer(); $ngram_table = $engine->newNgramsObject(); $ngram_table_name = $ngram_table->getTableName(); @@ -1498,15 +1497,15 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery } if ($is_substring) { - $ngrams = $ngram_engine->getNgramsFromString($value, 'query'); + $ngrams = $engine->getNgramsFromString($value, 'query'); } else { - $ngrams = $ngram_engine->getNgramsFromString($value, 'index'); + $ngrams = $engine->getNgramsFromString($value, 'index'); // If this is a stemmed term, only look for ngrams present in both the // unstemmed and stemmed variations. if ($is_stemmed) { $stem_value = $stemmer->stemToken($value); - $stem_ngrams = $ngram_engine->getNgramsFromString( + $stem_ngrams = $engine->getNgramsFromString( $stem_value, 'index'); @@ -1587,8 +1586,8 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery return array(); } - $ngram_engine = new PhabricatorNgramEngine(); - $stemmer = new PhutilSearchStemmer(); + $engine = $this->ferretEngine; + $stemmer = $engine->newStemmer(); $table_map = $this->ferretTables; $op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING; @@ -1653,7 +1652,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery $term_constraints = array(); - $term_value = ' '.$ngram_engine->newTermsCorpus($value).' '; + $term_value = ' '.$engine->newTermsCorpus($value).' '; if ($is_not) { $term_constraints[] = qsprintf( $conn, @@ -1670,7 +1669,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery if ($is_stemmed) { $stem_value = $stemmer->stemToken($value); - $stem_value = $ngram_engine->newTermsCorpus($stem_value); + $stem_value = $engine->newTermsCorpus($stem_value); $stem_value = ' '.$stem_value.' '; $term_constraints[] = qsprintf(