From 46abc11114c36fc92496f933f6f77e4a2753589e Mon Sep 17 00:00:00 2001 From: epriestley Date: Tue, 5 Sep 2017 09:38:03 -0700 Subject: [PATCH] Reduce the number of magic strings in the Ferret implementation Summary: Ref T12819. Push more of the magic `' '` stuff into the engine and simplify calls to ngram construction. Also fixes a bug where a task with title "apple banana" and description "cherry doughnut" could match query "banana cherry" by separating separate term segments with newlines instead of spaces. Test Plan: - Indexed some objects. - Searched (term, substring, quoted terms). - Viewed index in database. Reviewers: chad Reviewed By: chad Maniphest Tasks: T12819 Differential Revision: https://secure.phabricator.com/D18534 --- ...abricatorFerretFulltextEngineExtension.php | 16 ++++++------ .../search/ferret/PhabricatorFerretEngine.php | 25 +++++++++++-------- .../PhabricatorFerretEngineTestCase.php | 8 +++--- ...PhabricatorCursorPagedPolicyAwareQuery.php | 12 +++------ 4 files changed, 31 insertions(+), 30 deletions(-) diff --git a/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php b/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php index 766bc4e4c8..8d1b6070e6 100644 --- a/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php +++ b/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php @@ -71,7 +71,7 @@ final class PhabricatorFerretFulltextEngineExtension $term_corpus = $engine->newTermsCorpus($raw_corpus); $normal_corpus = $stemmer->stemCorpus($raw_corpus); - $normal_coprus = $engine->newTermsCorpus($normal_corpus); + $normal_corpus = $engine->newTermsCorpus($normal_corpus); if (!isset($ferret_corpus_map[$key])) { $ferret_corpus_map[$key] = $empty_template; @@ -91,20 +91,20 @@ final class PhabricatorFerretFulltextEngineExtension foreach ($ferret_corpus_map as $key => $fields) { $raw_corpus = $fields['raw']; $raw_corpus = implode("\n", $raw_corpus); - $ngrams_source[] = $raw_corpus; + if (strlen($raw_corpus)) { + $ngrams_source[] = $raw_corpus; + } $normal_corpus = $fields['normal']; - $normal_corpus = implode(' ', $normal_corpus); + $normal_corpus = implode("\n", $normal_corpus); if (strlen($normal_corpus)) { $ngrams_source[] = $normal_corpus; - $normal_corpus = ' '.$normal_corpus.' '; } $term_corpus = $fields['term']; - $term_corpus = implode(' ', $term_corpus); + $term_corpus = implode("\n", $term_corpus); if (strlen($term_corpus)) { $ngrams_source[] = $term_corpus; - $term_corpus = ' '.$term_corpus.' '; } $ferret_fields[] = $engine->newFieldObject() @@ -113,9 +113,9 @@ final class PhabricatorFerretFulltextEngineExtension ->setTermCorpus($term_corpus) ->setNormalCorpus($normal_corpus); } - $ngrams_source = implode(' ', $ngrams_source); + $ngrams_source = implode("\n", $ngrams_source); - $ngrams = $engine->getNgramsFromString($ngrams_source, 'index'); + $ngrams = $engine->getTermNgramsFromString($ngrams_source); $ferret_document->openTransaction(); diff --git a/src/applications/search/ferret/PhabricatorFerretEngine.php b/src/applications/search/ferret/PhabricatorFerretEngine.php index 3811e16289..5f6470ca20 100644 --- a/src/applications/search/ferret/PhabricatorFerretEngine.php +++ b/src/applications/search/ferret/PhabricatorFerretEngine.php @@ -16,22 +16,23 @@ abstract class PhabricatorFerretEngine extends Phobject { return $value; } - public function getNgramsFromString($value, $mode) { + public function getTermNgramsFromString($string) { + return $this->getNgramsFromString($string, true); + } + + public function getSubstringNgramsFromString($string) { + return $this->getNgramsFromString($string, false); + } + + private function getNgramsFromString($value, $as_term) { $tokens = $this->tokenizeString($value); $ngrams = array(); foreach ($tokens as $token) { $token = phutil_utf8_strtolower($token); - switch ($mode) { - case 'query': - break; - case 'index': - $token = ' '.$token.' '; - break; - case 'prefix': - $token = ' '.$token; - break; + if ($as_term) { + $token = ' '.$token.' '; } $token_v = phutil_utf8v($token); @@ -96,6 +97,10 @@ abstract class PhabricatorFerretEngine extends Phobject { $term_corpus = preg_replace('/\s+/u', ' ', $term_corpus); $term_corpus = trim($term_corpus, ' '); + if (strlen($term_corpus)) { + $term_corpus = ' '.$term_corpus.' '; + } + return $term_corpus; } diff --git a/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php b/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php index b8f1043ab2..a8690f1d8b 100644 --- a/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php +++ b/src/applications/search/ferret/__tests__/PhabricatorFerretEngineTestCase.php @@ -5,11 +5,11 @@ final class PhabricatorFerretEngineTestCase public function testTermsCorpus() { $map = array( - 'Hear ye, hear ye!' => 'Hear ye hear ye', - "Thou whom'st've art worthy." => "Thou whom'st've art worthy", - 'Guaranteed to contain "food".' => 'Guaranteed to contain food', + 'Hear ye, hear ye!' => ' Hear ye hear ye ', + "Thou whom'st've art worthy." => " Thou whom'st've art worthy ", + 'Guaranteed to contain "food".' => ' Guaranteed to contain food ', 'http://example.org/path/to/file.jpg' => - 'http example org path to file jpg', + ' http example org path to file jpg ', ); $engine = new ManiphestTaskFerretEngine(); diff --git a/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php b/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php index ccf78de5df..3fdfb33bfe 100644 --- a/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php +++ b/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php @@ -1497,18 +1497,15 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery } if ($is_substring) { - $ngrams = $engine->getNgramsFromString($value, 'query'); + $ngrams = $engine->getSubstringNgramsFromString($value); } else { - $ngrams = $engine->getNgramsFromString($value, 'index'); + $ngrams = $engine->getTermNgramsFromString($value); // If this is a stemmed term, only look for ngrams present in both the // unstemmed and stemmed variations. if ($is_stemmed) { $stem_value = $stemmer->stemToken($value); - $stem_ngrams = $engine->getNgramsFromString( - $stem_value, - 'index'); - + $stem_ngrams = $engine->getTermNgramsFromString($stem_value); $ngrams = array_intersect($ngrams, $stem_ngrams); } } @@ -1652,7 +1649,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery $term_constraints = array(); - $term_value = ' '.$engine->newTermsCorpus($value).' '; + $term_value = $engine->newTermsCorpus($value); if ($is_not) { $term_constraints[] = qsprintf( $conn, @@ -1670,7 +1667,6 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery if ($is_stemmed) { $stem_value = $stemmer->stemToken($value); $stem_value = $engine->newTermsCorpus($stem_value); - $stem_value = ' '.$stem_value.' '; $term_constraints[] = qsprintf( $conn,