From 0e2e525bb41a39af857c55232af54f2dcb1fb1c1 Mon Sep 17 00:00:00 2001 From: epriestley Date: Wed, 30 Aug 2017 07:32:18 -0700 Subject: [PATCH] Add a "terms" corpus to Ferret fields Summary: Ref T12819. Ferret currently does substring search, but this is not the default mode users expect: when you search for the "RICO" act, you do not expect to find documents containing "apRICOt" even though "RICO" is a substring. To support term search, index the corpus as a list of terms with punctuation removed and whitespace normalized so the engine can match against it. Test Plan: Ran `storage upgrade`, ran `search index`, saw sensible database results: ``` rawCorpus: This is the task description. Hark! Whom'st'dve eaten this "food" shall surely ~perish~?? #blessed normalCorpus: thi the task descript hark whom dve eaten food shall sure perish bless termCorpus: This is the task description Hark Whom'st'dve eaten this food shall surely perish blessed ``` Reviewers: chad Reviewed By: chad Maniphest Tasks: T12819 Differential Revision: https://secure.phabricator.com/D18498 --- .../autopatches/20170830.ferret.02.term.sql | 2 + src/__phutil_library_map__.php | 2 + ...abricatorFerretFulltextEngineExtension.php | 15 +++++- .../search/ferret/PhabricatorFerretField.php | 2 + .../search/ngrams/PhabricatorNgramEngine.php | 52 +++++++++++++++++++ .../PhabricatorNgramEngineTestCase.php | 26 ++++++++++ 6 files changed, 97 insertions(+), 2 deletions(-) create mode 100644 resources/sql/autopatches/20170830.ferret.02.term.sql create mode 100644 src/applications/search/ngrams/__tests__/PhabricatorNgramEngineTestCase.php diff --git a/resources/sql/autopatches/20170830.ferret.02.term.sql b/resources/sql/autopatches/20170830.ferret.02.term.sql new file mode 100644 index 0000000000..81a619d85d --- /dev/null +++ b/resources/sql/autopatches/20170830.ferret.02.term.sql @@ -0,0 +1,2 @@ +ALTER TABLE {$NAMESPACE}_maniphest.maniphest_task_ffield + ADD termCorpus LONGTEXT NOT NULL COLLATE {$COLLATE_SORT}; 
diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php index 3edb31640b..ff30ddae98 100644 --- a/src/__phutil_library_map__.php +++ b/src/__phutil_library_map__.php @@ -3206,6 +3206,7 @@ phutil_register_library_map(array( 'PhabricatorNavigationRemarkupRule' => 'infrastructure/markup/rule/PhabricatorNavigationRemarkupRule.php', 'PhabricatorNeverTriggerClock' => 'infrastructure/daemon/workers/clock/PhabricatorNeverTriggerClock.php', 'PhabricatorNgramEngine' => 'applications/search/ngrams/PhabricatorNgramEngine.php', + 'PhabricatorNgramEngineTestCase' => 'applications/search/ngrams/__tests__/PhabricatorNgramEngineTestCase.php', 'PhabricatorNgramsIndexEngineExtension' => 'applications/search/engineextension/PhabricatorNgramsIndexEngineExtension.php', 'PhabricatorNgramsInterface' => 'applications/search/interface/PhabricatorNgramsInterface.php', 'PhabricatorNotificationBuilder' => 'applications/notification/builder/PhabricatorNotificationBuilder.php', @@ -8587,6 +8588,7 @@ phutil_register_library_map(array( 'PhabricatorNavigationRemarkupRule' => 'PhutilRemarkupRule', 'PhabricatorNeverTriggerClock' => 'PhabricatorTriggerClock', 'PhabricatorNgramEngine' => 'Phobject', + 'PhabricatorNgramEngineTestCase' => 'PhabricatorTestCase', 'PhabricatorNgramsIndexEngineExtension' => 'PhabricatorIndexEngineExtension', 'PhabricatorNgramsInterface' => 'PhabricatorIndexableInterface', 'PhabricatorNotificationBuilder' => 'Phobject', diff --git a/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php b/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php index bafeca2c81..6eb97e2b7c 100644 --- a/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php +++ b/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php @@ -30,11 +30,13 @@ final class PhabricatorFerretFulltextEngineExtension ->setEpochModified(0); $stemmer = new PhutilSearchStemmer(); + 
$ngram_engine = id(new PhabricatorNgramEngine()); $key_all = PhabricatorSearchDocumentFieldType::FIELD_ALL; $empty_template = array( 'raw' => array(), + 'term' => array(), 'normal' => array(), ); @@ -49,15 +51,18 @@ final class PhabricatorFerretFulltextEngineExtension } $normal_corpus = $stemmer->stemCorpus($raw_corpus); + $term_corpus = $ngram_engine->newTermsCorpus($raw_corpus); if (!isset($ferret_corpus_map[$key])) { $ferret_corpus_map[$key] = $empty_template; } $ferret_corpus_map[$key]['raw'][] = $raw_corpus; + $ferret_corpus_map[$key]['term'][] = $term_corpus; $ferret_corpus_map[$key]['normal'][] = $normal_corpus; $ferret_corpus_map[$key_all]['raw'][] = $raw_corpus; + $ferret_corpus_map[$key_all]['term'][] = $term_corpus; $ferret_corpus_map[$key_all]['normal'][] = $normal_corpus; } @@ -69,17 +74,23 @@ final class PhabricatorFerretFulltextEngineExtension $normal_corpus = $fields['normal']; $normal_corpus = implode("\n", $normal_corpus); + $term_corpus = $fields['term']; + $term_corpus = implode(' ', $term_corpus); + if (strlen($term_corpus)) { + $term_corpus = ' '.$term_corpus.' 
'; + } + $ferret_fields[] = $engine->newFieldObject() ->setFieldKey($key) ->setRawCorpus($raw_corpus) + ->setTermCorpus($term_corpus) ->setNormalCorpus($normal_corpus); } $ngrams_source = $ferret_corpus_map[$key_all]['raw']; $ngrams_source = implode("\n", $ngrams_source); - $ngrams = id(new PhabricatorNgramEngine()) - ->getNgramsFromString($ngrams_source, 'index'); + $ngrams = $ngram_engine->getNgramsFromString($ngrams_source, 'index'); $ferret_document->openTransaction(); diff --git a/src/applications/search/ferret/PhabricatorFerretField.php b/src/applications/search/ferret/PhabricatorFerretField.php index cd7e7c68d5..be39e745ed 100644 --- a/src/applications/search/ferret/PhabricatorFerretField.php +++ b/src/applications/search/ferret/PhabricatorFerretField.php @@ -6,6 +6,7 @@ abstract class PhabricatorFerretField protected $documentID; protected $fieldKey; protected $rawCorpus; + protected $termCorpus; protected $normalCorpus; abstract public function getIndexKey(); @@ -17,6 +18,7 @@ abstract class PhabricatorFerretField 'documentID' => 'uint32', 'fieldKey' => 'text4', 'rawCorpus' => 'sort', + 'termCorpus' => 'sort', 'normalCorpus' => 'sort', ), self::CONFIG_KEY_SCHEMA => array( diff --git a/src/applications/search/ngrams/PhabricatorNgramEngine.php b/src/applications/search/ngrams/PhabricatorNgramEngine.php index 87abdfc446..f8f55d8757 100644 --- a/src/applications/search/ngrams/PhabricatorNgramEngine.php +++ b/src/applications/search/ngrams/PhabricatorNgramEngine.php @@ -40,4 +40,56 @@ final class PhabricatorNgramEngine extends Phobject { return array_keys($ngrams); } + public function newTermsCorpus($raw_corpus) { + $term_corpus = strtr( + $raw_corpus, + array( + '!' => ' ', + '"' => ' ', + '#' => ' ', + '$' => ' ', + '%' => ' ', + '&' => ' ', + '(' => ' ', + ')' => ' ', + '*' => ' ', + '+' => ' ', + ',' => ' ', + '-' => ' ', + '/' => ' ', + ':' => ' ', + ';' => ' ', + '<' => ' ', + '=' => ' ', + '>' => ' ', + '?' 
=> ' ', + '@' => ' ', + '[' => ' ', + '\\' => ' ', + ']' => ' ', + '^' => ' ', + '`' => ' ', + '{' => ' ', + '|' => ' ', + '}' => ' ', + '~' => ' ', + '.' => ' ', + '_' => ' ', + "\n" => ' ', + "\r" => ' ', + "\t" => ' ', + )); + + // NOTE: Single quotes divide terms only if they're at a word boundary. + // In contractions, like "whom'st've", the entire word is a single term. + $term_corpus = preg_replace('/(^| )[\']+/', ' ', $term_corpus); + $term_corpus = preg_replace('/[\']+( |$)/', ' ', $term_corpus); + + $term_corpus = preg_replace('/\s+/u', ' ', $term_corpus); + $term_corpus = trim($term_corpus, ' '); + + return $term_corpus; + } + + } diff --git a/src/applications/search/ngrams/__tests__/PhabricatorNgramEngineTestCase.php b/src/applications/search/ngrams/__tests__/PhabricatorNgramEngineTestCase.php new file mode 100644 index 0000000000..fccb6fb324 --- /dev/null +++ b/src/applications/search/ngrams/__tests__/PhabricatorNgramEngineTestCase.php @@ -0,0 +1,26 @@ + 'Hear ye hear ye', + "Thou whom'st've art worthy." => "Thou whom'st've art worthy", + 'Guaranteed to contain "food".' => 'Guaranteed to contain food', + 'http://example.org/path/to/file.jpg' => + 'http example org path to file jpg', + ); + + $engine = new PhabricatorNgramEngine(); + foreach ($map as $input => $expect) { + $actual = $engine->newTermsCorpus($input); + + $this->assertEqual( + $expect, + $actual, + pht('Terms corpus for: %s', $input)); + } + } + +}