From 77ef38f9a87f342721538bf4c4fabdd26b7d854b Mon Sep 17 00:00:00 2001 From: epriestley Date: Wed, 30 Aug 2017 07:04:16 -0700 Subject: [PATCH] Aggregate corpus data in Ferret field rows Summary: Ref T12819. This addresses two issues: - One practical issue is that right now, if you search for "dog cat", and they appear in different fields (for example, "dog" appears ONLY in the title, while "cat" appears ONLY in a comment) we won't find the document. This is somewhat rare -- usually, if "dog" appears in the title, it's also repeated in the description -- but I think it is clearly a bug. To attack this, start automatically creating a virtual "ALL" field with the full document text which we'll use as the primary thing we match against. - For fields which may occur more than once -- today, only comments -- aggregate them all into one big "all of the text" row instead of writing one row per comment. This partly addresses the first point (currently, "dog" in one comment and "cat" in a different comment won't be found) and partly makes some of the query gymnastics easier. Test Plan: Ran `bin/storage upgrade`, ran `bin/search index `, saw sensible corpus values in the database: ``` mysql> select * from maniphest_task_ffield\G *************************** 1. row *************************** id: 3 documentID: 1981 fieldKey: full rawCorpus: This is the task title This is the task description. normalCorpus: thi the task titl thi the task descript *************************** 2. row *************************** id: 4 documentID: 1981 fieldKey: titl rawCorpus: This is the task title normalCorpus: thi the task titl *************************** 3. row *************************** id: 5 documentID: 1981 fieldKey: body rawCorpus: This is the task description. 
normalCorpus: thi the task descript 3 rows in set (0.00 sec) ``` Reviewers: chad Reviewed By: chad Maniphest Tasks: T12819 Differential Revision: https://secure.phabricator.com/D18497 --- .../autopatches/20170830.ferret.01.unique.sql | 4 ++ .../PhabricatorSearchDocumentFieldType.php | 1 + ...abricatorFerretFulltextEngineExtension.php | 39 ++++++++++++++++--- .../search/ferret/PhabricatorFerretField.php | 3 +- 4 files changed, 40 insertions(+), 7 deletions(-) create mode 100644 resources/sql/autopatches/20170830.ferret.01.unique.sql diff --git a/resources/sql/autopatches/20170830.ferret.01.unique.sql b/resources/sql/autopatches/20170830.ferret.01.unique.sql new file mode 100644 index 0000000000..f76c5050e8 --- /dev/null +++ b/resources/sql/autopatches/20170830.ferret.01.unique.sql @@ -0,0 +1,4 @@ +TRUNCATE TABLE {$NAMESPACE}_maniphest.maniphest_task_ffield; + +ALTER TABLE {$NAMESPACE}_maniphest.maniphest_task_ffield + ADD UNIQUE KEY `key_documentfield` (documentID, fieldKey); diff --git a/src/applications/search/constants/PhabricatorSearchDocumentFieldType.php b/src/applications/search/constants/PhabricatorSearchDocumentFieldType.php index 10dbf0ca65..4dd49e8c92 100644 --- a/src/applications/search/constants/PhabricatorSearchDocumentFieldType.php +++ b/src/applications/search/constants/PhabricatorSearchDocumentFieldType.php @@ -5,5 +5,6 @@ final class PhabricatorSearchDocumentFieldType extends Phobject { const FIELD_TITLE = 'titl'; const FIELD_BODY = 'body'; const FIELD_COMMENT = 'cmnt'; + const FIELD_ALL = 'full'; } diff --git a/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php b/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php index 04d2fad608..bafeca2c81 100644 --- a/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php +++ b/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php @@ -31,25 +31,52 @@ final class 
PhabricatorFerretFulltextEngineExtension $stemmer = new PhutilSearchStemmer(); - $ferret_fields = array(); - $ngrams_source = array(); + $key_all = PhabricatorSearchDocumentFieldType::FIELD_ALL; + + $empty_template = array( + 'raw' => array(), + 'normal' => array(), + ); + + $ferret_corpus_map = array( + $key_all => $empty_template, + ); + foreach ($document->getFieldData() as $field) { list($key, $raw_corpus) = $field; - if (!strlen($raw_corpus)) { continue; } $normal_corpus = $stemmer->stemCorpus($raw_corpus); + if (!isset($ferret_corpus_map[$key])) { + $ferret_corpus_map[$key] = $empty_template; + } + + $ferret_corpus_map[$key]['raw'][] = $raw_corpus; + $ferret_corpus_map[$key]['normal'][] = $normal_corpus; + + $ferret_corpus_map[$key_all]['raw'][] = $raw_corpus; + $ferret_corpus_map[$key_all]['normal'][] = $normal_corpus; + } + + $ferret_fields = array(); + foreach ($ferret_corpus_map as $key => $fields) { + $raw_corpus = $fields['raw']; + $raw_corpus = implode("\n", $raw_corpus); + + $normal_corpus = $fields['normal']; + $normal_corpus = implode("\n", $normal_corpus); + $ferret_fields[] = $engine->newFieldObject() ->setFieldKey($key) ->setRawCorpus($raw_corpus) ->setNormalCorpus($normal_corpus); - - $ngrams_source[] = $raw_corpus; } - $ngrams_source = implode(' ', $ngrams_source); + + $ngrams_source = $ferret_corpus_map[$key_all]['raw']; + $ngrams_source = implode("\n", $ngrams_source); $ngrams = id(new PhabricatorNgramEngine()) ->getNgramsFromString($ngrams_source, 'index'); diff --git a/src/applications/search/ferret/PhabricatorFerretField.php b/src/applications/search/ferret/PhabricatorFerretField.php index 5b2370ae8f..cd7e7c68d5 100644 --- a/src/applications/search/ferret/PhabricatorFerretField.php +++ b/src/applications/search/ferret/PhabricatorFerretField.php @@ -20,8 +20,9 @@ abstract class PhabricatorFerretField 'normalCorpus' => 'sort', ), self::CONFIG_KEY_SCHEMA => array( - 'key_document' => array( + 'key_documentfield' => array( 'columns' => 
array('documentID', 'fieldKey'), + 'unique' => true, ), ), ) + parent::getConfiguration();