1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2024-11-10 00:42:41 +01:00

Aggregate corpus data in Ferret field rows

Summary:
Ref T12819. This addresses two issues:

  - One practical issue is that right now, if you search for "dog cat", and they appear in different fields (for example, "dog" appears ONLY in the title, while "cat" appears ONLY in a comment) we won't find the document. This is somewhat rare -- usually, if "dog" appears in the title, it's also repeated in the description -- but I think it is clearly a bug. To attack this, start automatically creating a virtual "ALL" field with the full document text which we'll use as the primary thing we match against.
  - For fields which may occur more than once -- today, only comments -- aggregate them all into one big "all of the text" row instead of writing one row per comment. This partly addresses the first point ("dog" in one comment and "cat" in a different comment won't be found) and partly makes some of the query gymnastics easier.

Test Plan:
Ran `bin/storage upgrade`, ran `bin/search index <Txxx>`, saw sensible corpus values in the database:

```
mysql> select * from maniphest_task_ffield\G
*************************** 1. row ***************************
          id: 3
  documentID: 1981
    fieldKey: full
   rawCorpus: This is the task title
This is the task description.
normalCorpus: thi the task titl
thi the task descript
*************************** 2. row ***************************
          id: 4
  documentID: 1981
    fieldKey: titl
   rawCorpus: This is the task title
normalCorpus: thi the task titl
*************************** 3. row ***************************
          id: 5
  documentID: 1981
    fieldKey: body
   rawCorpus: This is the task description.
normalCorpus: thi the task descript
3 rows in set (0.00 sec)
```

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T12819

Differential Revision: https://secure.phabricator.com/D18497
This commit is contained in:
epriestley 2017-08-30 07:04:16 -07:00
parent 72cb3d3c84
commit 77ef38f9a8
4 changed files with 40 additions and 7 deletions

View file

@ -0,0 +1,4 @@
-- Empty the Maniphest task Ferret field table before changing its keys.
-- NOTE(review): presumably this clears existing rows that share the same
-- (documentID, fieldKey) pair and would make the unique key below fail;
-- the rows are expected to be regenerated by reindexing -- confirm.
TRUNCATE TABLE {$NAMESPACE}_maniphest.maniphest_task_ffield;
-- Allow at most one row per (documentID, fieldKey) pair.
ALTER TABLE {$NAMESPACE}_maniphest.maniphest_task_ffield
ADD UNIQUE KEY `key_documentfield` (documentID, fieldKey);

View file

@ -5,5 +5,6 @@ final class PhabricatorSearchDocumentFieldType extends Phobject {
const FIELD_TITLE = 'titl';
const FIELD_BODY = 'body';
const FIELD_COMMENT = 'cmnt';
const FIELD_ALL = 'full';
}

View file

@ -31,25 +31,52 @@ final class PhabricatorFerretFulltextEngineExtension
$stemmer = new PhutilSearchStemmer();
$ferret_fields = array();
$ngrams_source = array();
$key_all = PhabricatorSearchDocumentFieldType::FIELD_ALL;
$empty_template = array(
'raw' => array(),
'normal' => array(),
);
$ferret_corpus_map = array(
$key_all => $empty_template,
);
foreach ($document->getFieldData() as $field) {
list($key, $raw_corpus) = $field;
if (!strlen($raw_corpus)) {
continue;
}
$normal_corpus = $stemmer->stemCorpus($raw_corpus);
if (!isset($ferret_corpus_map[$key])) {
$ferret_corpus_map[$key] = $empty_template;
}
$ferret_corpus_map[$key]['raw'][] = $raw_corpus;
$ferret_corpus_map[$key]['normal'][] = $normal_corpus;
$ferret_corpus_map[$key_all]['raw'][] = $raw_corpus;
$ferret_corpus_map[$key_all]['normal'][] = $normal_corpus;
}
$ferret_fields = array();
foreach ($ferret_corpus_map as $key => $fields) {
$raw_corpus = $fields['raw'];
$raw_corpus = implode("\n", $raw_corpus);
$normal_corpus = $fields['normal'];
$normal_corpus = implode("\n", $normal_corpus);
$ferret_fields[] = $engine->newFieldObject()
->setFieldKey($key)
->setRawCorpus($raw_corpus)
->setNormalCorpus($normal_corpus);
$ngrams_source[] = $raw_corpus;
}
$ngrams_source = implode(' ', $ngrams_source);
$ngrams_source = $ferret_corpus_map[$key_all]['raw'];
$ngrams_source = implode("\n", $ngrams_source);
$ngrams = id(new PhabricatorNgramEngine())
->getNgramsFromString($ngrams_source, 'index');

View file

@ -20,8 +20,9 @@ abstract class PhabricatorFerretField
'normalCorpus' => 'sort',
),
self::CONFIG_KEY_SCHEMA => array(
'key_document' => array(
'key_documentfield' => array(
'columns' => array('documentID', 'fieldKey'),
'unique' => true,
),
),
) + parent::getConfiguration();