1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2024-12-22 21:40:55 +01:00

Add a "terms" corpus to Ferret fields

Summary:
Ref T12819. Ferret currently does substring search, but this is not the default mode users expect: when you search for the "RICO" act, you do not expect to find documents containing "apRICOt" even though "RICO" is a substring.

To support term search, index the corpus as a list of terms with punctuation removed and whitespace normalized so the engine can match against it.

Test Plan:
Ran `storage upgrade`, ran `search index`, saw sensible database results:

```
   rawCorpus: This is the task description.

Hark! Whom'st'dve eaten this "food" shall surely ~perish~?? #blessed
normalCorpus: thi the task descript hark whom dve eaten food shall sure perish bless
  termCorpus:  This is the task description Hark Whom'st'dve eaten this food shall surely perish blessed
```

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T12819

Differential Revision: https://secure.phabricator.com/D18498
This commit is contained in:
epriestley 2017-08-30 07:32:18 -07:00
parent 77ef38f9a8
commit 0e2e525bb4
6 changed files with 97 additions and 2 deletions

View file

@ -0,0 +1,2 @@
-- Add the "termCorpus" column to the Maniphest task Ferret field table.
-- The column is a non-nullable LONGTEXT using the sort collation placeholder.
ALTER TABLE {$NAMESPACE}_maniphest.maniphest_task_ffield
ADD termCorpus LONGTEXT NOT NULL COLLATE {$COLLATE_SORT};

View file

@ -3206,6 +3206,7 @@ phutil_register_library_map(array(
'PhabricatorNavigationRemarkupRule' => 'infrastructure/markup/rule/PhabricatorNavigationRemarkupRule.php',
'PhabricatorNeverTriggerClock' => 'infrastructure/daemon/workers/clock/PhabricatorNeverTriggerClock.php',
'PhabricatorNgramEngine' => 'applications/search/ngrams/PhabricatorNgramEngine.php',
'PhabricatorNgramEngineTestCase' => 'applications/search/ngrams/__tests__/PhabricatorNgramEngineTestCase.php',
'PhabricatorNgramsIndexEngineExtension' => 'applications/search/engineextension/PhabricatorNgramsIndexEngineExtension.php',
'PhabricatorNgramsInterface' => 'applications/search/interface/PhabricatorNgramsInterface.php',
'PhabricatorNotificationBuilder' => 'applications/notification/builder/PhabricatorNotificationBuilder.php',
@ -8587,6 +8588,7 @@ phutil_register_library_map(array(
'PhabricatorNavigationRemarkupRule' => 'PhutilRemarkupRule',
'PhabricatorNeverTriggerClock' => 'PhabricatorTriggerClock',
'PhabricatorNgramEngine' => 'Phobject',
'PhabricatorNgramEngineTestCase' => 'PhabricatorTestCase',
'PhabricatorNgramsIndexEngineExtension' => 'PhabricatorIndexEngineExtension',
'PhabricatorNgramsInterface' => 'PhabricatorIndexableInterface',
'PhabricatorNotificationBuilder' => 'Phobject',

View file

@ -30,11 +30,13 @@ final class PhabricatorFerretFulltextEngineExtension
->setEpochModified(0);
$stemmer = new PhutilSearchStemmer();
$ngram_engine = id(new PhabricatorNgramEngine());
$key_all = PhabricatorSearchDocumentFieldType::FIELD_ALL;
$empty_template = array(
'raw' => array(),
'term' => array(),
'normal' => array(),
);
@ -49,15 +51,18 @@ final class PhabricatorFerretFulltextEngineExtension
}
$normal_corpus = $stemmer->stemCorpus($raw_corpus);
$term_corpus = $ngram_engine->newTermsCorpus($raw_corpus);
if (!isset($ferret_corpus_map[$key])) {
$ferret_corpus_map[$key] = $empty_template;
}
$ferret_corpus_map[$key]['raw'][] = $raw_corpus;
$ferret_corpus_map[$key]['term'][] = $term_corpus;
$ferret_corpus_map[$key]['normal'][] = $normal_corpus;
$ferret_corpus_map[$key_all]['raw'][] = $raw_corpus;
$ferret_corpus_map[$key_all]['term'][] = $term_corpus;
$ferret_corpus_map[$key_all]['normal'][] = $normal_corpus;
}
@ -69,17 +74,23 @@ final class PhabricatorFerretFulltextEngineExtension
$normal_corpus = $fields['normal'];
$normal_corpus = implode("\n", $normal_corpus);
$term_corpus = $fields['term'];
$term_corpus = implode(' ', $term_corpus);
if (strlen($term_corpus)) {
$term_corpus = ' '.$term_corpus.' ';
}
$ferret_fields[] = $engine->newFieldObject()
->setFieldKey($key)
->setRawCorpus($raw_corpus)
->setTermCorpus($term_corpus)
->setNormalCorpus($normal_corpus);
}
$ngrams_source = $ferret_corpus_map[$key_all]['raw'];
$ngrams_source = implode("\n", $ngrams_source);
$ngrams = id(new PhabricatorNgramEngine())
->getNgramsFromString($ngrams_source, 'index');
$ngrams = $ngram_engine->getNgramsFromString($ngrams_source, 'index');
$ferret_document->openTransaction();

View file

@ -6,6 +6,7 @@ abstract class PhabricatorFerretField
protected $documentID;
protected $fieldKey;
protected $rawCorpus;
protected $termCorpus;
protected $normalCorpus;
abstract public function getIndexKey();
@ -17,6 +18,7 @@ abstract class PhabricatorFerretField
'documentID' => 'uint32',
'fieldKey' => 'text4',
'rawCorpus' => 'sort',
'termCorpus' => 'sort',
'normalCorpus' => 'sort',
),
self::CONFIG_KEY_SCHEMA => array(

View file

@ -40,4 +40,56 @@ final class PhabricatorNgramEngine extends Phobject {
return array_keys($ngrams);
}
/**
 * Convert a raw corpus into a normalized, space-separated list of terms.
 *
 * Punctuation is replaced with spaces, runs of whitespace are collapsed to
 * a single space, and leading/trailing spaces are removed. Single quotes
 * are kept only when they appear inside a word (as in contractions).
 *
 * @param string $raw_corpus Raw text to convert.
 * @return string Terms separated by single spaces, with no leading or
 *   trailing space. Empty if the corpus contains no terms.
 */
public function newTermsCorpus($raw_corpus) {
  $term_corpus = strtr(
    $raw_corpus,
    array(
      '!' => ' ',
      '"' => ' ',
      '#' => ' ',
      '$' => ' ',
      '%' => ' ',
      '&' => ' ',
      '(' => ' ',
      ')' => ' ',
      '*' => ' ',
      '+' => ' ',
      ',' => ' ',
      '-' => ' ',
      '/' => ' ',
      ':' => ' ',
      ';' => ' ',
      '<' => ' ',
      '=' => ' ',
      '>' => ' ',
      '?' => ' ',
      '@' => ' ',
      '[' => ' ',
      '\\' => ' ',
      ']' => ' ',
      '^' => ' ',
      '`' => ' ',
      '{' => ' ',
      '|' => ' ',
      '}' => ' ',
      '~' => ' ',
      '.' => ' ',
      '_' => ' ',
      "\n" => ' ',
      "\r" => ' ',
      "\t" => ' ',
    ));

  // With the "u" modifier, preg_replace() returns null for subjects which
  // are not valid UTF-8, which would silently discard the entire corpus.
  // Detect that case up front and fall back to byte-wise matching.
  $modifier = (preg_match('//u', $term_corpus) === 1) ? 'u' : '';

  // NOTE: Single quotes divide terms only if they're at a word boundary.
  // In contractions, like "whom'st've", the entire word is a single term.
  // Boundaries are matched with "\s" rather than a literal space because
  // whitespace which is not in the map above (form feeds, Unicode spaces)
  // has not been normalized yet at this point.
  $term_corpus = preg_replace('/(^|\s)[\']+/'.$modifier, ' ', $term_corpus);
  $term_corpus = preg_replace('/[\']+(\s|$)/'.$modifier, ' ', $term_corpus);

  // Collapse every run of whitespace into a single space, then strip the
  // spaces left at either end by the replacements above.
  $term_corpus = preg_replace('/\s+/'.$modifier, ' ', $term_corpus);
  $term_corpus = trim($term_corpus, ' ');

  return $term_corpus;
}
}

View file

@ -0,0 +1,26 @@
<?php
/**
 * Unit tests for term corpus generation in @{class:PhabricatorNgramEngine}.
 */
final class PhabricatorNgramEngineTestCase
  extends PhabricatorTestCase {

  /**
   * Verify that raw text is reduced to the expected space-separated terms:
   * punctuation is dropped, quotes inside contractions are preserved, and
   * URLs are split into their component words.
   */
  public function testTermsCorpus() {
    $ngram_engine = new PhabricatorNgramEngine();

    // Raw corpus => expected terms corpus.
    $cases = array(
      'Hear ye, hear ye!' => 'Hear ye hear ye',
      "Thou whom'st've art worthy." => "Thou whom'st've art worthy",
      'Guaranteed to contain "food".' => 'Guaranteed to contain food',
      'http://example.org/path/to/file.jpg' =>
        'http example org path to file jpg',
    );

    foreach ($cases as $raw => $expected_terms) {
      $this->assertEqual(
        $expected_terms,
        $ngram_engine->newTermsCorpus($raw),
        pht('Terms corpus for: %s', $raw));
    }
  }

}