mirror of
https://we.phorge.it/source/phorge.git
synced 2025-01-08 22:01:03 +01:00
Add a "terms" corpus to Ferret fields
Summary: Ref T12819. Ferret currently does substring search, but this is not the default mode users expect: when you search for the "RICO" act, you do not expect to find documents containing "apRICOt" even though "RICO" is a substring. To support term search, index the corpus as a list of terms with punctuation removed and whitespace normalized so the engine can match against it. Test Plan: Ran `storage upgrade`, ran `search index`, saw sensible database results: ``` rawCorpus: This is the task description. Hark! Whom'st'dve eaten this "food" shall surely ~perish~?? #blessed normalCorpus: thi the task descript hark whom dve eaten food shall sure perish bless termCorpus: This is the task description Hark Whom'st'dve eaten this food shall surely perish blessed ``` Reviewers: chad Reviewed By: chad Maniphest Tasks: T12819 Differential Revision: https://secure.phabricator.com/D18498
This commit is contained in:
parent
77ef38f9a8
commit
0e2e525bb4
6 changed files with 97 additions and 2 deletions
2
resources/sql/autopatches/20170830.ferret.02.term.sql
Normal file
2
resources/sql/autopatches/20170830.ferret.02.term.sql
Normal file
|
@ -0,0 +1,2 @@
|
|||
ALTER TABLE {$NAMESPACE}_maniphest.maniphest_task_ffield
|
||||
ADD termCorpus LONGTEXT NOT NULL COLLATE {$COLLATE_SORT};
|
|
@ -3206,6 +3206,7 @@ phutil_register_library_map(array(
|
|||
'PhabricatorNavigationRemarkupRule' => 'infrastructure/markup/rule/PhabricatorNavigationRemarkupRule.php',
|
||||
'PhabricatorNeverTriggerClock' => 'infrastructure/daemon/workers/clock/PhabricatorNeverTriggerClock.php',
|
||||
'PhabricatorNgramEngine' => 'applications/search/ngrams/PhabricatorNgramEngine.php',
|
||||
'PhabricatorNgramEngineTestCase' => 'applications/search/ngrams/__tests__/PhabricatorNgramEngineTestCase.php',
|
||||
'PhabricatorNgramsIndexEngineExtension' => 'applications/search/engineextension/PhabricatorNgramsIndexEngineExtension.php',
|
||||
'PhabricatorNgramsInterface' => 'applications/search/interface/PhabricatorNgramsInterface.php',
|
||||
'PhabricatorNotificationBuilder' => 'applications/notification/builder/PhabricatorNotificationBuilder.php',
|
||||
|
@ -8587,6 +8588,7 @@ phutil_register_library_map(array(
|
|||
'PhabricatorNavigationRemarkupRule' => 'PhutilRemarkupRule',
|
||||
'PhabricatorNeverTriggerClock' => 'PhabricatorTriggerClock',
|
||||
'PhabricatorNgramEngine' => 'Phobject',
|
||||
'PhabricatorNgramEngineTestCase' => 'PhabricatorTestCase',
|
||||
'PhabricatorNgramsIndexEngineExtension' => 'PhabricatorIndexEngineExtension',
|
||||
'PhabricatorNgramsInterface' => 'PhabricatorIndexableInterface',
|
||||
'PhabricatorNotificationBuilder' => 'Phobject',
|
||||
|
|
|
@ -30,11 +30,13 @@ final class PhabricatorFerretFulltextEngineExtension
|
|||
->setEpochModified(0);
|
||||
|
||||
$stemmer = new PhutilSearchStemmer();
|
||||
$ngram_engine = id(new PhabricatorNgramEngine());
|
||||
|
||||
$key_all = PhabricatorSearchDocumentFieldType::FIELD_ALL;
|
||||
|
||||
$empty_template = array(
|
||||
'raw' => array(),
|
||||
'term' => array(),
|
||||
'normal' => array(),
|
||||
);
|
||||
|
||||
|
@ -49,15 +51,18 @@ final class PhabricatorFerretFulltextEngineExtension
|
|||
}
|
||||
|
||||
$normal_corpus = $stemmer->stemCorpus($raw_corpus);
|
||||
$term_corpus = $ngram_engine->newTermsCorpus($raw_corpus);
|
||||
|
||||
if (!isset($ferret_corpus_map[$key])) {
|
||||
$ferret_corpus_map[$key] = $empty_template;
|
||||
}
|
||||
|
||||
$ferret_corpus_map[$key]['raw'][] = $raw_corpus;
|
||||
$ferret_corpus_map[$key]['term'][] = $term_corpus;
|
||||
$ferret_corpus_map[$key]['normal'][] = $normal_corpus;
|
||||
|
||||
$ferret_corpus_map[$key_all]['raw'][] = $raw_corpus;
|
||||
$ferret_corpus_map[$key_all]['term'][] = $term_corpus;
|
||||
$ferret_corpus_map[$key_all]['normal'][] = $normal_corpus;
|
||||
}
|
||||
|
||||
|
@ -69,17 +74,23 @@ final class PhabricatorFerretFulltextEngineExtension
|
|||
$normal_corpus = $fields['normal'];
|
||||
$normal_corpus = implode("\n", $normal_corpus);
|
||||
|
||||
$term_corpus = $fields['term'];
|
||||
$term_corpus = implode(' ', $term_corpus);
|
||||
if (strlen($term_corpus)) {
|
||||
$term_corpus = ' '.$term_corpus.' ';
|
||||
}
|
||||
|
||||
$ferret_fields[] = $engine->newFieldObject()
|
||||
->setFieldKey($key)
|
||||
->setRawCorpus($raw_corpus)
|
||||
->setTermCorpus($term_corpus)
|
||||
->setNormalCorpus($normal_corpus);
|
||||
}
|
||||
|
||||
$ngrams_source = $ferret_corpus_map[$key_all]['raw'];
|
||||
$ngrams_source = implode("\n", $ngrams_source);
|
||||
|
||||
$ngrams = id(new PhabricatorNgramEngine())
|
||||
->getNgramsFromString($ngrams_source, 'index');
|
||||
$ngrams = $ngram_engine->getNgramsFromString($ngrams_source, 'index');
|
||||
|
||||
$ferret_document->openTransaction();
|
||||
|
||||
|
|
|
@ -6,6 +6,7 @@ abstract class PhabricatorFerretField
|
|||
protected $documentID;
|
||||
protected $fieldKey;
|
||||
protected $rawCorpus;
|
||||
protected $termCorpus;
|
||||
protected $normalCorpus;
|
||||
|
||||
abstract public function getIndexKey();
|
||||
|
@ -17,6 +18,7 @@ abstract class PhabricatorFerretField
|
|||
'documentID' => 'uint32',
|
||||
'fieldKey' => 'text4',
|
||||
'rawCorpus' => 'sort',
|
||||
'termCorpus' => 'sort',
|
||||
'normalCorpus' => 'sort',
|
||||
),
|
||||
self::CONFIG_KEY_SCHEMA => array(
|
||||
|
|
|
@ -40,4 +40,56 @@ final class PhabricatorNgramEngine extends Phobject {
|
|||
return array_keys($ngrams);
|
||||
}
|
||||
|
||||
/**
 * Convert a raw corpus into a "terms" corpus: the same text with
 * punctuation replaced by spaces and runs of whitespace collapsed, so that
 * each term is delimited by exactly one space.
 *
 * Single quotes are kept when they appear inside a word (as in
 * contractions) and removed when they sit at the edge of a word.
 *
 * @param string Raw corpus text.
 * @return string Terms separated by single spaces, with no leading or
 *   trailing space.
 */
public function newTermsCorpus($raw_corpus) {
  // Every listed character acts as a term separator. The single quote is
  // deliberately absent from this map; it is handled separately below.
  $term_corpus = strtr(
    $raw_corpus,
    array(
      '!' => ' ',
      '"' => ' ',
      '#' => ' ',
      '$' => ' ',
      '%' => ' ',
      '&' => ' ',
      '(' => ' ',
      ')' => ' ',
      '*' => ' ',
      '+' => ' ',
      ',' => ' ',
      '-' => ' ',
      '/' => ' ',
      ':' => ' ',
      ';' => ' ',
      '<' => ' ',
      '=' => ' ',
      '>' => ' ',
      '?' => ' ',
      '@' => ' ',
      '[' => ' ',
      '\\' => ' ',
      ']' => ' ',
      '^' => ' ',
      '`' => ' ',
      '{' => ' ',
      '|' => ' ',
      '}' => ' ',
      '~' => ' ',
      '.' => ' ',
      '_' => ' ',
      "\n" => ' ',
      "\r" => ' ',
      "\t" => ' ',
    ));

  // NOTE: Single quotes divide terms only if they're at a word boundary.
  // In contractions, like "whom'st've", the entire word is a single term.
  $term_corpus = preg_replace('/(^| )[\']+/', ' ', $term_corpus);
  $term_corpus = preg_replace('/[\']+( |$)/', ' ', $term_corpus);

  $collapsed = preg_replace('/\s+/u', ' ', $term_corpus);
  if ($collapsed === null) {
    // In "u" mode, preg_replace() returns null if the subject is not valid
    // UTF-8. Rather than silently discarding the entire corpus, fall back
    // to collapsing ASCII whitespace byte-wise.
    $collapsed = preg_replace('/[\x09-\x0D ]+/', ' ', $term_corpus);
  }

  return trim($collapsed, ' ');
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,26 @@
|
|||
<?php
|
||||
|
||||
/**
 * Unit tests for the terms corpus produced by PhabricatorNgramEngine.
 */
final class PhabricatorNgramEngineTestCase
  extends PhabricatorTestCase {

  /**
   * Check that newTermsCorpus() turns each raw input into the expected
   * space-delimited list of terms.
   */
  public function testTermsCorpus() {
    // Raw input => expected terms corpus.
    $cases = array(
      'Hear ye, hear ye!' => 'Hear ye hear ye',
      "Thou whom'st've art worthy." => "Thou whom'st've art worthy",
      'Guaranteed to contain "food".' => 'Guaranteed to contain food',
      'http://example.org/path/to/file.jpg' =>
        'http example org path to file jpg',
    );

    $ngram_engine = new PhabricatorNgramEngine();
    foreach ($cases as $raw => $expected) {
      $terms = $ngram_engine->newTermsCorpus($raw);
      $message = pht('Terms corpus for: %s', $raw);

      $this->assertEqual($expected, $terms, $message);
    }
  }

}
|
Loading…
Reference in a new issue