mirror of
https://we.phorge.it/source/phorge.git
synced 2025-01-08 22:01:03 +01:00
Add a "terms" corpus to Ferret fields
Summary: Ref T12819. Ferret currently does substring search, but this is not the default mode users expect: when you search for the "RICO" act, you do not expect to find documents containing "apRICOt" even though "RICO" is a substring. To support term search, index the corpus as a list of terms with punctuation removed and whitespace normalized so the engine can match against it. Test Plan: Ran `storage upgrade`, ran `search index`, saw sensible database results: ``` rawCorpus: This is the task description. Hark! Whom'st'dve eaten this "food" shall surely ~perish~?? #blessed normalCorpus: thi the task descript hark whom dve eaten food shall sure perish bless termCorpus: This is the task description Hark Whom'st'dve eaten this food shall surely perish blessed ``` Reviewers: chad Reviewed By: chad Maniphest Tasks: T12819 Differential Revision: https://secure.phabricator.com/D18498
This commit is contained in:
parent
77ef38f9a8
commit
0e2e525bb4
6 changed files with 97 additions and 2 deletions
2
resources/sql/autopatches/20170830.ferret.02.term.sql
Normal file
2
resources/sql/autopatches/20170830.ferret.02.term.sql
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
ALTER TABLE {$NAMESPACE}_maniphest.maniphest_task_ffield
|
||||||
|
ADD termCorpus LONGTEXT NOT NULL COLLATE {$COLLATE_SORT};
|
|
@ -3206,6 +3206,7 @@ phutil_register_library_map(array(
|
||||||
'PhabricatorNavigationRemarkupRule' => 'infrastructure/markup/rule/PhabricatorNavigationRemarkupRule.php',
|
'PhabricatorNavigationRemarkupRule' => 'infrastructure/markup/rule/PhabricatorNavigationRemarkupRule.php',
|
||||||
'PhabricatorNeverTriggerClock' => 'infrastructure/daemon/workers/clock/PhabricatorNeverTriggerClock.php',
|
'PhabricatorNeverTriggerClock' => 'infrastructure/daemon/workers/clock/PhabricatorNeverTriggerClock.php',
|
||||||
'PhabricatorNgramEngine' => 'applications/search/ngrams/PhabricatorNgramEngine.php',
|
'PhabricatorNgramEngine' => 'applications/search/ngrams/PhabricatorNgramEngine.php',
|
||||||
|
'PhabricatorNgramEngineTestCase' => 'applications/search/ngrams/__tests__/PhabricatorNgramEngineTestCase.php',
|
||||||
'PhabricatorNgramsIndexEngineExtension' => 'applications/search/engineextension/PhabricatorNgramsIndexEngineExtension.php',
|
'PhabricatorNgramsIndexEngineExtension' => 'applications/search/engineextension/PhabricatorNgramsIndexEngineExtension.php',
|
||||||
'PhabricatorNgramsInterface' => 'applications/search/interface/PhabricatorNgramsInterface.php',
|
'PhabricatorNgramsInterface' => 'applications/search/interface/PhabricatorNgramsInterface.php',
|
||||||
'PhabricatorNotificationBuilder' => 'applications/notification/builder/PhabricatorNotificationBuilder.php',
|
'PhabricatorNotificationBuilder' => 'applications/notification/builder/PhabricatorNotificationBuilder.php',
|
||||||
|
@ -8587,6 +8588,7 @@ phutil_register_library_map(array(
|
||||||
'PhabricatorNavigationRemarkupRule' => 'PhutilRemarkupRule',
|
'PhabricatorNavigationRemarkupRule' => 'PhutilRemarkupRule',
|
||||||
'PhabricatorNeverTriggerClock' => 'PhabricatorTriggerClock',
|
'PhabricatorNeverTriggerClock' => 'PhabricatorTriggerClock',
|
||||||
'PhabricatorNgramEngine' => 'Phobject',
|
'PhabricatorNgramEngine' => 'Phobject',
|
||||||
|
'PhabricatorNgramEngineTestCase' => 'PhabricatorTestCase',
|
||||||
'PhabricatorNgramsIndexEngineExtension' => 'PhabricatorIndexEngineExtension',
|
'PhabricatorNgramsIndexEngineExtension' => 'PhabricatorIndexEngineExtension',
|
||||||
'PhabricatorNgramsInterface' => 'PhabricatorIndexableInterface',
|
'PhabricatorNgramsInterface' => 'PhabricatorIndexableInterface',
|
||||||
'PhabricatorNotificationBuilder' => 'Phobject',
|
'PhabricatorNotificationBuilder' => 'Phobject',
|
||||||
|
|
|
@ -30,11 +30,13 @@ final class PhabricatorFerretFulltextEngineExtension
|
||||||
->setEpochModified(0);
|
->setEpochModified(0);
|
||||||
|
|
||||||
$stemmer = new PhutilSearchStemmer();
|
$stemmer = new PhutilSearchStemmer();
|
||||||
|
$ngram_engine = id(new PhabricatorNgramEngine());
|
||||||
|
|
||||||
$key_all = PhabricatorSearchDocumentFieldType::FIELD_ALL;
|
$key_all = PhabricatorSearchDocumentFieldType::FIELD_ALL;
|
||||||
|
|
||||||
$empty_template = array(
|
$empty_template = array(
|
||||||
'raw' => array(),
|
'raw' => array(),
|
||||||
|
'term' => array(),
|
||||||
'normal' => array(),
|
'normal' => array(),
|
||||||
);
|
);
|
||||||
|
|
||||||
|
@ -49,15 +51,18 @@ final class PhabricatorFerretFulltextEngineExtension
|
||||||
}
|
}
|
||||||
|
|
||||||
$normal_corpus = $stemmer->stemCorpus($raw_corpus);
|
$normal_corpus = $stemmer->stemCorpus($raw_corpus);
|
||||||
|
$term_corpus = $ngram_engine->newTermsCorpus($raw_corpus);
|
||||||
|
|
||||||
if (!isset($ferret_corpus_map[$key])) {
|
if (!isset($ferret_corpus_map[$key])) {
|
||||||
$ferret_corpus_map[$key] = $empty_template;
|
$ferret_corpus_map[$key] = $empty_template;
|
||||||
}
|
}
|
||||||
|
|
||||||
$ferret_corpus_map[$key]['raw'][] = $raw_corpus;
|
$ferret_corpus_map[$key]['raw'][] = $raw_corpus;
|
||||||
|
$ferret_corpus_map[$key]['term'][] = $term_corpus;
|
||||||
$ferret_corpus_map[$key]['normal'][] = $normal_corpus;
|
$ferret_corpus_map[$key]['normal'][] = $normal_corpus;
|
||||||
|
|
||||||
$ferret_corpus_map[$key_all]['raw'][] = $raw_corpus;
|
$ferret_corpus_map[$key_all]['raw'][] = $raw_corpus;
|
||||||
|
$ferret_corpus_map[$key_all]['term'][] = $term_corpus;
|
||||||
$ferret_corpus_map[$key_all]['normal'][] = $normal_corpus;
|
$ferret_corpus_map[$key_all]['normal'][] = $normal_corpus;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -69,17 +74,23 @@ final class PhabricatorFerretFulltextEngineExtension
|
||||||
$normal_corpus = $fields['normal'];
|
$normal_corpus = $fields['normal'];
|
||||||
$normal_corpus = implode("\n", $normal_corpus);
|
$normal_corpus = implode("\n", $normal_corpus);
|
||||||
|
|
||||||
|
$term_corpus = $fields['term'];
|
||||||
|
$term_corpus = implode(' ', $term_corpus);
|
||||||
|
if (strlen($term_corpus)) {
|
||||||
|
$term_corpus = ' '.$term_corpus.' ';
|
||||||
|
}
|
||||||
|
|
||||||
$ferret_fields[] = $engine->newFieldObject()
|
$ferret_fields[] = $engine->newFieldObject()
|
||||||
->setFieldKey($key)
|
->setFieldKey($key)
|
||||||
->setRawCorpus($raw_corpus)
|
->setRawCorpus($raw_corpus)
|
||||||
|
->setTermCorpus($term_corpus)
|
||||||
->setNormalCorpus($normal_corpus);
|
->setNormalCorpus($normal_corpus);
|
||||||
}
|
}
|
||||||
|
|
||||||
$ngrams_source = $ferret_corpus_map[$key_all]['raw'];
|
$ngrams_source = $ferret_corpus_map[$key_all]['raw'];
|
||||||
$ngrams_source = implode("\n", $ngrams_source);
|
$ngrams_source = implode("\n", $ngrams_source);
|
||||||
|
|
||||||
$ngrams = id(new PhabricatorNgramEngine())
|
$ngrams = $ngram_engine->getNgramsFromString($ngrams_source, 'index');
|
||||||
->getNgramsFromString($ngrams_source, 'index');
|
|
||||||
|
|
||||||
$ferret_document->openTransaction();
|
$ferret_document->openTransaction();
|
||||||
|
|
||||||
|
|
|
@ -6,6 +6,7 @@ abstract class PhabricatorFerretField
|
||||||
protected $documentID;
|
protected $documentID;
|
||||||
protected $fieldKey;
|
protected $fieldKey;
|
||||||
protected $rawCorpus;
|
protected $rawCorpus;
|
||||||
|
protected $termCorpus;
|
||||||
protected $normalCorpus;
|
protected $normalCorpus;
|
||||||
|
|
||||||
abstract public function getIndexKey();
|
abstract public function getIndexKey();
|
||||||
|
@ -17,6 +18,7 @@ abstract class PhabricatorFerretField
|
||||||
'documentID' => 'uint32',
|
'documentID' => 'uint32',
|
||||||
'fieldKey' => 'text4',
|
'fieldKey' => 'text4',
|
||||||
'rawCorpus' => 'sort',
|
'rawCorpus' => 'sort',
|
||||||
|
'termCorpus' => 'sort',
|
||||||
'normalCorpus' => 'sort',
|
'normalCorpus' => 'sort',
|
||||||
),
|
),
|
||||||
self::CONFIG_KEY_SCHEMA => array(
|
self::CONFIG_KEY_SCHEMA => array(
|
||||||
|
|
|
@ -40,4 +40,56 @@ final class PhabricatorNgramEngine extends Phobject {
|
||||||
return array_keys($ngrams);
|
return array_keys($ngrams);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Convert raw text into a "terms" corpus.
 *
 * Punctuation, underscores, newlines, carriage returns and tabs become term
 * separators. Single quotes are removed only where they touch a word
 * boundary, so contractions survive as one term. Whitespace runs are then
 * collapsed to a single space and surrounding spaces are stripped.
 *
 * @param string Raw corpus text.
 * @return string Terms separated by single spaces.
 */
public function newTermsCorpus($raw_corpus) {
  // Each of these characters splits terms. The single quote is deliberately
  // absent from this list: it is handled by the boundary patterns below.
  $separators = array(
    '!', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', '-', '/', ':',
    ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '`', '{', '|', '}',
    '~', '.', '_', "\n", "\r", "\t",
  );

  $corpus = strtr($raw_corpus, array_fill_keys($separators, ' '));

  // The patterns are applied in order: strip quotes which open a word, strip
  // quotes which close a word, then normalize runs of whitespace. A quote
  // inside a word, as in "whom'st've", matches neither boundary pattern and
  // is kept.
  $patterns = array(
    '/(^| )[\']+/',
    '/[\']+( |$)/',
    '/\s+/u',
  );

  $corpus = preg_replace($patterns, ' ', $corpus);

  return trim($corpus, ' ');
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,26 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
/**
 * Unit tests for term corpus construction in PhabricatorNgramEngine.
 */
final class PhabricatorNgramEngineTestCase
  extends PhabricatorTestCase {

  /**
   * Verify that newTermsCorpus() strips punctuation, keeps contractions
   * intact, drops single quotes at word boundaries, and normalizes
   * whitespace.
   */
  public function testTermsCorpus() {
    // Maps raw input text to the expected terms corpus.
    $map = array(
      'Hear ye, hear ye!' => 'Hear ye hear ye',
      "Thou whom'st've art worthy." => "Thou whom'st've art worthy",
      'Guaranteed to contain "food".' => 'Guaranteed to contain food',
      'http://example.org/path/to/file.jpg' =>
        'http example org path to file jpg',

      // Single quotes at a word boundary divide terms; quotes inside a word
      // do not.
      "'Tis the 'season'" => 'Tis the season',
      "'quoted' text" => 'quoted text',

      // Tabs, newlines and repeated spaces collapse to a single space, and
      // underscores and hyphens separate terms.
      "one\ttwo\r\nthree   four" => 'one two three four',
      'snake_case-and.dots' => 'snake case and dots',
    );

    $engine = new PhabricatorNgramEngine();
    foreach ($map as $input => $expect) {
      $actual = $engine->newTermsCorpus($input);

      $this->assertEqual(
        $expect,
        $actual,
        pht('Terms corpus for: %s', $input));
    }
  }

}
|
Loading…
Reference in a new issue