1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2024-12-26 15:30:58 +01:00

Reduce the number of magic strings in the Ferret implementation

Summary:
Ref T12819. Push more of the magic `' '` stuff into the engine and simplify calls to ngram construction.

Also fixes a bug where a task with title "apple banana" and description "cherry doughnut" could match query "banana cherry" by separating separate term segments with newlines instead of spaces.

Test Plan:
  - Indexed some objects.
  - Searched (term, substring, quoted terms).
  - Viewed index in database.

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T12819

Differential Revision: https://secure.phabricator.com/D18534
This commit is contained in:
epriestley 2017-09-05 09:38:03 -07:00
parent 4a7593f47f
commit 46abc11114
4 changed files with 31 additions and 30 deletions

View file

@ -71,7 +71,7 @@ final class PhabricatorFerretFulltextEngineExtension
$term_corpus = $engine->newTermsCorpus($raw_corpus);
$normal_corpus = $stemmer->stemCorpus($raw_corpus);
$normal_coprus = $engine->newTermsCorpus($normal_corpus);
$normal_corpus = $engine->newTermsCorpus($normal_corpus);
if (!isset($ferret_corpus_map[$key])) {
$ferret_corpus_map[$key] = $empty_template;
@ -91,20 +91,20 @@ final class PhabricatorFerretFulltextEngineExtension
foreach ($ferret_corpus_map as $key => $fields) {
$raw_corpus = $fields['raw'];
$raw_corpus = implode("\n", $raw_corpus);
$ngrams_source[] = $raw_corpus;
if (strlen($raw_corpus)) {
$ngrams_source[] = $raw_corpus;
}
$normal_corpus = $fields['normal'];
$normal_corpus = implode(' ', $normal_corpus);
$normal_corpus = implode("\n", $normal_corpus);
if (strlen($normal_corpus)) {
$ngrams_source[] = $normal_corpus;
$normal_corpus = ' '.$normal_corpus.' ';
}
$term_corpus = $fields['term'];
$term_corpus = implode(' ', $term_corpus);
$term_corpus = implode("\n", $term_corpus);
if (strlen($term_corpus)) {
$ngrams_source[] = $term_corpus;
$term_corpus = ' '.$term_corpus.' ';
}
$ferret_fields[] = $engine->newFieldObject()
@ -113,9 +113,9 @@ final class PhabricatorFerretFulltextEngineExtension
->setTermCorpus($term_corpus)
->setNormalCorpus($normal_corpus);
}
$ngrams_source = implode(' ', $ngrams_source);
$ngrams_source = implode("\n", $ngrams_source);
$ngrams = $engine->getNgramsFromString($ngrams_source, 'index');
$ngrams = $engine->getTermNgramsFromString($ngrams_source);
$ferret_document->openTransaction();

View file

@ -16,22 +16,23 @@ abstract class PhabricatorFerretEngine extends Phobject {
return $value;
}
public function getNgramsFromString($value, $mode) {
public function getTermNgramsFromString($string) {
return $this->getNgramsFromString($string, true);
}
public function getSubstringNgramsFromString($string) {
return $this->getNgramsFromString($string, false);
}
private function getNgramsFromString($value, $as_term) {
$tokens = $this->tokenizeString($value);
$ngrams = array();
foreach ($tokens as $token) {
$token = phutil_utf8_strtolower($token);
switch ($mode) {
case 'query':
break;
case 'index':
$token = ' '.$token.' ';
break;
case 'prefix':
$token = ' '.$token;
break;
if ($as_term) {
$token = ' '.$token.' ';
}
$token_v = phutil_utf8v($token);
@ -96,6 +97,10 @@ abstract class PhabricatorFerretEngine extends Phobject {
$term_corpus = preg_replace('/\s+/u', ' ', $term_corpus);
$term_corpus = trim($term_corpus, ' ');
if (strlen($term_corpus)) {
$term_corpus = ' '.$term_corpus.' ';
}
return $term_corpus;
}

View file

@ -5,11 +5,11 @@ final class PhabricatorFerretEngineTestCase
public function testTermsCorpus() {
$map = array(
'Hear ye, hear ye!' => 'Hear ye hear ye',
"Thou whom'st've art worthy." => "Thou whom'st've art worthy",
'Guaranteed to contain "food".' => 'Guaranteed to contain food',
'Hear ye, hear ye!' => ' Hear ye hear ye ',
"Thou whom'st've art worthy." => " Thou whom'st've art worthy ",
'Guaranteed to contain "food".' => ' Guaranteed to contain food ',
'http://example.org/path/to/file.jpg' =>
'http example org path to file jpg',
' http example org path to file jpg ',
);
$engine = new ManiphestTaskFerretEngine();

View file

@ -1497,18 +1497,15 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
}
if ($is_substring) {
$ngrams = $engine->getNgramsFromString($value, 'query');
$ngrams = $engine->getSubstringNgramsFromString($value);
} else {
$ngrams = $engine->getNgramsFromString($value, 'index');
$ngrams = $engine->getTermNgramsFromString($value);
// If this is a stemmed term, only look for ngrams present in both the
// unstemmed and stemmed variations.
if ($is_stemmed) {
$stem_value = $stemmer->stemToken($value);
$stem_ngrams = $engine->getNgramsFromString(
$stem_value,
'index');
$stem_ngrams = $engine->getTermNgramsFromString($stem_value);
$ngrams = array_intersect($ngrams, $stem_ngrams);
}
}
@ -1652,7 +1649,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
$term_constraints = array();
$term_value = ' '.$engine->newTermsCorpus($value).' ';
$term_value = $engine->newTermsCorpus($value);
if ($is_not) {
$term_constraints[] = qsprintf(
$conn,
@ -1670,7 +1667,6 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
if ($is_stemmed) {
$stem_value = $stemmer->stemToken($value);
$stem_value = $engine->newTermsCorpus($stem_value);
$stem_value = ' '.$stem_value.' ';
$term_constraints[] = qsprintf(
$conn,