1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2024-11-26 00:32:42 +01:00

Reduce the number of magic strings in the Ferret implementation

Summary:
Ref T12819. Push more of the magic `' '` stuff into the engine and simplify calls to ngram construction.

Also fixes a bug where a task with title "apple banana" and description "cherry doughnut" could match query "banana cherry" by separating separate term segments with newlines instead of spaces.

Test Plan:
  - Indexed some objects.
  - Searched (term, substring, quoted terms).
  - Viewed index in database.

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T12819

Differential Revision: https://secure.phabricator.com/D18534
This commit is contained in:
epriestley 2017-09-05 09:38:03 -07:00
parent 4a7593f47f
commit 46abc11114
4 changed files with 31 additions and 30 deletions

View file

@ -71,7 +71,7 @@ final class PhabricatorFerretFulltextEngineExtension
$term_corpus = $engine->newTermsCorpus($raw_corpus); $term_corpus = $engine->newTermsCorpus($raw_corpus);
$normal_corpus = $stemmer->stemCorpus($raw_corpus); $normal_corpus = $stemmer->stemCorpus($raw_corpus);
$normal_coprus = $engine->newTermsCorpus($normal_corpus); $normal_corpus = $engine->newTermsCorpus($normal_corpus);
if (!isset($ferret_corpus_map[$key])) { if (!isset($ferret_corpus_map[$key])) {
$ferret_corpus_map[$key] = $empty_template; $ferret_corpus_map[$key] = $empty_template;
@ -91,20 +91,20 @@ final class PhabricatorFerretFulltextEngineExtension
foreach ($ferret_corpus_map as $key => $fields) { foreach ($ferret_corpus_map as $key => $fields) {
$raw_corpus = $fields['raw']; $raw_corpus = $fields['raw'];
$raw_corpus = implode("\n", $raw_corpus); $raw_corpus = implode("\n", $raw_corpus);
$ngrams_source[] = $raw_corpus; if (strlen($raw_corpus)) {
$ngrams_source[] = $raw_corpus;
}
$normal_corpus = $fields['normal']; $normal_corpus = $fields['normal'];
$normal_corpus = implode(' ', $normal_corpus); $normal_corpus = implode("\n", $normal_corpus);
if (strlen($normal_corpus)) { if (strlen($normal_corpus)) {
$ngrams_source[] = $normal_corpus; $ngrams_source[] = $normal_corpus;
$normal_corpus = ' '.$normal_corpus.' ';
} }
$term_corpus = $fields['term']; $term_corpus = $fields['term'];
$term_corpus = implode(' ', $term_corpus); $term_corpus = implode("\n", $term_corpus);
if (strlen($term_corpus)) { if (strlen($term_corpus)) {
$ngrams_source[] = $term_corpus; $ngrams_source[] = $term_corpus;
$term_corpus = ' '.$term_corpus.' ';
} }
$ferret_fields[] = $engine->newFieldObject() $ferret_fields[] = $engine->newFieldObject()
@ -113,9 +113,9 @@ final class PhabricatorFerretFulltextEngineExtension
->setTermCorpus($term_corpus) ->setTermCorpus($term_corpus)
->setNormalCorpus($normal_corpus); ->setNormalCorpus($normal_corpus);
} }
$ngrams_source = implode(' ', $ngrams_source); $ngrams_source = implode("\n", $ngrams_source);
$ngrams = $engine->getNgramsFromString($ngrams_source, 'index'); $ngrams = $engine->getTermNgramsFromString($ngrams_source);
$ferret_document->openTransaction(); $ferret_document->openTransaction();

View file

@ -16,22 +16,23 @@ abstract class PhabricatorFerretEngine extends Phobject {
return $value; return $value;
} }
public function getNgramsFromString($value, $mode) { public function getTermNgramsFromString($string) {
return $this->getNgramsFromString($string, true);
}
public function getSubstringNgramsFromString($string) {
return $this->getNgramsFromString($string, false);
}
private function getNgramsFromString($value, $as_term) {
$tokens = $this->tokenizeString($value); $tokens = $this->tokenizeString($value);
$ngrams = array(); $ngrams = array();
foreach ($tokens as $token) { foreach ($tokens as $token) {
$token = phutil_utf8_strtolower($token); $token = phutil_utf8_strtolower($token);
switch ($mode) { if ($as_term) {
case 'query': $token = ' '.$token.' ';
break;
case 'index':
$token = ' '.$token.' ';
break;
case 'prefix':
$token = ' '.$token;
break;
} }
$token_v = phutil_utf8v($token); $token_v = phutil_utf8v($token);
@ -96,6 +97,10 @@ abstract class PhabricatorFerretEngine extends Phobject {
$term_corpus = preg_replace('/\s+/u', ' ', $term_corpus); $term_corpus = preg_replace('/\s+/u', ' ', $term_corpus);
$term_corpus = trim($term_corpus, ' '); $term_corpus = trim($term_corpus, ' ');
if (strlen($term_corpus)) {
$term_corpus = ' '.$term_corpus.' ';
}
return $term_corpus; return $term_corpus;
} }

View file

@ -5,11 +5,11 @@ final class PhabricatorFerretEngineTestCase
public function testTermsCorpus() { public function testTermsCorpus() {
$map = array( $map = array(
'Hear ye, hear ye!' => 'Hear ye hear ye', 'Hear ye, hear ye!' => ' Hear ye hear ye ',
"Thou whom'st've art worthy." => "Thou whom'st've art worthy", "Thou whom'st've art worthy." => " Thou whom'st've art worthy ",
'Guaranteed to contain "food".' => 'Guaranteed to contain food', 'Guaranteed to contain "food".' => ' Guaranteed to contain food ',
'http://example.org/path/to/file.jpg' => 'http://example.org/path/to/file.jpg' =>
'http example org path to file jpg', ' http example org path to file jpg ',
); );
$engine = new ManiphestTaskFerretEngine(); $engine = new ManiphestTaskFerretEngine();

View file

@ -1497,18 +1497,15 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
} }
if ($is_substring) { if ($is_substring) {
$ngrams = $engine->getNgramsFromString($value, 'query'); $ngrams = $engine->getSubstringNgramsFromString($value);
} else { } else {
$ngrams = $engine->getNgramsFromString($value, 'index'); $ngrams = $engine->getTermNgramsFromString($value);
// If this is a stemmed term, only look for ngrams present in both the // If this is a stemmed term, only look for ngrams present in both the
// unstemmed and stemmed variations. // unstemmed and stemmed variations.
if ($is_stemmed) { if ($is_stemmed) {
$stem_value = $stemmer->stemToken($value); $stem_value = $stemmer->stemToken($value);
$stem_ngrams = $engine->getNgramsFromString( $stem_ngrams = $engine->getTermNgramsFromString($stem_value);
$stem_value,
'index');
$ngrams = array_intersect($ngrams, $stem_ngrams); $ngrams = array_intersect($ngrams, $stem_ngrams);
} }
} }
@ -1652,7 +1649,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
$term_constraints = array(); $term_constraints = array();
$term_value = ' '.$engine->newTermsCorpus($value).' '; $term_value = $engine->newTermsCorpus($value);
if ($is_not) { if ($is_not) {
$term_constraints[] = qsprintf( $term_constraints[] = qsprintf(
$conn, $conn,
@ -1670,7 +1667,6 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
if ($is_stemmed) { if ($is_stemmed) {
$stem_value = $stemmer->stemToken($value); $stem_value = $stemmer->stemToken($value);
$stem_value = $engine->newTermsCorpus($stem_value); $stem_value = $engine->newTermsCorpus($stem_value);
$stem_value = ' '.$stem_value.' ';
$term_constraints[] = qsprintf( $term_constraints[] = qsprintf(
$conn, $conn,