mirror of
https://we.phorge.it/source/phorge.git
synced 2024-11-26 00:32:42 +01:00
Reduce the number of magic strings in the Ferret implementation
Summary: Ref T12819. Push more of the magic `' '` stuff into the engine and simplify calls to ngram construction. Also fixes a bug where a task with title "apple banana" and description "cherry doughnut" could match query "banana cherry" by separating separate term segments with newlines instead of spaces. Test Plan: - Indexed some objects. - Searched (term, substring, quoted terms). - Viewed index in database. Reviewers: chad Reviewed By: chad Maniphest Tasks: T12819 Differential Revision: https://secure.phabricator.com/D18534
This commit is contained in:
parent
4a7593f47f
commit
46abc11114
4 changed files with 31 additions and 30 deletions
|
@ -71,7 +71,7 @@ final class PhabricatorFerretFulltextEngineExtension
|
||||||
$term_corpus = $engine->newTermsCorpus($raw_corpus);
|
$term_corpus = $engine->newTermsCorpus($raw_corpus);
|
||||||
|
|
||||||
$normal_corpus = $stemmer->stemCorpus($raw_corpus);
|
$normal_corpus = $stemmer->stemCorpus($raw_corpus);
|
||||||
$normal_coprus = $engine->newTermsCorpus($normal_corpus);
|
$normal_corpus = $engine->newTermsCorpus($normal_corpus);
|
||||||
|
|
||||||
if (!isset($ferret_corpus_map[$key])) {
|
if (!isset($ferret_corpus_map[$key])) {
|
||||||
$ferret_corpus_map[$key] = $empty_template;
|
$ferret_corpus_map[$key] = $empty_template;
|
||||||
|
@ -91,20 +91,20 @@ final class PhabricatorFerretFulltextEngineExtension
|
||||||
foreach ($ferret_corpus_map as $key => $fields) {
|
foreach ($ferret_corpus_map as $key => $fields) {
|
||||||
$raw_corpus = $fields['raw'];
|
$raw_corpus = $fields['raw'];
|
||||||
$raw_corpus = implode("\n", $raw_corpus);
|
$raw_corpus = implode("\n", $raw_corpus);
|
||||||
$ngrams_source[] = $raw_corpus;
|
if (strlen($raw_corpus)) {
|
||||||
|
$ngrams_source[] = $raw_corpus;
|
||||||
|
}
|
||||||
|
|
||||||
$normal_corpus = $fields['normal'];
|
$normal_corpus = $fields['normal'];
|
||||||
$normal_corpus = implode(' ', $normal_corpus);
|
$normal_corpus = implode("\n", $normal_corpus);
|
||||||
if (strlen($normal_corpus)) {
|
if (strlen($normal_corpus)) {
|
||||||
$ngrams_source[] = $normal_corpus;
|
$ngrams_source[] = $normal_corpus;
|
||||||
$normal_corpus = ' '.$normal_corpus.' ';
|
|
||||||
}
|
}
|
||||||
|
|
||||||
$term_corpus = $fields['term'];
|
$term_corpus = $fields['term'];
|
||||||
$term_corpus = implode(' ', $term_corpus);
|
$term_corpus = implode("\n", $term_corpus);
|
||||||
if (strlen($term_corpus)) {
|
if (strlen($term_corpus)) {
|
||||||
$ngrams_source[] = $term_corpus;
|
$ngrams_source[] = $term_corpus;
|
||||||
$term_corpus = ' '.$term_corpus.' ';
|
|
||||||
}
|
}
|
||||||
|
|
||||||
$ferret_fields[] = $engine->newFieldObject()
|
$ferret_fields[] = $engine->newFieldObject()
|
||||||
|
@ -113,9 +113,9 @@ final class PhabricatorFerretFulltextEngineExtension
|
||||||
->setTermCorpus($term_corpus)
|
->setTermCorpus($term_corpus)
|
||||||
->setNormalCorpus($normal_corpus);
|
->setNormalCorpus($normal_corpus);
|
||||||
}
|
}
|
||||||
$ngrams_source = implode(' ', $ngrams_source);
|
$ngrams_source = implode("\n", $ngrams_source);
|
||||||
|
|
||||||
$ngrams = $engine->getNgramsFromString($ngrams_source, 'index');
|
$ngrams = $engine->getTermNgramsFromString($ngrams_source);
|
||||||
|
|
||||||
$ferret_document->openTransaction();
|
$ferret_document->openTransaction();
|
||||||
|
|
||||||
|
|
|
@ -16,22 +16,23 @@ abstract class PhabricatorFerretEngine extends Phobject {
|
||||||
return $value;
|
return $value;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getNgramsFromString($value, $mode) {
|
public function getTermNgramsFromString($string) {
|
||||||
|
return $this->getNgramsFromString($string, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
public function getSubstringNgramsFromString($string) {
|
||||||
|
return $this->getNgramsFromString($string, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
private function getNgramsFromString($value, $as_term) {
|
||||||
$tokens = $this->tokenizeString($value);
|
$tokens = $this->tokenizeString($value);
|
||||||
|
|
||||||
$ngrams = array();
|
$ngrams = array();
|
||||||
foreach ($tokens as $token) {
|
foreach ($tokens as $token) {
|
||||||
$token = phutil_utf8_strtolower($token);
|
$token = phutil_utf8_strtolower($token);
|
||||||
|
|
||||||
switch ($mode) {
|
if ($as_term) {
|
||||||
case 'query':
|
$token = ' '.$token.' ';
|
||||||
break;
|
|
||||||
case 'index':
|
|
||||||
$token = ' '.$token.' ';
|
|
||||||
break;
|
|
||||||
case 'prefix':
|
|
||||||
$token = ' '.$token;
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
$token_v = phutil_utf8v($token);
|
$token_v = phutil_utf8v($token);
|
||||||
|
@ -96,6 +97,10 @@ abstract class PhabricatorFerretEngine extends Phobject {
|
||||||
$term_corpus = preg_replace('/\s+/u', ' ', $term_corpus);
|
$term_corpus = preg_replace('/\s+/u', ' ', $term_corpus);
|
||||||
$term_corpus = trim($term_corpus, ' ');
|
$term_corpus = trim($term_corpus, ' ');
|
||||||
|
|
||||||
|
if (strlen($term_corpus)) {
|
||||||
|
$term_corpus = ' '.$term_corpus.' ';
|
||||||
|
}
|
||||||
|
|
||||||
return $term_corpus;
|
return $term_corpus;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -5,11 +5,11 @@ final class PhabricatorFerretEngineTestCase
|
||||||
|
|
||||||
public function testTermsCorpus() {
|
public function testTermsCorpus() {
|
||||||
$map = array(
|
$map = array(
|
||||||
'Hear ye, hear ye!' => 'Hear ye hear ye',
|
'Hear ye, hear ye!' => ' Hear ye hear ye ',
|
||||||
"Thou whom'st've art worthy." => "Thou whom'st've art worthy",
|
"Thou whom'st've art worthy." => " Thou whom'st've art worthy ",
|
||||||
'Guaranteed to contain "food".' => 'Guaranteed to contain food',
|
'Guaranteed to contain "food".' => ' Guaranteed to contain food ',
|
||||||
'http://example.org/path/to/file.jpg' =>
|
'http://example.org/path/to/file.jpg' =>
|
||||||
'http example org path to file jpg',
|
' http example org path to file jpg ',
|
||||||
);
|
);
|
||||||
|
|
||||||
$engine = new ManiphestTaskFerretEngine();
|
$engine = new ManiphestTaskFerretEngine();
|
||||||
|
|
|
@ -1497,18 +1497,15 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($is_substring) {
|
if ($is_substring) {
|
||||||
$ngrams = $engine->getNgramsFromString($value, 'query');
|
$ngrams = $engine->getSubstringNgramsFromString($value);
|
||||||
} else {
|
} else {
|
||||||
$ngrams = $engine->getNgramsFromString($value, 'index');
|
$ngrams = $engine->getTermNgramsFromString($value);
|
||||||
|
|
||||||
// If this is a stemmed term, only look for ngrams present in both the
|
// If this is a stemmed term, only look for ngrams present in both the
|
||||||
// unstemmed and stemmed variations.
|
// unstemmed and stemmed variations.
|
||||||
if ($is_stemmed) {
|
if ($is_stemmed) {
|
||||||
$stem_value = $stemmer->stemToken($value);
|
$stem_value = $stemmer->stemToken($value);
|
||||||
$stem_ngrams = $engine->getNgramsFromString(
|
$stem_ngrams = $engine->getTermNgramsFromString($stem_value);
|
||||||
$stem_value,
|
|
||||||
'index');
|
|
||||||
|
|
||||||
$ngrams = array_intersect($ngrams, $stem_ngrams);
|
$ngrams = array_intersect($ngrams, $stem_ngrams);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1652,7 +1649,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
||||||
|
|
||||||
$term_constraints = array();
|
$term_constraints = array();
|
||||||
|
|
||||||
$term_value = ' '.$engine->newTermsCorpus($value).' ';
|
$term_value = $engine->newTermsCorpus($value);
|
||||||
if ($is_not) {
|
if ($is_not) {
|
||||||
$term_constraints[] = qsprintf(
|
$term_constraints[] = qsprintf(
|
||||||
$conn,
|
$conn,
|
||||||
|
@ -1670,7 +1667,6 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
||||||
if ($is_stemmed) {
|
if ($is_stemmed) {
|
||||||
$stem_value = $stemmer->stemToken($value);
|
$stem_value = $stemmer->stemToken($value);
|
||||||
$stem_value = $engine->newTermsCorpus($stem_value);
|
$stem_value = $engine->newTermsCorpus($stem_value);
|
||||||
$stem_value = ' '.$stem_value.' ';
|
|
||||||
|
|
||||||
$term_constraints[] = qsprintf(
|
$term_constraints[] = qsprintf(
|
||||||
$conn,
|
$conn,
|
||||||
|
|
Loading…
Reference in a new issue