mirror of
https://we.phorge.it/source/phorge.git
synced 2024-11-29 18:22:41 +01:00
Provide some "term vs substring" support for the Ferret engine
Summary: Ref T12819. Distinguishes between "term" queries and "substring" queries, and tries to match them correctly most of the time. For example: - `example` matches "example", obviously. - `~amp` matches "example", but `amp` does not. - `examples` matches "example" through stemming. - `"examples"` does not match "example" (quoted text does not stem). - `"an examp"` does not match "an example" (quoted text is still term text). - `~"an examp"` matches "an example" (quoted, substring-operator text uses substring search). Test Plan: Ran searches similar to the above, they seemed to do what they should. Reviewers: chad Reviewed By: chad Maniphest Tasks: T12819 Differential Revision: https://secure.phabricator.com/D18500
This commit is contained in:
parent
e5a495f435
commit
df9c24e750
3 changed files with 124 additions and 43 deletions
|
@ -92,8 +92,8 @@ final class ManiphestTaskSearchEngine
|
||||||
->setLabel(pht('Contains Words'))
|
->setLabel(pht('Contains Words'))
|
||||||
->setKey('fulltext'),
|
->setKey('fulltext'),
|
||||||
id(new PhabricatorSearchTextField())
|
id(new PhabricatorSearchTextField())
|
||||||
->setLabel(pht('Matches (Prototype)'))
|
->setLabel(pht('Query (Prototype)'))
|
||||||
->setKey('ferret')
|
->setKey('query')
|
||||||
->setIsHidden($hide_ferret),
|
->setIsHidden($hide_ferret),
|
||||||
id(new PhabricatorSearchThreeStateField())
|
id(new PhabricatorSearchThreeStateField())
|
||||||
->setLabel(pht('Open Parents'))
|
->setLabel(pht('Open Parents'))
|
||||||
|
@ -150,8 +150,8 @@ final class ManiphestTaskSearchEngine
|
||||||
'statuses',
|
'statuses',
|
||||||
'priorities',
|
'priorities',
|
||||||
'subtypes',
|
'subtypes',
|
||||||
|
'query',
|
||||||
'fulltext',
|
'fulltext',
|
||||||
'ferret',
|
|
||||||
'hasParents',
|
'hasParents',
|
||||||
'hasSubtasks',
|
'hasSubtasks',
|
||||||
'parentIDs',
|
'parentIDs',
|
||||||
|
@ -231,8 +231,8 @@ final class ManiphestTaskSearchEngine
|
||||||
$query->withFullTextSearch($map['fulltext']);
|
$query->withFullTextSearch($map['fulltext']);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (strlen($map['ferret'])) {
|
if (strlen($map['query'])) {
|
||||||
$raw_query = $map['ferret'];
|
$raw_query = $map['query'];
|
||||||
|
|
||||||
$compiler = id(new PhutilSearchQueryCompiler())
|
$compiler = id(new PhutilSearchQueryCompiler())
|
||||||
->setEnableFunctions(true);
|
->setEnableFunctions(true);
|
||||||
|
|
|
@ -50,9 +50,11 @@ final class PhabricatorFerretFulltextEngineExtension
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
$normal_corpus = $stemmer->stemCorpus($raw_corpus);
|
|
||||||
$term_corpus = $ngram_engine->newTermsCorpus($raw_corpus);
|
$term_corpus = $ngram_engine->newTermsCorpus($raw_corpus);
|
||||||
|
|
||||||
|
$normal_corpus = $stemmer->stemCorpus($raw_corpus);
|
||||||
|
$normal_coprus = $ngram_engine->newTermsCorpus($normal_corpus);
|
||||||
|
|
||||||
if (!isset($ferret_corpus_map[$key])) {
|
if (!isset($ferret_corpus_map[$key])) {
|
||||||
$ferret_corpus_map[$key] = $empty_template;
|
$ferret_corpus_map[$key] = $empty_template;
|
||||||
}
|
}
|
||||||
|
@ -67,16 +69,23 @@ final class PhabricatorFerretFulltextEngineExtension
|
||||||
}
|
}
|
||||||
|
|
||||||
$ferret_fields = array();
|
$ferret_fields = array();
|
||||||
|
$ngrams_source = array();
|
||||||
foreach ($ferret_corpus_map as $key => $fields) {
|
foreach ($ferret_corpus_map as $key => $fields) {
|
||||||
$raw_corpus = $fields['raw'];
|
$raw_corpus = $fields['raw'];
|
||||||
$raw_corpus = implode("\n", $raw_corpus);
|
$raw_corpus = implode("\n", $raw_corpus);
|
||||||
|
$ngrams_source[] = $raw_corpus;
|
||||||
|
|
||||||
$normal_corpus = $fields['normal'];
|
$normal_corpus = $fields['normal'];
|
||||||
$normal_corpus = implode("\n", $normal_corpus);
|
$normal_corpus = implode(' ', $normal_corpus);
|
||||||
|
if (strlen($normal_corpus)) {
|
||||||
|
$ngrams_source[] = $normal_corpus;
|
||||||
|
$normal_corpus = ' '.$normal_corpus.' ';
|
||||||
|
}
|
||||||
|
|
||||||
$term_corpus = $fields['term'];
|
$term_corpus = $fields['term'];
|
||||||
$term_corpus = implode(' ', $term_corpus);
|
$term_corpus = implode(' ', $term_corpus);
|
||||||
if (strlen($term_corpus)) {
|
if (strlen($term_corpus)) {
|
||||||
|
$ngrams_source[] = $term_corpus;
|
||||||
$term_corpus = ' '.$term_corpus.' ';
|
$term_corpus = ' '.$term_corpus.' ';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -86,9 +95,7 @@ final class PhabricatorFerretFulltextEngineExtension
|
||||||
->setTermCorpus($term_corpus)
|
->setTermCorpus($term_corpus)
|
||||||
->setNormalCorpus($normal_corpus);
|
->setNormalCorpus($normal_corpus);
|
||||||
}
|
}
|
||||||
|
$ngrams_source = implode(' ', $ngrams_source);
|
||||||
$ngrams_source = $ferret_corpus_map[$key_all]['raw'];
|
|
||||||
$ngrams_source = implode("\n", $ngrams_source);
|
|
||||||
|
|
||||||
$ngrams = $ngram_engine->getNgramsFromString($ngrams_source, 'index');
|
$ngrams = $ngram_engine->getNgramsFromString($ngrams_source, 'index');
|
||||||
|
|
||||||
|
|
|
@ -1409,8 +1409,11 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
||||||
return array();
|
return array();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING;
|
||||||
|
|
||||||
$engine = $this->ferretEngine;
|
$engine = $this->ferretEngine;
|
||||||
$ngram_engine = new PhabricatorNgramEngine();
|
$ngram_engine = new PhabricatorNgramEngine();
|
||||||
|
$stemmer = new PhutilSearchStemmer();
|
||||||
|
|
||||||
$ngram_table = $engine->newNgramsObject();
|
$ngram_table = $engine->newNgramsObject();
|
||||||
$ngram_table_name = $ngram_table->getTableName();
|
$ngram_table_name = $ngram_table->getTableName();
|
||||||
|
@ -1422,22 +1425,49 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
||||||
|
|
||||||
$length = count(phutil_utf8v($value));
|
$length = count(phutil_utf8v($value));
|
||||||
|
|
||||||
if ($length >= 3) {
|
if ($raw_token->getOperator() == $op_sub) {
|
||||||
$ngrams = $ngram_engine->getNgramsFromString($value, 'query');
|
$is_substring = true;
|
||||||
$prefix = false;
|
|
||||||
} else if ($length == 2) {
|
|
||||||
$ngrams = $ngram_engine->getNgramsFromString($value, 'prefix');
|
|
||||||
$prefix = false;
|
|
||||||
} else {
|
} else {
|
||||||
$ngrams = array(' '.$value);
|
$is_substring = false;
|
||||||
$prefix = true;
|
}
|
||||||
|
|
||||||
|
// If the user specified a substring query for a substring which is
|
||||||
|
// shorter than the ngram length, we can't use the ngram index, so
|
||||||
|
// don't do a join. We'll fall back to just doing LIKE on the full
|
||||||
|
// corpus.
|
||||||
|
if ($is_substring) {
|
||||||
|
if ($length < 3) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($raw_token->isQuoted()) {
|
||||||
|
$is_stemmed = false;
|
||||||
|
} else {
|
||||||
|
$is_stemmed = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($is_substring) {
|
||||||
|
$ngrams = $ngram_engine->getNgramsFromString($value, 'query');
|
||||||
|
} else {
|
||||||
|
$ngrams = $ngram_engine->getNgramsFromString($value, 'index');
|
||||||
|
|
||||||
|
// If this is a stemmed term, only look for ngrams present in both the
|
||||||
|
// unstemmed and stemmed variations.
|
||||||
|
if ($is_stemmed) {
|
||||||
|
$stem_value = $stemmer->stemToken($value);
|
||||||
|
$stem_ngrams = $ngram_engine->getNgramsFromString(
|
||||||
|
$stem_value,
|
||||||
|
'index');
|
||||||
|
|
||||||
|
$ngrams = array_intersect($ngrams, $stem_ngrams);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
foreach ($ngrams as $ngram) {
|
foreach ($ngrams as $ngram) {
|
||||||
$flat[] = array(
|
$flat[] = array(
|
||||||
'table' => $ngram_table_name,
|
'table' => $ngram_table_name,
|
||||||
'ngram' => $ngram,
|
'ngram' => $ngram,
|
||||||
'prefix' => $prefix,
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1472,20 +1502,9 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
||||||
foreach ($flat as $spec) {
|
foreach ($flat as $spec) {
|
||||||
$table = $spec['table'];
|
$table = $spec['table'];
|
||||||
$ngram = $spec['ngram'];
|
$ngram = $spec['ngram'];
|
||||||
$prefix = $spec['prefix'];
|
|
||||||
|
|
||||||
$alias = 'ft'.$idx++;
|
$alias = 'ft'.$idx++;
|
||||||
|
|
||||||
if ($prefix) {
|
|
||||||
$joins[] = qsprintf(
|
|
||||||
$conn,
|
|
||||||
'JOIN %T %T ON %T.documentID = ftdoc.id AND %T.ngram LIKE %>',
|
|
||||||
$table,
|
|
||||||
$alias,
|
|
||||||
$alias,
|
|
||||||
$alias,
|
|
||||||
$ngram);
|
|
||||||
} else {
|
|
||||||
$joins[] = qsprintf(
|
$joins[] = qsprintf(
|
||||||
$conn,
|
$conn,
|
||||||
'JOIN %T %T ON %T.documentID = ftdoc.id AND %T.ngram = %s',
|
'JOIN %T %T ON %T.documentID = ftdoc.id AND %T.ngram = %s',
|
||||||
|
@ -1495,7 +1514,6 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
||||||
$alias,
|
$alias,
|
||||||
$ngram);
|
$ngram);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
$joins[] = qsprintf(
|
$joins[] = qsprintf(
|
||||||
$conn,
|
$conn,
|
||||||
|
@ -1510,16 +1528,72 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
||||||
return array();
|
return array();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$ngram_engine = new PhabricatorNgramEngine();
|
||||||
|
$stemmer = new PhutilSearchStemmer();
|
||||||
|
$op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING;
|
||||||
|
|
||||||
$where = array();
|
$where = array();
|
||||||
foreach ($this->ferretTokens as $fulltext_token) {
|
foreach ($this->ferretTokens as $fulltext_token) {
|
||||||
$raw_token = $fulltext_token->getToken();
|
$raw_token = $fulltext_token->getToken();
|
||||||
$value = $raw_token->getValue();
|
$value = $raw_token->getValue();
|
||||||
|
|
||||||
|
if ($raw_token->getOperator() == $op_sub) {
|
||||||
|
$is_substring = true;
|
||||||
|
} else {
|
||||||
|
$is_substring = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we're doing substring search, we just match against the raw corpus
|
||||||
|
// and we're done.
|
||||||
|
if ($is_substring) {
|
||||||
$where[] = qsprintf(
|
$where[] = qsprintf(
|
||||||
$conn,
|
$conn,
|
||||||
'(ftfield.rawCorpus LIKE %~ OR ftfield.normalCorpus LIKE %~)',
|
'(ftfield.rawCorpus LIKE %~)',
|
||||||
$value,
|
|
||||||
$value);
|
$value);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Otherwise, we need to match against the term corpus and the normal
|
||||||
|
// corpus, so that searching for "raw" does not find "strawberry".
|
||||||
|
if ($raw_token->isQuoted()) {
|
||||||
|
$is_quoted = true;
|
||||||
|
$is_stemmed = false;
|
||||||
|
} else {
|
||||||
|
$is_quoted = false;
|
||||||
|
$is_stemmed = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
$term_constraints = array();
|
||||||
|
|
||||||
|
$term_value = ' '.$ngram_engine->newTermsCorpus($value).' ';
|
||||||
|
$term_constraints[] = qsprintf(
|
||||||
|
$conn,
|
||||||
|
'(ftfield.termCorpus LIKE %~)',
|
||||||
|
$term_value);
|
||||||
|
|
||||||
|
if ($is_stemmed) {
|
||||||
|
$stem_value = $stemmer->stemToken($value);
|
||||||
|
$stem_value = $ngram_engine->newTermsCorpus($stem_value);
|
||||||
|
$stem_value = ' '.$stem_value.' ';
|
||||||
|
|
||||||
|
$term_constraints[] = qsprintf(
|
||||||
|
$conn,
|
||||||
|
'(ftfield.normalCorpus LIKE %~)',
|
||||||
|
$stem_value);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($is_quoted) {
|
||||||
|
$where[] = qsprintf(
|
||||||
|
$conn,
|
||||||
|
'(ftfield.rawCorpus LIKE %~ AND (%Q))',
|
||||||
|
$value,
|
||||||
|
implode(' OR ', $term_constraints));
|
||||||
|
} else {
|
||||||
|
$where[] = qsprintf(
|
||||||
|
$conn,
|
||||||
|
'(%Q)',
|
||||||
|
implode(' OR ', $term_constraints));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return $where;
|
return $where;
|
||||||
|
|
Loading…
Reference in a new issue