mirror of
https://we.phorge.it/source/phorge.git
synced 2025-01-14 00:31:05 +01:00
Provide some "term vs substring" support for the Ferret engine
Summary: Ref T12819. Distinguishes between "term" queries and "substring" queries, and tries to match them correctly most of the time. For example: - `example` matches "example", obviously. - `~amp` matches "example", but `amp` does not. - `examples` matches "example" through stemming. - `"examples"` does not match "example" (quoted text does not stem). - `"an examp"` does not match "an example" (quoted text is still term text). - `~"an examp"` matches "an example" (quoted, substring-operator text uses substring search). Test Plan: Ran searches similar to the above, they seemed to do what they should. Reviewers: chad Reviewed By: chad Maniphest Tasks: T12819 Differential Revision: https://secure.phabricator.com/D18500
This commit is contained in:
parent
e5a495f435
commit
df9c24e750
3 changed files with 124 additions and 43 deletions
|
@ -92,8 +92,8 @@ final class ManiphestTaskSearchEngine
|
|||
->setLabel(pht('Contains Words'))
|
||||
->setKey('fulltext'),
|
||||
id(new PhabricatorSearchTextField())
|
||||
->setLabel(pht('Matches (Prototype)'))
|
||||
->setKey('ferret')
|
||||
->setLabel(pht('Query (Prototype)'))
|
||||
->setKey('query')
|
||||
->setIsHidden($hide_ferret),
|
||||
id(new PhabricatorSearchThreeStateField())
|
||||
->setLabel(pht('Open Parents'))
|
||||
|
@ -150,8 +150,8 @@ final class ManiphestTaskSearchEngine
|
|||
'statuses',
|
||||
'priorities',
|
||||
'subtypes',
|
||||
'query',
|
||||
'fulltext',
|
||||
'ferret',
|
||||
'hasParents',
|
||||
'hasSubtasks',
|
||||
'parentIDs',
|
||||
|
@ -231,8 +231,8 @@ final class ManiphestTaskSearchEngine
|
|||
$query->withFullTextSearch($map['fulltext']);
|
||||
}
|
||||
|
||||
if (strlen($map['ferret'])) {
|
||||
$raw_query = $map['ferret'];
|
||||
if (strlen($map['query'])) {
|
||||
$raw_query = $map['query'];
|
||||
|
||||
$compiler = id(new PhutilSearchQueryCompiler())
|
||||
->setEnableFunctions(true);
|
||||
|
|
|
@ -50,9 +50,11 @@ final class PhabricatorFerretFulltextEngineExtension
|
|||
continue;
|
||||
}
|
||||
|
||||
$normal_corpus = $stemmer->stemCorpus($raw_corpus);
|
||||
$term_corpus = $ngram_engine->newTermsCorpus($raw_corpus);
|
||||
|
||||
$normal_corpus = $stemmer->stemCorpus($raw_corpus);
|
||||
$normal_coprus = $ngram_engine->newTermsCorpus($normal_corpus);
|
||||
|
||||
if (!isset($ferret_corpus_map[$key])) {
|
||||
$ferret_corpus_map[$key] = $empty_template;
|
||||
}
|
||||
|
@ -67,16 +69,23 @@ final class PhabricatorFerretFulltextEngineExtension
|
|||
}
|
||||
|
||||
$ferret_fields = array();
|
||||
$ngrams_source = array();
|
||||
foreach ($ferret_corpus_map as $key => $fields) {
|
||||
$raw_corpus = $fields['raw'];
|
||||
$raw_corpus = implode("\n", $raw_corpus);
|
||||
$ngrams_source[] = $raw_corpus;
|
||||
|
||||
$normal_corpus = $fields['normal'];
|
||||
$normal_corpus = implode("\n", $normal_corpus);
|
||||
$normal_corpus = implode(' ', $normal_corpus);
|
||||
if (strlen($normal_corpus)) {
|
||||
$ngrams_source[] = $normal_corpus;
|
||||
$normal_corpus = ' '.$normal_corpus.' ';
|
||||
}
|
||||
|
||||
$term_corpus = $fields['term'];
|
||||
$term_corpus = implode(' ', $term_corpus);
|
||||
if (strlen($term_corpus)) {
|
||||
$ngrams_source[] = $term_corpus;
|
||||
$term_corpus = ' '.$term_corpus.' ';
|
||||
}
|
||||
|
||||
|
@ -86,9 +95,7 @@ final class PhabricatorFerretFulltextEngineExtension
|
|||
->setTermCorpus($term_corpus)
|
||||
->setNormalCorpus($normal_corpus);
|
||||
}
|
||||
|
||||
$ngrams_source = $ferret_corpus_map[$key_all]['raw'];
|
||||
$ngrams_source = implode("\n", $ngrams_source);
|
||||
$ngrams_source = implode(' ', $ngrams_source);
|
||||
|
||||
$ngrams = $ngram_engine->getNgramsFromString($ngrams_source, 'index');
|
||||
|
||||
|
|
|
@ -1409,8 +1409,11 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
|||
return array();
|
||||
}
|
||||
|
||||
$op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING;
|
||||
|
||||
$engine = $this->ferretEngine;
|
||||
$ngram_engine = new PhabricatorNgramEngine();
|
||||
$stemmer = new PhutilSearchStemmer();
|
||||
|
||||
$ngram_table = $engine->newNgramsObject();
|
||||
$ngram_table_name = $ngram_table->getTableName();
|
||||
|
@ -1422,22 +1425,49 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
|||
|
||||
$length = count(phutil_utf8v($value));
|
||||
|
||||
if ($length >= 3) {
|
||||
$ngrams = $ngram_engine->getNgramsFromString($value, 'query');
|
||||
$prefix = false;
|
||||
} else if ($length == 2) {
|
||||
$ngrams = $ngram_engine->getNgramsFromString($value, 'prefix');
|
||||
$prefix = false;
|
||||
if ($raw_token->getOperator() == $op_sub) {
|
||||
$is_substring = true;
|
||||
} else {
|
||||
$ngrams = array(' '.$value);
|
||||
$prefix = true;
|
||||
$is_substring = false;
|
||||
}
|
||||
|
||||
// If the user specified a substring query for a substring which is
|
||||
// shorter than the ngram length, we can't use the ngram index, so
|
||||
// don't do a join. We'll fall back to just doing LIKE on the full
|
||||
// corpus.
|
||||
if ($is_substring) {
|
||||
if ($length < 3) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if ($raw_token->isQuoted()) {
|
||||
$is_stemmed = false;
|
||||
} else {
|
||||
$is_stemmed = true;
|
||||
}
|
||||
|
||||
if ($is_substring) {
|
||||
$ngrams = $ngram_engine->getNgramsFromString($value, 'query');
|
||||
} else {
|
||||
$ngrams = $ngram_engine->getNgramsFromString($value, 'index');
|
||||
|
||||
// If this is a stemmed term, only look for ngrams present in both the
|
||||
// unstemmed and stemmed variations.
|
||||
if ($is_stemmed) {
|
||||
$stem_value = $stemmer->stemToken($value);
|
||||
$stem_ngrams = $ngram_engine->getNgramsFromString(
|
||||
$stem_value,
|
||||
'index');
|
||||
|
||||
$ngrams = array_intersect($ngrams, $stem_ngrams);
|
||||
}
|
||||
}
|
||||
|
||||
foreach ($ngrams as $ngram) {
|
||||
$flat[] = array(
|
||||
'table' => $ngram_table_name,
|
||||
'ngram' => $ngram,
|
||||
'prefix' => $prefix,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -1472,29 +1502,17 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
|||
foreach ($flat as $spec) {
|
||||
$table = $spec['table'];
|
||||
$ngram = $spec['ngram'];
|
||||
$prefix = $spec['prefix'];
|
||||
|
||||
$alias = 'ft'.$idx++;
|
||||
|
||||
if ($prefix) {
|
||||
$joins[] = qsprintf(
|
||||
$conn,
|
||||
'JOIN %T %T ON %T.documentID = ftdoc.id AND %T.ngram LIKE %>',
|
||||
$table,
|
||||
$alias,
|
||||
$alias,
|
||||
$alias,
|
||||
$ngram);
|
||||
} else {
|
||||
$joins[] = qsprintf(
|
||||
$conn,
|
||||
'JOIN %T %T ON %T.documentID = ftdoc.id AND %T.ngram = %s',
|
||||
$table,
|
||||
$alias,
|
||||
$alias,
|
||||
$alias,
|
||||
$ngram);
|
||||
}
|
||||
$joins[] = qsprintf(
|
||||
$conn,
|
||||
'JOIN %T %T ON %T.documentID = ftdoc.id AND %T.ngram = %s',
|
||||
$table,
|
||||
$alias,
|
||||
$alias,
|
||||
$alias,
|
||||
$ngram);
|
||||
}
|
||||
|
||||
$joins[] = qsprintf(
|
||||
|
@ -1510,16 +1528,72 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
|||
return array();
|
||||
}
|
||||
|
||||
$ngram_engine = new PhabricatorNgramEngine();
|
||||
$stemmer = new PhutilSearchStemmer();
|
||||
$op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING;
|
||||
|
||||
$where = array();
|
||||
foreach ($this->ferretTokens as $fulltext_token) {
|
||||
$raw_token = $fulltext_token->getToken();
|
||||
$value = $raw_token->getValue();
|
||||
|
||||
$where[] = qsprintf(
|
||||
if ($raw_token->getOperator() == $op_sub) {
|
||||
$is_substring = true;
|
||||
} else {
|
||||
$is_substring = false;
|
||||
}
|
||||
|
||||
// If we're doing substring search, we just match against the raw corpus
|
||||
// and we're done.
|
||||
if ($is_substring) {
|
||||
$where[] = qsprintf(
|
||||
$conn,
|
||||
'(ftfield.rawCorpus LIKE %~)',
|
||||
$value);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Otherwise, we need to match against the term corpus and the normal
|
||||
// corpus, so that searching for "raw" does not find "strawberry".
|
||||
if ($raw_token->isQuoted()) {
|
||||
$is_quoted = true;
|
||||
$is_stemmed = false;
|
||||
} else {
|
||||
$is_quoted = false;
|
||||
$is_stemmed = true;
|
||||
}
|
||||
|
||||
$term_constraints = array();
|
||||
|
||||
$term_value = ' '.$ngram_engine->newTermsCorpus($value).' ';
|
||||
$term_constraints[] = qsprintf(
|
||||
$conn,
|
||||
'(ftfield.rawCorpus LIKE %~ OR ftfield.normalCorpus LIKE %~)',
|
||||
$value,
|
||||
$value);
|
||||
'(ftfield.termCorpus LIKE %~)',
|
||||
$term_value);
|
||||
|
||||
if ($is_stemmed) {
|
||||
$stem_value = $stemmer->stemToken($value);
|
||||
$stem_value = $ngram_engine->newTermsCorpus($stem_value);
|
||||
$stem_value = ' '.$stem_value.' ';
|
||||
|
||||
$term_constraints[] = qsprintf(
|
||||
$conn,
|
||||
'(ftfield.normalCorpus LIKE %~)',
|
||||
$stem_value);
|
||||
}
|
||||
|
||||
if ($is_quoted) {
|
||||
$where[] = qsprintf(
|
||||
$conn,
|
||||
'(ftfield.rawCorpus LIKE %~ AND (%Q))',
|
||||
$value,
|
||||
implode(' OR ', $term_constraints));
|
||||
} else {
|
||||
$where[] = qsprintf(
|
||||
$conn,
|
||||
'(%Q)',
|
||||
implode(' OR ', $term_constraints));
|
||||
}
|
||||
}
|
||||
|
||||
return $where;
|
||||
|
|
Loading…
Reference in a new issue