1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2025-01-14 00:31:05 +01:00

Provide some "term vs substring" support for the Ferret engine

Summary:
Ref T12819. Distinguishes between "term" queries and "substring" queries, and tries to match them correctly most of the time. For example:

  - `example` matches "example", obviously.
  - `~amp` matches "example", but `amp` does not.
  - `examples` matches "example" through stemming.
  - `"examples"` does not match "example" (quoted text does not stem).
  - `"an examp"` does not match "an example" (quoted text is still term text).
  - `~"an examp"` matches "an example" (quoted, substring-operator text uses substring search).

Test Plan: Ran searches similar to the above, they seemed to do what they should.

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T12819

Differential Revision: https://secure.phabricator.com/D18500
This commit is contained in:
epriestley 2017-08-30 09:24:57 -07:00
parent e5a495f435
commit df9c24e750
3 changed files with 124 additions and 43 deletions

View file

@ -92,8 +92,8 @@ final class ManiphestTaskSearchEngine
->setLabel(pht('Contains Words'))
->setKey('fulltext'),
id(new PhabricatorSearchTextField())
->setLabel(pht('Matches (Prototype)'))
->setKey('ferret')
->setLabel(pht('Query (Prototype)'))
->setKey('query')
->setIsHidden($hide_ferret),
id(new PhabricatorSearchThreeStateField())
->setLabel(pht('Open Parents'))
@ -150,8 +150,8 @@ final class ManiphestTaskSearchEngine
'statuses',
'priorities',
'subtypes',
'query',
'fulltext',
'ferret',
'hasParents',
'hasSubtasks',
'parentIDs',
@ -231,8 +231,8 @@ final class ManiphestTaskSearchEngine
$query->withFullTextSearch($map['fulltext']);
}
if (strlen($map['ferret'])) {
$raw_query = $map['ferret'];
if (strlen($map['query'])) {
$raw_query = $map['query'];
$compiler = id(new PhutilSearchQueryCompiler())
->setEnableFunctions(true);

View file

@ -50,9 +50,11 @@ final class PhabricatorFerretFulltextEngineExtension
continue;
}
$normal_corpus = $stemmer->stemCorpus($raw_corpus);
$term_corpus = $ngram_engine->newTermsCorpus($raw_corpus);
$normal_corpus = $stemmer->stemCorpus($raw_corpus);
$normal_coprus = $ngram_engine->newTermsCorpus($normal_corpus);
if (!isset($ferret_corpus_map[$key])) {
$ferret_corpus_map[$key] = $empty_template;
}
@ -67,16 +69,23 @@ final class PhabricatorFerretFulltextEngineExtension
}
$ferret_fields = array();
$ngrams_source = array();
foreach ($ferret_corpus_map as $key => $fields) {
$raw_corpus = $fields['raw'];
$raw_corpus = implode("\n", $raw_corpus);
$ngrams_source[] = $raw_corpus;
$normal_corpus = $fields['normal'];
$normal_corpus = implode("\n", $normal_corpus);
$normal_corpus = implode(' ', $normal_corpus);
if (strlen($normal_corpus)) {
$ngrams_source[] = $normal_corpus;
$normal_corpus = ' '.$normal_corpus.' ';
}
$term_corpus = $fields['term'];
$term_corpus = implode(' ', $term_corpus);
if (strlen($term_corpus)) {
$ngrams_source[] = $term_corpus;
$term_corpus = ' '.$term_corpus.' ';
}
@ -86,9 +95,7 @@ final class PhabricatorFerretFulltextEngineExtension
->setTermCorpus($term_corpus)
->setNormalCorpus($normal_corpus);
}
$ngrams_source = $ferret_corpus_map[$key_all]['raw'];
$ngrams_source = implode("\n", $ngrams_source);
$ngrams_source = implode(' ', $ngrams_source);
$ngrams = $ngram_engine->getNgramsFromString($ngrams_source, 'index');

View file

@ -1409,8 +1409,11 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
return array();
}
$op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING;
$engine = $this->ferretEngine;
$ngram_engine = new PhabricatorNgramEngine();
$stemmer = new PhutilSearchStemmer();
$ngram_table = $engine->newNgramsObject();
$ngram_table_name = $ngram_table->getTableName();
@ -1422,22 +1425,49 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
$length = count(phutil_utf8v($value));
if ($length >= 3) {
$ngrams = $ngram_engine->getNgramsFromString($value, 'query');
$prefix = false;
} else if ($length == 2) {
$ngrams = $ngram_engine->getNgramsFromString($value, 'prefix');
$prefix = false;
if ($raw_token->getOperator() == $op_sub) {
$is_substring = true;
} else {
$ngrams = array(' '.$value);
$prefix = true;
$is_substring = false;
}
// If the user specified a substring query for a substring which is
// shorter than the ngram length, we can't use the ngram index, so
// don't do a join. We'll fall back to just doing LIKE on the full
// corpus.
if ($is_substring) {
if ($length < 3) {
continue;
}
}
if ($raw_token->isQuoted()) {
$is_stemmed = false;
} else {
$is_stemmed = true;
}
if ($is_substring) {
$ngrams = $ngram_engine->getNgramsFromString($value, 'query');
} else {
$ngrams = $ngram_engine->getNgramsFromString($value, 'index');
// If this is a stemmed term, only look for ngrams present in both the
// unstemmed and stemmed variations.
if ($is_stemmed) {
$stem_value = $stemmer->stemToken($value);
$stem_ngrams = $ngram_engine->getNgramsFromString(
$stem_value,
'index');
$ngrams = array_intersect($ngrams, $stem_ngrams);
}
}
foreach ($ngrams as $ngram) {
$flat[] = array(
'table' => $ngram_table_name,
'ngram' => $ngram,
'prefix' => $prefix,
);
}
}
@ -1472,29 +1502,17 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
foreach ($flat as $spec) {
$table = $spec['table'];
$ngram = $spec['ngram'];
$prefix = $spec['prefix'];
$alias = 'ft'.$idx++;
if ($prefix) {
$joins[] = qsprintf(
$conn,
'JOIN %T %T ON %T.documentID = ftdoc.id AND %T.ngram LIKE %>',
$table,
$alias,
$alias,
$alias,
$ngram);
} else {
$joins[] = qsprintf(
$conn,
'JOIN %T %T ON %T.documentID = ftdoc.id AND %T.ngram = %s',
$table,
$alias,
$alias,
$alias,
$ngram);
}
$joins[] = qsprintf(
$conn,
'JOIN %T %T ON %T.documentID = ftdoc.id AND %T.ngram = %s',
$table,
$alias,
$alias,
$alias,
$ngram);
}
$joins[] = qsprintf(
@ -1510,16 +1528,72 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
return array();
}
$ngram_engine = new PhabricatorNgramEngine();
$stemmer = new PhutilSearchStemmer();
$op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING;
$where = array();
foreach ($this->ferretTokens as $fulltext_token) {
$raw_token = $fulltext_token->getToken();
$value = $raw_token->getValue();
$where[] = qsprintf(
if ($raw_token->getOperator() == $op_sub) {
$is_substring = true;
} else {
$is_substring = false;
}
// If we're doing substring search, we just match against the raw corpus
// and we're done.
if ($is_substring) {
$where[] = qsprintf(
$conn,
'(ftfield.rawCorpus LIKE %~)',
$value);
continue;
}
// Otherwise, we need to match against the term corpus and the normal
// corpus, so that searching for "raw" does not find "strawberry".
if ($raw_token->isQuoted()) {
$is_quoted = true;
$is_stemmed = false;
} else {
$is_quoted = false;
$is_stemmed = true;
}
$term_constraints = array();
$term_value = ' '.$ngram_engine->newTermsCorpus($value).' ';
$term_constraints[] = qsprintf(
$conn,
'(ftfield.rawCorpus LIKE %~ OR ftfield.normalCorpus LIKE %~)',
$value,
$value);
'(ftfield.termCorpus LIKE %~)',
$term_value);
if ($is_stemmed) {
$stem_value = $stemmer->stemToken($value);
$stem_value = $ngram_engine->newTermsCorpus($stem_value);
$stem_value = ' '.$stem_value.' ';
$term_constraints[] = qsprintf(
$conn,
'(ftfield.normalCorpus LIKE %~)',
$stem_value);
}
if ($is_quoted) {
$where[] = qsprintf(
$conn,
'(ftfield.rawCorpus LIKE %~ AND (%Q))',
$value,
implode(' OR ', $term_constraints));
} else {
$where[] = qsprintf(
$conn,
'(%Q)',
implode(' OR ', $term_constraints));
}
}
return $where;