1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2024-11-10 08:52:39 +01:00

Provide some "term vs substring" support for the Ferret engine

Summary:
Ref T12819. Distinguishes between "term" queries and "substring" queries, and tries to match them correctly most of the time. For example:

  - `example` matches "example", obviously.
  - `~amp` matches "example", but `amp` does not.
  - `examples` matches "example" through stemming.
  - `"examples"` does not match "example" (quoted text does not stem).
  - `"an examp"` does not match "an example" (quoted text is still term text).
  - `~"an examp"` matches "an example" (quoted, substring-operator text uses substring search).

Test Plan: Ran searches similar to the above; they seemed to do what they should.

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T12819

Differential Revision: https://secure.phabricator.com/D18500
This commit is contained in:
epriestley 2017-08-30 09:24:57 -07:00
parent e5a495f435
commit df9c24e750
3 changed files with 124 additions and 43 deletions

View file

@ -92,8 +92,8 @@ final class ManiphestTaskSearchEngine
->setLabel(pht('Contains Words')) ->setLabel(pht('Contains Words'))
->setKey('fulltext'), ->setKey('fulltext'),
id(new PhabricatorSearchTextField()) id(new PhabricatorSearchTextField())
->setLabel(pht('Matches (Prototype)')) ->setLabel(pht('Query (Prototype)'))
->setKey('ferret') ->setKey('query')
->setIsHidden($hide_ferret), ->setIsHidden($hide_ferret),
id(new PhabricatorSearchThreeStateField()) id(new PhabricatorSearchThreeStateField())
->setLabel(pht('Open Parents')) ->setLabel(pht('Open Parents'))
@ -150,8 +150,8 @@ final class ManiphestTaskSearchEngine
'statuses', 'statuses',
'priorities', 'priorities',
'subtypes', 'subtypes',
'query',
'fulltext', 'fulltext',
'ferret',
'hasParents', 'hasParents',
'hasSubtasks', 'hasSubtasks',
'parentIDs', 'parentIDs',
@ -231,8 +231,8 @@ final class ManiphestTaskSearchEngine
$query->withFullTextSearch($map['fulltext']); $query->withFullTextSearch($map['fulltext']);
} }
if (strlen($map['ferret'])) { if (strlen($map['query'])) {
$raw_query = $map['ferret']; $raw_query = $map['query'];
$compiler = id(new PhutilSearchQueryCompiler()) $compiler = id(new PhutilSearchQueryCompiler())
->setEnableFunctions(true); ->setEnableFunctions(true);

View file

@ -50,9 +50,11 @@ final class PhabricatorFerretFulltextEngineExtension
continue; continue;
} }
$normal_corpus = $stemmer->stemCorpus($raw_corpus);
$term_corpus = $ngram_engine->newTermsCorpus($raw_corpus); $term_corpus = $ngram_engine->newTermsCorpus($raw_corpus);
$normal_corpus = $stemmer->stemCorpus($raw_corpus);
$normal_coprus = $ngram_engine->newTermsCorpus($normal_corpus);
if (!isset($ferret_corpus_map[$key])) { if (!isset($ferret_corpus_map[$key])) {
$ferret_corpus_map[$key] = $empty_template; $ferret_corpus_map[$key] = $empty_template;
} }
@ -67,16 +69,23 @@ final class PhabricatorFerretFulltextEngineExtension
} }
$ferret_fields = array(); $ferret_fields = array();
$ngrams_source = array();
foreach ($ferret_corpus_map as $key => $fields) { foreach ($ferret_corpus_map as $key => $fields) {
$raw_corpus = $fields['raw']; $raw_corpus = $fields['raw'];
$raw_corpus = implode("\n", $raw_corpus); $raw_corpus = implode("\n", $raw_corpus);
$ngrams_source[] = $raw_corpus;
$normal_corpus = $fields['normal']; $normal_corpus = $fields['normal'];
$normal_corpus = implode("\n", $normal_corpus); $normal_corpus = implode(' ', $normal_corpus);
if (strlen($normal_corpus)) {
$ngrams_source[] = $normal_corpus;
$normal_corpus = ' '.$normal_corpus.' ';
}
$term_corpus = $fields['term']; $term_corpus = $fields['term'];
$term_corpus = implode(' ', $term_corpus); $term_corpus = implode(' ', $term_corpus);
if (strlen($term_corpus)) { if (strlen($term_corpus)) {
$ngrams_source[] = $term_corpus;
$term_corpus = ' '.$term_corpus.' '; $term_corpus = ' '.$term_corpus.' ';
} }
@ -86,9 +95,7 @@ final class PhabricatorFerretFulltextEngineExtension
->setTermCorpus($term_corpus) ->setTermCorpus($term_corpus)
->setNormalCorpus($normal_corpus); ->setNormalCorpus($normal_corpus);
} }
$ngrams_source = implode(' ', $ngrams_source);
$ngrams_source = $ferret_corpus_map[$key_all]['raw'];
$ngrams_source = implode("\n", $ngrams_source);
$ngrams = $ngram_engine->getNgramsFromString($ngrams_source, 'index'); $ngrams = $ngram_engine->getNgramsFromString($ngrams_source, 'index');

View file

@ -1409,8 +1409,11 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
return array(); return array();
} }
$op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING;
$engine = $this->ferretEngine; $engine = $this->ferretEngine;
$ngram_engine = new PhabricatorNgramEngine(); $ngram_engine = new PhabricatorNgramEngine();
$stemmer = new PhutilSearchStemmer();
$ngram_table = $engine->newNgramsObject(); $ngram_table = $engine->newNgramsObject();
$ngram_table_name = $ngram_table->getTableName(); $ngram_table_name = $ngram_table->getTableName();
@ -1422,22 +1425,49 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
$length = count(phutil_utf8v($value)); $length = count(phutil_utf8v($value));
if ($length >= 3) { if ($raw_token->getOperator() == $op_sub) {
$ngrams = $ngram_engine->getNgramsFromString($value, 'query'); $is_substring = true;
$prefix = false;
} else if ($length == 2) {
$ngrams = $ngram_engine->getNgramsFromString($value, 'prefix');
$prefix = false;
} else { } else {
$ngrams = array(' '.$value); $is_substring = false;
$prefix = true; }
// If the user specified a substring query for a substring which is
// shorter than the ngram length, we can't use the ngram index, so
// don't do a join. We'll fall back to just doing LIKE on the full
// corpus.
if ($is_substring) {
if ($length < 3) {
continue;
}
}
if ($raw_token->isQuoted()) {
$is_stemmed = false;
} else {
$is_stemmed = true;
}
if ($is_substring) {
$ngrams = $ngram_engine->getNgramsFromString($value, 'query');
} else {
$ngrams = $ngram_engine->getNgramsFromString($value, 'index');
// If this is a stemmed term, only look for ngrams present in both the
// unstemmed and stemmed variations.
if ($is_stemmed) {
$stem_value = $stemmer->stemToken($value);
$stem_ngrams = $ngram_engine->getNgramsFromString(
$stem_value,
'index');
$ngrams = array_intersect($ngrams, $stem_ngrams);
}
} }
foreach ($ngrams as $ngram) { foreach ($ngrams as $ngram) {
$flat[] = array( $flat[] = array(
'table' => $ngram_table_name, 'table' => $ngram_table_name,
'ngram' => $ngram, 'ngram' => $ngram,
'prefix' => $prefix,
); );
} }
} }
@ -1472,29 +1502,17 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
foreach ($flat as $spec) { foreach ($flat as $spec) {
$table = $spec['table']; $table = $spec['table'];
$ngram = $spec['ngram']; $ngram = $spec['ngram'];
$prefix = $spec['prefix'];
$alias = 'ft'.$idx++; $alias = 'ft'.$idx++;
if ($prefix) { $joins[] = qsprintf(
$joins[] = qsprintf( $conn,
$conn, 'JOIN %T %T ON %T.documentID = ftdoc.id AND %T.ngram = %s',
'JOIN %T %T ON %T.documentID = ftdoc.id AND %T.ngram LIKE %>', $table,
$table, $alias,
$alias, $alias,
$alias, $alias,
$alias, $ngram);
$ngram);
} else {
$joins[] = qsprintf(
$conn,
'JOIN %T %T ON %T.documentID = ftdoc.id AND %T.ngram = %s',
$table,
$alias,
$alias,
$alias,
$ngram);
}
} }
$joins[] = qsprintf( $joins[] = qsprintf(
@ -1510,16 +1528,72 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
return array(); return array();
} }
$ngram_engine = new PhabricatorNgramEngine();
$stemmer = new PhutilSearchStemmer();
$op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING;
$where = array(); $where = array();
foreach ($this->ferretTokens as $fulltext_token) { foreach ($this->ferretTokens as $fulltext_token) {
$raw_token = $fulltext_token->getToken(); $raw_token = $fulltext_token->getToken();
$value = $raw_token->getValue(); $value = $raw_token->getValue();
$where[] = qsprintf( if ($raw_token->getOperator() == $op_sub) {
$is_substring = true;
} else {
$is_substring = false;
}
// If we're doing substring search, we just match against the raw corpus
// and we're done.
if ($is_substring) {
$where[] = qsprintf(
$conn,
'(ftfield.rawCorpus LIKE %~)',
$value);
continue;
}
// Otherwise, we need to match against the term corpus and the normal
// corpus, so that searching for "raw" does not find "strawberry".
if ($raw_token->isQuoted()) {
$is_quoted = true;
$is_stemmed = false;
} else {
$is_quoted = false;
$is_stemmed = true;
}
$term_constraints = array();
$term_value = ' '.$ngram_engine->newTermsCorpus($value).' ';
$term_constraints[] = qsprintf(
$conn, $conn,
'(ftfield.rawCorpus LIKE %~ OR ftfield.normalCorpus LIKE %~)', '(ftfield.termCorpus LIKE %~)',
$value, $term_value);
$value);
if ($is_stemmed) {
$stem_value = $stemmer->stemToken($value);
$stem_value = $ngram_engine->newTermsCorpus($stem_value);
$stem_value = ' '.$stem_value.' ';
$term_constraints[] = qsprintf(
$conn,
'(ftfield.normalCorpus LIKE %~)',
$stem_value);
}
if ($is_quoted) {
$where[] = qsprintf(
$conn,
'(ftfield.rawCorpus LIKE %~ AND (%Q))',
$value,
implode(' OR ', $term_constraints));
} else {
$where[] = qsprintf(
$conn,
'(%Q)',
implode(' OR ', $term_constraints));
}
} }
return $where; return $where;