From df9c24e750588ad9b9a6921665c8c1e0f26db461 Mon Sep 17 00:00:00 2001 From: epriestley Date: Wed, 30 Aug 2017 09:24:57 -0700 Subject: [PATCH] Provide some "term vs substring" support for the Ferret engine Summary: Ref T12819. Distinguishes between "term" queries and "substring" queries, and tries to match them correctly most of the time. For example: - `example` matches "example", obviously. - `~amp` matches "example", but `amp` does not. - `examples` matches "example" through stemming. - `"examples"` does not match "example" (quoted text does not stem). - `"an examp"` does not match "an example" (quoted text is still term text). - `~"an examp"` matches "an example" (quoted, substring-operator text uses substring search). Test Plan: Ran searches similar to the above, they seemed to do what they should. Reviewers: chad Reviewed By: chad Maniphest Tasks: T12819 Differential Revision: https://secure.phabricator.com/D18500 --- .../query/ManiphestTaskSearchEngine.php | 10 +- ...abricatorFerretFulltextEngineExtension.php | 17 ++- ...PhabricatorCursorPagedPolicyAwareQuery.php | 140 +++++++++++++----- 3 files changed, 124 insertions(+), 43 deletions(-) diff --git a/src/applications/maniphest/query/ManiphestTaskSearchEngine.php b/src/applications/maniphest/query/ManiphestTaskSearchEngine.php index 956b8c168e..0e3e0db69f 100644 --- a/src/applications/maniphest/query/ManiphestTaskSearchEngine.php +++ b/src/applications/maniphest/query/ManiphestTaskSearchEngine.php @@ -92,8 +92,8 @@ final class ManiphestTaskSearchEngine ->setLabel(pht('Contains Words')) ->setKey('fulltext'), id(new PhabricatorSearchTextField()) - ->setLabel(pht('Matches (Prototype)')) - ->setKey('ferret') + ->setLabel(pht('Query (Prototype)')) + ->setKey('query') ->setIsHidden($hide_ferret), id(new PhabricatorSearchThreeStateField()) ->setLabel(pht('Open Parents')) @@ -150,8 +150,8 @@ final class ManiphestTaskSearchEngine 'statuses', 'priorities', 'subtypes', + 'query', 'fulltext', - 'ferret', 'hasParents', 'hasSubtasks', 'parentIDs', @@ -231,8 +231,8 @@ final class ManiphestTaskSearchEngine $query->withFullTextSearch($map['fulltext']); } - if (strlen($map['ferret'])) { - $raw_query = $map['ferret']; + if (strlen($map['query'])) { + $raw_query = $map['query']; $compiler = id(new PhutilSearchQueryCompiler()) ->setEnableFunctions(true); diff --git a/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php b/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php index 6eb97e2b7c..77599b9d5d 100644 --- a/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php +++ b/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php @@ -50,9 +50,11 @@ final class PhabricatorFerretFulltextEngineExtension continue; } - $normal_corpus = $stemmer->stemCorpus($raw_corpus); $term_corpus = $ngram_engine->newTermsCorpus($raw_corpus); + $normal_corpus = $stemmer->stemCorpus($raw_corpus); + $normal_coprus = $ngram_engine->newTermsCorpus($normal_corpus); + if (!isset($ferret_corpus_map[$key])) { $ferret_corpus_map[$key] = $empty_template; } @@ -67,16 +69,23 @@ final class PhabricatorFerretFulltextEngineExtension } $ferret_fields = array(); + $ngrams_source = array(); foreach ($ferret_corpus_map as $key => $fields) { $raw_corpus = $fields['raw']; $raw_corpus = implode("\n", $raw_corpus); + $ngrams_source[] = $raw_corpus; $normal_corpus = $fields['normal']; - $normal_corpus = implode("\n", $normal_corpus); + $normal_corpus = implode(' ', $normal_corpus); + if (strlen($normal_corpus)) { + $ngrams_source[] = $normal_corpus; + $normal_corpus = ' '.$normal_corpus.' '; + } $term_corpus = $fields['term']; $term_corpus = implode(' ', $term_corpus); if (strlen($term_corpus)) { + $ngrams_source[] = $term_corpus; $term_corpus = ' '.$term_corpus.' '; } @@ -86,9 +95,7 @@ final class PhabricatorFerretFulltextEngineExtension ->setTermCorpus($term_corpus) ->setNormalCorpus($normal_corpus); } - - $ngrams_source = $ferret_corpus_map[$key_all]['raw']; - $ngrams_source = implode("\n", $ngrams_source); + $ngrams_source = implode(' ', $ngrams_source); $ngrams = $ngram_engine->getNgramsFromString($ngrams_source, 'index'); diff --git a/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php b/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php index 437ca0ce4f..43db65f10b 100644 --- a/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php +++ b/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php @@ -1409,8 +1409,11 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery return array(); } + $op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING; + $engine = $this->ferretEngine; $ngram_engine = new PhabricatorNgramEngine(); + $stemmer = new PhutilSearchStemmer(); $ngram_table = $engine->newNgramsObject(); $ngram_table_name = $ngram_table->getTableName(); @@ -1422,22 +1425,49 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery $length = count(phutil_utf8v($value)); - if ($length >= 3) { - $ngrams = $ngram_engine->getNgramsFromString($value, 'query'); - $prefix = false; - } else if ($length == 2) { - $ngrams = $ngram_engine->getNgramsFromString($value, 'prefix'); - $prefix = false; + if ($raw_token->getOperator() == $op_sub) { + $is_substring = true; } else { - $ngrams = array(' '.$value); - $prefix = true; + $is_substring = false; + } + + // If the user specified a substring query for a substring which is + // shorter than the ngram length, we can't use the ngram index, so + // don't do a join. We'll fall back to just doing LIKE on the full + // corpus. + if ($is_substring) { + if ($length < 3) { + continue; + } + } + + if ($raw_token->isQuoted()) { + $is_stemmed = false; + } else { + $is_stemmed = true; + } + + if ($is_substring) { + $ngrams = $ngram_engine->getNgramsFromString($value, 'query'); + } else { + $ngrams = $ngram_engine->getNgramsFromString($value, 'index'); + + // If this is a stemmed term, only look for ngrams present in both the + // unstemmed and stemmed variations. + if ($is_stemmed) { + $stem_value = $stemmer->stemToken($value); + $stem_ngrams = $ngram_engine->getNgramsFromString( + $stem_value, + 'index'); + + $ngrams = array_intersect($ngrams, $stem_ngrams); + } } foreach ($ngrams as $ngram) { $flat[] = array( 'table' => $ngram_table_name, 'ngram' => $ngram, - 'prefix' => $prefix, ); } } @@ -1472,29 +1502,17 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery foreach ($flat as $spec) { $table = $spec['table']; $ngram = $spec['ngram']; - $prefix = $spec['prefix']; $alias = 'ft'.$idx++; - if ($prefix) { - $joins[] = qsprintf( - $conn, - 'JOIN %T %T ON %T.documentID = ftdoc.id AND %T.ngram LIKE %>', - $table, - $alias, - $alias, - $alias, - $ngram); - } else { - $joins[] = qsprintf( - $conn, - 'JOIN %T %T ON %T.documentID = ftdoc.id AND %T.ngram = %s', - $table, - $alias, - $alias, - $alias, - $ngram); - } + $joins[] = qsprintf( + $conn, + 'JOIN %T %T ON %T.documentID = ftdoc.id AND %T.ngram = %s', + $table, + $alias, + $alias, + $alias, + $ngram); } $joins[] = qsprintf( @@ -1510,16 +1528,72 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery return array(); } + $ngram_engine = new PhabricatorNgramEngine(); + $stemmer = new PhutilSearchStemmer(); + $op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING; + $where = array(); foreach ($this->ferretTokens as $fulltext_token) { $raw_token = $fulltext_token->getToken(); $value = $raw_token->getValue(); - $where[] = qsprintf( + if ($raw_token->getOperator() == $op_sub) { + $is_substring = true; + } else { + $is_substring = false; + } + + // If we're doing substring search, we just match against the raw corpus + // and we're done. + if ($is_substring) { + $where[] = qsprintf( + $conn, + '(ftfield.rawCorpus LIKE %~)', + $value); + continue; + } + + // Otherwise, we need to match against the term corpus and the normal + // corpus, so that searching for "raw" does not find "strawberry". + if ($raw_token->isQuoted()) { + $is_quoted = true; + $is_stemmed = false; + } else { + $is_quoted = false; + $is_stemmed = true; + } + + $term_constraints = array(); + + $term_value = ' '.$ngram_engine->newTermsCorpus($value).' '; + $term_constraints[] = qsprintf( $conn, - '(ftfield.rawCorpus LIKE %~ OR ftfield.normalCorpus LIKE %~)', - $value, - $value); + '(ftfield.termCorpus LIKE %~)', + $term_value); + + if ($is_stemmed) { + $stem_value = $stemmer->stemToken($value); + $stem_value = $ngram_engine->newTermsCorpus($stem_value); + $stem_value = ' '.$stem_value.' '; + + $term_constraints[] = qsprintf( + $conn, + '(ftfield.normalCorpus LIKE %~)', + $stem_value); + } + + if ($is_quoted) { + $where[] = qsprintf( + $conn, + '(ftfield.rawCorpus LIKE %~ AND (%Q))', + $value, + implode(' OR ', $term_constraints)); + } else { + $where[] = qsprintf( + $conn, + '(%Q)', + implode(' OR ', $term_constraints)); + } } return $where;