From f880000eb0cd25b4b8c708aec24fed89d51d1542 Mon Sep 17 00:00:00 2001 From: epriestley Date: Wed, 19 Apr 2017 08:56:35 -0700 Subject: [PATCH] Stem fulltext tokens before filtering them for stopwords Summary: Fixes T12596. A query for a token (like "having") which stems to a stopword (like "have") currently survives filtering. Stem it first so it gets caught. Also, for InnoDB, a custom stopword table can be configured. If it is, read that instead of the default stopword list (I configured it locally, but the default list is reasonable so we never formally recommended installs configure it). Test Plan: Queried for words that stem to stopwords, saw them filtered: {F4915843} Queried for the original problem query and saw "having" caught with "have" in the stopword list: {F4915844} Fiddled with local InnoDB stopword table config and saw the stopword list get loaded correctly. Reviewers: chad Reviewed By: chad Maniphest Tasks: T12596 Differential Revision: https://secure.phabricator.com/D17728 --- .../PhabricatorMySQLFulltextStorageEngine.php | 27 +++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/src/applications/search/fulltextstorage/PhabricatorMySQLFulltextStorageEngine.php b/src/applications/search/fulltextstorage/PhabricatorMySQLFulltextStorageEngine.php index 5bb9943e59..b22076cf92 100644 --- a/src/applications/search/fulltextstorage/PhabricatorMySQLFulltextStorageEngine.php +++ b/src/applications/search/fulltextstorage/PhabricatorMySQLFulltextStorageEngine.php @@ -228,6 +228,13 @@ final class PhabricatorMySQLFulltextStorageEngine $fulltext_tokens[$key] = $fulltext_token; $value = $token->getValue(); + + // If the value is unquoted, we'll stem it in the query, so stem it + // here before performing filtering tests. See T12596. + if (!$token->isQuoted()) { + $value = $stemmer->stemToken($value); + } + if (phutil_utf8_strlen($value) < $min_length) { $fulltext_token->setIsShort(true); continue; @@ -479,16 +486,32 @@ final class PhabricatorMySQLFulltextStorageEngine try { $result = queryfx_one( $conn, - 'SELECT @@innodb_ft_min_token_size innodb_max'); + 'SELECT @@innodb_ft_min_token_size innodb_max, + @@innodb_ft_server_stopword_table innodb_stopword_config'); } catch (AphrontQueryException $ex) { $result = null; } if ($result) { $min_len = $result['innodb_max']; + + $stopword_config = $result['innodb_stopword_config']; + if (preg_match('(/)', $stopword_config)) { + // If the setting is nonempty and contains a slash, query the + // table the user has configured. + $parts = explode('/', $stopword_config); + list($stopword_database, $stopword_table) = $parts; + } else { + // Otherwise, query the InnoDB default stopword table. + $stopword_database = 'INFORMATION_SCHEMA'; + $stopword_table = 'INNODB_FT_DEFAULT_STOPWORD'; + } + $stopwords = queryfx_all( $conn, - 'SELECT * FROM INFORMATION_SCHEMA.INNODB_FT_DEFAULT_STOPWORD'); + 'SELECT * FROM %T.%T', + $stopword_database, + $stopword_table); $stopwords = ipull($stopwords, 'value'); $stopwords = array_fuse($stopwords);