1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2024-11-22 06:42:42 +01:00

Stem fulltext tokens before filtering them for stopwords

Summary:
Fixes T12596. A query for a token (like "having") which stems to a stopword (like "have") currently survives filtering. Stem it first so it gets caught.

Also, for InnoDB, a custom stopword table can be configured. If one is configured, read it instead of the default stopword list (I configured one locally, but the default list is reasonable, so we never formally recommended that installs configure it).

Test Plan:
Queried for words that stem to stopwords, saw them filtered:

{F4915843}

Queried for the original problem query and saw "having" caught with "have" in the stopword list:

{F4915844}

Fiddled with local InnoDB stopword table config and saw the stopword list get loaded correctly.

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T12596

Differential Revision: https://secure.phabricator.com/D17728
This commit is contained in:
epriestley 2017-04-19 08:56:35 -07:00
parent df7f56d8e3
commit f880000eb0

View file

@ -228,6 +228,13 @@ final class PhabricatorMySQLFulltextStorageEngine
$fulltext_tokens[$key] = $fulltext_token;
$value = $token->getValue();
// If the value is unquoted, we'll stem it in the query, so stem it
// here before performing filtering tests. See T12596.
if (!$token->isQuoted()) {
$value = $stemmer->stemToken($value);
}
if (phutil_utf8_strlen($value) < $min_length) {
$fulltext_token->setIsShort(true);
continue;
@ -479,16 +486,32 @@ final class PhabricatorMySQLFulltextStorageEngine
try {
$result = queryfx_one(
$conn,
'SELECT @@innodb_ft_min_token_size innodb_max');
'SELECT @@innodb_ft_min_token_size innodb_max,
@@innodb_ft_server_stopword_table innodb_stopword_config');
} catch (AphrontQueryException $ex) {
$result = null;
}
if ($result) {
$min_len = $result['innodb_max'];
$stopword_config = $result['innodb_stopword_config'];
if (preg_match('(/)', $stopword_config)) {
// If the setting is nonempty and contains a slash, query the
// table the user has configured.
$parts = explode('/', $stopword_config);
list($stopword_database, $stopword_table) = $parts;
} else {
// Otherwise, query the InnoDB default stopword table.
$stopword_database = 'INFORMATION_SCHEMA';
$stopword_table = 'INNODB_FT_DEFAULT_STOPWORD';
}
$stopwords = queryfx_all(
$conn,
'SELECT * FROM INFORMATION_SCHEMA.INNODB_FT_DEFAULT_STOPWORD');
'SELECT * FROM %T.%T',
$stopword_database,
$stopword_table);
$stopwords = ipull($stopwords, 'value');
$stopwords = array_fuse($stopwords);