mirror of
https://we.phorge.it/source/phorge.git
synced 2024-11-26 00:32:42 +01:00
Stem fulltext tokens before filtering them for stopwords
Summary: Fixes T12596. A query for a token (like "having") which stems to a stopword (like "have") currently survives filtering. Stem the token first so it gets caught.

Also, for InnoDB, a custom stopword table can be configured. If it is, read that table instead of the default stopword list (I configured it locally, but the default list is reasonable, so we never formally recommended that installs configure it).

Test Plan:
Queried for words that stem to stopwords, saw them filtered: {F4915843}
Queried for the original problem query and saw "having" caught with "have" in the stopword list: {F4915844}
Fiddled with local InnoDB stopword table config and saw the stopword list get loaded correctly.

Reviewers: chad
Reviewed By: chad
Maniphest Tasks: T12596
Differential Revision: https://secure.phabricator.com/D17728
This commit is contained in:
parent
df7f56d8e3
commit
f880000eb0
1 changed file with 25 additions and 2 deletions
|
@@ -228,6 +228,13 @@ final class PhabricatorMySQLFulltextStorageEngine
|
|||
$fulltext_tokens[$key] = $fulltext_token;
|
||||
|
||||
$value = $token->getValue();
|
||||
|
||||
// If the value is unquoted, we'll stem it in the query, so stem it
|
||||
// here before performing filtering tests. See T12596.
|
||||
if (!$token->isQuoted()) {
|
||||
$value = $stemmer->stemToken($value);
|
||||
}
|
||||
|
||||
if (phutil_utf8_strlen($value) < $min_length) {
|
||||
$fulltext_token->setIsShort(true);
|
||||
continue;
|
||||
|
@@ -479,16 +486,32 @@ final class PhabricatorMySQLFulltextStorageEngine
|
|||
try {
|
||||
$result = queryfx_one(
|
||||
$conn,
|
||||
'SELECT @@innodb_ft_min_token_size innodb_max');
|
||||
'SELECT @@innodb_ft_min_token_size innodb_max,
|
||||
@@innodb_ft_server_stopword_table innodb_stopword_config');
|
||||
} catch (AphrontQueryException $ex) {
|
||||
$result = null;
|
||||
}
|
||||
|
||||
if ($result) {
|
||||
$min_len = $result['innodb_max'];
|
||||
|
||||
$stopword_config = $result['innodb_stopword_config'];
|
||||
if (preg_match('(/)', $stopword_config)) {
|
||||
// If the setting is nonempty and contains a slash, query the
|
||||
// table the user has configured.
|
||||
$parts = explode('/', $stopword_config);
|
||||
list($stopword_database, $stopword_table) = $parts;
|
||||
} else {
|
||||
// Otherwise, query the InnoDB default stopword table.
|
||||
$stopword_database = 'INFORMATION_SCHEMA';
|
||||
$stopword_table = 'INNODB_FT_DEFAULT_STOPWORD';
|
||||
}
|
||||
|
||||
$stopwords = queryfx_all(
|
||||
$conn,
|
||||
'SELECT * FROM INFORMATION_SCHEMA.INNODB_FT_DEFAULT_STOPWORD');
|
||||
'SELECT * FROM %T.%T',
|
||||
$stopword_database,
|
||||
$stopword_table);
|
||||
$stopwords = ipull($stopwords, 'value');
|
||||
$stopwords = array_fuse($stopwords);
|
||||
|
||||
|
|
Loading…
Reference in a new issue