From 767528c0ed556299660ab1574eb6fc810f4468c6 Mon Sep 17 00:00:00 2001 From: epriestley Date: Tue, 14 Jan 2020 11:40:00 -0800 Subject: [PATCH] Move search query parser/compiler classes to Phabricator Summary: Ref T13472. Ref T13395. These classes are only used by Phabricator and not likely to find much use in Arcanist. Test Plan: Grepped libphutil and Arcanist for removed symbols. Maniphest Tasks: T13472, T13395 Differential Revision: https://secure.phabricator.com/D20939 --- externals/porter-stemmer/LICENSE | 20 + externals/porter-stemmer/README.md | 42 ++ externals/porter-stemmer/src/Porter.php | 426 ++++++++++++++++++ src/__phutil_library_map__.php | 12 + .../compiler/PhutilSearchQueryCompiler.php | 374 +++++++++++++++ ...utilSearchQueryCompilerSyntaxException.php | 4 + .../compiler/PhutilSearchQueryToken.php | 37 ++ .../search/compiler/PhutilSearchStemmer.php | 74 +++ .../PhutilSearchQueryCompilerTestCase.php | 220 +++++++++ .../__tests__/PhutilSearchStemmerTestCase.php | 85 ++++ 10 files changed, 1294 insertions(+) create mode 100644 externals/porter-stemmer/LICENSE create mode 100644 externals/porter-stemmer/README.md create mode 100644 externals/porter-stemmer/src/Porter.php create mode 100644 src/applications/search/compiler/PhutilSearchQueryCompiler.php create mode 100644 src/applications/search/compiler/PhutilSearchQueryCompilerSyntaxException.php create mode 100644 src/applications/search/compiler/PhutilSearchQueryToken.php create mode 100644 src/applications/search/compiler/PhutilSearchStemmer.php create mode 100644 src/applications/search/compiler/__tests__/PhutilSearchQueryCompilerTestCase.php create mode 100644 src/applications/search/compiler/__tests__/PhutilSearchStemmerTestCase.php diff --git a/externals/porter-stemmer/LICENSE b/externals/porter-stemmer/LICENSE new file mode 100644 index 0000000000..d4afc6cfa7 --- /dev/null +++ b/externals/porter-stemmer/LICENSE @@ -0,0 +1,20 @@ +The MIT License (MIT) + +Copyright (c) 2005-2016 Richard Heyes (http://www.phpguru.org/) + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/externals/porter-stemmer/README.md b/externals/porter-stemmer/README.md new file mode 100644 index 0000000000..06bc417f9f --- /dev/null +++ b/externals/porter-stemmer/README.md @@ -0,0 +1,42 @@ +# Porter Stemmer by Richard Heyes + +# Installation (with composer) + +```json +{ + "require": { + "camspiers/porter-stemmer": "1.0.0" + } +} +``` + + $ composer install + +# Usage + +```php +$stem = Porter::Stem($word); +``` + +# License + +The MIT License (MIT) + +Copyright (c) 2005-2016 Richard Heyes (http://www.phpguru.org/) + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/externals/porter-stemmer/src/Porter.php b/externals/porter-stemmer/src/Porter.php new file mode 100644 index 0000000000..0720c53ba0 --- /dev/null +++ b/externals/porter-stemmer/src/Porter.php @@ -0,0 +1,426 @@ + + * + * Originally available under the GPL 2 or greater. Relicensed with permission + * of original authors under the MIT License in 2016. + * + * All rights reserved. + * + * @package PorterStemmer + * @author Richard Heyes + * @author Jon Abernathy + * @copyright 2005-2016 Richard Heyes (http://www.phpguru.org/) + * @license http://www.opensource.org/licenses/mit-license.html MIT License + */ + +/** + * PHP 5 Implementation of the Porter Stemmer algorithm. Certain elements + * were borrowed from the (broken) implementation by Jon Abernathy. + * + * See http://tartarus.org/~martin/PorterStemmer/ for a description of the + * algorithm. + * + * Usage: + * + * $stem = PorterStemmer::Stem($word); + * + * How easy is that? + * + * @package PorterStemmer + * @author Richard Heyes + * @author Jon Abernathy + * @copyright 2005-2016 Richard Heyes (http://www.phpguru.org/) + * @license http://www.opensource.org/licenses/mit-license.html MIT License + */ +class Porter +{ + /** + * Regex for matching a consonant + * + * @var string + */ + private static $regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)'; + + /** + * Regex for matching a vowel + * + * @var string + */ + private static $regex_vowel = '(?:[aeiou]|(? 1) { + self::replace($word, 'e', ''); + + } elseif (self::m(substr($word, 0, -1)) == 1) { + + if (!self::cvc(substr($word, 0, -1))) { + self::replace($word, 'e', ''); + } + } + } + + // Part b + if (self::m($word) > 1 AND self::doubleConsonant($word) AND substr($word, -1) == 'l') { + $word = substr($word, 0, -1); + } + + return $word; + } + + /** + * Replaces the first string with the second, at the end of the string + * + * If third arg is given, then the preceding string must match that m + * count at least. + * + * @param string $str String to check + * @param string $check Ending to check for + * @param string $repl Replacement string + * @param int $m Optional minimum number of m() to meet + * + * @return bool Whether the $check string was at the end of the $str + * string. True does not necessarily mean that it was + * replaced. + */ + private static function replace(&$str, $check, $repl, $m = null) + { + $len = 0 - strlen($check); + + if (substr($str, $len) == $check) { + $substr = substr($str, 0, $len); + if (is_null($m) OR self::m($substr) > $m) { + $str = $substr . $repl; + } + + return true; + } + + return false; + } + + /** + * What, you mean it's not obvious from the name? + * + * m() measures the number of consonant sequences in $str. if c is + * a consonant sequence and v a vowel sequence, and <..> indicates arbitrary + * presence, + * + * gives 0 + * vc gives 1 + * vcvc gives 2 + * vcvcvc gives 3 + * + * @param string $str The string to return the m count for + * + * @return int The m count + */ + private static function m($str) + { + $c = self::$regex_consonant; + $v = self::$regex_vowel; + + $str = preg_replace("#^$c+#", '', $str); + $str = preg_replace("#$v+$#", '', $str); + + preg_match_all("#($v+$c+)#", $str, $matches); + + return count($matches[1]); + } + + /** + * Returns true/false as to whether the given string contains two + * of the same consonant next to each other at the end of the string. + * + * @param string $str String to check + * + * @return bool Result + */ + private static function doubleConsonant($str) + { + $c = self::$regex_consonant; + + return preg_match("#$c{2}$#", $str, $matches) AND $matches[0]{0} == $matches[0]{1}; + } + + /** + * Checks for ending CVC sequence where second C is not W, X or Y + * + * @param string $str String to check + * + * @return bool Result + */ + private static function cvc($str) + { + $c = self::$regex_consonant; + $v = self::$regex_vowel; + + return preg_match("#($c$v$c)$#", $str, $matches) + AND strlen($matches[1]) == 3 + AND $matches[1]{2} != 'w' + AND $matches[1]{2} != 'x' + AND $matches[1]{2} != 'y'; + } +} diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php index 174c0e4876..9b15a81c49 100644 --- a/src/__phutil_library_map__.php +++ b/src/__phutil_library_map__.php @@ -5658,6 +5658,12 @@ phutil_register_library_map(array( 'PhutilRemarkupTableBlockRule' => 'infrastructure/markup/blockrule/PhutilRemarkupTableBlockRule.php', 'PhutilRemarkupTestInterpreterRule' => 'infrastructure/markup/blockrule/PhutilRemarkupTestInterpreterRule.php', 'PhutilRemarkupUnderlineRule' => 'infrastructure/markup/markuprule/PhutilRemarkupUnderlineRule.php', + 'PhutilSearchQueryCompiler' => 'applications/search/compiler/PhutilSearchQueryCompiler.php', + 'PhutilSearchQueryCompilerSyntaxException' => 'applications/search/compiler/PhutilSearchQueryCompilerSyntaxException.php', + 'PhutilSearchQueryCompilerTestCase' => 'applications/search/compiler/__tests__/PhutilSearchQueryCompilerTestCase.php', + 'PhutilSearchQueryToken' => 'applications/search/compiler/PhutilSearchQueryToken.php', + 'PhutilSearchStemmer' => 'applications/search/compiler/PhutilSearchStemmer.php', + 'PhutilSearchStemmerTestCase' => 'applications/search/compiler/__tests__/PhutilSearchStemmerTestCase.php', 'PhutilSlackAuthAdapter' => 'applications/auth/adapter/PhutilSlackAuthAdapter.php', 'PhutilTwitchAuthAdapter' => 'applications/auth/adapter/PhutilTwitchAuthAdapter.php', 'PhutilTwitterAuthAdapter' => 'applications/auth/adapter/PhutilTwitterAuthAdapter.php', @@ -12483,6 +12489,12 @@ phutil_register_library_map(array( 'PhutilRemarkupTableBlockRule' => 'PhutilRemarkupBlockRule', 'PhutilRemarkupTestInterpreterRule' => 'PhutilRemarkupBlockInterpreter', 'PhutilRemarkupUnderlineRule' => 'PhutilRemarkupRule', + 'PhutilSearchQueryCompiler' => 'Phobject', + 'PhutilSearchQueryCompilerSyntaxException' => 'Exception', + 'PhutilSearchQueryCompilerTestCase' => 'PhutilTestCase', + 'PhutilSearchQueryToken' => 'Phobject', + 'PhutilSearchStemmer' => 'Phobject', + 'PhutilSearchStemmerTestCase' => 'PhutilTestCase', 'PhutilSlackAuthAdapter' => 'PhutilOAuthAuthAdapter', 'PhutilTwitchAuthAdapter' => 'PhutilOAuthAuthAdapter', 'PhutilTwitterAuthAdapter' => 'PhutilOAuth1AuthAdapter', diff --git a/src/applications/search/compiler/PhutilSearchQueryCompiler.php b/src/applications/search/compiler/PhutilSearchQueryCompiler.php new file mode 100644 index 0000000000..eb5ee860c9 --- /dev/null +++ b/src/applications/search/compiler/PhutilSearchQueryCompiler.php @@ -0,0 +1,374 @@ +<()~*:""&|'; + private $query; + private $stemmer; + private $enableFunctions = false; + + const OPERATOR_NOT = 'not'; + const OPERATOR_AND = 'and'; + const OPERATOR_SUBSTRING = 'sub'; + const OPERATOR_EXACT = 'exact'; + + public function setOperators($operators) { + $this->operators = $operators; + return $this; + } + + public function getOperators() { + return $this->operators; + } + + public function setStemmer(PhutilSearchStemmer $stemmer) { + $this->stemmer = $stemmer; + return $this; + } + + public function getStemmer() { + return $this->stemmer; + } + + public function setEnableFunctions($enable_functions) { + $this->enableFunctions = $enable_functions; + return $this; + } + + public function getEnableFunctions() { + return $this->enableFunctions; + } + + public function compileQuery(array $tokens) { + assert_instances_of($tokens, 'PhutilSearchQueryToken'); + + $result = array(); + foreach ($tokens as $token) { + $result[] = $this->renderToken($token); + } + + return $this->compileRenderedTokens($result); + } + + public function compileLiteralQuery(array $tokens) { + assert_instances_of($tokens, 'PhutilSearchQueryToken'); + + $result = array(); + foreach ($tokens as $token) { + if (!$token->isQuoted()) { + continue; + } + $result[] = $this->renderToken($token); + } + + return $this->compileRenderedTokens($result); + } + + public function compileStemmedQuery(array $tokens) { + assert_instances_of($tokens, 'PhutilSearchQueryToken'); + + $result = array(); + foreach ($tokens as $token) { + if ($token->isQuoted()) { + continue; + } + $result[] = $this->renderToken($token, $this->getStemmer()); + } + + return $this->compileRenderedTokens($result); + } + + private function compileRenderedTokens(array $list) { + if (!$list) { + return null; + } + + $list = array_unique($list); + return implode(' ', $list); + } + + public function newTokens($query) { + $results = $this->tokenizeQuery($query); + + $tokens = array(); + foreach ($results as $result) { + $tokens[] = PhutilSearchQueryToken::newFromDictionary($result); + } + + return $tokens; + } + + private function tokenizeQuery($query) { + $maximum_bytes = 1024; + + $query_bytes = strlen($query); + if ($query_bytes > $maximum_bytes) { + throw new PhutilSearchQueryCompilerSyntaxException( + pht( + 'Query is too long (%s bytes, maximum is %s bytes).', + new PhutilNumber($query_bytes), + new PhutilNumber($maximum_bytes))); + } + + $query = phutil_utf8v($query); + $length = count($query); + + $enable_functions = $this->getEnableFunctions(); + + $mode = 'scan'; + $current_operator = array(); + $current_token = array(); + $current_function = null; + $is_quoted = false; + $tokens = array(); + + if ($enable_functions) { + $operator_characters = '[~=+-]'; + } else { + $operator_characters = '[+-]'; + } + + for ($ii = 0; $ii < $length; $ii++) { + $character = $query[$ii]; + + if ($mode == 'scan') { + if (preg_match('/^\s\z/u', $character)) { + continue; + } + + $mode = 'function'; + } + + if ($mode == 'function') { + $mode = 'operator'; + + if ($enable_functions) { + $found = false; + for ($jj = $ii; $jj < $length; $jj++) { + if (preg_match('/^[a-zA-Z]\z/u', $query[$jj])) { + continue; + } + if ($query[$jj] == ':') { + $found = $jj; + } + break; + } + + if ($found !== false) { + $function = array_slice($query, $ii, ($jj - $ii)); + $current_function = implode('', $function); + + if (!strlen($current_function)) { + $current_function = null; + } + + $ii = $jj; + continue; + } + } + } + + if ($mode == 'operator') { + if (preg_match('/^\s\z/u', $character)) { + continue; + } + + if (preg_match('/^'.$operator_characters.'\z/', $character)) { + $current_operator[] = $character; + continue; + } + + $mode = 'quote'; + } + + if ($mode == 'quote') { + if (preg_match('/^"\z/', $character)) { + $is_quoted = true; + $mode = 'token'; + continue; + } + + $mode = 'token'; + } + + if ($mode == 'token') { + $capture = false; + $was_quoted = $is_quoted; + if ($is_quoted) { + if (preg_match('/^"\z/', $character)) { + $capture = true; + $mode = 'scan'; + $is_quoted = false; + } + } else { + if (preg_match('/^\s\z/u', $character)) { + $capture = true; + $mode = 'scan'; + } + + if (preg_match('/^"\z/', $character)) { + $capture = true; + $mode = 'token'; + $is_quoted = true; + } + } + + if ($capture) { + $token = array( + 'operator' => $current_operator, + 'quoted' => $was_quoted, + 'value' => $current_token, + ); + + if ($enable_functions) { + $token['function'] = $current_function; + } + + $tokens[] = $token; + + $current_operator = array(); + $current_token = array(); + $current_function = null; + continue; + } else { + $current_token[] = $character; + } + } + } + + if ($is_quoted) { + throw new PhutilSearchQueryCompilerSyntaxException( + pht( + 'Query contains unmatched double quotes.')); + } + + if ($mode == 'operator') { + throw new PhutilSearchQueryCompilerSyntaxException( + pht( + 'Query contains operator ("%s") with no search term.', + implode('', $current_operator))); + } + + $token = array( + 'operator' => $current_operator, + 'quoted' => false, + 'value' => $current_token, + ); + + if ($enable_functions) { + $token['function'] = $current_function; + } + + $tokens[] = $token; + + $results = array(); + foreach ($tokens as $token) { + $value = implode('', $token['value']); + $operator_string = implode('', $token['operator']); + + if (!strlen($value)) { + continue; + } + + $is_quoted = $token['quoted']; + + switch ($operator_string) { + case '-': + $operator = self::OPERATOR_NOT; + break; + case '~': + $operator = self::OPERATOR_SUBSTRING; + break; + case '=': + $operator = self::OPERATOR_EXACT; + break; + case '+': + $operator = self::OPERATOR_AND; + break; + case '': + // See T12995. If this query term contains Chinese, Japanese or + // Korean characters, treat the term as a substring term by default. + // These languages do not separate words with spaces, so the term + // search mode is normally useless. + if ($enable_functions && !$is_quoted && phutil_utf8_is_cjk($value)) { + $operator = self::OPERATOR_SUBSTRING; + } else { + $operator = self::OPERATOR_AND; + } + break; + default: + throw new PhutilSearchQueryCompilerSyntaxException( + pht( + 'Query has an invalid sequence of operators ("%s").', + $operator_string)); + } + + $result = array( + 'operator' => $operator, + 'quoted' => $is_quoted, + 'value' => $value, + ); + + if ($enable_functions) { + $result['function'] = $token['function']; + } + + $results[] = $result; + } + + return $results; + } + + private function renderToken( + PhutilSearchQueryToken $token, + PhutilSearchStemmer $stemmer = null) { + $value = $token->getValue(); + + if ($stemmer) { + $value = $stemmer->stemToken($value); + } + + $value = $this->quoteToken($value); + $operator = $token->getOperator(); + $prefix = $this->getOperatorPrefix($operator); + + $value = $prefix.$value; + + return $value; + } + + private function getOperatorPrefix($operator) { + $operators = $this->operators; + + switch ($operator) { + case self::OPERATOR_AND: + $prefix = $operators[0]; + break; + case self::OPERATOR_NOT: + $prefix = $operators[2]; + break; + default: + throw new PhutilSearchQueryCompilerSyntaxException( + pht( + 'Unsupported operator prefix "%s".', + $operator)); + } + + if ($prefix == ' ') { + $prefix = null; + } + + return $prefix; + } + + private function quoteToken($value) { + $operators = $this->operators; + + $open_quote = $this->operators[10]; + $close_quote = $this->operators[11]; + + return $open_quote.$value.$close_quote; + } + +} diff --git a/src/applications/search/compiler/PhutilSearchQueryCompilerSyntaxException.php b/src/applications/search/compiler/PhutilSearchQueryCompilerSyntaxException.php new file mode 100644 index 0000000000..b09d36d04f --- /dev/null +++ b/src/applications/search/compiler/PhutilSearchQueryCompilerSyntaxException.php @@ -0,0 +1,4 @@ +isQuoted = $dictionary['quoted']; + $token->operator = $dictionary['operator']; + $token->value = $dictionary['value']; + $token->function = idx($dictionary, 'function'); + + return $token; + } + + public function isQuoted() { + return $this->isQuoted; + } + + public function getValue() { + return $this->value; + } + + public function getOperator() { + return $this->operator; + } + + public function getFunction() { + return $this->function; + } + +} diff --git a/src/applications/search/compiler/PhutilSearchStemmer.php b/src/applications/search/compiler/PhutilSearchStemmer.php new file mode 100644 index 0000000000..3255336ff0 --- /dev/null +++ b/src/applications/search/compiler/PhutilSearchStemmer.php @@ -0,0 +1,74 @@ +normalizeToken($token); + return $this->applyStemmer($token); + } + + public function stemCorpus($corpus) { + $corpus = $this->normalizeCorpus($corpus); + $tokens = preg_split('/[^a-zA-Z0-9\x7F-\xFF._]+/', $corpus); + + $words = array(); + foreach ($tokens as $key => $token) { + $token = trim($token, '._'); + + if (strlen($token) < 3) { + continue; + } + + $words[$token] = $token; + } + + $stems = array(); + foreach ($words as $word) { + $stems[] = $this->applyStemmer($word); + } + + return implode(' ', $stems); + } + + private function normalizeToken($token) { + return phutil_utf8_strtolower($token); + } + + private function normalizeCorpus($corpus) { + return phutil_utf8_strtolower($corpus); + } + + /** + * @phutil-external-symbol class Porter + */ + private function applyStemmer($normalized_token) { + // If the token has internal punctuation, handle it literally. This + // deals with things like domain names, Conduit API methods, and other + // sorts of informal tokens. + if (preg_match('/[._]/', $normalized_token)) { + return $normalized_token; + } + + static $loaded; + + if ($loaded === null) { + $root = dirname(phutil_get_library_root('phabricator')); + require_once $root.'/externals/porter-stemmer/src/Porter.php'; + $loaded = true; + } + + + $stem = Porter::stem($normalized_token); + + // If the stem is too short, it won't be a candidate for indexing. These + // tokens are also likely to be acronyms (like "DNS") rather than real + // English words. + if (strlen($stem) < 3) { + return $normalized_token; + } + + return $stem; + } + +} diff --git a/src/applications/search/compiler/__tests__/PhutilSearchQueryCompilerTestCase.php b/src/applications/search/compiler/__tests__/PhutilSearchQueryCompilerTestCase.php new file mode 100644 index 0000000000..647386f52c --- /dev/null +++ b/src/applications/search/compiler/__tests__/PhutilSearchQueryCompilerTestCase.php @@ -0,0 +1,220 @@ + null, + 'cat dog' => '+"cat" +"dog"', + 'cat -dog' => '+"cat" -"dog"', + 'cat-dog' => '+"cat-dog"', + + // If there are spaces after an operator, the operator applies to the + // next search term. + 'cat - dog' => '+"cat" -"dog"', + + // Double quotes serve as delimiters even if there is no whitespace + // between terms. + '"cat"dog' => '+"cat" +"dog"', + + // This query is too long. + str_repeat('x', 2048) => false, + + // Multiple operators are not permitted. + '++cat' => false, + '+-cat' => false, + '--cat' => false, + + // Stray operators are not permitted. + '+' => false, + 'cat +' => false, + + // Double quotes must be paired. + '"' => false, + 'cat "' => false, + '"cat' => false, + 'A"' => false, + 'A"B"' => '+"A" +"B"', + ); + + $this->assertCompileQueries($tests); + + // Test that we compile queries correctly if the operators have been + // swapped to use "AND" by default. + $operator_tests = array( + 'cat dog' => '"cat" "dog"', + 'cat -dog' => '"cat" -"dog"', + ); + $this->assertCompileQueries($operator_tests, ' |-><()~*:""&\''); + + + // Test that we compile queries correctly if the quote operators have + // been swapped to differ. + $quote_tests = array( + 'cat dog' => '+[cat] +[dog]', + 'cat -dog' => '+[cat] -[dog]', + ); + $this->assertCompileQueries($quote_tests, '+ -><()~*:[]&|'); + + } + + public function testCompileQueriesWithStemming() { + $stemming_tests = array( + 'cat dog' => array( + null, + '+"cat" +"dog"', + ), + 'cats dogs' => array( + null, + '+"cat" +"dog"', + ), + 'cats "dogs"' => array( + '+"dogs"', + '+"cat"', + ), + '"blessed blade" of the windseeker' => array( + '+"blessed blade"', + '+"of" +"the" +"windseek"', + ), + 'mailing users for mentions on tasks' => array( + null, + '+"mail" +"user" +"for" +"mention" +"on" +"task"', + ), + ); + + $stemmer = new PhutilSearchStemmer(); + $this->assertCompileQueries($stemming_tests, null, $stemmer); + } + + public function testCompileQueriesWithFunctions() { + $op_and = PhutilSearchQueryCompiler::OPERATOR_AND; + $op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING; + $op_exact = PhutilSearchQueryCompiler::OPERATOR_EXACT; + + $mao = "\xE7\x8C\xAB"; + + $function_tests = array( + 'cat' => array( + array(null, $op_and, 'cat'), + ), + ':cat' => array( + array(null, $op_and, 'cat'), + ), + 'title:cat' => array( + array('title', $op_and, 'cat'), + ), + 'title:cat:dog' => array( + array('title', $op_and, 'cat:dog'), + ), + 'title:~cat' => array( + array('title', $op_sub, 'cat'), + ), + 'cat title:="Meow Meow"' => array( + array(null, $op_and, 'cat'), + array('title', $op_exact, 'Meow Meow'), + ), + 'title:cat title:dog' => array( + array('title', $op_and, 'cat'), + array('title', $op_and, 'dog'), + ), + '~"core and seven years ag"' => array( + array(null, $op_sub, 'core and seven years ag'), + ), + $mao => array( + array(null, $op_sub, $mao), + ), + '+'.$mao => array( + array(null, $op_and, $mao), + ), + '~'.$mao => array( + array(null, $op_sub, $mao), + ), + '"'.$mao.'"' => array( + array(null, $op_and, $mao), + ), + ); + + $this->assertCompileFunctionQueries($function_tests); + } + + private function assertCompileQueries( + array $tests, + $operators = null, + PhutilSearchStemmer $stemmer = null) { + foreach ($tests as $input => $expect) { + $caught = null; + + $query = null; + $literal_query = null; + $stemmed_query = null; + + try { + $compiler = new PhutilSearchQueryCompiler(); + + if ($operators !== null) { + $compiler->setOperators($operators); + } + + if ($stemmer !== null) { + $compiler->setStemmer($stemmer); + } + + $tokens = $compiler->newTokens($input); + + if ($stemmer) { + $literal_query = $compiler->compileLiteralQuery($tokens); + $stemmed_query = $compiler->compileStemmedQuery($tokens); + } else { + $query = $compiler->compileQuery($tokens); + } + } catch (PhutilSearchQueryCompilerSyntaxException $ex) { + $caught = $ex; + } + + if ($caught !== null) { + $query = false; + $literal_query = false; + $stemmed_query = false; + } + + if (!$stemmer) { + $this->assertEqual( + $expect, + $query, + pht('Compilation of query: %s', $input)); + } else { + $this->assertEqual( + $expect, + ($literal_query === false) + ? false + : array($literal_query, $stemmed_query), + pht('Stemmed compilation of query: %s', $input)); + } + } + } + + private function assertCompileFunctionQueries(array $tests) { + foreach ($tests as $input => $expect) { + $compiler = id(new PhutilSearchQueryCompiler()) + ->setEnableFunctions(true); + + $tokens = $compiler->newTokens($input); + + $result = array(); + foreach ($tokens as $token) { + $result[] = array( + $token->getFunction(), + $token->getOperator(), + $token->getValue(), + ); + } + + $this->assertEqual( + $expect, + $result, + pht('Function compilation of query: %s', $input)); + } + } + +} diff --git a/src/applications/search/compiler/__tests__/PhutilSearchStemmerTestCase.php b/src/applications/search/compiler/__tests__/PhutilSearchStemmerTestCase.php new file mode 100644 index 0000000000..34154d9ca3 --- /dev/null +++ b/src/applications/search/compiler/__tests__/PhutilSearchStemmerTestCase.php @@ -0,0 +1,85 @@ + 'token', + 'panels' => 'panel', + + 'renames' => 'renam', + 'rename' => 'renam', + + 'components' => 'compon', + 'component' => 'compon', + + 'implementation' => 'implement', + 'implements' => 'implement', + 'implementing' => 'implement', + 'implementer' => 'implement', + + 'deleting' => 'delet', + 'deletion' => 'delet', + 'delete' => 'delet', + + 'erratically' => 'errat', + 'erratic' => 'errat', + + // Stems should be normalized. + 'DOG' => 'dog', + + // If stemming would bring a token under 3 characters, it should not + // be stemmed. + 'dns' => 'dns', + 'nis' => 'nis', + + // Complex tokens with internal punctuation should be left untouched; + // these are usually things like domain names, API calls, informal tags, + // etc. + 'apples' => 'appl', + 'bananas' => 'banana', + 'apples_bananas' => 'apples_bananas', + 'apples_bananas.apples_bananas' => 'apples_bananas.apples_bananas', + ); + + $stemmer = new PhutilSearchStemmer(); + foreach ($tests as $input => $expect) { + $stem = $stemmer->stemToken($input); + $this->assertEqual( + $expect, + $stem, + pht('Token stem of "%s".', $input)); + } + } + + public function testStemDocuments() { + $tests = array( + 'The wild boar meandered erratically.' => + 'the wild boar meander errat', + 'Fool me onc, shame on you. Fool me twice, shame on me.' => + 'fool onc shame you twice', + 'Fireball is a seventh-level spell which deals 2d16 points of damage '. + 'in a 1-meter radius around a target.' => + 'firebal seventh level spell which deal 2d16 point damag meter '. + 'radiu around target', + 'apples-bananas' => 'appl banana', + 'apples_bananas' => 'apples_bananas', + 'apples.bananas' => 'apples.bananas', + 'oddly-proportioned' => 'oddli proport', + ); + + $stemmer = new PhutilSearchStemmer(); + foreach ($tests as $input => $expect) { + $stem = $stemmer->stemCorpus($input); + $this->assertEqual( + $expect, + $stem, + pht('Corpus stem of: %s', $input)); + } + } + + +}