1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2024-11-22 06:42:42 +01:00

Move search query parser/compiler classes to Phabricator

Summary: Ref T13472. Ref T13395. These classes are only used by Phabricator and not likely to find much use in Arcanist.

Test Plan: Grepped libphutil and Arcanist for removed symbols.

Maniphest Tasks: T13472, T13395

Differential Revision: https://secure.phabricator.com/D20939
This commit is contained in:
epriestley 2020-01-14 11:40:00 -08:00
parent 54bcbdaba9
commit 767528c0ed
10 changed files with 1294 additions and 0 deletions

20
externals/porter-stemmer/LICENSE vendored Normal file
View file

@ -0,0 +1,20 @@
The MIT License (MIT)
Copyright (c) 2005-2016 Richard Heyes (http://www.phpguru.org/)
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

42
externals/porter-stemmer/README.md vendored Normal file
View file

@ -0,0 +1,42 @@
# Porter Stemmer by Richard Heyes
# Installation (with composer)
```json
{
"require": {
"camspiers/porter-stemmer": "1.0.0"
}
}
```
```sh
$ composer install
```
# Usage
```php
$stem = Porter::Stem($word);
```
# License
The MIT License (MIT)
Copyright (c) 2005-2016 Richard Heyes (http://www.phpguru.org/)
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

426
externals/porter-stemmer/src/Porter.php vendored Normal file
View file

@ -0,0 +1,426 @@
<?php
# vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
/**
* Copyright (c) 2005-2016 Richard Heyes (http://www.phpguru.org/)
*
* Portions Copyright 2003-2007 Jon Abernathy <jon@chuggnutt.com>
*
* Originally available under the GPL 2 or greater. Relicensed with permission
* of original authors under the MIT License in 2016.
*
* All rights reserved.
*
* @package PorterStemmer
* @author Richard Heyes
* @author Jon Abernathy <jon@chuggnutt.com>
* @copyright 2005-2016 Richard Heyes (http://www.phpguru.org/)
* @license http://www.opensource.org/licenses/mit-license.html MIT License
*/
/**
 * PHP 5 Implementation of the Porter Stemmer algorithm. Certain elements
 * were borrowed from the (broken) implementation by Jon Abernathy.
 *
 * See http://tartarus.org/~martin/PorterStemmer/ for a description of the
 * algorithm.
 *
 * Usage:
 *
 *     $stem = Porter::Stem($word);
 *
 * The word is expected to be lowercase ASCII; the consonant and vowel
 * patterns below only list lowercase letters.
 *
 * @package PorterStemmer
 * @author Richard Heyes
 * @author Jon Abernathy <jon@chuggnutt.com>
 * @copyright 2005-2016 Richard Heyes (http://www.phpguru.org/)
 * @license http://www.opensource.org/licenses/mit-license.html MIT License
 */
class Porter
{
    /**
     * Regex fragment matching a single consonant. A "y" counts as a consonant
     * at the start of the word or directly after a vowel.
     *
     * @var string
     */
    private static $regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';

    /**
     * Regex fragment matching a single vowel. A "y" counts as a vowel when it
     * is not directly preceded by a vowel.
     *
     * @var string
     */
    private static $regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';

    /**
     * Stems a word by running it through steps 1 to 5 in order.
     *
     * Words of two bytes or fewer are returned unchanged.
     *
     * @param string $word Word to stem
     *
     * @return string Stemmed word
     */
    public static function Stem($word)
    {
        if (strlen($word) <= 2) {
            return $word;
        }

        $word = self::step1ab($word);
        $word = self::step1c($word);
        $word = self::step2($word);
        $word = self::step3($word);
        $word = self::step4($word);
        $word = self::step5($word);

        return $word;
    }

    /**
     * Step 1a (plural endings) and step 1b ("eed", "ed" and "ing" endings).
     *
     * @param string $word Word to stem
     *
     * @return string Word after step 1a/1b
     */
    private static function step1ab($word)
    {
        // Part a. The chain short-circuits: the first ending that is present
        // is handled and the remaining (shorter) endings are not tried.
        if (substr($word, -1) === 's') {
            self::replace($word, 'sses', 'ss')
                || self::replace($word, 'ies', 'i')
                || self::replace($word, 'ss', 'ss')
                || self::replace($word, 's', '');
        }

        // Part b. The cheap second-to-last-letter test avoids calling
        // replace() for "eed" on words that cannot end in it. If the word
        // does end in "eed", the "ed"/"ing" rules are skipped entirely.
        if (substr($word, -2, 1) !== 'e' || !self::replace($word, 'eed', 'ee', 0)) {
            $v = self::$regex_vowel;
            $vowel_pattern = '#' . $v . '+#';

            // "ing" and "ed" are only removed when the remaining stem
            // contains a vowel.
            $removed_ing = preg_match($vowel_pattern, substr($word, 0, -3))
                && self::replace($word, 'ing', '');
            $removed_suffix = $removed_ing
                || (preg_match($vowel_pattern, substr($word, 0, -2))
                    && self::replace($word, 'ed', ''));

            if ($removed_suffix) {
                // Tidy up the stem left behind by the removal.
                if (!self::replace($word, 'at', 'ate')
                    && !self::replace($word, 'bl', 'ble')
                    && !self::replace($word, 'iz', 'ize')) {
                    $tail = substr($word, -2);

                    if (self::doubleConsonant($word)
                        && $tail !== 'll'
                        && $tail !== 'ss'
                        && $tail !== 'zz') {
                        // Undouble the final consonant ("hopp" => "hop").
                        $word = substr($word, 0, -1);
                    } elseif (self::m($word) == 1 && self::cvc($word)) {
                        // Restore a trailing "e" ("fil" => "file").
                        $word .= 'e';
                    }
                }
            }
        }

        return $word;
    }

    /**
     * Step 1c: turns a terminal "y" into "i" when the stem contains a vowel.
     *
     * @param string $word Word to stem
     *
     * @return string Word after step 1c
     */
    private static function step1c($word)
    {
        $v = self::$regex_vowel;

        if (substr($word, -1) === 'y' && preg_match('#' . $v . '+#', substr($word, 0, -1))) {
            self::replace($word, 'y', 'i');
        }

        return $word;
    }

    /**
     * Step 2: maps double suffixes to single ones, dispatching on the
     * second-to-last letter. Every rule requires the stem measure to be
     * greater than zero.
     *
     * @param string $word Word to stem
     *
     * @return string Word after step 2
     */
    private static function step2($word)
    {
        switch (substr($word, -2, 1)) {
            case 'a':
                self::replace($word, 'ational', 'ate', 0)
                    || self::replace($word, 'tional', 'tion', 0);
                break;

            case 'c':
                self::replace($word, 'enci', 'ence', 0)
                    || self::replace($word, 'anci', 'ance', 0);
                break;

            case 'e':
                self::replace($word, 'izer', 'ize', 0);
                break;

            case 'g':
                self::replace($word, 'logi', 'log', 0);
                break;

            case 'l':
                self::replace($word, 'entli', 'ent', 0)
                    || self::replace($word, 'ousli', 'ous', 0)
                    || self::replace($word, 'alli', 'al', 0)
                    || self::replace($word, 'bli', 'ble', 0)
                    || self::replace($word, 'eli', 'e', 0);
                break;

            case 'o':
                self::replace($word, 'ization', 'ize', 0)
                    || self::replace($word, 'ation', 'ate', 0)
                    || self::replace($word, 'ator', 'ate', 0);
                break;

            case 's':
                self::replace($word, 'iveness', 'ive', 0)
                    || self::replace($word, 'fulness', 'ful', 0)
                    || self::replace($word, 'ousness', 'ous', 0)
                    || self::replace($word, 'alism', 'al', 0);
                break;

            case 't':
                self::replace($word, 'biliti', 'ble', 0)
                    || self::replace($word, 'aliti', 'al', 0)
                    || self::replace($word, 'iviti', 'ive', 0);
                break;
        }

        return $word;
    }

    /**
     * Step 3: handles suffixes such as "-ical", "-ness" and "-ful",
     * dispatching on the second-to-last letter. Every rule requires the stem
     * measure to be greater than zero.
     *
     * @param string $word Word to stem
     *
     * @return string Word after step 3
     */
    private static function step3($word)
    {
        switch (substr($word, -2, 1)) {
            case 'a':
                self::replace($word, 'ical', 'ic', 0);
                break;

            case 's':
                self::replace($word, 'ness', '', 0);
                break;

            case 't':
                self::replace($word, 'icate', 'ic', 0)
                    || self::replace($word, 'iciti', 'ic', 0);
                break;

            case 'u':
                self::replace($word, 'ful', '', 0);
                break;

            case 'v':
                self::replace($word, 'ative', '', 0);
                break;

            case 'z':
                self::replace($word, 'alize', 'al', 0);
                break;
        }

        return $word;
    }

    /**
     * Step 4: strips remaining suffixes, dispatching on the second-to-last
     * letter. Every rule requires the stem measure to be greater than one.
     *
     * @param string $word Word to stem
     *
     * @return string Word after step 4
     */
    private static function step4($word)
    {
        switch (substr($word, -2, 1)) {
            case 'a':
                self::replace($word, 'al', '', 1);
                break;

            case 'c':
                self::replace($word, 'ance', '', 1)
                    || self::replace($word, 'ence', '', 1);
                break;

            case 'e':
                self::replace($word, 'er', '', 1);
                break;

            case 'i':
                self::replace($word, 'ic', '', 1);
                break;

            case 'l':
                self::replace($word, 'able', '', 1)
                    || self::replace($word, 'ible', '', 1);
                break;

            case 'n':
                self::replace($word, 'ant', '', 1)
                    || self::replace($word, 'ement', '', 1)
                    || self::replace($word, 'ment', '', 1)
                    || self::replace($word, 'ent', '', 1);
                break;

            case 'o':
                // "ion" is only stripped when it follows "s" or "t".
                if (substr($word, -4) === 'tion' || substr($word, -4) === 'sion') {
                    self::replace($word, 'ion', '', 1);
                } else {
                    self::replace($word, 'ou', '', 1);
                }
                break;

            case 's':
                self::replace($word, 'ism', '', 1);
                break;

            case 't':
                self::replace($word, 'ate', '', 1)
                    || self::replace($word, 'iti', '', 1);
                break;

            case 'u':
                self::replace($word, 'ous', '', 1);
                break;

            case 'v':
                self::replace($word, 'ive', '', 1);
                break;

            case 'z':
                self::replace($word, 'ize', '', 1);
                break;
        }

        return $word;
    }

    /**
     * Step 5: drops a final "e" where the measure allows it, and reduces a
     * final "ll" to "l" for words with a measure above one.
     *
     * @param string $word Word to stem
     *
     * @return string Word after step 5
     */
    private static function step5($word)
    {
        // Part a
        if (substr($word, -1) === 'e') {
            $stem = substr($word, 0, -1);
            $measure = self::m($stem);

            if ($measure > 1) {
                self::replace($word, 'e', '');
            } elseif ($measure == 1) {
                if (!self::cvc($stem)) {
                    self::replace($word, 'e', '');
                }
            }
        }

        // Part b
        if (self::m($word) > 1 && self::doubleConsonant($word) && substr($word, -1) === 'l') {
            $word = substr($word, 0, -1);
        }

        return $word;
    }

    /**
     * Replaces the ending $check with $repl, modifying $str in place.
     *
     * If $m is given, the replacement only happens when the part of the
     * string before $check has a measure (see m()) strictly greater than $m.
     *
     * @param string   $str   String to check; modified by reference
     * @param string   $check Ending to check for
     * @param string   $repl  Replacement string
     * @param int|null $m     Optional measure the stem must exceed
     *
     * @return bool Whether the $check string was at the end of the $str
     *              string. True does not necessarily mean that it was
     *              replaced.
     */
    private static function replace(&$str, $check, $repl, $m = null)
    {
        $len = 0 - strlen($check);

        if (substr($str, $len) === $check) {
            $substr = substr($str, 0, $len);
            if (is_null($m) || self::m($substr) > $m) {
                $str = $substr . $repl;
            }

            return true;
        }

        return false;
    }

    /**
     * Measures the number of vowel-consonant sequences in $str. If c is
     * a consonant sequence and v a vowel sequence, and <..> indicates
     * arbitrary presence,
     *
     *   <c><v>       gives 0
     *   <c>vc<v>     gives 1
     *   <c>vcvc<v>   gives 2
     *   <c>vcvcvc<v> gives 3
     *
     * @param string $str The string to return the m count for
     *
     * @return int The m count
     */
    private static function m($str)
    {
        $c = self::$regex_consonant;
        $v = self::$regex_vowel;

        // Drop the optional leading consonants and trailing vowels, then
        // count the vowel-run/consonant-run pairs that remain.
        $str = preg_replace('#^' . $c . '+#', '', $str);
        $str = preg_replace('#' . $v . '+$#', '', $str);

        preg_match_all('#(' . $v . '+' . $c . '+)#', $str, $matches);

        return count($matches[1]);
    }

    /**
     * Returns true/false as to whether the given string contains two
     * of the same consonant next to each other at the end of the string.
     *
     * @param string $str String to check
     *
     * @return bool Result
     */
    private static function doubleConsonant($str)
    {
        $c = self::$regex_consonant;

        // Square-bracket offsets: the curly-brace form ($s{0}) is a parse
        // error on PHP 8.
        return preg_match('#' . $c . '{2}$#', $str, $matches)
            && $matches[0][0] === $matches[0][1];
    }

    /**
     * Checks for ending CVC sequence where second C is not W, X or Y
     *
     * @param string $str String to check
     *
     * @return bool Result
     */
    private static function cvc($str)
    {
        $c = self::$regex_consonant;
        $v = self::$regex_vowel;

        return preg_match('#(' . $c . $v . $c . ')$#', $str, $matches)
            && strlen($matches[1]) == 3
            && $matches[1][2] !== 'w'
            && $matches[1][2] !== 'x'
            && $matches[1][2] !== 'y';
    }
}

View file

@ -5658,6 +5658,12 @@ phutil_register_library_map(array(
'PhutilRemarkupTableBlockRule' => 'infrastructure/markup/blockrule/PhutilRemarkupTableBlockRule.php',
'PhutilRemarkupTestInterpreterRule' => 'infrastructure/markup/blockrule/PhutilRemarkupTestInterpreterRule.php',
'PhutilRemarkupUnderlineRule' => 'infrastructure/markup/markuprule/PhutilRemarkupUnderlineRule.php',
'PhutilSearchQueryCompiler' => 'applications/search/compiler/PhutilSearchQueryCompiler.php',
'PhutilSearchQueryCompilerSyntaxException' => 'applications/search/compiler/PhutilSearchQueryCompilerSyntaxException.php',
'PhutilSearchQueryCompilerTestCase' => 'applications/search/compiler/__tests__/PhutilSearchQueryCompilerTestCase.php',
'PhutilSearchQueryToken' => 'applications/search/compiler/PhutilSearchQueryToken.php',
'PhutilSearchStemmer' => 'applications/search/compiler/PhutilSearchStemmer.php',
'PhutilSearchStemmerTestCase' => 'applications/search/compiler/__tests__/PhutilSearchStemmerTestCase.php',
'PhutilSlackAuthAdapter' => 'applications/auth/adapter/PhutilSlackAuthAdapter.php',
'PhutilTwitchAuthAdapter' => 'applications/auth/adapter/PhutilTwitchAuthAdapter.php',
'PhutilTwitterAuthAdapter' => 'applications/auth/adapter/PhutilTwitterAuthAdapter.php',
@ -12483,6 +12489,12 @@ phutil_register_library_map(array(
'PhutilRemarkupTableBlockRule' => 'PhutilRemarkupBlockRule',
'PhutilRemarkupTestInterpreterRule' => 'PhutilRemarkupBlockInterpreter',
'PhutilRemarkupUnderlineRule' => 'PhutilRemarkupRule',
'PhutilSearchQueryCompiler' => 'Phobject',
'PhutilSearchQueryCompilerSyntaxException' => 'Exception',
'PhutilSearchQueryCompilerTestCase' => 'PhutilTestCase',
'PhutilSearchQueryToken' => 'Phobject',
'PhutilSearchStemmer' => 'Phobject',
'PhutilSearchStemmerTestCase' => 'PhutilTestCase',
'PhutilSlackAuthAdapter' => 'PhutilOAuthAuthAdapter',
'PhutilTwitchAuthAdapter' => 'PhutilOAuthAuthAdapter',
'PhutilTwitterAuthAdapter' => 'PhutilOAuth1AuthAdapter',

View file

@ -0,0 +1,374 @@
<?php
/**
 * Tokenizes a user-entered search query and compiles the tokens into a
 * boolean query string.
 *
 * The compiled output is built from an "operators" string. This class reads
 * the following positions of that string:
 *
 *   - offset 0: prefix emitted for "and" terms (a space means "no prefix");
 *   - offset 2: prefix emitted for "not" terms;
 *   - offsets 10 and 11: opening and closing quote characters.
 *
 * NOTE(review): the default value appears to follow the layout of MySQL's
 * `ft_boolean_syntax` setting; confirm against the callers that configure it.
 */
final class PhutilSearchQueryCompiler
  extends Phobject {

  private $operators = '+ -><()~*:""&|';
  private $query;
  private $stemmer;
  private $enableFunctions = false;

  const OPERATOR_NOT = 'not';
  const OPERATOR_AND = 'and';
  const OPERATOR_SUBSTRING = 'sub';
  const OPERATOR_EXACT = 'exact';

  /**
   * Set the operator string used when rendering compiled queries.
   *
   * @param string $operators Operator string; offsets 0, 2, 10 and 11 are
   *   read when compiling.
   * @return this
   */
  public function setOperators($operators) {
    $this->operators = $operators;
    return $this;
  }

  /**
   * @return string The operator string currently in use.
   */
  public function getOperators() {
    return $this->operators;
  }

  /**
   * Set the stemmer applied to unquoted terms by @{method:compileStemmedQuery}.
   *
   * @param PhutilSearchStemmer $stemmer Stemmer to use.
   * @return this
   */
  public function setStemmer(PhutilSearchStemmer $stemmer) {
    $this->stemmer = $stemmer;
    return $this;
  }

  /**
   * @return PhutilSearchStemmer|null The configured stemmer, if any.
   */
  public function getStemmer() {
    return $this->stemmer;
  }

  /**
   * Enable or disable "function" syntax (`name:term`) and the extra `~` and
   * `=` operators when tokenizing.
   *
   * @param bool $enable_functions True to enable function syntax.
   * @return this
   */
  public function setEnableFunctions($enable_functions) {
    $this->enableFunctions = $enable_functions;
    return $this;
  }

  /**
   * @return bool Whether function syntax is enabled.
   */
  public function getEnableFunctions() {
    return $this->enableFunctions;
  }

  /**
   * Compile every token, without stemming.
   *
   * @param list<PhutilSearchQueryToken> $tokens Tokens to compile.
   * @return string|null Compiled query, or null if there are no tokens.
   */
  public function compileQuery(array $tokens) {
    assert_instances_of($tokens, 'PhutilSearchQueryToken');

    $result = array();
    foreach ($tokens as $token) {
      $result[] = $this->renderToken($token);
    }

    return $this->compileRenderedTokens($result);
  }

  /**
   * Compile only the quoted tokens, without stemming.
   *
   * @param list<PhutilSearchQueryToken> $tokens Tokens to compile.
   * @return string|null Compiled query, or null if no token is quoted.
   */
  public function compileLiteralQuery(array $tokens) {
    assert_instances_of($tokens, 'PhutilSearchQueryToken');

    $result = array();
    foreach ($tokens as $token) {
      if (!$token->isQuoted()) {
        continue;
      }
      $result[] = $this->renderToken($token);
    }

    return $this->compileRenderedTokens($result);
  }

  /**
   * Compile only the unquoted tokens, passing each value through the
   * configured stemmer (if one is set).
   *
   * @param list<PhutilSearchQueryToken> $tokens Tokens to compile.
   * @return string|null Compiled query, or null if every token is quoted.
   */
  public function compileStemmedQuery(array $tokens) {
    assert_instances_of($tokens, 'PhutilSearchQueryToken');

    $result = array();
    foreach ($tokens as $token) {
      if ($token->isQuoted()) {
        continue;
      }
      $result[] = $this->renderToken($token, $this->getStemmer());
    }

    return $this->compileRenderedTokens($result);
  }

  /**
   * Join rendered tokens with spaces, dropping duplicates.
   *
   * @param list<string> $list Rendered tokens.
   * @return string|null Joined query, or null for an empty list.
   */
  private function compileRenderedTokens(array $list) {
    if (!$list) {
      return null;
    }

    $list = array_unique($list);
    return implode(' ', $list);
  }

  /**
   * Tokenize a raw query string into token objects.
   *
   * @param string $query Raw query text.
   * @return list<PhutilSearchQueryToken> Parsed tokens.
   * @throws PhutilSearchQueryCompilerSyntaxException If the query is
   *   malformed or too long.
   */
  public function newTokens($query) {
    $results = $this->tokenizeQuery($query);

    $tokens = array();
    foreach ($results as $result) {
      $tokens[] = PhutilSearchQueryToken::newFromDictionary($result);
    }

    return $tokens;
  }

  /**
   * Split a raw query into token dictionaries.
   *
   * The query is walked one character at a time by a small state machine.
   * For each term the modes run in this order: "scan" (skip whitespace),
   * "function" (optional `name:` prefix, only when functions are enabled),
   * "operator" (collect operator characters), "quote" (optional opening
   * quote) and "token" (collect the term itself).
   *
   * @param string $query Raw query text.
   * @return list<map<string, wild>> Dictionaries with "operator", "quoted"
   *   and "value" keys, plus "function" when functions are enabled.
   * @throws PhutilSearchQueryCompilerSyntaxException If the query is too
   *   long, has unmatched quotes, has an operator or function prefix with
   *   no term, or has an invalid operator sequence.
   */
  private function tokenizeQuery($query) {
    $maximum_bytes = 1024;

    $query_bytes = strlen($query);
    if ($query_bytes > $maximum_bytes) {
      throw new PhutilSearchQueryCompilerSyntaxException(
        pht(
          'Query is too long (%s bytes, maximum is %s bytes).',
          new PhutilNumber($query_bytes),
          new PhutilNumber($maximum_bytes)));
    }

    // From here on, $query is indexed and counted as a list.
    // NOTE(review): presumably one UTF-8 character per element; confirm
    // against phutil_utf8v().
    $query = phutil_utf8v($query);
    $length = count($query);

    $enable_functions = $this->getEnableFunctions();

    $mode = 'scan';
    $current_operator = array();
    $current_token = array();
    $current_function = null;
    $is_quoted = false;
    $tokens = array();

    if ($enable_functions) {
      $operator_characters = '[~=+-]';
    } else {
      $operator_characters = '[+-]';
    }

    for ($ii = 0; $ii < $length; $ii++) {
      $character = $query[$ii];

      if ($mode == 'scan') {
        if (preg_match('/^\s\z/u', $character)) {
          continue;
        }

        $mode = 'function';
      }

      if ($mode == 'function') {
        $mode = 'operator';

        if ($enable_functions) {
          // Look ahead over a run of ASCII letters. The run is a function
          // name only if the first non-letter character is a colon.
          $found = false;
          for ($jj = $ii; $jj < $length; $jj++) {
            if (preg_match('/^[a-zA-Z]\z/u', $query[$jj])) {
              continue;
            }
            if ($query[$jj] == ':') {
              $found = $jj;
            }
            break;
          }

          if ($found !== false) {
            $function = array_slice($query, $ii, ($jj - $ii));
            $current_function = implode('', $function);

            // A bare leading colon (":cat") names no function.
            if (!strlen($current_function)) {
              $current_function = null;
            }

            // Resume after the colon.
            $ii = $jj;
            continue;
          }
        }
      }

      if ($mode == 'operator') {
        // Whitespace between an operator and its term is skipped, so the
        // operator applies to the next term.
        if (preg_match('/^\s\z/u', $character)) {
          continue;
        }

        if (preg_match('/^'.$operator_characters.'\z/', $character)) {
          $current_operator[] = $character;
          continue;
        }

        $mode = 'quote';
      }

      if ($mode == 'quote') {
        if (preg_match('/^"\z/', $character)) {
          $is_quoted = true;
          $mode = 'token';
          continue;
        }

        $mode = 'token';
      }

      if ($mode == 'token') {
        $capture = false;
        $was_quoted = $is_quoted;
        if ($is_quoted) {
          if (preg_match('/^"\z/', $character)) {
            $capture = true;
            $mode = 'scan';
            $is_quoted = false;
          }
        } else {
          if (preg_match('/^\s\z/u', $character)) {
            $capture = true;
            $mode = 'scan';
          }

          // A quote in the middle of an unquoted term ends that term and
          // immediately opens a new quoted term.
          if (preg_match('/^"\z/', $character)) {
            $capture = true;
            $mode = 'token';
            $is_quoted = true;
          }
        }

        if ($capture) {
          $token = array(
            'operator' => $current_operator,
            'quoted' => $was_quoted,
            'value' => $current_token,
          );

          if ($enable_functions) {
            $token['function'] = $current_function;
          }

          $tokens[] = $token;

          $current_operator = array();
          $current_token = array();
          $current_function = null;
          continue;
        } else {
          $current_token[] = $character;
        }
      }
    }

    if ($is_quoted) {
      throw new PhutilSearchQueryCompilerSyntaxException(
        pht(
          'Query contains unmatched double quotes.'));
    }

    if ($mode == 'operator') {
      // Ending in "operator" mode with no operator characters collected can
      // only happen after a function prefix was consumed (for example
      // "title:"). Report that directly instead of an empty operator.
      if (!$current_operator) {
        throw new PhutilSearchQueryCompilerSyntaxException(
          pht(
            'Query contains function prefix ("%s:") with no search term.',
            (string)$current_function));
      }

      throw new PhutilSearchQueryCompilerSyntaxException(
        pht(
          'Query contains operator ("%s") with no search term.',
          implode('', $current_operator)));
    }

    // Flush the term in progress when the query ended. Empty values are
    // discarded below.
    $token = array(
      'operator' => $current_operator,
      'quoted' => false,
      'value' => $current_token,
    );

    if ($enable_functions) {
      $token['function'] = $current_function;
    }

    $tokens[] = $token;

    $results = array();
    foreach ($tokens as $token) {
      $value = implode('', $token['value']);
      $operator_string = implode('', $token['operator']);

      if (!strlen($value)) {
        continue;
      }

      $is_quoted = $token['quoted'];

      switch ($operator_string) {
        case '-':
          $operator = self::OPERATOR_NOT;
          break;
        case '~':
          $operator = self::OPERATOR_SUBSTRING;
          break;
        case '=':
          $operator = self::OPERATOR_EXACT;
          break;
        case '+':
          $operator = self::OPERATOR_AND;
          break;
        case '':
          // See T12995. If this query term contains Chinese, Japanese or
          // Korean characters, treat the term as a substring term by default.
          // These languages do not separate words with spaces, so the term
          // search mode is normally useless.
          if ($enable_functions && !$is_quoted && phutil_utf8_is_cjk($value)) {
            $operator = self::OPERATOR_SUBSTRING;
          } else {
            $operator = self::OPERATOR_AND;
          }
          break;
        default:
          throw new PhutilSearchQueryCompilerSyntaxException(
            pht(
              'Query has an invalid sequence of operators ("%s").',
              $operator_string));
      }

      $result = array(
        'operator' => $operator,
        'quoted' => $is_quoted,
        'value' => $value,
      );

      if ($enable_functions) {
        $result['function'] = $token['function'];
      }

      $results[] = $result;
    }

    return $results;
  }

  /**
   * Render one token as an operator prefix followed by the quoted value.
   *
   * @param PhutilSearchQueryToken $token Token to render.
   * @param PhutilSearchStemmer|null $stemmer Optional stemmer applied to
   *   the value before quoting.
   * @return string Rendered token.
   */
  private function renderToken(
    PhutilSearchQueryToken $token,
    PhutilSearchStemmer $stemmer = null) {
    $value = $token->getValue();

    if ($stemmer) {
      $value = $stemmer->stemToken($value);
    }

    $value = $this->quoteToken($value);
    $operator = $token->getOperator();
    $prefix = $this->getOperatorPrefix($operator);

    $value = $prefix.$value;

    return $value;
  }

  /**
   * Look up the prefix character for an operator in the operator string.
   *
   * @param string $operator One of the OPERATOR_* constants.
   * @return string|null Prefix character, or null when the configured
   *   prefix is a space.
   * @throws PhutilSearchQueryCompilerSyntaxException For operators other
   *   than "and" and "not".
   */
  private function getOperatorPrefix($operator) {
    $operators = $this->operators;

    switch ($operator) {
      case self::OPERATOR_AND:
        $prefix = $operators[0];
        break;
      case self::OPERATOR_NOT:
        $prefix = $operators[2];
        break;
      default:
        throw new PhutilSearchQueryCompilerSyntaxException(
          pht(
            'Unsupported operator prefix "%s".',
            $operator));
    }

    if ($prefix == ' ') {
      $prefix = null;
    }

    return $prefix;
  }

  /**
   * Wrap a value in the configured opening and closing quote characters.
   *
   * @param string $value Value to quote.
   * @return string Quoted value.
   */
  private function quoteToken($value) {
    $open_quote = $this->operators[10];
    $close_quote = $this->operators[11];

    return $open_quote.$value.$close_quote;
  }

}

View file

@ -0,0 +1,4 @@
<?php
/**
 * Thrown when a search query cannot be tokenized or compiled, for example
 * because of unmatched quotes or an invalid operator sequence.
 */
final class PhutilSearchQueryCompilerSyntaxException extends Exception {
}

View file

@ -0,0 +1,37 @@
<?php
/**
 * A single parsed search term: its text, the operator applied to it,
 * whether it was quoted, and an optional function name.
 */
final class PhutilSearchQueryToken
  extends Phobject {

  private $isQuoted;
  private $value;
  private $operator;
  private $function;

  /**
   * Build a token from a dictionary with "quoted", "operator" and "value"
   * keys. The "function" key is optional.
   *
   * @param map<string, wild> $dictionary Token properties.
   * @return PhutilSearchQueryToken New token.
   */
  public static function newFromDictionary(array $dictionary) {
    $object = new self();

    $object->isQuoted = $dictionary['quoted'];
    $object->operator = $dictionary['operator'];
    $object->value = $dictionary['value'];

    // "function" is looked up with idx() rather than indexed directly.
    $object->function = idx($dictionary, 'function');

    return $object;
  }

  /**
   * @return bool Value of the "quoted" property.
   */
  public function isQuoted() {
    return $this->isQuoted;
  }

  /**
   * @return string Value of the "value" property.
   */
  public function getValue() {
    return $this->value;
  }

  /**
   * @return string Value of the "operator" property.
   */
  public function getOperator() {
    return $this->operator;
  }

  /**
   * @return string|null Value of the "function" property, if any.
   */
  public function getFunction() {
    return $this->function;
  }

}

View file

@ -0,0 +1,74 @@
<?php
/**
 * Reduces search tokens and whole documents to lowercase word stems using
 * the bundled Porter stemmer.
 */
final class PhutilSearchStemmer
  extends Phobject {

  /**
   * Normalize and stem a single token.
   *
   * @param string $token Raw token.
   * @return string Stemmed token.
   */
  public function stemToken($token) {
    return $this->applyStemmer($this->normalizeToken($token));
  }

  /**
   * Normalize a document, split it into words, and return the stems of the
   * distinct words joined by single spaces.
   *
   * @param string $corpus Document text.
   * @return string Space-separated stems, in order of first appearance.
   */
  public function stemCorpus($corpus) {
    $normalized = $this->normalizeCorpus($corpus);
    $pieces = preg_split('/[^a-zA-Z0-9\x7F-\xFF._]+/', $normalized);

    // Collect distinct words, keyed by the word itself so repeats collapse.
    $distinct = array();
    foreach ($pieces as $piece) {
      $piece = trim($piece, '._');

      // Words shorter than three bytes are not kept.
      if (strlen($piece) >= 3) {
        $distinct[$piece] = $piece;
      }
    }

    $stemmed = array();
    foreach ($distinct as $word) {
      $stemmed[] = $this->applyStemmer($word);
    }

    return implode(' ', $stemmed);
  }

  /**
   * @param string $token Raw token.
   * @return string Lowercased token.
   */
  private function normalizeToken($token) {
    return phutil_utf8_strtolower($token);
  }

  /**
   * @param string $corpus Raw document text.
   * @return string Lowercased document text.
   */
  private function normalizeCorpus($corpus) {
    return phutil_utf8_strtolower($corpus);
  }

  /**
   * Stem one normalized token, loading the external stemmer on first use.
   *
   * @phutil-external-symbol class Porter
   *
   * @param string $normalized_token Token that has already been normalized.
   * @return string The stem, or the input token if it is left unstemmed.
   */
  private function applyStemmer($normalized_token) {
    // Tokens with internal punctuation (domain names, Conduit API methods
    // and other informal tokens) are kept literally.
    if (preg_match('/[._]/', $normalized_token)) {
      return $normalized_token;
    }

    static $loaded;

    if ($loaded === null) {
      $root = dirname(phutil_get_library_root('phabricator'));
      require_once $root.'/externals/porter-stemmer/src/Porter.php';
      $loaded = true;
    }

    $stem = Porter::stem($normalized_token);

    // A stem shorter than three bytes won't be a candidate for indexing,
    // and such tokens are likely acronyms (like "DNS") rather than English
    // words, so the original token is kept.
    return (strlen($stem) < 3)
      ? $normalized_token
      : $stem;
  }

}

View file

@ -0,0 +1,220 @@
<?php
/**
 * Unit tests for tokenizing and compiling search queries.
 */
final class PhutilSearchQueryCompilerTestCase
  extends PhutilTestCase {

  /**
   * Plain compilation: maps each query to its compiled form, or to `false`
   * when a syntax exception is expected.
   */
  public function testCompileQueries() {
    $tests = array(
      '' => null,
      'cat dog' => '+"cat" +"dog"',
      'cat -dog' => '+"cat" -"dog"',
      'cat-dog' => '+"cat-dog"',

      // If there are spaces after an operator, the operator applies to the
      // next search term.
      'cat - dog' => '+"cat" -"dog"',

      // Double quotes serve as delimiters even if there is no whitespace
      // between terms.
      '"cat"dog' => '+"cat" +"dog"',

      // This query is too long.
      str_repeat('x', 2048) => false,

      // Multiple operators are not permitted.
      '++cat' => false,
      '+-cat' => false,
      '--cat' => false,

      // Stray operators are not permitted.
      '+' => false,
      'cat +' => false,

      // Double quotes must be paired.
      '"' => false,
      'cat "' => false,
      '"cat' => false,
      'A"' => false,
      'A"B"' => '+"A" +"B"',
    );

    $this->assertCompileQueries($tests);

    // Test that we compile queries correctly if the operators have been
    // swapped to use "AND" by default.
    $operator_tests = array(
      'cat dog' => '"cat" "dog"',
      'cat -dog' => '"cat" -"dog"',
    );
    $this->assertCompileQueries($operator_tests, ' |-><()~*:""&\'');

    // Test that we compile queries correctly if the quote operators have
    // been swapped to differ.
    $quote_tests = array(
      'cat dog' => '+[cat] +[dog]',
      'cat -dog' => '+[cat] -[dog]',
    );
    $this->assertCompileQueries($quote_tests, '+ -><()~*:[]&|');
  }

  /**
   * Stemmed compilation: each expectation is a pair of
   * (literal query, stemmed query).
   */
  public function testCompileQueriesWithStemming() {
    $stemming_tests = array(
      'cat dog' => array(
        null,
        '+"cat" +"dog"',
      ),
      'cats dogs' => array(
        null,
        '+"cat" +"dog"',
      ),
      'cats "dogs"' => array(
        '+"dogs"',
        '+"cat"',
      ),
      '"blessed blade" of the windseeker' => array(
        '+"blessed blade"',
        '+"of" +"the" +"windseek"',
      ),
      'mailing users for mentions on tasks' => array(
        null,
        '+"mail" +"user" +"for" +"mention" +"on" +"task"',
      ),
    );

    $stemmer = new PhutilSearchStemmer();
    $this->assertCompileQueries($stemming_tests, null, $stemmer);
  }

  /**
   * Function syntax: each expectation is a list of
   * (function, operator, value) triples, one per token.
   */
  public function testCompileQueriesWithFunctions() {
    $op_and = PhutilSearchQueryCompiler::OPERATOR_AND;
    $op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING;
    $op_exact = PhutilSearchQueryCompiler::OPERATOR_EXACT;

    // A single three-byte UTF-8 character, used to exercise the CJK
    // substring default.
    $mao = "\xE7\x8C\xAB";

    $function_tests = array(
      'cat' => array(
        array(null, $op_and, 'cat'),
      ),
      ':cat' => array(
        array(null, $op_and, 'cat'),
      ),
      'title:cat' => array(
        array('title', $op_and, 'cat'),
      ),
      'title:cat:dog' => array(
        array('title', $op_and, 'cat:dog'),
      ),
      'title:~cat' => array(
        array('title', $op_sub, 'cat'),
      ),
      'cat title:="Meow Meow"' => array(
        array(null, $op_and, 'cat'),
        array('title', $op_exact, 'Meow Meow'),
      ),
      'title:cat title:dog' => array(
        array('title', $op_and, 'cat'),
        array('title', $op_and, 'dog'),
      ),
      '~"core and seven years ag"' => array(
        array(null, $op_sub, 'core and seven years ag'),
      ),
      $mao => array(
        array(null, $op_sub, $mao),
      ),
      '+'.$mao => array(
        array(null, $op_and, $mao),
      ),
      '~'.$mao => array(
        array(null, $op_sub, $mao),
      ),
      '"'.$mao.'"' => array(
        array(null, $op_and, $mao),
      ),
    );

    $this->assertCompileFunctionQueries($function_tests);
  }

  /**
   * Compile each input and compare the outcome with its expectation.
   *
   * Without a stemmer the outcome is the compiled query. With a stemmer it
   * is the pair (literal query, stemmed query). In both cases the outcome
   * is `false` when compilation raises a syntax exception.
   *
   * @param map<string, wild> $tests Map of query to expected outcome.
   * @param string|null $operators Optional operator string override.
   * @param PhutilSearchStemmer|null $stemmer Optional stemmer.
   * @return void
   */
  private function assertCompileQueries(
    array $tests,
    $operators = null,
    PhutilSearchStemmer $stemmer = null) {

    foreach ($tests as $input => $expect) {
      try {
        $compiler = new PhutilSearchQueryCompiler();

        if ($operators !== null) {
          $compiler->setOperators($operators);
        }

        if ($stemmer !== null) {
          $compiler->setStemmer($stemmer);
        }

        $tokens = $compiler->newTokens($input);

        if ($stemmer) {
          $literal = $compiler->compileLiteralQuery($tokens);
          $stemmed = $compiler->compileStemmedQuery($tokens);
          $actual = array($literal, $stemmed);
        } else {
          $actual = $compiler->compileQuery($tokens);
        }
      } catch (PhutilSearchQueryCompilerSyntaxException $ex) {
        // A syntax error is represented as `false` in the fixtures.
        $actual = false;
      }

      if ($stemmer) {
        $label = pht('Stemmed compilation of query: %s', $input);
      } else {
        $label = pht('Compilation of query: %s', $input);
      }

      $this->assertEqual($expect, $actual, $label);
    }
  }

  /**
   * Tokenize each input with functions enabled and compare the resulting
   * (function, operator, value) triples with the expectation.
   *
   * @param map<string, list<list<wild>>> $tests Map of query to triples.
   * @return void
   */
  private function assertCompileFunctionQueries(array $tests) {
    foreach ($tests as $input => $expect) {
      $compiler = id(new PhutilSearchQueryCompiler())
        ->setEnableFunctions(true);

      $triples = array();
      foreach ($compiler->newTokens($input) as $token) {
        $triples[] = array(
          $token->getFunction(),
          $token->getOperator(),
          $token->getValue(),
        );
      }

      $this->assertEqual(
        $expect,
        $triples,
        pht('Function compilation of query: %s', $input));
    }
  }

}

View file

@ -0,0 +1,85 @@
<?php
/**
 * Unit tests for stemming individual tokens and whole documents.
 */
final class PhutilSearchStemmerTestCase
  extends PhutilTestCase {

  /**
   * Each fixture maps a raw token to the stem expected from stemToken().
   */
  public function testStemTokens() {
    $tests = array(
      // Various real-world cases collected from users before we implemented
      // stemming.
      'tokens' => 'token',
      'panels' => 'panel',

      'renames' => 'renam',
      'rename' => 'renam',

      'components' => 'compon',
      'component' => 'compon',

      'implementation' => 'implement',
      'implements' => 'implement',
      'implementing' => 'implement',
      'implementer' => 'implement',

      'deleting' => 'delet',
      'deletion' => 'delet',
      'delete' => 'delet',

      'erratically' => 'errat',
      'erratic' => 'errat',

      // Stems should be normalized.
      'DOG' => 'dog',

      // If stemming would bring a token under 3 characters, it should not
      // be stemmed.
      'dns' => 'dns',
      'nis' => 'nis',

      // Complex tokens with internal punctuation should be left untouched;
      // these are usually things like domain names, API calls, informal tags,
      // etc.
      'apples' => 'appl',
      'bananas' => 'banana',
      'apples_bananas' => 'apples_bananas',
      'apples_bananas.apples_bananas' => 'apples_bananas.apples_bananas',
    );

    $stemmer = new PhutilSearchStemmer();
    foreach ($tests as $token => $expected_stem) {
      $this->assertEqual(
        $expect = $expected_stem,
        $stemmer->stemToken($token),
        pht('Token stem of "%s".', $token));
    }
  }

  /**
   * Each fixture maps a document to the space-separated stems expected from
   * stemCorpus().
   */
  public function testStemDocuments() {
    $tests = array(
      'The wild boar meandered erratically.' =>
        'the wild boar meander errat',
      'Fool me onc, shame on you. Fool me twice, shame on me.' =>
        'fool onc shame you twice',
      'Fireball is a seventh-level spell which deals 2d16 points of damage '.
      'in a 1-meter radius around a target.' =>
        'firebal seventh level spell which deal 2d16 point damag meter '.
        'radiu around target',
      'apples-bananas' => 'appl banana',
      'apples_bananas' => 'apples_bananas',
      'apples.bananas' => 'apples.bananas',
      'oddly-proportioned' => 'oddli proport',
    );

    $stemmer = new PhutilSearchStemmer();
    foreach ($tests as $document => $expected_stems) {
      $this->assertEqual(
        $expected_stems,
        $stemmer->stemCorpus($document),
        pht('Corpus stem of: %s', $document));
    }
  }

}