mirror of
https://we.phorge.it/source/phorge.git
synced 2024-11-25 16:22:43 +01:00
Move search query parser/compiler classes to Phabricator
Summary: Ref T13472. Ref T13395. These classes are only used by Phabricator and not likely to find much use in Arcanist. Test Plan: Grepped libphutil and Arcanist for removed symbols. Maniphest Tasks: T13472, T13395 Differential Revision: https://secure.phabricator.com/D20939
This commit is contained in:
parent
54bcbdaba9
commit
767528c0ed
10 changed files with 1294 additions and 0 deletions
20
externals/porter-stemmer/LICENSE
vendored
Normal file
20
externals/porter-stemmer/LICENSE
vendored
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
The MIT License (MIT)
|
||||||
|
|
||||||
|
Copyright (c) 2005-2016 Richard Heyes (http://www.phpguru.org/)
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||||
|
this software and associated documentation files (the "Software"), to deal in
|
||||||
|
the Software without restriction, including without limitation the rights to
|
||||||
|
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||||
|
the Software, and to permit persons to whom the Software is furnished to do so,
|
||||||
|
subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||||
|
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||||
|
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||||
|
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||||
|
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
42
externals/porter-stemmer/README.md
vendored
Normal file
42
externals/porter-stemmer/README.md
vendored
Normal file
|
@ -0,0 +1,42 @@
|
||||||
|
# Porter Stemmer by Richard Heyes
|
||||||
|
|
||||||
|
# Installation (with composer)
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"require": {
|
||||||
|
"camspiers/porter-stemmer": "1.0.0"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
$ composer install
|
||||||
|
|
||||||
|
# Usage
|
||||||
|
|
||||||
|
```php
|
||||||
|
$stem = Porter::Stem($word);
|
||||||
|
```
|
||||||
|
|
||||||
|
# License
|
||||||
|
|
||||||
|
The MIT License (MIT)
|
||||||
|
|
||||||
|
Copyright (c) 2005-2016 Richard Heyes (http://www.phpguru.org/)
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||||
|
this software and associated documentation files (the "Software"), to deal in
|
||||||
|
the Software without restriction, including without limitation the rights to
|
||||||
|
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||||
|
the Software, and to permit persons to whom the Software is furnished to do so,
|
||||||
|
subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||||
|
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||||
|
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||||
|
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||||
|
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
426
externals/porter-stemmer/src/Porter.php
vendored
Normal file
426
externals/porter-stemmer/src/Porter.php
vendored
Normal file
|
@ -0,0 +1,426 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
# vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright (c) 2005-2016 Richard Heyes (http://www.phpguru.org/)
|
||||||
|
*
|
||||||
|
* Portions Copyright 2003-2007 Jon Abernathy <jon@chuggnutt.com>
|
||||||
|
*
|
||||||
|
* Originally available under the GPL 2 or greater. Relicensed with permission
|
||||||
|
* of original authors under the MIT License in 2016.
|
||||||
|
*
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* @package PorterStemmer
|
||||||
|
* @author Richard Heyes
|
||||||
|
* @author Jon Abernathy <jon@chuggnutt.com>
|
||||||
|
* @copyright 2005-2016 Richard Heyes (http://www.phpguru.org/)
|
||||||
|
* @license http://www.opensource.org/licenses/mit-license.html MIT License
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* PHP 5 Implementation of the Porter Stemmer algorithm. Certain elements
|
||||||
|
* were borrowed from the (broken) implementation by Jon Abernathy.
|
||||||
|
*
|
||||||
|
* See http://tartarus.org/~martin/PorterStemmer/ for a description of the
|
||||||
|
* algorithm.
|
||||||
|
*
|
||||||
|
* Usage:
|
||||||
|
*
|
||||||
|
* $stem = Porter::Stem($word);
|
||||||
|
*
|
||||||
|
* How easy is that?
|
||||||
|
*
|
||||||
|
* @package PorterStemmer
|
||||||
|
* @author Richard Heyes
|
||||||
|
* @author Jon Abernathy <jon@chuggnutt.com>
|
||||||
|
* @copyright 2005-2016 Richard Heyes (http://www.phpguru.org/)
|
||||||
|
* @license http://www.opensource.org/licenses/mit-license.html MIT License
|
||||||
|
*/
|
||||||
|
class Porter
{
    /**
     * Regex for matching a consonant
     *
     * @var string
     */
    private static $regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';

    /**
     * Regex for matching a vowel
     *
     * @var string
     */
    private static $regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';

    /**
     * Stems a word by running it through steps 1-5 of the Porter algorithm.
     *
     * Words of two bytes or fewer are returned unchanged.
     *
     * @param  string $word Word to stem
     *
     * @return string       Stemmed word
     */
    public static function Stem($word)
    {
        if (strlen($word) <= 2) {
            return $word;
        }

        $word = self::step1ab($word);
        $word = self::step1c($word);
        $word = self::step2($word);
        $word = self::step3($word);
        $word = self::step4($word);
        $word = self::step5($word);

        return $word;
    }

    /**
     * Step 1, parts a and b: plural endings, then "eed" / "ed" / "ing".
     *
     * @param  string $word Word to stem
     *
     * @return string       Word after step 1a/1b
     */
    private static function step1ab($word)
    {
        // Part a
        // replace() returns true as soon as an ending matches, so the OR
        // chain stops at the first (longest) matching suffix.
        if (substr($word, -1) == 's') {

               self::replace($word, 'sses', 'ss')
            OR self::replace($word, 'ies', 'i')
            OR self::replace($word, 'ss', 'ss')
            OR self::replace($word, 's', '');
        }

        // Part b
        if (substr($word, -2, 1) != 'e' OR !self::replace($word, 'eed', 'ee', 0)) { // First rule
            $v = self::$regex_vowel;

            // ing and ed
            if (   preg_match("#$v+#", substr($word, 0, -3)) && self::replace($word, 'ing', '')
                OR preg_match("#$v+#", substr($word, 0, -2)) && self::replace($word, 'ed', '')) { // Note use of && and OR, for precedence reasons

                // If one of above two test successful
                if (    !self::replace($word, 'at', 'ate')
                    AND !self::replace($word, 'bl', 'ble')
                    AND !self::replace($word, 'iz', 'ize')) {

                    // Double consonant ending
                    if (    self::doubleConsonant($word)
                        AND substr($word, -2) != 'll'
                        AND substr($word, -2) != 'ss'
                        AND substr($word, -2) != 'zz') {

                        $word = substr($word, 0, -1);

                    } elseif (self::m($word) == 1 AND self::cvc($word)) {
                        $word .= 'e';
                    }
                }
            }
        }

        return $word;
    }

    /**
     * Step 1c: turn a trailing "y" into "i" when the rest of the word
     * contains a vowel.
     *
     * @param  string $word Word to stem
     *
     * @return string       Word after step 1c
     */
    private static function step1c($word)
    {
        $v = self::$regex_vowel;

        if (substr($word, -1) == 'y' && preg_match("#$v+#", substr($word, 0, -1))) {
            self::replace($word, 'y', 'i');
        }

        return $word;
    }

    /**
     * Step 2: map double suffixes to single ones. The switch dispatches on
     * the penultimate character so only candidate endings are tested.
     *
     * @param  string $word Word to stem
     *
     * @return string       Word after step 2
     */
    private static function step2($word)
    {
        switch (substr($word, -2, 1)) {
            case 'a':
                   self::replace($word, 'ational', 'ate', 0)
                OR self::replace($word, 'tional', 'tion', 0);
                break;

            case 'c':
                   self::replace($word, 'enci', 'ence', 0)
                OR self::replace($word, 'anci', 'ance', 0);
                break;

            case 'e':
                self::replace($word, 'izer', 'ize', 0);
                break;

            case 'g':
                self::replace($word, 'logi', 'log', 0);
                break;

            case 'l':
                   self::replace($word, 'entli', 'ent', 0)
                OR self::replace($word, 'ousli', 'ous', 0)
                OR self::replace($word, 'alli', 'al', 0)
                OR self::replace($word, 'bli', 'ble', 0)
                OR self::replace($word, 'eli', 'e', 0);
                break;

            case 'o':
                   self::replace($word, 'ization', 'ize', 0)
                OR self::replace($word, 'ation', 'ate', 0)
                OR self::replace($word, 'ator', 'ate', 0);
                break;

            case 's':
                   self::replace($word, 'iveness', 'ive', 0)
                OR self::replace($word, 'fulness', 'ful', 0)
                OR self::replace($word, 'ousness', 'ous', 0)
                OR self::replace($word, 'alism', 'al', 0);
                break;

            case 't':
                   self::replace($word, 'biliti', 'ble', 0)
                OR self::replace($word, 'aliti', 'al', 0)
                OR self::replace($word, 'iviti', 'ive', 0);
                break;
        }

        return $word;
    }

    /**
     * Step 3: handle "-ic-", "-full", "-ness" and similar endings.
     *
     * @param  string $word String to stem
     *
     * @return string       Word after step 3
     */
    private static function step3($word)
    {
        switch (substr($word, -2, 1)) {
            case 'a':
                self::replace($word, 'ical', 'ic', 0);
                break;

            case 's':
                self::replace($word, 'ness', '', 0);
                break;

            case 't':
                   self::replace($word, 'icate', 'ic', 0)
                OR self::replace($word, 'iciti', 'ic', 0);
                break;

            case 'u':
                self::replace($word, 'ful', '', 0);
                break;

            case 'v':
                self::replace($word, 'ative', '', 0);
                break;

            case 'z':
                self::replace($word, 'alize', 'al', 0);
                break;
        }

        return $word;
    }

    /**
     * Step 4: strip residual suffixes when the remaining stem has m() > 1.
     *
     * @param  string $word Word to stem
     *
     * @return string       Word after step 4
     */
    private static function step4($word)
    {
        switch (substr($word, -2, 1)) {
            case 'a':
                self::replace($word, 'al', '', 1);
                break;

            case 'c':
                   self::replace($word, 'ance', '', 1)
                OR self::replace($word, 'ence', '', 1);
                break;

            case 'e':
                self::replace($word, 'er', '', 1);
                break;

            case 'i':
                self::replace($word, 'ic', '', 1);
                break;

            case 'l':
                   self::replace($word, 'able', '', 1)
                OR self::replace($word, 'ible', '', 1);
                break;

            case 'n':
                   self::replace($word, 'ant', '', 1)
                OR self::replace($word, 'ement', '', 1)
                OR self::replace($word, 'ment', '', 1)
                OR self::replace($word, 'ent', '', 1);
                break;

            case 'o':
                // "ion" is only removed when preceded by "s" or "t".
                if (substr($word, -4) == 'tion' OR substr($word, -4) == 'sion') {
                    self::replace($word, 'ion', '', 1);
                } else {
                    self::replace($word, 'ou', '', 1);
                }
                break;

            case 's':
                self::replace($word, 'ism', '', 1);
                break;

            case 't':
                   self::replace($word, 'ate', '', 1)
                OR self::replace($word, 'iti', '', 1);
                break;

            case 'u':
                self::replace($word, 'ous', '', 1);
                break;

            case 'v':
                self::replace($word, 'ive', '', 1);
                break;

            case 'z':
                self::replace($word, 'ize', '', 1);
                break;
        }

        return $word;
    }

    /**
     * Step 5: remove a final "e" and reduce a final "ll" where the measure
     * of the stem allows it.
     *
     * @param  string $word Word to stem
     *
     * @return string       Word after step 5
     */
    private static function step5($word)
    {
        // Part a
        if (substr($word, -1) == 'e') {
            if (self::m(substr($word, 0, -1)) > 1) {
                self::replace($word, 'e', '');

            } elseif (self::m(substr($word, 0, -1)) == 1) {

                if (!self::cvc(substr($word, 0, -1))) {
                    self::replace($word, 'e', '');
                }
            }
        }

        // Part b
        if (self::m($word) > 1 AND self::doubleConsonant($word) AND substr($word, -1) == 'l') {
            $word = substr($word, 0, -1);
        }

        return $word;
    }

    /**
     * Replaces the first string with the second, at the end of the string
     *
     * If third arg is given, then the preceding string must have an m()
     * count strictly greater than it for the replacement to happen.
     *
     * @param  string $str   String to check (modified in place)
     * @param  string $check Ending to check for
     * @param  string $repl  Replacement string
     * @param  int    $m     Optional m() threshold the stem must exceed
     *
     * @return bool          Whether the $check string was at the end of the $str
     *                       string. True does not necessarily mean that it was
     *                       replaced.
     */
    private static function replace(&$str, $check, $repl, $m = null)
    {
        $len = 0 - strlen($check);

        if (substr($str, $len) == $check) {
            $substr = substr($str, 0, $len);
            if (is_null($m) OR self::m($substr) > $m) {
                $str = $substr . $repl;
            }

            return true;
        }

        return false;
    }

    /**
     * What, you mean it's not obvious from the name?
     *
     * m() measures the number of consonant sequences in $str. if c is
     * a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
     * presence,
     *
     * <c><v>       gives 0
     * <c>vc<v>     gives 1
     * <c>vcvc<v>   gives 2
     * <c>vcvcvc<v> gives 3
     *
     * @param  string $str The string to return the m count for
     *
     * @return int         The m count
     */
    private static function m($str)
    {
        $c = self::$regex_consonant;
        $v = self::$regex_vowel;

        // Drop the optional leading consonant run and trailing vowel run so
        // that only complete vowel-consonant pairs remain to be counted.
        $str = preg_replace("#^$c+#", '', $str);
        $str = preg_replace("#$v+$#", '', $str);

        preg_match_all("#($v+$c+)#", $str, $matches);

        return count($matches[1]);
    }

    /**
     * Returns true/false as to whether the given string contains two
     * of the same consonant next to each other at the end of the string.
     *
     * @param  string $str String to check
     *
     * @return bool        Result
     */
    private static function doubleConsonant($str)
    {
        $c = self::$regex_consonant;

        // Square-bracket offsets: the "{n}" string offset syntax was removed
        // in PHP 8.0 and is a parse error there.
        return preg_match("#$c{2}$#", $str, $matches) AND $matches[0][0] == $matches[0][1];
    }

    /**
     * Checks for ending CVC sequence where second C is not W, X or Y
     *
     * @param  string $str String to check
     *
     * @return bool        Result
     */
    private static function cvc($str)
    {
        $c = self::$regex_consonant;
        $v = self::$regex_vowel;

        // Square-bracket offsets: the "{n}" string offset syntax was removed
        // in PHP 8.0 and is a parse error there.
        return     preg_match("#($c$v$c)$#", $str, $matches)
               AND strlen($matches[1]) == 3
               AND $matches[1][2] != 'w'
               AND $matches[1][2] != 'x'
               AND $matches[1][2] != 'y';
    }
}
|
|
@ -5658,6 +5658,12 @@ phutil_register_library_map(array(
|
||||||
'PhutilRemarkupTableBlockRule' => 'infrastructure/markup/blockrule/PhutilRemarkupTableBlockRule.php',
|
'PhutilRemarkupTableBlockRule' => 'infrastructure/markup/blockrule/PhutilRemarkupTableBlockRule.php',
|
||||||
'PhutilRemarkupTestInterpreterRule' => 'infrastructure/markup/blockrule/PhutilRemarkupTestInterpreterRule.php',
|
'PhutilRemarkupTestInterpreterRule' => 'infrastructure/markup/blockrule/PhutilRemarkupTestInterpreterRule.php',
|
||||||
'PhutilRemarkupUnderlineRule' => 'infrastructure/markup/markuprule/PhutilRemarkupUnderlineRule.php',
|
'PhutilRemarkupUnderlineRule' => 'infrastructure/markup/markuprule/PhutilRemarkupUnderlineRule.php',
|
||||||
|
'PhutilSearchQueryCompiler' => 'applications/search/compiler/PhutilSearchQueryCompiler.php',
|
||||||
|
'PhutilSearchQueryCompilerSyntaxException' => 'applications/search/compiler/PhutilSearchQueryCompilerSyntaxException.php',
|
||||||
|
'PhutilSearchQueryCompilerTestCase' => 'applications/search/compiler/__tests__/PhutilSearchQueryCompilerTestCase.php',
|
||||||
|
'PhutilSearchQueryToken' => 'applications/search/compiler/PhutilSearchQueryToken.php',
|
||||||
|
'PhutilSearchStemmer' => 'applications/search/compiler/PhutilSearchStemmer.php',
|
||||||
|
'PhutilSearchStemmerTestCase' => 'applications/search/compiler/__tests__/PhutilSearchStemmerTestCase.php',
|
||||||
'PhutilSlackAuthAdapter' => 'applications/auth/adapter/PhutilSlackAuthAdapter.php',
|
'PhutilSlackAuthAdapter' => 'applications/auth/adapter/PhutilSlackAuthAdapter.php',
|
||||||
'PhutilTwitchAuthAdapter' => 'applications/auth/adapter/PhutilTwitchAuthAdapter.php',
|
'PhutilTwitchAuthAdapter' => 'applications/auth/adapter/PhutilTwitchAuthAdapter.php',
|
||||||
'PhutilTwitterAuthAdapter' => 'applications/auth/adapter/PhutilTwitterAuthAdapter.php',
|
'PhutilTwitterAuthAdapter' => 'applications/auth/adapter/PhutilTwitterAuthAdapter.php',
|
||||||
|
@ -12483,6 +12489,12 @@ phutil_register_library_map(array(
|
||||||
'PhutilRemarkupTableBlockRule' => 'PhutilRemarkupBlockRule',
|
'PhutilRemarkupTableBlockRule' => 'PhutilRemarkupBlockRule',
|
||||||
'PhutilRemarkupTestInterpreterRule' => 'PhutilRemarkupBlockInterpreter',
|
'PhutilRemarkupTestInterpreterRule' => 'PhutilRemarkupBlockInterpreter',
|
||||||
'PhutilRemarkupUnderlineRule' => 'PhutilRemarkupRule',
|
'PhutilRemarkupUnderlineRule' => 'PhutilRemarkupRule',
|
||||||
|
'PhutilSearchQueryCompiler' => 'Phobject',
|
||||||
|
'PhutilSearchQueryCompilerSyntaxException' => 'Exception',
|
||||||
|
'PhutilSearchQueryCompilerTestCase' => 'PhutilTestCase',
|
||||||
|
'PhutilSearchQueryToken' => 'Phobject',
|
||||||
|
'PhutilSearchStemmer' => 'Phobject',
|
||||||
|
'PhutilSearchStemmerTestCase' => 'PhutilTestCase',
|
||||||
'PhutilSlackAuthAdapter' => 'PhutilOAuthAuthAdapter',
|
'PhutilSlackAuthAdapter' => 'PhutilOAuthAuthAdapter',
|
||||||
'PhutilTwitchAuthAdapter' => 'PhutilOAuthAuthAdapter',
|
'PhutilTwitchAuthAdapter' => 'PhutilOAuthAuthAdapter',
|
||||||
'PhutilTwitterAuthAdapter' => 'PhutilOAuth1AuthAdapter',
|
'PhutilTwitterAuthAdapter' => 'PhutilOAuth1AuthAdapter',
|
||||||
|
|
374
src/applications/search/compiler/PhutilSearchQueryCompiler.php
Normal file
374
src/applications/search/compiler/PhutilSearchQueryCompiler.php
Normal file
|
@ -0,0 +1,374 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
/**
 * Tokenizes a user-entered search query and compiles the tokens back into a
 * query string using a configurable operator character set.
 *
 * The operator string is indexed by position: offset 0 is used as the "and"
 * prefix, offset 2 as the "not" prefix, and offsets 10 and 11 as the opening
 * and closing quote characters (see getOperatorPrefix() and quoteToken()).
 */
final class PhutilSearchQueryCompiler
  extends Phobject {

  private $operators = '+ -><()~*:""&|';
  private $stemmer;
  private $enableFunctions = false;

  const OPERATOR_NOT = 'not';
  const OPERATOR_AND = 'and';
  const OPERATOR_SUBSTRING = 'sub';
  const OPERATOR_EXACT = 'exact';

  /**
   * Set the operator character string used when rendering tokens.
   *
   * @param string $operators Operator characters, indexed by position.
   * @return $this
   */
  public function setOperators($operators) {
    $this->operators = $operators;
    return $this;
  }

  public function getOperators() {
    return $this->operators;
  }

  /**
   * Set the stemmer used by compileStemmedQuery().
   *
   * @param PhutilSearchStemmer $stemmer Stemmer to apply to unquoted tokens.
   * @return $this
   */
  public function setStemmer(PhutilSearchStemmer $stemmer) {
    $this->stemmer = $stemmer;
    return $this;
  }

  public function getStemmer() {
    return $this->stemmer;
  }

  /**
   * Enable or disable "function:" prefixes and the "~" and "=" operators
   * in the tokenizer.
   *
   * @param bool $enable_functions True to enable function syntax.
   * @return $this
   */
  public function setEnableFunctions($enable_functions) {
    $this->enableFunctions = $enable_functions;
    return $this;
  }

  public function getEnableFunctions() {
    return $this->enableFunctions;
  }

  /**
   * Compile every token, without stemming.
   *
   * @param list<PhutilSearchQueryToken> $tokens Tokens to compile.
   * @return string|null Compiled query, or null if there are no tokens.
   */
  public function compileQuery(array $tokens) {
    assert_instances_of($tokens, 'PhutilSearchQueryToken');

    $result = array();
    foreach ($tokens as $token) {
      $result[] = $this->renderToken($token);
    }

    return $this->compileRenderedTokens($result);
  }

  /**
   * Compile only the quoted tokens, without stemming.
   *
   * @param list<PhutilSearchQueryToken> $tokens Tokens to compile.
   * @return string|null Compiled query, or null if no token is quoted.
   */
  public function compileLiteralQuery(array $tokens) {
    assert_instances_of($tokens, 'PhutilSearchQueryToken');

    $result = array();
    foreach ($tokens as $token) {
      if (!$token->isQuoted()) {
        continue;
      }
      $result[] = $this->renderToken($token);
    }

    return $this->compileRenderedTokens($result);
  }

  /**
   * Compile only the unquoted tokens, passing each value through the
   * configured stemmer (if one has been set).
   *
   * @param list<PhutilSearchQueryToken> $tokens Tokens to compile.
   * @return string|null Compiled query, or null if no token is unquoted.
   */
  public function compileStemmedQuery(array $tokens) {
    assert_instances_of($tokens, 'PhutilSearchQueryToken');

    $result = array();
    foreach ($tokens as $token) {
      if ($token->isQuoted()) {
        continue;
      }
      $result[] = $this->renderToken($token, $this->getStemmer());
    }

    return $this->compileRenderedTokens($result);
  }

  /**
   * Join rendered tokens with spaces, dropping exact duplicates.
   *
   * @param list<string> $list Rendered tokens.
   * @return string|null Joined query, or null for an empty list.
   */
  private function compileRenderedTokens(array $list) {
    if (!$list) {
      return null;
    }

    $list = array_unique($list);
    return implode(' ', $list);
  }

  /**
   * Tokenize a raw query string into token objects.
   *
   * @param string $query Raw query text.
   * @return list<PhutilSearchQueryToken> Parsed tokens.
   */
  public function newTokens($query) {
    $results = $this->tokenizeQuery($query);

    $tokens = array();
    foreach ($results as $result) {
      $tokens[] = PhutilSearchQueryToken::newFromDictionary($result);
    }

    return $tokens;
  }

  /**
   * Split a raw query into token dictionaries.
   *
   * The query is walked one character at a time through a small state
   * machine whose modes fall through in order within a single iteration:
   * "scan" (skip whitespace), "function" (optional "name:" prefix, only
   * when functions are enabled), "operator" (collect operator characters),
   * "quote" (detect an opening double quote) and "token" (collect the
   * value until whitespace or a double quote).
   *
   * @param string $query Raw query text.
   * @return list<map<string, wild>> Dictionaries with "operator", "quoted"
   *   and "value" keys, plus "function" when functions are enabled.
   *   Tokens with an empty value are omitted.
   */
  private function tokenizeQuery($query) {
    $maximum_bytes = 1024;

    $query_bytes = strlen($query);
    if ($query_bytes > $maximum_bytes) {
      throw new PhutilSearchQueryCompilerSyntaxException(
        pht(
          'Query is too long (%s bytes, maximum is %s bytes).',
          new PhutilNumber($query_bytes),
          new PhutilNumber($maximum_bytes)));
    }

    $query = phutil_utf8v($query);
    $length = count($query);

    $enable_functions = $this->getEnableFunctions();

    $mode = 'scan';
    $current_operator = array();
    $current_token = array();
    $current_function = null;
    $is_quoted = false;
    $tokens = array();

    if ($enable_functions) {
      $operator_characters = '[~=+-]';
    } else {
      $operator_characters = '[+-]';
    }

    for ($ii = 0; $ii < $length; $ii++) {
      $character = $query[$ii];

      if ($mode == 'scan') {
        if (preg_match('/^\s\z/u', $character)) {
          continue;
        }

        $mode = 'function';
      }

      if ($mode == 'function') {
        $mode = 'operator';

        if ($enable_functions) {
          // Look ahead for a run of ASCII letters terminated by ":". Any
          // other terminator means this term has no function prefix.
          $found = false;
          for ($jj = $ii; $jj < $length; $jj++) {
            if (preg_match('/^[a-zA-Z]\z/u', $query[$jj])) {
              continue;
            }
            if ($query[$jj] == ':') {
              $found = $jj;
            }
            break;
          }

          if ($found !== false) {
            $function = array_slice($query, $ii, ($jj - $ii));
            $current_function = implode('', $function);

            if (!strlen($current_function)) {
              $current_function = null;
            }

            // Resume scanning after the ":".
            $ii = $jj;
            continue;
          }
        }
      }

      if ($mode == 'operator') {
        if (preg_match('/^\s\z/u', $character)) {
          continue;
        }

        if (preg_match('/^'.$operator_characters.'\z/', $character)) {
          $current_operator[] = $character;
          continue;
        }

        $mode = 'quote';
      }

      if ($mode == 'quote') {
        if (preg_match('/^"\z/', $character)) {
          $is_quoted = true;
          $mode = 'token';
          continue;
        }

        $mode = 'token';
      }

      if ($mode == 'token') {
        $capture = false;
        $was_quoted = $is_quoted;
        if ($is_quoted) {
          if (preg_match('/^"\z/', $character)) {
            $capture = true;
            $mode = 'scan';
            $is_quoted = false;
          }
        } else {
          if (preg_match('/^\s\z/u', $character)) {
            $capture = true;
            $mode = 'scan';
          }

          // A quote in the middle of an unquoted term ends that term and
          // immediately begins a new quoted term.
          if (preg_match('/^"\z/', $character)) {
            $capture = true;
            $mode = 'token';
            $is_quoted = true;
          }
        }

        if ($capture) {
          $token = array(
            'operator' => $current_operator,
            'quoted' => $was_quoted,
            'value' => $current_token,
          );

          if ($enable_functions) {
            $token['function'] = $current_function;
          }

          $tokens[] = $token;

          $current_operator = array();
          $current_token = array();
          $current_function = null;
          continue;
        } else {
          $current_token[] = $character;
        }
      }
    }

    if ($is_quoted) {
      throw new PhutilSearchQueryCompilerSyntaxException(
        pht(
          'Query contains unmatched double quotes.'));
    }

    if ($mode == 'operator') {
      throw new PhutilSearchQueryCompilerSyntaxException(
        pht(
          'Query contains operator ("%s") with no search term.',
          implode('', $current_operator)));
    }

    // Flush whatever was being collected when the input ended. This may be
    // an empty token; empty values are discarded below.
    $token = array(
      'operator' => $current_operator,
      'quoted' => false,
      'value' => $current_token,
    );

    if ($enable_functions) {
      $token['function'] = $current_function;
    }

    $tokens[] = $token;

    $results = array();
    foreach ($tokens as $token) {
      $value = implode('', $token['value']);
      $operator_string = implode('', $token['operator']);

      if (!strlen($value)) {
        continue;
      }

      $is_quoted = $token['quoted'];

      switch ($operator_string) {
        case '-':
          $operator = self::OPERATOR_NOT;
          break;
        case '~':
          $operator = self::OPERATOR_SUBSTRING;
          break;
        case '=':
          $operator = self::OPERATOR_EXACT;
          break;
        case '+':
          $operator = self::OPERATOR_AND;
          break;
        case '':
          // See T12995. If this query term contains Chinese, Japanese or
          // Korean characters, treat the term as a substring term by default.
          // These languages do not separate words with spaces, so the term
          // search mode is normally useless.
          if ($enable_functions && !$is_quoted && phutil_utf8_is_cjk($value)) {
            $operator = self::OPERATOR_SUBSTRING;
          } else {
            $operator = self::OPERATOR_AND;
          }
          break;
        default:
          throw new PhutilSearchQueryCompilerSyntaxException(
            pht(
              'Query has an invalid sequence of operators ("%s").',
              $operator_string));
      }

      $result = array(
        'operator' => $operator,
        'quoted' => $is_quoted,
        'value' => $value,
      );

      if ($enable_functions) {
        $result['function'] = $token['function'];
      }

      $results[] = $result;
    }

    return $results;
  }

  /**
   * Render one token as "<prefix><open quote><value><close quote>".
   *
   * @param PhutilSearchQueryToken $token Token to render.
   * @param PhutilSearchStemmer|null $stemmer Optional stemmer applied to
   *   the token value before quoting.
   * @return string Rendered token.
   */
  private function renderToken(
    PhutilSearchQueryToken $token,
    PhutilSearchStemmer $stemmer = null) {
    $value = $token->getValue();

    if ($stemmer) {
      $value = $stemmer->stemToken($value);
    }

    $value = $this->quoteToken($value);
    $operator = $token->getOperator();
    $prefix = $this->getOperatorPrefix($operator);

    $value = $prefix.$value;

    return $value;
  }

  /**
   * Look up the prefix character for an operator constant.
   *
   * Only OPERATOR_AND and OPERATOR_NOT can be rendered; any other operator
   * raises a syntax exception.
   *
   * @param string $operator One of the OPERATOR_* constants.
   * @return string|null Prefix character, or null when the configured
   *   character is a space.
   */
  private function getOperatorPrefix($operator) {
    $operators = $this->operators;

    switch ($operator) {
      case self::OPERATOR_AND:
        $prefix = $operators[0];
        break;
      case self::OPERATOR_NOT:
        $prefix = $operators[2];
        break;
      default:
        throw new PhutilSearchQueryCompilerSyntaxException(
          pht(
            'Unsupported operator prefix "%s".',
            $operator));
    }

    if ($prefix == ' ') {
      $prefix = null;
    }

    return $prefix;
  }

  /**
   * Wrap a value in the configured opening and closing quote characters
   * (operator string offsets 10 and 11).
   *
   * @param string $value Token value.
   * @return string Quoted value.
   */
  private function quoteToken($value) {
    $open_quote = $this->operators[10];
    $close_quote = $this->operators[11];

    return $open_quote.$value.$close_quote;
  }

}
|
|
@ -0,0 +1,4 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
/**
 * Thrown by the search query compiler when a query cannot be tokenized or
 * rendered.
 */
final class PhutilSearchQueryCompilerSyntaxException extends Exception {
}
|
37
src/applications/search/compiler/PhutilSearchQueryToken.php
Normal file
37
src/applications/search/compiler/PhutilSearchQueryToken.php
Normal file
|
@ -0,0 +1,37 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
/**
 * Value object describing one parsed term of a search query: its text, the
 * operator applied to it, whether it was quoted, and an optional function
 * name.
 */
final class PhutilSearchQueryToken
  extends Phobject {

  private $isQuoted;
  private $value;
  private $operator;
  private $function;

  /**
   * Build a token from a tokenizer dictionary.
   *
   * @param map<string, wild> $dictionary Must contain "quoted", "operator"
   *   and "value"; "function" is optional.
   * @return PhutilSearchQueryToken New token.
   */
  public static function newFromDictionary(array $dictionary) {
    $instance = new self();

    $instance->value = $dictionary['value'];
    $instance->operator = $dictionary['operator'];
    $instance->isQuoted = $dictionary['quoted'];
    // idx() is used here so a dictionary without a "function" key is accepted.
    $instance->function = idx($dictionary, 'function');

    return $instance;
  }

  /** @return bool Whether the term was quoted in the query. */
  public function isQuoted() {
    return $this->isQuoted;
  }

  /** @return string Text of the term. */
  public function getValue() {
    return $this->value;
  }

  /** @return string Operator stored for this term. */
  public function getOperator() {
    return $this->operator;
  }

  /** @return string|null Function name, if the dictionary supplied one. */
  public function getFunction() {
    return $this->function;
  }

}
|
74
src/applications/search/compiler/PhutilSearchStemmer.php
Normal file
74
src/applications/search/compiler/PhutilSearchStemmer.php
Normal file
|
@ -0,0 +1,74 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
/**
 * Reduces search tokens and document text to word stems using the bundled
 * Porter stemmer, so that related word forms can match each other.
 */
final class PhutilSearchStemmer
  extends Phobject {

  /**
   * Stem a single token.
   *
   * @param string Raw token.
   * @return string Normalized token, stemmed where stemming applies.
   */
  public function stemToken($token) {
    $normalized = $this->normalizeToken($token);
    return $this->applyStemmer($normalized);
  }

  /**
   * Stem every distinct word of a block of text.
   *
   * The text is normalized and split on runs of characters other than ASCII
   * letters, digits, bytes 0x7F-0xFF, "." and "_". Leading and trailing "."
   * and "_" are trimmed from each piece, and pieces shorter than three bytes
   * are dropped. Each distinct remaining word is stemmed once, in order of
   * first appearance.
   *
   * @param string Document text.
   * @return string Space-separated stems.
   */
  public function stemCorpus($corpus) {
    $text = $this->normalizeCorpus($corpus);
    $pieces = preg_split('/[^a-zA-Z0-9\x7F-\xFF._]+/', $text);

    // Keyed by word so that repeated words are only collected once.
    $unique = array();
    foreach ($pieces as $piece) {
      $word = trim($piece, '._');

      if (strlen($word) >= 3) {
        $unique[$word] = $word;
      }
    }

    $stems = array();
    foreach ($unique as $word) {
      $stems[] = $this->applyStemmer($word);
    }

    return implode(' ', $stems);
  }

  /**
   * Normalize a token with phutil_utf8_strtolower().
   */
  private function normalizeToken($token) {
    return phutil_utf8_strtolower($token);
  }

  /**
   * Normalize a corpus with phutil_utf8_strtolower().
   */
  private function normalizeCorpus($corpus) {
    return phutil_utf8_strtolower($corpus);
  }

  /**
   * Stem an already-normalized token.
   *
   * @param string Normalized token.
   * @return string The stem, or the unmodified token when the token is left
   *   literal.
   *
   * @phutil-external-symbol class Porter
   */
  private function applyStemmer($normalized_token) {
    // Tokens with internal punctuation are kept literally. These are
    // usually domain names, Conduit API methods, and similar informal
    // tokens rather than English words.
    if (preg_match('/[._]/', $normalized_token)) {
      return $normalized_token;
    }

    // Load the external stemmer the first time it is actually needed.
    static $loaded;
    if ($loaded === null) {
      $root = dirname(phutil_get_library_root('phabricator'));
      require_once $root.'/externals/porter-stemmer/src/Porter.php';
      $loaded = true;
    }

    $stem = Porter::stem($normalized_token);

    // A stem shorter than three bytes won't be a candidate for indexing, and
    // such tokens are likely acronyms (like "DNS") rather than real English
    // words, so the original token is kept instead.
    return (strlen($stem) < 3)
      ? $normalized_token
      : $stem;
  }

}
|
|
@ -0,0 +1,220 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
/**
 * Tests for PhutilSearchQueryCompiler: plain compilation, compilation with
 * a stemmer, and tokenization with function prefixes enabled.
 */
final class PhutilSearchQueryCompilerTestCase
  extends PhutilTestCase {

  /**
   * Compile queries with the default operators and with two alternate
   * operator sets. An expectation of `false` means the query must be
   * rejected with a syntax exception.
   */
  public function testCompileQueries() {
    $default_cases = array(
      '' => null,
      'cat dog' => '+"cat" +"dog"',
      'cat -dog' => '+"cat" -"dog"',
      'cat-dog' => '+"cat-dog"',

      // If there are spaces after an operator, the operator applies to the
      // next search term.
      'cat - dog' => '+"cat" -"dog"',

      // Double quotes serve as delimiters even if there is no whitespace
      // between terms.
      '"cat"dog' => '+"cat" +"dog"',

      // This query is too long.
      str_repeat('x', 2048) => false,

      // Multiple operators are not permitted.
      '++cat' => false,
      '+-cat' => false,
      '--cat' => false,

      // Stray operators are not permitted.
      '+' => false,
      'cat +' => false,

      // Double quotes must be paired.
      '"' => false,
      'cat "' => false,
      '"cat' => false,
      'A"' => false,
      'A"B"' => '+"A" +"B"',
    );

    $this->assertCompileQueries($default_cases);

    // Test that we compile queries correctly if the operators have been
    // swapped to use "AND" by default.
    $and_by_default_cases = array(
      'cat dog' => '"cat" "dog"',
      'cat -dog' => '"cat" -"dog"',
    );
    $this->assertCompileQueries($and_by_default_cases, ' |-><()~*:""&\'');

    // Test that we compile queries correctly if the quote operators have
    // been swapped to differ.
    $bracket_quote_cases = array(
      'cat dog' => '+[cat] +[dog]',
      'cat -dog' => '+[cat] -[dog]',
    );
    $this->assertCompileQueries($bracket_quote_cases, '+ -><()~*:[]&|');
  }

  /**
   * Compile queries with a stemmer attached. Each expectation is a pair of
   * the literal query followed by the stemmed query.
   */
  public function testCompileQueriesWithStemming() {
    $cases = array(
      'cat dog' => array(
        null,
        '+"cat" +"dog"',
      ),
      'cats dogs' => array(
        null,
        '+"cat" +"dog"',
      ),
      'cats "dogs"' => array(
        '+"dogs"',
        '+"cat"',
      ),
      '"blessed blade" of the windseeker' => array(
        '+"blessed blade"',
        '+"of" +"the" +"windseek"',
      ),
      'mailing users for mentions on tasks' => array(
        null,
        '+"mail" +"user" +"for" +"mention" +"on" +"task"',
      ),
    );

    $this->assertCompileQueries($cases, null, new PhutilSearchStemmer());
  }

  /**
   * Tokenize queries with function support enabled. Each expectation lists
   * the (function, operator, value) triple of every token produced.
   */
  public function testCompileQueriesWithFunctions() {
    $op_and = PhutilSearchQueryCompiler::OPERATOR_AND;
    $op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING;
    $op_exact = PhutilSearchQueryCompiler::OPERATOR_EXACT;

    // A single three-byte UTF-8 character, used as a non-ASCII term.
    $mao = "\xE7\x8C\xAB";

    $cases = array(
      'cat' => array(
        array(null, $op_and, 'cat'),
      ),
      ':cat' => array(
        array(null, $op_and, 'cat'),
      ),
      'title:cat' => array(
        array('title', $op_and, 'cat'),
      ),
      'title:cat:dog' => array(
        array('title', $op_and, 'cat:dog'),
      ),
      'title:~cat' => array(
        array('title', $op_sub, 'cat'),
      ),
      'cat title:="Meow Meow"' => array(
        array(null, $op_and, 'cat'),
        array('title', $op_exact, 'Meow Meow'),
      ),
      'title:cat title:dog' => array(
        array('title', $op_and, 'cat'),
        array('title', $op_and, 'dog'),
      ),
      '~"core and seven years ag"' => array(
        array(null, $op_sub, 'core and seven years ag'),
      ),
      $mao => array(
        array(null, $op_sub, $mao),
      ),
      '+'.$mao => array(
        array(null, $op_and, $mao),
      ),
      '~'.$mao => array(
        array(null, $op_sub, $mao),
      ),
      '"'.$mao.'"' => array(
        array(null, $op_and, $mao),
      ),
    );

    $this->assertCompileFunctionQueries($cases);
  }

  /**
   * Compile every input and compare the result with its expectation.
   *
   * Without a stemmer the expectation is the compiled query. With a stemmer
   * it is a (literal query, stemmed query) pair. In both modes a syntax
   * exception is reported as `false`.
   *
   * @param map<string, wild> Map from query text to expected result.
   * @param string|null Optional operator string handed to setOperators().
   * @param PhutilSearchStemmer|null Optional stemmer handed to setStemmer().
   * @return void
   */
  private function assertCompileQueries(
    array $tests,
    $operators = null,
    PhutilSearchStemmer $stemmer = null) {

    foreach ($tests as $input => $expect) {
      $plain = null;
      $literal = null;
      $stemmed = null;
      $rejected = false;

      try {
        $compiler = new PhutilSearchQueryCompiler();

        if ($operators !== null) {
          $compiler->setOperators($operators);
        }

        if ($stemmer !== null) {
          $compiler->setStemmer($stemmer);
        }

        $tokens = $compiler->newTokens($input);

        if ($stemmer) {
          $literal = $compiler->compileLiteralQuery($tokens);
          $stemmed = $compiler->compileStemmedQuery($tokens);
        } else {
          $plain = $compiler->compileQuery($tokens);
        }
      } catch (PhutilSearchQueryCompilerSyntaxException $ex) {
        $rejected = true;
      }

      // A rejected query is reported as `false`, whichever stage threw.
      if ($rejected) {
        $plain = false;
        $literal = false;
        $stemmed = false;
      }

      if ($stemmer) {
        if ($literal === false) {
          $actual = false;
        } else {
          $actual = array($literal, $stemmed);
        }

        $this->assertEqual(
          $expect,
          $actual,
          pht('Stemmed compilation of query: %s', $input));
      } else {
        $this->assertEqual(
          $expect,
          $plain,
          pht('Compilation of query: %s', $input));
      }
    }
  }

  /**
   * Tokenize every input with functions enabled and compare the
   * (function, operator, value) triples of the tokens with the expectation.
   *
   * @param map<string, list<list<wild>>> Map from query text to the expected
   *   token triples.
   * @return void
   */
  private function assertCompileFunctionQueries(array $tests) {
    foreach ($tests as $input => $expect) {
      $compiler = id(new PhutilSearchQueryCompiler())
        ->setEnableFunctions(true);

      $actual = array();
      foreach ($compiler->newTokens($input) as $parsed) {
        $actual[] = array(
          $parsed->getFunction(),
          $parsed->getOperator(),
          $parsed->getValue(),
        );
      }

      $this->assertEqual(
        $expect,
        $actual,
        pht('Function compilation of query: %s', $input));
    }
  }

}
|
|
@ -0,0 +1,85 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
/**
 * Tests for PhutilSearchStemmer: stemming of individual tokens and of whole
 * documents.
 */
final class PhutilSearchStemmerTestCase
  extends PhutilTestCase {

  /**
   * Check the stem produced by stemToken() for individual tokens.
   */
  public function testStemTokens() {
    $cases = array(
      // Various real-world cases collected from users before we implemented
      // stemming.
      'tokens' => 'token',
      'panels' => 'panel',

      'renames' => 'renam',
      'rename' => 'renam',

      'components' => 'compon',
      'component' => 'compon',

      'implementation' => 'implement',
      'implements' => 'implement',
      'implementing' => 'implement',
      'implementer' => 'implement',

      'deleting' => 'delet',
      'deletion' => 'delet',
      'delete' => 'delet',

      'erratically' => 'errat',
      'erratic' => 'errat',

      // Stems should be normalized.
      'DOG' => 'dog',

      // If stemming would bring a token under 3 characters, it should not
      // be stemmed.
      'dns' => 'dns',
      'nis' => 'nis',

      // Complex tokens with internal punctuation should be left untouched;
      // these are usually things like domain names, API calls, informal tags,
      // etc.
      'apples' => 'appl',
      'bananas' => 'banana',
      'apples_bananas' => 'apples_bananas',
      'apples_bananas.apples_bananas' => 'apples_bananas.apples_bananas',
    );

    $stemmer = new PhutilSearchStemmer();
    foreach ($cases as $input => $expect) {
      $this->assertEqual(
        $expect,
        $stemmer->stemToken($input),
        pht('Token stem of "%s".', $input));
    }
  }

  /**
   * Check the space-separated stems produced by stemCorpus() for short
   * documents.
   */
  public function testStemDocuments() {
    $cases = array(
      'The wild boar meandered erratically.' =>
        'the wild boar meander errat',
      'Fool me onc, shame on you. Fool me twice, shame on me.' =>
        'fool onc shame you twice',
      'Fireball is a seventh-level spell which deals 2d16 points of damage '.
      'in a 1-meter radius around a target.' =>
        'firebal seventh level spell which deal 2d16 point damag meter '.
        'radiu around target',
      'apples-bananas' => 'appl banana',
      'apples_bananas' => 'apples_bananas',
      'apples.bananas' => 'apples.bananas',
      'oddly-proportioned' => 'oddli proport',
    );

    $stemmer = new PhutilSearchStemmer();
    foreach ($cases as $input => $expect) {
      $this->assertEqual(
        $expect,
        $stemmer->stemCorpus($input),
        pht('Corpus stem of: %s', $input));
    }
  }

}
|
Loading…
Reference in a new issue