mirror of
https://we.phorge.it/source/phorge.git
synced 2024-11-29 10:12:41 +01:00
Improve search highlighting for CJK and substring queries
Summary: Fixes T12995. Currently, the result highlighter (which shows //where// terms matched) only works in "term" mode, not in "substring" mode. Provide better feedback and behvaior: - When a term is a substring term, color it a little differently and add a tooltip. (This is partly to make it easier to debug/diagnose things, probably not enormously valuable to users.) - When a term is a substring term, highlight it anywhere in the results. Test Plan: Queried for latin and CJK terms. Here is CJK being highlighted: {F5192195} Here is substring vs non-substring implicit behavior: {F5192196} Here's ONLY terms being highlighted: {F5192198} Here's terms and substrings, since the query now has a substring: {F5192201} Reviewers: amckinley Reviewed By: amckinley Maniphest Tasks: T12995 Differential Revision: https://secure.phabricator.com/D18635
This commit is contained in:
parent
36df39761e
commit
1ac52c09e7
3 changed files with 102 additions and 56 deletions
|
@ -56,6 +56,10 @@ final class PhabricatorFulltextToken extends Phobject {
|
|||
$shade = PHUITagView::COLOR_RED;
|
||||
$icon = 'fa-minus';
|
||||
break;
|
||||
case PhutilSearchQueryCompiler::OPERATOR_SUBSTRING:
|
||||
$tip = pht('Substring Search');
|
||||
$shade = PHUITagView::COLOR_VIOLET;
|
||||
break;
|
||||
default:
|
||||
$shade = PHUITagView::COLOR_BLUE;
|
||||
break;
|
||||
|
|
|
@ -261,7 +261,7 @@ final class PhabricatorSearchApplicationSearchEngine
|
|||
foreach ($results as $phid => $handle) {
|
||||
$view = id(new PhabricatorSearchResultView())
|
||||
->setHandle($handle)
|
||||
->setQuery($query)
|
||||
->setTokens($fulltext_tokens)
|
||||
->setObject(idx($objects, $phid))
|
||||
->render();
|
||||
$list->addItem($view);
|
||||
|
|
|
@ -3,16 +3,17 @@
|
|||
final class PhabricatorSearchResultView extends AphrontView {
|
||||
|
||||
private $handle;
|
||||
private $query;
|
||||
private $object;
|
||||
private $tokens;
|
||||
|
||||
public function setHandle(PhabricatorObjectHandle $handle) {
|
||||
$this->handle = $handle;
|
||||
return $this;
|
||||
}
|
||||
|
||||
public function setQuery(PhabricatorSavedQuery $query) {
|
||||
$this->query = $query;
|
||||
public function setTokens(array $tokens) {
|
||||
assert_instances_of($tokens, 'PhabricatorFulltextToken');
|
||||
$this->tokens = $tokens;
|
||||
return $this;
|
||||
}
|
||||
|
||||
|
@ -56,88 +57,129 @@ final class PhabricatorSearchResultView extends AphrontView {
|
|||
* matched their query.
|
||||
*/
|
||||
private function emboldenQuery($str) {
|
||||
$query = $this->query->getParameter('query');
|
||||
$tokens = $this->tokens;
|
||||
|
||||
if (!strlen($query) || !strlen($str)) {
|
||||
if (!$tokens) {
|
||||
return $str;
|
||||
}
|
||||
|
||||
// This algorithm is safe but not especially fast, so don't bother if
|
||||
// we're dealing with a lot of data. This mostly prevents silly/malicious
|
||||
// queries from doing anything bad.
|
||||
if (strlen($query) + strlen($str) > 2048) {
|
||||
if (count($tokens) > 16) {
|
||||
return $str;
|
||||
}
|
||||
|
||||
// Keep track of which characters we're going to make bold. This is
|
||||
// byte oriented, but we'll make sure we don't put a bold in the middle
|
||||
// of a character later.
|
||||
$bold = array_fill(0, strlen($str), false);
|
||||
|
||||
// Split the query into words.
|
||||
$parts = preg_split('/ +/', $query);
|
||||
|
||||
// Find all occurrences of each word, and mark them to be emboldened.
|
||||
foreach ($parts as $part) {
|
||||
$part = trim($part);
|
||||
$part = trim($part, '"+');
|
||||
if (!strlen($part)) {
|
||||
continue;
|
||||
if (!strlen($str)) {
|
||||
return $str;
|
||||
}
|
||||
|
||||
if (strlen($str) > 2048) {
|
||||
return $str;
|
||||
}
|
||||
|
||||
$patterns = array();
|
||||
foreach ($tokens as $token) {
|
||||
$raw_token = $token->getToken();
|
||||
$operator = $raw_token->getOperator();
|
||||
|
||||
$value = $raw_token->getValue();
|
||||
|
||||
switch ($operator) {
|
||||
case PhutilSearchQueryCompiler::OPERATOR_SUBSTRING:
|
||||
$patterns[] = '(('.preg_quote($value).'))ui';
|
||||
break;
|
||||
case PhutilSearchQueryCompiler::OPERATOR_AND:
|
||||
$patterns[] = '((?<=\W|^)('.preg_quote($value).')(?=\W|\z))ui';
|
||||
break;
|
||||
default:
|
||||
// Don't highlight anything else, particularly "NOT".
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Find all matches for all query terms in the document title, then reduce
|
||||
// them to a map from offsets to highlighted sequence lengths. If two terms
|
||||
// match at the same position, we choose the longer one.
|
||||
$all_matches = array();
|
||||
foreach ($patterns as $pattern) {
|
||||
$matches = null;
|
||||
$has_matches = preg_match_all(
|
||||
'/(?:^|\b)('.preg_quote($part, '/').')/i',
|
||||
$ok = preg_match_all(
|
||||
$pattern,
|
||||
$str,
|
||||
$matches,
|
||||
PREG_OFFSET_CAPTURE);
|
||||
|
||||
if (!$has_matches) {
|
||||
if (!$ok) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Flag the matching part of the range for boldening.
|
||||
foreach ($matches[1] as $match) {
|
||||
$offset = $match[1];
|
||||
for ($ii = 0; $ii < strlen($match[0]); $ii++) {
|
||||
$bold[$offset + $ii] = true;
|
||||
$match_text = $match[0];
|
||||
$match_offset = $match[1];
|
||||
|
||||
if (!isset($all_matches[$match_offset])) {
|
||||
$all_matches[$match_offset] = 0;
|
||||
}
|
||||
|
||||
$all_matches[$match_offset] = max(
|
||||
$all_matches[$match_offset],
|
||||
strlen($match_text));
|
||||
}
|
||||
}
|
||||
|
||||
// Split the string into ranges, applying bold styling as required.
|
||||
$out = array();
|
||||
$buf = '';
|
||||
$pos = 0;
|
||||
$is_bold = false;
|
||||
// Go through the string one display glyph at a time. If a glyph starts
|
||||
// on a highlighted byte position, turn on highlighting for the nubmer
|
||||
// of matching bytes. If a query searches for "e" and the document contains
|
||||
// an "e" followed by a bunch of combining marks, this will correctly
|
||||
// highlight the entire glyph.
|
||||
$parts = array();
|
||||
$highlight = 0;
|
||||
$offset = 0;
|
||||
foreach (phutil_utf8v_combined($str) as $character) {
|
||||
$length = strlen($character);
|
||||
|
||||
// Make sure this is UTF8 because phutil_utf8v() will explode if it isn't.
|
||||
$str = phutil_utf8ize($str);
|
||||
foreach (phutil_utf8v($str) as $chr) {
|
||||
if ($bold[$pos] != $is_bold) {
|
||||
if (strlen($buf)) {
|
||||
if ($is_bold) {
|
||||
$out[] = phutil_tag('strong', array(), $buf);
|
||||
if (isset($all_matches[$offset])) {
|
||||
$highlight = $all_matches[$offset];
|
||||
}
|
||||
|
||||
if ($highlight > 0) {
|
||||
$is_highlighted = true;
|
||||
$highlight -= $length;
|
||||
} else {
|
||||
$out[] = $buf;
|
||||
}
|
||||
$buf = '';
|
||||
}
|
||||
$is_bold = !$is_bold;
|
||||
}
|
||||
$buf .= $chr;
|
||||
$pos += strlen($chr);
|
||||
$is_highlighted = false;
|
||||
}
|
||||
|
||||
if (strlen($buf)) {
|
||||
if ($is_bold) {
|
||||
$out[] = phutil_tag('strong', array(), $buf);
|
||||
$parts[] = array(
|
||||
'text' => $character,
|
||||
'highlighted' => $is_highlighted,
|
||||
);
|
||||
|
||||
$offset += $length;
|
||||
}
|
||||
|
||||
// Combine all the sequences together so we aren't emitting a tag around
|
||||
// every individual character.
|
||||
$last = null;
|
||||
foreach ($parts as $key => $part) {
|
||||
if ($last !== null) {
|
||||
if ($part['highlighted'] == $parts[$last]['highlighted']) {
|
||||
$parts[$last]['text'] .= $part['text'];
|
||||
unset($parts[$key]);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
$last = $key;
|
||||
}
|
||||
|
||||
// Finally, add tags.
|
||||
$result = array();
|
||||
foreach ($parts as $part) {
|
||||
if ($part['highlighted']) {
|
||||
$result[] = phutil_tag('strong', array(), $part['text']);
|
||||
} else {
|
||||
$out[] = $buf;
|
||||
$result[] = $part['text'];
|
||||
}
|
||||
}
|
||||
|
||||
return $out;
|
||||
return $result;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue