1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2024-11-25 16:22:43 +01:00

Improve search highlighting for CJK and substring queries

Summary:
Fixes T12995. Currently, the result highlighter (which shows //where// terms matched) only works in "term" mode, not in "substring" mode.

Provide better feedback and behavior:

  - When a term is a substring term, color it a little differently and add a tooltip. (This is partly to make it easier to debug/diagnose things, probably not enormously valuable to users.)
  - When a term is a substring term, highlight it anywhere in the results.

Test Plan:
Queried for latin and CJK terms.

Here is CJK being highlighted:

{F5192195}

Here is substring vs non-substring implicit behavior:

{F5192196}

Here's ONLY terms being highlighted:

{F5192198}

Here's terms and substrings, since the query now has a substring:

{F5192201}

Reviewers: amckinley

Reviewed By: amckinley

Maniphest Tasks: T12995

Differential Revision: https://secure.phabricator.com/D18635
This commit is contained in:
epriestley 2017-09-22 08:10:12 -07:00
parent 36df39761e
commit 1ac52c09e7
3 changed files with 102 additions and 56 deletions

View file

@ -56,6 +56,10 @@ final class PhabricatorFulltextToken extends Phobject {
$shade = PHUITagView::COLOR_RED; $shade = PHUITagView::COLOR_RED;
$icon = 'fa-minus'; $icon = 'fa-minus';
break; break;
case PhutilSearchQueryCompiler::OPERATOR_SUBSTRING:
$tip = pht('Substring Search');
$shade = PHUITagView::COLOR_VIOLET;
break;
default: default:
$shade = PHUITagView::COLOR_BLUE; $shade = PHUITagView::COLOR_BLUE;
break; break;

View file

@ -261,7 +261,7 @@ final class PhabricatorSearchApplicationSearchEngine
foreach ($results as $phid => $handle) { foreach ($results as $phid => $handle) {
$view = id(new PhabricatorSearchResultView()) $view = id(new PhabricatorSearchResultView())
->setHandle($handle) ->setHandle($handle)
->setQuery($query) ->setTokens($fulltext_tokens)
->setObject(idx($objects, $phid)) ->setObject(idx($objects, $phid))
->render(); ->render();
$list->addItem($view); $list->addItem($view);

View file

@ -3,16 +3,17 @@
final class PhabricatorSearchResultView extends AphrontView { final class PhabricatorSearchResultView extends AphrontView {
private $handle; private $handle;
private $query;
private $object; private $object;
private $tokens;
public function setHandle(PhabricatorObjectHandle $handle) { public function setHandle(PhabricatorObjectHandle $handle) {
$this->handle = $handle; $this->handle = $handle;
return $this; return $this;
} }
public function setQuery(PhabricatorSavedQuery $query) { public function setTokens(array $tokens) {
$this->query = $query; assert_instances_of($tokens, 'PhabricatorFulltextToken');
$this->tokens = $tokens;
return $this; return $this;
} }
@ -56,88 +57,129 @@ final class PhabricatorSearchResultView extends AphrontView {
* matched their query. * matched their query.
*/ */
private function emboldenQuery($str) { private function emboldenQuery($str) {
$query = $this->query->getParameter('query'); $tokens = $this->tokens;
if (!strlen($query) || !strlen($str)) { if (!$tokens) {
return $str; return $str;
} }
// This algorithm is safe but not especially fast, so don't bother if if (count($tokens) > 16) {
// we're dealing with a lot of data. This mostly prevents silly/malicious
// queries from doing anything bad.
if (strlen($query) + strlen($str) > 2048) {
return $str; return $str;
} }
// Keep track of which characters we're going to make bold. This is if (!strlen($str)) {
// byte oriented, but we'll make sure we don't put a bold in the middle return $str;
// of a character later. }
$bold = array_fill(0, strlen($str), false);
// Split the query into words. if (strlen($str) > 2048) {
$parts = preg_split('/ +/', $query); return $str;
}
// Find all occurrences of each word, and mark them to be emboldened. $patterns = array();
foreach ($parts as $part) { foreach ($tokens as $token) {
$part = trim($part); $raw_token = $token->getToken();
$part = trim($part, '"+'); $operator = $raw_token->getOperator();
if (!strlen($part)) {
continue; $value = $raw_token->getValue();
switch ($operator) {
case PhutilSearchQueryCompiler::OPERATOR_SUBSTRING:
$patterns[] = '(('.preg_quote($value).'))ui';
break;
case PhutilSearchQueryCompiler::OPERATOR_AND:
$patterns[] = '((?<=\W|^)('.preg_quote($value).')(?=\W|\z))ui';
break;
default:
// Don't highlight anything else, particularly "NOT".
break;
} }
}
// Find all matches for all query terms in the document title, then reduce
// them to a map from offsets to highlighted sequence lengths. If two terms
// match at the same position, we choose the longer one.
$all_matches = array();
foreach ($patterns as $pattern) {
$matches = null; $matches = null;
$has_matches = preg_match_all( $ok = preg_match_all(
'/(?:^|\b)('.preg_quote($part, '/').')/i', $pattern,
$str, $str,
$matches, $matches,
PREG_OFFSET_CAPTURE); PREG_OFFSET_CAPTURE);
if (!$ok) {
if (!$has_matches) {
continue; continue;
} }
// Flag the matching part of the range for boldening.
foreach ($matches[1] as $match) { foreach ($matches[1] as $match) {
$offset = $match[1]; $match_text = $match[0];
for ($ii = 0; $ii < strlen($match[0]); $ii++) { $match_offset = $match[1];
$bold[$offset + $ii] = true;
if (!isset($all_matches[$match_offset])) {
$all_matches[$match_offset] = 0;
} }
$all_matches[$match_offset] = max(
$all_matches[$match_offset],
strlen($match_text));
} }
} }
// Split the string into ranges, applying bold styling as required. // Go through the string one display glyph at a time. If a glyph starts
$out = array(); // on a highlighted byte position, turn on highlighting for the number
$buf = ''; // of matching bytes. If a query searches for "e" and the document contains
$pos = 0; // an "e" followed by a bunch of combining marks, this will correctly
$is_bold = false; // highlight the entire glyph.
$parts = array();
$highlight = 0;
$offset = 0;
foreach (phutil_utf8v_combined($str) as $character) {
$length = strlen($character);
// Make sure this is UTF8 because phutil_utf8v() will explode if it isn't. if (isset($all_matches[$offset])) {
$str = phutil_utf8ize($str); $highlight = $all_matches[$offset];
foreach (phutil_utf8v($str) as $chr) {
if ($bold[$pos] != $is_bold) {
if (strlen($buf)) {
if ($is_bold) {
$out[] = phutil_tag('strong', array(), $buf);
} else {
$out[] = $buf;
}
$buf = '';
}
$is_bold = !$is_bold;
} }
$buf .= $chr;
$pos += strlen($chr);
}
if (strlen($buf)) { if ($highlight > 0) {
if ($is_bold) { $is_highlighted = true;
$out[] = phutil_tag('strong', array(), $buf); $highlight -= $length;
} else { } else {
$out[] = $buf; $is_highlighted = false;
}
$parts[] = array(
'text' => $character,
'highlighted' => $is_highlighted,
);
$offset += $length;
}
// Combine all the sequences together so we aren't emitting a tag around
// every individual character.
$last = null;
foreach ($parts as $key => $part) {
if ($last !== null) {
if ($part['highlighted'] == $parts[$last]['highlighted']) {
$parts[$last]['text'] .= $part['text'];
unset($parts[$key]);
continue;
}
}
$last = $key;
}
// Finally, add tags.
$result = array();
foreach ($parts as $part) {
if ($part['highlighted']) {
$result[] = phutil_tag('strong', array(), $part['text']);
} else {
$result[] = $part['text'];
} }
} }
return $out; return $result;
} }
} }