1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2024-11-29 10:12:41 +01:00

Improve search highlighting for CJK and substring queries

Summary:
Fixes T12995. Currently, the result highlighter (which shows //where// terms matched) only works in "term" mode, not in "substring" mode.

Provide better feedback and behvaior:

  - When a term is a substring term, color it a little differently and add a tooltip. (This is partly to make it easier to debug/diagnose things, probably not enormously valuable to users.)
  - When a term is a substring term, highlight it anywhere in the results.

Test Plan:
Queried for latin and CJK terms.

Here is CJK being highlighted:

{F5192195}

Here is substring vs non-substring implicit behavior:

{F5192196}

Here's ONLY terms being highlighted:

{F5192198}

Here's terms and substrings, since the query now has a substring:

{F5192201}

Reviewers: amckinley

Reviewed By: amckinley

Maniphest Tasks: T12995

Differential Revision: https://secure.phabricator.com/D18635
This commit is contained in:
epriestley 2017-09-22 08:10:12 -07:00
parent 36df39761e
commit 1ac52c09e7
3 changed files with 102 additions and 56 deletions

View file

@ -56,6 +56,10 @@ final class PhabricatorFulltextToken extends Phobject {
$shade = PHUITagView::COLOR_RED;
$icon = 'fa-minus';
break;
case PhutilSearchQueryCompiler::OPERATOR_SUBSTRING:
$tip = pht('Substring Search');
$shade = PHUITagView::COLOR_VIOLET;
break;
default:
$shade = PHUITagView::COLOR_BLUE;
break;

View file

@ -261,7 +261,7 @@ final class PhabricatorSearchApplicationSearchEngine
foreach ($results as $phid => $handle) {
$view = id(new PhabricatorSearchResultView())
->setHandle($handle)
->setQuery($query)
->setTokens($fulltext_tokens)
->setObject(idx($objects, $phid))
->render();
$list->addItem($view);

View file

@ -3,16 +3,17 @@
final class PhabricatorSearchResultView extends AphrontView {
private $handle;
private $query;
private $object;
private $tokens;
public function setHandle(PhabricatorObjectHandle $handle) {
$this->handle = $handle;
return $this;
}
public function setQuery(PhabricatorSavedQuery $query) {
$this->query = $query;
public function setTokens(array $tokens) {
assert_instances_of($tokens, 'PhabricatorFulltextToken');
$this->tokens = $tokens;
return $this;
}
@ -56,88 +57,129 @@ final class PhabricatorSearchResultView extends AphrontView {
* matched their query.
*/
private function emboldenQuery($str) {
$query = $this->query->getParameter('query');
$tokens = $this->tokens;
if (!strlen($query) || !strlen($str)) {
if (!$tokens) {
return $str;
}
// This algorithm is safe but not especially fast, so don't bother if
// we're dealing with a lot of data. This mostly prevents silly/malicious
// queries from doing anything bad.
if (strlen($query) + strlen($str) > 2048) {
if (count($tokens) > 16) {
return $str;
}
// Keep track of which characters we're going to make bold. This is
// byte oriented, but we'll make sure we don't put a bold in the middle
// of a character later.
$bold = array_fill(0, strlen($str), false);
// Split the query into words.
$parts = preg_split('/ +/', $query);
// Find all occurrences of each word, and mark them to be emboldened.
foreach ($parts as $part) {
$part = trim($part);
$part = trim($part, '"+');
if (!strlen($part)) {
continue;
if (!strlen($str)) {
return $str;
}
if (strlen($str) > 2048) {
return $str;
}
$patterns = array();
foreach ($tokens as $token) {
$raw_token = $token->getToken();
$operator = $raw_token->getOperator();
$value = $raw_token->getValue();
switch ($operator) {
case PhutilSearchQueryCompiler::OPERATOR_SUBSTRING:
$patterns[] = '(('.preg_quote($value).'))ui';
break;
case PhutilSearchQueryCompiler::OPERATOR_AND:
$patterns[] = '((?<=\W|^)('.preg_quote($value).')(?=\W|\z))ui';
break;
default:
// Don't highlight anything else, particularly "NOT".
break;
}
}
// Find all matches for all query terms in the document title, then reduce
// them to a map from offsets to highlighted sequence lengths. If two terms
// match at the same position, we choose the longer one.
$all_matches = array();
foreach ($patterns as $pattern) {
$matches = null;
$has_matches = preg_match_all(
'/(?:^|\b)('.preg_quote($part, '/').')/i',
$ok = preg_match_all(
$pattern,
$str,
$matches,
PREG_OFFSET_CAPTURE);
if (!$has_matches) {
if (!$ok) {
continue;
}
// Flag the matching part of the range for boldening.
foreach ($matches[1] as $match) {
$offset = $match[1];
for ($ii = 0; $ii < strlen($match[0]); $ii++) {
$bold[$offset + $ii] = true;
$match_text = $match[0];
$match_offset = $match[1];
if (!isset($all_matches[$match_offset])) {
$all_matches[$match_offset] = 0;
}
$all_matches[$match_offset] = max(
$all_matches[$match_offset],
strlen($match_text));
}
}
// Split the string into ranges, applying bold styling as required.
$out = array();
$buf = '';
$pos = 0;
$is_bold = false;
// Go through the string one display glyph at a time. If a glyph starts
// on a highlighted byte position, turn on highlighting for the nubmer
// of matching bytes. If a query searches for "e" and the document contains
// an "e" followed by a bunch of combining marks, this will correctly
// highlight the entire glyph.
$parts = array();
$highlight = 0;
$offset = 0;
foreach (phutil_utf8v_combined($str) as $character) {
$length = strlen($character);
// Make sure this is UTF8 because phutil_utf8v() will explode if it isn't.
$str = phutil_utf8ize($str);
foreach (phutil_utf8v($str) as $chr) {
if ($bold[$pos] != $is_bold) {
if (strlen($buf)) {
if ($is_bold) {
$out[] = phutil_tag('strong', array(), $buf);
if (isset($all_matches[$offset])) {
$highlight = $all_matches[$offset];
}
if ($highlight > 0) {
$is_highlighted = true;
$highlight -= $length;
} else {
$out[] = $buf;
}
$buf = '';
}
$is_bold = !$is_bold;
}
$buf .= $chr;
$pos += strlen($chr);
$is_highlighted = false;
}
if (strlen($buf)) {
if ($is_bold) {
$out[] = phutil_tag('strong', array(), $buf);
$parts[] = array(
'text' => $character,
'highlighted' => $is_highlighted,
);
$offset += $length;
}
// Combine all the sequences together so we aren't emitting a tag around
// every individual character.
$last = null;
foreach ($parts as $key => $part) {
if ($last !== null) {
if ($part['highlighted'] == $parts[$last]['highlighted']) {
$parts[$last]['text'] .= $part['text'];
unset($parts[$key]);
continue;
}
}
$last = $key;
}
// Finally, add tags.
$result = array();
foreach ($parts as $part) {
if ($part['highlighted']) {
$result[] = phutil_tag('strong', array(), $part['text']);
} else {
$out[] = $buf;
$result[] = $part['text'];
}
}
return $out;
return $result;
}
}