mirror of
https://we.phorge.it/source/phorge.git
synced 2024-11-25 16:22:43 +01:00
Improve search highlighting for CJK and substring queries
Summary: Fixes T12995. Currently, the result highlighter (which shows //where// terms matched) only works in "term" mode, not in "substring" mode. Provide better feedback and behavior: - When a term is a substring term, color it a little differently and add a tooltip. (This is partly to make it easier to debug/diagnose things, probably not enormously valuable to users.) - When a term is a substring term, highlight it anywhere in the results. Test Plan: Queried for Latin and CJK terms. Here is CJK being highlighted: {F5192195} Here is substring vs non-substring implicit behavior: {F5192196} Here's ONLY terms being highlighted: {F5192198} Here's terms and substrings, since the query now has a substring: {F5192201} Reviewers: amckinley Reviewed By: amckinley Maniphest Tasks: T12995 Differential Revision: https://secure.phabricator.com/D18635
This commit is contained in:
parent
36df39761e
commit
1ac52c09e7
3 changed files with 102 additions and 56 deletions
|
@ -56,6 +56,10 @@ final class PhabricatorFulltextToken extends Phobject {
|
||||||
$shade = PHUITagView::COLOR_RED;
|
$shade = PHUITagView::COLOR_RED;
|
||||||
$icon = 'fa-minus';
|
$icon = 'fa-minus';
|
||||||
break;
|
break;
|
||||||
|
case PhutilSearchQueryCompiler::OPERATOR_SUBSTRING:
|
||||||
|
$tip = pht('Substring Search');
|
||||||
|
$shade = PHUITagView::COLOR_VIOLET;
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
$shade = PHUITagView::COLOR_BLUE;
|
$shade = PHUITagView::COLOR_BLUE;
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -261,7 +261,7 @@ final class PhabricatorSearchApplicationSearchEngine
|
||||||
foreach ($results as $phid => $handle) {
|
foreach ($results as $phid => $handle) {
|
||||||
$view = id(new PhabricatorSearchResultView())
|
$view = id(new PhabricatorSearchResultView())
|
||||||
->setHandle($handle)
|
->setHandle($handle)
|
||||||
->setQuery($query)
|
->setTokens($fulltext_tokens)
|
||||||
->setObject(idx($objects, $phid))
|
->setObject(idx($objects, $phid))
|
||||||
->render();
|
->render();
|
||||||
$list->addItem($view);
|
$list->addItem($view);
|
||||||
|
|
|
@ -3,16 +3,17 @@
|
||||||
final class PhabricatorSearchResultView extends AphrontView {
|
final class PhabricatorSearchResultView extends AphrontView {
|
||||||
|
|
||||||
private $handle;
|
private $handle;
|
||||||
private $query;
|
|
||||||
private $object;
|
private $object;
|
||||||
|
private $tokens;
|
||||||
|
|
||||||
public function setHandle(PhabricatorObjectHandle $handle) {
|
public function setHandle(PhabricatorObjectHandle $handle) {
|
||||||
$this->handle = $handle;
|
$this->handle = $handle;
|
||||||
return $this;
|
return $this;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function setQuery(PhabricatorSavedQuery $query) {
|
public function setTokens(array $tokens) {
|
||||||
$this->query = $query;
|
assert_instances_of($tokens, 'PhabricatorFulltextToken');
|
||||||
|
$this->tokens = $tokens;
|
||||||
return $this;
|
return $this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -56,88 +57,129 @@ final class PhabricatorSearchResultView extends AphrontView {
|
||||||
* matched their query.
|
* matched their query.
|
||||||
*/
|
*/
|
||||||
private function emboldenQuery($str) {
|
private function emboldenQuery($str) {
|
||||||
$query = $this->query->getParameter('query');
|
$tokens = $this->tokens;
|
||||||
|
|
||||||
if (!strlen($query) || !strlen($str)) {
|
if (!$tokens) {
|
||||||
return $str;
|
return $str;
|
||||||
}
|
}
|
||||||
|
|
||||||
// This algorithm is safe but not especially fast, so don't bother if
|
if (count($tokens) > 16) {
|
||||||
// we're dealing with a lot of data. This mostly prevents silly/malicious
|
|
||||||
// queries from doing anything bad.
|
|
||||||
if (strlen($query) + strlen($str) > 2048) {
|
|
||||||
return $str;
|
return $str;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Keep track of which characters we're going to make bold. This is
|
if (!strlen($str)) {
|
||||||
// byte oriented, but we'll make sure we don't put a bold in the middle
|
return $str;
|
||||||
// of a character later.
|
}
|
||||||
$bold = array_fill(0, strlen($str), false);
|
|
||||||
|
|
||||||
// Split the query into words.
|
if (strlen($str) > 2048) {
|
||||||
$parts = preg_split('/ +/', $query);
|
return $str;
|
||||||
|
}
|
||||||
|
|
||||||
// Find all occurrences of each word, and mark them to be emboldened.
|
$patterns = array();
|
||||||
foreach ($parts as $part) {
|
foreach ($tokens as $token) {
|
||||||
$part = trim($part);
|
$raw_token = $token->getToken();
|
||||||
$part = trim($part, '"+');
|
$operator = $raw_token->getOperator();
|
||||||
if (!strlen($part)) {
|
|
||||||
continue;
|
$value = $raw_token->getValue();
|
||||||
|
|
||||||
|
switch ($operator) {
|
||||||
|
case PhutilSearchQueryCompiler::OPERATOR_SUBSTRING:
|
||||||
|
$patterns[] = '(('.preg_quote($value).'))ui';
|
||||||
|
break;
|
||||||
|
case PhutilSearchQueryCompiler::OPERATOR_AND:
|
||||||
|
$patterns[] = '((?<=\W|^)('.preg_quote($value).')(?=\W|\z))ui';
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
// Don't highlight anything else, particularly "NOT".
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find all matches for all query terms in the document title, then reduce
|
||||||
|
// them to a map from offsets to highlighted sequence lengths. If two terms
|
||||||
|
// match at the same position, we choose the longer one.
|
||||||
|
$all_matches = array();
|
||||||
|
foreach ($patterns as $pattern) {
|
||||||
$matches = null;
|
$matches = null;
|
||||||
$has_matches = preg_match_all(
|
$ok = preg_match_all(
|
||||||
'/(?:^|\b)('.preg_quote($part, '/').')/i',
|
$pattern,
|
||||||
$str,
|
$str,
|
||||||
$matches,
|
$matches,
|
||||||
PREG_OFFSET_CAPTURE);
|
PREG_OFFSET_CAPTURE);
|
||||||
|
if (!$ok) {
|
||||||
if (!$has_matches) {
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Flag the matching part of the range for boldening.
|
|
||||||
foreach ($matches[1] as $match) {
|
foreach ($matches[1] as $match) {
|
||||||
$offset = $match[1];
|
$match_text = $match[0];
|
||||||
for ($ii = 0; $ii < strlen($match[0]); $ii++) {
|
$match_offset = $match[1];
|
||||||
$bold[$offset + $ii] = true;
|
|
||||||
|
if (!isset($all_matches[$match_offset])) {
|
||||||
|
$all_matches[$match_offset] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$all_matches[$match_offset] = max(
|
||||||
|
$all_matches[$match_offset],
|
||||||
|
strlen($match_text));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Split the string into ranges, applying bold styling as required.
|
// Go through the string one display glyph at a time. If a glyph starts
|
||||||
$out = array();
|
// on a highlighted byte position, turn on highlighting for the number
|
||||||
$buf = '';
|
// of matching bytes. If a query searches for "e" and the document contains
|
||||||
$pos = 0;
|
// an "e" followed by a bunch of combining marks, this will correctly
|
||||||
$is_bold = false;
|
// highlight the entire glyph.
|
||||||
|
$parts = array();
|
||||||
|
$highlight = 0;
|
||||||
|
$offset = 0;
|
||||||
|
foreach (phutil_utf8v_combined($str) as $character) {
|
||||||
|
$length = strlen($character);
|
||||||
|
|
||||||
// Make sure this is UTF8 because phutil_utf8v() will explode if it isn't.
|
if (isset($all_matches[$offset])) {
|
||||||
$str = phutil_utf8ize($str);
|
$highlight = $all_matches[$offset];
|
||||||
foreach (phutil_utf8v($str) as $chr) {
|
|
||||||
if ($bold[$pos] != $is_bold) {
|
|
||||||
if (strlen($buf)) {
|
|
||||||
if ($is_bold) {
|
|
||||||
$out[] = phutil_tag('strong', array(), $buf);
|
|
||||||
} else {
|
|
||||||
$out[] = $buf;
|
|
||||||
}
|
|
||||||
$buf = '';
|
|
||||||
}
|
|
||||||
$is_bold = !$is_bold;
|
|
||||||
}
|
}
|
||||||
$buf .= $chr;
|
|
||||||
$pos += strlen($chr);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (strlen($buf)) {
|
if ($highlight > 0) {
|
||||||
if ($is_bold) {
|
$is_highlighted = true;
|
||||||
$out[] = phutil_tag('strong', array(), $buf);
|
$highlight -= $length;
|
||||||
} else {
|
} else {
|
||||||
$out[] = $buf;
|
$is_highlighted = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
$parts[] = array(
|
||||||
|
'text' => $character,
|
||||||
|
'highlighted' => $is_highlighted,
|
||||||
|
);
|
||||||
|
|
||||||
|
$offset += $length;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Combine all the sequences together so we aren't emitting a tag around
|
||||||
|
// every individual character.
|
||||||
|
$last = null;
|
||||||
|
foreach ($parts as $key => $part) {
|
||||||
|
if ($last !== null) {
|
||||||
|
if ($part['highlighted'] == $parts[$last]['highlighted']) {
|
||||||
|
$parts[$last]['text'] .= $part['text'];
|
||||||
|
unset($parts[$key]);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
$last = $key;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Finally, add tags.
|
||||||
|
$result = array();
|
||||||
|
foreach ($parts as $part) {
|
||||||
|
if ($part['highlighted']) {
|
||||||
|
$result[] = phutil_tag('strong', array(), $part['text']);
|
||||||
|
} else {
|
||||||
|
$result[] = $part['text'];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return $out;
|
return $result;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue