1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2025-01-27 15:08:20 +01:00

Write search bolding in a way which is certainly HTML-safe

Summary:
This algorithm is tricky, and uses `phutil_safe_html()` directly, which makes it potentially unsafe.

In particular, D8859 fixes a bug with it which caused it to produce non-utf8 output. This doesn't guarantee it's a security problem, but does make it suspicious.

I don't actually see a way to break it, but this change rewrites it so that it is absolutely bulletproof and no longer needs to call `phutil_safe_html()`.

Test Plan:
{F147487}

@rugabarbo, if you have a chance, can you check if this still works for you?

Reviewers: btrahan

Reviewed By: btrahan

Subscribers: epriestley, rugabarbo

Differential Revision: https://secure.phabricator.com/D8862
This commit is contained in:
epriestley 2014-04-26 12:44:16 -07:00
parent 1b0d53ec65
commit 88ae246593

View file

@ -76,33 +76,92 @@ final class PhabricatorSearchResultView extends AphrontView {
$link);
}
/**
* Find the words which are part of the query string, and bold them in a
* result string. This makes it easier for users to see why a result
* matched their query.
*
* NOTE(review): this listing is a rendered diff hunk whose add/remove markers
* have been stripped. Statements of the removed implementation (regex
* replacement over escaped HTML, ending in phutil_safe_html()) are interleaved
* with statements of the replacement implementation (per-byte bold map, output
* built with phutil_tag()). The braces do not balance as shown, so this is not
* runnable as-is; confirm against the real file before editing.
*
* @param string $str Result text; it is measured and indexed by byte via
*   strlen().
* @return mixed In the replacement implementation: $str unchanged when the
*   query or $str is empty or their combined byte length exceeds 2048;
*   otherwise a list mixing plain string fragments with `strong` tags built
*   by phutil_tag(). The removed implementation returned
*   phutil_safe_html($str) instead.
*/
private function emboldenQuery($str) {
// NOTE(review): appears to be a removed line (the old guard on $this->query).
// Its opening brace is never closed separately in this listing.
if (!$this->query) {
// Read the raw query text from the query object's 'query' parameter.
$query = $this->query->getParameter('query');
// Nothing to highlight when either the query or the result text is empty.
if (!strlen($query) || !strlen($str)) {
return $str;
}
// NOTE(review): from here through the preg_replace() call below, the
// statements look like the removed implementation: they escape $str first
// and then splice literal <strong> markup into it with a regex.
$query = $this->query->getParameter('query');
// Matches double-quoted phrases; group 1 captures the text between the quotes.
$quoted_regexp = '/"([^"]*)"/';
$matches = array(1 => array());
preg_match_all($quoted_regexp, $query, $matches);
$quoted_queries = $matches[1];
// Remove the quoted phrases so only the unquoted words remain.
$query = preg_replace($quoted_regexp, '', $query);
// Split on whitespace runs, each optionally followed by a `+` or `|`.
$query = preg_split('/\s+[+|]?/u', $query);
// Drop empty (falsy) pieces, then add the quoted phrases back as whole terms.
$query = array_filter($query);
$query = array_merge($query, $quoted_queries);
$str = phutil_escape_html($str);
foreach ($query as $word) {
// Escape the word the same way $str was escaped, then make it regex-safe.
$word = phutil_escape_html($word);
$word = preg_quote($word, '/');
// A trailing `*` (escaped to `\*` by preg_quote) becomes `\w*`, so the word
// also matches when followed by further word characters.
$word = preg_replace('/\\\\\*$/', '\w*', $word);
$str = preg_replace(
'/(?:^|\b)('.$word.')(?:\b|$)/i',
'<strong>\1</strong>',
$str);
// NOTE(review): the size guard below appears to belong to the replacement
// implementation; there $query is still the raw query string, so
// strlen($query) is a byte length.
// This algorithm is safe but not especially fast, so don't bother if
// we're dealing with a lot of data. This mostly prevents silly/malicious
// queries from doing anything bad.
if (strlen($query) + strlen($str) > 2048) {
return $str;
}
// NOTE(review): appears to be the removed implementation's return; every
// statement after it belongs to the replacement implementation.
return phutil_safe_html($str);
// Keep track of which characters we're going to make bold. This is
// byte oriented, but we'll make sure we don't put a bold in the middle
// of a character later.
$bold = array_fill(0, strlen($str), false);
// Split the query into words.
$parts = preg_split('/ +/', $query);
// Find all occurrences of each word, and mark them to be emboldened.
foreach ($parts as $part) {
// Strip surrounding whitespace, then any leading/trailing `"` and `+`.
$part = trim($part);
$part = trim($part, '"+');
if (!strlen($part)) {
continue;
}
$matches = null;
// Case-insensitive match of the literal word at the start of the string or
// at a word boundary. PREG_OFFSET_CAPTURE makes each match a
// (text, byte offset) pair.
// NOTE(review): the pattern has no `u` modifier, so matching is byte
// oriented; confirm that is intended for non-ASCII queries.
$has_matches = preg_match_all(
'/(?:^|\b)('.preg_quote($part, '/').')/i',
$str,
$matches,
PREG_OFFSET_CAPTURE);
if (!$has_matches) {
continue;
}
// Flag the matching part of the range for boldening.
foreach ($matches[1] as $match) {
// $match[0] is the matched text and $match[1] its byte offset in $str.
$offset = $match[1];
// strlen($match[0]) is re-evaluated on every iteration of this loop.
for ($ii = 0; $ii < strlen($match[0]); $ii++) {
$bold[$offset + $ii] = true;
}
}
}
// Split the string into ranges, applying bold styling as required.
$out = array();
$buf = '';
$pos = 0;
$is_bold = false;
// phutil_utf8v() is defined outside this listing; presumably it splits $str
// into whole UTF-8 characters. $pos tracks the byte offset of each one.
foreach (phutil_utf8v($str) as $chr) {
// Only the flag at the character's first byte is consulted, so a style
// change can only happen between whole characters.
if ($bold[$pos] != $is_bold) {
// Flush the text accumulated under the previous style before switching.
if (strlen($buf)) {
if ($is_bold) {
$out[] = phutil_tag('strong', array(), $buf);
} else {
$out[] = $buf;
}
$buf = '';
}
$is_bold = !$is_bold;
}
$buf .= $chr;
$pos += strlen($chr);
}
// Flush whatever is left in the buffer after the final character.
if (strlen($buf)) {
if ($is_bold) {
$out[] = phutil_tag('strong', array(), $buf);
} else {
$out[] = $buf;
}
}
// The result is a list of fragments rather than a single string.
return $out;
}
}