1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2024-12-22 21:40:55 +01:00

Limit remarkup URI protocol length to 32 characters to avoid expensive regex behavior

Summary:
Ref T13608. When searching for bare URIs in remarkup text, don't look for URIs with a protocol string longer than 32 characters.

This avoids a case where the regexp engine can be tricked into running in `O(N^2)` time (or similar superlinear complexity) on long inputs.

Test Plan:
  - Applied remarkup to "AAAA..." (512KB).
  - Before: 64 seconds to process.
  - After: <10ms to process.
  - Ran unit tests.

Maniphest Tasks: T13608

Differential Revision: https://secure.phabricator.com/D21562
This commit is contained in:
epriestley 2021-02-17 13:08:14 -08:00
parent 6703fec3e2
commit bd4d9d88f2

View file

@ -9,18 +9,47 @@ final class PhutilRemarkupHyperlinkRule extends PhutilRemarkupRule {
}
public function apply($text) {
static $angle_pattern;
static $curly_pattern;
static $bare_pattern;
if ($angle_pattern === null) {
// See T13608. Limit protocol matches to 32 characters to improve the
// performance of the "<protocol>://" pattern, which can take a very long
// time to match against long inputs if the maximum length of a protocol
// sequence is unrestricted.
$protocol_fragment = '\w{3,32}';
$uri_fragment = '[^\s'.PhutilRemarkupBlockStorage::MAGIC_BYTE.']+';
$angle_pattern = sprintf(
'(<(%s://%s?)>)',
$protocol_fragment,
$uri_fragment);
$curly_pattern = sprintf(
'({(%s://%s?)})',
$protocol_fragment,
$uri_fragment);
$bare_pattern = sprintf(
'(%s://%s)',
$protocol_fragment,
$uri_fragment);
}
// Hyperlinks with explicit "<>" around them get linked exactly, without
// the "<>". Angle brackets are basically special and mean "this is a URL
// with weird characters". This is assumed to be reasonable because they
// don't appear in normal text or normal URLs.
// don't appear in most normal text or most normal URLs.
$text = preg_replace_callback(
'@<(\w{3,}://[^\s'.PhutilRemarkupBlockStorage::MAGIC_BYTE.']+?)>@',
$angle_pattern,
array($this, 'markupHyperlinkAngle'),
$text);
// We match "{uri}", but do not link it by default.
$text = preg_replace_callback(
'@{(\w{3,}://[^\s'.PhutilRemarkupBlockStorage::MAGIC_BYTE.']+?)}@',
$curly_pattern,
array($this, 'markupHyperlinkCurly'),
$text);
@ -31,8 +60,9 @@ final class PhutilRemarkupHyperlinkRule extends PhutilRemarkupRule {
// NOTE: We're explicitly avoiding capturing stored blocks, so text like
// `http://www.example.com/[[x | y]]` doesn't get aggressively captured.
$text = preg_replace_callback(
'@(\w{3,}://[^\s'.PhutilRemarkupBlockStorage::MAGIC_BYTE.']+)@',
$bare_pattern,
array($this, 'markupHyperlinkUngreedy'),
$text);
@ -110,7 +140,7 @@ final class PhutilRemarkupHyperlinkRule extends PhutilRemarkupRule {
}
protected function markupHyperlinkUngreedy($matches) {
$match = $matches[1];
$match = $matches[0];
$tail = null;
$trailing = null;
if (preg_match('/[;,.:!?]+$/', $match, $trailing)) {