From bd4d9d88f2df1203a0b6259c701f1306865a44ef Mon Sep 17 00:00:00 2001 From: epriestley Date: Wed, 17 Feb 2021 13:08:14 -0800 Subject: [PATCH] Limit remarkup URI protocol length to 32 characters to avoid expensive regex behavior Summary: Ref T13608. When searching for bare URIs in remarkup text, don't look for URIs with a protocol string longer than 32 characters. This avoids a case where the regexp engine may be tricked into executing at `O(N^2)` or some similar complexity. Test Plan: - Applied remarkup to "AAAA..." (512KB). - Before: 64 seconds to process. - After: <10ms to process. - Ran unit tests. Maniphest Tasks: T13608 Differential Revision: https://secure.phabricator.com/D21562 --- .../PhutilRemarkupHyperlinkRule.php | 40 ++++++++++++++++--- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/src/infrastructure/markup/markuprule/PhutilRemarkupHyperlinkRule.php b/src/infrastructure/markup/markuprule/PhutilRemarkupHyperlinkRule.php index a926ea44c1..77168c97e3 100644 --- a/src/infrastructure/markup/markuprule/PhutilRemarkupHyperlinkRule.php +++ b/src/infrastructure/markup/markuprule/PhutilRemarkupHyperlinkRule.php @@ -9,18 +9,47 @@ final class PhutilRemarkupHyperlinkRule extends PhutilRemarkupRule { } public function apply($text) { + static $angle_pattern; + static $curly_pattern; + static $bare_pattern; + + if ($angle_pattern === null) { + // See T13608. Limit protocol matches to 32 characters to improve the + // performance of the "<protocol>://" pattern, which can take a very long + // time to match against long inputs if the maximum length of a protocol + // sequence is unrestricted.
+ + $protocol_fragment = '\w{3,32}'; + $uri_fragment = '[^\s'.PhutilRemarkupBlockStorage::MAGIC_BYTE.']+'; + + $angle_pattern = sprintf( + '(<(%s://%s?)>)', + $protocol_fragment, + $uri_fragment); + + $curly_pattern = sprintf( + '({(%s://%s?)})', + $protocol_fragment, + $uri_fragment); + + $bare_pattern = sprintf( + '(%s://%s)', + $protocol_fragment, + $uri_fragment); + } + // Hyperlinks with explicit "<>" around them get linked exactly, without // the "<>". Angle brackets are basically special and mean "this is a URL // with weird characters". This is assumed to be reasonable because they - // don't appear in normal text or normal URLs. + // don't appear in most normal text or most normal URLs. $text = preg_replace_callback( - '@<(\w{3,}://[^\s'.PhutilRemarkupBlockStorage::MAGIC_BYTE.']+?)>@', + $angle_pattern, array($this, 'markupHyperlinkAngle'), $text); // We match "{uri}", but do not link it by default. $text = preg_replace_callback( - '@{(\w{3,}://[^\s'.PhutilRemarkupBlockStorage::MAGIC_BYTE.']+?)}@', + $curly_pattern, array($this, 'markupHyperlinkCurly'), $text); @@ -31,8 +60,9 @@ final class PhutilRemarkupHyperlinkRule extends PhutilRemarkupRule { // NOTE: We're explicitly avoiding capturing stored blocks, so text like // `http://www.example.com/[[x | y]]` doesn't get aggressively captured. + $text = preg_replace_callback( - '@(\w{3,}://[^\s'.PhutilRemarkupBlockStorage::MAGIC_BYTE.']+)@', + $bare_pattern, array($this, 'markupHyperlinkUngreedy'), $text); @@ -110,7 +140,7 @@ final class PhutilRemarkupHyperlinkRule extends PhutilRemarkupRule { } protected function markupHyperlinkUngreedy($matches) { - $match = $matches[1]; + $match = $matches[0]; $tail = null; $trailing = null; if (preg_match('/[;,.:!?]+$/', $match, $trailing)) {