1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2024-11-22 23:02:42 +01:00

Improve Remarkup parsing performance for certain large input blocks

Summary: Fixes T13487. In PHI1628, an install has a 4MB remarkup corpus which takes a long time to render. This is broadly expected, but a few reasonable improvements fell out of running it through the profiler.

Test Plan:
  - Saw local cold-cache end-to-end rendering time drop from 12s to 4s for the highly secret input corpus.
  - Verified output has the same hashes before/after.
  - Ran all remarkup unit tests.

Maniphest Tasks: T13487

Differential Revision: https://secure.phabricator.com/D20968
This commit is contained in:
epriestley 2020-02-04 14:19:04 -08:00
parent 0e82bd024a
commit fdbe9ba149
3 changed files with 143 additions and 74 deletions

View file

@ -100,22 +100,28 @@ final class PhutilRemarkupNoteBlockRule extends PhutilRemarkupBlockRule {
} }
private function getRegEx() { private function getRegEx() {
$words = array( static $regex;
'NOTE',
'IMPORTANT',
'WARNING',
);
foreach ($words as $k => $word) { if ($regex === null) {
$words[$k] = preg_quote($word, '/'); $words = array(
'NOTE',
'IMPORTANT',
'WARNING',
);
foreach ($words as $k => $word) {
$words[$k] = preg_quote($word, '/');
}
$words = implode('|', $words);
$regex =
'/^(?:'.
'(?:\((?P<hideword>'.$words.')\))'.
'|'.
'(?:(?P<showword>'.$words.'):))\s*'.
'/';
} }
$words = implode('|', $words);
return return $regex;
'/^(?:'.
'(?:\((?P<hideword>'.$words.')\))'.
'|'.
'(?:(?P<showword>'.$words.'):))\s*'.
'/';
} }
} }

View file

@ -153,33 +153,54 @@ final class PhutilRemarkupEngine extends PhutilMarkupEngine {
$block_rules = $this->blockRules; $block_rules = $this->blockRules;
$blocks = array(); $blocks = array();
$cursor = 0; $cursor = 0;
$prev_block = array();
$can_merge = array();
foreach ($block_rules as $key => $block_rule) {
if ($block_rule instanceof PhutilRemarkupDefaultBlockRule) {
$can_merge[$key] = true;
}
}
$last_block = null;
$last_block_key = -1;
// See T13487. For very large inputs, block separation can dominate
// runtime. This is written somewhat clumsily to attempt to handle
// very large inputs as gracefully as is practical.
while (isset($text[$cursor])) { while (isset($text[$cursor])) {
$starting_cursor = $cursor; $starting_cursor = $cursor;
foreach ($block_rules as $block_rule) { foreach ($block_rules as $block_key => $block_rule) {
$num_lines = $block_rule->getMatchingLineCount($text, $cursor); $num_lines = $block_rule->getMatchingLineCount($text, $cursor);
if ($num_lines) { if ($num_lines) {
if ($blocks) { $current_block = array(
$prev_block = last($blocks);
}
$curr_block = array(
'start' => $cursor, 'start' => $cursor,
'num_lines' => $num_lines, 'num_lines' => $num_lines,
'rule' => $block_rule, 'rule' => $block_rule,
'is_empty' => self::isEmptyBlock($text, $cursor, $num_lines), 'empty' => self::isEmptyBlock($text, $cursor, $num_lines),
'children' => array(), 'children' => array(),
'merge' => isset($can_merge[$block_key]),
); );
if ($prev_block $should_merge = self::shouldMergeParagraphBlocks(
&& self::shouldMergeBlocks($text, $prev_block, $curr_block)) { $text,
$blocks[last_key($blocks)]['num_lines'] += $curr_block['num_lines']; $last_block,
$blocks[last_key($blocks)]['is_empty'] = $current_block);
$blocks[last_key($blocks)]['is_empty'] && $curr_block['is_empty'];
if ($should_merge) {
$last_block['num_lines'] =
($last_block['num_lines'] + $current_block['num_lines']);
$last_block['empty'] =
($last_block['empty'] && $current_block['empty']);
$blocks[$last_block_key] = $last_block;
} else { } else {
$blocks[] = $curr_block; $blocks[] = $current_block;
$last_block = $current_block;
$last_block_key++;
} }
$cursor += $num_lines; $cursor += $num_lines;
@ -192,9 +213,20 @@ final class PhutilRemarkupEngine extends PhutilMarkupEngine {
} }
} }
// See T13487. It's common for blocks to be small, and this loop seems to
// measure as faster if we manually concatenate each block's lines than if
// we "array_slice()" and "implode()" them. This is a bit muddy.
foreach ($blocks as $key => $block) { foreach ($blocks as $key => $block) {
$lines = array_slice($text, $block['start'], $block['num_lines']); $min = $block['start'];
$blocks[$key]['text'] = implode('', $lines); $max = $min + $block['num_lines'];
$lines = '';
for ($ii = $min; $ii < $max; $ii++) {
$lines .= $text[$ii];
}
$blocks[$key]['text'] = $lines;
} }
// Stop splitting child blocks apart if we get too deep. This arrests // Stop splitting child blocks apart if we get too deep. This arrests
@ -246,30 +278,48 @@ final class PhutilRemarkupEngine extends PhutilMarkupEngine {
return $output; return $output;
} }
private static function shouldMergeBlocks($text, $prev_block, $curr_block) { private static function shouldMergeParagraphBlocks(
$block_rules = ipull(array($prev_block, $curr_block), 'rule'); $text,
$last_block,
$current_block) {
$default_rule = 'PhutilRemarkupDefaultBlockRule'; // If we're at the beginning of the input, we can't merge.
try { if ($last_block === null) {
assert_instances_of($block_rules, $default_rule); return false;
}
// If the last block was empty keep merging // If the previous block wasn't a default block, we can't merge.
if ($prev_block['is_empty']) { if (!$last_block['merge']) {
return true; return false;
} }
// If this line is blank keep merging // If the current block isn't a default block, we can't merge.
if ($curr_block['is_empty']) { if (!$current_block['merge']) {
return true; return false;
} }
// If the current line and the last line have content, keep merging // If the last block was empty, we definitely want to merge.
if (strlen(trim($text[$curr_block['start'] - 1]))) { if ($last_block['empty']) {
if (strlen(trim($text[$curr_block['start']]))) { return true;
return true; }
}
} // If this block is empty, we definitely want to merge.
} catch (Exception $e) {} if ($current_block['empty']) {
return true;
}
// Check whether the last line of the previous block and the first line of
// this block both contain non-whitespace text. If they do, we're going to
// merge.
// If either of them is a blank line or a line with only whitespace, we
// do not merge: this means we've found a paragraph break.
$tail = $text[$current_block['start'] - 1];
$head = $text[$current_block['start']];
if (strlen(trim($tail)) && strlen(trim($head))) {
return true;
}
return false; return false;
} }

View file

@ -2,6 +2,9 @@
abstract class PhabricatorObjectRemarkupRule extends PhutilRemarkupRule { abstract class PhabricatorObjectRemarkupRule extends PhutilRemarkupRule {
private $referencePattern;
private $embedPattern;
const KEY_RULE_OBJECT = 'rule.object'; const KEY_RULE_OBJECT = 'rule.object';
const KEY_MENTIONED_OBJECTS = 'rule.object.mentioned'; const KEY_MENTIONED_OBJECTS = 'rule.object.mentioned';
@ -192,38 +195,48 @@ abstract class PhabricatorObjectRemarkupRule extends PhutilRemarkupRule {
} }
private function getObjectEmbedPattern() { private function getObjectEmbedPattern() {
$prefix = $this->getObjectNamePrefix(); if ($this->embedPattern === null) {
$prefix = preg_quote($prefix); $prefix = $this->getObjectNamePrefix();
$id = $this->getObjectIDPattern(); $prefix = preg_quote($prefix);
$id = $this->getObjectIDPattern();
return '(\B{'.$prefix.'('.$id.')([,\s](?:[^}\\\\]|\\\\.)*)?}\B)u'; $this->embedPattern =
'(\B{'.$prefix.'('.$id.')([,\s](?:[^}\\\\]|\\\\.)*)?}\B)u';
}
return $this->embedPattern;
} }
private function getObjectReferencePattern() { private function getObjectReferencePattern() {
$prefix = $this->getObjectNamePrefix(); if ($this->referencePattern === null) {
$prefix = preg_quote($prefix); $prefix = $this->getObjectNamePrefix();
$prefix = preg_quote($prefix);
$id = $this->getObjectIDPattern(); $id = $this->getObjectIDPattern();
// If the prefix starts with a word character (like "D"), we want to // If the prefix starts with a word character (like "D"), we want to
// require a word boundary so that we don't match "XD1" as "D1". If the // require a word boundary so that we don't match "XD1" as "D1". If the
// prefix does not start with a word character, we want to require no word // prefix does not start with a word character, we want to require no word
// boundary for the same reasons. Test if the prefix starts with a word // boundary for the same reasons. Test if the prefix starts with a word
// character. // character.
if ($this->getObjectNamePrefixBeginsWithWordCharacter()) { if ($this->getObjectNamePrefixBeginsWithWordCharacter()) {
$boundary = '\\b'; $boundary = '\\b';
} else { } else {
$boundary = '\\B'; $boundary = '\\B';
}
// The "(?<![#@-])" prevents us from linking "#abcdef" or similar, and
// "ABC-T1" (see T5714), and from matching "@T1" as a task (it is a user)
// (see T9479).
// The "\b" allows us to link "(abcdef)" or similar without linking things
// in the middle of words.
$this->referencePattern =
'((?<![#@-])'.$boundary.$prefix.'('.$id.')(?:#([-\w\d]+))?(?!\w))u';
} }
// The "(?<![#@-])" prevents us from linking "#abcdef" or similar, and return $this->referencePattern;
// "ABC-T1" (see T5714), and from matching "@T1" as a task (it is a user)
// (see T9479).
// The "\b" allows us to link "(abcdef)" or similar without linking things
// in the middle of words.
return '((?<![#@-])'.$boundary.$prefix.'('.$id.')(?:#([-\w\d]+))?(?!\w))u';
} }