Improve Remarkup parsing performance for certain large input blocks

Summary: Fixes T13487. In PHI1628, an install has a 4MB remarkup corpus which takes a long time to render. This is broadly expected, but a few reasonable improvements fell out of running it through the profiler.

Test Plan:
- Saw local cold-cache end-to-end rendering time drop from 12s to 4s for the highly secret input corpus.
- Verified output has the same hashes before/after.
- Ran all remarkup unit tests.

Maniphest Tasks: T13487

Differential Revision: https://secure.phabricator.com/D20968
2024-11-22 23:02:42 +01:00 · 2020-02-04 14:19:04 -08:00 · 2020-02-04 14:19:04 -08:00 · fdbe9ba149
commit fdbe9ba149
parent 0e82bd024a
3 changed files with 143 additions and 74 deletions
--- a/src/infrastructure/markup/blockrule/PhutilRemarkupNoteBlockRule.php
+++ b/src/infrastructure/markup/blockrule/PhutilRemarkupNoteBlockRule.php
@ -100,22 +100,28 @@ final class PhutilRemarkupNoteBlockRule extends PhutilRemarkupBlockRule {
  }
  private function getRegEx() {
-    $words = array(
+    static $regex;
      'NOTE',
      'IMPORTANT',
      'WARNING',
    );
-    foreach ($words as $k => $word) {
+    if ($regex === null) {
-      $words[$k] = preg_quote($word, '/');
+      $words = array(
        'NOTE',
        'IMPORTANT',
        'WARNING',
      );
      foreach ($words as $k => $word) {
        $words[$k] = preg_quote($word, '/');
      }
      $words = implode('|', $words);
      $regex =
        '/^(?:'.
        '(?:\((?P<hideword>'.$words.')\))'.
        '|'.
        '(?:(?P<showword>'.$words.'):))\s*'.
        '/';
    }
    $words = implode('|', $words);
-    return
+    return $regex;
      '/^(?:'.
      '(?:\((?P<hideword>'.$words.')\))'.
      '|'.
      '(?:(?P<showword>'.$words.'):))\s*'.
      '/';
  }
 }
--- a/src/infrastructure/markup/remarkup/PhutilRemarkupEngine.php
+++ b/src/infrastructure/markup/remarkup/PhutilRemarkupEngine.php
@ -153,33 +153,54 @@ final class PhutilRemarkupEngine extends PhutilMarkupEngine {
    $block_rules = $this->blockRules;
    $blocks = array();
    $cursor = 0;
-    $prev_block = array();
+
    $can_merge = array();
    foreach ($block_rules as $key => $block_rule) {
      if ($block_rule instanceof PhutilRemarkupDefaultBlockRule) {
        $can_merge[$key] = true;
      }
    }
    $last_block = null;
    $last_block_key = -1;
    // See T13487. For very large inputs, block separation can dominate
    // runtime. This is written somewhat clumsily to attempt to handle
    // very large inputs as gracefully as is practical.
    while (isset($text[$cursor])) {
      $starting_cursor = $cursor;
-      foreach ($block_rules as $block_rule) {
+      foreach ($block_rules as $block_key => $block_rule) {
        $num_lines = $block_rule->getMatchingLineCount($text, $cursor);
        if ($num_lines) {
-          if ($blocks) {
+          $current_block = array(
            $prev_block = last($blocks);
          }
          $curr_block = array(
            'start' => $cursor,
            'num_lines' => $num_lines,
            'rule' => $block_rule,
-            'is_empty' => self::isEmptyBlock($text, $cursor, $num_lines),
+            'empty' => self::isEmptyBlock($text, $cursor, $num_lines),
            'children' => array(),
            'merge' => isset($can_merge[$block_key]),
          );
-          if ($prev_block
+          $should_merge = self::shouldMergeParagraphBlocks(
-            && self::shouldMergeBlocks($text, $prev_block, $curr_block)) {
+            $text,
-            $blocks[last_key($blocks)]['num_lines'] += $curr_block['num_lines'];
+            $last_block,
-            $blocks[last_key($blocks)]['is_empty'] =
+            $current_block);
-              $blocks[last_key($blocks)]['is_empty'] && $curr_block['is_empty'];
+
          if ($should_merge) {
            $last_block['num_lines'] =
              ($last_block['num_lines'] + $current_block['num_lines']);
            $last_block['empty'] =
              ($last_block['empty'] && $current_block['empty']);
            $blocks[$last_block_key] = $last_block;
          } else {
-            $blocks[] = $curr_block;
+            $blocks[] = $current_block;
            $last_block = $current_block;
            $last_block_key++;
          }
          $cursor += $num_lines;
@ -192,9 +213,20 @@ final class PhutilRemarkupEngine extends PhutilMarkupEngine {
      }
    }
    // See T13487. It's common for blocks to be small, and this loop seems to
    // measure as faster if we manually concatenate blocks than if we
    // "array_slice()" and "implode()" blocks. This is a bit muddy.
    foreach ($blocks as $key => $block) {
-      $lines = array_slice($text, $block['start'], $block['num_lines']);
+      $min = $block['start'];
-      $blocks[$key]['text'] = implode('', $lines);
+      $max = $min + $block['num_lines'];
      $lines = '';
      for ($ii = $min; $ii < $max; $ii++) {
        $lines .= $text[$ii];
      }
      $blocks[$key]['text'] = $lines;
    }
    // Stop splitting child blocks apart if we get too deep. This arrests
@ -246,30 +278,48 @@ final class PhutilRemarkupEngine extends PhutilMarkupEngine {
    return $output;
  }
-  private static function shouldMergeBlocks($text, $prev_block, $curr_block) {
+  private static function shouldMergeParagraphBlocks(
-    $block_rules = ipull(array($prev_block, $curr_block), 'rule');
+    $text,
    $last_block,
    $current_block) {
-    $default_rule = 'PhutilRemarkupDefaultBlockRule';
+    // If we're at the beginning of the input, we can't merge.
-    try {
+    if ($last_block === null) {
-      assert_instances_of($block_rules, $default_rule);
+      return false;
    }
-      // If the last block was empty keep merging
+    // If the previous block wasn't a default block, we can't merge.
-      if ($prev_block['is_empty']) {
+    if (!$last_block['merge']) {
-        return true;
+      return false;
-      }
+    }
-      // If this line is blank keep merging
+    // If the current block isn't a default block, we can't merge.
-      if ($curr_block['is_empty']) {
+    if (!$current_block['merge']) {
-        return true;
+      return false;
-      }
+    }
-      // If the current line and the last line have content, keep merging
+    // If the last block was empty, we definitely want to merge.
-      if (strlen(trim($text[$curr_block['start'] - 1]))) {
+    if ($last_block['empty']) {
-        if (strlen(trim($text[$curr_block['start']]))) {
+      return true;
-          return true;
+    }
-        }
+
-      }
+    // If this block is empty, we definitely want to merge.
-    } catch (Exception $e) {}
+    if ($current_block['empty']) {
      return true;
    }
    // Check if the last line of the previous block or the first line of this
    // block have any non-whitespace text. If they both do, we're going to
    // merge.
    // If either of them are a blank line or a line with only whitespace, we
    // do not merge: this means we've found a paragraph break.
    $tail = $text[$current_block['start'] - 1];
    $head = $text[$current_block['start']];
    if (strlen(trim($tail)) && strlen(trim($head))) {
      return true;
    }
    return false;
  }
--- a/src/infrastructure/markup/rule/PhabricatorObjectRemarkupRule.php
+++ b/src/infrastructure/markup/rule/PhabricatorObjectRemarkupRule.php
@ -2,6 +2,9 @@
 abstract class PhabricatorObjectRemarkupRule extends PhutilRemarkupRule {
  private $referencePattern;
  private $embedPattern;
  const KEY_RULE_OBJECT = 'rule.object';
  const KEY_MENTIONED_OBJECTS = 'rule.object.mentioned';
@ -192,38 +195,48 @@ abstract class PhabricatorObjectRemarkupRule extends PhutilRemarkupRule {
  }
  private function getObjectEmbedPattern() {
-    $prefix = $this->getObjectNamePrefix();
+    if ($this->embedPattern === null) {
-    $prefix = preg_quote($prefix);
+      $prefix = $this->getObjectNamePrefix();
-    $id = $this->getObjectIDPattern();
+      $prefix = preg_quote($prefix);
      $id = $this->getObjectIDPattern();
-    return '(\B{'.$prefix.'('.$id.')([,\s](?:[^}\\\\]|\\\\.)*)?}\B)u';
+      $this->embedPattern =
        '(\B{'.$prefix.'('.$id.')([,\s](?:[^}\\\\]|\\\\.)*)?}\B)u';
    }
    return $this->embedPattern;
  }
  private function getObjectReferencePattern() {
-    $prefix = $this->getObjectNamePrefix();
+    if ($this->referencePattern === null) {
-    $prefix = preg_quote($prefix);
+      $prefix = $this->getObjectNamePrefix();
      $prefix = preg_quote($prefix);
-    $id = $this->getObjectIDPattern();
+      $id = $this->getObjectIDPattern();
-    // If the prefix starts with a word character (like "D"), we want to
+      // If the prefix starts with a word character (like "D"), we want to
-    // require a word boundary so that we don't match "XD1" as "D1". If the
+      // require a word boundary so that we don't match "XD1" as "D1". If the
-    // prefix does not start with a word character, we want to require no word
+      // prefix does not start with a word character, we want to require no word
-    // boundary for the same reasons. Test if the prefix starts with a word
+      // boundary for the same reasons. Test if the prefix starts with a word
-    // character.
+      // character.
-    if ($this->getObjectNamePrefixBeginsWithWordCharacter()) {
+      if ($this->getObjectNamePrefixBeginsWithWordCharacter()) {
-      $boundary = '\\b';
+        $boundary = '\\b';
-    } else {
+      } else {
-      $boundary = '\\B';
+        $boundary = '\\B';
      }
      // The "(?<![#@-])" prevents us from linking "#abcdef" or similar, and
      // "ABC-T1" (see T5714), and from matching "@T1" as a task (it is a user)
      // (see T9479).
      // The "\b" allows us to link "(abcdef)" or similar without linking things
      // in the middle of words.
      $this->referencePattern =
        '((?<![#@-])'.$boundary.$prefix.'('.$id.')(?:#([-\w\d]+))?(?!\w))u';
    }
-    // The "(?<![#@-])" prevents us from linking "#abcdef" or similar, and
+    return $this->referencePattern;
    // "ABC-T1" (see T5714), and from matching "@T1" as a task (it is a user)
    // (see T9479).
    // The "\b" allows us to link "(abcdef)" or similar without linking things
    // in the middle of words.
    return '((?<![#@-])'.$boundary.$prefix.'('.$id.')(?:#([-\w\d]+))?(?!\w))u';
  }