Improve Remarkup parsing performance for certain large input blocks

Summary: Fixes T13487. In PHI1628, an install has a 4MB remarkup corpus which takes a long time to render. This is broadly expected, but a few reasonable improvements fell out of running it through the profiler. Test Plan: - Saw local cold-cache end-to-end rendering time drop from 12s to 4s for the highly secret input corpus. - Verified output has the same hashes before/after. - Ran all remarkup unit tests. Maniphest Tasks: T13487 Differential Revision: https://secure.phabricator.com/D20968
2024-11-22 14:52:41 +01:00 · 2020-02-04 14:19:04 -08:00 · 2020-02-04 14:19:04 -08:00 · fdbe9ba149
commit fdbe9ba149
parent 0e82bd024a
3 changed files with 143 additions and 74 deletions
--- a/src/infrastructure/markup/blockrule/PhutilRemarkupNoteBlockRule.php
+++ b/src/infrastructure/markup/blockrule/PhutilRemarkupNoteBlockRule.php
@ -100,22 +100,28 @@ final class PhutilRemarkupNoteBlockRule extends PhutilRemarkupBlockRule {
  }

  private function getRegEx() {
-    $words = array(
-      'NOTE',
-      'IMPORTANT',
-      'WARNING',
-    );
+    static $regex;

-    foreach ($words as $k => $word) {
-      $words[$k] = preg_quote($word, '/');
+    if ($regex === null) {
+      $words = array(
+        'NOTE',
+        'IMPORTANT',
+        'WARNING',
+      );
+
+      foreach ($words as $k => $word) {
+        $words[$k] = preg_quote($word, '/');
+      }
+      $words = implode('|', $words);
+
+      $regex =
+        '/^(?:'.
+        '(?:\((?P<hideword>'.$words.')\))'.
+        '|'.
+        '(?:(?P<showword>'.$words.'):))\s*'.
+        '/';
    }
-    $words = implode('|', $words);

-    return
-      '/^(?:'.
-      '(?:\((?P<hideword>'.$words.')\))'.
-      '|'.
-      '(?:(?P<showword>'.$words.'):))\s*'.
-      '/';
+    return $regex;
  }
 }
--- a/src/infrastructure/markup/remarkup/PhutilRemarkupEngine.php
+++ b/src/infrastructure/markup/remarkup/PhutilRemarkupEngine.php
@ -153,33 +153,54 @@ final class PhutilRemarkupEngine extends PhutilMarkupEngine {
    $block_rules = $this->blockRules;
    $blocks = array();
    $cursor = 0;
-    $prev_block = array();
+
+    $can_merge = array();
+    foreach ($block_rules as $key => $block_rule) {
+      if ($block_rule instanceof PhutilRemarkupDefaultBlockRule) {
+        $can_merge[$key] = true;
+      }
+    }
+
+    $last_block = null;
+    $last_block_key = -1;
+
+    // See T13487. For very large inputs, block separation can dominate
+    // runtime. This is written somewhat clumsily to attempt to handle
+    // very large inputs as gracefully as is practical.

    while (isset($text[$cursor])) {
      $starting_cursor = $cursor;
-      foreach ($block_rules as $block_rule) {
+      foreach ($block_rules as $block_key => $block_rule) {
        $num_lines = $block_rule->getMatchingLineCount($text, $cursor);

        if ($num_lines) {
-          if ($blocks) {
-            $prev_block = last($blocks);
-          }
-
-          $curr_block = array(
+          $current_block = array(
            'start' => $cursor,
            'num_lines' => $num_lines,
            'rule' => $block_rule,
-            'is_empty' => self::isEmptyBlock($text, $cursor, $num_lines),
+            'empty' => self::isEmptyBlock($text, $cursor, $num_lines),
            'children' => array(),
+            'merge' => isset($can_merge[$block_key]),
          );

-          if ($prev_block
-            && self::shouldMergeBlocks($text, $prev_block, $curr_block)) {
-            $blocks[last_key($blocks)]['num_lines'] += $curr_block['num_lines'];
-            $blocks[last_key($blocks)]['is_empty'] =
-              $blocks[last_key($blocks)]['is_empty'] && $curr_block['is_empty'];
+          $should_merge = self::shouldMergeParagraphBlocks(
+            $text,
+            $last_block,
+            $current_block);
+
+          if ($should_merge) {
+            $last_block['num_lines'] =
+              ($last_block['num_lines'] + $current_block['num_lines']);
+
+            $last_block['empty'] =
+              ($last_block['empty'] && $current_block['empty']);
+
+            $blocks[$last_block_key] = $last_block;
          } else {
-            $blocks[] = $curr_block;
+            $blocks[] = $current_block;
+
+            $last_block = $current_block;
+            $last_block_key++;
          }

          $cursor += $num_lines;
@ -192,9 +213,20 @@ final class PhutilRemarkupEngine extends PhutilMarkupEngine {
      }
    }

+    // See T13487. It's common for blocks to be small, and this loop seems to
+    // measure as faster if we manually concatenate blocks than if we
+    // "array_slice()" and "implode()" blocks. This is a bit muddy.
+
    foreach ($blocks as $key => $block) {
-      $lines = array_slice($text, $block['start'], $block['num_lines']);
-      $blocks[$key]['text'] = implode('', $lines);
+      $min = $block['start'];
+      $max = $min + $block['num_lines'];
+
+      $lines = '';
+      for ($ii = $min; $ii < $max; $ii++) {
+        $lines .= $text[$ii];
+      }
+
+      $blocks[$key]['text'] = $lines;
    }

    // Stop splitting child blocks apart if we get too deep. This arrests
@ -246,30 +278,48 @@ final class PhutilRemarkupEngine extends PhutilMarkupEngine {
    return $output;
  }

-  private static function shouldMergeBlocks($text, $prev_block, $curr_block) {
-    $block_rules = ipull(array($prev_block, $curr_block), 'rule');
+  private static function shouldMergeParagraphBlocks(
+    $text,
+    $last_block,
+    $current_block) {

-    $default_rule = 'PhutilRemarkupDefaultBlockRule';
-    try {
-      assert_instances_of($block_rules, $default_rule);
+    // If we're at the beginning of the input, we can't merge.
+    if ($last_block === null) {
+      return false;
+    }

-      // If the last block was empty keep merging
-      if ($prev_block['is_empty']) {
-        return true;
-      }
+    // If the previous block wasn't a default block, we can't merge.
+    if (!$last_block['merge']) {
+      return false;
+    }

-      // If this line is blank keep merging
-      if ($curr_block['is_empty']) {
-        return true;
-      }
+    // If the current block isn't a default block, we can't merge.
+    if (!$current_block['merge']) {
+      return false;
+    }

-      // If the current line and the last line have content, keep merging
-      if (strlen(trim($text[$curr_block['start'] - 1]))) {
-        if (strlen(trim($text[$curr_block['start']]))) {
-          return true;
-        }
-      }
-    } catch (Exception $e) {}
+    // If the last block was empty, we definitely want to merge.
+    if ($last_block['empty']) {
+      return true;
+    }
+
+    // If this block is empty, we definitely want to merge.
+    if ($current_block['empty']) {
+      return true;
+    }
+
+    // Check if the last line of the previous block or the first line of this
+    // block have any non-whitespace text. If they both do, we're going to
+    // merge.
+
+    // If either of them are a blank line or a line with only whitespace, we
+    // do not merge: this means we've found a paragraph break.
+
+    $tail = $text[$current_block['start'] - 1];
+    $head = $text[$current_block['start']];
+    if (strlen(trim($tail)) && strlen(trim($head))) {
+      return true;
+    }

    return false;
  }
--- a/src/infrastructure/markup/rule/PhabricatorObjectRemarkupRule.php
+++ b/src/infrastructure/markup/rule/PhabricatorObjectRemarkupRule.php
@ -2,6 +2,9 @@

 abstract class PhabricatorObjectRemarkupRule extends PhutilRemarkupRule {

+  private $referencePattern;
+  private $embedPattern;
+
  const KEY_RULE_OBJECT = 'rule.object';
  const KEY_MENTIONED_OBJECTS = 'rule.object.mentioned';

@ -192,38 +195,48 @@ abstract class PhabricatorObjectRemarkupRule extends PhutilRemarkupRule {
  }

  private function getObjectEmbedPattern() {
-    $prefix = $this->getObjectNamePrefix();
-    $prefix = preg_quote($prefix);
-    $id = $this->getObjectIDPattern();
+    if ($this->embedPattern === null) {
+      $prefix = $this->getObjectNamePrefix();
+      $prefix = preg_quote($prefix);
+      $id = $this->getObjectIDPattern();

-    return '(\B{'.$prefix.'('.$id.')([,\s](?:[^}\\\\]|\\\\.)*)?}\B)u';
+      $this->embedPattern =
+        '(\B{'.$prefix.'('.$id.')([,\s](?:[^}\\\\]|\\\\.)*)?}\B)u';
+    }
+
+    return $this->embedPattern;
  }

  private function getObjectReferencePattern() {
-    $prefix = $this->getObjectNamePrefix();
-    $prefix = preg_quote($prefix);
+    if ($this->referencePattern === null) {
+      $prefix = $this->getObjectNamePrefix();
+      $prefix = preg_quote($prefix);

-    $id = $this->getObjectIDPattern();
+      $id = $this->getObjectIDPattern();

-    // If the prefix starts with a word character (like "D"), we want to
-    // require a word boundary so that we don't match "XD1" as "D1". If the
-    // prefix does not start with a word character, we want to require no word
-    // boundary for the same reasons. Test if the prefix starts with a word
-    // character.
-    if ($this->getObjectNamePrefixBeginsWithWordCharacter()) {
-      $boundary = '\\b';
-    } else {
-      $boundary = '\\B';
+      // If the prefix starts with a word character (like "D"), we want to
+      // require a word boundary so that we don't match "XD1" as "D1". If the
+      // prefix does not start with a word character, we want to require no word
+      // boundary for the same reasons. Test if the prefix starts with a word
+      // character.
+      if ($this->getObjectNamePrefixBeginsWithWordCharacter()) {
+        $boundary = '\\b';
+      } else {
+        $boundary = '\\B';
+      }
+
+      // The "(?<![#@-])" prevents us from linking "#abcdef" or similar, and
+      // "ABC-T1" (see T5714), and from matching "@T1" as a task (it is a user)
+      // (see T9479).
+
+      // The "\b" allows us to link "(abcdef)" or similar without linking things
+      // in the middle of words.
+
+      $this->referencePattern =
+        '((?<![#@-])'.$boundary.$prefix.'('.$id.')(?:#([-\w\d]+))?(?!\w))u';
    }

-    // The "(?<![#@-])" prevents us from linking "#abcdef" or similar, and
-    // "ABC-T1" (see T5714), and from matching "@T1" as a task (it is a user)
-    // (see T9479).
-
-    // The "\b" allows us to link "(abcdef)" or similar without linking things
-    // in the middle of words.
-
-    return '((?<![#@-])'.$boundary.$prefix.'('.$id.')(?:#([-\w\d]+))?(?!\w))u';
+    return $this->referencePattern;
  }