Improve the performance of tab replacement in common cases

Summary: See PHI1210. For certain large inputs, we spend more time than we need to replacing tabs with spaces. Add some fast paths: - When a line only has tabs at the beginning of the line, we don't need to do as much work parsing the rest of the line. - When a line has no unicode characters, we don't need to vectorize it to get the right result. Test Plan: - Added test coverage. - Profiled this, got a ~60x performance increase on a 36,000 line 3MB text file. Reviewers: amckinley Reviewed By: amckinley Differential Revision: https://secure.phabricator.com/D20477
2025-02-26 21:49:08 +01:00 · 2019-04-23 20:26:13 -07:00 · 2019-04-23 20:26:13 -07:00 · 6bff2cee22
commit 6bff2cee22
parent c5ecc388a2
3 changed files with 169 additions and 36 deletions
--- a/src/__phutil_library_map__.php
+++ b/src/__phutil_library_map__.php
@ -662,6 +662,7 @@ phutil_register_library_map(array(
    'DifferentialSubscribersCommitMessageField' => 'applications/differential/field/DifferentialSubscribersCommitMessageField.php',
    'DifferentialSummaryCommitMessageField' => 'applications/differential/field/DifferentialSummaryCommitMessageField.php',
    'DifferentialSummaryField' => 'applications/differential/customfield/DifferentialSummaryField.php',
+    'DifferentialTabReplacementTestCase' => 'applications/differential/parser/__tests__/DifferentialTabReplacementTestCase.php',
    'DifferentialTagsCommitMessageField' => 'applications/differential/field/DifferentialTagsCommitMessageField.php',
    'DifferentialTasksCommitMessageField' => 'applications/differential/field/DifferentialTasksCommitMessageField.php',
    'DifferentialTestPlanCommitMessageField' => 'applications/differential/field/DifferentialTestPlanCommitMessageField.php',
@ -6332,6 +6333,7 @@ phutil_register_library_map(array(
    'DifferentialSubscribersCommitMessageField' => 'DifferentialCommitMessageField',
    'DifferentialSummaryCommitMessageField' => 'DifferentialCommitMessageField',
    'DifferentialSummaryField' => 'DifferentialCoreCustomField',
+    'DifferentialTabReplacementTestCase' => 'PhabricatorTestCase',
    'DifferentialTagsCommitMessageField' => 'DifferentialCommitMessageField',
    'DifferentialTasksCommitMessageField' => 'DifferentialCommitMessageField',
    'DifferentialTestPlanCommitMessageField' => 'DifferentialCommitMessageField',
--- a/src/applications/differential/parser/DifferentialChangesetParser.php
+++ b/src/applications/differential/parser/DifferentialChangesetParser.php
@ -1456,9 +1456,10 @@ final class DifferentialChangesetParser extends Phobject {

    $line = phutil_string_cast($line);

-    if (strpos($line, "\t") !== false) {
-      $line = $this->replaceTabsWithSpaces($line);
-    }
+    // TODO: This should be flexible, eventually.
+    $tab_width = 2;
+
+    $line = self::replaceTabsWithSpaces($line, $tab_width);
    $line = str_replace($search, $replace, $line);

    if ($is_html) {
@ -1543,13 +1544,9 @@ final class DifferentialChangesetParser extends Phobject {
    return $rules;
  }

-  private function replaceTabsWithSpaces($line) {
-    // TODO: This should be flexible, eventually.
-    $tab_width = 2;
-
-    static $tags;
-    if ($tags === null) {
-      $tags = array();
+  public static function replaceTabsWithSpaces($line, $tab_width) {
+    static $tags = array();
+    if (empty($tags[$tab_width])) {
      for ($ii = 1; $ii <= $tab_width; $ii++) {
        $tag = phutil_tag(
          'span',
@ -1562,42 +1559,120 @@ final class DifferentialChangesetParser extends Phobject {
      }
    }

-    // If the line is particularly long, don't try to vectorize it. Use a
-    // faster approximation of the correct tabstop expansion instead. This
-    // usually still arrives at the right result.
-    if (strlen($line) > 256) {
-      return str_replace("\t", $tags[$tab_width], $line);
+    // Expand all prefix tabs until we encounter any non-tab character. This
+    // is cheap and often immediately produces the correct result with no
+    // further work (and, particularly, no need to handle any unicode cases).
+
+    $len = strlen($line);
+
+    $head = 0;
+    for ($head = 0; $head < $len; $head++) {
+      $char = $line[$head];
+      if ($char !== "\t") {
+        break;
+      }
+    }
+
+    if ($head) {
+      if (empty($tags[$tab_width * $head])) {
+        $tags[$tab_width * $head] = str_repeat($tags[$tab_width], $head);
+      }
+      $prefix = $tags[$tab_width * $head];
+      $line = substr($line, $head);
+    } else {
+      $prefix = '';
+    }
+
+    // If we have no remaining tabs elsewhere in the string after taking care
+    // of all the prefix tabs, we're done.
+    if (strpos($line, "\t") === false) {
+      return $prefix.$line;
+    }
+
+    $len = strlen($line);
+
+    // If the line is particularly long, don't try to do anything special with
+    // it. Use a faster approximation of the correct tabstop expansion instead.
+    // This usually still arrives at the right result.
+    if ($len > 256) {
+      return $prefix.str_replace("\t", $tags[$tab_width], $line);
    }

-    $line = phutil_utf8v_combined($line);
    $in_tag = false;
    $pos = 0;
-    foreach ($line as $key => $char) {
-      if ($char === '<') {
-        $in_tag = true;
-        continue;
+
+    // See PHI1210. If the line only has single-byte characters, we don't need
+    // to vectorize it and can avoid an expensive UTF8 call.
+
+    $fast_path = preg_match('/^[\x01-\x7F]*\z/', $line);
+    if ($fast_path) {
+      $replace = array();
+      for ($ii = 0; $ii < $len; $ii++) {
+        $char = $line[$ii];
+        if ($char === '>') {
+          $in_tag = false;
+          continue;
+        }
+
+        if ($in_tag) {
+          continue;
+        }
+
+        if ($char === '<') {
+          $in_tag = true;
+          continue;
+        }
+
+        if ($char === "\t") {
+          $count = $tab_width - ($pos % $tab_width);
+          $pos += $count;
+          $replace[$ii] = $tags[$count];
+          continue;
+        }
+
+        $pos++;
      }

-      if ($char === '>') {
-        $in_tag = false;
-        continue;
+      if ($replace) {
+        // Apply replacements starting at the end of the string so they
+        // don't mess up the offsets for following replacements.
+        $replace = array_reverse($replace, true);
+
+        foreach ($replace as $replace_pos => $replacement) {
+          $line = substr_replace($line, $replacement, $replace_pos, 1);
+        }
+      }
+    } else {
+      $line = phutil_utf8v_combined($line);
+      foreach ($line as $key => $char) {
+        if ($char === '>') {
+          $in_tag = false;
+          continue;
+        }
+
+        if ($in_tag) {
+          continue;
+        }
+
+        if ($char === '<') {
+          $in_tag = true;
+          continue;
+        }
+
+        if ($char === "\t") {
+          $count = $tab_width - ($pos % $tab_width);
+          $pos += $count;
+          $line[$key] = $tags[$count];
+          continue;
+        }
+
+        $pos++;
      }

-      if ($in_tag) {
-        continue;
-      }
-
-      if ($char === "\t") {
-        $count = $tab_width - ($pos % $tab_width);
-        $pos += $count;
-        $line[$key] = $tags[$count];
-        continue;
-      }
-
-      $pos++;
+      $line = implode('', $line);
    }

-    return implode('', $line);
+    return $prefix.$line;
  }

 }
--- a/src/applications/differential/parser/tests/DifferentialTabReplacementTestCase.php
+++ b/src/applications/differential/parser/tests/DifferentialTabReplacementTestCase.php
@ -0,0 +1,56 @@
+<?php
+
+final class DifferentialTabReplacementTestCase
+  extends PhabricatorTestCase {
+
+  public function testTabReplacement() {
+    $tab1 = "<span data-copy-text=\"\t\"> </span>";
+    $tab2 = "<span data-copy-text=\"\t\">  </span>";
+
+    $cat = "\xF0\x9F\x90\xB1";
+
+    $cases = array(
+      '' => '',
+      'x' => 'x',
+
+      // Tabs inside HTML tags should not be replaced.
+      "<\t>x" => "<\t>x",
+
+      // Normal tabs should be replaced. These are all aligned to the tab
+      // width, so they'll be replaced inline.
+      "\tx" => "{$tab2}x",
+      "  \tx" => "  {$tab2}x",
+      "\t x" => "{$tab2} x",
+      "aa\tx" => "aa{$tab2}x",
+      "aa  \tx" => "aa  {$tab2}x",
+      "aa\t x" => "aa{$tab2} x",
+
+      // This tab is not tabstop-aligned, so it is replaced with fewer
+      // spaces to bring us to the next tabstop.
+      " \tx" => " {$tab1}x",
+
+      // Text inside HTML tags should not count when aligning tabs with
+      // tabstops.
+      "<tag> </tag>\tx" => "<tag> </tag>{$tab1}x",
+      "<tag2> </tag>\tx" => "<tag2> </tag>{$tab1}x",
+
+      // The code has to take a slow path when inputs contain unicode, but
+      // should produce the right results and align tabs to tabstops while
+      // respecting UTF8 display character widths, not byte widths.
+      "{$cat}\tx" => "{$cat}{$tab1}x",
+      "{$cat}{$cat}\tx" => "{$cat}{$cat}{$tab2}x",
+    );
+
+    foreach ($cases as $input => $expect) {
+      $actual = DifferentialChangesetParser::replaceTabsWithSpaces(
+        $input,
+        2);
+
+      $this->assertEqual(
+        $expect,
+        $actual,
+        pht('Tabs to Spaces: %s', $input));
+    }
+  }
+
+}