1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2025-01-26 22:48:19 +01:00

Improve the performance of tab replacement in common cases

Summary:
See PHI1210. For certain large inputs, we spend more time than we need to replacing tabs with spaces. Add some fast paths:

  - When a line only has tabs at the beginning of the line, we don't need to do as much work parsing the rest of the line.
  - When a line has no unicode characters, we don't need to vectorize it to get the right result.

Test Plan:
  - Added test coverage.
  - Profiled this, got a ~60x performance increase on a 36,000 line 3MB text file.

Reviewers: amckinley

Reviewed By: amckinley

Differential Revision: https://secure.phabricator.com/D20477
This commit is contained in:
epriestley 2019-04-23 20:26:13 -07:00
parent c5ecc388a2
commit 6bff2cee22
3 changed files with 169 additions and 36 deletions

View file

@ -662,6 +662,7 @@ phutil_register_library_map(array(
'DifferentialSubscribersCommitMessageField' => 'applications/differential/field/DifferentialSubscribersCommitMessageField.php',
'DifferentialSummaryCommitMessageField' => 'applications/differential/field/DifferentialSummaryCommitMessageField.php',
'DifferentialSummaryField' => 'applications/differential/customfield/DifferentialSummaryField.php',
'DifferentialTabReplacementTestCase' => 'applications/differential/parser/__tests__/DifferentialTabReplacementTestCase.php',
'DifferentialTagsCommitMessageField' => 'applications/differential/field/DifferentialTagsCommitMessageField.php',
'DifferentialTasksCommitMessageField' => 'applications/differential/field/DifferentialTasksCommitMessageField.php',
'DifferentialTestPlanCommitMessageField' => 'applications/differential/field/DifferentialTestPlanCommitMessageField.php',
@ -6332,6 +6333,7 @@ phutil_register_library_map(array(
'DifferentialSubscribersCommitMessageField' => 'DifferentialCommitMessageField',
'DifferentialSummaryCommitMessageField' => 'DifferentialCommitMessageField',
'DifferentialSummaryField' => 'DifferentialCoreCustomField',
'DifferentialTabReplacementTestCase' => 'PhabricatorTestCase',
'DifferentialTagsCommitMessageField' => 'DifferentialCommitMessageField',
'DifferentialTasksCommitMessageField' => 'DifferentialCommitMessageField',
'DifferentialTestPlanCommitMessageField' => 'DifferentialCommitMessageField',

View file

@ -1456,9 +1456,10 @@ final class DifferentialChangesetParser extends Phobject {
$line = phutil_string_cast($line);
if (strpos($line, "\t") !== false) {
$line = $this->replaceTabsWithSpaces($line);
}
// TODO: This should be flexible, eventually.
$tab_width = 2;
$line = self::replaceTabsWithSpaces($line, $tab_width);
$line = str_replace($search, $replace, $line);
if ($is_html) {
@ -1543,13 +1544,9 @@ final class DifferentialChangesetParser extends Phobject {
return $rules;
}
private function replaceTabsWithSpaces($line) {
// TODO: This should be flexible, eventually.
$tab_width = 2;
static $tags;
if ($tags === null) {
$tags = array();
public static function replaceTabsWithSpaces($line, $tab_width) {
static $tags = array();
if (empty($tags[$tab_width])) {
for ($ii = 1; $ii <= $tab_width; $ii++) {
$tag = phutil_tag(
'span',
@ -1562,42 +1559,120 @@ final class DifferentialChangesetParser extends Phobject {
}
}
// If the line is particularly long, don't try to vectorize it. Use a
// faster approximation of the correct tabstop expansion instead. This
// usually still arrives at the right result.
if (strlen($line) > 256) {
return str_replace("\t", $tags[$tab_width], $line);
// Expand all prefix tabs until we encounter any non-tab character. This
// is cheap and often immediately produces the correct result with no
// further work (and, particularly, no need to handle any unicode cases).
$len = strlen($line);
$head = 0;
for ($head = 0; $head < $len; $head++) {
$char = $line[$head];
if ($char !== "\t") {
break;
}
}
if ($head) {
if (empty($tags[$tab_width * $head])) {
$tags[$tab_width * $head] = str_repeat($tags[$tab_width], $head);
}
$prefix = $tags[$tab_width * $head];
$line = substr($line, $head);
} else {
$prefix = '';
}
// If we have no remaining tabs elsewhere in the string after taking care
// of all the prefix tabs, we're done.
if (strpos($line, "\t") === false) {
return $prefix.$line;
}
$len = strlen($line);
// If the line is particularly long, don't try to do anything special with
// it. Use a faster approximation of the correct tabstop expansion instead.
// This usually still arrives at the right result.
if ($len > 256) {
return $prefix.str_replace("\t", $tags[$tab_width], $line);
}
$line = phutil_utf8v_combined($line);
$in_tag = false;
$pos = 0;
foreach ($line as $key => $char) {
if ($char === '<') {
$in_tag = true;
continue;
// See PHI1210. If the line only has single-byte characters, we don't need
// to vectorize it and can avoid an expensive UTF8 call.
$fast_path = preg_match('/^[\x01-\x7F]*\z/', $line);
if ($fast_path) {
$replace = array();
for ($ii = 0; $ii < $len; $ii++) {
$char = $line[$ii];
if ($char === '>') {
$in_tag = false;
continue;
}
if ($in_tag) {
continue;
}
if ($char === '<') {
$in_tag = true;
continue;
}
if ($char === "\t") {
$count = $tab_width - ($pos % $tab_width);
$pos += $count;
$replace[$ii] = $tags[$count];
continue;
}
$pos++;
}
if ($char === '>') {
$in_tag = false;
continue;
if ($replace) {
// Apply replacements starting at the end of the string so they
// don't mess up the offsets for following replacements.
$replace = array_reverse($replace, true);
foreach ($replace as $replace_pos => $replacement) {
$line = substr_replace($line, $replacement, $replace_pos, 1);
}
}
} else {
$line = phutil_utf8v_combined($line);
foreach ($line as $key => $char) {
if ($char === '>') {
$in_tag = false;
continue;
}
if ($in_tag) {
continue;
}
if ($char === '<') {
$in_tag = true;
continue;
}
if ($char === "\t") {
$count = $tab_width - ($pos % $tab_width);
$pos += $count;
$line[$key] = $tags[$count];
continue;
}
$pos++;
}
if ($in_tag) {
continue;
}
if ($char === "\t") {
$count = $tab_width - ($pos % $tab_width);
$pos += $count;
$line[$key] = $tags[$count];
continue;
}
$pos++;
$line = implode('', $line);
}
return implode('', $line);
return $prefix.$line;
}
}

View file

@ -0,0 +1,56 @@
<?php
final class DifferentialTabReplacementTestCase
extends PhabricatorTestCase {
public function testTabReplacement() {
$tab1 = "<span data-copy-text=\"\t\"> </span>";
$tab2 = "<span data-copy-text=\"\t\"> </span>";
$cat = "\xF0\x9F\x90\xB1";
$cases = array(
'' => '',
'x' => 'x',
// Tabs inside HTML tags should not be replaced.
"<\t>x" => "<\t>x",
// Normal tabs should be replaced. These are all aligned to the tab
// width, so they'll be replaced inline.
"\tx" => "{$tab2}x",
" \tx" => " {$tab2}x",
"\t x" => "{$tab2} x",
"aa\tx" => "aa{$tab2}x",
"aa \tx" => "aa {$tab2}x",
"aa\t x" => "aa{$tab2} x",
// This tab is not tabstop-aligned, so it is replaced with fewer
// spaces to bring us to the next tabstop.
" \tx" => " {$tab1}x",
// Text inside HTML tags should not count when aligning tabs with
// tabstops.
"<tag> </tag>\tx" => "<tag> </tag>{$tab1}x",
"<tag2> </tag>\tx" => "<tag2> </tag>{$tab1}x",
// The code has to take a slow path when inputs contain unicode, but
// should produce the right results and align tabs to tabstops while
// respecting UTF8 display character widths, not byte widths.
"{$cat}\tx" => "{$cat}{$tab1}x",
"{$cat}{$cat}\tx" => "{$cat}{$cat}{$tab2}x",
);
foreach ($cases as $input => $expect) {
$actual = DifferentialChangesetParser::replaceTabsWithSpaces(
$input,
2);
$this->assertEqual(
$expect,
$actual,
pht('Tabs to Spaces: %s', $input));
}
}
}