mirror of
https://we.phorge.it/source/phorge.git
synced 2024-11-26 08:42:41 +01:00
Improve the performance of tab replacement in common cases
Summary: See PHI1210. For certain large inputs, we spend more time than we need to replacing tabs with spaces. Add some fast paths: - When a line only has tabs at the beginning of the line, we don't need to do as much work parsing the rest of the line. - When a line has no unicode characters, we don't need to vectorize it to get the right result. Test Plan: - Added test coverage. - Profiled this, got a ~60x performance increase on a 36,000 line 3MB text file. Reviewers: amckinley Reviewed By: amckinley Differential Revision: https://secure.phabricator.com/D20477
This commit is contained in:
parent
c5ecc388a2
commit
6bff2cee22
3 changed files with 169 additions and 36 deletions
|
@ -662,6 +662,7 @@ phutil_register_library_map(array(
|
||||||
'DifferentialSubscribersCommitMessageField' => 'applications/differential/field/DifferentialSubscribersCommitMessageField.php',
|
'DifferentialSubscribersCommitMessageField' => 'applications/differential/field/DifferentialSubscribersCommitMessageField.php',
|
||||||
'DifferentialSummaryCommitMessageField' => 'applications/differential/field/DifferentialSummaryCommitMessageField.php',
|
'DifferentialSummaryCommitMessageField' => 'applications/differential/field/DifferentialSummaryCommitMessageField.php',
|
||||||
'DifferentialSummaryField' => 'applications/differential/customfield/DifferentialSummaryField.php',
|
'DifferentialSummaryField' => 'applications/differential/customfield/DifferentialSummaryField.php',
|
||||||
|
'DifferentialTabReplacementTestCase' => 'applications/differential/parser/__tests__/DifferentialTabReplacementTestCase.php',
|
||||||
'DifferentialTagsCommitMessageField' => 'applications/differential/field/DifferentialTagsCommitMessageField.php',
|
'DifferentialTagsCommitMessageField' => 'applications/differential/field/DifferentialTagsCommitMessageField.php',
|
||||||
'DifferentialTasksCommitMessageField' => 'applications/differential/field/DifferentialTasksCommitMessageField.php',
|
'DifferentialTasksCommitMessageField' => 'applications/differential/field/DifferentialTasksCommitMessageField.php',
|
||||||
'DifferentialTestPlanCommitMessageField' => 'applications/differential/field/DifferentialTestPlanCommitMessageField.php',
|
'DifferentialTestPlanCommitMessageField' => 'applications/differential/field/DifferentialTestPlanCommitMessageField.php',
|
||||||
|
@ -6332,6 +6333,7 @@ phutil_register_library_map(array(
|
||||||
'DifferentialSubscribersCommitMessageField' => 'DifferentialCommitMessageField',
|
'DifferentialSubscribersCommitMessageField' => 'DifferentialCommitMessageField',
|
||||||
'DifferentialSummaryCommitMessageField' => 'DifferentialCommitMessageField',
|
'DifferentialSummaryCommitMessageField' => 'DifferentialCommitMessageField',
|
||||||
'DifferentialSummaryField' => 'DifferentialCoreCustomField',
|
'DifferentialSummaryField' => 'DifferentialCoreCustomField',
|
||||||
|
'DifferentialTabReplacementTestCase' => 'PhabricatorTestCase',
|
||||||
'DifferentialTagsCommitMessageField' => 'DifferentialCommitMessageField',
|
'DifferentialTagsCommitMessageField' => 'DifferentialCommitMessageField',
|
||||||
'DifferentialTasksCommitMessageField' => 'DifferentialCommitMessageField',
|
'DifferentialTasksCommitMessageField' => 'DifferentialCommitMessageField',
|
||||||
'DifferentialTestPlanCommitMessageField' => 'DifferentialCommitMessageField',
|
'DifferentialTestPlanCommitMessageField' => 'DifferentialCommitMessageField',
|
||||||
|
|
|
@ -1456,9 +1456,10 @@ final class DifferentialChangesetParser extends Phobject {
|
||||||
|
|
||||||
$line = phutil_string_cast($line);
|
$line = phutil_string_cast($line);
|
||||||
|
|
||||||
if (strpos($line, "\t") !== false) {
|
// TODO: This should be flexible, eventually.
|
||||||
$line = $this->replaceTabsWithSpaces($line);
|
$tab_width = 2;
|
||||||
}
|
|
||||||
|
$line = self::replaceTabsWithSpaces($line, $tab_width);
|
||||||
$line = str_replace($search, $replace, $line);
|
$line = str_replace($search, $replace, $line);
|
||||||
|
|
||||||
if ($is_html) {
|
if ($is_html) {
|
||||||
|
@ -1543,13 +1544,9 @@ final class DifferentialChangesetParser extends Phobject {
|
||||||
return $rules;
|
return $rules;
|
||||||
}
|
}
|
||||||
|
|
||||||
private function replaceTabsWithSpaces($line) {
|
public static function replaceTabsWithSpaces($line, $tab_width) {
|
||||||
// TODO: This should be flexible, eventually.
|
static $tags = array();
|
||||||
$tab_width = 2;
|
if (empty($tags[$tab_width])) {
|
||||||
|
|
||||||
static $tags;
|
|
||||||
if ($tags === null) {
|
|
||||||
$tags = array();
|
|
||||||
for ($ii = 1; $ii <= $tab_width; $ii++) {
|
for ($ii = 1; $ii <= $tab_width; $ii++) {
|
||||||
$tag = phutil_tag(
|
$tag = phutil_tag(
|
||||||
'span',
|
'span',
|
||||||
|
@ -1562,22 +1559,56 @@ final class DifferentialChangesetParser extends Phobject {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// If the line is particularly long, don't try to vectorize it. Use a
|
// Expand all prefix tabs until we encounter any non-tab character. This
|
||||||
// faster approximation of the correct tabstop expansion instead. This
|
// is cheap and often immediately produces the correct result with no
|
||||||
// usually still arrives at the right result.
|
// further work (and, particularly, no need to handle any unicode cases).
|
||||||
if (strlen($line) > 256) {
|
|
||||||
return str_replace("\t", $tags[$tab_width], $line);
|
$len = strlen($line);
|
||||||
|
|
||||||
|
$head = 0;
|
||||||
|
for ($head = 0; $head < $len; $head++) {
|
||||||
|
$char = $line[$head];
|
||||||
|
if ($char !== "\t") {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($head) {
|
||||||
|
if (empty($tags[$tab_width * $head])) {
|
||||||
|
$tags[$tab_width * $head] = str_repeat($tags[$tab_width], $head);
|
||||||
|
}
|
||||||
|
$prefix = $tags[$tab_width * $head];
|
||||||
|
$line = substr($line, $head);
|
||||||
|
} else {
|
||||||
|
$prefix = '';
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we have no remaining tabs elsewhere in the string after taking care
|
||||||
|
// of all the prefix tabs, we're done.
|
||||||
|
if (strpos($line, "\t") === false) {
|
||||||
|
return $prefix.$line;
|
||||||
|
}
|
||||||
|
|
||||||
|
$len = strlen($line);
|
||||||
|
|
||||||
|
// If the line is particularly long, don't try to do anything special with
|
||||||
|
// it. Use a faster approximation of the correct tabstop expansion instead.
|
||||||
|
// This usually still arrives at the right result.
|
||||||
|
if ($len > 256) {
|
||||||
|
return $prefix.str_replace("\t", $tags[$tab_width], $line);
|
||||||
}
|
}
|
||||||
|
|
||||||
$line = phutil_utf8v_combined($line);
|
|
||||||
$in_tag = false;
|
$in_tag = false;
|
||||||
$pos = 0;
|
$pos = 0;
|
||||||
foreach ($line as $key => $char) {
|
|
||||||
if ($char === '<') {
|
|
||||||
$in_tag = true;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
// See PHI1210. If the line only has single-byte characters, we don't need
|
||||||
|
// to vectorize it and can avoid an expensive UTF8 call.
|
||||||
|
|
||||||
|
$fast_path = preg_match('/^[\x01-\x7F]*\z/', $line);
|
||||||
|
if ($fast_path) {
|
||||||
|
$replace = array();
|
||||||
|
for ($ii = 0; $ii < $len; $ii++) {
|
||||||
|
$char = $line[$ii];
|
||||||
if ($char === '>') {
|
if ($char === '>') {
|
||||||
$in_tag = false;
|
$in_tag = false;
|
||||||
continue;
|
continue;
|
||||||
|
@ -1587,6 +1618,47 @@ final class DifferentialChangesetParser extends Phobject {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ($char === '<') {
|
||||||
|
$in_tag = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($char === "\t") {
|
||||||
|
$count = $tab_width - ($pos % $tab_width);
|
||||||
|
$pos += $count;
|
||||||
|
$replace[$ii] = $tags[$count];
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
$pos++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($replace) {
|
||||||
|
// Apply replacements starting at the end of the string so they
|
||||||
|
// don't mess up the offsets for following replacements.
|
||||||
|
$replace = array_reverse($replace, true);
|
||||||
|
|
||||||
|
foreach ($replace as $replace_pos => $replacement) {
|
||||||
|
$line = substr_replace($line, $replacement, $replace_pos, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
$line = phutil_utf8v_combined($line);
|
||||||
|
foreach ($line as $key => $char) {
|
||||||
|
if ($char === '>') {
|
||||||
|
$in_tag = false;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($in_tag) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($char === '<') {
|
||||||
|
$in_tag = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
if ($char === "\t") {
|
if ($char === "\t") {
|
||||||
$count = $tab_width - ($pos % $tab_width);
|
$count = $tab_width - ($pos % $tab_width);
|
||||||
$pos += $count;
|
$pos += $count;
|
||||||
|
@ -1597,7 +1669,10 @@ final class DifferentialChangesetParser extends Phobject {
|
||||||
$pos++;
|
$pos++;
|
||||||
}
|
}
|
||||||
|
|
||||||
return implode('', $line);
|
$line = implode('', $line);
|
||||||
|
}
|
||||||
|
|
||||||
|
return $prefix.$line;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,56 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
final class DifferentialTabReplacementTestCase
|
||||||
|
extends PhabricatorTestCase {
|
||||||
|
|
||||||
|
public function testTabReplacement() {
|
||||||
|
$tab1 = "<span data-copy-text=\"\t\"> </span>";
|
||||||
|
$tab2 = "<span data-copy-text=\"\t\"> </span>";
|
||||||
|
|
||||||
|
$cat = "\xF0\x9F\x90\xB1";
|
||||||
|
|
||||||
|
$cases = array(
|
||||||
|
'' => '',
|
||||||
|
'x' => 'x',
|
||||||
|
|
||||||
|
// Tabs inside HTML tags should not be replaced.
|
||||||
|
"<\t>x" => "<\t>x",
|
||||||
|
|
||||||
|
// Normal tabs should be replaced. These are all aligned to the tab
|
||||||
|
// width, so they'll be replaced inline.
|
||||||
|
"\tx" => "{$tab2}x",
|
||||||
|
" \tx" => " {$tab2}x",
|
||||||
|
"\t x" => "{$tab2} x",
|
||||||
|
"aa\tx" => "aa{$tab2}x",
|
||||||
|
"aa \tx" => "aa {$tab2}x",
|
||||||
|
"aa\t x" => "aa{$tab2} x",
|
||||||
|
|
||||||
|
// This tab is not tabstop-aligned, so it is replaced with fewer
|
||||||
|
// spaces to bring us to the next tabstop.
|
||||||
|
" \tx" => " {$tab1}x",
|
||||||
|
|
||||||
|
// Text inside HTML tags should not count when aligning tabs with
|
||||||
|
// tabstops.
|
||||||
|
"<tag> </tag>\tx" => "<tag> </tag>{$tab1}x",
|
||||||
|
"<tag2> </tag>\tx" => "<tag2> </tag>{$tab1}x",
|
||||||
|
|
||||||
|
// The code has to take a slow path when inputs contain unicode, but
|
||||||
|
// should produce the right results and align tabs to tabstops while
|
||||||
|
// respecting UTF8 display character widths, not byte widths.
|
||||||
|
"{$cat}\tx" => "{$cat}{$tab1}x",
|
||||||
|
"{$cat}{$cat}\tx" => "{$cat}{$cat}{$tab2}x",
|
||||||
|
);
|
||||||
|
|
||||||
|
foreach ($cases as $input => $expect) {
|
||||||
|
$actual = DifferentialChangesetParser::replaceTabsWithSpaces(
|
||||||
|
$input,
|
||||||
|
2);
|
||||||
|
|
||||||
|
$this->assertEqual(
|
||||||
|
$expect,
|
||||||
|
$actual,
|
||||||
|
pht('Tabs to Spaces: %s', $input));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in a new issue