mirror of
https://we.phorge.it/source/phorge.git
synced 2024-11-26 00:32:42 +01:00
Improve the performance of tab replacement in common cases
Summary: See PHI1210. For certain large inputs, we spend more time than we need to replacing tabs with spaces. Add some fast paths: - When a line only has tabs at the beginning of the line, we don't need to do as much work parsing the rest of the line. - When a line has no unicode characters, we don't need to vectorize it to get the right result. Test Plan: - Added test coverage. - Profiled this, got a ~60x performance increase on a 36,000 line 3MB text file. Reviewers: amckinley Reviewed By: amckinley Differential Revision: https://secure.phabricator.com/D20477
This commit is contained in:
parent
c5ecc388a2
commit
6bff2cee22
3 changed files with 169 additions and 36 deletions
|
@ -662,6 +662,7 @@ phutil_register_library_map(array(
|
|||
'DifferentialSubscribersCommitMessageField' => 'applications/differential/field/DifferentialSubscribersCommitMessageField.php',
|
||||
'DifferentialSummaryCommitMessageField' => 'applications/differential/field/DifferentialSummaryCommitMessageField.php',
|
||||
'DifferentialSummaryField' => 'applications/differential/customfield/DifferentialSummaryField.php',
|
||||
'DifferentialTabReplacementTestCase' => 'applications/differential/parser/__tests__/DifferentialTabReplacementTestCase.php',
|
||||
'DifferentialTagsCommitMessageField' => 'applications/differential/field/DifferentialTagsCommitMessageField.php',
|
||||
'DifferentialTasksCommitMessageField' => 'applications/differential/field/DifferentialTasksCommitMessageField.php',
|
||||
'DifferentialTestPlanCommitMessageField' => 'applications/differential/field/DifferentialTestPlanCommitMessageField.php',
|
||||
|
@ -6332,6 +6333,7 @@ phutil_register_library_map(array(
|
|||
'DifferentialSubscribersCommitMessageField' => 'DifferentialCommitMessageField',
|
||||
'DifferentialSummaryCommitMessageField' => 'DifferentialCommitMessageField',
|
||||
'DifferentialSummaryField' => 'DifferentialCoreCustomField',
|
||||
'DifferentialTabReplacementTestCase' => 'PhabricatorTestCase',
|
||||
'DifferentialTagsCommitMessageField' => 'DifferentialCommitMessageField',
|
||||
'DifferentialTasksCommitMessageField' => 'DifferentialCommitMessageField',
|
||||
'DifferentialTestPlanCommitMessageField' => 'DifferentialCommitMessageField',
|
||||
|
|
|
@ -1456,9 +1456,10 @@ final class DifferentialChangesetParser extends Phobject {
|
|||
|
||||
$line = phutil_string_cast($line);
|
||||
|
||||
if (strpos($line, "\t") !== false) {
|
||||
$line = $this->replaceTabsWithSpaces($line);
|
||||
}
|
||||
// TODO: This should be flexible, eventually.
|
||||
$tab_width = 2;
|
||||
|
||||
$line = self::replaceTabsWithSpaces($line, $tab_width);
|
||||
$line = str_replace($search, $replace, $line);
|
||||
|
||||
if ($is_html) {
|
||||
|
@ -1543,13 +1544,9 @@ final class DifferentialChangesetParser extends Phobject {
|
|||
return $rules;
|
||||
}
|
||||
|
||||
private function replaceTabsWithSpaces($line) {
|
||||
// TODO: This should be flexible, eventually.
|
||||
$tab_width = 2;
|
||||
|
||||
static $tags;
|
||||
if ($tags === null) {
|
||||
$tags = array();
|
||||
public static function replaceTabsWithSpaces($line, $tab_width) {
|
||||
static $tags = array();
|
||||
if (empty($tags[$tab_width])) {
|
||||
for ($ii = 1; $ii <= $tab_width; $ii++) {
|
||||
$tag = phutil_tag(
|
||||
'span',
|
||||
|
@ -1562,42 +1559,120 @@ final class DifferentialChangesetParser extends Phobject {
|
|||
}
|
||||
}
|
||||
|
||||
// If the line is particularly long, don't try to vectorize it. Use a
|
||||
// faster approximation of the correct tabstop expansion instead. This
|
||||
// usually still arrives at the right result.
|
||||
if (strlen($line) > 256) {
|
||||
return str_replace("\t", $tags[$tab_width], $line);
|
||||
// Expand all prefix tabs until we encounter any non-tab character. This
|
||||
// is cheap and often immediately produces the correct result with no
|
||||
// further work (and, particularly, no need to handle any unicode cases).
|
||||
|
||||
$len = strlen($line);
|
||||
|
||||
$head = 0;
|
||||
for ($head = 0; $head < $len; $head++) {
|
||||
$char = $line[$head];
|
||||
if ($char !== "\t") {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if ($head) {
|
||||
if (empty($tags[$tab_width * $head])) {
|
||||
$tags[$tab_width * $head] = str_repeat($tags[$tab_width], $head);
|
||||
}
|
||||
$prefix = $tags[$tab_width * $head];
|
||||
$line = substr($line, $head);
|
||||
} else {
|
||||
$prefix = '';
|
||||
}
|
||||
|
||||
// If we have no remaining tabs elsewhere in the string after taking care
|
||||
// of all the prefix tabs, we're done.
|
||||
if (strpos($line, "\t") === false) {
|
||||
return $prefix.$line;
|
||||
}
|
||||
|
||||
$len = strlen($line);
|
||||
|
||||
// If the line is particularly long, don't try to do anything special with
|
||||
// it. Use a faster approximation of the correct tabstop expansion instead.
|
||||
// This usually still arrives at the right result.
|
||||
if ($len > 256) {
|
||||
return $prefix.str_replace("\t", $tags[$tab_width], $line);
|
||||
}
|
||||
|
||||
$line = phutil_utf8v_combined($line);
|
||||
$in_tag = false;
|
||||
$pos = 0;
|
||||
foreach ($line as $key => $char) {
|
||||
if ($char === '<') {
|
||||
$in_tag = true;
|
||||
continue;
|
||||
|
||||
// See PHI1210. If the line only has single-byte characters, we don't need
|
||||
// to vectorize it and can avoid an expensive UTF8 call.
|
||||
|
||||
$fast_path = preg_match('/^[\x01-\x7F]*\z/', $line);
|
||||
if ($fast_path) {
|
||||
$replace = array();
|
||||
for ($ii = 0; $ii < $len; $ii++) {
|
||||
$char = $line[$ii];
|
||||
if ($char === '>') {
|
||||
$in_tag = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ($in_tag) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ($char === '<') {
|
||||
$in_tag = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ($char === "\t") {
|
||||
$count = $tab_width - ($pos % $tab_width);
|
||||
$pos += $count;
|
||||
$replace[$ii] = $tags[$count];
|
||||
continue;
|
||||
}
|
||||
|
||||
$pos++;
|
||||
}
|
||||
|
||||
if ($char === '>') {
|
||||
$in_tag = false;
|
||||
continue;
|
||||
if ($replace) {
|
||||
// Apply replacements starting at the end of the string so they
|
||||
// don't mess up the offsets for following replacements.
|
||||
$replace = array_reverse($replace, true);
|
||||
|
||||
foreach ($replace as $replace_pos => $replacement) {
|
||||
$line = substr_replace($line, $replacement, $replace_pos, 1);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
$line = phutil_utf8v_combined($line);
|
||||
foreach ($line as $key => $char) {
|
||||
if ($char === '>') {
|
||||
$in_tag = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ($in_tag) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ($char === '<') {
|
||||
$in_tag = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ($char === "\t") {
|
||||
$count = $tab_width - ($pos % $tab_width);
|
||||
$pos += $count;
|
||||
$line[$key] = $tags[$count];
|
||||
continue;
|
||||
}
|
||||
|
||||
$pos++;
|
||||
}
|
||||
|
||||
if ($in_tag) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ($char === "\t") {
|
||||
$count = $tab_width - ($pos % $tab_width);
|
||||
$pos += $count;
|
||||
$line[$key] = $tags[$count];
|
||||
continue;
|
||||
}
|
||||
|
||||
$pos++;
|
||||
$line = implode('', $line);
|
||||
}
|
||||
|
||||
return implode('', $line);
|
||||
return $prefix.$line;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,56 @@
|
|||
<?php
|
||||
|
||||
final class DifferentialTabReplacementTestCase
|
||||
extends PhabricatorTestCase {
|
||||
|
||||
public function testTabReplacement() {
|
||||
$tab1 = "<span data-copy-text=\"\t\"> </span>";
|
||||
$tab2 = "<span data-copy-text=\"\t\"> </span>";
|
||||
|
||||
$cat = "\xF0\x9F\x90\xB1";
|
||||
|
||||
$cases = array(
|
||||
'' => '',
|
||||
'x' => 'x',
|
||||
|
||||
// Tabs inside HTML tags should not be replaced.
|
||||
"<\t>x" => "<\t>x",
|
||||
|
||||
// Normal tabs should be replaced. These are all aligned to the tab
|
||||
// width, so they'll be replaced inline.
|
||||
"\tx" => "{$tab2}x",
|
||||
" \tx" => " {$tab2}x",
|
||||
"\t x" => "{$tab2} x",
|
||||
"aa\tx" => "aa{$tab2}x",
|
||||
"aa \tx" => "aa {$tab2}x",
|
||||
"aa\t x" => "aa{$tab2} x",
|
||||
|
||||
// This tab is not tabstop-aligned, so it is replaced with fewer
|
||||
// spaces to bring us to the next tabstop.
|
||||
" \tx" => " {$tab1}x",
|
||||
|
||||
// Text inside HTML tags should not count when aligning tabs with
|
||||
// tabstops.
|
||||
"<tag> </tag>\tx" => "<tag> </tag>{$tab1}x",
|
||||
"<tag2> </tag>\tx" => "<tag2> </tag>{$tab1}x",
|
||||
|
||||
// The code has to take a slow path when inputs contain unicode, but
|
||||
// should produce the right results and align tabs to tabstops while
|
||||
// respecting UTF8 display character widths, not byte widths.
|
||||
"{$cat}\tx" => "{$cat}{$tab1}x",
|
||||
"{$cat}{$cat}\tx" => "{$cat}{$cat}{$tab2}x",
|
||||
);
|
||||
|
||||
foreach ($cases as $input => $expect) {
|
||||
$actual = DifferentialChangesetParser::replaceTabsWithSpaces(
|
||||
$input,
|
||||
2);
|
||||
|
||||
$this->assertEqual(
|
||||
$expect,
|
||||
$actual,
|
||||
pht('Tabs to Spaces: %s', $input));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in a new issue