mirror of
https://we.phorge.it/source/phorge.git
synced 2024-11-22 23:02:42 +01:00
In prose diffs, use hash-and-diff for coarse "level 0" diffing to scale better
Summary: Depends on D20838. Fixes T13414. Instead of doing coarse diffing with "PhutilEditDistanceMatrix", use hash-and-diff with "DocumentEngine".

Test Plan:
- On a large document (~3K top-level blocks), saw a more sensible diff, instead of the whole thing falling back to "everything changed" mode.
- On a small document, still saw a sensible granular diff. {F6888249}

Maniphest Tasks: T13414

Differential Revision: https://secure.phabricator.com/D20839
This commit is contained in:
parent
9d884f144f
commit
884cd74cc4
4 changed files with 130 additions and 48 deletions
|
@ -12402,7 +12402,7 @@ phutil_register_library_map(array(
|
||||||
'PhutilPHPCodeSnippetContextFreeGrammar' => 'PhutilCLikeCodeSnippetContextFreeGrammar',
|
'PhutilPHPCodeSnippetContextFreeGrammar' => 'PhutilCLikeCodeSnippetContextFreeGrammar',
|
||||||
'PhutilPhabricatorAuthAdapter' => 'PhutilOAuthAuthAdapter',
|
'PhutilPhabricatorAuthAdapter' => 'PhutilOAuthAuthAdapter',
|
||||||
'PhutilProseDiff' => 'Phobject',
|
'PhutilProseDiff' => 'Phobject',
|
||||||
'PhutilProseDiffTestCase' => 'PhutilTestCase',
|
'PhutilProseDiffTestCase' => 'PhabricatorTestCase',
|
||||||
'PhutilProseDifferenceEngine' => 'Phobject',
|
'PhutilProseDifferenceEngine' => 'Phobject',
|
||||||
'PhutilQueryString' => 'Phobject',
|
'PhutilQueryString' => 'Phobject',
|
||||||
'PhutilRealNameContextFreeGrammar' => 'PhutilContextFreeGrammar',
|
'PhutilRealNameContextFreeGrammar' => 'PhutilContextFreeGrammar',
|
||||||
|
|
|
@ -50,20 +50,35 @@ final class PhabricatorDocumentEngineBlocks
|
||||||
|
|
||||||
if ($old_line) {
|
if ($old_line) {
|
||||||
$old_hash = rtrim($old_line['text'], "\n");
|
$old_hash = rtrim($old_line['text'], "\n");
|
||||||
|
if (!strlen($old_hash)) {
|
||||||
|
// This can happen when one of the sources has no blocks.
|
||||||
|
$old_block = null;
|
||||||
|
} else {
|
||||||
$old_block = array_shift($old_map[$old_hash]);
|
$old_block = array_shift($old_map[$old_hash]);
|
||||||
$old_block->setDifferenceType($old_line['type']);
|
$old_block->setDifferenceType($old_line['type']);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
$old_block = null;
|
$old_block = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($new_line) {
|
if ($new_line) {
|
||||||
$new_hash = rtrim($new_line['text'], "\n");
|
$new_hash = rtrim($new_line['text'], "\n");
|
||||||
|
if (!strlen($new_hash)) {
|
||||||
|
$new_block = null;
|
||||||
|
} else {
|
||||||
$new_block = array_shift($new_map[$new_hash]);
|
$new_block = array_shift($new_map[$new_hash]);
|
||||||
$new_block->setDifferenceType($new_line['type']);
|
$new_block->setDifferenceType($new_line['type']);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
$new_block = null;
|
$new_block = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If both lists are empty, we may generate a row which has two empty
|
||||||
|
// blocks.
|
||||||
|
if (!$old_block && !$new_block) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
$rows[] = array(
|
$rows[] = array(
|
||||||
$old_block,
|
$old_block,
|
||||||
$new_block,
|
$new_block,
|
||||||
|
|
|
@ -10,47 +10,14 @@ final class PhutilProseDifferenceEngine extends Phobject {
|
||||||
$u_parts = $this->splitCorpus($u, $level);
|
$u_parts = $this->splitCorpus($u, $level);
|
||||||
$v_parts = $this->splitCorpus($v, $level);
|
$v_parts = $this->splitCorpus($v, $level);
|
||||||
|
|
||||||
$matrix = id(new PhutilEditDistanceMatrix())
|
if ($level === 0) {
|
||||||
->setMaximumLength(128)
|
$diff = $this->newHashDiff($u_parts, $v_parts);
|
||||||
->setSequences($u_parts, $v_parts)
|
$too_large = false;
|
||||||
->setComputeString(true);
|
|
||||||
|
|
||||||
// For word-level and character-level changes, smooth the output string
|
|
||||||
// to reduce the choppiness of the diff.
|
|
||||||
if ($level > 1) {
|
|
||||||
$matrix->setApplySmoothing(PhutilEditDistanceMatrix::SMOOTHING_FULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
$u_pos = 0;
|
|
||||||
$v_pos = 0;
|
|
||||||
|
|
||||||
$edits = $matrix->getEditString();
|
|
||||||
$edits_length = strlen($edits);
|
|
||||||
|
|
||||||
$diff = new PhutilProseDiff();
|
|
||||||
for ($ii = 0; $ii < $edits_length; $ii++) {
|
|
||||||
$c = $edits[$ii];
|
|
||||||
if ($c == 's') {
|
|
||||||
$diff->addPart('=', $u_parts[$u_pos]);
|
|
||||||
$u_pos++;
|
|
||||||
$v_pos++;
|
|
||||||
} else if ($c == 'd') {
|
|
||||||
$diff->addPart('-', $u_parts[$u_pos]);
|
|
||||||
$u_pos++;
|
|
||||||
} else if ($c == 'i') {
|
|
||||||
$diff->addPart('+', $v_parts[$v_pos]);
|
|
||||||
$v_pos++;
|
|
||||||
} else if ($c == 'x') {
|
|
||||||
$diff->addPart('-', $u_parts[$u_pos]);
|
|
||||||
$diff->addPart('+', $v_parts[$v_pos]);
|
|
||||||
$u_pos++;
|
|
||||||
$v_pos++;
|
|
||||||
} else {
|
} else {
|
||||||
throw new Exception(
|
list($diff, $too_large) = $this->newEditDistanceMatrixDiff(
|
||||||
pht(
|
$u_parts,
|
||||||
'Unexpected character ("%s") in edit string.',
|
$v_parts,
|
||||||
$c));
|
$level);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
$diff->reorderParts();
|
$diff->reorderParts();
|
||||||
|
@ -119,7 +86,7 @@ final class PhutilProseDifferenceEngine extends Phobject {
|
||||||
} else if (!strlen($new)) {
|
} else if (!strlen($new)) {
|
||||||
$result->addPart('-', $old);
|
$result->addPart('-', $old);
|
||||||
} else {
|
} else {
|
||||||
if ($matrix->didReachMaximumLength()) {
|
if ($too_large) {
|
||||||
// If this text was too big to diff, don't try to subdivide it.
|
// If this text was too big to diff, don't try to subdivide it.
|
||||||
$result->addPart('-', $old);
|
$result->addPart('-', $old);
|
||||||
$result->addPart('+', $new);
|
$result->addPart('+', $new);
|
||||||
|
@ -206,4 +173,103 @@ final class PhutilProseDifferenceEngine extends Phobject {
|
||||||
return $results;
|
return $results;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private function newEditDistanceMatrixDiff(
|
||||||
|
array $u_parts,
|
||||||
|
array $v_parts,
|
||||||
|
$level) {
|
||||||
|
|
||||||
|
$matrix = id(new PhutilEditDistanceMatrix())
|
||||||
|
->setMaximumLength(128)
|
||||||
|
->setSequences($u_parts, $v_parts)
|
||||||
|
->setComputeString(true);
|
||||||
|
|
||||||
|
// For word-level and character-level changes, smooth the output string
|
||||||
|
// to reduce the choppiness of the diff.
|
||||||
|
if ($level > 1) {
|
||||||
|
$matrix->setApplySmoothing(PhutilEditDistanceMatrix::SMOOTHING_FULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
$u_pos = 0;
|
||||||
|
$v_pos = 0;
|
||||||
|
|
||||||
|
$edits = $matrix->getEditString();
|
||||||
|
$edits_length = strlen($edits);
|
||||||
|
|
||||||
|
$diff = new PhutilProseDiff();
|
||||||
|
for ($ii = 0; $ii < $edits_length; $ii++) {
|
||||||
|
$c = $edits[$ii];
|
||||||
|
if ($c == 's') {
|
||||||
|
$diff->addPart('=', $u_parts[$u_pos]);
|
||||||
|
$u_pos++;
|
||||||
|
$v_pos++;
|
||||||
|
} else if ($c == 'd') {
|
||||||
|
$diff->addPart('-', $u_parts[$u_pos]);
|
||||||
|
$u_pos++;
|
||||||
|
} else if ($c == 'i') {
|
||||||
|
$diff->addPart('+', $v_parts[$v_pos]);
|
||||||
|
$v_pos++;
|
||||||
|
} else if ($c == 'x') {
|
||||||
|
$diff->addPart('-', $u_parts[$u_pos]);
|
||||||
|
$diff->addPart('+', $v_parts[$v_pos]);
|
||||||
|
$u_pos++;
|
||||||
|
$v_pos++;
|
||||||
|
} else {
|
||||||
|
throw new Exception(
|
||||||
|
pht(
|
||||||
|
'Unexpected character ("%s") in edit string.',
|
||||||
|
$c));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return array($diff, $matrix->didReachMaximumLength());
|
||||||
|
}
|
||||||
|
|
||||||
|
private function newHashDiff(array $u_parts, array $v_parts) {
|
||||||
|
|
||||||
|
$u_ref = new PhabricatorDocumentRef();
|
||||||
|
$v_ref = new PhabricatorDocumentRef();
|
||||||
|
|
||||||
|
$u_blocks = $this->newDocumentEngineBlocks($u_parts);
|
||||||
|
$v_blocks = $this->newDocumentEngineBlocks($v_parts);
|
||||||
|
|
||||||
|
$rows = id(new PhabricatorDocumentEngineBlocks())
|
||||||
|
->addBlockList($u_ref, $u_blocks)
|
||||||
|
->addBlockList($v_ref, $v_blocks)
|
||||||
|
->newTwoUpLayout();
|
||||||
|
|
||||||
|
$diff = new PhutilProseDiff();
|
||||||
|
foreach ($rows as $row) {
|
||||||
|
list($u_block, $v_block) = $row;
|
||||||
|
|
||||||
|
if ($u_block && $v_block) {
|
||||||
|
if ($u_block->getDifferenceType() === '-') {
|
||||||
|
$diff->addPart('-', $u_block->getContent());
|
||||||
|
$diff->addPart('+', $v_block->getContent());
|
||||||
|
} else {
|
||||||
|
$diff->addPart('=', $u_block->getContent());
|
||||||
|
}
|
||||||
|
} else if ($u_block) {
|
||||||
|
$diff->addPart('-', $u_block->getContent());
|
||||||
|
} else {
|
||||||
|
$diff->addPart('+', $v_block->getContent());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return $diff;
|
||||||
|
}
|
||||||
|
|
||||||
|
private function newDocumentEngineBlocks(array $parts) {
|
||||||
|
$blocks = array();
|
||||||
|
|
||||||
|
foreach ($parts as $part) {
|
||||||
|
$hash = PhabricatorHash::digestForIndex($part);
|
||||||
|
|
||||||
|
$blocks[] = id(new PhabricatorDocumentEngineBlock())
|
||||||
|
->setContent($part)
|
||||||
|
->setDifferenceHash($hash);
|
||||||
|
}
|
||||||
|
|
||||||
|
return $blocks;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
<?php
|
<?php
|
||||||
|
|
||||||
final class PhutilProseDiffTestCase extends PhutilTestCase {
|
final class PhutilProseDiffTestCase
|
||||||
|
extends PhabricatorTestCase {
|
||||||
|
|
||||||
public function testProseDiffsDistance() {
|
public function testProseDiffsDistance() {
|
||||||
$this->assertProseParts(
|
$this->assertProseParts(
|
||||||
|
|
Loading…
Reference in a new issue