mirror of
https://we.phorge.it/source/phorge.git
synced 2024-11-22 06:42:42 +01:00
In prose diffs, use hash-and-diff for coarse "level 0" diffing to scale better
Summary: Depends on D20838. Fixes T13414. Instead of doing coarse diffing with "PhutilEditDistanceMatrix", use hash-and-diff with "DocumentEngine". Test Plan: - On a large document (~3K top level blocks), saw a more sensible diff, instead of the whole thing falling back to "everything changed" mode. - On a small document, still saw a sensible granular diff. {F6888249} Maniphest Tasks: T13414 Differential Revision: https://secure.phabricator.com/D20839
This commit is contained in:
parent
9d884f144f
commit
884cd74cc4
4 changed files with 130 additions and 48 deletions
|
@ -12402,7 +12402,7 @@ phutil_register_library_map(array(
|
|||
'PhutilPHPCodeSnippetContextFreeGrammar' => 'PhutilCLikeCodeSnippetContextFreeGrammar',
|
||||
'PhutilPhabricatorAuthAdapter' => 'PhutilOAuthAuthAdapter',
|
||||
'PhutilProseDiff' => 'Phobject',
|
||||
'PhutilProseDiffTestCase' => 'PhutilTestCase',
|
||||
'PhutilProseDiffTestCase' => 'PhabricatorTestCase',
|
||||
'PhutilProseDifferenceEngine' => 'Phobject',
|
||||
'PhutilQueryString' => 'Phobject',
|
||||
'PhutilRealNameContextFreeGrammar' => 'PhutilContextFreeGrammar',
|
||||
|
|
|
@ -50,20 +50,35 @@ final class PhabricatorDocumentEngineBlocks
|
|||
|
||||
if ($old_line) {
|
||||
$old_hash = rtrim($old_line['text'], "\n");
|
||||
$old_block = array_shift($old_map[$old_hash]);
|
||||
$old_block->setDifferenceType($old_line['type']);
|
||||
if (!strlen($old_hash)) {
|
||||
// This can happen when one of the sources has no blocks.
|
||||
$old_block = null;
|
||||
} else {
|
||||
$old_block = array_shift($old_map[$old_hash]);
|
||||
$old_block->setDifferenceType($old_line['type']);
|
||||
}
|
||||
} else {
|
||||
$old_block = null;
|
||||
}
|
||||
|
||||
if ($new_line) {
|
||||
$new_hash = rtrim($new_line['text'], "\n");
|
||||
$new_block = array_shift($new_map[$new_hash]);
|
||||
$new_block->setDifferenceType($new_line['type']);
|
||||
if (!strlen($new_hash)) {
|
||||
$new_block = null;
|
||||
} else {
|
||||
$new_block = array_shift($new_map[$new_hash]);
|
||||
$new_block->setDifferenceType($new_line['type']);
|
||||
}
|
||||
} else {
|
||||
$new_block = null;
|
||||
}
|
||||
|
||||
// If both lists are empty, we may generate a row which has two empty
|
||||
// blocks.
|
||||
if (!$old_block && !$new_block) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$rows[] = array(
|
||||
$old_block,
|
||||
$new_block,
|
||||
|
|
|
@ -10,47 +10,14 @@ final class PhutilProseDifferenceEngine extends Phobject {
|
|||
$u_parts = $this->splitCorpus($u, $level);
|
||||
$v_parts = $this->splitCorpus($v, $level);
|
||||
|
||||
$matrix = id(new PhutilEditDistanceMatrix())
|
||||
->setMaximumLength(128)
|
||||
->setSequences($u_parts, $v_parts)
|
||||
->setComputeString(true);
|
||||
|
||||
// For word-level and character-level changes, smooth the output string
|
||||
// to reduce the choppiness of the diff.
|
||||
if ($level > 1) {
|
||||
$matrix->setApplySmoothing(PhutilEditDistanceMatrix::SMOOTHING_FULL);
|
||||
}
|
||||
|
||||
$u_pos = 0;
|
||||
$v_pos = 0;
|
||||
|
||||
$edits = $matrix->getEditString();
|
||||
$edits_length = strlen($edits);
|
||||
|
||||
$diff = new PhutilProseDiff();
|
||||
for ($ii = 0; $ii < $edits_length; $ii++) {
|
||||
$c = $edits[$ii];
|
||||
if ($c == 's') {
|
||||
$diff->addPart('=', $u_parts[$u_pos]);
|
||||
$u_pos++;
|
||||
$v_pos++;
|
||||
} else if ($c == 'd') {
|
||||
$diff->addPart('-', $u_parts[$u_pos]);
|
||||
$u_pos++;
|
||||
} else if ($c == 'i') {
|
||||
$diff->addPart('+', $v_parts[$v_pos]);
|
||||
$v_pos++;
|
||||
} else if ($c == 'x') {
|
||||
$diff->addPart('-', $u_parts[$u_pos]);
|
||||
$diff->addPart('+', $v_parts[$v_pos]);
|
||||
$u_pos++;
|
||||
$v_pos++;
|
||||
} else {
|
||||
throw new Exception(
|
||||
pht(
|
||||
'Unexpected character ("%s") in edit string.',
|
||||
$c));
|
||||
}
|
||||
if ($level === 0) {
|
||||
$diff = $this->newHashDiff($u_parts, $v_parts);
|
||||
$too_large = false;
|
||||
} else {
|
||||
list($diff, $too_large) = $this->newEditDistanceMatrixDiff(
|
||||
$u_parts,
|
||||
$v_parts,
|
||||
$level);
|
||||
}
|
||||
|
||||
$diff->reorderParts();
|
||||
|
@ -119,7 +86,7 @@ final class PhutilProseDifferenceEngine extends Phobject {
|
|||
} else if (!strlen($new)) {
|
||||
$result->addPart('-', $old);
|
||||
} else {
|
||||
if ($matrix->didReachMaximumLength()) {
|
||||
if ($too_large) {
|
||||
// If this text was too big to diff, don't try to subdivide it.
|
||||
$result->addPart('-', $old);
|
||||
$result->addPart('+', $new);
|
||||
|
@ -206,4 +173,103 @@ final class PhutilProseDifferenceEngine extends Phobject {
|
|||
return $results;
|
||||
}
|
||||
|
||||
private function newEditDistanceMatrixDiff(
|
||||
array $u_parts,
|
||||
array $v_parts,
|
||||
$level) {
|
||||
|
||||
$matrix = id(new PhutilEditDistanceMatrix())
|
||||
->setMaximumLength(128)
|
||||
->setSequences($u_parts, $v_parts)
|
||||
->setComputeString(true);
|
||||
|
||||
// For word-level and character-level changes, smooth the output string
|
||||
// to reduce the choppiness of the diff.
|
||||
if ($level > 1) {
|
||||
$matrix->setApplySmoothing(PhutilEditDistanceMatrix::SMOOTHING_FULL);
|
||||
}
|
||||
|
||||
$u_pos = 0;
|
||||
$v_pos = 0;
|
||||
|
||||
$edits = $matrix->getEditString();
|
||||
$edits_length = strlen($edits);
|
||||
|
||||
$diff = new PhutilProseDiff();
|
||||
for ($ii = 0; $ii < $edits_length; $ii++) {
|
||||
$c = $edits[$ii];
|
||||
if ($c == 's') {
|
||||
$diff->addPart('=', $u_parts[$u_pos]);
|
||||
$u_pos++;
|
||||
$v_pos++;
|
||||
} else if ($c == 'd') {
|
||||
$diff->addPart('-', $u_parts[$u_pos]);
|
||||
$u_pos++;
|
||||
} else if ($c == 'i') {
|
||||
$diff->addPart('+', $v_parts[$v_pos]);
|
||||
$v_pos++;
|
||||
} else if ($c == 'x') {
|
||||
$diff->addPart('-', $u_parts[$u_pos]);
|
||||
$diff->addPart('+', $v_parts[$v_pos]);
|
||||
$u_pos++;
|
||||
$v_pos++;
|
||||
} else {
|
||||
throw new Exception(
|
||||
pht(
|
||||
'Unexpected character ("%s") in edit string.',
|
||||
$c));
|
||||
}
|
||||
}
|
||||
|
||||
return array($diff, $matrix->didReachMaximumLength());
|
||||
}
|
||||
|
||||
private function newHashDiff(array $u_parts, array $v_parts) {
|
||||
|
||||
$u_ref = new PhabricatorDocumentRef();
|
||||
$v_ref = new PhabricatorDocumentRef();
|
||||
|
||||
$u_blocks = $this->newDocumentEngineBlocks($u_parts);
|
||||
$v_blocks = $this->newDocumentEngineBlocks($v_parts);
|
||||
|
||||
$rows = id(new PhabricatorDocumentEngineBlocks())
|
||||
->addBlockList($u_ref, $u_blocks)
|
||||
->addBlockList($v_ref, $v_blocks)
|
||||
->newTwoUpLayout();
|
||||
|
||||
$diff = new PhutilProseDiff();
|
||||
foreach ($rows as $row) {
|
||||
list($u_block, $v_block) = $row;
|
||||
|
||||
if ($u_block && $v_block) {
|
||||
if ($u_block->getDifferenceType() === '-') {
|
||||
$diff->addPart('-', $u_block->getContent());
|
||||
$diff->addPart('+', $v_block->getContent());
|
||||
} else {
|
||||
$diff->addPart('=', $u_block->getContent());
|
||||
}
|
||||
} else if ($u_block) {
|
||||
$diff->addPart('-', $u_block->getContent());
|
||||
} else {
|
||||
$diff->addPart('+', $v_block->getContent());
|
||||
}
|
||||
}
|
||||
|
||||
return $diff;
|
||||
}
|
||||
|
||||
private function newDocumentEngineBlocks(array $parts) {
|
||||
$blocks = array();
|
||||
|
||||
foreach ($parts as $part) {
|
||||
$hash = PhabricatorHash::digestForIndex($part);
|
||||
|
||||
$blocks[] = id(new PhabricatorDocumentEngineBlock())
|
||||
->setContent($part)
|
||||
->setDifferenceHash($hash);
|
||||
}
|
||||
|
||||
return $blocks;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
<?php
|
||||
|
||||
final class PhutilProseDiffTestCase extends PhutilTestCase {
|
||||
final class PhutilProseDiffTestCase
|
||||
extends PhabricatorTestCase {
|
||||
|
||||
public function testProseDiffsDistance() {
|
||||
$this->assertProseParts(
|
||||
|
|
Loading…
Reference in a new issue