1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2024-11-26 00:32:42 +01:00

In prose diffs, use hash-and-diff for coarse "level 0" diffing to scale better

Summary: Depends on D20838. Fixes T13414. Instead of doing coarse diffing with "PhutilEditDistanceMatrix", use hash-and-diff with "DocumentEngine".

Test Plan:
  - On a large document (~3K top level blocks), saw a more sensible diff, instead of the whole thing falling back to "everything changed" mode.
  - On a small document, still saw a sensible granular diff.

{F6888249}

Maniphest Tasks: T13414

Differential Revision: https://secure.phabricator.com/D20839
This commit is contained in:
epriestley 2019-09-25 14:48:34 -07:00
parent 9d884f144f
commit 884cd74cc4
4 changed files with 130 additions and 48 deletions

View file

@ -12402,7 +12402,7 @@ phutil_register_library_map(array(
'PhutilPHPCodeSnippetContextFreeGrammar' => 'PhutilCLikeCodeSnippetContextFreeGrammar', 'PhutilPHPCodeSnippetContextFreeGrammar' => 'PhutilCLikeCodeSnippetContextFreeGrammar',
'PhutilPhabricatorAuthAdapter' => 'PhutilOAuthAuthAdapter', 'PhutilPhabricatorAuthAdapter' => 'PhutilOAuthAuthAdapter',
'PhutilProseDiff' => 'Phobject', 'PhutilProseDiff' => 'Phobject',
'PhutilProseDiffTestCase' => 'PhutilTestCase', 'PhutilProseDiffTestCase' => 'PhabricatorTestCase',
'PhutilProseDifferenceEngine' => 'Phobject', 'PhutilProseDifferenceEngine' => 'Phobject',
'PhutilQueryString' => 'Phobject', 'PhutilQueryString' => 'Phobject',
'PhutilRealNameContextFreeGrammar' => 'PhutilContextFreeGrammar', 'PhutilRealNameContextFreeGrammar' => 'PhutilContextFreeGrammar',

View file

@ -50,20 +50,35 @@ final class PhabricatorDocumentEngineBlocks
if ($old_line) { if ($old_line) {
$old_hash = rtrim($old_line['text'], "\n"); $old_hash = rtrim($old_line['text'], "\n");
$old_block = array_shift($old_map[$old_hash]); if (!strlen($old_hash)) {
$old_block->setDifferenceType($old_line['type']); // This can happen when one of the sources has no blocks.
$old_block = null;
} else {
$old_block = array_shift($old_map[$old_hash]);
$old_block->setDifferenceType($old_line['type']);
}
} else { } else {
$old_block = null; $old_block = null;
} }
if ($new_line) { if ($new_line) {
$new_hash = rtrim($new_line['text'], "\n"); $new_hash = rtrim($new_line['text'], "\n");
$new_block = array_shift($new_map[$new_hash]); if (!strlen($new_hash)) {
$new_block->setDifferenceType($new_line['type']); $new_block = null;
} else {
$new_block = array_shift($new_map[$new_hash]);
$new_block->setDifferenceType($new_line['type']);
}
} else { } else {
$new_block = null; $new_block = null;
} }
// If both lists are empty, we may generate a row which has two empty
// blocks.
if (!$old_block && !$new_block) {
continue;
}
$rows[] = array( $rows[] = array(
$old_block, $old_block,
$new_block, $new_block,

View file

@ -10,47 +10,14 @@ final class PhutilProseDifferenceEngine extends Phobject {
$u_parts = $this->splitCorpus($u, $level); $u_parts = $this->splitCorpus($u, $level);
$v_parts = $this->splitCorpus($v, $level); $v_parts = $this->splitCorpus($v, $level);
$matrix = id(new PhutilEditDistanceMatrix()) if ($level === 0) {
->setMaximumLength(128) $diff = $this->newHashDiff($u_parts, $v_parts);
->setSequences($u_parts, $v_parts) $too_large = false;
->setComputeString(true); } else {
list($diff, $too_large) = $this->newEditDistanceMatrixDiff(
// For word-level and character-level changes, smooth the output string $u_parts,
// to reduce the choppiness of the diff. $v_parts,
if ($level > 1) { $level);
$matrix->setApplySmoothing(PhutilEditDistanceMatrix::SMOOTHING_FULL);
}
$u_pos = 0;
$v_pos = 0;
$edits = $matrix->getEditString();
$edits_length = strlen($edits);
$diff = new PhutilProseDiff();
for ($ii = 0; $ii < $edits_length; $ii++) {
$c = $edits[$ii];
if ($c == 's') {
$diff->addPart('=', $u_parts[$u_pos]);
$u_pos++;
$v_pos++;
} else if ($c == 'd') {
$diff->addPart('-', $u_parts[$u_pos]);
$u_pos++;
} else if ($c == 'i') {
$diff->addPart('+', $v_parts[$v_pos]);
$v_pos++;
} else if ($c == 'x') {
$diff->addPart('-', $u_parts[$u_pos]);
$diff->addPart('+', $v_parts[$v_pos]);
$u_pos++;
$v_pos++;
} else {
throw new Exception(
pht(
'Unexpected character ("%s") in edit string.',
$c));
}
} }
$diff->reorderParts(); $diff->reorderParts();
@ -119,7 +86,7 @@ final class PhutilProseDifferenceEngine extends Phobject {
} else if (!strlen($new)) { } else if (!strlen($new)) {
$result->addPart('-', $old); $result->addPart('-', $old);
} else { } else {
if ($matrix->didReachMaximumLength()) { if ($too_large) {
// If this text was too big to diff, don't try to subdivide it. // If this text was too big to diff, don't try to subdivide it.
$result->addPart('-', $old); $result->addPart('-', $old);
$result->addPart('+', $new); $result->addPart('+', $new);
@ -206,4 +173,103 @@ final class PhutilProseDifferenceEngine extends Phobject {
return $results; return $results;
} }
private function newEditDistanceMatrixDiff(
array $u_parts,
array $v_parts,
$level) {
$matrix = id(new PhutilEditDistanceMatrix())
->setMaximumLength(128)
->setSequences($u_parts, $v_parts)
->setComputeString(true);
// For word-level and character-level changes, smooth the output string
// to reduce the choppiness of the diff.
if ($level > 1) {
$matrix->setApplySmoothing(PhutilEditDistanceMatrix::SMOOTHING_FULL);
}
$u_pos = 0;
$v_pos = 0;
$edits = $matrix->getEditString();
$edits_length = strlen($edits);
$diff = new PhutilProseDiff();
for ($ii = 0; $ii < $edits_length; $ii++) {
$c = $edits[$ii];
if ($c == 's') {
$diff->addPart('=', $u_parts[$u_pos]);
$u_pos++;
$v_pos++;
} else if ($c == 'd') {
$diff->addPart('-', $u_parts[$u_pos]);
$u_pos++;
} else if ($c == 'i') {
$diff->addPart('+', $v_parts[$v_pos]);
$v_pos++;
} else if ($c == 'x') {
$diff->addPart('-', $u_parts[$u_pos]);
$diff->addPart('+', $v_parts[$v_pos]);
$u_pos++;
$v_pos++;
} else {
throw new Exception(
pht(
'Unexpected character ("%s") in edit string.',
$c));
}
}
return array($diff, $matrix->didReachMaximumLength());
}
private function newHashDiff(array $u_parts, array $v_parts) {
$u_ref = new PhabricatorDocumentRef();
$v_ref = new PhabricatorDocumentRef();
$u_blocks = $this->newDocumentEngineBlocks($u_parts);
$v_blocks = $this->newDocumentEngineBlocks($v_parts);
$rows = id(new PhabricatorDocumentEngineBlocks())
->addBlockList($u_ref, $u_blocks)
->addBlockList($v_ref, $v_blocks)
->newTwoUpLayout();
$diff = new PhutilProseDiff();
foreach ($rows as $row) {
list($u_block, $v_block) = $row;
if ($u_block && $v_block) {
if ($u_block->getDifferenceType() === '-') {
$diff->addPart('-', $u_block->getContent());
$diff->addPart('+', $v_block->getContent());
} else {
$diff->addPart('=', $u_block->getContent());
}
} else if ($u_block) {
$diff->addPart('-', $u_block->getContent());
} else {
$diff->addPart('+', $v_block->getContent());
}
}
return $diff;
}
private function newDocumentEngineBlocks(array $parts) {
$blocks = array();
foreach ($parts as $part) {
$hash = PhabricatorHash::digestForIndex($part);
$blocks[] = id(new PhabricatorDocumentEngineBlock())
->setContent($part)
->setDifferenceHash($hash);
}
return $blocks;
}
} }

View file

@ -1,6 +1,7 @@
<?php <?php
final class PhutilProseDiffTestCase extends PhutilTestCase { final class PhutilProseDiffTestCase
extends PhabricatorTestCase {
public function testProseDiffsDistance() { public function testProseDiffsDistance() {
$this->assertProseParts( $this->assertProseParts(