mirror of
https://we.phorge.it/source/arcanist.git
synced 2024-11-10 08:52:39 +01:00
Compute UTF8 string differences correctly, accounting for combining characters
Summary: @Afaque_Hussain has done a bunch of utf8 work here; combined with PhutilEditDistanceMatrix we can now do utf8 diffs correctly, in a general way, without a significant performance impact. Use PhutilEditDistanceMatrix and `phutil_utf8v_combined()` to compute accurate diffs for all (or, at least, most) UTF8 text. The only thing this doesn't handle completely correctly is lines beginning with combining characters. This is messy/expensive to handle and will probably never actually happen, so I'm punting for now. Nothing should actually break. The utf8 stuff will be slow, but we only pay for it when we need it. Test Plan: Ran unit tests. I changed a few unit tests to use a non-combining character (snowman) for clarity, and some results are different now (since we get combining characters right). {F44064} Reviewers: btrahan, Afaque_Hussain Reviewed By: btrahan CC: aran Maniphest Tasks: T2379 Differential Revision: https://secure.phabricator.com/D6019
This commit is contained in:
parent
24d54a5fbd
commit
1fcf7bac4e
2 changed files with 163 additions and 184 deletions
|
@ -48,69 +48,17 @@ final class ArcanistDiffUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static function generateIntralineDiff($o, $n) {
|
public static function generateIntralineDiff($o, $n) {
|
||||||
if (!strlen($o) || !strlen($n)) {
|
$ol = strlen($o);
|
||||||
|
$nl = strlen($n);
|
||||||
|
|
||||||
|
if (($o === $n) || !$ol || !$nl) {
|
||||||
return array(
|
return array(
|
||||||
array(array(0, strlen($o))),
|
array(array(0, $ol)),
|
||||||
array(array(0, strlen($n)))
|
array(array(0, $nl))
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// This algorithm is byte-oriented and thus not safe for UTF-8, so just
|
return self::computeIntralineEdits($o, $n);
|
||||||
// mark all the text as changed if either string has multibyte characters
|
|
||||||
// in it. TODO: Fix this so that this algorithm is UTF-8 aware.
|
|
||||||
if (preg_match('/[\x80-\xFF]/', $o.$n)) {
|
|
||||||
return self::generateUTF8IntralineDiff($o, $n);
|
|
||||||
}
|
|
||||||
|
|
||||||
$result = self::buildLevenshteinDifferenceString($o, $n);
|
|
||||||
|
|
||||||
do {
|
|
||||||
$orig = $result;
|
|
||||||
$result = preg_replace(
|
|
||||||
'/([xdi])(s{3})([xdi])/',
|
|
||||||
'$1xxx$3',
|
|
||||||
$result);
|
|
||||||
$result = preg_replace(
|
|
||||||
'/([xdi])(s{2})([xdi])/',
|
|
||||||
'$1xx$3',
|
|
||||||
$result);
|
|
||||||
$result = preg_replace(
|
|
||||||
'/([xdi])(s{1})([xdi])/',
|
|
||||||
'$1x$3',
|
|
||||||
$result);
|
|
||||||
} while ($result != $orig);
|
|
||||||
|
|
||||||
$o_bright = array();
|
|
||||||
$n_bright = array();
|
|
||||||
$rlen = strlen($result);
|
|
||||||
$len = -1;
|
|
||||||
$cur = $result[0];
|
|
||||||
$result .= '-';
|
|
||||||
for ($ii = 0; $ii < strlen($result); $ii++) {
|
|
||||||
$len++;
|
|
||||||
$now = $result[$ii];
|
|
||||||
if ($result[$ii] == $cur) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if ($cur == 's') {
|
|
||||||
$o_bright[] = array(0, $len);
|
|
||||||
$n_bright[] = array(0, $len);
|
|
||||||
} else if ($cur == 'd') {
|
|
||||||
$o_bright[] = array(1, $len);
|
|
||||||
} else if ($cur == 'i') {
|
|
||||||
$n_bright[] = array(1, $len);
|
|
||||||
} else if ($cur == 'x') {
|
|
||||||
$o_bright[] = array(1, $len);
|
|
||||||
$n_bright[] = array(1, $len);
|
|
||||||
}
|
|
||||||
$cur = $now;
|
|
||||||
$len = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
$o_bright = self::collapseIntralineRuns($o_bright);
|
|
||||||
$n_bright = self::collapseIntralineRuns($n_bright);
|
|
||||||
|
|
||||||
return array($o_bright, $n_bright);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static function applyIntralineDiff($str, $intra_stack) {
|
public static function applyIntralineDiff($str, $intra_stack) {
|
||||||
|
@ -198,108 +146,111 @@ final class ArcanistDiffUtils {
|
||||||
return array_values($runs);
|
return array_values($runs);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static function buildLevenshteinDifferenceString($o, $n) {
|
public static function generateEditString(array $ov, array $nv, $max = 80) {
|
||||||
$olt = strlen($o);
|
|
||||||
$nlt = strlen($n);
|
|
||||||
|
|
||||||
if (!$olt) {
|
|
||||||
return str_repeat('i', $nlt);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!$nlt) {
|
|
||||||
return str_repeat('d', $olt);
|
|
||||||
}
|
|
||||||
|
|
||||||
if ($o === $n) {
|
|
||||||
return str_repeat('s', $olt);
|
|
||||||
}
|
|
||||||
|
|
||||||
$ov = str_split($o);
|
|
||||||
$nv = str_split($n);
|
|
||||||
|
|
||||||
return id(new PhutilEditDistanceMatrix())
|
return id(new PhutilEditDistanceMatrix())
|
||||||
->setComputeString(true)
|
->setComputeString(true)
|
||||||
->setAlterCost(0.001)
|
->setAlterCost(1 / ($max * 2))
|
||||||
->setReplaceCost(2)
|
->setReplaceCost(2)
|
||||||
->setMaximumLength(80)
|
->setMaximumLength($max)
|
||||||
->setSequences($ov, $nv)
|
->setSequences($ov, $nv)
|
||||||
->getEditString();
|
->getEditString();
|
||||||
}
|
}
|
||||||
|
|
||||||
public static function generateUTF8IntralineDiff($o, $n) {
|
public static function computeIntralineEdits($o, $n) {
|
||||||
if (!strlen($o) || !strlen($n)) {
|
if (preg_match('/[\x80-\xFF]/', $o.$n)) {
|
||||||
return array(
|
$ov = phutil_utf8v_combined($o);
|
||||||
array(array(0, strlen($o))),
|
$nv = phutil_utf8v_combined($n);
|
||||||
array(array(0, strlen($n)))
|
$multibyte = true;
|
||||||
);
|
} else {
|
||||||
|
$ov = str_split($o);
|
||||||
|
$nv = str_split($n);
|
||||||
|
$multibyte = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Breaking both the strings into their component characters
|
$result = self::generateEditString($ov, $nv);
|
||||||
$old_characters = phutil_utf8v($o);
|
|
||||||
$new_characters = phutil_utf8v($n);
|
|
||||||
|
|
||||||
$old_count = count($old_characters);
|
// Smooth the string out, by replacing short runs of similar characters
|
||||||
$new_count = count($new_characters);
|
// with 'x' operations. This makes the result more readable to humans, since
|
||||||
|
// there are fewer choppy runs of short added and removed substrings.
|
||||||
|
do {
|
||||||
|
$original = $result;
|
||||||
|
$result = preg_replace(
|
||||||
|
'/([xdi])(s{3})([xdi])/',
|
||||||
|
'$1xxx$3',
|
||||||
|
$result);
|
||||||
|
$result = preg_replace(
|
||||||
|
'/([xdi])(s{2})([xdi])/',
|
||||||
|
'$1xx$3',
|
||||||
|
$result);
|
||||||
|
$result = preg_replace(
|
||||||
|
'/([xdi])(s{1})([xdi])/',
|
||||||
|
'$1x$3',
|
||||||
|
$result);
|
||||||
|
} while ($result != $original);
|
||||||
|
|
||||||
$prefix_match_length = 0;
|
// Now we have a character-based description of the edit. We need to
|
||||||
$suffix_match_length = 0;
|
// convert into a byte-based description. Walk through the edit string and
|
||||||
|
// adjust each operation to reflect the number of bytes in the underlying
|
||||||
|
// character.
|
||||||
|
|
||||||
// Prefix matching.
|
$o_pos = 0;
|
||||||
for ($i = 0; $i < $old_count; $i++) {
|
$n_pos = 0;
|
||||||
if ($old_characters[$i] != $new_characters[$i]) {
|
$result_len = strlen($result);
|
||||||
$prefix_match_length = $i;
|
$o_run = array();
|
||||||
|
$n_run = array();
|
||||||
|
|
||||||
|
$old_char_len = 1;
|
||||||
|
$new_char_len = 1;
|
||||||
|
|
||||||
|
for ($ii = 0; $ii < $result_len; $ii++) {
|
||||||
|
$c = $result[$ii];
|
||||||
|
|
||||||
|
if ($multibyte) {
|
||||||
|
$old_char_len = isset($ov[$o_pos]) ? strlen($ov[$o_pos]) : 0;
|
||||||
|
$new_char_len = isset($nv[$n_pos]) ? strlen($nv[$n_pos]) : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
switch ($c) {
|
||||||
|
case 's':
|
||||||
|
case 'x':
|
||||||
|
$byte_o = $old_char_len;
|
||||||
|
$byte_n = $new_char_len;
|
||||||
|
$o_pos++;
|
||||||
|
$n_pos++;
|
||||||
break;
|
break;
|
||||||
}
|
case 'i':
|
||||||
}
|
$byte_o = 0;
|
||||||
|
$byte_n = $new_char_len;
|
||||||
// Return no change.
|
$n_pos++;
|
||||||
if ($old_count == $new_count && $i == $old_count) {
|
break;
|
||||||
return array(
|
case 'd':
|
||||||
array(array(0, strlen($o))),
|
$byte_o = $old_char_len;
|
||||||
array(array(0, strlen($n)))
|
$byte_n = 0;
|
||||||
);
|
$o_pos++;
|
||||||
}
|
|
||||||
|
|
||||||
// Suffix Matching.
|
|
||||||
$i = $old_count - 1;
|
|
||||||
$j = $new_count - 1;
|
|
||||||
|
|
||||||
while ($i >= 0 && $j >= 0) {
|
|
||||||
if ($old_characters[$i] != $new_characters[$j]) {
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
$i--;
|
if ($byte_o) {
|
||||||
$j--;
|
if ($c == 's') {
|
||||||
$suffix_match_length++;
|
$o_run[] = array(0, $byte_o);
|
||||||
|
} else {
|
||||||
|
$o_run[] = array(1, $byte_o);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Just a temporary fix for the edge cases where the strings differ
|
if ($byte_n) {
|
||||||
// only at the beginning, only at the end, or at both the beginning and end.
|
if ($c == 's') {
|
||||||
if (!$prefix_match_length || !$suffix_match_length) {
|
$n_run[] = array(0, $byte_n);
|
||||||
return array(
|
} else {
|
||||||
array(array(1, strlen($o))),
|
$n_run[] = array(1, $byte_n);
|
||||||
array(array(1, strlen($n)))
|
}
|
||||||
);
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
$old_length = strlen($o);
|
$o_run = self::collapseIntralineRuns($o_run);
|
||||||
$new_length = strlen($n);
|
$n_run = self::collapseIntralineRuns($n_run);
|
||||||
|
|
||||||
return array(
|
|
||||||
array(
|
|
||||||
array(0, $prefix_match_length),
|
|
||||||
array(1, $old_length - $prefix_match_length - $suffix_match_length),
|
|
||||||
array(0, $suffix_match_length),
|
|
||||||
),
|
|
||||||
array(
|
|
||||||
array(0, $prefix_match_length),
|
|
||||||
array(1, $new_length - $prefix_match_length - $suffix_match_length),
|
|
||||||
array(0, $suffix_match_length),
|
|
||||||
)
|
|
||||||
);
|
|
||||||
|
|
||||||
|
return array($o_run, $n_run);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -92,11 +92,28 @@ final class ArcanistDiffUtilsTestCase extends ArcanistTestCase {
|
||||||
foreach ($tests as $test) {
|
foreach ($tests as $test) {
|
||||||
$this->assertEqual(
|
$this->assertEqual(
|
||||||
$test[2],
|
$test[2],
|
||||||
ArcanistDiffUtils::buildLevenshteinDifferenceString(
|
ArcanistDiffUtils::generateEditString(
|
||||||
$test[0],
|
str_split($test[0]),
|
||||||
$test[1]),
|
str_split($test[1])),
|
||||||
"'{$test[0]}' vs '{$test[1]}'");
|
"'{$test[0]}' vs '{$test[1]}'");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$utf8_tests = array(
|
||||||
|
array(
|
||||||
|
"GrumpyCat",
|
||||||
|
"Grumpy\xE2\x98\x83at",
|
||||||
|
'ssssssxss',
|
||||||
|
),
|
||||||
|
);
|
||||||
|
|
||||||
|
foreach ($utf8_tests as $test) {
|
||||||
|
$this->assertEqual(
|
||||||
|
$test[2],
|
||||||
|
ArcanistDiffUtils::generateEditString(
|
||||||
|
phutil_utf8v_combined($test[0]),
|
||||||
|
phutil_utf8v_combined($test[1])),
|
||||||
|
"'{$test[0]}' vs '{$test[1]}' (utf8)");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public function testGenerateUTF8IntralineDiff() {
|
public function testGenerateUTF8IntralineDiff() {
|
||||||
|
@ -109,106 +126,117 @@ final class ArcanistDiffUtilsTestCase extends ArcanistTestCase {
|
||||||
);
|
);
|
||||||
$this->assertEqual(
|
$this->assertEqual(
|
||||||
$result,
|
$result,
|
||||||
ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
|
ArcanistDiffUtils::generateIntralineDiff($left, $right));
|
||||||
|
|
||||||
// Left String Empty.
|
// Left String Empty.
|
||||||
$left = "";
|
$left = "";
|
||||||
$right = "Grumpy\xCD\xA0at";
|
$right = "Grumpy\xE2\x98\x83at";
|
||||||
$result = array(
|
$result = array(
|
||||||
array(array(0, 0)),
|
array(array(0, 0)),
|
||||||
array(array(0, 10))
|
array(array(0, 11))
|
||||||
);
|
);
|
||||||
$this->assertEqual(
|
$this->assertEqual(
|
||||||
$result,
|
$result,
|
||||||
ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
|
ArcanistDiffUtils::generateIntralineDiff($left, $right));
|
||||||
|
|
||||||
// Right String Empty.
|
// Right String Empty.
|
||||||
$left = "Grumpy\xCD\xA0at";
|
$left = "Grumpy\xE2\x98\x83at";
|
||||||
$right = "";
|
$right = "";
|
||||||
$result = array(
|
$result = array(
|
||||||
array(array(0, 10)),
|
array(array(0, 11)),
|
||||||
array(array(0, 0))
|
array(array(0, 0))
|
||||||
);
|
);
|
||||||
$this->assertEqual(
|
$this->assertEqual(
|
||||||
$result,
|
$result,
|
||||||
ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
|
ArcanistDiffUtils::generateIntralineDiff($left, $right));
|
||||||
|
|
||||||
// Both Strings Same
|
// Both Strings Same
|
||||||
$left = "Grumpy\xCD\xA0at";
|
$left = "Grumpy\xE2\x98\x83at";
|
||||||
$right = "Grumpy\xCD\xA0at";
|
$right = "Grumpy\xE2\x98\x83at";
|
||||||
$result = array(
|
$result = array(
|
||||||
array(array(0, 10)),
|
array(array(0, 11)),
|
||||||
array(array(0, 10))
|
array(array(0, 11))
|
||||||
);
|
);
|
||||||
$this->assertEqual(
|
$this->assertEqual(
|
||||||
$result,
|
$result,
|
||||||
ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
|
ArcanistDiffUtils::generateIntralineDiff($left, $right));
|
||||||
|
|
||||||
// Both Strings are different.
|
// Both Strings are different.
|
||||||
$left = "Grumpy\xCD\xA0at";
|
$left = "Grumpy\xE2\x98\x83at";
|
||||||
$right = "Smiling Dog";
|
$right = "Smiling Dog";
|
||||||
$result = array(
|
$result = array(
|
||||||
array(array(1, 10)),
|
array(array(1, 11)),
|
||||||
array(array(1, 11))
|
array(array(1, 11))
|
||||||
);
|
);
|
||||||
$this->assertEqual(
|
$this->assertEqual(
|
||||||
$result,
|
$result,
|
||||||
ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
|
ArcanistDiffUtils::generateIntralineDiff($left, $right));
|
||||||
|
|
||||||
// String with one difference in the middle.
|
// String with one difference in the middle.
|
||||||
$left = "GrumpyCat";
|
$left = "GrumpyCat";
|
||||||
$right = "Grumpy\xCD\xA0at";
|
$right = "Grumpy\xE2\x98\x83at";
|
||||||
$result = array(
|
$result = array(
|
||||||
array(array(0, 6), array(1, 1), array(0, 2)),
|
array(array(0, 6), array(1, 1), array(0, 2)),
|
||||||
array(array(0, 6), array(1, 2), array(0, 2))
|
array(array(0, 6), array(1, 3), array(0, 2))
|
||||||
);
|
);
|
||||||
$this->assertEqual(
|
$this->assertEqual(
|
||||||
$result,
|
$result,
|
||||||
ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
|
ArcanistDiffUtils::generateIntralineDiff($left, $right));
|
||||||
|
|
||||||
// Differences in middle, not connected to each other.
|
// Differences in middle, not connected to each other.
|
||||||
$left = "GrumpyCat";
|
$left = "GrumpyCat";
|
||||||
$right = "Grumpy\xCD\xA0a\xCD\xA0t";
|
$right = "Grumpy\xE2\x98\x83a\xE2\x98\x83t";
|
||||||
$result = array(
|
$result = array(
|
||||||
array(array(0, 6), array(1, 2), array(0, 1)),
|
array(array(0, 6), array(1, 2), array(0, 1)),
|
||||||
array(array(0, 6), array(1, 5), array(0, 1))
|
array(array(0, 6), array(1, 7), array(0, 1))
|
||||||
);
|
);
|
||||||
$this->assertEqual(
|
$this->assertEqual(
|
||||||
$result,
|
$result,
|
||||||
ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
|
ArcanistDiffUtils::generateIntralineDiff($left, $right));
|
||||||
|
|
||||||
// String with difference at the beginning.
|
// String with difference at the beginning.
|
||||||
$left = "GrumpyC\xCD\xA0t";
|
$left = "GrumpyC\xE2\x98\x83t";
|
||||||
$right = "DrumpyC\xCD\xA0t";
|
$right = "DrumpyC\xE2\x98\x83t";
|
||||||
$result = array(
|
$result = array(
|
||||||
array(array(1, 10)),
|
array(array(1, 1), array(0, 10)),
|
||||||
array(array(1, 10))
|
array(array(1, 1), array(0, 10))
|
||||||
);
|
);
|
||||||
$this->assertEqual(
|
$this->assertEqual(
|
||||||
$result,
|
$result,
|
||||||
ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
|
ArcanistDiffUtils::generateIntralineDiff($left, $right));
|
||||||
|
|
||||||
// String with difference at the end.
|
// String with difference at the end.
|
||||||
$left = "GrumpyC\xCD\xA0t";
|
$left = "GrumpyC\xE2\x98\x83t";
|
||||||
$right = "GrumpyC\xCD\xA0P";
|
$right = "GrumpyC\xE2\x98\x83P";
|
||||||
$result = array(
|
$result = array(
|
||||||
array(array(1, 10)),
|
array(array(0, 10), array(1, 1)),
|
||||||
array(array(1, 10))
|
array(array(0, 10), array(1, 1))
|
||||||
);
|
);
|
||||||
$this->assertEqual(
|
$this->assertEqual(
|
||||||
$result,
|
$result,
|
||||||
ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
|
ArcanistDiffUtils::generateIntralineDiff($left, $right));
|
||||||
|
|
||||||
// String with differences at the beginning and end.
|
// String with differences at the beginning and end.
|
||||||
$left = "GrumpyC\xCD\xA0t";
|
$left = "GrumpyC\xE2\x98\x83t";
|
||||||
$right = "DrumpyC\xCD\xA0P";
|
$right = "DrumpyC\xE2\x98\x83P";
|
||||||
$result = array(
|
$result = array(
|
||||||
array(array(1, 10)),
|
array(array(1, 1), array(0, 9), array(1, 1)),
|
||||||
array(array(1, 10))
|
array(array(1, 1), array(0, 9), array(1, 1))
|
||||||
);
|
);
|
||||||
$this->assertEqual(
|
$this->assertEqual(
|
||||||
$result,
|
$result,
|
||||||
ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
|
ArcanistDiffUtils::generateIntralineDiff($left, $right));
|
||||||
|
|
||||||
|
// This is a unicode combining character, "COMBINING DOUBLE TILDE".
|
||||||
|
$cc = "\xCD\xA0";
|
||||||
|
$left = "Senor";
|
||||||
|
$right = "Sen{$cc}or";
|
||||||
|
$result = array(
|
||||||
|
array(array(0, 2), array(1, 1), array(0, 2)),
|
||||||
|
array(array(0, 2), array(1, 3), array(0, 2))
|
||||||
|
);
|
||||||
|
$this->assertEqual(
|
||||||
|
$result,
|
||||||
|
ArcanistDiffUtils::generateIntralineDiff($left, $right));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue