mirror of
https://we.phorge.it/source/arcanist.git
synced 2024-11-26 00:32:41 +01:00
Implement prefix and suffix diffing for multi-byte Unicode characters.
Summary: Prefix and suffix diffing for multi-byte unicode characters. Test Plan: Test diff: ------------------------------------------------------------ diff --git a/test b/test index 252ad2f..a5bad3b 100644 --- a/test +++ b/test @@ -1 +1 @@ -This file contains this and this stuff. +This file contains žžžž and this stuff. -------------------------------------------------------------- {F43738} -------------------------------------------------------------- Reviewers: epriestley Reviewed By: epriestley CC: aran, Korvin, AnhNhan Maniphest Tasks: T2379 Differential Revision: https://secure.phabricator.com/D5925
This commit is contained in:
parent
a1c0ba785d
commit
0b8e9823fa
2 changed files with 190 additions and 4 deletions
|
@ -59,10 +59,7 @@ final class ArcanistDiffUtils {
|
||||||
// mark all the text as changed if either string has multibyte characters
|
// mark all the text as changed if either string has multibyte characters
|
||||||
// in it. TODO: Fix this so that this algorithm is UTF-8 aware.
|
// in it. TODO: Fix this so that this algorithm is UTF-8 aware.
|
||||||
if (preg_match('/[\x80-\xFF]/', $o.$n)) {
|
if (preg_match('/[\x80-\xFF]/', $o.$n)) {
|
||||||
return array(
|
return self::generateUTF8IntralineDiff($o, $n);
|
||||||
array(array(1, strlen($o))),
|
|
||||||
array(array(1, strlen($n))),
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
$result = self::buildLevenshteinDifferenceString($o, $n);
|
$result = self::buildLevenshteinDifferenceString($o, $n);
|
||||||
|
@ -342,4 +339,80 @@ final class ArcanistDiffUtils {
|
||||||
return $prefix.strrev($result).$suffix;
|
return $prefix.strrev($result).$suffix;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Build a coarse intraline diff for two strings that may contain multi-byte
 * UTF-8 characters, by trimming their common prefix and common suffix.
 *
 * Both strings are split into characters with phutil_utf8v() so a multi-byte
 * character is never cut in half. All lengths in the result are measured in
 * bytes (they are derived from strlen()).
 *
 * @param string $o Old line.
 * @param string $n New line.
 * @return array Two lists, the first describing $o and the second $n. Each
 *               list holds array(flag, byte_length) pairs, where flag 0
 *               marks an unchanged run and flag 1 marks a changed run.
 */
public static function generateUTF8IntralineDiff($o, $n) {
  $old_length = strlen($o);
  $new_length = strlen($n);

  // If either side is empty there is nothing to align.
  if (!$old_length || !$new_length) {
    return array(
      array(array(0, $old_length)),
      array(array(0, $new_length)),
    );
  }

  // Break both strings into their component characters.
  $old_characters = phutil_utf8v($o);
  $new_characters = phutil_utf8v($n);

  $old_count = count($old_characters);
  $new_count = count($new_characters);

  // Never index past the end of the shorter string.
  $shared_count = min($old_count, $new_count);

  // Prefix matching. Track the match both in characters (for indexing) and
  // in bytes (for the result, which is expressed in byte lengths).
  $prefix_count = 0;
  $prefix_bytes = 0;
  while ($prefix_count < $shared_count &&
         $old_characters[$prefix_count] === $new_characters[$prefix_count]) {
    $prefix_bytes += strlen($old_characters[$prefix_count]);
    $prefix_count++;
  }

  // Every character matched on both sides: return no change.
  if ($prefix_count == $old_count && $prefix_count == $new_count) {
    return array(
      array(array(0, $old_length)),
      array(array(0, $new_length)),
    );
  }

  // Suffix matching. The suffix may only consume characters which were not
  // already claimed by the prefix; otherwise the two runs overlap and the
  // changed segment in between would get a negative length.
  $suffix_limit = $shared_count - $prefix_count;
  $suffix_count = 0;
  $suffix_bytes = 0;
  while ($suffix_count < $suffix_limit) {
    $old_character = $old_characters[$old_count - 1 - $suffix_count];
    $new_character = $new_characters[$new_count - 1 - $suffix_count];
    if ($old_character !== $new_character) {
      break;
    }

    $suffix_bytes += strlen($old_character);
    $suffix_count++;
  }

  // Just a temporary fix for the edge cases where the strings differ only
  // at the beginning, only at the end, or at both the beginning and the end:
  // mark all of the text as changed.
  if (!$prefix_count || !$suffix_count) {
    return array(
      array(array(1, $old_length)),
      array(array(1, $new_length)),
    );
  }

  return array(
    array(
      array(0, $prefix_bytes),
      array(1, $old_length - $prefix_bytes - $suffix_bytes),
      array(0, $suffix_bytes),
    ),
    array(
      array(0, $prefix_bytes),
      array(1, $new_length - $prefix_bytes - $suffix_bytes),
      array(0, $suffix_bytes),
    ),
  );
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -97,4 +97,117 @@ final class ArcanistDiffUtilsTestCase extends ArcanistTestCase {
|
||||||
$test[1]));
|
$test[1]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Exercise ArcanistDiffUtils::generateUTF8IntralineDiff() against a table of
 * inputs. Each entry is array(left, right, expected result); the key is a
 * human-readable description of the scenario.
 */
public function testGenerateUTF8IntralineDiff() {
  $scenarios = array(
    'Both strings empty.' => array(
      '',
      '',
      array(
        array(array(0, 0)),
        array(array(0, 0)),
      ),
    ),
    'Left string empty.' => array(
      '',
      "Grumpy\xCD\xA0at",
      array(
        array(array(0, 0)),
        array(array(0, 10)),
      ),
    ),
    'Right string empty.' => array(
      "Grumpy\xCD\xA0at",
      '',
      array(
        array(array(0, 10)),
        array(array(0, 0)),
      ),
    ),
    'Both strings same.' => array(
      "Grumpy\xCD\xA0at",
      "Grumpy\xCD\xA0at",
      array(
        array(array(0, 10)),
        array(array(0, 10)),
      ),
    ),
    'Both strings are different.' => array(
      "Grumpy\xCD\xA0at",
      'Smiling Dog',
      array(
        array(array(1, 10)),
        array(array(1, 11)),
      ),
    ),
    'String with one difference in the middle.' => array(
      'GrumpyCat',
      "Grumpy\xCD\xA0at",
      array(
        array(array(0, 6), array(1, 1), array(0, 2)),
        array(array(0, 6), array(1, 2), array(0, 2)),
      ),
    ),
    'Differences in middle, not connected to each other.' => array(
      'GrumpyCat',
      "Grumpy\xCD\xA0a\xCD\xA0t",
      array(
        array(array(0, 6), array(1, 2), array(0, 1)),
        array(array(0, 6), array(1, 5), array(0, 1)),
      ),
    ),
    'String with difference at the beginning.' => array(
      "GrumpyC\xCD\xA0t",
      "DrumpyC\xCD\xA0t",
      array(
        array(array(1, 10)),
        array(array(1, 10)),
      ),
    ),
    'String with difference at the end.' => array(
      "GrumpyC\xCD\xA0t",
      "GrumpyC\xCD\xA0P",
      array(
        array(array(1, 10)),
        array(array(1, 10)),
      ),
    ),
    'String with differences at the beginning and end.' => array(
      "GrumpyC\xCD\xA0t",
      "DrumpyC\xCD\xA0P",
      array(
        array(array(1, 10)),
        array(array(1, 10)),
      ),
    ),
  );

  // Run the scenarios in declaration order.
  foreach ($scenarios as $scenario) {
    list($left, $right, $expect) = $scenario;

    $this->assertEqual(
      $expect,
      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
  }
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue