mirror of
https://we.phorge.it/source/arcanist.git
synced 2025-01-22 20:51:09 +01:00
Implementing prefix and suffix diffing for multi-byte Unicode characters.
Summary: Prefix and suffix diffing for multi-byte unicode characters. Test Plan: Test diff: ------------------------------------------------------------ diff --git a/test b/test index 252ad2f..a5bad3b 100644 --- a/test +++ b/test @@ -1 +1 @@ -This file contains this and this stuff. +This file contains žžžž and this stuff. -------------------------------------------------------------- {F43738} -------------------------------------------------------------- Reviewers: epriestley Reviewed By: epriestley CC: aran, Korvin, AnhNhan Maniphest Tasks: T2379 Differential Revision: https://secure.phabricator.com/D5925
This commit is contained in:
parent
a1c0ba785d
commit
0b8e9823fa
2 changed files with 190 additions and 4 deletions
|
@ -59,10 +59,7 @@ final class ArcanistDiffUtils {
|
|||
// mark all the text as changed if either string has multibyte characters
|
||||
// in it. TODO: Fix this so that this algorithm is UTF-8 aware.
|
||||
if (preg_match('/[\x80-\xFF]/', $o.$n)) {
|
||||
return array(
|
||||
array(array(1, strlen($o))),
|
||||
array(array(1, strlen($n))),
|
||||
);
|
||||
return self::generateUTF8IntralineDiff($o, $n);
|
||||
}
|
||||
|
||||
$result = self::buildLevenshteinDifferenceString($o, $n);
|
||||
|
@ -342,4 +339,80 @@ final class ArcanistDiffUtils {
|
|||
return $prefix.strrev($result).$suffix;
|
||||
}
|
||||
|
||||
public static function generateUTF8IntralineDiff($o, $n) {
|
||||
if (!strlen($o) || !strlen($n)) {
|
||||
return array(
|
||||
array(array(0, strlen($o))),
|
||||
array(array(0, strlen($n)))
|
||||
);
|
||||
}
|
||||
|
||||
// Breaking both the strings into their component characters
|
||||
$old_characters = phutil_utf8v($o);
|
||||
$new_characters = phutil_utf8v($n);
|
||||
|
||||
$old_count = count($old_characters);
|
||||
$new_count = count($new_characters);
|
||||
|
||||
$prefix_match_length = 0;
|
||||
$suffix_match_length = 0;
|
||||
|
||||
// Prefix matching.
|
||||
for ($i = 0; $i < $old_count; $i++) {
|
||||
if ($old_characters[$i] != $new_characters[$i]) {
|
||||
$prefix_match_length = $i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Return no change.
|
||||
if ($old_count == $new_count && $i == $old_count) {
|
||||
return array(
|
||||
array(array(0, strlen($o))),
|
||||
array(array(0, strlen($n)))
|
||||
);
|
||||
}
|
||||
|
||||
// Suffix Matching.
|
||||
$i = $old_count - 1;
|
||||
$j = $new_count - 1;
|
||||
|
||||
while ($i >= 0 && $j >= 0) {
|
||||
if ($old_characters[$i] != $new_characters[$j]) {
|
||||
break;
|
||||
}
|
||||
|
||||
$i--;
|
||||
$j--;
|
||||
$suffix_match_length++;
|
||||
|
||||
}
|
||||
|
||||
// Just a temporary fix for the edge cases where, the strings differ
|
||||
// only at beginnning, only in the end and both at the beginning and end.
|
||||
if (!$prefix_match_length || !$suffix_match_length) {
|
||||
return array(
|
||||
array(array(1, strlen($o))),
|
||||
array(array(1, strlen($n)))
|
||||
);
|
||||
}
|
||||
|
||||
$old_length = strlen($o);
|
||||
$new_length = strlen($n);
|
||||
|
||||
return array(
|
||||
array(
|
||||
array(0, $prefix_match_length),
|
||||
array(1, $old_length - $prefix_match_length - $suffix_match_length),
|
||||
array(0, $suffix_match_length),
|
||||
),
|
||||
array(
|
||||
array(0, $prefix_match_length),
|
||||
array(1, $new_length - $prefix_match_length - $suffix_match_length),
|
||||
array(0, $suffix_match_length),
|
||||
)
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -97,4 +97,117 @@ final class ArcanistDiffUtilsTestCase extends ArcanistTestCase {
|
|||
$test[1]));
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Check generateUTF8IntralineDiff() against a fixed table of inputs.
 *
 * The expected segment lengths are in bytes: "\xCD\xA0" is a two-byte
 * sequence, so "Grumpy\xCD\xA0at" is expected to measure 10.
 */
public function testGenerateUTF8IntralineDiff() {
  // Each row is: left input, right input, expected segments for the left
  // string, expected segments for the right string.
  $cases = array(
    // Both Strings Empty.
    array(
      "",
      "",
      array(array(0, 0)),
      array(array(0, 0)),
    ),

    // Left String Empty.
    array(
      "",
      "Grumpy\xCD\xA0at",
      array(array(0, 0)),
      array(array(0, 10)),
    ),

    // Right String Empty.
    array(
      "Grumpy\xCD\xA0at",
      "",
      array(array(0, 10)),
      array(array(0, 0)),
    ),

    // Both Strings Same.
    array(
      "Grumpy\xCD\xA0at",
      "Grumpy\xCD\xA0at",
      array(array(0, 10)),
      array(array(0, 10)),
    ),

    // Both Strings are different.
    array(
      "Grumpy\xCD\xA0at",
      "Smiling Dog",
      array(array(1, 10)),
      array(array(1, 11)),
    ),

    // String with one difference in the middle.
    array(
      "GrumpyCat",
      "Grumpy\xCD\xA0at",
      array(array(0, 6), array(1, 1), array(0, 2)),
      array(array(0, 6), array(1, 2), array(0, 2)),
    ),

    // Differences in middle, not connected to each other.
    array(
      "GrumpyCat",
      "Grumpy\xCD\xA0a\xCD\xA0t",
      array(array(0, 6), array(1, 2), array(0, 1)),
      array(array(0, 6), array(1, 5), array(0, 1)),
    ),

    // String with difference at the beginning: the whole string is
    // expected to be marked as changed.
    array(
      "GrumpyC\xCD\xA0t",
      "DrumpyC\xCD\xA0t",
      array(array(1, 10)),
      array(array(1, 10)),
    ),

    // String with difference at the end: the whole string is expected to
    // be marked as changed.
    array(
      "GrumpyC\xCD\xA0t",
      "GrumpyC\xCD\xA0P",
      array(array(1, 10)),
      array(array(1, 10)),
    ),

    // String with differences at the beginning and end.
    array(
      "GrumpyC\xCD\xA0t",
      "DrumpyC\xCD\xA0P",
      array(array(1, 10)),
      array(array(1, 10)),
    ),
  );

  foreach ($cases as $case) {
    list($left, $right, $expect_left, $expect_right) = $case;

    $this->assertEqual(
      array($expect_left, $expect_right),
      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
  }
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue