1
0
Fork 0
mirror of https://we.phorge.it/source/arcanist.git synced 2024-11-22 14:52:40 +01:00

Implementing prefix and suffix diffing for multi-byte unicode characters.

Summary: Prefix and suffix diffing for multi-byte unicode characters.

Test Plan:
Test diff:
------------------------------------------------------------
diff --git a/test b/test
index 252ad2f..a5bad3b 100644
--- a/test
+++ b/test
@@ -1 +1 @@
-This file contains this and this stuff.
+This file contains žžžž and this stuff.
--------------------------------------------------------------

{F43738}
--------------------------------------------------------------

Reviewers: epriestley

Reviewed By: epriestley

CC: aran, Korvin, AnhNhan

Maniphest Tasks: T2379

Differential Revision: https://secure.phabricator.com/D5925
This commit is contained in:
Afaque Hussain 2013-05-21 13:06:14 -07:00 committed by epriestley
parent a1c0ba785d
commit 0b8e9823fa
2 changed files with 190 additions and 4 deletions

View file

@ -59,10 +59,7 @@ final class ArcanistDiffUtils {
// mark all the text as changed if either string has multibyte characters // mark all the text as changed if either string has multibyte characters
// in it. TODO: Fix this so that this algorithm is UTF-8 aware. // in it. TODO: Fix this so that this algorithm is UTF-8 aware.
if (preg_match('/[\x80-\xFF]/', $o.$n)) { if (preg_match('/[\x80-\xFF]/', $o.$n)) {
return array( return self::generateUTF8IntralineDiff($o, $n);
array(array(1, strlen($o))),
array(array(1, strlen($n))),
);
} }
$result = self::buildLevenshteinDifferenceString($o, $n); $result = self::buildLevenshteinDifferenceString($o, $n);
@ -342,4 +339,80 @@ final class ArcanistDiffUtils {
return $prefix.strrev($result).$suffix; return $prefix.strrev($result).$suffix;
} }
/**
 * Build an intraline diff for two strings which may contain multi-byte
 * UTF-8 characters, using only their common prefix and common suffix.
 *
 * The strings are split into characters with phutil_utf8v() so a multi-byte
 * character is never cut in half, but every length in the result is
 * expressed in BYTES (the same unit as strlen()).
 *
 * @param   string  $o  Old version of the line.
 * @param   string  $n  New version of the line.
 * @return  array   Pair of segment lists, one for the old string and one
 *                  for the new string. Each segment is
 *                  array($type, $byte_length), where type 0 marks unchanged
 *                  text and type 1 marks changed text.
 */
public static function generateUTF8IntralineDiff($o, $n) {
  $old_length = strlen($o);
  $new_length = strlen($n);

  // NOTE: When either side is empty, both sides are reported as a single
  // "unchanged" segment. The unit tests pin this behavior, so it is kept.
  if (!$old_length || !$new_length) {
    return array(
      array(array(0, $old_length)),
      array(array(0, $new_length)),
    );
  }

  // Break both strings into their component characters.
  $old_characters = phutil_utf8v($o);
  $new_characters = phutil_utf8v($n);

  $old_count = count($old_characters);
  $new_count = count($new_characters);
  $min_count = min($old_count, $new_count);

  // Prefix matching. Bounded by the shorter string so we never read past
  // the end of either character list.
  $prefix_count = 0;
  while ($prefix_count < $min_count &&
    $old_characters[$prefix_count] === $new_characters[$prefix_count]) {
    $prefix_count++;
  }

  // The strings are identical: return no change.
  if ($prefix_count == $old_count && $prefix_count == $new_count) {
    return array(
      array(array(0, $old_length)),
      array(array(0, $new_length)),
    );
  }

  // Suffix matching. The suffix may only use characters which were not
  // already consumed by the prefix; otherwise inputs like "aba" vs "abba"
  // would count the same characters twice and yield a negative length for
  // the changed segment.
  $max_suffix_count = $min_count - $prefix_count;
  $suffix_count = 0;
  while ($suffix_count < $max_suffix_count &&
    $old_characters[$old_count - 1 - $suffix_count] ===
    $new_characters[$new_count - 1 - $suffix_count]) {
    $suffix_count++;
  }

  // Just a temporary fix for the edge cases where the strings differ only
  // at the beginning, only at the end, or at both the beginning and the
  // end: mark the whole line as changed.
  if (!$prefix_count || !$suffix_count) {
    return array(
      array(array(1, $old_length)),
      array(array(1, $new_length)),
    );
  }

  // Convert the character counts into byte lengths. The shared prefix and
  // suffix are byte-identical in both strings, so measuring them on the old
  // string is sufficient. $suffix_count is at least 1 here, so the negative
  // offset passed to array_slice() is always valid.
  $prefix_length = strlen(
    implode('', array_slice($old_characters, 0, $prefix_count)));
  $suffix_length = strlen(
    implode('', array_slice($old_characters, -$suffix_count)));

  // The changed segment may have length 0 on one side when the edit is a
  // pure insertion or a pure deletion.
  return array(
    array(
      array(0, $prefix_length),
      array(1, $old_length - $prefix_length - $suffix_length),
      array(0, $suffix_length),
    ),
    array(
      array(0, $prefix_length),
      array(1, $new_length - $prefix_length - $suffix_length),
      array(0, $suffix_length),
    ),
  );
}
} }

View file

@ -97,4 +97,117 @@ final class ArcanistDiffUtilsTestCase extends ArcanistTestCase {
$test[1])); $test[1]));
} }
} }
/**
 * Exercise ArcanistDiffUtils::generateUTF8IntralineDiff() against a table
 * of (old string, new string, expected result) cases. The cases are
 * asserted in the order they are listed.
 */
public function testGenerateUTF8IntralineDiff() {
  $cases = array(
    // Both strings empty.
    array(
      "",
      "",
      array(
        array(array(0, 0)),
        array(array(0, 0)),
      ),
    ),

    // Left string empty.
    array(
      "",
      "Grumpy\xCD\xA0at",
      array(
        array(array(0, 0)),
        array(array(0, 10)),
      ),
    ),

    // Right string empty.
    array(
      "Grumpy\xCD\xA0at",
      "",
      array(
        array(array(0, 10)),
        array(array(0, 0)),
      ),
    ),

    // Both strings the same.
    array(
      "Grumpy\xCD\xA0at",
      "Grumpy\xCD\xA0at",
      array(
        array(array(0, 10)),
        array(array(0, 10)),
      ),
    ),

    // Both strings entirely different.
    array(
      "Grumpy\xCD\xA0at",
      "Smiling Dog",
      array(
        array(array(1, 10)),
        array(array(1, 11)),
      ),
    ),

    // One difference in the middle.
    array(
      "GrumpyCat",
      "Grumpy\xCD\xA0at",
      array(
        array(array(0, 6), array(1, 1), array(0, 2)),
        array(array(0, 6), array(1, 2), array(0, 2)),
      ),
    ),

    // Differences in the middle, not connected to each other.
    array(
      "GrumpyCat",
      "Grumpy\xCD\xA0a\xCD\xA0t",
      array(
        array(array(0, 6), array(1, 2), array(0, 1)),
        array(array(0, 6), array(1, 5), array(0, 1)),
      ),
    ),

    // Difference at the beginning.
    array(
      "GrumpyC\xCD\xA0t",
      "DrumpyC\xCD\xA0t",
      array(
        array(array(1, 10)),
        array(array(1, 10)),
      ),
    ),

    // Difference at the end.
    array(
      "GrumpyC\xCD\xA0t",
      "GrumpyC\xCD\xA0P",
      array(
        array(array(1, 10)),
        array(array(1, 10)),
      ),
    ),

    // Differences at both the beginning and the end.
    array(
      "GrumpyC\xCD\xA0t",
      "DrumpyC\xCD\xA0P",
      array(
        array(array(1, 10)),
        array(array(1, 10)),
      ),
    ),
  );

  foreach ($cases as $case) {
    list($left, $right, $expect) = $case;

    $actual = ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right);
    $this->assertEqual($expect, $actual);
  }
}
} }