diff --git a/src/difference/ArcanistDiffUtils.php b/src/difference/ArcanistDiffUtils.php
index ed8220d3..82e71e5d 100644
--- a/src/difference/ArcanistDiffUtils.php
+++ b/src/difference/ArcanistDiffUtils.php
@@ -59,10 +59,7 @@ final class ArcanistDiffUtils {
     // mark all the text as changed if either string has multibyte characters
     // in it. TODO: Fix this so that this algorithm is UTF-8 aware.
     if (preg_match('/[\x80-\xFF]/', $o.$n)) {
-      return array(
-        array(array(1, strlen($o))),
-        array(array(1, strlen($n))),
-      );
+      return self::generateUTF8IntralineDiff($o, $n);
     }
 
     $result = self::buildLevenshteinDifferenceString($o, $n);
@@ -342,4 +339,92 @@ final class ArcanistDiffUtils {
     return $prefix.strrev($result).$suffix;
   }
 
+  /**
+   * Generate an intraline diff for strings which may contain multibyte UTF-8
+   * characters.
+   *
+   * The strings are compared character by character (not byte by byte) to
+   * find their common prefix and common suffix; everything in between is
+   * marked as changed. All lengths in the result are byte lengths, and a
+   * run boundary is always placed between two whole characters.
+   *
+   * If the strings share no prefix or share no suffix, each string is
+   * reported as a single changed run.
+   *
+   * @param string Old UTF-8 string.
+   * @param string New UTF-8 string.
+   * @return pair<list<pair<int, int>>, list<pair<int, int>>> Runs for the old
+   *   and new strings. Each run is (type, byte length), where type 0 means
+   *   unchanged and type 1 means changed.
+   */
+  public static function generateUTF8IntralineDiff($o, $n) {
+    $old_length = strlen($o);
+    $new_length = strlen($n);
+
+    // An empty string or an identical pair has nothing to highlight.
+    if (!$old_length || !$new_length || $o === $n) {
+      return array(
+        array(array(0, $old_length)),
+        array(array(0, $new_length)),
+      );
+    }
+
+    // Compare whole characters so multibyte sequences are never split.
+    $old_characters = phutil_utf8v($o);
+    $new_characters = phutil_utf8v($n);
+
+    $old_count = count($old_characters);
+    $new_count = count($new_characters);
+    $limit = min($old_count, $new_count);
+
+    // Measure the common prefix, both in characters and in bytes. Bounding
+    // the scan by the shorter string avoids reading past its end.
+    $prefix_count = 0;
+    $prefix_bytes = 0;
+    while ($prefix_count < $limit) {
+      $character = $old_characters[$prefix_count];
+      if ($character !== $new_characters[$prefix_count]) {
+        break;
+      }
+      $prefix_bytes += strlen($character);
+      $prefix_count++;
+    }
+
+    // Measure the common suffix. It may not reach back into the prefix,
+    // otherwise the changed run in the middle would get a negative length.
+    $suffix_count = 0;
+    $suffix_bytes = 0;
+    $suffix_limit = $limit - $prefix_count;
+    while ($suffix_count < $suffix_limit) {
+      $character = $old_characters[$old_count - $suffix_count - 1];
+      if ($character !== $new_characters[$new_count - $suffix_count - 1]) {
+        break;
+      }
+      $suffix_bytes += strlen($character);
+      $suffix_count++;
+    }
+
+    // Strings which differ at the very beginning or the very end are not
+    // split into runs yet; mark them as changed in their entirety.
+    if (!$prefix_count || !$suffix_count) {
+      return array(
+        array(array(1, $old_length)),
+        array(array(1, $new_length)),
+      );
+    }
+
+    return array(
+      array(
+        array(0, $prefix_bytes),
+        array(1, $old_length - $prefix_bytes - $suffix_bytes),
+        array(0, $suffix_bytes),
+      ),
+      array(
+        array(0, $prefix_bytes),
+        array(1, $new_length - $prefix_bytes - $suffix_bytes),
+        array(0, $suffix_bytes),
+      ),
+    );
+  }
+
 }
diff --git a/src/difference/__tests__/ArcanistDiffUtilsTestCase.php b/src/difference/__tests__/ArcanistDiffUtilsTestCase.php
index 308f89fc..dab25d53 100644
--- a/src/difference/__tests__/ArcanistDiffUtilsTestCase.php
+++ b/src/difference/__tests__/ArcanistDiffUtilsTestCase.php
@@ -97,4 +97,150 @@ final class ArcanistDiffUtilsTestCase extends ArcanistTestCase {
           $test[1]));
     }
   }
+
+  public function testGenerateUTF8IntralineDiff() {
+    // Both Strings Empty.
+    $left = "";
+    $right = "";
+    $result = array(
+      array(array(0, 0)),
+      array(array(0, 0))
+    );
+    $this->assertEqual(
+      $result,
+      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
+
+    // Left String Empty.
+    $left = "";
+    $right = "Grumpy\xCD\xA0at";
+    $result = array(
+      array(array(0, 0)),
+      array(array(0, 10))
+    );
+    $this->assertEqual(
+      $result,
+      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
+
+    // Right String Empty.
+    $left = "Grumpy\xCD\xA0at";
+    $right = "";
+    $result = array(
+      array(array(0, 10)),
+      array(array(0, 0))
+    );
+    $this->assertEqual(
+      $result,
+      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
+
+    // Both Strings Same
+    $left = "Grumpy\xCD\xA0at";
+    $right = "Grumpy\xCD\xA0at";
+    $result = array(
+      array(array(0, 10)),
+      array(array(0, 10))
+    );
+    $this->assertEqual(
+      $result,
+      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
+
+    // Both Strings are different.
+    $left = "Grumpy\xCD\xA0at";
+    $right = "Smiling Dog";
+    $result = array(
+      array(array(1, 10)),
+      array(array(1, 11))
+    );
+    $this->assertEqual(
+      $result,
+      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
+
+    // String with one difference in the middle.
+    $left = "GrumpyCat";
+    $right = "Grumpy\xCD\xA0at";
+    $result = array(
+      array(array(0, 6), array(1, 1), array(0, 2)),
+      array(array(0, 6), array(1, 2), array(0, 2))
+    );
+    $this->assertEqual(
+      $result,
+      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
+
+    // Differences in middle, not connected to each other.
+    $left = "GrumpyCat";
+    $right = "Grumpy\xCD\xA0a\xCD\xA0t";
+    $result = array(
+      array(array(0, 6), array(1, 2), array(0, 1)),
+      array(array(0, 6), array(1, 5), array(0, 1))
+    );
+    $this->assertEqual(
+      $result,
+      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
+
+    // String with difference at the beginning.
+    $left = "GrumpyC\xCD\xA0t";
+    $right = "DrumpyC\xCD\xA0t";
+    $result = array(
+      array(array(1, 10)),
+      array(array(1, 10))
+    );
+    $this->assertEqual(
+      $result,
+      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
+
+    // String with difference at the end.
+    $left = "GrumpyC\xCD\xA0t";
+    $right = "GrumpyC\xCD\xA0P";
+    $result = array(
+      array(array(1, 10)),
+      array(array(1, 10))
+    );
+    $this->assertEqual(
+      $result,
+      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
+
+    // String with differences at the beginning and end.
+    $left = "GrumpyC\xCD\xA0t";
+    $right = "DrumpyC\xCD\xA0P";
+    $result = array(
+      array(array(1, 10)),
+      array(array(1, 10))
+    );
+    $this->assertEqual(
+      $result,
+      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
+
+    // Multibyte characters inside the common prefix and suffix.
+    $left = "\xCD\xA0GrumpyCat\xCD\xA0";
+    $right = "\xCD\xA0GrumpyDog\xCD\xA0";
+    $result = array(
+      array(array(0, 8), array(1, 3), array(0, 2)),
+      array(array(0, 8), array(1, 3), array(0, 2))
+    );
+    $this->assertEqual(
+      $result,
+      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
+
+    // Repeated text where the common prefix and suffix could overlap.
+    $left = "xy\xCD\xA0yx";
+    $right = "xy\xCD\xA0y\xCD\xA0yx";
+    $result = array(
+      array(array(0, 5), array(1, 0), array(0, 1)),
+      array(array(0, 5), array(1, 3), array(0, 1))
+    );
+    $this->assertEqual(
+      $result,
+      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
+
+    // New string is shorter than, and a prefix of, the old string.
+    $left = "Grumpy\xCD\xA0at";
+    $right = "Grumpy";
+    $result = array(
+      array(array(1, 10)),
+      array(array(1, 6))
+    );
+    $this->assertEqual(
+      $result,
+      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
+
+  }
 }