From 0b8e9823fa106cbb3c11125f0a1e03bac9fcc83b Mon Sep 17 00:00:00 2001 From: Afaque Hussain Date: Tue, 21 May 2013 13:06:14 -0700 Subject: [PATCH] Implementing prefix and suffix diffing for multi-byte unicode characters. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Prefix and suffix diffing for multi-byte unicode characters. Test Plan: Test diff: ------------------------------------------------------------ diff --git a/test b/test index 252ad2f..a5bad3b 100644 --- a/test +++ b/test @@ -1 +1 @@ -This file contains this and this stuff. +This file contains žžžž and this stuff. -------------------------------------------------------------- {F43738} -------------------------------------------------------------- Reviewers: epriestley Reviewed By: epriestley CC: aran, Korvin, AnhNhan Maniphest Tasks: T2379 Differential Revision: https://secure.phabricator.com/D5925 --- src/difference/ArcanistDiffUtils.php | 81 ++++++++++++- .../__tests__/ArcanistDiffUtilsTestCase.php | 113 ++++++++++++++++++ 2 files changed, 190 insertions(+), 4 deletions(-) diff --git a/src/difference/ArcanistDiffUtils.php b/src/difference/ArcanistDiffUtils.php index ed8220d3..82e71e5d 100644 --- a/src/difference/ArcanistDiffUtils.php +++ b/src/difference/ArcanistDiffUtils.php @@ -59,10 +59,7 @@ final class ArcanistDiffUtils { // mark all the text as changed if either string has multibyte characters // in it. TODO: Fix this so that this algorithm is UTF-8 aware. 
if (preg_match('/[\x80-\xFF]/', $o.$n)) {
-      return array(
-        array(array(1, strlen($o))),
-        array(array(1, strlen($n))),
-      );
+      return self::generateUTF8IntralineDiff($o, $n);
     }
 
     $result = self::buildLevenshteinDifferenceString($o, $n);
@@ -342,4 +339,86 @@ final class ArcanistDiffUtils {
     return $prefix.strrev($result).$suffix;
   }
 
+  /**
+   * Generate an intraline diff for two strings which may contain multi-byte
+   * UTF-8 characters by matching their common prefix and common suffix.
+   *
+   * Each side of the result is a list of `array($changed, $length)` segments:
+   * `$changed` is 1 for changed text and 0 for unchanged text, and `$length`
+   * is a length in bytes, so the lengths on each side sum to `strlen()` of
+   * that side.
+   *
+   * @param string Old line.
+   * @param string New line.
+   * @return pair<list, list> Segments for the old line and for the new line.
+   */
+  public static function generateUTF8IntralineDiff($o, $n) {
+    $old_length = strlen($o);
+    $new_length = strlen($n);
+
+    // If either string is empty or the strings are identical, report each
+    // side as a single unchanged segment.
+    if (!$old_length || !$new_length || $o === $n) {
+      return array(
+        array(array(0, $old_length)),
+        array(array(0, $new_length))
+      );
+    }
+
+    // Breaking both the strings into their component characters.
+    $old_characters = phutil_utf8v($o);
+    $new_characters = phutil_utf8v($n);
+
+    $old_count = count($old_characters);
+    $new_count = count($new_characters);
+    $min_count = min($old_count, $new_count);
+
+    // Prefix matching. Only compare positions which exist in both strings,
+    // and measure the match in bytes because segment lengths are in bytes.
+    $prefix_bytes = 0;
+    for ($i = 0; $i < $min_count; $i++) {
+      if ($old_characters[$i] !== $new_characters[$i]) {
+        break;
+      }
+      $prefix_bytes += strlen($old_characters[$i]);
+    }
+    $prefix_count = $i;
+
+    // Suffix matching. Stop at the end of the matched prefix so that the
+    // prefix and suffix never overlap (for example, "aab" against "ab").
+    $suffix_bytes = 0;
+    $i = $old_count - 1;
+    $j = $new_count - 1;
+    while ($i >= $prefix_count && $j >= $prefix_count) {
+      if ($old_characters[$i] !== $new_characters[$j]) {
+        break;
+      }
+      $suffix_bytes += strlen($old_characters[$i]);
+      $i--;
+      $j--;
+    }
+
+    // Just a temporary fix for the edge cases where the strings differ only
+    // at the beginning, only at the end, or at both the beginning and end.
+    if (!$prefix_bytes || !$suffix_bytes) {
+      return array(
+        array(array(1, $old_length)),
+        array(array(1, $new_length))
+      );
+    }
+
+    return array(
+      array(
+        array(0, $prefix_bytes),
+        array(1, $old_length - $prefix_bytes - $suffix_bytes),
+        array(0, $suffix_bytes),
+      ),
+      array(
+        array(0, $prefix_bytes),
+        array(1, $new_length - $prefix_bytes - $suffix_bytes),
+        array(0, $suffix_bytes),
+      )
+    );
+  }
+
 }
diff --git a/src/difference/__tests__/ArcanistDiffUtilsTestCase.php b/src/difference/__tests__/ArcanistDiffUtilsTestCase.php
index 308f89fc..dab25d53 100644
--- a/src/difference/__tests__/ArcanistDiffUtilsTestCase.php
+++ b/src/difference/__tests__/ArcanistDiffUtilsTestCase.php
@@ -97,4 +97,139 @@ final class ArcanistDiffUtilsTestCase extends ArcanistTestCase {
           $test[1]));
     }
   }
+
+  public function testGenerateUTF8IntralineDiff() {
+    // Both Strings Empty.
+    $left = "";
+    $right = "";
+    $result = array(
+      array(array(0, 0)),
+      array(array(0, 0))
+    );
+    $this->assertEqual(
+      $result,
+      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
+
+    // Left String Empty.
+    $left = "";
+    $right = "Grumpy\xCD\xA0at";
+    $result = array(
+      array(array(0, 0)),
+      array(array(0, 10))
+    );
+    $this->assertEqual(
+      $result,
+      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
+
+    // Right String Empty.
+    $left = "Grumpy\xCD\xA0at";
+    $right = "";
+    $result = array(
+      array(array(0, 10)),
+      array(array(0, 0))
+    );
+    $this->assertEqual(
+      $result,
+      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
+
+    // Both Strings Same
+    $left = "Grumpy\xCD\xA0at";
+    $right = "Grumpy\xCD\xA0at";
+    $result = array(
+      array(array(0, 10)),
+      array(array(0, 10))
+    );
+    $this->assertEqual(
+      $result,
+      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
+
+    // Both Strings are different.
+    $left = "Grumpy\xCD\xA0at";
+    $right = "Smiling Dog";
+    $result = array(
+      array(array(1, 10)),
+      array(array(1, 11))
+    );
+    $this->assertEqual(
+      $result,
+      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
+
+    // String with one difference in the middle.
+    $left = "GrumpyCat";
+    $right = "Grumpy\xCD\xA0at";
+    $result = array(
+      array(array(0, 6), array(1, 1), array(0, 2)),
+      array(array(0, 6), array(1, 2), array(0, 2))
+    );
+    $this->assertEqual(
+      $result,
+      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
+
+    // Differences in middle, not connected to each other.
+    $left = "GrumpyCat";
+    $right = "Grumpy\xCD\xA0a\xCD\xA0t";
+    $result = array(
+      array(array(0, 6), array(1, 2), array(0, 1)),
+      array(array(0, 6), array(1, 5), array(0, 1))
+    );
+    $this->assertEqual(
+      $result,
+      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
+
+    // String with difference at the beginning.
+    $left = "GrumpyC\xCD\xA0t";
+    $right = "DrumpyC\xCD\xA0t";
+    $result = array(
+      array(array(1, 10)),
+      array(array(1, 10))
+    );
+    $this->assertEqual(
+      $result,
+      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
+
+    // String with difference at the end.
+    $left = "GrumpyC\xCD\xA0t";
+    $right = "GrumpyC\xCD\xA0P";
+    $result = array(
+      array(array(1, 10)),
+      array(array(1, 10))
+    );
+    $this->assertEqual(
+      $result,
+      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
+
+    // String with differences at the beginning and end.
+    $left = "GrumpyC\xCD\xA0t";
+    $right = "DrumpyC\xCD\xA0P";
+    $result = array(
+      array(array(1, 10)),
+      array(array(1, 10))
+    );
+    $this->assertEqual(
+      $result,
+      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
+
+    // Multi-byte characters inside the common prefix and suffix.
+    $left = "\xC5\xBEaXb\xC5\xBE";
+    $right = "\xC5\xBEaYYb\xC5\xBE";
+    $result = array(
+      array(array(0, 3), array(1, 1), array(0, 3)),
+      array(array(0, 3), array(1, 2), array(0, 3))
+    );
+    $this->assertEqual(
+      $result,
+      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
+
+    // Common prefix and suffix which would overlap in the shorter string.
+    $left = "\xC5\xBEaab";
+    $right = "\xC5\xBEab";
+    $result = array(
+      array(array(0, 3), array(1, 1), array(0, 1)),
+      array(array(0, 3), array(1, 0), array(0, 1))
+    );
+    $this->assertEqual(
+      $result,
+      ArcanistDiffUtils::generateUTF8IntralineDiff($left, $right));
+
+  }
 }