From d0957c344156356123048be2219fb95a54c89a85 Mon Sep 17 00:00:00 2001 From: Alex Vandiver Date: Tue, 16 Aug 2016 13:38:28 -0700 Subject: [PATCH] Parse git renames with inconsistent quoting or custom prefixes Summary: The previous parser failed when only one of the old/new filenames was quoted, as with a rename of a Unicode filename to a non-Unicode filename, or vice versa. It also incorrectly parsed custom prefixes, even going to far as to encode this logic in the tests: a diff of "src/file dst/file" which is not a rename should not be recorded as changing "src/file", but rather "file". Switch to using the "rename from" / "rename to" as the source of truth for old and current filenames when complex renames are done. This matches the logic that git itself uses to parse patches; the contents of the `diff --git` line are merely viewed as a simplest-case prediction, with renames handled later should it not match. The diff parser already had logic to parse "rename from" / "rename to" lines and extract their information. As such, this diff consists primarily of removing logic from the `splitGitDiffPaths` method, and allowing it to quietly fail. This resolves two ambiguity mentioned in comments and tests: in renaming "old file old" to "file", as well as the renaming of "old file with spaces" to "new file with spaces" when neither are quoted. Test Plan: `arc unit`. Many of the existing test cases no longer applied to the `splitGitDiffPaths` method; they were moved into a new test method which also supplies values for "rename from" and "rename to" lines. As noted in the summary, this alters one of the expected values of an existing test. Specifically, the following diff is a file addition of `file` with custom prefixes, and not the addition of "dst/file": ``` diff --git src/file dst/file new file mode 100644 index 0000000..1269488 --- /dev/null +++ dst/file @@ -0,0 +1 @@ +data ``` Reviewers: epriestley, #blessed_reviewers Reviewed By: epriestley, #blessed_reviewers Subscribers: Korvin, epriestley Differential Revision: https://secure.phabricator.com/D16405 --- src/parser/ArcanistDiffParser.php | 111 +++--------- .../__tests__/ArcanistDiffParserTestCase.php | 171 +++++++++++------- .../diff/custom-prefixes-edit.gitdiff | 11 ++ 3 files changed, 145 insertions(+), 148 deletions(-) create mode 100644 src/parser/__tests__/diff/custom-prefixes-edit.gitdiff diff --git a/src/parser/ArcanistDiffParser.php b/src/parser/ArcanistDiffParser.php index 711edb63..a310fb41 100644 --- a/src/parser/ArcanistDiffParser.php +++ b/src/parser/ArcanistDiffParser.php @@ -288,9 +288,11 @@ final class ArcanistDiffParser extends Phobject { if (isset($match['type'])) { if ($match['type'] == 'diff --git') { - list($old, $new) = self::splitGitDiffPaths($match['oldnew']); - $match['old'] = $old; - $match['cur'] = $new; + $filename = self::extractGitCommonFilename($match['oldnew']); + if ($filename !== null) { + $match['old'] = $filename; + $match['cur'] = $filename; + } } } @@ -1307,46 +1309,6 @@ final class ArcanistDiffParser extends Phobject { } - /** - * Strip prefixes off paths from `git diff`. By default git uses a/ and b/, - * but you can set `diff.mnemonicprefix` to get a different set of prefixes, - * or use `--no-prefix`, `--src-prefix` or `--dst-prefix` to set these to - * other arbitrary values. - * - * We strip the default and mnemonic prefixes, and trust the user knows what - * they're doing in the other cases. - * - * @param string Path to strip. - * @return string Stripped path. - */ - public static function stripGitPathPrefix($path) { - - static $regex; - if ($regex === null) { - $prefixes = array( - // These are the defaults. - 'a/', - 'b/', - - // These show up when you set "diff.mnemonicprefix". - 'i/', - 'c/', - 'w/', - 'o/', - '1/', - '2/', - ); - - foreach ($prefixes as $key => $prefix) { - $prefixes[$key] = preg_quote($prefix, '@'); - } - $regex = '@^('.implode('|', $prefixes).')@S'; - } - - return preg_replace($regex, '', $path); - } - - /** * Split the paths on a "diff --git" line into old and new paths. This * is difficult because they may be ambiguous if the files contain spaces. @@ -1354,53 +1316,40 @@ final class ArcanistDiffParser extends Phobject { * @param string Text from a diff line after "diff --git ". * @return pair Old and new paths. */ - public static function splitGitDiffPaths($paths) { + public static function extractGitCommonFilename($paths) { $matches = null; $paths = rtrim($paths, "\r\n"); - $patterns = array( - // Try quoted paths, used for unicode filenames or filenames with quotes. - '@^(?P"(?:\\\\.|[^"\\\\]+)+") (?P"(?:\\\\.|[^"\\\\]+)+")$@', + // Try the exact same string twice in a row separated by a + // space, with an optional prefix. This can hit a false + // positive for moves from files like "old file old" to "file", + // but such a cases will be caught by the "rename from" / + // "rename to" lines. + $prefix = '(?:[^/]+/)?'; + $pattern = + "@^(?P(?P\"?){$prefix}(?P.+)\\k)" + ." " + ."(?P(?P\"?){$prefix}\\k\\k)$@"; - // Try paths without spaces. - '@^(?P[^ ]+) (?P[^ ]+)$@', - - // Try paths with well-known prefixes. - '@^(?P[abicwo12]/.*) (?P[abicwo12]/.*)$@', - - // Try the exact same string twice in a row separated by a space. - // This can hit a false positive for moves from files like "old file old" - // to "file", but such a case combined with custom diff prefixes is - // incredibly obscure. - '@^(?P.*) (?P\\1)$@', - ); - - foreach ($patterns as $pattern) { - if (preg_match($pattern, $paths, $matches)) { - break; - } + if (!preg_match($pattern, $paths, $matches)) { + // A rename or some form; return null for now, and let the + // "rename from" / "rename to" lines fix it up. + return null; } - if (!$matches) { - throw new Exception( - pht( - "Input diff contains ambiguous line '%s'. This line is ambiguous ". - "because there are spaces in the file names, so the parser can not ". - "determine where the file names begin and end. To resolve this ". - "ambiguity, use standard prefixes ('a/' and 'b/') when ". - "generating diffs.", - "diff --git {$paths}")); - } + // Use the common subpart. There may be ambiguity here: "src/file + // dst/file" may _either_ be a prefix-less move, or a change with + // two custom prefixes. We assume it is the latter; if it is a + // rename, diff parsing will update based on the "rename from" / + // "rename to" lines. - $old = $matches['old']; - $old = self::unescapeFilename($old); - $old = self::stripGitPathPrefix($old); - - $new = $matches['new']; + // This re-assembles with the differing prefixes removed, but the + // quoting from the original. Necessary so we know if we should + // unescape characters from the common string. + $new = $matches['newq'].$matches['common'].$matches['newq']; $new = self::unescapeFilename($new); - $new = self::stripGitPathPrefix($new); - return array($old, $new); + return $new; } diff --git a/src/parser/__tests__/ArcanistDiffParserTestCase.php b/src/parser/__tests__/ArcanistDiffParserTestCase.php index 0976fb9b..c3f23b3f 100644 --- a/src/parser/__tests__/ArcanistDiffParserTestCase.php +++ b/src/parser/__tests__/ArcanistDiffParserTestCase.php @@ -535,7 +535,14 @@ EOTEXT $this->assertEqual(1, count($changes)); $change = head($changes); $this->assertEqual( - 'dst/file', + 'file', + $change->getCurrentPath()); + break; + case 'custom-prefixes-edit.gitdiff': + $this->assertEqual(1, count($changes)); + $change = head($changes); + $this->assertEqual( + 'file', $change->getCurrentPath()); break; case 'more-newlines.svndiff': @@ -607,81 +614,111 @@ EOTEXT } } - public function testGitPrefixStripping() { + public function testGitCommonFilenameExtraction() { static $tests = array( - 'a/file.c' => 'file.c', - 'b/file.c' => 'file.c', - 'i/file.c' => 'file.c', - 'c/file.c' => 'file.c', - 'w/file.c' => 'file.c', - 'o/file.c' => 'file.c', - '1/file.c' => 'file.c', - '2/file.c' => 'file.c', - 'src/file.c' => 'src/file.c', - 'file.c' => 'file.c', + 'a/filename.c b/filename.c' => 'filename.c', + "a/filename.c b/filename.c\n" => 'filename.c', + "a/filename.c b/filename.c\r\n" => 'filename.c', + 'filename.c filename.c' => 'filename.c', + '1/filename.c 2/filename.c' => 'filename.c', + '"a/\\"quotes\\"" "b/\\"quotes\\""' => '"quotes"', + '"a/\\"quotes and spaces\\"" "b/\\"quotes and spaces\\""' => + '"quotes and spaces"', + '"a/\\342\\230\\203" "b/\\342\\230\\203"' => + "\xE2\x98\x83", + 'a/Core Data/filename.c b/Core Data/filename.c' => + 'Core Data/filename.c', + 'some file with spaces.c some file with spaces.c' => + 'some file with spaces.c', + '"foo bar.c" foo bar.c' => 'foo bar.c', + '"a/foo bar.c" b/foo bar.c' => 'foo bar.c', + 'src/file dst/file' => 'file', + + // Renames are handled by the "rename from ..." lines later in + // the diff, for simplicity of parsing; this is also how git + // itself handles it. + 'a/foo.c b/bar.c' => null, + 'a/foo bar.c b/baz troz.c' => null, + '"a/foo bar.c" b/baz troz.c' => null, + 'a/foo bar.c "b/baz troz.c"' => null, + '"a/foo bar.c" "b/baz troz.c"' => null, + 'filename file with spaces.c filename file with spaces.c' => + 'filename file with spaces.c', ); foreach ($tests as $input => $expect) { - $this->assertEqual( - $expect, - ArcanistDiffParser::stripGitPathPrefix($input), - pht("Strip git prefix from '%s'.", $input)); - } - } - - public function testGitPathSplitting() { - static $tests = array( - 'a/old.c b/new.c' => array('old.c', 'new.c'), - "a/old.c b/new.c\n" => array('old.c', 'new.c'), - "a/old.c b/new.c\r\n" => array('old.c', 'new.c'), - 'old.c new.c' => array('old.c', 'new.c'), - '1/old.c 2/new.c' => array('old.c', 'new.c'), - '"a/\\"quotes1\\"" "b/\\"quotes2\\""' => array( - '"quotes1"', - '"quotes2"', - ), - '"a/\\"quotes and spaces1\\"" "b/\\"quotes and spaces2\\""' => array( - '"quotes and spaces1"', - '"quotes and spaces2"', - ), - '"a/\\342\\230\\2031" "b/\\342\\230\\2032"' => array( - "\xE2\x98\x831", - "\xE2\x98\x832", - ), - 'a/Core Data/old.c b/Core Data/new.c' => array( - 'Core Data/old.c', - 'Core Data/new.c', - ), - 'some file with spaces.c some file with spaces.c' => array( - 'some file with spaces.c', - 'some file with spaces.c', - ), - ); - - foreach ($tests as $input => $expect) { - $result = ArcanistDiffParser::splitGitDiffPaths($input); + $result = ArcanistDiffParser::extractGitCommonFilename($input); $this->assertEqual( $expect, $result, pht('Split: %s', $input)); } - - - static $ambiguous = array( - 'old file with spaces.c new file with spaces.c', - ); - - foreach ($ambiguous as $input) { - $caught = null; - try { - ArcanistDiffParser::splitGitDiffPaths($input); - } catch (Exception $ex) { - $caught = $ex; - } - $this->assertTrue( - ($caught instanceof Exception), - pht('Ambiguous: %s', $input)); - } } + + public function runSingleRename($diffline, $from, $to, $old, $new) { + $str = "diff --git $diffline\nsimilarity index 95%\n" + ."rename from $from\nrename to $to\n"; + $parser = new ArcanistDiffParser(); + $changes = $parser->parseDiff($str); + $this->assertTrue( + $changes !== null, + pht("Parsed:\n%s", $str)); + $this->assertEqual( + $old == $new ? 1 : 2, count($changes), + pht("Parsed one change:\n%s", $str)); + $change = reset($changes); + $this->assertEqual( + array($old, $new), + array($change->getOldPath(), $change->getCurrentPath()), + pht('Split: %s', $diffline)); + } + + public function testGitRenames() { + $this->runSingleRename('a/old.c b/new.c', + 'old.c', 'new.c', + 'old.c', 'new.c'); + $this->runSingleRename('old.c new.c', + 'old.c', 'new.c', + 'old.c', 'new.c'); + $this->runSingleRename('1/old.c 2/new.c', + 'old.c', 'new.c', + 'old.c', 'new.c'); + $this->runSingleRename('from/file.c to/file.c', + 'from/file.c', 'to/file.c', + 'from/file.c', 'to/file.c'); + $this->runSingleRename('"a/\\"quotes1\\"" "b/\\"quotes2\\""', + '"\\"quotes1\\""', '"\\"quotes2\\""', + '"quotes1"', '"quotes2"'); + $this->runSingleRename('"a/\\"quotes spaces1\\"" "b/\\"quotes spaces2\\""', + '"\\"quotes spaces1\\""', '"\\"quotes spaces2\\""', + '"quotes spaces1"', '"quotes spaces2"'); + $this->runSingleRename('"a/\\342\\230\\2031" "b/\\342\\230\\2032"', + '"\\342\\230\\2031"', '"\\342\\230\\2032"', + "\xE2\x98\x831", "\xE2\x98\x832"); + $this->runSingleRename('a/Core Data/old.c b/Core Data/new.c', + 'Core Data/old.c', 'Core Data/new.c', + 'Core Data/old.c', 'Core Data/new.c'); + $this->runSingleRename('file with spaces.c file with spaces.c', + 'file with spaces.c', 'file with spaces.c', + 'file with spaces.c', 'file with spaces.c'); + $this->runSingleRename('a/non-quoted filename.c "b/quoted filename.c"', + 'non-quoted filename.c', '"quoted filename.c"', + 'non-quoted filename.c', 'quoted filename.c'); + $this->runSingleRename('non-quoted filename.c "quoted filename.c"', + 'non-quoted filename.c', '"quoted filename.c"', + 'non-quoted filename.c', 'quoted filename.c'); + $this->runSingleRename('"a/quoted filename.c" b/non quoted filename.c', + '"quoted filename.c"', 'non quoted filename.c', + 'quoted filename.c', 'non quoted filename.c'); + $this->runSingleRename('"quoted filename.c" non-quoted filename.c', + '"quoted filename.c"', 'non-quoted filename.c', + 'quoted filename.c', 'non-quoted filename.c'); + $this->runSingleRename('old file with spaces.c new file with spaces.c', + 'old file with spaces.c', 'new file with spaces.c', + 'old file with spaces.c', 'new file with spaces.c'); + $this->runSingleRename('old file old file', + 'old file old', 'file', + 'old file old', 'file'); + } } diff --git a/src/parser/__tests__/diff/custom-prefixes-edit.gitdiff b/src/parser/__tests__/diff/custom-prefixes-edit.gitdiff new file mode 100644 index 00000000..31abe8b9 --- /dev/null +++ b/src/parser/__tests__/diff/custom-prefixes-edit.gitdiff @@ -0,0 +1,11 @@ +diff --git src/file dst/file +index 711edb6..a8cd7be 100644 +--- src/file ++++ dst/file +@@ -5,5 +5,5 @@ + some context + other context +-old value ++new value + some context + other context