From 6be5cfd104b5977513e2062b8a1e461bfadb2fdd Mon Sep 17 00:00:00 2001 From: epriestley Date: Thu, 25 Oct 2012 14:46:44 -0700 Subject: [PATCH] Split paths on "diff --git" lines correctly in more cases Summary: See T1973. In T1675, we addressed parsing of diffs with `--no-prefix` or custom `--src-prefix` and `--dst-prefix` flags. However, this inadvetently broke diffing of files with spaces in their names, which Git does not quote. They look like this normally: diff --git a/old file b/new file Prior to D3744, we accidentally got this right by looking for the `a/` and `b/`. However, we no longer do, and instead produce nonsense results. This problem is difficult because for files with spaces, `git diff --no-prefix` may generate an ambiguous line like: diff --git a b c d e f g From this line, we have no way to deterine if this moves "a" to "b c d e f g", or "a b c d" to "e f g", or anything in between. In some diffs we have more information later on, but in some cases we do not, e.g. for binary diffs without `--binary`. Try to get this right in as many cases as possible: - If there are quotes, we can unambiguously get it right. This only happens for filenames with quotes or unicode characters, however. - If there is exactly one space, we can unambiguously get it right. - Interpret the common case of `a/ b/` in the most-likely-correct way again. - Interpret the rare case of ` ` in the most-likely-correct way. - Complain about any `a b c d e f g` garbage. Test Plan: Ran unit tests. Created a diff of a file called "File With Spaces". Reviewers: vrana, btrahan Reviewed By: btrahan CC: aran, ReturnZero Maniphest Tasks: T1973 Differential Revision: https://secure.phabricator.com/D3818 --- src/parser/ArcanistDiffParser.php | 75 ++++++++++++++++--- .../__tests__/ArcanistDiffParserTestCase.php | 56 ++++++++++++++ 2 files changed, 119 insertions(+), 12 deletions(-) diff --git a/src/parser/ArcanistDiffParser.php b/src/parser/ArcanistDiffParser.php index ee55d721..9c40719c 100644 --- a/src/parser/ArcanistDiffParser.php +++ b/src/parser/ArcanistDiffParser.php @@ -222,7 +222,7 @@ final class ArcanistDiffParser { '(?Pcommit) (?P[a-f0-9]+)(?: \(.*\))?', // This is a git diff, probably from "git show" or "git diff". // Note that the filenames may appear quoted. - '(?Pdiff --git) (?P"?.+"?) (?P"?.+"?)', + '(?Pdiff --git) (?P.*)', // This is a unified diff, probably from "diff -u" or synthetic diffing. '(?P---) (?P.+)\s+\d{4}-\d{2}-\d{2}.*', '(?PBinary) files '. @@ -267,14 +267,9 @@ final class ArcanistDiffParser { if (isset($match['type'])) { if ($match['type'] == 'diff --git') { - if (isset($match['old'])) { - $match['old'] = $this->unescapeFilename($match['old']); - $match['old'] = self::stripGitPathPrefix($match['old']); - } - if (isset($match['cur'])) { - $match['cur'] = $this->unescapeFilename($match['cur']); - $match['cur'] = self::stripGitPathPrefix($match['cur']); - } + list($old, $new) = self::splitGitDiffPaths($match['oldnew']); + $match['old'] = $old; + $match['cur'] = $new; } } @@ -598,12 +593,12 @@ final class ArcanistDiffParser { } if (!empty($match['old'])) { - $match['old'] = $this->unescapeFilename($match['old']); + $match['old'] = self::unescapeFilename($match['old']); $change->setOldPath($match['old']); } if (!empty($match['cur'])) { - $match['cur'] = $this->unescapeFilename($match['cur']); + $match['cur'] = self::unescapeFilename($match['cur']); $change->setCurrentPath($match['cur']); } @@ -1134,7 +1129,7 @@ final class ArcanistDiffParser { /** * Unescape escaped filenames, e.g. from "git diff". */ - private function unescapeFilename($name) { + private static function unescapeFilename($name) { if (preg_match('/^".+"$/', $name)) { return stripcslashes(substr($name, 1, -1)); } else { @@ -1200,6 +1195,7 @@ final class ArcanistDiffParser { $this->changes = $changes; } + /** * Strip prefixes off paths from `git diff`. By default git uses a/ and b/, * but you can set `diff.mnemonicprefix` to get a different set of prefixes, @@ -1239,4 +1235,59 @@ final class ArcanistDiffParser { return preg_replace($regex, '', $path); } + + /** + * Split the paths on a "diff --git" line into old and new paths. This + * is difficult because they may be ambiguous if the files contain spaces. + * + * @param string Text from a diff line after "diff --git ". + * @return pair Old and new paths. + */ + public static function splitGitDiffPaths($paths) { + $matches = null; + $paths = rtrim($paths, "\r\n"); + + $patterns = array( + // Try quoted paths, used for unicode filenames or filenames with quotes. + '@^(?P"(?:\\\\.|[^"\\\\]+)+") (?P"(?:\\\\.|[^"\\\\]+)+")$@', + + // Try paths without spaces. + '@^(?P[^ ]+) (?P[^ ]+)$@', + + // Try paths with well-known prefixes. + '@^(?P[abicwo12]/.*) (?P[abicwo12]/.*)$@', + + // Try the exact same string twice in a row separated by a space. + // This can hit a false positive for moves from files like "old file old" + // to "file", but such a case combined with custom diff prefixes is + // incredibly obscure. + '@^(?P.*) (?P\\1)$@', + ); + + foreach ($patterns as $pattern) { + if (preg_match($pattern, $paths, $matches)) { + break; + } + } + + if (!$matches) { + throw new Exception( + "Input diff contains ambiguous line 'diff --git {$paths}'. This line ". + "is ambiguous because there are spaces in the file names, so the ". + "parser can not determine where the file names begin and end. To ". + "resolve this ambiguity, use standard prefixes ('a/' and 'b/') when ". + "generating diffs."); + } + + $old = $matches['old']; + $old = self::unescapeFilename($old); + $old = self::stripGitPathPrefix($old); + + $new = $matches['new']; + $new = self::unescapeFilename($new); + $new = self::stripGitPathPrefix($new); + + return array($old, $new); + } + } diff --git a/src/parser/__tests__/ArcanistDiffParserTestCase.php b/src/parser/__tests__/ArcanistDiffParserTestCase.php index e4c4c3e8..ed664145 100644 --- a/src/parser/__tests__/ArcanistDiffParserTestCase.php +++ b/src/parser/__tests__/ArcanistDiffParserTestCase.php @@ -580,4 +580,60 @@ EOTEXT } } + public function testGitPathSplitting() { + static $tests = array( + "a/old.c b/new.c" => array('old.c', 'new.c'), + "a/old.c b/new.c\n" => array('old.c', 'new.c'), + "a/old.c b/new.c\r\n" => array('old.c', 'new.c'), + "old.c new.c" => array('old.c', 'new.c'), + "1/old.c 2/new.c" => array('old.c', 'new.c'), + '"a/\\"quotes1\\"" "b/\\"quotes2\\""' => array( + '"quotes1"', + '"quotes2"', + ), + '"a/\\"quotes and spaces1\\"" "b/\\"quotes and spaces2\\""' => array( + '"quotes and spaces1"', + '"quotes and spaces2"', + ), + '"a/\\342\\230\\2031" "b/\\342\\230\\2032"' => array( + "\xE2\x98\x831", + "\xE2\x98\x832", + ), + "a/Core Data/old.c b/Core Data/new.c" => array( + 'Core Data/old.c', + 'Core Data/new.c', + ), + "some file with spaces.c some file with spaces.c" => array( + 'some file with spaces.c', + 'some file with spaces.c', + ), + ); + + foreach ($tests as $input => $expect) { + $result = ArcanistDiffParser::splitGitDiffPaths($input); + $this->assertEqual( + $expect, + $result, + "Split: {$input}"); + } + + + static $ambiguous = array( + "old file with spaces.c new file with spaces.c", + ); + + foreach ($ambiguous as $input) { + $caught = null; + try { + ArcanistDiffParser::splitGitDiffPaths($input); + } catch (Exception $ex) { + $caught = $ex; + } + $this->assertEqual( + true, + ($caught instanceof Exception), + "Ambiguous: {$input}"); + } + } + }