1
0
Fork 0
mirror of https://we.phorge.it/source/arcanist.git synced 2025-01-23 13:08:17 +01:00

Split paths on "diff --git" lines correctly in more cases

Summary:
See T1973. In T1675, we addressed parsing of diffs with `--no-prefix` or custom `--src-prefix` and `--dst-prefix` flags. However, this inadvetently broke diffing of files with spaces in their names, which Git does not quote. They look like this normally:

  diff --git a/old file b/new file

Prior to D3744, we accidentally got this right by looking for the `a/` and `b/`. However, we no longer do, and instead produce nonsense results.

This problem is difficult because for files with spaces, `git diff --no-prefix` may generate an ambiguous line like:

  diff --git a b c d e f g

From this line, we have no way to deterine if this moves "a" to "b c d e f g", or "a b c d" to "e f g", or anything in between. In some diffs we have more information later on, but in some cases we do not, e.g. for binary diffs without `--binary`.

Try to get this right in as many cases as possible:

  - If there are quotes, we can unambiguously get it right. This only happens for filenames with quotes or unicode characters, however.
  - If there is exactly one space, we can unambiguously get it right.
  - Interpret the common case of `a/<anything> b/<anything>` in the most-likely-correct way again.
  - Interpret the rare case of `<anything> <that same thing>` in the most-likely-correct way.
  - Complain about any `a b c d e f g` garbage.

Test Plan: Ran unit tests. Created a diff of a file called "File With Spaces".

Reviewers: vrana, btrahan

Reviewed By: btrahan

CC: aran, ReturnZero

Maniphest Tasks: T1973

Differential Revision: https://secure.phabricator.com/D3818
This commit is contained in:
epriestley 2012-10-25 14:46:44 -07:00
parent b8ead97637
commit 6be5cfd104
2 changed files with 119 additions and 12 deletions

View file

@ -222,7 +222,7 @@ final class ArcanistDiffParser {
'(?P<type>commit) (?P<hash>[a-f0-9]+)(?: \(.*\))?',
// This is a git diff, probably from "git show" or "git diff".
// Note that the filenames may appear quoted.
'(?P<type>diff --git) (?P<old>"?.+"?) (?P<cur>"?.+"?)',
'(?P<type>diff --git) (?P<oldnew>.*)',
// This is a unified diff, probably from "diff -u" or synthetic diffing.
'(?P<type>---) (?P<old>.+)\s+\d{4}-\d{2}-\d{2}.*',
'(?P<binary>Binary) files '.
@ -267,14 +267,9 @@ final class ArcanistDiffParser {
if (isset($match['type'])) {
if ($match['type'] == 'diff --git') {
if (isset($match['old'])) {
$match['old'] = $this->unescapeFilename($match['old']);
$match['old'] = self::stripGitPathPrefix($match['old']);
}
if (isset($match['cur'])) {
$match['cur'] = $this->unescapeFilename($match['cur']);
$match['cur'] = self::stripGitPathPrefix($match['cur']);
}
list($old, $new) = self::splitGitDiffPaths($match['oldnew']);
$match['old'] = $old;
$match['cur'] = $new;
}
}
@ -598,12 +593,12 @@ final class ArcanistDiffParser {
}
if (!empty($match['old'])) {
$match['old'] = $this->unescapeFilename($match['old']);
$match['old'] = self::unescapeFilename($match['old']);
$change->setOldPath($match['old']);
}
if (!empty($match['cur'])) {
$match['cur'] = $this->unescapeFilename($match['cur']);
$match['cur'] = self::unescapeFilename($match['cur']);
$change->setCurrentPath($match['cur']);
}
@ -1134,7 +1129,7 @@ final class ArcanistDiffParser {
/**
* Unescape escaped filenames, e.g. from "git diff".
*/
private function unescapeFilename($name) {
private static function unescapeFilename($name) {
if (preg_match('/^".+"$/', $name)) {
return stripcslashes(substr($name, 1, -1));
} else {
@ -1200,6 +1195,7 @@ final class ArcanistDiffParser {
$this->changes = $changes;
}
/**
* Strip prefixes off paths from `git diff`. By default git uses a/ and b/,
* but you can set `diff.mnemonicprefix` to get a different set of prefixes,
@ -1239,4 +1235,59 @@ final class ArcanistDiffParser {
return preg_replace($regex, '', $path);
}
/**
* Split the paths on a "diff --git" line into old and new paths. This
* is difficult because they may be ambiguous if the files contain spaces.
*
* @param string Text from a diff line after "diff --git ".
* @return pair<string, string> Old and new paths.
*/
public static function splitGitDiffPaths($paths) {
$matches = null;
$paths = rtrim($paths, "\r\n");
$patterns = array(
// Try quoted paths, used for unicode filenames or filenames with quotes.
'@^(?P<old>"(?:\\\\.|[^"\\\\]+)+") (?P<new>"(?:\\\\.|[^"\\\\]+)+")$@',
// Try paths without spaces.
'@^(?P<old>[^ ]+) (?P<new>[^ ]+)$@',
// Try paths with well-known prefixes.
'@^(?P<old>[abicwo12]/.*) (?P<new>[abicwo12]/.*)$@',
// Try the exact same string twice in a row separated by a space.
// This can hit a false positive for moves from files like "old file old"
// to "file", but such a case combined with custom diff prefixes is
// incredibly obscure.
'@^(?P<old>.*) (?P<new>\\1)$@',
);
foreach ($patterns as $pattern) {
if (preg_match($pattern, $paths, $matches)) {
break;
}
}
if (!$matches) {
throw new Exception(
"Input diff contains ambiguous line 'diff --git {$paths}'. This line ".
"is ambiguous because there are spaces in the file names, so the ".
"parser can not determine where the file names begin and end. To ".
"resolve this ambiguity, use standard prefixes ('a/' and 'b/') when ".
"generating diffs.");
}
$old = $matches['old'];
$old = self::unescapeFilename($old);
$old = self::stripGitPathPrefix($old);
$new = $matches['new'];
$new = self::unescapeFilename($new);
$new = self::stripGitPathPrefix($new);
return array($old, $new);
}
}

View file

@ -580,4 +580,60 @@ EOTEXT
}
}
public function testGitPathSplitting() {
static $tests = array(
"a/old.c b/new.c" => array('old.c', 'new.c'),
"a/old.c b/new.c\n" => array('old.c', 'new.c'),
"a/old.c b/new.c\r\n" => array('old.c', 'new.c'),
"old.c new.c" => array('old.c', 'new.c'),
"1/old.c 2/new.c" => array('old.c', 'new.c'),
'"a/\\"quotes1\\"" "b/\\"quotes2\\""' => array(
'"quotes1"',
'"quotes2"',
),
'"a/\\"quotes and spaces1\\"" "b/\\"quotes and spaces2\\""' => array(
'"quotes and spaces1"',
'"quotes and spaces2"',
),
'"a/\\342\\230\\2031" "b/\\342\\230\\2032"' => array(
"\xE2\x98\x831",
"\xE2\x98\x832",
),
"a/Core Data/old.c b/Core Data/new.c" => array(
'Core Data/old.c',
'Core Data/new.c',
),
"some file with spaces.c some file with spaces.c" => array(
'some file with spaces.c',
'some file with spaces.c',
),
);
foreach ($tests as $input => $expect) {
$result = ArcanistDiffParser::splitGitDiffPaths($input);
$this->assertEqual(
$expect,
$result,
"Split: {$input}");
}
static $ambiguous = array(
"old file with spaces.c new file with spaces.c",
);
foreach ($ambiguous as $input) {
$caught = null;
try {
ArcanistDiffParser::splitGitDiffPaths($input);
} catch (Exception $ex) {
$caught = $ex;
}
$this->assertEqual(
true,
($caught instanceof Exception),
"Ambiguous: {$input}");
}
}
}