mirror of
https://we.phorge.it/source/arcanist.git
synced 2024-11-26 08:42:40 +01:00
Split paths on "diff --git" lines correctly in more cases
Summary: See T1973. In T1675, we addressed parsing of diffs with `--no-prefix` or custom `--src-prefix` and `--dst-prefix` flags. However, this inadvetently broke diffing of files with spaces in their names, which Git does not quote. They look like this normally: diff --git a/old file b/new file Prior to D3744, we accidentally got this right by looking for the `a/` and `b/`. However, we no longer do, and instead produce nonsense results. This problem is difficult because for files with spaces, `git diff --no-prefix` may generate an ambiguous line like: diff --git a b c d e f g From this line, we have no way to deterine if this moves "a" to "b c d e f g", or "a b c d" to "e f g", or anything in between. In some diffs we have more information later on, but in some cases we do not, e.g. for binary diffs without `--binary`. Try to get this right in as many cases as possible: - If there are quotes, we can unambiguously get it right. This only happens for filenames with quotes or unicode characters, however. - If there is exactly one space, we can unambiguously get it right. - Interpret the common case of `a/<anything> b/<anything>` in the most-likely-correct way again. - Interpret the rare case of `<anything> <that same thing>` in the most-likely-correct way. - Complain about any `a b c d e f g` garbage. Test Plan: Ran unit tests. Created a diff of a file called "File With Spaces". Reviewers: vrana, btrahan Reviewed By: btrahan CC: aran, ReturnZero Maniphest Tasks: T1973 Differential Revision: https://secure.phabricator.com/D3818
This commit is contained in:
parent
b8ead97637
commit
6be5cfd104
2 changed files with 119 additions and 12 deletions
|
@ -222,7 +222,7 @@ final class ArcanistDiffParser {
|
||||||
'(?P<type>commit) (?P<hash>[a-f0-9]+)(?: \(.*\))?',
|
'(?P<type>commit) (?P<hash>[a-f0-9]+)(?: \(.*\))?',
|
||||||
// This is a git diff, probably from "git show" or "git diff".
|
// This is a git diff, probably from "git show" or "git diff".
|
||||||
// Note that the filenames may appear quoted.
|
// Note that the filenames may appear quoted.
|
||||||
'(?P<type>diff --git) (?P<old>"?.+"?) (?P<cur>"?.+"?)',
|
'(?P<type>diff --git) (?P<oldnew>.*)',
|
||||||
// This is a unified diff, probably from "diff -u" or synthetic diffing.
|
// This is a unified diff, probably from "diff -u" or synthetic diffing.
|
||||||
'(?P<type>---) (?P<old>.+)\s+\d{4}-\d{2}-\d{2}.*',
|
'(?P<type>---) (?P<old>.+)\s+\d{4}-\d{2}-\d{2}.*',
|
||||||
'(?P<binary>Binary) files '.
|
'(?P<binary>Binary) files '.
|
||||||
|
@ -267,14 +267,9 @@ final class ArcanistDiffParser {
|
||||||
|
|
||||||
if (isset($match['type'])) {
|
if (isset($match['type'])) {
|
||||||
if ($match['type'] == 'diff --git') {
|
if ($match['type'] == 'diff --git') {
|
||||||
if (isset($match['old'])) {
|
list($old, $new) = self::splitGitDiffPaths($match['oldnew']);
|
||||||
$match['old'] = $this->unescapeFilename($match['old']);
|
$match['old'] = $old;
|
||||||
$match['old'] = self::stripGitPathPrefix($match['old']);
|
$match['cur'] = $new;
|
||||||
}
|
|
||||||
if (isset($match['cur'])) {
|
|
||||||
$match['cur'] = $this->unescapeFilename($match['cur']);
|
|
||||||
$match['cur'] = self::stripGitPathPrefix($match['cur']);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -598,12 +593,12 @@ final class ArcanistDiffParser {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!empty($match['old'])) {
|
if (!empty($match['old'])) {
|
||||||
$match['old'] = $this->unescapeFilename($match['old']);
|
$match['old'] = self::unescapeFilename($match['old']);
|
||||||
$change->setOldPath($match['old']);
|
$change->setOldPath($match['old']);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!empty($match['cur'])) {
|
if (!empty($match['cur'])) {
|
||||||
$match['cur'] = $this->unescapeFilename($match['cur']);
|
$match['cur'] = self::unescapeFilename($match['cur']);
|
||||||
$change->setCurrentPath($match['cur']);
|
$change->setCurrentPath($match['cur']);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1134,7 +1129,7 @@ final class ArcanistDiffParser {
|
||||||
/**
|
/**
|
||||||
* Unescape escaped filenames, e.g. from "git diff".
|
* Unescape escaped filenames, e.g. from "git diff".
|
||||||
*/
|
*/
|
||||||
private function unescapeFilename($name) {
|
private static function unescapeFilename($name) {
|
||||||
if (preg_match('/^".+"$/', $name)) {
|
if (preg_match('/^".+"$/', $name)) {
|
||||||
return stripcslashes(substr($name, 1, -1));
|
return stripcslashes(substr($name, 1, -1));
|
||||||
} else {
|
} else {
|
||||||
|
@ -1200,6 +1195,7 @@ final class ArcanistDiffParser {
|
||||||
$this->changes = $changes;
|
$this->changes = $changes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Strip prefixes off paths from `git diff`. By default git uses a/ and b/,
|
* Strip prefixes off paths from `git diff`. By default git uses a/ and b/,
|
||||||
* but you can set `diff.mnemonicprefix` to get a different set of prefixes,
|
* but you can set `diff.mnemonicprefix` to get a different set of prefixes,
|
||||||
|
@ -1239,4 +1235,59 @@ final class ArcanistDiffParser {
|
||||||
return preg_replace($regex, '', $path);
|
return preg_replace($regex, '', $path);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Split the paths on a "diff --git" line into old and new paths. This
|
||||||
|
* is difficult because they may be ambiguous if the files contain spaces.
|
||||||
|
*
|
||||||
|
* @param string Text from a diff line after "diff --git ".
|
||||||
|
* @return pair<string, string> Old and new paths.
|
||||||
|
*/
|
||||||
|
public static function splitGitDiffPaths($paths) {
|
||||||
|
$matches = null;
|
||||||
|
$paths = rtrim($paths, "\r\n");
|
||||||
|
|
||||||
|
$patterns = array(
|
||||||
|
// Try quoted paths, used for unicode filenames or filenames with quotes.
|
||||||
|
'@^(?P<old>"(?:\\\\.|[^"\\\\]+)+") (?P<new>"(?:\\\\.|[^"\\\\]+)+")$@',
|
||||||
|
|
||||||
|
// Try paths without spaces.
|
||||||
|
'@^(?P<old>[^ ]+) (?P<new>[^ ]+)$@',
|
||||||
|
|
||||||
|
// Try paths with well-known prefixes.
|
||||||
|
'@^(?P<old>[abicwo12]/.*) (?P<new>[abicwo12]/.*)$@',
|
||||||
|
|
||||||
|
// Try the exact same string twice in a row separated by a space.
|
||||||
|
// This can hit a false positive for moves from files like "old file old"
|
||||||
|
// to "file", but such a case combined with custom diff prefixes is
|
||||||
|
// incredibly obscure.
|
||||||
|
'@^(?P<old>.*) (?P<new>\\1)$@',
|
||||||
|
);
|
||||||
|
|
||||||
|
foreach ($patterns as $pattern) {
|
||||||
|
if (preg_match($pattern, $paths, $matches)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!$matches) {
|
||||||
|
throw new Exception(
|
||||||
|
"Input diff contains ambiguous line 'diff --git {$paths}'. This line ".
|
||||||
|
"is ambiguous because there are spaces in the file names, so the ".
|
||||||
|
"parser can not determine where the file names begin and end. To ".
|
||||||
|
"resolve this ambiguity, use standard prefixes ('a/' and 'b/') when ".
|
||||||
|
"generating diffs.");
|
||||||
|
}
|
||||||
|
|
||||||
|
$old = $matches['old'];
|
||||||
|
$old = self::unescapeFilename($old);
|
||||||
|
$old = self::stripGitPathPrefix($old);
|
||||||
|
|
||||||
|
$new = $matches['new'];
|
||||||
|
$new = self::unescapeFilename($new);
|
||||||
|
$new = self::stripGitPathPrefix($new);
|
||||||
|
|
||||||
|
return array($old, $new);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -580,4 +580,60 @@ EOTEXT
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public function testGitPathSplitting() {
|
||||||
|
static $tests = array(
|
||||||
|
"a/old.c b/new.c" => array('old.c', 'new.c'),
|
||||||
|
"a/old.c b/new.c\n" => array('old.c', 'new.c'),
|
||||||
|
"a/old.c b/new.c\r\n" => array('old.c', 'new.c'),
|
||||||
|
"old.c new.c" => array('old.c', 'new.c'),
|
||||||
|
"1/old.c 2/new.c" => array('old.c', 'new.c'),
|
||||||
|
'"a/\\"quotes1\\"" "b/\\"quotes2\\""' => array(
|
||||||
|
'"quotes1"',
|
||||||
|
'"quotes2"',
|
||||||
|
),
|
||||||
|
'"a/\\"quotes and spaces1\\"" "b/\\"quotes and spaces2\\""' => array(
|
||||||
|
'"quotes and spaces1"',
|
||||||
|
'"quotes and spaces2"',
|
||||||
|
),
|
||||||
|
'"a/\\342\\230\\2031" "b/\\342\\230\\2032"' => array(
|
||||||
|
"\xE2\x98\x831",
|
||||||
|
"\xE2\x98\x832",
|
||||||
|
),
|
||||||
|
"a/Core Data/old.c b/Core Data/new.c" => array(
|
||||||
|
'Core Data/old.c',
|
||||||
|
'Core Data/new.c',
|
||||||
|
),
|
||||||
|
"some file with spaces.c some file with spaces.c" => array(
|
||||||
|
'some file with spaces.c',
|
||||||
|
'some file with spaces.c',
|
||||||
|
),
|
||||||
|
);
|
||||||
|
|
||||||
|
foreach ($tests as $input => $expect) {
|
||||||
|
$result = ArcanistDiffParser::splitGitDiffPaths($input);
|
||||||
|
$this->assertEqual(
|
||||||
|
$expect,
|
||||||
|
$result,
|
||||||
|
"Split: {$input}");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static $ambiguous = array(
|
||||||
|
"old file with spaces.c new file with spaces.c",
|
||||||
|
);
|
||||||
|
|
||||||
|
foreach ($ambiguous as $input) {
|
||||||
|
$caught = null;
|
||||||
|
try {
|
||||||
|
ArcanistDiffParser::splitGitDiffPaths($input);
|
||||||
|
} catch (Exception $ex) {
|
||||||
|
$caught = $ex;
|
||||||
|
}
|
||||||
|
$this->assertEqual(
|
||||||
|
true,
|
||||||
|
($caught instanceof Exception),
|
||||||
|
"Ambiguous: {$input}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue