api = $api; return $this; } protected function getRepositoryAPI() { return $this->api; } public function setDetectBinaryFiles($detect) { $this->detectBinaryFiles = $detect; return $this; } public function forcePath($path) { $this->forcePath = $path; return $this; } public function setChanges(array $changes) { $this->changes = mpull($changes, null, 'getCurrentPath'); return $this; } public function parseSubversionDiff(ArcanistSubversionAPI $api, $paths) { $this->setRepositoryAPI($api); $diffs = array(); foreach ($paths as $path => $status) { if ($status & ArcanistRepositoryAPI::FLAG_UNTRACKED || $status & ArcanistRepositoryAPI::FLAG_CONFLICT || $status & ArcanistRepositoryAPI::FLAG_MISSING) { unset($paths[$path]); } } $root = null; $from = array(); foreach ($paths as $path => $status) { $change = $this->buildChange($path); if ($status & ArcanistRepositoryAPI::FLAG_ADDED) { $change->setType(ArcanistDiffChangeType::TYPE_ADD); } else if ($status & ArcanistRepositoryAPI::FLAG_DELETED) { $change->setType(ArcanistDiffChangeType::TYPE_DELETE); } else { $change->setType(ArcanistDiffChangeType::TYPE_CHANGE); } $is_dir = is_dir($api->getPath($path)); if ($is_dir) { $change->setFileType(ArcanistDiffChangeType::FILE_DIRECTORY); // We have to go hit the diff even for directories because they may // have property changes or moves, etc. } $is_link = is_link($api->getPath($path)); if ($is_link) { $change->setFileType(ArcanistDiffChangeType::FILE_SYMLINK); } $diff = $api->getRawDiffText($path); if ($diff) { $this->parseDiff($diff); } $info = $api->getSVNInfo($path); if (idx($info, 'Copied From URL')) { if (!$root) { $rinfo = $api->getSVNInfo('.'); $root = $rinfo['URL'].'/'; } $cpath = $info['Copied From URL']; $cpath = substr($cpath, strlen($root)); if ($info['Copied From Rev']) { // The user can "svn cp /path/to/file@12345 x", which pulls a file out // of version history at a specific revision. If we just use the path, // we'll collide with possible changes to that path in the working // copy below. In particular, "svn cp"-ing a path which no longer // exists somewhere in the working copy and then adding that path // gets us to the "origin change type" branches below with a // TYPE_ADD state on the path. To avoid this, append the origin // revision to the path so we'll necessarily generate a new change. // TODO: In theory, you could have an '@' in your path and this could // cause a collision, e.g. two files named 'f' and 'f@12345'. This is // at least somewhat the user's fault, though. if ($info['Copied From Rev'] != $info['Revision']) { $cpath .= '@'.$info['Copied From Rev']; } } $change->setOldPath($cpath); $from[$path] = $cpath; } } foreach ($paths as $path => $status) { $change = $this->buildChange($path); if (empty($from[$path])) { continue; } if (empty($this->changes[$from[$path]])) { if ($change->getType() == ArcanistDiffChangeType::TYPE_COPY_HERE) { // If the origin path wasn't changed (or isn't included in this diff) // and we only copied it, don't generate a changeset for it. This // keeps us out of trouble when we go to 'arc commit' and need to // figure out which files should be included in the commit list. continue; } } $origin = $this->buildChange($from[$path]); $origin->addAwayPath($change->getCurrentPath()); $type = $origin->getType(); switch ($type) { case ArcanistDiffChangeType::TYPE_MULTICOPY: case ArcanistDiffChangeType::TYPE_COPY_AWAY: // "Add" is possible if you do some bizarre tricks with svn:ignore and // "svn copy"'ing URLs straight from the repository; you can end up with // a file that is a copy of itself. See T271. case ArcanistDiffChangeType::TYPE_ADD: break; case ArcanistDiffChangeType::TYPE_DELETE: $origin->setType(ArcanistDiffChangeType::TYPE_MOVE_AWAY); break; case ArcanistDiffChangeType::TYPE_MOVE_AWAY: $origin->setType(ArcanistDiffChangeType::TYPE_MULTICOPY); break; case ArcanistDiffChangeType::TYPE_CHANGE: $origin->setType(ArcanistDiffChangeType::TYPE_COPY_AWAY); break; default: throw new Exception("Bad origin state {$type}."); } $type = $origin->getType(); switch ($type) { case ArcanistDiffChangeType::TYPE_MULTICOPY: case ArcanistDiffChangeType::TYPE_MOVE_AWAY: $change->setType(ArcanistDiffChangeType::TYPE_MOVE_HERE); break; case ArcanistDiffChangeType::TYPE_ADD: case ArcanistDiffChangeType::TYPE_COPY_AWAY: $change->setType(ArcanistDiffChangeType::TYPE_COPY_HERE); break; default: throw new Exception("Bad origin state {$type}."); } } return $this->changes; } public function parseDiff($diff) { $this->didStartParse($diff); if ($this->getLine() === null) { $this->didFailParse("Can't parse an empty diff!"); } do { $patterns = array( // This is a normal SVN text change, probably from "svn diff". '(?PIndex): (?P.+)', // This is an SVN property change, probably from "svn diff". '(?PProperty changes on): (?P.+)', // This is a git commit message, probably from "git show". '(?Pcommit) (?P[a-f0-9]+)', // This is a git diff, probably from "git show" or "git diff". '(?Pdiff --git) a/(?P.+) b/(?P.+)', // This is a unified diff, probably from "diff -u" or synthetic diffing. '(?P---) (?P.+)\s+\d{4}-\d{2}-\d{2}.*', '(?PBinary) files '. '(?P.+)\s+\d{4}-\d{2}-\d{2} and '. '(?P.+)\s+\d{4}-\d{2}-\d{2} differ.*', ); $ok = false; $line = $this->getLine(); $match = null; foreach ($patterns as $pattern) { $ok = preg_match('@^'.$pattern.'$@', $line, $match); if ($ok) { break; } } if (!$ok) { $this->didFailParse( "Expected a hunk header, like 'Index: /path/to/file.ext' (svn), ". "'Property changes on: /path/to/file.ext' (svn properties), ". "'commit 59bcc3ad6775562f845953cf01624225' (git show), ". "'diff --git' (git diff), or '--- filename' (unified diff)."); } $change = $this->buildChange(idx($match, 'cur')); if (isset($match['old'])) { $change->setOldPath($match['old']); } if (isset($match['hash'])) { $change->setCommitHash($match['hash']); } if (isset($match['binary'])) { $change->setFileType(ArcanistDiffChangeType::FILE_BINARY); $line = $this->nextNonemptyLine(); continue; } $line = $this->nextLine(); switch ($match['type']) { case 'Index': $this->parseIndexHunk($change); break; case 'Property changes on': $this->parsePropertyHunk($change); break; case 'diff --git': $this->setIsGit(true); $this->parseIndexHunk($change); break; case 'commit': $this->setIsGit(true); $this->parseCommitMessage($change); break; case '---': $ok = preg_match( '@^(?:\+\+\+) (.*)\s+\d{4}-\d{2}-\d{2}.*$@', $line, $match); if (!$ok) { $this->didFailParse("Expected '+++ filename' in unified diff."); } $change->setCurrentPath($match[1]); $line = $this->nextLine(); $this->parseChangeset($change); break; default: $this->didFailParse("Unknown diff type."); } } while ($this->getLine() !== null); $this->didFinishParse(); return $this->changes; } protected function parseCommitMessage(ArcanistDiffChange $change) { $change->setType(ArcanistDiffChangeType::TYPE_MESSAGE); $message = array(); $line = $this->getLine(); if (preg_match('/^Merge: /', $line)) { $this->nextLine(); } $line = $this->getLine(); if (!preg_match('/^Author: /', $line)) { $this->didFailParse("Expected 'Author:'."); } $line = $this->nextLine(); if (!preg_match('/^Date: /', $line)) { $this->didFailParse("Expected 'Date:'."); } while (($line = $this->nextLine()) !== null) { if (strlen($line) && $line[0] != ' ') { break; } // Strip leading spaces from Git commit messages. $message[] = substr($line, 4); } $message = rtrim(implode("\n", $message)); $change->setMetadata('message', $message); } /** * Parse an SVN property change hunk. These hunks are ambiguous so just sort * of try to get it mostly right. It's entirely possible to foil this parser * (or any other parser) with a carefully constructed property change. */ protected function parsePropertyHunk(ArcanistDiffChange $change) { $line = $this->getLine(); if (!preg_match('/^_+$/', $line)) { $this->didFailParse("Expected '______________________'."); } $line = $this->nextLine(); while ($line !== null) { $done = preg_match('/^(Index|Property changes on):/', $line); if ($done) { break; } $matches = null; $ok = preg_match('/^(Modified|Added|Deleted): (.*)$/', $line, $matches); if (!$ok) { $this->didFailParse("Expected 'Added', 'Deleted', or 'Modified'."); } $op = $matches[1]; $prop = $matches[2]; list($old, $new) = $this->parseSVNPropertyChange($op, $prop); if ($old !== null) { $change->setOldProperty($prop, $old); } if ($new !== null) { $change->setNewProperty($prop, $new); } $line = $this->getLine(); } } private function parseSVNPropertyChange($op, $prop) { $old = array(); $new = array(); $target = null; $line = $this->nextLine(); while ($line !== null) { $done = preg_match( '/^(Modified|Added|Deleted|Index|Property changes on):/', $line); if ($done) { break; } $trimline = ltrim($line); if ($trimline && $trimline[0] == '+') { if ($op == 'Deleted') { $this->didFailParse('Unexpected "+" section in property deletion.'); } $target = 'new'; $line = substr($trimline, 2); } else if ($trimline && $trimline[0] == '-') { if ($op == 'Added') { $this->didFailParse('Unexpected "-" section in property addition.'); } $target = 'old'; $line = substr($trimline, 2); } else if (!strncmp($trimline, 'Merged', 6)) { if ($op == 'Added') { $target = 'new'; } else { // These can appear on merges. No idea how to interpret this (unclear // what the old / new values are) and it's of dubious usefulness so // just throw it away until someone complains. $target = null; } $line = $trimline; } if ($target == 'new') { $new[] = $line; } else if ($target == 'old') { $old[] = $line; } $line = $this->nextLine(); } $old = rtrim(implode("\n", $old)); $new = rtrim(implode("\n", $new)); if (!strlen($old)) { $old = null; } if (!strlen($new)) { $new = null; } return array($old, $new); } protected function setIsGit($git) { if ($this->isGit !== null && $this->isGit != $git) { throw new Exception("Git status has changed!"); } $this->isGit = $git; return $this; } protected function getIsGit() { return $this->isGit; } protected function parseIndexHunk(ArcanistDiffChange $change) { $is_git = $this->getIsGit(); $line = $this->getLine(); if ($is_git) { do { $patterns = array( '(?Pnew) file mode (?P\d+)', '(?Pdeleted) file mode (?P\d+)', // These occur when someone uses `chmod` on a file. 'old mode (?P\d+)', 'new mode (?P\d+)', // These occur when you `mv` a file and git figures it out. 'similarity index ', 'rename from (?P.*)', '(?Prename) to (?P.*)', 'copy from (?P.*)', '(?Pcopy) to (?P.*)' ); $ok = false; $match = null; foreach ($patterns as $pattern) { $ok = preg_match('@^'.$pattern.'@', $line, $match); if ($ok) { break; } } if (!$ok) { if ($line === null || preg_match('/^(diff --git|commit) /', $line)) { // In this case, there are ONLY file mode changes, or this is a // pure move. return; } break; } if (!empty($match['oldmode'])) { $change->setOldProperty('unix:filemode', $match['oldmode']); } if (!empty($match['newmode'])) { $change->setNewProperty('unix:filemode', $match['newmode']); } if (!empty($match['deleted'])) { $change->setType(ArcanistDiffChangeType::TYPE_DELETE); } if (!empty($match['new'])) { $change->setType(ArcanistDiffChangeType::TYPE_ADD); } if (!empty($match['old'])) { $change->setOldPath($match['old']); } if (!empty($match['cur'])) { $change->setCurrentPath($match['cur']); } if (!empty($match['copy'])) { $change->setType(ArcanistDiffChangeType::TYPE_COPY_HERE); $old = $this->buildChange($change->getOldPath()); $type = $old->getType(); if ($type == ArcanistDiffChangeType::TYPE_MOVE_AWAY) { $old->setType(ArcanistDiffChangeType::TYPE_MULTICOPY); } else { $old->setType(ArcanistDiffChangeType::TYPE_COPY_AWAY); } $old->addAwayPath($change->getCurrentPath()); } if (!empty($match['move'])) { $change->setType(ArcanistDiffChangeType::TYPE_MOVE_HERE); $old = $this->buildChange($change->getOldPath()); $type = $old->getType(); if ($type == ArcanistDiffChangeType::TYPE_MULTICOPY) { // Great, no change. } else if ($type == ArcanistDiffChangeType::TYPE_MOVE_AWAY) { $old->setType(ArcanistDiffChangeType::TYPE_MULTICOPY); } else if ($type == ArcanistDiffChangeType::TYPE_COPY_AWAY) { $old->setType(ArcanistDiffChangeType::TYPE_MULTICOPY); } else { $old->setType(ArcanistDiffChangeType::TYPE_MOVE_AWAY); } $old->addAwayPath($change->getCurrentPath()); } $line = $this->nextNonemptyLine(); } while (true); } $line = $this->getLine(); $ok = preg_match('/^=+$/', $line) || ($is_git && preg_match('/^index .*$/', $line)); if (!$ok) { if ($is_git) { $this->didFailParse( "Expected 'index af23f...a98bc' header line."); } else { $this->didFailParse( "Expected '==========================' divider line."); } } // Adding an empty file in SVN can produce an empty line here. $line = $this->nextNonemptyLine(); // If there are files with only whitespace changes and -b or -w are // supplied as command-line flags to `diff', svn and git both produce // changes without any body. if ($line === null || preg_match( '/^(Index:|Property changes on:|diff --git|commit) /', $line)) { return; } $is_binary_add = preg_match( '/^Cannot display: file marked as a binary type.$/', $line); if ($is_binary_add) { $this->nextLine(); // Cannot display: file marked as a binary type. $this->nextNonemptyLine(); // svn:mime-type = application/octet-stream $this->markBinary($change); return; } // We can get this in git, or in SVN when a file exists in the repository // WITHOUT a binary mime-type and is changed and given a binary mime-type. $is_binary_diff = preg_match( '/^Binary files .* and .* differ$/', $line); if ($is_binary_diff) { $this->nextNonemptyLine(); // Binary files x and y differ $this->markBinary($change); return; } if ($is_git) { // "git diff -b" ignores whitespace, but has an empty hunk target if (preg_match('@^diff --git a/.*$@', $line)) { $this->nextLine(); return null; } } $old_file = $this->parseHunkTarget(); $new_file = $this->parseHunkTarget(); $change->setOldPath($old_file); $this->parseChangeset($change); } protected function parseHunkTarget() { $line = $this->getLine(); $matches = null; $ok = preg_match( '@^[-+]{3} (?:[ab]/)?(?P.*?)(?:\s*\(.*\))?$@', $line, $matches); if (!$ok) { $this->didFailParse( "Expected hunk target '+++ path/to/file.ext (revision N)'."); } $this->nextLine(); return $matches['path']; } protected function markBinary(ArcanistDiffChange $change) { $change->setFileType(ArcanistDiffChangeType::FILE_BINARY); return $this; } protected function parseChangeset(ArcanistDiffChange $change) { $all_changes = array(); do { $hunk = new ArcanistDiffHunk(); $line = $this->getLine(); $real = array(); // In the case where only one line is changed, the length is omitted. // The final group is for git, which appends a guess at the function // context to the diff. $matches = null; $ok = preg_match( '/^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(?: .*?)?$/U', $line, $matches); if (!$ok) { $this->didFailParse("Expected hunk header '@@ -NN,NN +NN,NN @@'."); } $hunk->setOldOffset($matches[1]); $hunk->setNewOffset($matches[3]); // Cover for the cases where length wasn't present (implying one line). $old_len = idx($matches, 2); if (!strlen($old_len)) { $old_len = 1; } $new_len = idx($matches, 4); if (!strlen($new_len)) { $new_len = 1; } $hunk->setOldLength($old_len); $hunk->setNewLength($new_len); $add = 0; $del = 0; $advance = false; while ((($line = $this->nextLine()) !== null)) { if (strlen($line)) { $char = $line[0]; } else { $char = '~'; } switch ($char) { case '\\': if (!preg_match('@\\ No newline at end of file@', $line)) { $this->didFailParse( "Expected '\ No newline at end of file'."); } if ($new_len) { $hunk->setIsMissingOldNewline(true); } else { $hunk->setIsMissingNewNewline(true); } if (!$new_len) { $advance = true; break 2; } break; case '+': if (!$new_len) { break 2; } ++$add; --$new_len; $real[] = $line; break; case '-': if (!$old_len) { break 2; } ++$del; --$old_len; $real[] = $line; break; case ' ': if (!$old_len && !$new_len) { break 2; } --$old_len; --$new_len; $real[] = $line; break; case '~': $advance = true; break 2; default: break 2; } } if ($old_len != 0 || $new_len != 0) { $this->didFailParse("Found the wrong number of hunk lines."); } $corpus = implode("\n", $real); $is_binary = false; if ($this->detectBinaryFiles) { $is_binary = preg_match('/([^\x09\x0A\x20-\x7E]+)/', $corpus); } if ($is_binary) { // SVN happily treats binary files which aren't marked with the right // mime type as text files. Detect that junk here and mark the file // binary. We'll catch stuff with unicode too, but that's verboten // anyway. If there are too many false positives with this we might // need to make it threshold-triggered instead of triggering on any // unprintable byte. $change->setFileType(ArcanistDiffChangeType::FILE_BINARY); } else { $hunk->setCorpus($corpus); $hunk->setAddLines($add); $hunk->setDelLines($del); $change->addHunk($hunk); } if ($advance) { $line = $this->nextNonemptyLine(); } } while (preg_match('/^@@ /', $line)); } protected function buildChange($path = null) { $change = null; if ($path !== null) { if (!empty($this->changes[$path])) { return $this->changes[$path]; } } if ($this->forcePath) { return $this->changes[$this->forcePath]; } $change = new ArcanistDiffChange(); if ($path !== null) { $change->setCurrentPath($path); $this->changes[$path] = $change; } else { $this->changes[] = $change; } return $change; } protected function didStartParse($text) { // TODO: Removed an fb_utf8ize() call here. -epriestley // Eat leading whitespace. This may happen if the first change in the diff // is an SVN property change. $text = ltrim($text); $this->text = explode("\n", $text); $this->line = 0; } protected function getLine() { if ($this->text === null) { throw new Exception("Not parsing!"); } if (isset($this->text[$this->line])) { return $this->text[$this->line]; } return null; } protected function nextLine() { $this->line++; return $this->getLine(); } protected function nextNonemptyLine() { while (($line = $this->nextLine()) !== null) { if (strlen(trim($line)) !== 0) { break; } } return $this->getLine(); } protected function didFinishParse() { $this->text = null; } protected function didFailParse($message) { $min = max(0, $this->line - 3); $max = min($this->line + 3, count($this->text) - 1); $context = ''; for ($ii = $min; $ii <= $max; $ii++) { $context .= sprintf( "%8.8s %s\n", ($ii == $this->line) ? '>>> ' : '', $this->text[$ii]); } $message = "Parse Exception: {$message}\n\n{$context}\n"; throw new Exception($message); } }