From 373aaa643a5188f5261664d73370e017f99ffda3 Mon Sep 17 00:00:00 2001 From: epriestley Date: Tue, 24 Mar 2015 13:12:09 -0700 Subject: [PATCH] Clean up copy detection code a bit Summary: Ref T1266. This doesn't change any behaviors, but some of this code has a lot of really complicated conditionals and I tried to break that up a bit. Also, reexpress this stuff in terms of the "structured" parser in D12144. Test Plan: Unit tests still pass. They aren't hugely comprehensive but did reliably fail when I screwed stuff up. Reviewers: btrahan Reviewed By: btrahan Subscribers: epriestley Maniphest Tasks: T1266 Differential Revision: https://secure.phabricator.com/D12145 --- resources/celerity/map.php | 4 +- .../parser/DifferentialChangesetParser.php | 180 ++++++++++++------ 2 files changed, 124 insertions(+), 60 deletions(-) diff --git a/resources/celerity/map.php b/resources/celerity/map.php index 7c7c81c9f1..cece3207b0 100644 --- a/resources/celerity/map.php +++ b/resources/celerity/map.php @@ -100,7 +100,7 @@ return array( 'rsrc/css/application/releeph/releeph-preview-branch.css' => 'b7a6f4a5', 'rsrc/css/application/releeph/releeph-request-differential-create-dialog.css' => '8d8b92cd', 'rsrc/css/application/releeph/releeph-request-typeahead.css' => '667a48ae', - 'rsrc/css/application/search/search-results.css' => '559cc554', + 'rsrc/css/application/search/search-results.css' => '15c71110', 'rsrc/css/application/slowvote/slowvote.css' => '266df6a1', 'rsrc/css/application/tokens/tokens.css' => '3d0f239e', 'rsrc/css/application/uiexample/example.css' => '528b19de', @@ -738,7 +738,7 @@ return array( 'phabricator-prefab' => '72da38cc', 'phabricator-profile-css' => '1a20dcbf', 'phabricator-remarkup-css' => 'bc65f3cc', - 'phabricator-search-results-css' => '559cc554', + 'phabricator-search-results-css' => '15c71110', 'phabricator-shaped-request' => '7cbe244b', 'phabricator-side-menu-view-css' => '7e8c6341', 'phabricator-slowvote-css' => '266df6a1', diff --git a/src/applications/differential/parser/DifferentialChangesetParser.php b/src/applications/differential/parser/DifferentialChangesetParser.php index f985a8657c..0dc11af150 100644 --- a/src/applications/differential/parser/DifferentialChangesetParser.php +++ b/src/applications/differential/parser/DifferentialChangesetParser.php @@ -1324,17 +1324,20 @@ final class DifferentialChangesetParser { foreach ($changesets as $changeset) { $file = $changeset->getFilename(); foreach ($changeset->getHunks() as $hunk) { - $line = $hunk->getOldOffset(); - foreach (explode("\n", $hunk->getChanges()) as $code) { - $type = (isset($code[0]) ? $code[0] : ''); - if ($type == '-' || $type == ' ') { - $code = trim(substr($code, 1)); - $files[$file][$line] = $code; - $types[$file][$line] = $type; - if (strlen($code) >= $min_width) { - $map[$code][] = array($file, $line); - } - $line++; + $lines = $hunk->getStructuredOldFile(); + foreach ($lines as $line => $info) { + $type = $info['type']; + if ($type == '\\') { + continue; + } + $types[$file][$line] = $type; + + $text = $info['text']; + $text = trim($text); + $files[$file][$line] = $text; + + if (strlen($text) >= $min_width) { + $map[$text][] = array($file, $line); } } } @@ -1343,57 +1346,118 @@ final class DifferentialChangesetParser { foreach ($changesets as $changeset) { $copies = array(); foreach ($changeset->getHunks() as $hunk) { - $added = array_map('trim', $hunk->getAddedLines()); - for (reset($added); list($line, $code) = each($added); ) { - if (isset($map[$code])) { // We found a long matching line. + $added = $hunk->getStructuredNewFile(); - if (count($map[$code]) > 16) { - // If there are a large number of identical lines in this diff, - // don't try to figure out where this block came from: the - // analysis is O(N^2), since we need to compare every line - // against every other line. Even if we arrive at a result, it - // is unlikely to be meaningful. See T5041. - continue 2; - } - - $best_length = 0; - foreach ($map[$code] as $val) { // Explore all candidates. - list($file, $orig_line) = $val; - $length = 1; - // Search also backwards for short lines. - foreach (array(-1, 1) as $direction) { - $offset = $direction; - while (!isset($copies[$line + $offset]) && - isset($added[$line + $offset]) && - idx($files[$file], $orig_line + $offset) === - $added[$line + $offset]) { - $length++; - $offset += $direction; - } - } - if ($length > $best_length || - ($length == $best_length && // Prefer moves. - idx($types[$file], $orig_line) == '-')) { - $best_length = $length; - // ($offset - 1) contains number of forward matching lines. - $best_offset = $offset - 1; - $best_file = $file; - $best_line = $orig_line; - } - } - $file = ($best_file == $changeset->getFilename() ? '' : $best_file); - for ($i = $best_length; $i--; ) { - $type = idx($types[$best_file], $best_line + $best_offset - $i); - $copies[$line + $best_offset - $i] = ($best_length < $min_lines - ? array() // Ignore short blocks. - : array($file, $best_line + $best_offset - $i, $type)); - } - for ($i = 0; $i < $best_offset; $i++) { - next($added); - } + foreach ($added as $line => $info) { + if ($info['type'] != '+') { + unset($added[$line]); + continue; } + $added[$line] = trim($info['text']); + } + + $skip_lines = 0; + foreach ($added as $line => $code) { + if ($skip_lines) { + // We're skipping lines that we already processed because we + // extended a block above them downward to include them. + $skip_lines--; + continue; + } + + if (empty($map[$code])) { + // This line was too short to trigger copy/move detection. + continue; + } + + if (count($map[$code]) > 16) { + // If there are a large number of identical lines in this diff, + // don't try to figure out where this block came from: the analysis + // is O(N^2), since we need to compare every line against every + // other line. Even if we arrive at a result, it is unlikely to be + // meaningful. See T5041. + continue; + } + + $best_length = 0; + + // Explore all candidates. + foreach ($map[$code] as $val) { + list($file, $orig_line) = $val; + $length = 1; + + // Search backward and forward to find all of the adjacent lines + // which match. + foreach (array(-1, 1) as $direction) { + $offset = $direction; + while (true) { + if (isset($copies[$line + $offset])) { + // If we run into a block above us which we've already + // attributed to a move or copy from elsewhere, stop + // looking. + break; + } + + if (!isset($added[$line + $offset])) { + // If we've run off the beginning or end of the new file, + // stop looking. + break; + } + + if (!isset($files[$file][$orig_line + $offset])) { + // If we've run off the beginning or end of the original + // file, we also stop looking. + break; + } + + $old = $files[$file][$orig_line + $offset]; + $new = $added[$line + $offset]; + if ($old !== $new) { + // If the old line doesn't match the new line, stop + // looking. + break; + } + + $length++; + $offset += $direction; + } + } + + if ($length < $best_length) { + // If we already know of a better source (more matching lines) + // for this move/copy, stick with that one. We prefer long + // copies/moves which match a lot of context over short ones. + continue; + } + + if ($length == $best_length) { + if (idx($types[$file], $orig_line) != '-') { + // If we already know of an equally good source (same number + // of matching lines) and this isn't a move, stick with the + // other one. We prefer moves over copies. + continue; + } + } + + $best_length = $length; + // ($offset - 1) contains number of forward matching lines. + $best_offset = $offset - 1; + $best_file = $file; + $best_line = $orig_line; + } + + $file = ($best_file == $changeset->getFilename() ? '' : $best_file); + for ($i = $best_length; $i--; ) { + $type = idx($types[$best_file], $best_line + $best_offset - $i); + $copies[$line + $best_offset - $i] = ($best_length < $min_lines + ? array() // Ignore short blocks. + : array($file, $best_line + $best_offset - $i, $type)); + } + + $skip_lines = $best_offset; } } + $copies = array_filter($copies); if ($copies) { $metadata = $changeset->getMetadata();