mirror of
https://we.phorge.it/source/phorge.git
synced 2025-01-18 02:31:10 +01:00
Clean up copy detection code a bit
Summary: Ref T1266. This doesn't change any behaviors, but some of this code has a lot of really complicated conditionals and I tried to break that up a bit. Also, reexpress this stuff in terms of the "structured" parser in D12144. Test Plan: Unit tests still pass. They aren't hugely comprehensive but did reliably fail when I screwed stuff up. Reviewers: btrahan Reviewed By: btrahan Subscribers: epriestley Maniphest Tasks: T1266 Differential Revision: https://secure.phabricator.com/D12145
This commit is contained in:
parent
74a4c2cf0b
commit
373aaa643a
2 changed files with 124 additions and 60 deletions
|
@ -100,7 +100,7 @@ return array(
|
|||
'rsrc/css/application/releeph/releeph-preview-branch.css' => 'b7a6f4a5',
|
||||
'rsrc/css/application/releeph/releeph-request-differential-create-dialog.css' => '8d8b92cd',
|
||||
'rsrc/css/application/releeph/releeph-request-typeahead.css' => '667a48ae',
|
||||
'rsrc/css/application/search/search-results.css' => '559cc554',
|
||||
'rsrc/css/application/search/search-results.css' => '15c71110',
|
||||
'rsrc/css/application/slowvote/slowvote.css' => '266df6a1',
|
||||
'rsrc/css/application/tokens/tokens.css' => '3d0f239e',
|
||||
'rsrc/css/application/uiexample/example.css' => '528b19de',
|
||||
|
@ -738,7 +738,7 @@ return array(
|
|||
'phabricator-prefab' => '72da38cc',
|
||||
'phabricator-profile-css' => '1a20dcbf',
|
||||
'phabricator-remarkup-css' => 'bc65f3cc',
|
||||
'phabricator-search-results-css' => '559cc554',
|
||||
'phabricator-search-results-css' => '15c71110',
|
||||
'phabricator-shaped-request' => '7cbe244b',
|
||||
'phabricator-side-menu-view-css' => '7e8c6341',
|
||||
'phabricator-slowvote-css' => '266df6a1',
|
||||
|
|
|
@ -1324,17 +1324,20 @@ final class DifferentialChangesetParser {
|
|||
foreach ($changesets as $changeset) {
|
||||
$file = $changeset->getFilename();
|
||||
foreach ($changeset->getHunks() as $hunk) {
|
||||
$line = $hunk->getOldOffset();
|
||||
foreach (explode("\n", $hunk->getChanges()) as $code) {
|
||||
$type = (isset($code[0]) ? $code[0] : '');
|
||||
if ($type == '-' || $type == ' ') {
|
||||
$code = trim(substr($code, 1));
|
||||
$files[$file][$line] = $code;
|
||||
$types[$file][$line] = $type;
|
||||
if (strlen($code) >= $min_width) {
|
||||
$map[$code][] = array($file, $line);
|
||||
}
|
||||
$line++;
|
||||
$lines = $hunk->getStructuredOldFile();
|
||||
foreach ($lines as $line => $info) {
|
||||
$type = $info['type'];
|
||||
if ($type == '\\') {
|
||||
continue;
|
||||
}
|
||||
$types[$file][$line] = $type;
|
||||
|
||||
$text = $info['text'];
|
||||
$text = trim($text);
|
||||
$files[$file][$line] = $text;
|
||||
|
||||
if (strlen($text) >= $min_width) {
|
||||
$map[$text][] = array($file, $line);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1343,57 +1346,118 @@ final class DifferentialChangesetParser {
|
|||
foreach ($changesets as $changeset) {
|
||||
$copies = array();
|
||||
foreach ($changeset->getHunks() as $hunk) {
|
||||
$added = array_map('trim', $hunk->getAddedLines());
|
||||
for (reset($added); list($line, $code) = each($added); ) {
|
||||
if (isset($map[$code])) { // We found a long matching line.
|
||||
$added = $hunk->getStructuredNewFile();
|
||||
|
||||
if (count($map[$code]) > 16) {
|
||||
// If there are a large number of identical lines in this diff,
|
||||
// don't try to figure out where this block came from: the
|
||||
// analysis is O(N^2), since we need to compare every line
|
||||
// against every other line. Even if we arrive at a result, it
|
||||
// is unlikely to be meaningful. See T5041.
|
||||
continue 2;
|
||||
}
|
||||
|
||||
$best_length = 0;
|
||||
foreach ($map[$code] as $val) { // Explore all candidates.
|
||||
list($file, $orig_line) = $val;
|
||||
$length = 1;
|
||||
// Search also backwards for short lines.
|
||||
foreach (array(-1, 1) as $direction) {
|
||||
$offset = $direction;
|
||||
while (!isset($copies[$line + $offset]) &&
|
||||
isset($added[$line + $offset]) &&
|
||||
idx($files[$file], $orig_line + $offset) ===
|
||||
$added[$line + $offset]) {
|
||||
$length++;
|
||||
$offset += $direction;
|
||||
}
|
||||
}
|
||||
if ($length > $best_length ||
|
||||
($length == $best_length && // Prefer moves.
|
||||
idx($types[$file], $orig_line) == '-')) {
|
||||
$best_length = $length;
|
||||
// ($offset - 1) contains number of forward matching lines.
|
||||
$best_offset = $offset - 1;
|
||||
$best_file = $file;
|
||||
$best_line = $orig_line;
|
||||
}
|
||||
}
|
||||
$file = ($best_file == $changeset->getFilename() ? '' : $best_file);
|
||||
for ($i = $best_length; $i--; ) {
|
||||
$type = idx($types[$best_file], $best_line + $best_offset - $i);
|
||||
$copies[$line + $best_offset - $i] = ($best_length < $min_lines
|
||||
? array() // Ignore short blocks.
|
||||
: array($file, $best_line + $best_offset - $i, $type));
|
||||
}
|
||||
for ($i = 0; $i < $best_offset; $i++) {
|
||||
next($added);
|
||||
}
|
||||
foreach ($added as $line => $info) {
|
||||
if ($info['type'] != '+') {
|
||||
unset($added[$line]);
|
||||
continue;
|
||||
}
|
||||
$added[$line] = trim($info['text']);
|
||||
}
|
||||
|
||||
$skip_lines = 0;
|
||||
foreach ($added as $line => $code) {
|
||||
if ($skip_lines) {
|
||||
// We're skipping lines that we already processed because we
|
||||
// extended a block above them downward to include them.
|
||||
$skip_lines--;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (empty($map[$code])) {
|
||||
// This line was too short to trigger copy/move detection.
|
||||
continue;
|
||||
}
|
||||
|
||||
if (count($map[$code]) > 16) {
|
||||
// If there are a large number of identical lines in this diff,
|
||||
// don't try to figure out where this block came from: the analysis
|
||||
// is O(N^2), since we need to compare every line against every
|
||||
// other line. Even if we arrive at a result, it is unlikely to be
|
||||
// meaningful. See T5041.
|
||||
continue;
|
||||
}
|
||||
|
||||
$best_length = 0;
|
||||
|
||||
// Explore all candidates.
|
||||
foreach ($map[$code] as $val) {
|
||||
list($file, $orig_line) = $val;
|
||||
$length = 1;
|
||||
|
||||
// Search backward and forward to find all of the adjacent lines
|
||||
// which match.
|
||||
foreach (array(-1, 1) as $direction) {
|
||||
$offset = $direction;
|
||||
while (true) {
|
||||
if (isset($copies[$line + $offset])) {
|
||||
// If we run into a block above us which we've already
|
||||
// attributed to a move or copy from elsewhere, stop
|
||||
// looking.
|
||||
break;
|
||||
}
|
||||
|
||||
if (!isset($added[$line + $offset])) {
|
||||
// If we've run off the beginning or end of the new file,
|
||||
// stop looking.
|
||||
break;
|
||||
}
|
||||
|
||||
if (!isset($files[$file][$orig_line + $offset])) {
|
||||
// If we've run off the beginning or end of the original
|
||||
// file, we also stop looking.
|
||||
break;
|
||||
}
|
||||
|
||||
$old = $files[$file][$orig_line + $offset];
|
||||
$new = $added[$line + $offset];
|
||||
if ($old !== $new) {
|
||||
// If the old line doesn't match the new line, stop
|
||||
// looking.
|
||||
break;
|
||||
}
|
||||
|
||||
$length++;
|
||||
$offset += $direction;
|
||||
}
|
||||
}
|
||||
|
||||
if ($length < $best_length) {
|
||||
// If we already know of a better source (more matching lines)
|
||||
// for this move/copy, stick with that one. We prefer long
|
||||
// copies/moves which match a lot of context over short ones.
|
||||
continue;
|
||||
}
|
||||
|
||||
if ($length == $best_length) {
|
||||
if (idx($types[$file], $orig_line) != '-') {
|
||||
// If we already know of an equally good source (same number
|
||||
// of matching lines) and this isn't a move, stick with the
|
||||
// other one. We prefer moves over copies.
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
$best_length = $length;
|
||||
// ($offset - 1) contains number of forward matching lines.
|
||||
$best_offset = $offset - 1;
|
||||
$best_file = $file;
|
||||
$best_line = $orig_line;
|
||||
}
|
||||
|
||||
$file = ($best_file == $changeset->getFilename() ? '' : $best_file);
|
||||
for ($i = $best_length; $i--; ) {
|
||||
$type = idx($types[$best_file], $best_line + $best_offset - $i);
|
||||
$copies[$line + $best_offset - $i] = ($best_length < $min_lines
|
||||
? array() // Ignore short blocks.
|
||||
: array($file, $best_line + $best_offset - $i, $type));
|
||||
}
|
||||
|
||||
$skip_lines = $best_offset;
|
||||
}
|
||||
}
|
||||
|
||||
$copies = array_filter($copies);
|
||||
if ($copies) {
|
||||
$metadata = $changeset->getMetadata();
|
||||
|
|
Loading…
Reference in a new issue