1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2025-01-10 23:01:04 +01:00

Detect copied code using a custom algorithm

Summary:
Required for D2321.
Deprecates D2320.
Uses algorithm described at D2320#16.

The complexity of this algorithm is `O(N)` (where `N` is the number of lines) in most cases.
The worst case is `O(A*F)` (where `A` is the number of added lines and `F` is the number of colliding lines), but it should be pretty rare. A real-world example is 100 modified files with a moved license block (15 lines) in each. This requires 1500*100 comparisons because the algorithm tries to find the longest block in each file.

Test Plan:
`arc diff --only` on commit with copied code.
More tests on standalone algorithm.

Reviewers: epriestley

Reviewed By: epriestley

CC: aran, Koolvin

Differential Revision: https://secure.phabricator.com/D2333
This commit is contained in:
vrana 2012-04-27 23:00:30 -07:00
parent 6a9ef778fc
commit 7affae9345
2 changed files with 83 additions and 0 deletions

View file

@ -151,9 +151,75 @@ final class DifferentialDiff extends DifferentialDAO {
}
$diff->setLineCount($lines);
$diff->detectCopiedCode();
return $diff;
}
/**
 * Find added lines that duplicate old code elsewhere in this diff and store
 * the result in each changeset's metadata under the 'copy:lines' key.
 *
 * The old side of every hunk is indexed by line text. Each added line that is
 * at least $min_width bytes long and appears in that index seeds a match; the
 * match is then extended backwards and forwards over neighbouring added lines
 * (of any length) for as long as they equal the neighbouring old lines. The
 * longest candidate wins; on a tie the candidate indexed first is kept.
 *
 * 'copy:lines' maps a new line number to array($file, $old_line), where $file
 * is '' when the source is the changeset's own file. Blocks shorter than
 * $min_lines are remembered while scanning (so they are not matched twice)
 * but are filtered out of the stored metadata.
 *
 * @param int $min_width Minimum strlen() of a line for it to seed a match.
 * @param int $min_lines Minimum number of consecutive matching lines for a
 *                       block to be recorded.
 * @return void
 */
private function detectCopiedCode($min_width = 40, $min_lines = 3) {
  // $files: filename => old line number => old line text.
  // $map:   line text => list of array(filename, old line number); only
  //         lines of at least $min_width bytes are indexed.
  $map = array();
  $files = array();
  foreach ($this->changesets as $changeset) {
    $file = $changeset->getFilename();
    foreach ($changeset->getHunks() as $hunk) {
      $line = $hunk->getOldOffset();
      foreach (explode("\n", $hunk->makeOldFile()) as $code) {
        $files[$file][$line] = $code;
        if (strlen($code) >= $min_width) {
          $map[$code][] = array($file, $line);
        }
        $line++;
      }
    }
  }

  foreach ($this->changesets as $changeset) {
    $copies = array();
    foreach ($changeset->getHunks() as $hunk) {
      $added = $hunk->getAddedLines();

      // Walk the added lines by position instead of each()/next(): the old
      // loop header advanced the array pointer twice per iteration (skipping
      // every other line), and each() no longer exists in PHP 8.
      $added_numbers = array_keys($added);
      $added_count = count($added_numbers);
      for ($ii = 0; $ii < $added_count; $ii++) {
        $line = $added_numbers[$ii];
        $code = $added[$line];
        if (!isset($map[$code])) {
          continue;
        }

        // We found a long matching line; explore all candidates.
        $best_length = 0;
        $best_forward = 0;
        $best_file = null;
        $best_line = null;
        foreach ($map[$code] as $val) {
          list($file, $orig_line) = $val;
          $length = 1;
          $forward = 0;
          // Search also backwards for short lines.
          foreach (array(-1, 1) as $direction) {
            $offset = $direction;
            // The old line is looked up on every step so that the old and
            // new sides advance together.
            while (!isset($copies[$line + $offset]) &&
                isset($added[$line + $offset]) &&
                idx($files[$file], $orig_line + $offset) ===
                  $added[$line + $offset]) {
              $length++;
              $offset += $direction;
            }
            if ($direction > 0) {
              // ($offset - 1) is the number of forward matching lines.
              $forward = $offset - 1;
            }
          }
          // Strict '>' keeps the first of several equally long candidates.
          if ($length > $best_length) {
            $best_length = $length;
            $best_forward = $forward;
            $best_file = $file;
            $best_line = $orig_line;
          }
        }

        $save_file = ($best_file === $changeset->getFilename()
          ? ''
          : $best_file);
        for ($i = $best_length; $i--; ) {
          $copies[$line + $best_forward - $i] = ($best_length < $min_lines
            ? array() // Ignore short blocks.
            : array($save_file, $best_line + $best_forward - $i));
        }

        // The forward-matched lines are already accounted for; skip them.
        $ii += $best_forward;
      }
    }

    $metadata = $changeset->getMetadata();
    // array_filter() drops the empty placeholders left by short blocks.
    $metadata['copy:lines'] = array_filter($copies);
    $changeset->setMetadata($metadata);
  }
}
public function getDiffDict() {
$dict = array(
'id' => $this->getID(),

View file

@ -25,6 +25,23 @@ final class DifferentialHunk extends DifferentialDAO {
// Starting line number used by getAddedLines(): the first line of the hunk
// that is not a removal is keyed with this value.
protected $newOffset;
// NOTE(review): presumably the number of new-file lines this hunk spans; it
// is not read anywhere in the visible code, so confirm before relying on it.
protected $newLen;
/**
 * Collect the lines this hunk adds.
 *
 * @return array Map of line number (counted from $this->newOffset) to the
 *               added line's text without its leading '+'.
 */
public function getAddedLines() {
  $added = array();
  $cursor = $this->newOffset;

  foreach (explode("\n", $this->changes) as $row) {
    // Empty rows carry no marker and do not advance the line counter.
    if ($row === '') {
      continue;
    }

    $marker = $row[0];

    // Rows starting with a backslash are skipped without counting.
    if ($marker === '\\') {
      continue;
    }

    // Removed rows are not counted towards the new line number.
    if ($marker === '-') {
      continue;
    }

    if ($marker === '+') {
      $added[$cursor] = substr($row, 1);
    }

    // Every remaining row, added or not, advances the counter.
    $cursor++;
  }

  return $added;
}
/**
 * Build this hunk's content via makeContent(), passing '-' as the marker to
 * exclude.
 *
 * NOTE(review): presumably this yields the new-file side of the hunk (rows
 * marked '-' left out); makeContent() is not visible here, so confirm.
 */
public function makeNewFile() {
  $exclude = '-';
  return $this->makeContent($exclude);
}