mirror of
https://we.phorge.it/source/phorge.git
synced 2025-01-10 23:01:04 +01:00
Detect copied code by own algorithm
Summary: Required for D2321. Deprecates D2320. Uses algorithm described at D2320#16. Complexity of this algorithm would be `O(N)` (`N` stands for number of lines) in most cases. The worst case is `O(A*F)` (`A` stands for number of added lines, `F` for number of colliding lines) but it should be pretty rare. Real-world example is 100 modified files with moved license block (15 lines) in each. This will require 1500*100 comparisons because the algorithm will be trying to find the longest block in each file. Test Plan: `arc diff --only` on commit with copied code. More tests on standalone algorithm. Reviewers: epriestley Reviewed By: epriestley CC: aran, Koolvin Differential Revision: https://secure.phabricator.com/D2333
This commit is contained in:
parent
6a9ef778fc
commit
7affae9345
2 changed files with 83 additions and 0 deletions
|
@ -151,9 +151,75 @@ final class DifferentialDiff extends DifferentialDAO {
|
|||
}
|
||||
$diff->setLineCount($lines);
|
||||
|
||||
$diff->detectCopiedCode();
|
||||
|
||||
return $diff;
|
||||
}
|
||||
|
||||
/**
 * Detect added lines which duplicate old-side lines elsewhere in this diff
 * and record them in each changeset's 'copy:lines' metadata.
 *
 * The old side of every hunk is indexed by line content. Each added line
 * which is at least $min_width bytes long and appears in that index seeds
 * a match; the match is then extended backwards and forwards over adjacent
 * added lines (of any length) for every candidate origin, and the longest
 * candidate wins (the first one found on ties).
 *
 * The resulting 'copy:lines' metadata maps a new-file line number to
 * array($file, $orig_line), where $file is '' when the origin is the
 * changeset's own file. Blocks shorter than $min_lines are not recorded.
 *
 * @param int Minimum length, in bytes, of a line which may seed a match.
 * @param int Minimum number of consecutive lines for a block to be recorded.
 * @return void
 */
private function detectCopiedCode($min_width = 40, $min_lines = 3) {
  // $files[$file][$line] is the old-side text of each line; $map[$code]
  // lists every (file, line) where a sufficiently long line occurs.
  $map = array();
  $files = array();
  foreach ($this->changesets as $changeset) {
    $file = $changeset->getFilename();
    foreach ($changeset->getHunks() as $hunk) {
      $line = $hunk->getOldOffset();
      foreach (explode("\n", $hunk->makeOldFile()) as $code) {
        $files[$file][$line] = $code;
        if (strlen($code) >= $min_width) {
          $map[$code][] = array($file, $line);
        }
        $line++;
      }
    }
  }

  foreach ($this->changesets as $changeset) {
    $copies = array();
    foreach ($changeset->getHunks() as $hunk) {
      $added = $hunk->getAddedLines();

      // Walk the added lines by position rather than with each()/next():
      // each() no longer exists in PHP 8, and combining it with next()
      // advanced the pointer twice, skipping every other added line.
      $added_lines = array_keys($added);
      $added_count = count($added_lines);
      for ($pos = 0; $pos < $added_count; $pos++) {
        $line = $added_lines[$pos];
        $code = $added[$line];
        if (!isset($map[$code])) {
          continue;
        }

        // We found a long matching line. Explore all candidates and keep
        // the first one which yields the longest block.
        $best_length = 0;
        $best_offset = 0;
        $best_file = null;
        $best_line = null;
        foreach ($map[$code] as $val) {
          list($file, $orig_line) = $val;
          $length = 1;
          $offset = 0;
          // Search also backwards for short lines.
          foreach (array(-1, 1) as $direction) {
            $offset = $direction;
            // The original line is looked up again on every step so that
            // each neighbour is compared with its own counterpart.
            while (!isset($copies[$line + $offset]) &&
                   isset($added[$line + $offset]) &&
                   idx($files[$file], $orig_line + $offset) ===
                     $added[$line + $offset]) {
              $length++;
              $offset += $direction;
            }
          }
          if ($length > $best_length) {
            $best_length = $length;
            // ($offset - 1) contains number of forward matching lines.
            $best_offset = $offset - 1;
            $best_file = $file;
            $best_line = $orig_line;
          }
        }

        $is_same_file =
          ((string)$best_file === (string)$changeset->getFilename());
        $save_file = ($is_same_file ? '' : $best_file);
        for ($i = $best_length; $i--; ) {
          $copies[$line + $best_offset - $i] = ($best_length < $min_lines
            ? array() // Ignore short blocks.
            : array($save_file, $best_line + $best_offset - $i));
        }

        // The forward-matching lines are already accounted for; they are
        // consecutive entries of $added, so skip that many positions.
        $pos += $best_offset;
      }
    }

    // array_filter() drops the empty placeholders left by short blocks.
    $metadata = $changeset->getMetadata();
    $metadata['copy:lines'] = array_filter($copies);
    $changeset->setMetadata($metadata);
  }
}
|
||||
|
||||
public function getDiffDict() {
|
||||
$dict = array(
|
||||
'id' => $this->getID(),
|
||||
|
|
|
@ -25,6 +25,23 @@ final class DifferentialHunk extends DifferentialDAO {
|
|||
protected $newOffset;
|
||||
protected $newLen;
|
||||
|
||||
/**
 * Collect the lines this hunk adds, keyed by their line number on the new
 * side of the diff.
 *
 * Numbering starts at the hunk's new offset and advances for every line
 * which is not a removal. Empty lines and lines starting with a backslash
 * are ignored entirely.
 *
 * @return map<int, string> Added line text (without the leading '+'),
 *                          keyed by new-side line number.
 */
public function getAddedLines() {
  $added = array();
  $cursor = $this->newOffset;

  $rows = explode("\n", $this->changes);
  foreach ($rows as $row) {
    if ($row == '') {
      continue;
    }

    $marker = $row[0];
    if ($marker == '\\' || $marker == '-') {
      // Neither kind of line occupies a position in the new file.
      continue;
    }

    if ($marker == '+') {
      $added[$cursor] = substr($row, 1);
    }
    $cursor++;
  }

  return $added;
}
|
||||
|
||||
/**
 * Build this hunk's content with lines marked '-' excluded.
 *
 * @return mixed Whatever makeContent() produces for that exclusion.
 */
public function makeNewFile() {
  $exclude = '-';
  return $this->makeContent($exclude);
}
|
||||
|
|
Loading…
Reference in a new issue