Clean up copy detection code a bit

Summary: Ref T1266. This doesn't change any behaviors, but some of this code has a lot of really complicated conditionals and I tried to break that up a bit. Also, reexpress this stuff in terms of the "structured" parser in D12144. Test Plan: Unit tests still pass. They aren't hugely comprehensive but did reliably fail when I screwed stuff up. Reviewers: btrahan Reviewed By: btrahan Subscribers: epriestley Maniphest Tasks: T1266 Differential Revision: https://secure.phabricator.com/D12145
2025-03-28 12:08:14 +01:00 · 2015-03-24 13:12:09 -07:00 · 2015-03-24 13:12:09 -07:00 · 373aaa643a
commit 373aaa643a
parent 74a4c2cf0b
2 changed files with 124 additions and 60 deletions
--- a/resources/celerity/map.php
+++ b/resources/celerity/map.php
@ -100,7 +100,7 @@ return array(
    'rsrc/css/application/releeph/releeph-preview-branch.css' => 'b7a6f4a5',
    'rsrc/css/application/releeph/releeph-request-differential-create-dialog.css' => '8d8b92cd',
    'rsrc/css/application/releeph/releeph-request-typeahead.css' => '667a48ae',
-    'rsrc/css/application/search/search-results.css' => '559cc554',
+    'rsrc/css/application/search/search-results.css' => '15c71110',
    'rsrc/css/application/slowvote/slowvote.css' => '266df6a1',
    'rsrc/css/application/tokens/tokens.css' => '3d0f239e',
    'rsrc/css/application/uiexample/example.css' => '528b19de',
@ -738,7 +738,7 @@ return array(
    'phabricator-prefab' => '72da38cc',
    'phabricator-profile-css' => '1a20dcbf',
    'phabricator-remarkup-css' => 'bc65f3cc',
-    'phabricator-search-results-css' => '559cc554',
+    'phabricator-search-results-css' => '15c71110',
    'phabricator-shaped-request' => '7cbe244b',
    'phabricator-side-menu-view-css' => '7e8c6341',
    'phabricator-slowvote-css' => '266df6a1',
--- a/src/applications/differential/parser/DifferentialChangesetParser.php
+++ b/src/applications/differential/parser/DifferentialChangesetParser.php
@ -1324,17 +1324,20 @@ final class DifferentialChangesetParser {
    foreach ($changesets as $changeset) {
      $file = $changeset->getFilename();
      foreach ($changeset->getHunks() as $hunk) {
-        $line = $hunk->getOldOffset();
-        foreach (explode("\n", $hunk->getChanges()) as $code) {
-          $type = (isset($code[0]) ? $code[0] : '');
-          if ($type == '-' || $type == ' ') {
-            $code = trim(substr($code, 1));
-            $files[$file][$line] = $code;
-            $types[$file][$line] = $type;
-            if (strlen($code) >= $min_width) {
-              $map[$code][] = array($file, $line);
-            }
-            $line++;
+        $lines = $hunk->getStructuredOldFile();
+        foreach ($lines as $line => $info) {
+          $type = $info['type'];
+          if ($type == '\\') {
+            continue;
+          }
+          $types[$file][$line] = $type;
+
+          $text = $info['text'];
+          $text = trim($text);
+          $files[$file][$line] = $text;
+
+          if (strlen($text) >= $min_width) {
+            $map[$text][] = array($file, $line);
          }
        }
      }
@ -1343,57 +1346,118 @@ final class DifferentialChangesetParser {
    foreach ($changesets as $changeset) {
      $copies = array();
      foreach ($changeset->getHunks() as $hunk) {
-        $added = array_map('trim', $hunk->getAddedLines());
-        for (reset($added); list($line, $code) = each($added); ) {
-          if (isset($map[$code])) { // We found a long matching line.
+        $added = $hunk->getStructuredNewFile();

-            if (count($map[$code]) > 16) {
-              // If there are a large number of identical lines in this diff,
-              // don't try to figure out where this block came from: the
-              // analysis is O(N^2), since we need to compare every line
-              // against every other line. Even if we arrive at a result, it
-              // is unlikely to be meaningful. See T5041.
-              continue 2;
-            }
-
-            $best_length = 0;
-            foreach ($map[$code] as $val) { // Explore all candidates.
-              list($file, $orig_line) = $val;
-              $length = 1;
-              // Search also backwards for short lines.
-              foreach (array(-1, 1) as $direction) {
-                $offset = $direction;
-                while (!isset($copies[$line + $offset]) &&
-                    isset($added[$line + $offset]) &&
-                    idx($files[$file], $orig_line + $offset) ===
-                      $added[$line + $offset]) {
-                  $length++;
-                  $offset += $direction;
-                }
-              }
-              if ($length > $best_length ||
-                  ($length == $best_length && // Prefer moves.
-                   idx($types[$file], $orig_line) == '-')) {
-                $best_length = $length;
-                // ($offset - 1) contains number of forward matching lines.
-                $best_offset = $offset - 1;
-                $best_file = $file;
-                $best_line = $orig_line;
-              }
-            }
-            $file = ($best_file == $changeset->getFilename() ? '' : $best_file);
-            for ($i = $best_length; $i--; ) {
-              $type = idx($types[$best_file], $best_line + $best_offset - $i);
-              $copies[$line + $best_offset - $i] = ($best_length < $min_lines
-                ? array() // Ignore short blocks.
-                : array($file, $best_line + $best_offset - $i, $type));
-            }
-            for ($i = 0; $i < $best_offset; $i++) {
-              next($added);
-            }
+        foreach ($added as $line => $info) {
+          if ($info['type'] != '+') {
+            unset($added[$line]);
+            continue;
          }
+          $added[$line] = trim($info['text']);
+        }
+
+        $skip_lines = 0;
+        foreach ($added as $line => $code) {
+          if ($skip_lines) {
+            // We're skipping lines that we already processed because we
+            // extended a block above them downward to include them.
+            $skip_lines--;
+            continue;
+          }
+
+          if (empty($map[$code])) {
+            // This line was too short to trigger copy/move detection.
+            continue;
+          }
+
+          if (count($map[$code]) > 16) {
+            // If there are a large number of identical lines in this diff,
+            // don't try to figure out where this block came from: the analysis
+            // is O(N^2), since we need to compare every line against every
+            // other line. Even if we arrive at a result, it is unlikely to be
+            // meaningful. See T5041.
+            continue;
+          }
+
+          $best_length = 0;
+
+          // Explore all candidates.
+          foreach ($map[$code] as $val) {
+            list($file, $orig_line) = $val;
+            $length = 1;
+
+            // Search backward and forward to find all of the adjacent lines
+            // which match.
+            foreach (array(-1, 1) as $direction) {
+              $offset = $direction;
+              while (true) {
+                if (isset($copies[$line + $offset])) {
+                  // If we run into a block above us which we've already
+                  // attributed to a move or copy from elsewhere, stop
+                  // looking.
+                  break;
+                }
+
+                if (!isset($added[$line + $offset])) {
+                  // If we've run off the beginning or end of the new file,
+                  // stop looking.
+                  break;
+                }
+
+                if (!isset($files[$file][$orig_line + $offset])) {
+                  // If we've run off the beginning or end of the original
+                  // file, we also stop looking.
+                  break;
+                }
+
+                $old = $files[$file][$orig_line + $offset];
+                $new = $added[$line + $offset];
+                if ($old !== $new) {
+                  // If the old line doesn't match the new line, stop
+                  // looking.
+                  break;
+                }
+
+                $length++;
+                $offset += $direction;
+              }
+            }
+
+            if ($length < $best_length) {
+              // If we already know of a better source (more matching lines)
+              // for this move/copy, stick with that one. We prefer long
+              // copies/moves which match a lot of context over short ones.
+              continue;
+            }
+
+            if ($length == $best_length) {
+              if (idx($types[$file], $orig_line) != '-') {
+                // If we already know of an equally good source (same number
+                // of matching lines) and this isn't a move, stick with the
+                // other one. We prefer moves over copies.
+                continue;
+              }
+            }
+
+            $best_length = $length;
+            // ($offset - 1) contains number of forward matching lines.
+            $best_offset = $offset - 1;
+            $best_file = $file;
+            $best_line = $orig_line;
+          }
+
+          $file = ($best_file == $changeset->getFilename() ? '' : $best_file);
+          for ($i = $best_length; $i--; ) {
+            $type = idx($types[$best_file], $best_line + $best_offset - $i);
+            $copies[$line + $best_offset - $i] = ($best_length < $min_lines
+              ? array() // Ignore short blocks.
+              : array($file, $best_line + $best_offset - $i, $type));
+          }
+
+          $skip_lines = $best_offset;
        }
      }
+
      $copies = array_filter($copies);
      if ($copies) {
        $metadata = $changeset->getMetadata();