From c6fc05ee0941f3b417075c9f17c1f2ded0e31042 Mon Sep 17 00:00:00 2001 From: epriestley Date: Wed, 21 Nov 2018 16:05:31 -0800 Subject: [PATCH] Pull Git filesize logic into a separate LowLevel query and use more Iterators Summary: Depends on D19829. Ref T13216. See PHI908. The current implementation is kind of a lot to live in `CommitHookEngine` and will likely fail if `git diff-tree` produces more than 2GB of output. Pull it out and make it slightly more robust against enormous commits. It's probably limited by this, now: ``` implode("\n", $every_path) ``` We could replace that with some `PhutilReverseRopeSource` primitive or something but since we don't have one of those and it seems unlikely that we'll hit this case in practice, I left it here for now with just the easy stuff converted to be stream-oriented. Test Plan: Used this script to test the query against various commits, got good results: ``` setViewer($viewer) ->withCallsigns(array('P')) ->executeOne(); var_dump( id(new DiffusionLowLevelFilesizeQuery()) ->setRepository($repository) ->withIdentifier($argv[1]) ->execute()); ``` Used this to find large commits in history and pull filesizes (worked great, although our largest commit only touches a couple thousand paths): ``` for hash in `git log --format=%H`; do echo -n $hash; echo -n ' '; git diff-tree -r --no-commit-id $hash | wc -l | awk '{print $1}'; done | awk '{print $2 " " $1}' | sort -n ``` Reviewers: amckinley Reviewed By: amckinley Maniphest Tasks: T13216 Differential Revision: https://secure.phabricator.com/D19830 --- src/__phutil_library_map__.php | 2 + .../engine/DiffusionCommitHookEngine.php | 94 +------------ .../DiffusionLowLevelFilesizeQuery.php | 125 ++++++++++++++++++ 3 files changed, 131 insertions(+), 90 deletions(-) create mode 100644 src/applications/diffusion/query/lowlevel/DiffusionLowLevelFilesizeQuery.php diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php index 77012e8a64..89d8a6bfac 100644 --- a/src/__phutil_library_map__.php +++ b/src/__phutil_library_map__.php @@ -839,6 +839,7 @@ phutil_register_library_map(array( 'DiffusionLookSoonConduitAPIMethod' => 'applications/diffusion/conduit/DiffusionLookSoonConduitAPIMethod.php', 'DiffusionLowLevelCommitFieldsQuery' => 'applications/diffusion/query/lowlevel/DiffusionLowLevelCommitFieldsQuery.php', 'DiffusionLowLevelCommitQuery' => 'applications/diffusion/query/lowlevel/DiffusionLowLevelCommitQuery.php', + 'DiffusionLowLevelFilesizeQuery' => 'applications/diffusion/query/lowlevel/DiffusionLowLevelFilesizeQuery.php', 'DiffusionLowLevelGitRefQuery' => 'applications/diffusion/query/lowlevel/DiffusionLowLevelGitRefQuery.php', 'DiffusionLowLevelMercurialBranchesQuery' => 'applications/diffusion/query/lowlevel/DiffusionLowLevelMercurialBranchesQuery.php', 'DiffusionLowLevelMercurialPathsQuery' => 'applications/diffusion/query/lowlevel/DiffusionLowLevelMercurialPathsQuery.php', @@ -6250,6 +6251,7 @@ phutil_register_library_map(array( 'DiffusionLookSoonConduitAPIMethod' => 'DiffusionConduitAPIMethod', 'DiffusionLowLevelCommitFieldsQuery' => 'DiffusionLowLevelQuery', 'DiffusionLowLevelCommitQuery' => 'DiffusionLowLevelQuery', + 'DiffusionLowLevelFilesizeQuery' => 'DiffusionLowLevelQuery', 'DiffusionLowLevelGitRefQuery' => 'DiffusionLowLevelQuery', 'DiffusionLowLevelMercurialBranchesQuery' => 'DiffusionLowLevelQuery', 'DiffusionLowLevelMercurialPathsQuery' => 'DiffusionLowLevelQuery', diff --git a/src/applications/diffusion/engine/DiffusionCommitHookEngine.php b/src/applications/diffusion/engine/DiffusionCommitHookEngine.php index 80736e0e1b..07d598cfa5 100644 --- a/src/applications/diffusion/engine/DiffusionCommitHookEngine.php +++ b/src/applications/diffusion/engine/DiffusionCommitHookEngine.php @@ -1305,97 +1305,11 @@ final class DiffusionCommitHookEngine extends Phobject { public function loadFileSizesForCommit($identifier) { $repository = $this->getRepository(); - $vcs = $repository->getVersionControlSystem(); - $path_sizes = array(); - - switch ($vcs) { - case PhabricatorRepositoryType::REPOSITORY_TYPE_GIT: - list($paths_raw) = $repository->execxLocalCommand( - 'diff-tree -z -r --no-commit-id %s --', - $identifier); - - // With "-z" we get "\0\0" for each line. Group the - // delimited text into ", " pairs. - $paths_raw = trim($paths_raw, "\0"); - $paths_raw = explode("\0", $paths_raw); - if (count($paths_raw) % 2) { - throw new Exception( - pht( - 'Unexpected number of output lines from "git diff-tree" when '. - 'processing commit ("%s"): got %s lines, expected an even '. - 'number.', - $identifier, - phutil_count($paths_raw))); - } - $paths_raw = array_chunk($paths_raw, 2); - - $paths = array(); - foreach ($paths_raw as $path_raw) { - list($fields, $pathname) = $path_raw; - $fields = explode(' ', $fields); - - // Fields are: - // - // :100644 100644 aaaa bbbb M - // - // [0] Old file mode. - // [1] New file mode. - // [2] Old object hash. - // [3] New object hash. - // [4] Change mode. - - $paths[] = array( - 'path' => $pathname, - 'newHash' => $fields[3], - ); - } - - if ($paths) { - $check_paths = array(); - foreach ($paths as $path) { - if ($path['newHash'] === self::EMPTY_HASH) { - $path_sizes[$path['path']] = 0; - continue; - } - $check_paths[$path['newHash']][] = $path['path']; - } - - if ($check_paths) { - $future = $repository->getLocalCommandFuture( - 'cat-file --batch-check=%s', - '%(objectsize)') - ->write(implode("\n", array_keys($check_paths))); - - list($sizes) = $future->resolvex(); - $sizes = trim($sizes); - $sizes = phutil_split_lines($sizes, false); - if (count($sizes) !== count($check_paths)) { - throw new Exception( - pht( - 'Unexpected number of output lines from "git cat-file" when '. - 'processing commit ("%s"): got %s lines, expected %s.', - $identifier, - phutil_count($sizes), - phutil_count($check_paths))); - } - - foreach ($check_paths as $object_hash => $path_names) { - $object_size = (int)array_shift($sizes); - foreach ($path_names as $path_name) { - $path_sizes[$path_name] = $object_size; - } - } - } - } - break; - default: - throw new Exception( - pht( - 'File size limits are not supported for this VCS.')); - } - - return $path_sizes; + return id(new DiffusionLowLevelFilesizeQuery()) + ->setRepository($repository) + ->withIdentifier($identifier) + ->execute(); } public function loadCommitRefForCommit($identifier) { diff --git a/src/applications/diffusion/query/lowlevel/DiffusionLowLevelFilesizeQuery.php b/src/applications/diffusion/query/lowlevel/DiffusionLowLevelFilesizeQuery.php new file mode 100644 index 0000000000..9d97a134ca --- /dev/null +++ b/src/applications/diffusion/query/lowlevel/DiffusionLowLevelFilesizeQuery.php @@ -0,0 +1,125 @@ +identifier = $identifier; + return $this; + } + + protected function executeQuery() { + if (!strlen($this->identifier)) { + throw new PhutilInvalidStateException('withIdentifier'); + } + + $type = $this->getRepository()->getVersionControlSystem(); + switch ($type) { + case PhabricatorRepositoryType::REPOSITORY_TYPE_GIT: + $result = $this->loadGitFilesizes(); + break; + default: + throw new Exception(pht('Unsupported repository type "%s"!', $type)); + } + + return $result; + } + + private function loadGitFilesizes() { + $repository = $this->getRepository(); + $identifier = $this->identifier; + + $paths_future = $repository->getLocalCommandFuture( + 'diff-tree -z -r --no-commit-id %s --', + $identifier); + + // With "-z" we get "\0\0" for each line. Process the + // delimited text as ", " pairs. + + $path_lines = id(new LinesOfALargeExecFuture($paths_future)) + ->setDelimiter("\0"); + + $paths = array(); + + $path_pairs = new PhutilChunkedIterator($path_lines, 2); + foreach ($path_pairs as $path_pair) { + if (count($path_pair) != 2) { + throw new Exception( + pht( + 'Unexpected number of output lines from "git diff-tree" when '. + 'processing commit ("%s"): expected an even number of lines.', + $identifier)); + } + + list($fields, $pathname) = array_values($path_pair); + $fields = explode(' ', $fields); + + // Fields are: + // + // :100644 100644 aaaa bbbb M + // + // [0] Old file mode. + // [1] New file mode. + // [2] Old object hash. + // [3] New object hash. + // [4] Change mode. + + $paths[] = array( + 'path' => $pathname, + 'newHash' => $fields[3], + ); + } + + $path_sizes = array(); + + if (!$paths) { + return $path_sizes; + } + + $check_paths = array(); + foreach ($paths as $path) { + if ($path['newHash'] === DiffusionCommitHookEngine::EMPTY_HASH) { + $path_sizes[$path['path']] = 0; + continue; + } + $check_paths[$path['newHash']][] = $path['path']; + } + + if (!$check_paths) { + return $path_sizes; + } + + $future = $repository->getLocalCommandFuture( + 'cat-file --batch-check=%s', + '%(objectsize)'); + + $future->write(implode("\n", array_keys($check_paths))); + + $size_lines = id(new LinesOfALargeExecFuture($future)) + ->setDelimiter("\n"); + foreach ($size_lines as $line) { + $object_size = (int)$line; + + $object_hash = head_key($check_paths); + $path_names = $check_paths[$object_hash]; + unset($check_paths[$object_hash]); + + foreach ($path_names as $path_name) { + $path_sizes[$path_name] = $object_size; + } + } + + if ($check_paths) { + throw new Exception( + pht( + 'Unexpected number of output lines from "git cat-file" when '. + 'processing commit ("%s").', + $identifier)); + } + + return $path_sizes; + } + +}