1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2025-01-22 20:51:10 +01:00

Pull Git filesize logic into a separate LowLevel query and use more Iterators

Summary:
Depends on D19829. Ref T13216. See PHI908. The current implementation is kind of a lot to live in `CommitHookEngine` and will likely fail if `git diff-tree` produces more than 2GB of output.

Pull it out and make it slightly more robust against enormous commits. It's probably limited by this, now:

```
implode("\n", $every_path)
```

We could replace that with some `PhutilReverseRopeSource` primitive or something but since we don't have one of those and it seems unlikely that we'll hit this case in practice, I left it here for now with just the easy stuff converted to be stream-oriented.

Test Plan:
Used this script to test the query against various commits, got good results:

```
<?php

require_once 'scripts/init/init-script.php';

$viewer = PhabricatorUser::getOmnipotentUser();

$repository = id(new PhabricatorRepositoryQuery())
  ->setViewer($viewer)
  ->withCallsigns(array('P'))
  ->executeOne();

var_dump(
  id(new DiffusionLowLevelFilesizeQuery())
    ->setRepository($repository)
    ->withIdentifier($argv[1])
    ->execute());
```

Used this to find large commits in history and pull filesizes (worked great, although our largest commit only touches a couple thousand paths):

```
for hash in `git log --format=%H`; do echo -n $hash; echo -n ' '; git diff-tree -r --no-commit-id $hash | wc -l | awk '{print $1}'; done | awk '{print $2 " " $1}' | sort -n
```

Reviewers: amckinley

Reviewed By: amckinley

Maniphest Tasks: T13216

Differential Revision: https://secure.phabricator.com/D19830
This commit is contained in:
epriestley 2018-11-21 16:05:31 -08:00
parent fd12b37d16
commit c6fc05ee09
3 changed files with 131 additions and 90 deletions

View file

@ -839,6 +839,7 @@ phutil_register_library_map(array(
'DiffusionLookSoonConduitAPIMethod' => 'applications/diffusion/conduit/DiffusionLookSoonConduitAPIMethod.php',
'DiffusionLowLevelCommitFieldsQuery' => 'applications/diffusion/query/lowlevel/DiffusionLowLevelCommitFieldsQuery.php',
'DiffusionLowLevelCommitQuery' => 'applications/diffusion/query/lowlevel/DiffusionLowLevelCommitQuery.php',
'DiffusionLowLevelFilesizeQuery' => 'applications/diffusion/query/lowlevel/DiffusionLowLevelFilesizeQuery.php',
'DiffusionLowLevelGitRefQuery' => 'applications/diffusion/query/lowlevel/DiffusionLowLevelGitRefQuery.php',
'DiffusionLowLevelMercurialBranchesQuery' => 'applications/diffusion/query/lowlevel/DiffusionLowLevelMercurialBranchesQuery.php',
'DiffusionLowLevelMercurialPathsQuery' => 'applications/diffusion/query/lowlevel/DiffusionLowLevelMercurialPathsQuery.php',
@ -6250,6 +6251,7 @@ phutil_register_library_map(array(
'DiffusionLookSoonConduitAPIMethod' => 'DiffusionConduitAPIMethod',
'DiffusionLowLevelCommitFieldsQuery' => 'DiffusionLowLevelQuery',
'DiffusionLowLevelCommitQuery' => 'DiffusionLowLevelQuery',
'DiffusionLowLevelFilesizeQuery' => 'DiffusionLowLevelQuery',
'DiffusionLowLevelGitRefQuery' => 'DiffusionLowLevelQuery',
'DiffusionLowLevelMercurialBranchesQuery' => 'DiffusionLowLevelQuery',
'DiffusionLowLevelMercurialPathsQuery' => 'DiffusionLowLevelQuery',

View file

@ -1305,97 +1305,11 @@ final class DiffusionCommitHookEngine extends Phobject {
public function loadFileSizesForCommit($identifier) {
$repository = $this->getRepository();
$vcs = $repository->getVersionControlSystem();
$path_sizes = array();
switch ($vcs) {
case PhabricatorRepositoryType::REPOSITORY_TYPE_GIT:
list($paths_raw) = $repository->execxLocalCommand(
'diff-tree -z -r --no-commit-id %s --',
$identifier);
// With "-z" we get "<fields>\0<filename>\0" for each line. Group the
// delimited text into "<fields>, <filename>" pairs.
$paths_raw = trim($paths_raw, "\0");
$paths_raw = explode("\0", $paths_raw);
if (count($paths_raw) % 2) {
throw new Exception(
pht(
'Unexpected number of output lines from "git diff-tree" when '.
'processing commit ("%s"): got %s lines, expected an even '.
'number.',
$identifier,
phutil_count($paths_raw)));
}
$paths_raw = array_chunk($paths_raw, 2);
$paths = array();
foreach ($paths_raw as $path_raw) {
list($fields, $pathname) = $path_raw;
$fields = explode(' ', $fields);
// Fields are:
//
// :100644 100644 aaaa bbbb M
//
// [0] Old file mode.
// [1] New file mode.
// [2] Old object hash.
// [3] New object hash.
// [4] Change mode.
$paths[] = array(
'path' => $pathname,
'newHash' => $fields[3],
);
}
if ($paths) {
$check_paths = array();
foreach ($paths as $path) {
if ($path['newHash'] === self::EMPTY_HASH) {
$path_sizes[$path['path']] = 0;
continue;
}
$check_paths[$path['newHash']][] = $path['path'];
}
if ($check_paths) {
$future = $repository->getLocalCommandFuture(
'cat-file --batch-check=%s',
'%(objectsize)')
->write(implode("\n", array_keys($check_paths)));
list($sizes) = $future->resolvex();
$sizes = trim($sizes);
$sizes = phutil_split_lines($sizes, false);
if (count($sizes) !== count($check_paths)) {
throw new Exception(
pht(
'Unexpected number of output lines from "git cat-file" when '.
'processing commit ("%s"): got %s lines, expected %s.',
$identifier,
phutil_count($sizes),
phutil_count($check_paths)));
}
foreach ($check_paths as $object_hash => $path_names) {
$object_size = (int)array_shift($sizes);
foreach ($path_names as $path_name) {
$path_sizes[$path_name] = $object_size;
}
}
}
}
break;
default:
throw new Exception(
pht(
'File size limits are not supported for this VCS.'));
}
return $path_sizes;
return id(new DiffusionLowLevelFilesizeQuery())
->setRepository($repository)
->withIdentifier($identifier)
->execute();
}
public function loadCommitRefForCommit($identifier) {

View file

@ -0,0 +1,125 @@
<?php
final class DiffusionLowLevelFilesizeQuery
extends DiffusionLowLevelQuery {
private $identifier;
public function withIdentifier($identifier) {
$this->identifier = $identifier;
return $this;
}
protected function executeQuery() {
if (!strlen($this->identifier)) {
throw new PhutilInvalidStateException('withIdentifier');
}
$type = $this->getRepository()->getVersionControlSystem();
switch ($type) {
case PhabricatorRepositoryType::REPOSITORY_TYPE_GIT:
$result = $this->loadGitFilesizes();
break;
default:
throw new Exception(pht('Unsupported repository type "%s"!', $type));
}
return $result;
}
private function loadGitFilesizes() {
$repository = $this->getRepository();
$identifier = $this->identifier;
$paths_future = $repository->getLocalCommandFuture(
'diff-tree -z -r --no-commit-id %s --',
$identifier);
// With "-z" we get "<fields>\0<filename>\0" for each line. Process the
// delimited text as "<fields>, <filename>" pairs.
$path_lines = id(new LinesOfALargeExecFuture($paths_future))
->setDelimiter("\0");
$paths = array();
$path_pairs = new PhutilChunkedIterator($path_lines, 2);
foreach ($path_pairs as $path_pair) {
if (count($path_pair) != 2) {
throw new Exception(
pht(
'Unexpected number of output lines from "git diff-tree" when '.
'processing commit ("%s"): expected an even number of lines.',
$identifier));
}
list($fields, $pathname) = array_values($path_pair);
$fields = explode(' ', $fields);
// Fields are:
//
// :100644 100644 aaaa bbbb M
//
// [0] Old file mode.
// [1] New file mode.
// [2] Old object hash.
// [3] New object hash.
// [4] Change mode.
$paths[] = array(
'path' => $pathname,
'newHash' => $fields[3],
);
}
$path_sizes = array();
if (!$paths) {
return $path_sizes;
}
$check_paths = array();
foreach ($paths as $path) {
if ($path['newHash'] === DiffusionCommitHookEngine::EMPTY_HASH) {
$path_sizes[$path['path']] = 0;
continue;
}
$check_paths[$path['newHash']][] = $path['path'];
}
if (!$check_paths) {
return $path_sizes;
}
$future = $repository->getLocalCommandFuture(
'cat-file --batch-check=%s',
'%(objectsize)');
$future->write(implode("\n", array_keys($check_paths)));
$size_lines = id(new LinesOfALargeExecFuture($future))
->setDelimiter("\n");
foreach ($size_lines as $line) {
$object_size = (int)$line;
$object_hash = head_key($check_paths);
$path_names = $check_paths[$object_hash];
unset($check_paths[$object_hash]);
foreach ($path_names as $path_name) {
$path_sizes[$path_name] = $object_size;
}
}
if ($check_paths) {
throw new Exception(
pht(
'Unexpected number of output lines from "git cat-file" when '.
'processing commit ("%s").',
$identifier));
}
return $path_sizes;
}
}