1
0
Fork 0
mirror of https://we.phorge.it/source/arcanist.git synced 2025-01-10 23:01:04 +01:00

Improve hg arc diff perf when image files are uploaded

Summary:
arc diff called 'hg cat' twice for every image binary in the diff.
This turns out to take 1 second per call on a large repo because mercurial
has to parse the manifest every time.

Now arc diff batches up all the files and does only two 'hg cat'
commands.  This makes the cost constant relative to the number of
images being uploaded.

Test Plan:
Ran arc diff on a diff with 30 images on both git and hg.
Verified that it was fast and that the images showed up in the web ui.

Reviewers: epriestley

Reviewed By: epriestley

CC: sid0, dschleimer, bos, aran, Korvin

Differential Revision: https://secure.phabricator.com/D5144
This commit is contained in:
durham 2013-02-28 09:46:42 -08:00
parent 0cb1b7efad
commit 8412f7cfd7
3 changed files with 89 additions and 2 deletions

View file

@ -1142,6 +1142,8 @@ final class ArcanistDiffParser {
return;
}
$imagechanges = array();
$changes = $this->changes;
foreach ($changes as $change) {
$path = $change->getCurrentPath();
@ -1183,8 +1185,22 @@ final class ArcanistDiffParser {
continue;
}
$change->setOriginalFileData($repository_api->getOriginalFileData($path));
$change->setCurrentFileData($repository_api->getCurrentFileData($path));
$imagechanges[$path] = $change;
}
// Fetch the actual file contents in batches so repositories
// that have slow random file accesses (i.e. mercurial) can
// optimize the retrieval.
$paths = array_keys($imagechanges);
$filedata = $repository_api->getBulkOriginalFileData($paths);
foreach ($filedata as $path => $data) {
$imagechanges[$path]->setOriginalFileData($data);
}
$filedata = $repository_api->getBulkCurrentFileData($paths);
foreach ($filedata as $path => $data) {
$imagechanges[$path]->setCurrentFileData($data);
}
$this->changes = $changes;

View file

@ -408,6 +408,50 @@ final class ArcanistMercurialAPI extends ArcanistRepositoryAPI {
$this->getWorkingCopyRevision());
}
public function getBulkOriginalFileData($paths) {
return $this->getBulkFileDataAtRevision($paths, $this->getBaseCommit());
}
public function getBulkCurrentFileData($paths) {
return $this->getBulkFileDataAtRevision(
$paths,
$this->getWorkingCopyRevision());
}
private function getBulkFileDataAtRevision($paths, $revision) {
// Calling 'hg cat' on each file individually is slow (1 second per file
// on a large repo) because mercurial has to decompress and parse the
// entire manifest every time. Do it in one large batch instead.
// hg cat will write the file data to files in a temp directory
$tmpdir = Filesystem::createTemporaryDirectory();
// Mercurial doesn't create the directories for us :(
foreach ($paths as $path) {
$tmppath = $tmpdir.'/'.$path;
Filesystem::createDirectory(dirname($tmppath), 0755, true);
}
list($err, $stdout) = $this->execManualLocal(
'cat --rev %s --output %s -- %C',
$revision,
// %p is the formatter for the repo-relative filepath
$tmpdir.'/%p',
implode(' ', $paths));
$filedata = array();
foreach ($paths as $path) {
$tmppath = $tmpdir.'/'.$path;
if (Filesystem::pathExists($tmppath)) {
$filedata[$path] = Filesystem::readFile($tmppath);
}
}
Filesystem::remove($tmpdir);
return $filedata;
}
private function getFileDataAtRevision($path, $revision) {
list($err, $stdout) = $this->execManualLocal(
'cat --rev %s -- %s',

View file

@ -296,6 +296,33 @@ abstract class ArcanistRepositoryAPI {
}
}
/**
* Fetches the original file data for each path provided.
*
* @return map<string, string> Map from path to file data.
*/
public function getBulkOriginalFileData($paths) {
$filedata = array();
foreach ($paths as $path) {
$filedata[$path] = $this->getOriginalFileData($path);
}
return $filedata;
}
/**
* Fetches the current file data for each path provided.
*
* @return map<string, string> Map from path to file data.
*/
public function getBulkCurrentFileData($paths) {
$filedata = array();
foreach ($paths as $path) {
$filedata[$path] = $this->getCurrentFileData($path);
}
return $filedata;
}
/**
* @return Traversable