
Implement a chunked, APC-backed graph cache

Summary:
Ref T2683. This is a refinement and simplification of D5257. In particular:

  - D5257 only cached the commit chain, not path changes. This meant that we had to go issue an awkward query (which was slow on Facebook's install) periodically while reading the cache. This was reasonable locally but killed performance at FB scale. Instead, we can include path information in the cache. It is very rare that this is large except in Subversion, and we do not need to use this cache in Subversion. In other VCSes, the scale of this data is quite small (a handful of bytes per commit on average).
  - D5257 required a large, slow offline computation step. This relies on D9044 to populate parent data so we can build the cache online at will, and let it expire with normal LRU/LFU/whatever semantics. We need this parent data for other reasons anyway.
  - D5257 separated graph chunks per-repository. This change assumes we'll be able to pull stuff from APC most of the time and that the cost of switching chunks is not very large, so we can just build one chunk cache across all repositories. This allows the cache to be simpler.
  - D5257 needed an offline cache, and used a unique cache structure. Since this one can be built online it can mostly use normal cache code.
  - This also supports online appends to the cache.
  - Finally, this has a timeout to guarantee a ceiling on the worst case, which is something like a query for a file that has never existed, in a repository which receives exactly 1 commit every time other repositories receive 4095 commits, on a cold cache. If we hit cases like this we can bail after warming the cache up a bit and fall back to asking the VCS for an answer (see the caller-side sketch after this list).
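
To make the query semantics concrete, here is a minimal caller-side sketch (illustration only, not part of this change; `$commit_id` and `$path_id` are placeholder values):

  $graph_cache = new PhabricatorRepositoryGraphCache();

  // The third argument is the time budget in seconds (defaults to 0.5).
  $result = $graph_cache->loadLastModifiedCommitID($commit_id, $path_id, 0.5);

  if ($result === false) {
    // The cache could not answer within the time budget (or the commit has
    // not finished change import); fall back to a VCS query like
    // `git log -n1 <commit> -- <path>`.
  } else if ($result === null) {
    // The path was never touched by this commit or any of its ancestors.
  } else {
    // $result is the ID of the most recent ancestor which touched the path.
  }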

This cache isn't perfect, but I believe it will give us substantial gains in the average case. It can often satisfy "average-looking" queries in 4-8ms, and pathological-ish queries in 20ms on my machine; `hg` usually can't even start up in less than 100ms. The major thing that's attractive about this approach is that it does not require anything external or complicated, and will "just work", even producing reasonable improvements for users without APC.

In followups, I'll modify queries to use this cache and see if it holds up in more realistic workloads.

Test Plan:
  - Used `bin/repository cache` to examine the behavior of this cache.
  - Did some profiling/testing from the web UI using `debug.php`.
  - This //appears// to provide a way to issue this query very quickly in the average case, without the various issues that plagued D5257.

Reviewers: btrahan

Reviewed By: btrahan

Subscribers: epriestley, jhurwitz

Maniphest Tasks: T2683

Differential Revision: https://secure.phabricator.com/D9045
epriestley 2014-05-10 10:10:13 -07:00
parent 95eab2f3b0
commit e4ea092f60
7 changed files with 557 additions and 8 deletions

@@ -1983,8 +1983,10 @@ phutil_register_library_map(array(
'PhabricatorRepositoryEngine' => 'applications/repository/engine/PhabricatorRepositoryEngine.php',
'PhabricatorRepositoryGitCommitChangeParserWorker' => 'applications/repository/worker/commitchangeparser/PhabricatorRepositoryGitCommitChangeParserWorker.php',
'PhabricatorRepositoryGitCommitMessageParserWorker' => 'applications/repository/worker/commitmessageparser/PhabricatorRepositoryGitCommitMessageParserWorker.php',
'PhabricatorRepositoryGraphCache' => 'applications/repository/graphcache/PhabricatorRepositoryGraphCache.php',
'PhabricatorRepositoryGraphStream' => 'applications/repository/daemon/PhabricatorRepositoryGraphStream.php',
'PhabricatorRepositoryListController' => 'applications/repository/controller/PhabricatorRepositoryListController.php',
'PhabricatorRepositoryManagementCacheWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementCacheWorkflow.php',
'PhabricatorRepositoryManagementDeleteWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementDeleteWorkflow.php',
'PhabricatorRepositoryManagementDiscoverWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementDiscoverWorkflow.php',
'PhabricatorRepositoryManagementEditWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementEditWorkflow.php',
@@ -4828,6 +4830,7 @@ phutil_register_library_map(array(
'PhabricatorRepositoryGitCommitMessageParserWorker' => 'PhabricatorRepositoryCommitMessageParserWorker',
'PhabricatorRepositoryGraphStream' => 'Phobject',
'PhabricatorRepositoryListController' => 'PhabricatorRepositoryController',
'PhabricatorRepositoryManagementCacheWorkflow' => 'PhabricatorRepositoryManagementWorkflow',
'PhabricatorRepositoryManagementDeleteWorkflow' => 'PhabricatorRepositoryManagementWorkflow',
'PhabricatorRepositoryManagementDiscoverWorkflow' => 'PhabricatorRepositoryManagementWorkflow',
'PhabricatorRepositoryManagementEditWorkflow' => 'PhabricatorRepositoryManagementWorkflow',

@@ -9,6 +9,53 @@ final class PhabricatorCaches {
return PhabricatorEnv::getEnvConfig('phabricator.cache-namespace');
}
private static function newStackFromCaches(array $caches) {
$caches = self::addNamespaceToCaches($caches);
$caches = self::addProfilerToCaches($caches);
return id(new PhutilKeyValueCacheStack())
->setCaches($caches);
}
/* -( Repository Graph Cache )--------------------------------------------- */
public static function getRepositoryGraphL1Cache() {
static $cache;
if (!$cache) {
$caches = self::buildRepositoryGraphL1Caches();
$cache = self::newStackFromCaches($caches);
}
return $cache;
}
private static function buildRepositoryGraphL1Caches() {
$caches = array();
$apc = new PhutilKeyValueCacheAPC();
if ($apc->isAvailable()) {
$caches[] = $apc;
}
return $caches;
}
public static function getRepositoryGraphL2Cache() {
static $cache;
if (!$cache) {
$caches = self::buildRepositoryGraphL2Caches();
$cache = self::newStackFromCaches($caches);
}
return $cache;
}
private static function buildRepositoryGraphL2Caches() {
$caches = array();
$caches[] = new PhabricatorKeyValueDatabaseCache();
return $caches;
}
/* -( Setup Cache )-------------------------------------------------------- */
@@ -30,10 +77,7 @@
static $cache;
if (!$cache) {
$caches = self::buildSetupCaches();
$cache = self::newStackFromCaches($caches);
}
return $cache;
}
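
As an aside, the L1/L2 split above is consumed by the graph cache's bucket reads roughly like this (sketch only; it mirrors the read path of `getBucketData()` in the new class below, using the `getKey()`/`setKey()` calls that method already makes):

  $l1 = PhabricatorCaches::getRepositoryGraphL1Cache(); // APC, if available
  $l2 = PhabricatorCaches::getRepositoryGraphL2Cache(); // database-backed

  $value = $l1->getKey($cache_key);
  if (!$value) {
    $serialized = $l2->getKey($cache_key);
    if ($serialized) {
      $value = unserialize($serialized);
      // Backfill APC on a database hit so the next read is cheap.
      $l1->setKey($cache_key, $value);
    }
  }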

@@ -575,6 +575,12 @@ final class PhabricatorRepositoryDiscoveryEngine
}
$parent_ids[] = $parent_map[$parent];
}
} else {
// Write an explicit 0 so we can distinguish between "really no
// parents" and "data not available".
if (!$repository->isSVN()) {
$parent_ids = array(0);
}
}
$commit->openTransaction();

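To illustrate the sentinel (made-up commit IDs; the table is `PhabricatorRepository::TABLE_PARENTS` with the `childCommitID`/`parentCommitID` columns used elsewhere in this change):

  // Parent rows after parsing:
  //   child 7 -> parent 6      (ordinary commit)
  //   child 1 -> parent 0      (explicit 0: genuinely has no parents)
  //   (no rows for commit 9)   (parent data not populated yet)
  //
  // In a graph cache bucket this becomes:
  array(
    7 => array(6, null /* ...path IDs... */),
    1 => array(0, null /* ...path IDs... */),
    // Commit 9 is absent, so lookups through it return `false` and the
    // caller falls back to the VCS.
  );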
@@ -0,0 +1,386 @@
<?php
/**
* Given a commit and a path, efficiently determine the most recent ancestor
* commit where the path was touched.
*
* In Git and Mercurial, log operations with a path are relatively slow. For
* example:
*
* git log -n1 <commit> -- <path>
*
* ...routinely takes several hundred milliseconds, and equivalent requests
* often take longer in Mercurial.
*
* Unfortunately, this operation is fundamental to rendering a repository for
* the web, and essentially everything else that's slow can be reduced to this
* plus some trivial work afterward. Making this fast is desirable and powerful,
* and allows us to make other things fast by expressing them in terms of this
* query.
*
* Because the query is fundamentally a graph query, it isn't easy to express
* in a reasonable way in MySQL, and we can't do round trips to the server to
* walk the graph without incurring huge performance penalties.
*
* However, the total amount of data in the graph is relatively small. By
* caching it in chunks and keeping it in APC, we can reasonably load and walk
* the graph in PHP quickly.
*
* For more context, see T2683.
*
* Structure of the Cache
* ======================
*
* The cache divides commits into buckets (see @{method:getBucketSize}). To
* walk the graph, we pull a commit's bucket. The bucket is a map from commit
* IDs to a list of parents and changed paths, separated by `null`. For
* example, a bucket might look like this:
*
* array(
* 1 => array(0, null, 17, 18),
* 2 => array(1, null, 4),
* // ...
* )
*
* This means that commit ID 1 has parent commit 0 (a special value meaning
* no parents) and affected path IDs 17 and 18. Commit ID 2 has parent commit 1,
* and affected path 4.
*
* This data structure attempts to balance compactness, ease of construction,
* simplicity of cache semantics, and lookup performance. In the average case,
* it appears to do a reasonable job at this.
*
* @task query Querying the Graph Cache
* @task cache Cache Internals
*/
final class PhabricatorRepositoryGraphCache {
/* -( Querying the Graph Cache )------------------------------------------- */
/**
* Search the graph cache for the most recent modification to a path.
*
* @param int The commit ID to search ancestors of.
* @param int The path ID to search for changes to.
* @param float Maximum number of seconds to spend trying to satisfy this
* query using the graph cache. By default, `0.5` (500ms).
* @return mixed Commit ID of the most recent change to the path, or `null`
* if the path was never modified in any ancestor, or `false` if the graph
* cache was unable to determine the answer.
* @task query
*/
public function loadLastModifiedCommitID($commit_id, $path_id, $time = 0.5) {
$commit_id = (int)$commit_id;
$path_id = (int)$path_id;
$bucket_data = null;
$data_key = null;
$seen = array();
$t_start = microtime(true);
$iterations = 0;
while (true) {
$bucket_key = $this->getBucketKey($commit_id);
if (($data_key != $bucket_key) || $bucket_data === null) {
$bucket_data = $this->getBucketData($bucket_key);
$data_key = $bucket_key;
}
if (empty($bucket_data[$commit_id])) {
// Rebuild the cache bucket, since the commit might be a very recent
// one that we'll pick up by rebuilding.
$bucket_data = $this->getBucketData($bucket_key, $bucket_data);
if (empty($bucket_data[$commit_id])) {
// A rebuild didn't help. This can occur legitimately if the commit
// is new and hasn't been parsed yet.
return false;
}
// Otherwise, the rebuild gave us the data, so we can keep going.
}
// Sanity check so we can survive and recover from bad data.
if (isset($seen[$commit_id])) {
phlog(pht('Unexpected infinite loop in RepositoryGraphCache!'));
return false;
} else {
$seen[$commit_id] = true;
}
// Each bucket entry is a list: the commit's parent IDs, followed by `null`,
// followed by the modified paths in ascending order. We figure out the
// first parent first, then check if the path was touched. If the path
// was touched, this is the commit we're after. If not, walk backward
// in the tree.
$items = $bucket_data[$commit_id];
$size = count($items);
// Walk past the parent information.
$parent_id = null;
for ($ii = 0; ; ++$ii) {
if ($items[$ii] === null) {
break;
}
if ($parent_id === null) {
$parent_id = $items[$ii];
}
}
// Look for a modification to the path.
for (; $ii < $size; ++$ii) {
$item = $items[$ii];
if ($item > $path_id) {
break;
}
if ($item === $path_id) {
return $commit_id;
}
}
if ($parent_id) {
$commit_id = $parent_id;
// Periodically check if we've spent too long looking for a result
// in the cache, and return so we can fall back to a VCS operation. This
// keeps us from having a degenerate worst case if, e.g., the cache
// is cold and we need to inspect a very large number of blocks
// to satisfy the query.
if (((++$iterations) % 64) === 0) {
$t_end = microtime(true);
if (($t_end - $t_start) > $time) {
return false;
}
}
continue;
}
// If we have an explicit 0, that means this commit really has no parents.
// Usually, it is the first commit in the repository.
if ($parent_id === 0) {
return null;
}
// If we didn't find a parent, the parent data isn't available. We fail
// to find an answer in the cache and fall back to querying the VCS.
return false;
}
}
/* -( Cache Internals )---------------------------------------------------- */
/**
* Get the bucket key for a given commit ID.
*
* @param int Commit ID.
* @return int Bucket key.
* @task cache
*/
private function getBucketKey($commit_id) {
return (int)floor($commit_id / $this->getBucketSize());
}
/**
* Get the cache key for a given bucket key (from @{method:getBucketKey}).
*
* @param int Bucket key.
* @return string Cache key.
* @task cache
*/
private function getBucketCacheKey($bucket_key) {
static $prefix;
if ($prefix === null) {
$self = get_class($this);
$size = $this->getBucketSize();
$prefix = "{$self}:{$size}:1:";
}
return $prefix.$bucket_key;
}
/**
* Get the number of items per bucket.
*
* @return int Number of items to store per bucket.
* @task cache
*/
private function getBucketSize() {
return 4096;
}
/**
* Retrieve or build a graph cache bucket from the cache.
*
* Normally, this operates as a readthrough cache call. It can also be used
* to force a cache update by passing the existing data to `$rebuild_data`.
*
* @param int Bucket key, from @{method:getBucketKey}.
* @param mixed Current data, to force a cache rebuild of this bucket.
* @return array Data from the cache.
* @task cache
*/
private function getBucketData($bucket_key, $rebuild_data = null) {
$cache_key = $this->getBucketCacheKey($bucket_key);
// TODO: This cache stuff could be handled more gracefully, but the
// database cache currently requires values to be strings and needs
// some tweaking to support this as part of a stack. Our cache semantics
// here are also unusual (not purely readthrough) because this cache is
// appendable.
$cache_level1 = PhabricatorCaches::getRepositoryGraphL1Cache();
$cache_level2 = PhabricatorCaches::getRepositoryGraphL2Cache();
if ($rebuild_data === null) {
$bucket_data = $cache_level1->getKey($cache_key);
if ($bucket_data) {
return $bucket_data;
}
$bucket_data = $cache_level2->getKey($cache_key);
if ($bucket_data) {
$unserialized = @unserialize($bucket_data);
if ($unserialized) {
// Fill APC if we got a database hit but missed in APC.
$cache_level1->setKey($cache_key, $unserialized);
return $unserialized;
}
}
}
if (!is_array($rebuild_data)) {
$rebuild_data = array();
}
$bucket_data = $this->rebuildBucket($bucket_key, $rebuild_data);
// Don't bother writing the data if we didn't update anything.
if ($bucket_data !== $rebuild_data) {
$cache_level2->setKey($cache_key, serialize($bucket_data));
$cache_level1->setKey($cache_key, $bucket_data);
}
return $bucket_data;
}
/**
* Rebuild a cache bucket, amending existing data if available.
*
* @param int Bucket key, from @{method:getBucketKey}.
* @param array Existing bucket data.
* @return array Rebuilt bucket data.
* @task cache
*/
private function rebuildBucket($bucket_key, array $current_data) {
$bucket_min = ($bucket_key * $this->getBucketSize());
$bucket_max = ($bucket_min + $this->getBucketSize()) - 1;
// We need to reload all of the commits in the bucket because there is
// no guarantee that they'll get parsed in order, so we can fill large
// commit IDs before small ones. Later on, we'll ignore the commits we
// already know about.
$table_commit = new PhabricatorRepositoryCommit();
$table_repository = new PhabricatorRepository();
$conn_r = $table_commit->establishConnection('r');
// Find all the Git and Mercurial commits in the block which have completed
// change import. We can't fill the cache accurately for commits which have
// not completed change import, so just pretend we don't know about them.
// In these cases, we will ultimately fall back to VCS queries.
$commit_rows = queryfx_all(
$conn_r,
'SELECT c.id FROM %T c
JOIN %T r ON c.repositoryID = r.id AND r.versionControlSystem IN (%Ls)
WHERE c.id BETWEEN %d AND %d
AND (c.importStatus & %d) = %d',
$table_commit->getTableName(),
$table_repository->getTableName(),
array(
PhabricatorRepositoryType::REPOSITORY_TYPE_GIT,
PhabricatorRepositoryType::REPOSITORY_TYPE_MERCURIAL,
),
$bucket_min,
$bucket_max,
PhabricatorRepositoryCommit::IMPORTED_CHANGE,
PhabricatorRepositoryCommit::IMPORTED_CHANGE);
// If we don't have any data, just return the existing data.
if (!$commit_rows) {
return $current_data;
}
// Remove the commits we already have data for. We don't need to rebuild
// these. If there's nothing left, return the existing data.
$commit_ids = ipull($commit_rows, 'id', 'id');
$commit_ids = array_diff_key($commit_ids, $current_data);
if (!$commit_ids) {
return $current_data;
}
// Find all the path changes for the new commits.
$path_changes = queryfx_all(
$conn_r,
'SELECT commitID, pathID FROM %T
WHERE commitID IN (%Ld)
AND (isDirect = 1 OR changeType = %d)',
PhabricatorRepository::TABLE_PATHCHANGE,
$commit_ids,
DifferentialChangeType::TYPE_CHILD);
$path_changes = igroup($path_changes, 'commitID');
// Find all the parents for the new commits.
$parents = queryfx_all(
$conn_r,
'SELECT childCommitID, parentCommitID FROM %T
WHERE childCommitID IN (%Ld)
ORDER BY id ASC',
PhabricatorRepository::TABLE_PARENTS,
$commit_ids);
$parents = igroup($parents, 'childCommitID');
// Build the actual data for the cache.
foreach ($commit_ids as $commit_id) {
$parent_ids = array();
if (isset($parents[$commit_id])) {
foreach ($parents[$commit_id] as $row) {
$parent_ids[] = (int)$row['parentCommitID'];
}
}
if (isset($path_changes[$commit_id])) {
$path_ids = $path_changes[$commit_id];
foreach ($path_ids as $key => $path_id) {
$path_ids[$key] = (int)$path_id['pathID'];
}
sort($path_ids);
} else {
$path_ids = array();
}
$value = $parent_ids;
$value[] = null;
foreach ($path_ids as $path_id) {
$value[] = $path_id;
}
$current_data[$commit_id] = $value;
}
return $current_data;
}
}

@@ -0,0 +1,92 @@
<?php
final class PhabricatorRepositoryManagementCacheWorkflow
extends PhabricatorRepositoryManagementWorkflow {
public function didConstruct() {
$this
->setName('cache')
->setExamples(
"**cache** [__options__] --commit __commit__ --path __path__")
->setSynopsis(pht('Manage the repository graph cache.'))
->setArguments(
array(
array(
'name' => 'commit',
'param' => 'commit',
'help' => pht('Specify a commit to look up.'),
),
array(
'name' => 'path',
'param' => 'path',
'help' => pht('Specify a path to look up.'),
),
));
}
public function execute(PhutilArgumentParser $args) {
$commit_name = $args->getArg('commit');
if ($commit_name === null) {
throw new PhutilArgumentUsageException(
pht('Specify a commit to look up with `--commit`.'));
}
$commit = $this->loadNamedCommit($commit_name);
$path_name = $args->getArg('path');
if ($path_name === null) {
throw new PhutilArgumentUsageException(
pht('Specify a path to look up with `--path`.'));
}
$path_map = id(new DiffusionPathIDQuery(array($path_name)))
->loadPathIDs();
if (empty($path_map[$path_name])) {
throw new PhutilArgumentUsageException(
pht('Path "%s" is not known to Phabricator.', $path_name));
}
$path_id = $path_map[$path_name];
$graph_cache = new PhabricatorRepositoryGraphCache();
$t_start = microtime(true);
$cache_result = $graph_cache->loadLastModifiedCommitID(
$commit->getID(),
$path_id);
$t_end = microtime(true);
$console = PhutilConsole::getConsole();
$console->writeOut(
"%s\n",
pht('Query took %s ms.', new PhutilNumber(1000 * ($t_end - $t_start))));
if ($cache_result === false) {
$console->writeOut(
"%s\n",
pht('Not found in graph cache.'));
} else if ($cache_result === null) {
$console->writeOut(
"%s\n",
pht('Path not modified in any ancestor commit.'));
} else {
$last = id(new DiffusionCommitQuery())
->setViewer($this->getViewer())
->withIDs(array($cache_result))
->executeOne();
if (!$last) {
throw new Exception(pht('Cache returned bogus result!'));
}
$console->writeOut(
"%s\n",
pht(
'Path was last changed at %s.',
$commit->getRepository()->formatCommitName(
$last->getCommitIdentifier())));
}
return 0;
}
}
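
For reference, a lookup can be exercised from the CLI with something like `bin/repository cache --commit <commit> --path <path>` (placeholders, not real values); it prints the query time and then one of the three outcomes described above: a graph cache miss, "Path not modified in any ancestor commit.", or the commit at which the path last changed.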

@@ -116,12 +116,20 @@ final class PhabricatorRepositoryManagementParentsWorkflow
}
$sql = array();
if (!$parents) {
// Write an explicit 0 to indicate "no parents" instead of "no data".
$sql[] = qsprintf(
$conn_w,
'(%d, 0)',
$map[$child]);
} else {
foreach ($parents as $parent) {
$sql[] = qsprintf(
$conn_w,
'(%d, %d)',
$map[$child],
$map[$parent]);
}
}
$commit_table->openTransaction();

@@ -32,6 +32,15 @@ abstract class PhabricatorRepositoryManagementWorkflow
return null;
}
return $this->loadNamedCommits($names);
}
protected function loadNamedCommit($name) {
$map = $this->loadNamedCommits(array($name));
return $map[$name];
}
protected function loadNamedCommits(array $names) {
$query = id(new DiffusionCommitQuery())
->setViewer($this->getViewer())
->withIdentifiers($names);
@@ -49,4 +58,5 @@ abstract class PhabricatorRepositoryManagementWorkflow
return $map;
}
}