1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2024-11-25 16:22:43 +01:00

When we fail to acquire a repository lock, try to provide a hint about why

Summary:
Ref T13202. See PHI889. If the lock log is enabled, we can try to offer more details about lock holders.

When we fail to acquire a lock:

  - check for recent acquisitions and suggest that this is a bottleneck issue;
  - if there are no recent acquisitions, check for the last acquisition and print details about it (what process, how long ago, whether or not we believe it was released).

Test Plan:
  - Enabled the lock log.
  - Changed the lock wait time to 1 second.
  - Added a `sleep(10)` after grabbing the lock.
  - In one window, ran a Conduit call or a `git fetch`.
  - In another window, ran another operation.
  - Got useful/sensible errors for both ssh and web lock holders, for example:

> PhutilProxyException: Failed to acquire read lock after waiting 1 second(s). You may be able to retry later. (This lock was most recently acquired by a process (pid=12609, host=orbital-3.local, sapi=apache2handler, controller=PhabricatorConduitAPIController, method=diffusion.rawdiffquery) 3 second(s) ago. There is no record of this lock being released.)

> PhutilProxyException: Failed to acquire read lock after waiting 1 second(s). You may be able to retry later. (This lock was most recently acquired by a process (pid=65251, host=orbital-3.local, sapi=cli, argv=/Users/epriestley/dev/core/lib/phabricator/bin/ssh-exec --phabricator-ssh-device local.phacility.net --phabricator-ssh-key 2) 2 second(s) ago. There is no record of this lock being released.)

Reviewers: amckinley

Reviewed By: amckinley

Maniphest Tasks: T13202

Differential Revision: https://secure.phabricator.com/D19702
This commit is contained in:
epriestley 2018-09-23 11:01:29 -07:00
parent 7db265cd5d
commit 021c612cb2
2 changed files with 160 additions and 14 deletions

View file

@ -170,12 +170,13 @@ final class DiffusionRepositoryClusterEngine extends Phobject {
pht(
'Acquired read lock immediately.'));
}
} catch (Exception $ex) {
} catch (PhutilLockException $ex) {
throw new PhutilProxyException(
pht(
'Failed to acquire read lock after waiting %s second(s). You '.
'may be able to retry later.',
new PhutilNumber($lock_wait)),
'may be able to retry later. (%s)',
new PhutilNumber($lock_wait),
$ex->getHint()),
$ex);
}
@ -349,12 +350,13 @@ final class DiffusionRepositoryClusterEngine extends Phobject {
pht(
'Acquired write lock immediately.'));
}
} catch (Exception $ex) {
} catch (PhutilLockException $ex) {
throw new PhutilProxyException(
pht(
'Failed to acquire write lock after waiting %s second(s). You '.
'may be able to retry later.',
new PhutilNumber($lock_wait)),
'may be able to retry later. (%s)',
new PhutilNumber($lock_wait),
$ex->getHint()),
$ex);
}

View file

@ -144,7 +144,8 @@ final class PhabricatorGlobalLock extends PhutilLock {
$ok = head($result);
if (!$ok) {
throw new PhutilLockException($lock_name);
throw id(new PhutilLockException($lock_name))
->setHint($this->newHint($lock_name, $wait));
}
$conn->rememberLock($lock_name);
@ -152,13 +153,7 @@ final class PhabricatorGlobalLock extends PhutilLock {
$this->conn = $conn;
if ($this->shouldLogLock()) {
global $argv;
$lock_context = array(
'pid' => getmypid(),
'host' => php_uname('n'),
'argv' => $argv,
);
$lock_context = $this->newLockContext();
$log = id(new PhabricatorDaemonLockLog())
->setLockName($lock_name)
@ -228,4 +223,153 @@ final class PhabricatorGlobalLock extends PhutilLock {
return true;
}
private function newLockContext() {
$context = array(
'pid' => getmypid(),
'host' => php_uname('n'),
'sapi' => php_sapi_name(),
);
global $argv;
if ($argv) {
$context['argv'] = $argv;
}
$access_log = null;
// TODO: There's currently no cohesive way to get the parameterized access
// log for the current request across different request types. Web requests
// have an "AccessLog", SSH requests have an "SSHLog", and other processes
// (like scripts) have no log. But there's no method to say "give me any
// log you've got". For now, just test if we have a web request and use the
// "AccessLog" if we do, since that's the only one we actually read any
// parameters from.
// NOTE: "PhabricatorStartup" is only available from web requests, not
// from CLI scripts.
if (class_exists('PhabricatorStartup', false)) {
$access_log = PhabricatorAccessLog::getLog();
}
if ($access_log) {
$controller = $access_log->getData('C');
if ($controller) {
$context['controller'] = $controller;
}
$method = $access_log->getData('m');
if ($method) {
$context['method'] = $method;
}
}
return $context;
}
private function newHint($lock_name, $wait) {
if (!$this->shouldLogLock()) {
return pht(
'Enable the lock log for more detailed information about '.
'which process is holding this lock.');
}
$now = PhabricatorTime::getNow();
// First, look for recent logs. If other processes have been acquiring and
// releasing this lock while we've been waiting, this is more likely to be
// a contention/throughput issue than an issue with something hung while
// holding the lock.
$limit = 100;
$logs = id(new PhabricatorDaemonLockLog())->loadAllWhere(
'lockName = %s AND dateCreated >= %d ORDER BY id ASC LIMIT %d',
$lock_name,
($now - $wait),
$limit);
if ($logs) {
if (count($logs) === $limit) {
return pht(
'During the last %s second(s) spent waiting for the lock, more '.
'than %s other process(es) acquired it, so this is likely a '.
'bottleneck. Use "bin/lock log --name %s" to review log activity.',
new PhutilNumber($wait),
new PhutilNumber($limit),
$lock_name);
} else {
return pht(
'During the last %s second(s) spent waiting for the lock, %s '.
'other process(es) acquired it, so this is likely a '.
'bottleneck. Use "bin/lock log --name %s" to review log activity.',
new PhutilNumber($wait),
phutil_count($logs),
$lock_name);
}
}
$last_log = id(new PhabricatorDaemonLockLog())->loadOneWhere(
'lockName = %s ORDER BY id DESC LIMIT 1',
$lock_name);
if ($last_log) {
$info = array();
$acquired = $last_log->getDateCreated();
$context = $last_log->getLockContext();
$process_info = array();
$pid = idx($context, 'pid');
if ($pid) {
$process_info[] = 'pid='.$pid;
}
$host = idx($context, 'host');
if ($host) {
$process_info[] = 'host='.$host;
}
$sapi = idx($context, 'sapi');
if ($sapi) {
$process_info[] = 'sapi='.$sapi;
}
$argv = idx($context, 'argv');
if ($argv) {
$process_info[] = 'argv='.(string)csprintf('%LR', $argv);
}
$controller = idx($context, 'controller');
if ($controller) {
$process_info[] = 'controller='.$controller;
}
$method = idx($context, 'method');
if ($method) {
$process_info[] = 'method='.$method;
}
$process_info = implode(', ', $process_info);
$info[] = pht(
'This lock was most recently acquired by a process (%s) '.
'%s second(s) ago.',
$process_info,
new PhutilNumber($now - $acquired));
$released = $last_log->getLockReleased();
if ($released) {
$info[] = pht(
'This lock was released %s second(s) ago.',
new PhutilNumber($now - $released));
} else {
$info[] = pht('There is no record of this lock being released.');
}
return implode(' ', $info);
}
return pht(
'Found no records of processes acquiring or releasing this lock.');
}
}