2012-06-27 13:59:12 -07:00
|
|
|
<?php
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Global, MySQL-backed lock. This is a high-reliability, low-performance
|
|
|
|
* global lock.
|
|
|
|
*
|
|
|
|
* The lock is maintained by using GET_LOCK() in MySQL, and automatically
|
|
|
|
* released when the connection terminates. Thus, this lock can safely be used
|
|
|
|
* to control access to shared resources without implementing any sort of
|
|
|
|
* timeout or override logic: the lock can't normally be stuck in a locked state
|
|
|
|
* with no process actually holding the lock.
|
|
|
|
*
|
|
|
|
* However, acquiring the lock is moderately expensive (several network
|
|
|
|
* roundtrips). This makes it unsuitable for tasks where lock performance is
|
|
|
|
* important.
|
|
|
|
*
|
|
|
|
* $lock = PhabricatorGlobalLock::newLock('example');
|
|
|
|
* $lock->lock();
|
|
|
|
* do_contentious_things();
|
|
|
|
* $lock->unlock();
|
|
|
|
*
|
2012-07-09 10:39:30 -07:00
|
|
|
* NOTE: This lock is not completely global; it is namespaced to the active
|
|
|
|
* storage namespace so that unit tests running in separate table namespaces
|
|
|
|
* are isolated from one another.
|
|
|
|
*
|
2012-06-27 13:59:12 -07:00
|
|
|
* @task construct Constructing Locks
|
|
|
|
* @task impl Implementation
|
|
|
|
*/
|
|
|
|
final class PhabricatorGlobalLock extends PhutilLock {
|
|
|
|
|
2018-03-05 14:07:47 -08:00
|
|
|
private $parameters;
|
2012-06-27 13:59:12 -07:00
|
|
|
private $conn;
|
Make cluster repositories more resistant to freezing
Summary:
Ref T10860. This allows us to recover if the connection to the database is lost during a push.
If we lose the connection to the master database during a push, we would previously freeze the repository. This is very safe, but not very operator-friendly since you have to go manually unfreeze it.
We don't need to be quite this aggressive about freezing things. The repository state is still consistent after we've "upgraded" the lock by setting `isWriting = 1`, so we're actually fine even if we lost the global lock.
Instead of just freezing the repository immediately, sit there in a loop waiting for the master to come back up for a few minutes. If it recovers, we can release the lock and everything will be OK again.
Basically, the changes are:
- If we can't release the lock at first, sit in a loop trying really hard to release it for a while.
- Add a unique lock identifier so we can be certain we're only releasing //our// lock no matter what else is going on.
- Do the version reads on the same connection holding the lock, so we can be sure we haven't lost the lock before we do that read.
Test Plan:
- Added a `sleep(10)` after accepting the write but before releasing the lock so I could run `mysqld stop` and force this issue to occur.
- Pushed like this:
```
$ echo D >> record && git commit -am D && git push
[master 707ecc3] D
1 file changed, 1 insertion(+)
# Push received by "local001.phacility.net", forwarding to cluster host.
# Waiting up to 120 second(s) for a cluster write lock...
# Acquired write lock immediately.
# Waiting up to 120 second(s) for a cluster read lock on "local001.phacility.net"...
# Acquired read lock immediately.
# Device "local001.phacility.net" is already a cluster leader and does not need to be synchronized.
# Ready to receive on cluster host "local001.phacility.net".
Counting objects: 3, done.
Delta compression using up to 8 threads.
Compressing objects: 100% (2/2), done.
Writing objects: 100% (3/3), 254 bytes | 0 bytes/s, done.
Total 3 (delta 1), reused 0 (delta 0)
BEGIN SLEEP
```
- Here, I stopped `mysqld` from the CLI in another terminal window.
```
END SLEEP
# CRITICAL. Failed to release cluster write lock!
# The connection to the master database was lost while receiving the write.
# This process will spend 300 more second(s) attempting to recover, then give up.
```
- Here, I started `mysqld` again.
```
# RECOVERED. Link to master database was restored.
# Released cluster write lock.
To ssh://local@localvault.phacility.com/diffusion/26/locktopia.git
2cbf87c..707ecc3 master -> master
```
Reviewers: chad
Reviewed By: chad
Maniphest Tasks: T10860
Differential Revision: https://secure.phabricator.com/D15792
2016-04-24 10:07:35 -07:00
|
|
|
private $isExternalConnection = false;
|
2018-03-05 14:22:13 -08:00
|
|
|
private $log;
|
|
|
|
private $disableLogging;
|
2012-06-27 13:59:12 -07:00
|
|
|
|
2012-08-10 11:28:43 -07:00
|
|
|
private static $pool = array();
|
|
|
|
|
2012-06-27 13:59:12 -07:00
|
|
|
|
|
|
|
/* -( Constructing Locks )------------------------------------------------- */
|
|
|
|
|
|
|
|
|
2018-03-05 14:07:47 -08:00
|
|
|
public static function newLock($name, $parameters = array()) {
  // Locks are scoped to the active storage namespace. The namespace is
  // digested (to 20 characters, presumably) before being embedded in the
  // lock name.
  $namespace = PhabricatorLiskDAO::getStorageNamespace();
  $namespace = PhabricatorHash::digestToLength($namespace, 20);

  // Sort by key so the same set of parameters always serializes to the same
  // string, no matter what order the caller listed them in.
  ksort($parameters);

  $pairs = array();
  foreach ($parameters as $key => $parameter) {
    $is_valid_key = preg_match('/^[a-zA-Z0-9]+\z/', $key);
    if (!$is_valid_key) {
      throw new Exception(
        pht(
          'Lock parameter key "%s" must be alphanumeric.',
          $key));
    }

    // Only scalars and null are accepted as parameter values.
    $is_valid_value = (is_scalar($parameter) || is_null($parameter));
    if (!$is_valid_value) {
      throw new Exception(
        pht(
          'Lock parameter for key "%s" must be a scalar.',
          $key));
    }

    $pairs[] = $key.'='.phutil_json_encode($parameter);
  }

  // The lock name and its serialized parameters are digested together, so
  // different parameters produce different locks.
  $local = $name.'('.implode(', ', $pairs).')';
  $local = PhabricatorHash::digestToLength($local, 20);

  $full_name = 'ph:'.$namespace.':'.$local;

  // Reuse the lock object already registered under this name, if any.
  $lock = self::getLock($full_name);
  if ($lock) {
    return $lock;
  }

  $lock = new self($full_name);
  self::registerLock($lock);

  // Keep the (sorted) parameters so they can be written to the lock log.
  $lock->parameters = $parameters;

  return $lock;
}
|
|
|
|
|
2015-12-01 07:58:14 +11:00
|
|
|
/**
|
|
|
|
* Use a specific database connection for locking.
|
|
|
|
*
|
|
|
|
* By default, `PhabricatorGlobalLock` will lock on the "repository" database
|
|
|
|
* (somewhat arbitrarily). In most cases this is fine, but this method can
|
|
|
|
* be used to lock on a specific connection.
|
|
|
|
*
|
|
|
|
* @param AphrontDatabaseConnection
|
|
|
|
* @return this
|
|
|
|
*/
|
|
|
|
public function useSpecificConnection(AphrontDatabaseConnection $conn) {
  // Flag the connection as caller-owned. doUnlock() consults this flag when
  // deciding whether the connection may be closed and added to the pool.
  $this->isExternalConnection = true;
  $this->conn = $conn;

  return $this;
}
|
|
|
|
|
2018-03-05 14:22:13 -08:00
|
|
|
public function setDisableLogging($disable) {
  // When truthy, shouldLogLock() returns false for this lock, so no lock
  // log row is written on acquisition.
  $this->disableLogging = $disable;

  return $this;
}
|
|
|
|
|
2012-06-27 13:59:12 -07:00
|
|
|
|
|
|
|
/* -( Implementation )----------------------------------------------------- */
|
|
|
|
|
2012-07-05 16:03:43 -07:00
|
|
|
protected function doLock($wait) {
  $conn = $this->conn;

  // Track whether this call sourced the connection itself (from the pool or
  // by establishing a new one). Only such connections may be recycled if
  // acquisition fails; a connection which was already set on the lock is
  // left untouched.
  $owns_connection = !$conn;

  if (!$conn) {
    // Try to reuse a connection from the connection pool.
    $conn = array_pop(self::$pool);
  }

  if (!$conn) {
    // NOTE: Using the 'repository' database somewhat arbitrarily, mostly
    // because the first client of locks is the repository daemons. We must
    // always use the same database for all locks, but don't access any
    // tables so we could use any valid database. We could build a
    // database-free connection instead, but that's kind of messy and we
    // might forget about it in the future if we vertically partition the
    // application.
    $dao = new PhabricatorRepository();

    // NOTE: Using "force_new" to make sure each lock is on its own
    // connection.
    $conn = $dao->establishConnection('w', $force_new = true);
  }

  // NOTE: Since MySQL will disconnect us if we're idle for too long, we set
  // the wait_timeout to an enormous value, to allow us to hold the
  // connection open indefinitely (or, at least, for 24 days).
  $max_allowed_timeout = 2147483;
  queryfx($conn, 'SET wait_timeout = %d', $max_allowed_timeout);

  $lock_name = $this->getName();

  $result = queryfx_one(
    $conn,
    'SELECT GET_LOCK(%s, %f)',
    $lock_name,
    $wait);

  $ok = head($result);
  if (!$ok) {
    // Build the exception (including its hint) first, so failure reporting
    // is unaffected by the connection cleanup below.
    $exception = id(new PhutilLockException($lock_name))
      ->setHint($this->newHint($lock_name, $wait));

    // We failed to acquire the lock, but we're done with the connection.
    // Recycle it the same way doUnlock() does instead of dropping it, so
    // repeated failed attempts on a contended lock don't each establish a
    // brand new connection. Never recycle a connection we did not source
    // ourselves.
    if ($owns_connection) {
      $conn->close();
      self::$pool[] = $conn;
    }

    throw $exception;
  }

  $conn->rememberLock($lock_name);

  $this->conn = $conn;

  if ($this->shouldLogLock()) {
    // Record who acquired the lock so later failures to acquire it can
    // report details about the holder.
    $lock_context = $this->newLockContext();

    $log = id(new PhabricatorDaemonLockLog())
      ->setLockName($lock_name)
      ->setLockParameters($this->parameters)
      ->setLockContext($lock_context)
      ->save();

    $this->log = $log;
  }
}
|
|
|
|
|
|
|
|
protected function doUnlock() {
  $lock_name = $this->getName();

  $conn = $this->conn;

  // Capture ownership BEFORE resetting the flag below. Checking the flag
  // after it has been reset would always treat the connection as our own,
  // and a caller-supplied connection would be closed and pushed into the
  // shared pool.
  $is_external = $this->isExternalConnection;

  try {
    $result = queryfx_one(
      $conn,
      'SELECT RELEASE_LOCK(%s)',
      $lock_name);
    $conn->forgetLock($lock_name);
  } catch (Exception $ex) {
    // Treat a failed query the same as an unsuccessful release.
    $result = array(null);
  }

  $ok = head($result);
  if (!$ok) {
    // TODO: We could throw here, but then this lock doesn't get marked
    // unlocked and we throw again later when exiting. It also doesn't
    // particularly matter for any current applications. For now, just
    // swallow the error.
  }

  $this->conn = null;
  $this->isExternalConnection = false;

  if (!$is_external) {
    // The connection is ours: close it and keep the object around so a
    // later doLock() can reuse it.
    $conn->close();
    self::$pool[] = $conn;
  }

  if ($this->log) {
    $log = $this->log;
    $this->log = null;

    // Mark the log row as released. This uses the log's own connection,
    // not the lock connection.
    $log_conn = $log->establishConnection('w');
    queryfx(
      $log_conn,
      'UPDATE %T SET lockReleased = UNIX_TIMESTAMP() WHERE id = %d',
      $log->getTableName(),
      $log->getID());
  }
}
|
|
|
|
|
|
|
|
private function shouldLogLock() {
  // Logging can be switched off per lock with setDisableLogging().
  if ($this->disableLogging) {
    return false;
  }

  // Otherwise, log only when the lock log garbage collector reports a
  // truthy retention policy.
  $collector = new PhabricatorDaemonLockLogGarbageCollector();

  return (bool)$collector->getRetentionPolicy();
}
|
|
|
|
|
When we fail to acquire a repository lock, try to provide a hint about why
Summary:
Ref T13202. See PHI889. If the lock log is enabled, we can try to offer more details about lock holders.
When we fail to acquire a lock:
- check for recent acquisitions and suggest that this is a bottleneck issue;
- if there are no recent acquisitions, check for the last acquisition and print details about it (what process, how long ago, whether or not we believe it was released).
Test Plan:
- Enabled the lock log.
- Changed the lock wait time to 1 second.
- Added a `sleep(10)` after grabbing the lock.
- In one window, ran a Conduit call or a `git fetch`.
- In another window, ran another operation.
- Got useful/sensible errors for both ssh and web lock holders, for example:
> PhutilProxyException: Failed to acquire read lock after waiting 1 second(s). You may be able to retry later. (This lock was most recently acquired by a process (pid=12609, host=orbital-3.local, sapi=apache2handler, controller=PhabricatorConduitAPIController, method=diffusion.rawdiffquery) 3 second(s) ago. There is no record of this lock being released.)
> PhutilProxyException: Failed to acquire read lock after waiting 1 second(s). You may be able to retry later. (This lock was most recently acquired by a process (pid=65251, host=orbital-3.local, sapi=cli, argv=/Users/epriestley/dev/core/lib/phabricator/bin/ssh-exec --phabricator-ssh-device local.phacility.net --phabricator-ssh-key 2) 2 second(s) ago. There is no record of this lock being released.)
Reviewers: amckinley
Reviewed By: amckinley
Maniphest Tasks: T13202
Differential Revision: https://secure.phabricator.com/D19702
2018-09-23 11:01:29 -07:00
|
|
|
/**
 * Build a map describing the current process.
 *
 * The map always contains "pid", "host" and "sapi". It additionally contains
 * "argv" when the global `$argv` is nonempty, and "controller" and "method"
 * when a web access log is available and has values for its "C" and "m"
 * fields.
 *
 * @return map<string, wild> Context describing the current process.
 */
private function newLockContext() {
  global $argv;

  $result = array();
  $result['pid'] = getmypid();
  $result['host'] = php_uname('n');
  $result['sapi'] = php_sapi_name();

  if ($argv) {
    $result['argv'] = $argv;
  }

  // TODO: There's currently no cohesive way to get the parameterized access
  // log for the current request across different request types. Web requests
  // have an "AccessLog", SSH requests have an "SSHLog", and other processes
  // (like scripts) have no log. But there's no method to say "give me any
  // log you've got". For now, just test if we have a web request and use the
  // "AccessLog" if we do, since that's the only one we actually read any
  // parameters from.

  // NOTE: "PhabricatorStartup" is only available from web requests, not
  // from CLI scripts. The second argument disables autoloading, so this only
  // tests whether the class has already been loaded.
  if (!class_exists('PhabricatorStartup', false)) {
    return $result;
  }

  $log = PhabricatorAccessLog::getLog();
  if (!$log) {
    return $result;
  }

  // Map of context keys to the access log fields they are copied from.
  // Falsey values are skipped so the context only carries useful entries.
  $field_map = array(
    'controller' => 'C',
    'method' => 'm',
  );

  foreach ($field_map as $context_key => $log_key) {
    $value = $log->getData($log_key);
    if ($value) {
      $result[$context_key] = $value;
    }
  }

  return $result;
}
|
|
|
|
|
|
|
|
/**
 * Build a human-readable hint describing recent activity on a lock.
 *
 * When the lock log is not enabled, the hint only suggests enabling it.
 * Otherwise, lock log rows are consulted: first rows created within the last
 * `$wait` seconds, then, if there are none, the single most recent row for
 * the lock.
 *
 * @param string Lock name, matched against the "lockName" column.
 * @param int Number of seconds spent waiting for the lock.
 * @return string Human-readable hint text.
 */
private function newHint($lock_name, $wait) {
  if (!$this->shouldLogLock()) {
    return pht(
      'Enable the lock log for more detailed information about '.
      'which process is holding this lock.');
  }

  $now = PhabricatorTime::getNow();
  $table = new PhabricatorDaemonLockLog();

  // First, look for recent logs. If other processes have been acquiring and
  // releasing this lock while we've been waiting, this is more likely to be
  // a contention/throughput issue than an issue with something hung while
  // holding the lock.
  $limit = 100;
  $window_start = ($now - $wait);
  $recent = $table->loadAllWhere(
    'lockName = %s AND dateCreated >= %d ORDER BY id ASC LIMIT %d',
    $lock_name,
    $window_start,
    $limit);

  if ($recent) {
    // The query is capped at "$limit" rows, so a full page means there may
    // be more rows than we loaded.
    if (count($recent) !== $limit) {
      return pht(
        'During the last %s second(s) spent waiting for the lock, %s '.
        'other process(es) acquired it, so this is likely a '.
        'bottleneck. Use "bin/lock log --name %s" to review log activity.',
        new PhutilNumber($wait),
        phutil_count($recent),
        $lock_name);
    }

    return pht(
      'During the last %s second(s) spent waiting for the lock, more '.
      'than %s other process(es) acquired it, so this is likely a '.
      'bottleneck. Use "bin/lock log --name %s" to review log activity.',
      new PhutilNumber($wait),
      new PhutilNumber($limit),
      $lock_name);
  }

  // Nothing recent: fall back to the newest log row for this lock, if any.
  $newest = id(new PhabricatorDaemonLockLog())->loadOneWhere(
    'lockName = %s ORDER BY id DESC LIMIT 1',
    $lock_name);

  if (!$newest) {
    return pht(
      'Found no records of processes acquiring or releasing this lock.');
  }

  $acquired_at = $newest->getDateCreated();
  $holder = $newest->getLockContext();

  // Render each recorded context field as "key=value", in a fixed order,
  // skipping fields which are missing or empty.
  $fields = array('pid', 'host', 'sapi', 'argv', 'controller', 'method');
  $pairs = array();
  foreach ($fields as $field) {
    $value = idx($holder, $field);
    if (!$value) {
      continue;
    }

    if ($field === 'argv') {
      // The argument vector is formatted through csprintf() and cast so it
      // can be concatenated as a single string.
      $value = (string)csprintf('%LR', $value);
    }

    $pairs[] = $field.'='.$value;
  }
  $holder_text = implode(', ', $pairs);

  $acquired_text = pht(
    'This lock was most recently acquired by a process (%s) '.
    '%s second(s) ago.',
    $holder_text,
    new PhutilNumber($now - $acquired_at));

  $released_at = $newest->getLockReleased();
  if (!$released_at) {
    $released_text = pht('There is no record of this lock being released.');
  } else {
    $released_text = pht(
      'This lock was released %s second(s) ago.',
      new PhutilNumber($now - $released_at));
  }

  return implode(' ', array($acquired_text, $released_text));
}
|
|
|
|
|
2012-06-27 13:59:12 -07:00
|
|
|
}
|