1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2024-09-20 01:08:50 +02:00

Automatically degrade to read-only mode when unable to connect to the master

Summary:
Ref T4571. If we fail to connect to the master, automatically try to degrade into a temporary read-only mode ("UNREACHABLE") for the remainder of the request, if possible.

If the request was something like "load the homepage", that'll work fine. If it was something like "submit a comment", there's nothing we can do and we just have to fail.

Detecting this condition imposes a performance penalty: every request checks the connection and gives the database a long time to respond, since we don't want to drop writes unless we have to. So the degraded mode works, but it's really slow, and may perpetuate the problem if the root issue is load-related.

This lays the groundwork for improving this case by degrading further into a "SEVERED" mode which will persist across requests. In the future, if several requests in a short period of time fail, we'll sever the database host and refuse to try to connect to it for a little while, connecting directly to replicas instead (basically, we're "health checking" the master, like a load balancer would health check a web application server). This will give us a better (much faster) degraded mode in a major service disruption, and reduce load on the master if the root cause is load-related, giving it a better chance of recovering on its own.

Test Plan:
  - Disabled master in config by changing the host/username, got degraded automatically to UNREACHABLE mode immediately.
  - Faked full SEVERED mode, requests hit replicas and put me in the mode properly.
  - Made stuff work, hit some good pages.
  - Hit some non-cluster pages.

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T4571

Differential Revision: https://secure.phabricator.com/D15674
This commit is contained in:
epriestley 2016-04-10 05:51:34 -07:00
parent e0a8cac703
commit 146fb646f9
8 changed files with 209 additions and 35 deletions

View file

@ -1989,7 +1989,9 @@ phutil_register_library_map(array(
'PhabricatorClusterDatabasesConfigOptionType' => 'infrastructure/cluster/PhabricatorClusterDatabasesConfigOptionType.php',
'PhabricatorClusterException' => 'infrastructure/cluster/PhabricatorClusterException.php',
'PhabricatorClusterExceptionHandler' => 'infrastructure/cluster/PhabricatorClusterExceptionHandler.php',
'PhabricatorClusterImpossibleWriteException' => 'infrastructure/cluster/PhabricatorClusterImpossibleWriteException.php',
'PhabricatorClusterImproperWriteException' => 'infrastructure/cluster/PhabricatorClusterImproperWriteException.php',
'PhabricatorClusterStrandedException' => 'infrastructure/cluster/PhabricatorClusterStrandedException.php',
'PhabricatorColumnProxyInterface' => 'applications/project/interface/PhabricatorColumnProxyInterface.php',
'PhabricatorColumnsEditField' => 'applications/transactions/editfield/PhabricatorColumnsEditField.php',
'PhabricatorCommentEditEngineExtension' => 'applications/transactions/engineextension/PhabricatorCommentEditEngineExtension.php',
@ -6402,7 +6404,9 @@ phutil_register_library_map(array(
'PhabricatorClusterDatabasesConfigOptionType' => 'PhabricatorConfigJSONOptionType',
'PhabricatorClusterException' => 'Exception',
'PhabricatorClusterExceptionHandler' => 'PhabricatorRequestExceptionHandler',
'PhabricatorClusterImpossibleWriteException' => 'PhabricatorClusterException',
'PhabricatorClusterImproperWriteException' => 'PhabricatorClusterException',
'PhabricatorClusterStrandedException' => 'PhabricatorClusterException',
'PhabricatorColumnsEditField' => 'PhabricatorPHIDListEditField',
'PhabricatorCommentEditEngineExtension' => 'PhabricatorEditEngineExtension',
'PhabricatorCommentEditField' => 'PhabricatorEditField',

View file

@ -8,6 +8,7 @@ final class PhabricatorSystemReadOnlyController
}
public function handleRequest(AphrontRequest $request) {
$viewer = $this->getViewer();
$reason = $request->getURIData('reason');
$body = array();
@ -48,15 +49,77 @@ final class PhabricatorSystemReadOnlyController
phutil_tag('tt', array(), 'cluster.databases'));
$button = pht('Wait Patiently');
break;
case PhabricatorEnv::READONLY_UNREACHABLE:
$title = pht('Unable to Reach Master');
$body[] = pht(
'Phabricator was unable to connect to the writable ("master") '.
'database while handling this request, and automatically degraded '.
'into read-only mode.');
$body[] = pht(
'This may happen if there is a temporary network anomaly on the '.
'server side, like cosmic radiation or spooky ghosts. If this '.
'failure was caused by a transient service interruption, '.
'Phabricator will recover momentarily.');
$body[] = pht(
'This may also indicate that a more serious failure has occurred. '.
'If this interruption does not resolve on its own, Phabricator '.
'will soon detect the persistent disruption and degrade into '.
'read-only mode until the issue is resolved.');
$button = pht('Quite Unsettling');
break;
case PhabricatorEnv::READONLY_SEVERED:
$title = pht('Severed From Master');
$body[] = pht(
'Phabricator has consistently been unable to reach the writable '.
'("master") database while processing recent requests.');
$body[] = pht(
'This likely indicates a severe misconfiguration or major service '.
'interruption.');
$body[] = pht(
'Phabricator will periodically retry the connection and recover '.
'once service is restored. Most causes of persistent service '.
'interruption will require administrative intervention in order '.
'to restore service.');
$body[] = pht(
'Although this may be the result of a misconfiguration or '.
'operational error, this is also the state you reach if a '.
'meteor recently obliterated a datacenter.');
$button = pht('Panic!');
break;
default:
return new Aphront404Response();
}
switch ($reason) {
case PhabricatorEnv::READONLY_UNREACHABLE:
case PhabricatorEnv::READONLY_SEVERED:
$body[] = pht(
'This request was served from a replica database. Replica '.
'databases may lag behind the master, so very recent activity '.
'may not be reflected in the UI. This data will be restored if '.
'the master database is restored, but may have been lost if the '.
'master database has been reduced to a pile of ash.');
break;
}
$body[] = pht(
'In read-only mode you can read existing information, but you will not '.
'be able to edit objects or create new objects until this mode is '.
'disabled.');
if ($viewer->getIsAdmin()) {
$body[] = pht(
'As an administrator, you can review status information from the '.
'%s control panel. This may provide more information about the '.
'current state of affairs.',
phutil_tag(
'a',
array(
'href' => '/config/cluster/databases/',
),
pht('Cluster Database Status')));
}
$dialog = $this->newDialog()
->setTitle($title)
->setWidth(AphrontDialogView::WIDTH_FORM)

View file

@ -25,11 +25,15 @@ final class PhabricatorClusterExceptionHandler
$title = $ex->getExceptionTitle();
return id(new AphrontDialogView())
$dialog = id(new AphrontDialogView())
->setTitle($title)
->setUser($viewer)
->appendParagraph($ex->getMessage())
->addCancelButton('/', pht('Proceed With Caution'));
return id(new AphrontDialogResponse())
->setDialog($dialog)
->setHTTPResponseCode(500);
}
}

View file

@ -0,0 +1,10 @@
<?php
/**
 * Cluster exception used when a write connection is required but the master
 * database cannot be reached.
 */
final class PhabricatorClusterImpossibleWriteException
extends PhabricatorClusterException {
// Short, human-readable title for this failure.
public function getExceptionTitle() {
return pht('Impossible Cluster Write');
}
}

View file

@ -0,0 +1,10 @@
<?php
/**
 * Cluster exception used when no database host can be reached at all.
 */
final class PhabricatorClusterStrandedException
extends PhabricatorClusterException {
// Short, human-readable title for this failure.
public function getExceptionTitle() {
return pht('Unable to Reach Any Database');
}
}

View file

@ -13,6 +13,8 @@ final class PhabricatorDatabaseRef
const REPLICATION_REPLICA_NONE = 'replica-none';
const REPLICATION_SLOW = 'replica-slow';
const KEY_REFS = 'cluster.db.refs';
private $host;
private $port;
private $user;
@ -28,6 +30,8 @@ final class PhabricatorDatabaseRef
private $replicaMessage;
private $replicaDelay;
private $didFailToConnect;
public function setHost($host) {
$this->host = $host;
return $this;
@ -190,7 +194,19 @@ final class PhabricatorDatabaseRef
);
}
public static function loadAll() {
/**
 * Get the database refs for the current request, building them on first use.
 *
 * The refs are stored in the request cache under KEY_REFS so that later
 * callers receive the same ref objects instead of freshly built ones.
 *
 * @return wild Value produced by newRefs(), possibly served from the cache.
 */
public static function getLiveRefs() {
  $request_cache = PhabricatorCaches::getRequestCache();

  $cached = $request_cache->getKey(self::KEY_REFS);
  if ($cached) {
    return $cached;
  }

  // Nothing usable in the cache yet: build the refs and remember them.
  $fresh = self::newRefs();
  $request_cache->setKey(self::KEY_REFS, $fresh);

  return $fresh;
}
public static function newRefs() {
$refs = array();
$default_port = PhabricatorEnv::getEnvConfig('mysql.port');
@ -232,7 +248,7 @@ final class PhabricatorDatabaseRef
}
public static function queryAll() {
$refs = self::loadAll();
$refs = self::newRefs();
foreach ($refs as $ref) {
if ($ref->getDisabled()) {
@ -242,6 +258,7 @@ final class PhabricatorDatabaseRef
$conn = $ref->newManagementConnection();
$t_start = microtime(true);
$replica_status = false;
try {
$replica_status = queryfx_one($conn, 'SHOW SLAVE STATUS');
$ref->setConnectionStatus(self::STATUS_OKAY);
@ -269,33 +286,35 @@ final class PhabricatorDatabaseRef
$t_end = microtime(true);
$ref->setConnectionLatency($t_end - $t_start);
$is_replica = (bool)$replica_status;
if ($ref->getIsMaster() && $is_replica) {
$ref->setReplicaStatus(self::REPLICATION_MASTER_REPLICA);
$ref->setReplicaMessage(
pht(
'This host has a "master" role, but is replicating data from '.
'another host ("%s")!',
idx($replica_status, 'Master_Host')));
} else if (!$ref->getIsMaster() && !$is_replica) {
$ref->setReplicaStatus(self::REPLICATION_REPLICA_NONE);
$ref->setReplicaMessage(
pht(
'This host has a "replica" role, but is not replicating data '.
'from a master (no output from "SHOW SLAVE STATUS").'));
} else {
$ref->setReplicaStatus(self::REPLICATION_OKAY);
}
if ($is_replica) {
$latency = (int)idx($replica_status, 'Seconds_Behind_Master');
$ref->setReplicaDelay($latency);
if ($latency > 30) {
$ref->setReplicaStatus(self::REPLICATION_SLOW);
if ($replica_status !== false) {
$is_replica = (bool)$replica_status;
if ($ref->getIsMaster() && $is_replica) {
$ref->setReplicaStatus(self::REPLICATION_MASTER_REPLICA);
$ref->setReplicaMessage(
pht(
'This replica is lagging far behind the master. Data is at '.
'risk!'));
'This host has a "master" role, but is replicating data from '.
'another host ("%s")!',
idx($replica_status, 'Master_Host')));
} else if (!$ref->getIsMaster() && !$is_replica) {
$ref->setReplicaStatus(self::REPLICATION_REPLICA_NONE);
$ref->setReplicaMessage(
pht(
'This host has a "replica" role, but is not replicating data '.
'from a master (no output from "SHOW SLAVE STATUS").'));
} else {
$ref->setReplicaStatus(self::REPLICATION_OKAY);
}
if ($is_replica) {
$latency = (int)idx($replica_status, 'Seconds_Behind_Master');
$ref->setReplicaDelay($latency);
if ($latency > 30) {
$ref->setReplicaStatus(self::REPLICATION_SLOW);
$ref->setReplicaMessage(
pht(
'This replica is lagging far behind the master. Data is at '.
'risk!'));
}
}
}
}
@ -318,8 +337,31 @@ final class PhabricatorDatabaseRef
));
}
/**
 * Report whether a connection attempt against this ref has already failed.
 *
 * @return bool|null The didFailToConnect flag. It is only ever assigned
 *   `true` (by isReachable()), so it stays null until a failure is recorded.
 */
public function isSevered() {
return $this->didFailToConnect;
}
/**
 * Test whether this ref can be reached using the given connection.
 *
 * If the ref is already marked as failed, no connection attempt is made.
 * Otherwise the connection is opened; a thrown Exception marks the ref as
 * failed so later checks short-circuit.
 *
 * @param AphrontDatabaseConnection $connection Connection to try to open.
 * @return bool True if the connection opened, false otherwise.
 */
public function isReachable(AphrontDatabaseConnection $connection) {
  if ($this->isSevered()) {
    return false;
  }

  try {
    $connection->openConnection();
  } catch (Exception $ex) {
    // Remember the failure so we do not retry this host again.
    $this->didFailToConnect = true;
    return false;
  }

  return true;
}
public static function getMasterDatabaseRef() {
$refs = self::loadAll();
$refs = self::getLiveRefs();
if (!$refs) {
$conf = PhabricatorEnv::newObjectFromConfig(
@ -348,7 +390,7 @@ final class PhabricatorDatabaseRef
}
public static function getReplicaDatabaseRef() {
$refs = self::loadAll();
$refs = self::getLiveRefs();
if (!$refs) {
return null;

View file

@ -60,6 +60,8 @@ final class PhabricatorEnv extends Phobject {
private static $readOnlyReason;
const READONLY_CONFIG = 'config';
const READONLY_UNREACHABLE = 'unreachable';
const READONLY_SEVERED = 'severed';
const READONLY_MASTERLESS = 'masterless';
/**
@ -217,6 +219,8 @@ final class PhabricatorEnv extends Phobject {
$master = PhabricatorDatabaseRef::getMasterDatabaseRef();
if (!$master) {
self::setReadOnly(true, self::READONLY_MASTERLESS);
} else if ($master->isSevered()) {
self::setReadOnly(true, self::READONLY_SEVERED);
}
try {
@ -468,6 +472,12 @@ final class PhabricatorEnv extends Phobject {
return pht(
'Phabricator is in read-only mode (no writable database '.
'is configured).');
case self::READONLY_UNREACHABLE:
return pht(
'Phabricator is in read-only mode (unreachable master).');
case self::READONLY_SEVERED:
return pht(
'Phabricator is in read-only mode (major interruption).');
}
return pht('Phabricator is in read-only mode.');

View file

@ -60,8 +60,8 @@ abstract class PhabricatorLiskDAO extends LiskDAO {
$this->raiseImproperWrite($database);
}
$refs = PhabricatorDatabaseRef::loadAll();
if ($refs) {
$is_cluster = (bool)PhabricatorEnv::getEnvConfig('cluster.databases');
if ($is_cluster) {
$connection = $this->newClusterConnection($database, $mode);
} else {
$connection = $this->newBasicConnection($database, $mode, $namespace);
@ -99,8 +99,19 @@ abstract class PhabricatorLiskDAO extends LiskDAO {
private function newClusterConnection($database, $mode) {
$master = PhabricatorDatabaseRef::getMasterDatabaseRef();
if ($master) {
return $master->newApplicationConnection($database);
if ($master && !$master->isSevered()) {
$connection = $master->newApplicationConnection($database);
if ($master->isReachable($connection)) {
return $connection;
} else {
if ($mode == 'w') {
$this->raiseImpossibleWrite($database);
}
PhabricatorEnv::setReadOnly(
true,
PhabricatorEnv::READONLY_UNREACHABLE);
}
}
$replica = PhabricatorDatabaseRef::getReplicaDatabaseRef();
@ -111,8 +122,11 @@ abstract class PhabricatorLiskDAO extends LiskDAO {
$connection = $replica->newApplicationConnection($database);
$connection->setReadOnly(true);
if ($replica->isReachable($connection)) {
return $connection;
}
return $connection;
$this->raiseUnreachable($database);
}
private function raiseImproperWrite($database) {
@ -124,6 +138,23 @@ abstract class PhabricatorLiskDAO extends LiskDAO {
$database));
}
/**
 * Abort because a write was requested but the master is unreachable.
 *
 * @param string $database Database name, included in the message.
 * @throws PhabricatorClusterImpossibleWriteException Always.
 */
private function raiseImpossibleWrite($database) {
  $message = pht(
    'Unable to connect to master database ("%s"). This is a severe '.
    'failure; your request did not complete.',
    $database);

  throw new PhabricatorClusterImpossibleWriteException($message);
}
/**
 * Abort because no database host could be reached.
 *
 * @param string $database Database name, included in the message.
 * @throws PhabricatorClusterStrandedException Always.
 */
private function raiseUnreachable($database) {
  $message = pht(
    'Unable to establish a connection to ANY database host '.
    '(while trying "%s"). All masters and replicas are completely '.
    'unreachable.',
    $database);

  throw new PhabricatorClusterStrandedException($message);
}
/**
* @task config