1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2024-11-19 13:22:42 +01:00

Automatically sever databases after prolonged unreachability

Summary:
Ref T4571. When a database goes down briefly, we fall back to replicas.

However, this fallback is slow (not good for users) and keeps sending a lot of traffic to the master (might be bad if the root cause is load-related).

Keep track of recent connections and fully degrade into "severed" mode if we see a sequence of failures over a reasonable period of time. In this mode, we send much less traffic to the master (faster for users; less load for the database).

We still send a little bit of traffic, and if the master recovers we'll recover back into normal mode after seeing several connections in a row succeed.

This is similar to what most load balancers do when pulling web servers in and out of pools.

For now, the specific numbers are:

  - We do at most one health check every 3 seconds.
  - If 5 checks in a row fail or succeed, we sever or un-sever the database (so it takes about 15 seconds to switch modes).
  - If the database is currently marked unhealthy, we reduce timeouts and retries when connecting to it.

Test Plan:
  - Configured a bad `master`.
  - Browsed around for a bit, initially saw "unreachable master" errors.
  - After about 15 seconds, saw "major interruption" errors instead.
  - Fixed the config for `master`.
  - Browsed around for a while longer.
  - After about 15 seconds, things recovered.
  - Used "Cluster Databases" console to keep an eye on health checks: it now shows how many recent health checks were good:

{F1213397}

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T4571

Differential Revision: https://secure.phabricator.com/D15677
This commit is contained in:
epriestley 2016-04-10 14:18:09 -07:00
parent 5cf09f567a
commit ebff07d019
6 changed files with 278 additions and 6 deletions

View file

@ -2241,6 +2241,7 @@ phutil_register_library_map(array(
'PhabricatorDashboardViewController' => 'applications/dashboard/controller/PhabricatorDashboardViewController.php',
'PhabricatorDataCacheSpec' => 'applications/cache/spec/PhabricatorDataCacheSpec.php',
'PhabricatorDataNotAttachedException' => 'infrastructure/storage/lisk/PhabricatorDataNotAttachedException.php',
'PhabricatorDatabaseHealthRecord' => 'infrastructure/cluster/PhabricatorDatabaseHealthRecord.php',
'PhabricatorDatabaseRef' => 'infrastructure/cluster/PhabricatorDatabaseRef.php',
'PhabricatorDatabaseSetupCheck' => 'applications/config/check/PhabricatorDatabaseSetupCheck.php',
'PhabricatorDatasourceEditField' => 'applications/transactions/editfield/PhabricatorDatasourceEditField.php',
@ -6697,6 +6698,7 @@ phutil_register_library_map(array(
'PhabricatorDashboardViewController' => 'PhabricatorDashboardController',
'PhabricatorDataCacheSpec' => 'PhabricatorCacheSpec',
'PhabricatorDataNotAttachedException' => 'Exception',
'PhabricatorDatabaseHealthRecord' => 'Phobject',
'PhabricatorDatabaseRef' => 'Phobject',
'PhabricatorDatabaseSetupCheck' => 'PhabricatorSetupCheck',
'PhabricatorDatasourceEditField' => 'PhabricatorTokenizerEditField',

View file

@ -174,6 +174,11 @@ final class PhabricatorCaches extends Phobject {
* @task setup
*/
private static function buildSetupCaches() {
// If this is the CLI, just build a setup cache.
if (php_sapi_name() == 'cli') {
return array();
}
// In most cases, we should have APC. This is an ideal cache for our
// purposes -- it's fast and empties on server restart.
$apc = new PhutilAPCKeyValueCache();

View file

@ -115,6 +115,29 @@ final class PhabricatorConfigClusterDatabasesController
$replica_label,
);
$health = $database->getHealthRecord();
$health_up = $health->getUpEventCount();
$health_down = $health->getDownEventCount();
if ($health->getIsHealthy()) {
$health_icon = id(new PHUIIconView())
->setIcon('fa-plus green');
} else {
$health_icon = id(new PHUIIconView())
->setIcon('fa-times red');
}
$health_count = pht(
'%s / %s',
new PhutilNumber($health_up),
new PhutilNumber($health_up + $health_down));
$health_status = array(
$health_icon,
' ',
$health_count,
);
$messages = array();
$conn_message = $database->getConnectionMessage();
@ -136,10 +159,12 @@ final class PhabricatorConfigClusterDatabasesController
$database->getUser(),
$connection,
$replication,
$health_status,
$messages,
);
}
$table = id(new AphrontTableView($rows))
->setNoDataString(
pht('Phabricator is not configured in cluster mode.'))
@ -151,6 +176,7 @@ final class PhabricatorConfigClusterDatabasesController
pht('User'),
pht('Connection'),
pht('Replication'),
pht('Health'),
pht('Messages'),
))
->setColumnClasses(
@ -161,6 +187,7 @@ final class PhabricatorConfigClusterDatabasesController
null,
null,
null,
null,
'wide',
));

View file

@ -0,0 +1,185 @@
<?php
/**
 * Tracks the recent health-check history of a single database host.
 *
 * The record is persisted in the setup cache under a key derived from the
 * host and port of the database ref, and has three parts:
 *
 *   - `up`: whether the database is currently considered healthy;
 *   - `lastCheck`: when a request last claimed the right to run a check;
 *   - `log`: the most recent check results, each with `timestamp` and `up`.
 *
 * The `up` state only flips once the log holds
 * @{method:getRequiredEventCount} results which all agree, and checks are
 * claimed at most once per @{method:getHealthCheckFrequency} seconds.
 */
final class PhabricatorDatabaseHealthRecord
  extends Phobject {

  private $ref;
  private $shouldCheck;
  private $isHealthy;
  private $upEventCount;
  private $downEventCount;

  /**
   * Load the stored health state for a database ref.
   *
   * Constructing a record may also claim the next health check for the
   * current request; see @{method:getShouldCheck}.
   *
   * @param PhabricatorDatabaseRef Database to track health for.
   */
  public function __construct(PhabricatorDatabaseRef $ref) {
    $this->ref = $ref;
    $this->readState();
  }


  /**
   * Is the database currently healthy?
   */
  public function getIsHealthy() {
    return $this->isHealthy;
  }


  /**
   * Should this request check database health?
   *
   * This becomes false once @{method:didHealthCheck} has been called.
   */
  public function getShouldCheck() {
    return $this->shouldCheck;
  }


  /**
   * How many recent health checks were successful?
   */
  public function getUpEventCount() {
    return $this->upEventCount;
  }


  /**
   * How many recent health checks failed?
   */
  public function getDownEventCount() {
    return $this->downEventCount;
  }


  /**
   * Number of failures or successes we need to see in a row before we change
   * the state.
   */
  public function getRequiredEventCount() {
    return 5;
  }


  /**
   * Seconds to wait between health checks.
   */
  public function getHealthCheckFrequency() {
    return 3;
  }


  /**
   * Record the outcome of a health check.
   *
   * The result is appended to the stored log, the log is trimmed to the
   * required event count, and the stored up/down state is flipped if every
   * remaining event agrees. If the log already contains an event recorded
   * within the check frequency window, the result is discarded instead.
   *
   * In both cases this request is marked as having finished its check, so
   * @{method:getShouldCheck} returns false afterwards.
   *
   * @param bool Truthy if the check succeeded, falsey if it failed.
   * @return this
   */
  public function didHealthCheck($result) {
    $now = microtime(true);
    $check_frequency = $this->getHealthCheckFrequency();
    $event_count = $this->getRequiredEventCount();

    $record = $this->readHealthRecord();

    $log = $record['log'];
    foreach ($log as $event) {
      $when = idx($event, 'timestamp');

      // If the log already has another nearby event, just ignore this one.
      // We raced with another process and our result can just be thrown away.
      if (($now - $when) <= $check_frequency) {
        // This request has still used up its check. Clear the flag so the
        // caller does not keep retrying for the remainder of the request.
        $this->shouldCheck = false;
        return $this;
      }
    }

    $log[] = array(
      'timestamp' => $now,
      'up' => $result,
    );

    // Throw away older events which are now obsolete.
    $record['log'] = array_slice($log, -$event_count);

    // Tally the trimmed log once; the same counts drive both the public
    // statistics and the state transition below.
    $this->updateStatistics($record);

    // If all of the events are the same, change the state. With a mixed or
    // short log, the previous state is kept.
    if ($this->upEventCount === $event_count) {
      $record['up'] = true;
    } else if ($this->downEventCount === $event_count) {
      $record['up'] = false;
    }

    $this->writeHealthRecord($record);

    $this->isHealthy = $record['up'];
    $this->shouldCheck = false;

    return $this;
  }


  /**
   * Load the stored record and decide whether this request should run a
   * health check.
   *
   * If at least one check interval has passed since the stored `lastCheck`,
   * the timestamp is bumped and written back immediately, and this request
   * is flagged to perform the check.
   */
  private function readState() {
    $now = microtime(true);
    $check_frequency = $this->getHealthCheckFrequency();

    $record = $this->readHealthRecord();

    $last_check = $record['lastCheck'];

    if (($now - $last_check) >= $check_frequency) {
      $record['lastCheck'] = $now;
      $this->writeHealthRecord($record);
      $this->shouldCheck = true;
    } else {
      $this->shouldCheck = false;
    }

    $this->isHealthy = $record['up'];
    $this->updateStatistics($record);
  }


  /**
   * Recompute the up/down event counters from the log of a record.
   *
   * @param map<string, wild> Health record with a `log` list.
   */
  private function updateStatistics(array $record) {
    $this->upEventCount = 0;
    $this->downEventCount = 0;

    foreach ($record['log'] as $event) {
      if ($event['up']) {
        $this->upEventCount++;
      } else {
        $this->downEventCount++;
      }
    }
  }


  /**
   * Build the cache key for this database, derived from its host and port.
   */
  private function getHealthRecordCacheKey() {
    $ref = $this->ref;

    $host = $ref->getHost();
    $port = $ref->getPort();

    return "cluster.db.health({$host}, {$port})";
  }


  /**
   * Read the stored health record from the setup cache.
   *
   * If nothing usable is stored, a default "healthy, never checked, empty
   * log" record is returned. Any keys missing from a stored record are
   * filled in from the same defaults.
   *
   * @return map<string, wild> Record with `up`, `lastCheck` and `log` keys.
   */
  private function readHealthRecord() {
    $defaults = array(
      'up' => true,
      'lastCheck' => 0,
      'log' => array(),
    );

    $cache = PhabricatorCaches::getSetupCache();
    $cache_key = $this->getHealthRecordCacheKey();
    $health_record = $cache->getKey($cache_key);

    if (!is_array($health_record)) {
      return $defaults;
    }

    // Array union keeps every stored value and only adds missing keys.
    return $health_record + $defaults;
  }


  /**
   * Write a health record to the setup cache.
   *
   * @param map<string, wild> Record to store.
   */
  private function writeHealthRecord(array $record) {
    $cache = PhabricatorCaches::getSetupCache();
    $cache_key = $this->getHealthRecordCacheKey();
    $cache->setKey($cache_key, $record);
  }

}

View file

@ -30,6 +30,7 @@ final class PhabricatorDatabaseRef
private $replicaMessage;
private $replicaDelay;
private $healthRecord;
private $didFailToConnect;
public function setHost($host) {
@ -326,7 +327,7 @@ final class PhabricatorDatabaseRef
return $this->newConnection(
array(
'retries' => 0,
'timeout' => 3,
'timeout' => 2,
));
}
@ -338,11 +339,24 @@ final class PhabricatorDatabaseRef
}
public function isSevered() {
return $this->didFailToConnect;
if ($this->didFailToConnect) {
return true;
}
$record = $this->getHealthRecord();
$is_healthy = $record->getIsHealthy();
if (!$is_healthy) {
return true;
}
return false;
}
public function isReachable(AphrontDatabaseConnection $connection) {
if ($this->isSevered()) {
$record = $this->getHealthRecord();
$should_check = $record->getShouldCheck();
if ($this->isSevered() && !$should_check) {
return false;
}
@ -353,6 +367,10 @@ final class PhabricatorDatabaseRef
$reachable = false;
}
if ($should_check) {
$record->didHealthCheck($reachable);
}
if (!$reachable) {
$this->didFailToConnect = true;
}
@ -360,6 +378,26 @@ final class PhabricatorDatabaseRef
return $reachable;
}
/**
 * Run a health check against this database if its health record says this
 * request should perform one; otherwise do nothing.
 *
 * @return this
 */
public function checkHealth() {
  if ($this->getHealthRecord()->getShouldCheck()) {
    // Probing reachability updates the health record as a side effect, so
    // the return value is not needed here.
    $this->isReachable($this->newManagementConnection());
  }

  return $this;
}
/**
 * Get the health record for this database, building it on first use and
 * reusing the same object on later calls.
 *
 * @return PhabricatorDatabaseHealthRecord
 */
public function getHealthRecord() {
  if ($this->healthRecord) {
    return $this->healthRecord;
  }

  $this->healthRecord = new PhabricatorDatabaseHealthRecord($this);

  return $this->healthRecord;
}
public static function getMasterDatabaseRef() {
$refs = self::getLiveRefs();
@ -415,14 +453,26 @@ final class PhabricatorDatabaseRef
}
private function newConnection(array $options) {
// If we believe the database is unhealthy, don't spend as much time
// trying to connect to it, since it's likely to continue to fail and
// hammering it can only make the problem worse.
$record = $this->getHealthRecord();
if ($record->getIsHealthy()) {
$default_retries = 3;
$default_timeout = 10;
} else {
$default_retries = 0;
$default_timeout = 2;
}
$spec = $options + array(
'user' => $this->getUser(),
'pass' => $this->getPass(),
'host' => $this->getHost(),
'port' => $this->getPort(),
'database' => null,
'retries' => 3,
'timeout' => 15,
'retries' => $default_retries,
'timeout' => $default_timeout,
);
return PhabricatorEnv::newObjectFromConfig(

View file

@ -220,7 +220,10 @@ final class PhabricatorEnv extends Phobject {
if (!$master) {
self::setReadOnly(true, self::READONLY_MASTERLESS);
} else if ($master->isSevered()) {
self::setReadOnly(true, self::READONLY_SEVERED);
$master->checkHealth();
if ($master->isSevered()) {
self::setReadOnly(true, self::READONLY_SEVERED);
}
}
try {