mirror of
https://we.phorge.it/source/phorge.git
synced 2024-12-24 06:20:56 +01:00
Automatically sever databases after prolonged unreachability
Summary: Ref T4571. When a database goes down briefly, we fall back to replicas. However, this fallback is slow (not good for users) and keeps sending a lot of traffic to the master (might be bad if the root cause is load-related). Keep track of recent connections and fully degrade into "severed" mode if we see a sequence of failures over a reasonable period of time. In this mode, we send much less traffic to the master (faster for users; less load for the database). We do send a little bit of traffic still, and if the master recovers we'll recover back into normal mode after seeing several connections in a row succeed. This is similar to what most load balancers do when pulling web servers in and out of pools. For now, the specific numbers are: - We do at most one health check every 3 seconds. - If 5 checks in a row fail or succeed, we sever or un-sever the database (so it takes about 15 seconds to switch modes). - If the database is currently marked unhealthy, we reduce timeouts and retries when connecting to it. Test Plan: - Configured a bad `master`. - Browsed around for a bit, initially saw "unreachable master" errors. - After about 15 seconds, saw "major interruption" errors instead. - Fixed the config for `master`. - Browsed around for a while longer. - After about 15 seconds, things recovered. - Used "Cluster Databases" console to keep an eye on health checks: it now shows how many recent health checks were good: {F1213397} Reviewers: chad Reviewed By: chad Maniphest Tasks: T4571 Differential Revision: https://secure.phabricator.com/D15677
This commit is contained in:
parent
5cf09f567a
commit
ebff07d019
6 changed files with 278 additions and 6 deletions
|
@ -2241,6 +2241,7 @@ phutil_register_library_map(array(
|
|||
'PhabricatorDashboardViewController' => 'applications/dashboard/controller/PhabricatorDashboardViewController.php',
|
||||
'PhabricatorDataCacheSpec' => 'applications/cache/spec/PhabricatorDataCacheSpec.php',
|
||||
'PhabricatorDataNotAttachedException' => 'infrastructure/storage/lisk/PhabricatorDataNotAttachedException.php',
|
||||
'PhabricatorDatabaseHealthRecord' => 'infrastructure/cluster/PhabricatorDatabaseHealthRecord.php',
|
||||
'PhabricatorDatabaseRef' => 'infrastructure/cluster/PhabricatorDatabaseRef.php',
|
||||
'PhabricatorDatabaseSetupCheck' => 'applications/config/check/PhabricatorDatabaseSetupCheck.php',
|
||||
'PhabricatorDatasourceEditField' => 'applications/transactions/editfield/PhabricatorDatasourceEditField.php',
|
||||
|
@ -6697,6 +6698,7 @@ phutil_register_library_map(array(
|
|||
'PhabricatorDashboardViewController' => 'PhabricatorDashboardController',
|
||||
'PhabricatorDataCacheSpec' => 'PhabricatorCacheSpec',
|
||||
'PhabricatorDataNotAttachedException' => 'Exception',
|
||||
'PhabricatorDatabaseHealthRecord' => 'Phobject',
|
||||
'PhabricatorDatabaseRef' => 'Phobject',
|
||||
'PhabricatorDatabaseSetupCheck' => 'PhabricatorSetupCheck',
|
||||
'PhabricatorDatasourceEditField' => 'PhabricatorTokenizerEditField',
|
||||
|
|
5
src/applications/cache/PhabricatorCaches.php
vendored
5
src/applications/cache/PhabricatorCaches.php
vendored
|
@ -174,6 +174,11 @@ final class PhabricatorCaches extends Phobject {
|
|||
* @task setup
|
||||
*/
|
||||
private static function buildSetupCaches() {
|
||||
// If this is the CLI, just build a setup cache.
|
||||
if (php_sapi_name() == 'cli') {
|
||||
return array();
|
||||
}
|
||||
|
||||
// In most cases, we should have APC. This is an ideal cache for our
|
||||
// purposes -- it's fast and empties on server restart.
|
||||
$apc = new PhutilAPCKeyValueCache();
|
||||
|
|
|
@ -115,6 +115,29 @@ final class PhabricatorConfigClusterDatabasesController
|
|||
$replica_label,
|
||||
);
|
||||
|
||||
$health = $database->getHealthRecord();
|
||||
$health_up = $health->getUpEventCount();
|
||||
$health_down = $health->getDownEventCount();
|
||||
|
||||
if ($health->getIsHealthy()) {
|
||||
$health_icon = id(new PHUIIconView())
|
||||
->setIcon('fa-plus green');
|
||||
} else {
|
||||
$health_icon = id(new PHUIIconView())
|
||||
->setIcon('fa-times red');
|
||||
}
|
||||
|
||||
$health_count = pht(
|
||||
'%s / %s',
|
||||
new PhutilNumber($health_up),
|
||||
new PhutilNumber($health_up + $health_down));
|
||||
|
||||
$health_status = array(
|
||||
$health_icon,
|
||||
' ',
|
||||
$health_count,
|
||||
);
|
||||
|
||||
$messages = array();
|
||||
|
||||
$conn_message = $database->getConnectionMessage();
|
||||
|
@ -136,10 +159,12 @@ final class PhabricatorConfigClusterDatabasesController
|
|||
$database->getUser(),
|
||||
$connection,
|
||||
$replication,
|
||||
$health_status,
|
||||
$messages,
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
$table = id(new AphrontTableView($rows))
|
||||
->setNoDataString(
|
||||
pht('Phabricator is not configured in cluster mode.'))
|
||||
|
@ -151,6 +176,7 @@ final class PhabricatorConfigClusterDatabasesController
|
|||
pht('User'),
|
||||
pht('Connection'),
|
||||
pht('Replication'),
|
||||
pht('Health'),
|
||||
pht('Messages'),
|
||||
))
|
||||
->setColumnClasses(
|
||||
|
@ -161,6 +187,7 @@ final class PhabricatorConfigClusterDatabasesController
|
|||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
'wide',
|
||||
));
|
||||
|
||||
|
|
185
src/infrastructure/cluster/PhabricatorDatabaseHealthRecord.php
Normal file
185
src/infrastructure/cluster/PhabricatorDatabaseHealthRecord.php
Normal file
|
@ -0,0 +1,185 @@
|
|||
<?php
|
||||
|
||||
/**
 * Tracks recent health check results for a single database ref.
 *
 * The record is stored in the setup cache under a key derived from the ref's
 * host and port. It holds the current up/down state, the time the last check
 * was claimed, and a short log of recent check results. The state only flips
 * once the log is completely filled with identical results.
 */
final class PhabricatorDatabaseHealthRecord
  extends Phobject {

  private $ref;
  private $shouldCheck;
  private $isHealthy;
  private $upEventCount;
  private $downEventCount;

  /**
   * Load the health state for a database ref.
   *
   * Construction reads the cached record and may claim the next health check
   * for this object (see @{method:getShouldCheck}).
   *
   * @param PhabricatorDatabaseRef Database ref to track.
   */
  public function __construct(PhabricatorDatabaseRef $ref) {
    $this->ref = $ref;
    $this->readState();
  }


  /**
   * Is the database currently healthy?
   */
  public function getIsHealthy() {
    return $this->isHealthy;
  }


  /**
   * Should this request check database health?
   */
  public function getShouldCheck() {
    return $this->shouldCheck;
  }


  /**
   * How many recent health checks were successful?
   */
  public function getUpEventCount() {
    return $this->upEventCount;
  }


  /**
   * How many recent health checks failed?
   */
  public function getDownEventCount() {
    return $this->downEventCount;
  }


  /**
   * Number of failures or successes we need to see in a row before we change
   * the state.
   */
  public function getRequiredEventCount() {
    return 5;
  }


  /**
   * Seconds to wait between health checks.
   */
  public function getHealthCheckFrequency() {
    return 3;
  }


  /**
   * Record the result of a health check.
   *
   * Appends the result to the event log, trims the log to the required event
   * count, and flips the stored state if every remaining event agrees. If the
   * log already contains an event within the check frequency window, the
   * result is discarded.
   *
   * Either way, this object stops reporting that a check is needed: the check
   * for this window has been performed (by us or by another process).
   *
   * @param bool True if the database was reachable.
   * @return this
   */
  public function didHealthCheck($result) {
    $now = microtime(true);
    $check_frequency = $this->getHealthCheckFrequency();
    $event_count = $this->getRequiredEventCount();

    $record = $this->readHealthRecord();

    $log = $record['log'];
    foreach ($log as $event) {
      $when = idx($event, 'timestamp');

      // If the log already has another nearby event, just ignore this one.
      // We raced with another process and our result can just be thrown away.
      if (($now - $when) <= $check_frequency) {
        // The check for this window is done, so don't keep asking callers to
        // probe the database again.
        $this->shouldCheck = false;
        return $this;
      }
    }

    $log[] = array(
      'timestamp' => $now,
      'up' => $result,
    );

    // Throw away older events which are now obsolete.
    $log = array_slice($log, -$event_count);
    $record['log'] = $log;

    // Recount up/down events from the trimmed log.
    $this->updateStatistics($record);

    // If all of the events are the same, change the state. A log which is
    // not yet full can never match, so the state is left alone.
    if ($this->upEventCount == $event_count) {
      $record['up'] = true;
    } else if ($this->downEventCount == $event_count) {
      $record['up'] = false;
    }

    $this->writeHealthRecord($record);

    $this->isHealthy = $record['up'];
    $this->shouldCheck = false;

    return $this;
  }


  /**
   * Initialize this object from the cached record.
   *
   * If at least one check interval has elapsed since the last claimed check,
   * claim the next check by writing the current time back to the cache.
   */
  private function readState() {
    $now = microtime(true);
    $check_frequency = $this->getHealthCheckFrequency();

    $record = $this->readHealthRecord();

    $last_check = $record['lastCheck'];

    if (($now - $last_check) >= $check_frequency) {
      $record['lastCheck'] = $now;
      $this->writeHealthRecord($record);
      $this->shouldCheck = true;
    } else {
      $this->shouldCheck = false;
    }

    $this->isHealthy = $record['up'];
    $this->updateStatistics($record);
  }


  /**
   * Recompute the up and down event counts from a record's event log.
   *
   * @param map<string, wild> Health record with a `log` list.
   */
  private function updateStatistics(array $record) {
    $this->upEventCount = 0;
    $this->downEventCount = 0;

    foreach ($record['log'] as $event) {
      if ($event['up']) {
        $this->upEventCount++;
      } else {
        $this->downEventCount++;
      }
    }
  }


  /**
   * Build the cache key for this ref's health record from its host and port.
   */
  private function getHealthRecordCacheKey() {
    $ref = $this->ref;

    $host = $ref->getHost();
    $port = $ref->getPort();

    return "cluster.db.health({$host}, {$port})";
  }


  /**
   * Read the health record from the setup cache.
   *
   * A missing or non-array value yields a default record: healthy, never
   * checked, empty log. Any key missing from a cached array is filled from
   * the same defaults so callers can index the record safely.
   *
   * @return map<string, wild> Record with `up`, `lastCheck` and `log` keys.
   */
  private function readHealthRecord() {
    $cache = PhabricatorCaches::getSetupCache();
    $cache_key = $this->getHealthRecordCacheKey();
    $health_record = $cache->getKey($cache_key);

    $defaults = array(
      'up' => true,
      'lastCheck' => 0,
      'log' => array(),
    );

    if (!is_array($health_record)) {
      return $defaults;
    }

    // Array union keeps every cached value and only adds absent keys.
    return $health_record + $defaults;
  }


  /**
   * Write the health record to the setup cache.
   *
   * @param map<string, wild> Record to store.
   */
  private function writeHealthRecord(array $record) {
    $cache = PhabricatorCaches::getSetupCache();
    $cache_key = $this->getHealthRecordCacheKey();
    $cache->setKey($cache_key, $record);
  }

}
|
|
@ -30,6 +30,7 @@ final class PhabricatorDatabaseRef
|
|||
private $replicaMessage;
|
||||
private $replicaDelay;
|
||||
|
||||
private $healthRecord;
|
||||
private $didFailToConnect;
|
||||
|
||||
public function setHost($host) {
|
||||
|
@ -326,7 +327,7 @@ final class PhabricatorDatabaseRef
|
|||
return $this->newConnection(
|
||||
array(
|
||||
'retries' => 0,
|
||||
'timeout' => 3,
|
||||
'timeout' => 2,
|
||||
));
|
||||
}
|
||||
|
||||
|
@ -338,11 +339,24 @@ final class PhabricatorDatabaseRef
|
|||
}
|
||||
|
||||
/**
 * Should this database be treated as severed?
 *
 * A ref is severed if a connection attempt through it has already failed
 * (`didFailToConnect`), or if its health record reports the database as
 * unhealthy.
 *
 * @return bool True if the database should be treated as severed.
 */
public function isSevered() {
  // A connection failure already seen on this ref severs it immediately,
  // without consulting the health record.
  if ($this->didFailToConnect) {
    return true;
  }

  // Otherwise, defer to the health record for this database.
  $record = $this->getHealthRecord();

  return !$record->getIsHealthy();
}
|
||||
|
||||
public function isReachable(AphrontDatabaseConnection $connection) {
|
||||
if ($this->isSevered()) {
|
||||
$record = $this->getHealthRecord();
|
||||
$should_check = $record->getShouldCheck();
|
||||
|
||||
if ($this->isSevered() && !$should_check) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -353,6 +367,10 @@ final class PhabricatorDatabaseRef
|
|||
$reachable = false;
|
||||
}
|
||||
|
||||
if ($should_check) {
|
||||
$record->didHealthCheck($reachable);
|
||||
}
|
||||
|
||||
if (!$reachable) {
|
||||
$this->didFailToConnect = true;
|
||||
}
|
||||
|
@ -360,6 +378,26 @@ final class PhabricatorDatabaseRef
|
|||
return $reachable;
|
||||
}
|
||||
|
||||
/**
 * Run a health check against this database if one is currently due.
 *
 * When the health record says a check should happen, open a management
 * connection and test reachability with it.
 *
 * @return this
 */
public function checkHealth() {
  if (!$this->getHealthRecord()->getShouldCheck()) {
    return $this;
  }

  // Testing reachability implicitly updates the health record.
  $this->isReachable($this->newManagementConnection());

  return $this;
}
|
||||
|
||||
/**
 * Get the health record for this database, building it on first access and
 * reusing the same object afterwards.
 *
 * @return PhabricatorDatabaseHealthRecord Health record for this ref.
 */
public function getHealthRecord() {
  if ($this->healthRecord) {
    return $this->healthRecord;
  }

  $this->healthRecord = new PhabricatorDatabaseHealthRecord($this);

  return $this->healthRecord;
}
|
||||
|
||||
public static function getMasterDatabaseRef() {
|
||||
$refs = self::getLiveRefs();
|
||||
|
||||
|
@ -415,14 +453,26 @@ final class PhabricatorDatabaseRef
|
|||
}
|
||||
|
||||
private function newConnection(array $options) {
|
||||
// If we believe the database is unhealthy, don't spend as much time
|
||||
// trying to connect to it, since it's likely to continue to fail and
|
||||
// hammering it can only make the problem worse.
|
||||
$record = $this->getHealthRecord();
|
||||
if ($record->getIsHealthy()) {
|
||||
$default_retries = 3;
|
||||
$default_timeout = 10;
|
||||
} else {
|
||||
$default_retries = 0;
|
||||
$default_timeout = 2;
|
||||
}
|
||||
|
||||
$spec = $options + array(
|
||||
'user' => $this->getUser(),
|
||||
'pass' => $this->getPass(),
|
||||
'host' => $this->getHost(),
|
||||
'port' => $this->getPort(),
|
||||
'database' => null,
|
||||
'retries' => 3,
|
||||
'timeout' => 15,
|
||||
'retries' => $default_retries,
|
||||
'timeout' => $default_timeout,
|
||||
);
|
||||
|
||||
return PhabricatorEnv::newObjectFromConfig(
|
||||
|
|
5
src/infrastructure/env/PhabricatorEnv.php
vendored
5
src/infrastructure/env/PhabricatorEnv.php
vendored
|
@ -220,7 +220,10 @@ final class PhabricatorEnv extends Phobject {
|
|||
if (!$master) {
|
||||
self::setReadOnly(true, self::READONLY_MASTERLESS);
|
||||
} else if ($master->isSevered()) {
|
||||
self::setReadOnly(true, self::READONLY_SEVERED);
|
||||
$master->checkHealth();
|
||||
if ($master->isSevered()) {
|
||||
self::setReadOnly(true, self::READONLY_SEVERED);
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
|
|
Loading…
Reference in a new issue