phorge-phorge/src/applications/config/controller/PhabricatorConfigClusterDatabasesController.php

<?php

final class PhabricatorConfigClusterDatabasesController
  extends PhabricatorConfigController {
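
  // Render the "Cluster Database Status" console page: a header with a
  // documentation link, plus a table describing each configured database.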
  public function handleRequest(AphrontRequest $request) {
    $nav = $this->buildSideNavView();
    $nav->selectFilter('cluster/databases/');

    $title = pht('Cluster Database Status');
    $doc_href = PhabricatorEnv::getDoclink('Cluster: Databases');

    $header = id(new PHUIHeaderView())
      ->setHeader($title)
      ->setProfileHeader(true)
      ->addActionLink(
        id(new PHUIButtonView())
          ->setIcon('fa-book')
          ->setHref($doc_href)
          ->setTag('a')
          ->setText(pht('Documentation')));

    $crumbs = $this
      ->buildApplicationCrumbs($nav)
      ->addTextCrumb($title)
      ->setBorder(true);

    $database_status = $this->buildClusterDatabaseStatus();
    $content = id(new PhabricatorConfigPageView())
      ->setHeader($header)
      ->setContent($database_status);

    return $this->newPage()
      ->setTitle($title)
      ->setCrumbs($crumbs)
      ->setNavigation($nav)
      ->appendChild($content)
      ->addClass('white-background');
  }
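
  // Build the table summarizing the role, partition, connection, replication,
  // and health check state of each configured cluster database.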
  private function buildClusterDatabaseStatus() {
    $viewer = $this->getViewer();

    $databases = PhabricatorDatabaseRef::queryAll();
    $connection_map = PhabricatorDatabaseRef::getConnectionStatusMap();
    $replica_map = PhabricatorDatabaseRef::getReplicaStatusMap();

    Javelin::initBehavior('phabricator-tooltips');

    $rows = array();
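
    // Build one table row per configured database (master or replica).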
    foreach ($databases as $database) {
      $messages = array();

      if ($database->getIsMaster()) {
        $role_icon = id(new PHUIIconView())
          ->setIcon('fa-database sky')
          ->addSigil('has-tooltip')
          ->setMetadata(
            array(
              'tip' => pht('Master'),
            ));
      } else {
        $role_icon = id(new PHUIIconView())
          ->setIcon('fa-download')
          ->addSigil('has-tooltip')
          ->setMetadata(
            array(
              'tip' => pht('Replica'),
            ));
      }
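
      // Summarize the connection state. Disabled hosts are shown as such;
      // otherwise, the status is looked up in the connection status map.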
      if ($database->getDisabled()) {
        $conn_icon = 'fa-times';
        $conn_color = 'grey';
        $conn_label = pht('Disabled');
      } else {
        $status = $database->getConnectionStatus();

        $info = idx($connection_map, $status, array());
        $conn_icon = idx($info, 'icon');
        $conn_color = idx($info, 'color');
        $conn_label = idx($info, 'label');
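
        // When the connection is healthy, show the measured connection
        // latency (in microseconds) instead of the generic status label.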
        if ($status === PhabricatorDatabaseRef::STATUS_OKAY) {
          $latency = $database->getConnectionLatency();
          $latency = (int)(1000000 * $latency);
          $conn_label = pht('%s us', new PhutilNumber($latency));
        }
      }

      $connection = array(
        id(new PHUIIconView())->setIcon("{$conn_icon} {$conn_color}"),
        ' ',
        $conn_label,
      );
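
      // Summarize replication. Disabled hosts are marked "Disabled"; for the
      // rest, the replication status is looked up in the replica status map.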
      if ($database->getDisabled()) {
        $replica_icon = 'fa-times';
        $replica_color = 'grey';
        $replica_label = pht('Disabled');
      } else {
        $status = $database->getReplicaStatus();

        $info = idx($replica_map, $status, array());
        $replica_icon = idx($info, 'icon');
        $replica_color = idx($info, 'color');
        $replica_label = idx($info, 'label');
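
        // Masters which are replicating correctly just get a database icon;
        // replicas additionally report how far behind the master they are.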
        if ($database->getIsMaster()) {
          if ($status === PhabricatorDatabaseRef::REPLICATION_OKAY) {
            $replica_icon = 'fa-database';
          }
        } else {
          switch ($status) {
            case PhabricatorDatabaseRef::REPLICATION_OKAY:
            case PhabricatorDatabaseRef::REPLICATION_SLOW:
              $delay = $database->getReplicaDelay();
              if ($delay) {
                $replica_label = pht('%ss Behind', new PhutilNumber($delay));
              } else {
                $replica_label = pht('Up to Date');
              }
              break;
          }
        }
      }

      $replication = array(
        id(new PHUIIconView())->setIcon("{$replica_icon} {$replica_color}"),
        ' ',
        $replica_label,
      );
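
      // Show how many of the recent health checks succeeded. An unhealthy
      // database receives no traffic until it recovers.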
      $health = $database->getHealthRecord();
      $health_up = $health->getUpEventCount();
      $health_down = $health->getDownEventCount();

      if ($health->getIsHealthy()) {
        $health_icon = id(new PHUIIconView())
          ->setIcon('fa-plus green');
      } else {
        $health_icon = id(new PHUIIconView())
          ->setIcon('fa-times red');
        $messages[] = pht(
          'UNHEALTHY: This database has failed recent health checks. Traffic '.
          'will not be sent to it until it recovers.');
      }
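
      // Render health as "successful checks / total recent checks".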
      $health_count = pht(
        '%s / %s',
        new PhutilNumber($health_up),
        new PhutilNumber($health_up + $health_down));

      $health_status = array(
        $health_icon,
        ' ',
        $health_count,
      );

      $conn_message = $database->getConnectionMessage();
      if ($conn_message) {
        $messages[] = $conn_message;
      }

      $replica_message = $database->getReplicaMessage();
      if ($replica_message) {
        $messages[] = $replica_message;
      }

      $messages = phutil_implode_html(phutil_tag('br'), $messages);
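
      // Masters get a partition icon: either the default partition, or a
      // tooltip listing the applications partitioned onto this host.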
      $partition = null;
      if ($database->getIsMaster()) {
        if ($database->getIsDefaultPartition()) {
          $partition = id(new PHUIIconView())
            ->setIcon('fa-circle sky')
            ->addSigil('has-tooltip')
            ->setMetadata(
              array(
                'tip' => pht('Default Partition'),
              ));
        } else {
          $map = $database->getApplicationMap();
          if ($map) {
            $list = implode(', ', $map);
          } else {
            $list = pht('Empty');
          }

          $partition = id(new PHUIIconView())
            ->setIcon('fa-adjust sky')
            ->addSigil('has-tooltip')
            ->setMetadata(
              array(
                'tip' => pht('Partition: %s', $list),
              ));
        }
      }

      $rows[] = array(
        $role_icon,
        $partition,
        $database->getHost(),
        $database->getPort(),
        $database->getUser(),
        $connection,
        $replication,
        $health_status,
        $messages,
      );
    }
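
    // Assemble the rows into a table; the "Messages" column takes the
    // remaining width.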
    $table = id(new AphrontTableView($rows))
      ->setNoDataString(
        pht('Phabricator is not configured in cluster mode.'))
      ->setHeaders(
        array(
          null,
          null,
          pht('Host'),
          pht('Port'),
          pht('User'),
          pht('Connection'),
          pht('Replication'),
          pht('Health'),
          pht('Messages'),
        ))
      ->setColumnClasses(
        array(
          null,
          null,
          null,
          null,
          null,
          null,
          null,
          null,
          'wide',
        ));

    return $table;
  }

}