<?php

final class PhabricatorConfigClusterDatabasesController
  extends PhabricatorConfigController {

  public function handleRequest(AphrontRequest $request) {
    $nav = $this->buildSideNavView();
    $nav->selectFilter('cluster/databases/');

    $title = pht('Cluster Database Status');
    $doc_href = PhabricatorEnv::getDoclink('Cluster: Databases');

    $header = id(new PHUIHeaderView())
      ->setHeader($title)
      ->setProfileHeader(true)
      ->addActionLink(
        id(new PHUIButtonView())
          ->setIcon('fa-book')
          ->setHref($doc_href)
          ->setTag('a')
          ->setText(pht('Documentation')));

    $crumbs = $this
      ->buildApplicationCrumbs($nav)
      ->addTextCrumb($title)
      ->setBorder(true);

    $database_status = $this->buildClusterDatabaseStatus();

    $content = id(new PhabricatorConfigPageView())
      ->setHeader($header)
      ->setContent($database_status);

    return $this->newPage()
      ->setTitle($title)
      ->setCrumbs($crumbs)
      ->setNavigation($nav)
      ->appendChild($content)
      ->addClass('white-background');
  }
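
  /**
   * Build a status table for the configured cluster databases: one row
   * per host, covering role, partition, connection, replication and
   * recent health checks.
   */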
  private function buildClusterDatabaseStatus() {
    $viewer = $this->getViewer();

    $databases = PhabricatorDatabaseRef::queryAll();
    $connection_map = PhabricatorDatabaseRef::getConnectionStatusMap();
    $replica_map = PhabricatorDatabaseRef::getReplicaStatusMap();
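
    // The role and partition icons below use tooltips, so the tooltip
    // behavior must be initialized on this page.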
    Javelin::initBehavior('phabricator-tooltips');

    $rows = array();
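    // Build one table row per configured database host.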
    foreach ($databases as $database) {
      $messages = array();
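
      // Masters and replicas are distinguished by their role icons.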
      if ($database->getIsMaster()) {
        $role_icon = id(new PHUIIconView())
          ->setIcon('fa-database sky')
          ->addSigil('has-tooltip')
          ->setMetadata(
            array(
              'tip' => pht('Master'),
            ));
      } else {
        $role_icon = id(new PHUIIconView())
          ->setIcon('fa-download')
          ->addSigil('has-tooltip')
          ->setMetadata(
            array(
              'tip' => pht('Replica'),
            ));
      }
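
      // Connection column: disabled hosts are not checked; otherwise
      // map the measured connection status to an icon, color and label.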
      if ($database->getDisabled()) {
        $conn_icon = 'fa-times';
        $conn_color = 'grey';
        $conn_label = pht('Disabled');
      } else {
        $status = $database->getConnectionStatus();

        $info = idx($connection_map, $status, array());
        $conn_icon = idx($info, 'icon');
        $conn_color = idx($info, 'color');
        $conn_label = idx($info, 'label');

        if ($status === PhabricatorDatabaseRef::STATUS_OKAY) {
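          // Healthy hosts show connection latency instead of a status
          // label, converted from seconds to microseconds.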
          $latency = $database->getConnectionLatency();
          $latency = (int)(1000000 * $latency);
          $conn_label = pht('%s us', new PhutilNumber($latency));
        }
      }

      $connection = array(
        id(new PHUIIconView())->setIcon("{$conn_icon} {$conn_color}"),
        ' ',
        $conn_label,
      );
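
      // Replication column: a healthy master shows a plain database
      // icon; a replica shows how far it lags behind its master.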
      if ($database->getDisabled()) {
        $replica_icon = 'fa-times';
        $replica_color = 'grey';
        $replica_label = pht('Disabled');
      } else {
        $status = $database->getReplicaStatus();

        $info = idx($replica_map, $status, array());
        $replica_icon = idx($info, 'icon');
        $replica_color = idx($info, 'color');
        $replica_label = idx($info, 'label');

        if ($database->getIsMaster()) {
          if ($status === PhabricatorDatabaseRef::REPLICATION_OKAY) {
            $replica_icon = 'fa-database';
          }
        } else {
          switch ($status) {
            case PhabricatorDatabaseRef::REPLICATION_OKAY:
            case PhabricatorDatabaseRef::REPLICATION_SLOW:
              $delay = $database->getReplicaDelay();
              if ($delay) {
                $replica_label = pht('%ss Behind', new PhutilNumber($delay));
              } else {
                $replica_label = pht('Up to Date');
              }
              break;
          }
        }
      }

      $replication = array(
        id(new PHUIIconView())->setIcon("{$replica_icon} {$replica_color}"),
        ' ',
        $replica_label,
      );
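
      // Health record: per D15677, a database is severed after several
      // consecutive failed health checks and restored after several
      // consecutive successes, so traffic stops flowing to hosts that
      // stay unreachable.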
      $health = $database->getHealthRecord();
      $health_up = $health->getUpEventCount();
      $health_down = $health->getDownEventCount();

      if ($health->getIsHealthy()) {
        $health_icon = id(new PHUIIconView())
          ->setIcon('fa-plus green');
      } else {
        $health_icon = id(new PHUIIconView())
          ->setIcon('fa-times red');
        $messages[] = pht(
          'UNHEALTHY: This database has failed recent health checks. Traffic '.
          'will not be sent to it until it recovers.');
      }
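
      // Summarize recent health checks as "healthy / total".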
      $health_count = pht(
        '%s / %s',
        new PhutilNumber($health_up),
        new PhutilNumber($health_up + $health_down));

      $health_status = array(
        $health_icon,
        ' ',
        $health_count,
      );

      $conn_message = $database->getConnectionMessage();
      if ($conn_message) {
        $messages[] = $conn_message;
      }

      $replica_message = $database->getReplicaMessage();
      if ($replica_message) {
        $messages[] = $replica_message;
      }
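
      // Collapse all warnings for this host into a single cell.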
      $messages = phutil_implode_html(phutil_tag('br'), $messages);
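
      // Partition column: only masters carry a partition icon. The
      // default partition gets a solid circle; other partitions list
      // the applications they serve.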
      $partition = null;
      if ($database->getIsMaster()) {
        if ($database->getIsDefaultPartition()) {
          $partition = id(new PHUIIconView())
            ->setIcon('fa-circle sky')
            ->addSigil('has-tooltip')
            ->setMetadata(
              array(
                'tip' => pht('Default Partition'),
              ));
        } else {
          $map = $database->getApplicationMap();
          if ($map) {
            $list = implode(', ', $map);
          } else {
            $list = pht('Empty');
          }

          $partition = id(new PHUIIconView())
            ->setIcon('fa-adjust sky')
            ->addSigil('has-tooltip')
            ->setMetadata(
              array(
                'tip' => pht('Partition: %s', $list),
              ));
        }
      }

      $rows[] = array(
        $role_icon,
        $partition,
        $database->getHost(),
        $database->getPort(),
        $database->getUser(),
        $connection,
        $replication,
        $health_status,
        $messages,
      );
    }
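
    // The first two header cells are blank: they hold the role and
    // partition icons.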
    $table = id(new AphrontTableView($rows))
      ->setNoDataString(
        pht('Phabricator is not configured in cluster mode.'))
      ->setHeaders(
        array(
          null,
          null,
          pht('Host'),
          pht('Port'),
          pht('User'),
          pht('Connection'),
          pht('Replication'),
          pht('Health'),
          pht('Messages'),
        ))
      ->setColumnClasses(
        array(
          null,
          null,
          null,
          null,
          null,
          null,
          null,
          null,
          'wide',
        ));

    return $table;
  }

}