1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2024-11-30 10:42:41 +01:00

Implement bin/repository thaw for unfreezing cluster repositories

Summary:
Ref T10751. Add support tooling for manually prying your way out of trouble if disaster strikes.

Refine documentation, try to refer to devices as "devices" more consistently instead of sometimes calling them "nodes".

Test Plan: Promoted and demoted repository devices with `bin/repository thaw`.

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T10751

Differential Revision: https://secure.phabricator.com/D15768
This commit is contained in:
epriestley 2016-04-20 06:54:53 -07:00
parent 11aa902bd1
commit bd4fb3c9fa
5 changed files with 311 additions and 52 deletions

View file

@ -3183,6 +3183,7 @@ phutil_register_library_map(array(
'PhabricatorRepositoryManagementPullWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementPullWorkflow.php', 'PhabricatorRepositoryManagementPullWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementPullWorkflow.php',
'PhabricatorRepositoryManagementRefsWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementRefsWorkflow.php', 'PhabricatorRepositoryManagementRefsWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementRefsWorkflow.php',
'PhabricatorRepositoryManagementReparseWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementReparseWorkflow.php', 'PhabricatorRepositoryManagementReparseWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementReparseWorkflow.php',
'PhabricatorRepositoryManagementThawWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementThawWorkflow.php',
'PhabricatorRepositoryManagementUpdateWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementUpdateWorkflow.php', 'PhabricatorRepositoryManagementUpdateWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementUpdateWorkflow.php',
'PhabricatorRepositoryManagementWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementWorkflow.php', 'PhabricatorRepositoryManagementWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementWorkflow.php',
'PhabricatorRepositoryMercurialCommitChangeParserWorker' => 'applications/repository/worker/commitchangeparser/PhabricatorRepositoryMercurialCommitChangeParserWorker.php', 'PhabricatorRepositoryMercurialCommitChangeParserWorker' => 'applications/repository/worker/commitchangeparser/PhabricatorRepositoryMercurialCommitChangeParserWorker.php',
@ -7834,6 +7835,7 @@ phutil_register_library_map(array(
'PhabricatorRepositoryManagementPullWorkflow' => 'PhabricatorRepositoryManagementWorkflow', 'PhabricatorRepositoryManagementPullWorkflow' => 'PhabricatorRepositoryManagementWorkflow',
'PhabricatorRepositoryManagementRefsWorkflow' => 'PhabricatorRepositoryManagementWorkflow', 'PhabricatorRepositoryManagementRefsWorkflow' => 'PhabricatorRepositoryManagementWorkflow',
'PhabricatorRepositoryManagementReparseWorkflow' => 'PhabricatorRepositoryManagementWorkflow', 'PhabricatorRepositoryManagementReparseWorkflow' => 'PhabricatorRepositoryManagementWorkflow',
'PhabricatorRepositoryManagementThawWorkflow' => 'PhabricatorRepositoryManagementWorkflow',
'PhabricatorRepositoryManagementUpdateWorkflow' => 'PhabricatorRepositoryManagementWorkflow', 'PhabricatorRepositoryManagementUpdateWorkflow' => 'PhabricatorRepositoryManagementWorkflow',
'PhabricatorRepositoryManagementWorkflow' => 'PhabricatorManagementWorkflow', 'PhabricatorRepositoryManagementWorkflow' => 'PhabricatorManagementWorkflow',
'PhabricatorRepositoryMercurialCommitChangeParserWorker' => 'PhabricatorRepositoryCommitChangeParserWorker', 'PhabricatorRepositoryMercurialCommitChangeParserWorker' => 'PhabricatorRepositoryCommitChangeParserWorker',

View file

@ -0,0 +1,186 @@
<?php
/**
 * Management workflow behind `bin/repository thaw`.
 *
 * Lets an operator manually promote or demote a single Almanac device for
 * one or more cluster repositories. Both operations can discard repository
 * data, so the workflow prints a warning and asks for confirmation unless
 * `--force` is given.
 */
final class PhabricatorRepositoryManagementThawWorkflow
  extends PhabricatorRepositoryManagementWorkflow {

  /**
   * Declare the workflow name, synopsis and command line arguments.
   */
  protected function didConstruct() {
    $this
      ->setName('thaw')
      ->setExamples('**thaw** [options] __repository__ ...')
      ->setSynopsis(
        pht(
          'Resolve issues with frozen cluster repositories. Very advanced '.
          'and dangerous.'))
      ->setArguments(
        array(
          array(
            'name' => 'demote',
            'param' => 'device',
            'help' => pht(
              'Demote a device, discarding local changes. Clears stuck '.
              'write locks and recovers from lost leaders.'),
          ),
          array(
            'name' => 'promote',
            'param' => 'device',
            'help' => pht(
              'Promote a device, discarding changes on other devices. '.
              'Resolves ambiguous leadership and recovers from demotion '.
              'mistakes.'),
          ),
          array(
            'name' => 'force',
            'help' => pht('Run operations without asking for confirmation.'),
          ),
          array(
            'name' => 'repositories',
            'wildcard' => true,
          ),
        ));
  }

  /**
   * Validate arguments, confirm the risk with the operator, then promote or
   * demote the named device for each selected repository.
   *
   * Each repository is processed while holding its write lock. The lock is
   * released whether the operation succeeds or throws.
   *
   * @param PhutilArgumentParser Parsed command line arguments.
   * @return int Exit code; always 0 when no exception is thrown.
   * @throws PhutilArgumentUsageException On invalid arguments, an unknown
   *   device, a declined confirmation, or a repository which can not be
   *   thawed with the requested operation.
   */
  public function execute(PhutilArgumentParser $args) {
    $viewer = $this->getViewer();

    $repositories = $this->loadRepositories($args, 'repositories');
    if (!$repositories) {
      throw new PhutilArgumentUsageException(
        pht('Specify one or more repositories to thaw.'));
    }

    $promote = $args->getArg('promote');
    $demote = $args->getArg('demote');

    if (!$promote && !$demote) {
      throw new PhutilArgumentUsageException(
        pht('You must choose a device to --promote or --demote.'));
    }

    if ($promote && $demote) {
      throw new PhutilArgumentUsageException(
        pht('Specify either --promote or --demote, but not both.'));
    }

    // Exactly one of the two flags is set at this point, so this selects
    // whichever device name the operator provided.
    $device_name = nonempty($promote, $demote);

    $device = id(new AlmanacDeviceQuery())
      ->setViewer($viewer)
      ->withNames(array($device_name))
      ->executeOne();
    if (!$device) {
      throw new PhutilArgumentUsageException(
        pht('No device "%s" exists.', $device_name));
    }

    if ($promote) {
      $risk_message = pht(
        'Promoting a device can cause the loss of any repository data which '.
        'only exists on other devices. The version of the repository on the '.
        'promoted device will become authoritative.');
    } else {
      $risk_message = pht(
        'Demoting a device can cause the loss of any repository data which '.
        'only exists on the demoted device. The version of the repository '.
        'on some other device will become authoritative.');
    }

    echo tsprintf(
      "**<bg:red> %s </bg>** %s\n",
      pht('DATA AT RISK'),
      $risk_message);

    $is_force = $args->getArg('force');
    $prompt = pht('Accept the possibility of permanent data loss?');
    if (!$is_force && !phutil_console_confirm($prompt)) {
      throw new PhutilArgumentUsageException(
        pht('User aborted the workflow.'));
    }

    $is_promote = (bool)$promote;

    foreach ($repositories as $repository) {
      $write_lock = PhabricatorRepositoryWorkingCopyVersion::getWriteLock(
        $repository->getPHID());

      echo tsprintf(
        "%s\n",
        pht(
          'Waiting to acquire write lock for "%s"...',
          $repository->getDisplayName()));

      $write_lock->lock(phutil_units('5 minutes in seconds'));
      try {
        $this->thawRepository($repository, $device, $is_promote);
      } catch (Exception $ex) {
        // Release the lock before propagating the failure so a bad
        // repository or device argument does not leave the lock held.
        $write_lock->unlock();
        throw $ex;
      }
      $write_lock->unlock();
    }

    return 0;
  }

  /**
   * Promote or demote a device for a single repository.
   *
   * The caller is expected to hold the repository write lock.
   *
   * @param PhabricatorRepository Repository to operate on.
   * @param AlmanacDevice Device to promote or demote.
   * @param bool True to promote the device, false to demote it.
   * @return void
   * @throws PhutilArgumentUsageException If the repository has no Almanac
   *   service, the device has no active binding to that service, or a
   *   promotion is requested while bound devices already have versions.
   */
  private function thawRepository(
    PhabricatorRepository $repository,
    $device,
    $is_promote) {

    $repository_phid = $repository->getPHID();
    $device_phid = $device->getPHID();

    $service = $repository->loadAlmanacService();
    if (!$service) {
      throw new PhutilArgumentUsageException(
        pht(
          'Repository "%s" is not a cluster repository: it is not '.
          'bound to an Almanac service.',
          $repository->getDisplayName()));
    }

    $bindings = $service->getActiveBindings();
    $bindings = mpull($bindings, null, 'getDevicePHID');
    if (empty($bindings[$device_phid])) {
      throw new PhutilArgumentUsageException(
        pht(
          'Repository "%s" has no active binding to device "%s". Only '.
          'actively bound devices can be promoted or demoted.',
          $repository->getDisplayName(),
          $device->getName()));
    }

    // Only consider version records which belong to actively bound devices.
    $versions = PhabricatorRepositoryWorkingCopyVersion::loadVersions(
      $repository_phid);
    $versions = mpull($versions, null, 'getDevicePHID');
    $versions = array_select_keys($versions, array_keys($bindings));

    if ($is_promote) {
      // If any bound device already has a version record, refuse to
      // promote: leadership is not ambiguous in that case.
      if ($versions) {
        throw new PhutilArgumentUsageException(
          pht(
            'Unable to promote "%s" for repository "%s": the leaders for '.
            'this cluster are not ambiguous.',
            $device->getName(),
            $repository->getDisplayName()));
      }

      PhabricatorRepositoryWorkingCopyVersion::updateVersion(
        $repository_phid,
        $device_phid,
        0);

      echo tsprintf(
        "%s\n",
        pht(
          'Promoted "%s" to become a leader for "%s".',
          $device->getName(),
          $repository->getDisplayName()));
    } else {
      PhabricatorRepositoryWorkingCopyVersion::demoteDevice(
        $repository_phid,
        $device_phid);

      echo tsprintf(
        "%s\n",
        pht(
          'Demoted "%s" from leadership of repository "%s".',
          $device->getName(),
          $repository->getDisplayName()));
    }
  }

}

View file

@ -2397,11 +2397,12 @@ final class PhabricatorRepository extends PhabricatorRepositoryDAO
continue; continue;
} }
// TODO: This should provide more help so users can resolve the issue.
throw new Exception( throw new Exception(
pht( pht(
'An incomplete write was previously performed to this repository; '. 'An previous write to this repository was interrupted; refusing '.
'refusing new writes.')); 'new writes. This issue resolves operator intervention to resolve, '.
'see "Write Interruptions" in the "Cluster: Repositories" in the '.
'documentation for instructions.'));
} }
try { try {
@ -2566,7 +2567,7 @@ final class PhabricatorRepository extends PhabricatorRepositoryDAO
->setPath($path); ->setPath($path);
} }
private function loadAlmanacService() { public function loadAlmanacService() {
$service_phid = $this->getAlmanacServicePHID(); $service_phid = $this->getAlmanacServicePHID();
if (!$service_phid) { if (!$service_phid) {
// No service, so this is a local repository. // No service, so this is a local repository.

View file

@ -132,6 +132,7 @@ final class PhabricatorRepositoryWorkingCopyVersion
$repository_phid, $repository_phid,
$device_phid, $device_phid,
$new_version) { $new_version) {
$version = new self(); $version = new self();
$conn_w = $version->establishConnection('w'); $conn_w = $version->establishConnection('w');
$table = $version->getTableName(); $table = $version->getTableName();
@ -152,4 +153,23 @@ final class PhabricatorRepositoryWorkingCopyVersion
} }
/**
 * Explicitly demote a device.
 *
 * Deletes the stored working copy version row which matches both the given
 * repository and the given device.
 *
 * @param string PHID of the repository.
 * @param string PHID of the device to demote.
 * @return void
 */
public static function demoteDevice(
  $repository_phid,
  $device_phid) {

  $record = new self();

  queryfx(
    $record->establishConnection('w'),
    'DELETE FROM %T WHERE repositoryPHID = %s AND devicePHID = %s',
    $record->getTableName(),
    $repository_phid,
    $device_phid);
}
} }

View file

@ -98,7 +98,7 @@ similar agents of other rogue nations is beyond the scope of this document.
Monitoring Replication Monitoring Replication
====================== ======================
You can review the current status of a repository on cluster nodes in You can review the current status of a repository on cluster devices in
{nav Diffusion > (Repository) > Manage Repository > Cluster Configuration}. {nav Diffusion > (Repository) > Manage Repository > Cluster Configuration}.
This screen shows all the configured devices which are hosting the repository This screen shows all the configured devices which are hosting the repository
@ -106,20 +106,20 @@ and the available version.
**Version**: When a repository is mutated by a push, Phabricator increases **Version**: When a repository is mutated by a push, Phabricator increases
an internal version number for the repository. This column shows which version an internal version number for the repository. This column shows which version
is on disk on the corresponding node. is on disk on the corresponding device.
After a change is pushed, the node which received the change will have a larger After a change is pushed, the device which received the change will have a
version number than the other nodes. The change should be passively replicated larger version number than the other devices. The change should be passively
to the remaining nodes after a brief period of time, although this can take replicated to the remaining devices after a brief period of time, although this
a while if the change was large or the network connection between nodes is can take a while if the change was large or the network connection between
slow or unreliable. devices is slow or unreliable.
You can click the version number to see the corresponding push logs for that You can click the version number to see the corresponding push logs for that
change. The logs contain details about what was changed, and can help you change. The logs contain details about what was changed, and can help you
identify if replication is slow because a change is large or for some other identify if replication is slow because a change is large or for some other
reason. reason.
**Writing**: This shows that the node is currently holding a write lock. This **Writing**: This shows that the device is currently holding a write lock. This
normally means that it is actively receiving a push, but can also mean that normally means that it is actively receiving a push, but can also mean that
there was a write interruption. See "Write Interruptions" below for details. there was a write interruption. See "Write Interruptions" below for details.
@ -131,43 +131,74 @@ the user whose change is holding the lock.
currently held, this shows when the lock was acquired. currently held, this shows when the lock was acquired.
Cluster Failure Modes
=====================
There are three major cluster failure modes:
- **Write Interruptions**: A write started but did not complete, leaving
the disk state and cluster state out of sync.
- **Loss of Leaders**: None of the devices with the most up-to-date data
are reachable.
- **Ambiguous Leaders**: The internal state of the repository is unclear.
Phabricator can detect these issues, and responds by freezing the repository
(usually preventing all reads and writes) until the issue is resolved. These
conditions are normally rare and very little data is at risk, but Phabricator
errs on the side of caution and requires decisions which may result in data
loss to be confirmed by a human.
The next sections cover these failure modes and appropriate responses in
more detail. In general, you will respond to these issues by assessing the
situation and then possibly choosing to discard some data.
Write Interruptions Write Interruptions
=================== ===================
A repository cluster can be put into an inconsistent state by an interruption A repository cluster can be put into an inconsistent state by an interruption
in a brief window during and immediately after a write. in a brief window during and immediately after a write. This looks like this:
- A change is pushed to a server.
- The server acquires a write lock and begins writing the change.
- During or immediately after the write, lightning strikes the server
and destroys it.
Phabricator can not commit changes to a working copy (stored on disk) and to Phabricator can not commit changes to a working copy (stored on disk) and to
the global state (stored in a database) atomically, so there is a narrow window the global state (stored in a database) atomically, so there is necessarily a
between committing these two different states when some tragedy (like a narrow window between committing these two different states when some tragedy
lightning strike) can befall a server, leaving the global and local views of can befall a server, leaving the global and local views of the repository state
the repository state possibly divergent. possibly divergent.
In these cases, Phabricator fails into a frozen state where further writes In these cases, Phabricator fails into a frozen state where further writes
are not permitted until the failure is investigated and resolved. are not permitted until the failure is investigated and resolved. When a
repository is frozen in this way it remains readable.
You can use the monitoring console to review the state of a frozen repository You can use the monitoring console to review the state of a frozen repository
with a held write lock. The **Writing** column will show which node is holding with a held write lock. The **Writing** column will show which device is
the lock, and whoever is named in the **Last Writer** column may be able to holding the lock, and whoever is named in the **Last Writer** column may be
help you figure out what happened by providing more information about what they able to help you figure out what happened by providing more information about
were doing and what they observed. what they were doing and what they observed.
Because the push was not acknowledged, it is normally safe to demote the node: Because the push was not acknowledged, it is normally safe to resolve this
the user should have received an error anyway, and should not expect their push issue by demoting the device. Demoting the device will undo any changes
to have worked. However, data is technically at risk and you may want to committed by the push, and they will be lost forever.
investigate further and try to understand the issue in more detail before
However, the user should have received an error anyway, and should not expect
their push to have worked. Still, data is technically at risk and you may want
to investigate further and try to understand the issue in more detail before
continuing. continuing.
There is no way to explicitly keep the write, but if it was committed to disk There is no way to explicitly keep the write, but if it was committed to disk
you can recover it manually from the working copy on the device and then push you can recover it manually from the working copy on the device (for example,
it again. by using `git format-patch`) and then push it again after recovering.
If you demote the node, the in-process write will be thrown away, even if it If you demote the device, the in-process write will be thrown away, even if it
was complete on disk. To demote the node and release the write lock, run this was complete on disk. To demote the device and release the write lock, run this
command: command:
``` ```
phabricator/ $ ./bin/repository thaw rXYZ --demote repo002.corp.net phabricator/ $ ./bin/repository thaw <repository> --demote <device>
``` ```
{icon exclamation-triangle, color="yellow"} Any committed but unacknowledged {icon exclamation-triangle, color="yellow"} Any committed but unacknowledged
@ -181,17 +212,18 @@ A more straightforward failure condition is the loss of all servers in a
cluster which have the most up-to-date copy of a repository. This looks like cluster which have the most up-to-date copy of a repository. This looks like
this: this:
- There is a cluster setup with two nodes, X and Y. - There is a cluster setup with two devices, X and Y.
- A new change is pushed to server X. - A new change is pushed to server X.
- Before the change can propagate to server Y, lightning strikes server X - Before the change can propagate to server Y, lightning strikes server X
and destroys it. and destroys it.
Here, all of the "leader" nodes with the most up-to-date copy of the repository Here, all of the "leader" devices with the most up-to-date copy of the
have been lost. Phabricator will refuse to serve this repository because it repository have been lost. Phabricator will freeze the repository and refuse to
can not serve it consistently, and can not accept writes without data loss. serve requests because it can not serve it consistently, and can not accept new
writes without data loss.
The most straightforward way to resolve this issue is to restore any leader to The most straightforward way to resolve this issue is to restore any leader to
service. The change will be able to replicate to other nodes once a leader service. The change will be able to replicate to other devices once a leader
comes back online. comes back online.
If you are unable to restore a leader or unsure that you can restore one If you are unable to restore a leader or unsure that you can restore one
@ -201,13 +233,20 @@ push logs.
If you are comfortable discarding these changes, you can instruct Phabricator If you are comfortable discarding these changes, you can instruct Phabricator
that it can forget about the leaders in two ways: disable the service bindings that it can forget about the leaders in two ways: disable the service bindings
to all of the leader nodes so they are no longer part of the cluster, or to all of the leader devices so they are no longer part of the cluster, or use
use `bin/repository thaw` to `--demote` the leaders explicitly. `bin/repository thaw` to `--demote` the leaders explicitly.
If you do this, **you will lose data**. Either action will discard any changes If you do this, **you will lose data**. Either action will discard any changes
on the affected leaders which have not replicated to other nodes in the cluster. on the affected leaders which have not replicated to other devices in the
cluster.
To demote a device, run this command: To remove a device from the cluster, disable all of the bindings to it
in Almanac, using the web UI.
{icon exclamation-triangle, color="red"} Any data which is only present on
the disabled device will be lost.
To demote a device without removing it from the cluster, run this command:
``` ```
phabricator/ $ ./bin/repository thaw rXYZ --demote repo002.corp.net phabricator/ $ ./bin/repository thaw rXYZ --demote repo002.corp.net
@ -220,24 +259,35 @@ phabricator/ $ ./bin/repository thaw rXYZ --demote repo002.corp.net
Ambiguous Leaders Ambiguous Leaders
================= =================
Repository clusters can also freeze if the leader nodes are ambiguous. This Repository clusters can also freeze if the leader devices are ambiguous. This
can happen if you replace an entire cluster with new devices suddenly, or can happen if you replace an entire cluster with new devices suddenly, or
make a mistake with the `--demote` flag. make a mistake with the `--demote` flag. This generally arises from some kind
of operator error, like this:
When Phabricator can not tell which node in a cluster is a leader, it freezes - Someone accidentally uses `bin/repository thaw ... --demote` to demote
the cluster because it is possible that some nodes have less data and others every device in a cluster.
- Someone accidentally deletes all the version information for a repository
from the database by making a mistake with a `DELETE` or `UPDATE` query.
- Someone accidentally disables all of the devices in a cluster, then adds
entirely new ones before repositories can propagate.
When Phabricator can not tell which device in a cluster is a leader, it freezes
the cluster because it is possible that some devices have less data and others
have more, and if it chooses a leader arbitrarily it may destroy some data have more, and if it chooses a leader arbitrarily it may destroy some data
which you would prefer to retain. which you would prefer to retain.
To resolve this, you need to tell Phabricator which node has the most To resolve this, you need to tell Phabricator which device has the most
up-to-date data and promote that node to become a leader. If you do this, up-to-date data and promote that device to become a leader. If you know all
**you may lose data** if you promote the wrong node, and some other node devices have the same data, you are free to promote any device.
really had more up-to-date data. If you want to double check, you can examine
the working copies on disk before promoting, by connecting to the machines and
using commands like `git log` to inspect state.
Once you have identified a node which has data you're happy with, use If you promote a device, **you may lose data** if you promote the wrong device
`bin/repository thaw` to `--promote` the device: and some other device really had more up-to-date data. If you want to double
check, you can examine the working copies on disk before promoting by
connecting to the machines and using commands like `git log` to inspect state.
Once you have identified a device which has data you're happy with, use
`bin/repository thaw` to `--promote` the device. The data on the chosen
device will become authoritative:
``` ```
phabricator/ $ ./bin/repository thaw rXYZ --promote repo002.corp.net phabricator/ $ ./bin/repository thaw rXYZ --promote repo002.corp.net