1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2024-11-23 07:12:41 +01:00

Allow "bin/repository thaw --demote" to demote an entire service, not just a single device

Summary: Ref T13222. See PHI992. If you lose an entire cluster, you may want to aggressively demote it out of existence. Currently, you must run `--demote <device>` once per device (for example, by piping a device list through `xargs`). Allow `--demote <service>`, which demotes all devices in a service.

Test Plan: Demoted with `--demote <device>` and `--demote <service>`. Hit the `--promote <service>` error.

Reviewers: amckinley

Reviewed By: amckinley

Maniphest Tasks: T13222

Differential Revision: https://secure.phabricator.com/D19850
This commit is contained in:
epriestley 2018-12-05 12:22:14 -08:00
parent bba4186005
commit 1a6a0181a8
2 changed files with 164 additions and 111 deletions

View file

@ -15,10 +15,11 @@ final class PhabricatorRepositoryManagementThawWorkflow
array( array(
array( array(
'name' => 'demote', 'name' => 'demote',
'param' => 'device', 'param' => 'device/service',
'help' => pht( 'help' => pht(
'Demote a device, discarding local changes. Clears stuck '. 'Demote a device (or all devices in a service) discarding '.
'write locks and recovers from lost leaders.'), 'local changes. Clears stuck write locks and recovers from '.
'lost leaders.'),
), ),
array( array(
'name' => 'promote', 'name' => 'promote',
@ -61,15 +62,53 @@ final class PhabricatorRepositoryManagementThawWorkflow
pht('Specify either --promote or --demote, but not both.')); pht('Specify either --promote or --demote, but not both.'));
} }
$device_name = nonempty($promote, $demote); $target_name = nonempty($promote, $demote);
$device = id(new AlmanacDeviceQuery()) $devices = id(new AlmanacDeviceQuery())
->setViewer($viewer) ->setViewer($viewer)
->withNames(array($device_name)) ->withNames(array($target_name))
->executeOne(); ->execute();
if (!$device) { if (!$devices) {
throw new PhutilArgumentUsageException( $service = id(new AlmanacServiceQuery())
pht('No device "%s" exists.', $device_name)); ->setViewer($viewer)
->withNames(array($target_name))
->executeOne();
if (!$service) {
throw new PhutilArgumentUsageException(
pht('No device or service named "%s" exists.', $target_name));
}
if ($promote) {
throw new PhutilArgumentUsageException(
pht(
'You can not "--promote" an entire service ("%s"). Only a single '.
'device may be promoted.',
$target_name));
}
$bindings = id(new AlmanacBindingQuery())
->setViewer($viewer)
->withServicePHIDs(array($service->getPHID()))
->execute();
if (!$bindings) {
throw new PhutilArgumentUsageException(
pht(
'Service "%s" is not bound to any devices.',
$target_name));
}
$interfaces = id(new AlmanacInterfaceQuery())
->setViewer($viewer)
->withPHIDs(mpull($bindings, 'getInterfacePHID'))
->execute();
$device_phids = mpull($interfaces, 'getDevicePHID');
$devices = id(new AlmanacDeviceQuery())
->setViewer($viewer)
->withPHIDs($device_phids)
->execute();
} }
$repository_names = $args->getArg('repositories'); $repository_names = $args->getArg('repositories');
@ -97,7 +136,7 @@ final class PhabricatorRepositoryManagementThawWorkflow
$services = id(new AlmanacServiceQuery()) $services = id(new AlmanacServiceQuery())
->setViewer($viewer) ->setViewer($viewer)
->withDevicePHIDs(array($device->getPHID())) ->withDevicePHIDs(mpull($devices, 'getPHID'))
->execute(); ->execute();
if ($services) { if ($services) {
$repositories = id(new PhabricatorRepositoryQuery()) $repositories = id(new PhabricatorRepositoryQuery())
@ -108,7 +147,7 @@ final class PhabricatorRepositoryManagementThawWorkflow
if (!$repositories) { if (!$repositories) {
throw new PhutilArgumentUsageException( throw new PhutilArgumentUsageException(
pht('There are no repositories on the selected device.')); pht('There are no repositories on the selected device or service.'));
} }
} }
@ -150,126 +189,128 @@ final class PhabricatorRepositoryManagementThawWorkflow
pht('User aborted the workflow.')); pht('User aborted the workflow.'));
} }
foreach ($repositories as $repository) { foreach ($devices as $device) {
$repository_phid = $repository->getPHID(); foreach ($repositories as $repository) {
$repository_phid = $repository->getPHID();
$write_lock = PhabricatorRepositoryWorkingCopyVersion::getWriteLock( $write_lock = PhabricatorRepositoryWorkingCopyVersion::getWriteLock(
$repository_phid); $repository_phid);
echo tsprintf( echo tsprintf(
"%s\n", "%s\n",
pht( pht(
'Waiting to acquire write lock for "%s"...', 'Waiting to acquire write lock for "%s"...',
$repository->getDisplayName())); $repository->getDisplayName()));
$write_lock->lock(phutil_units('5 minutes in seconds')); $write_lock->lock(phutil_units('5 minutes in seconds'));
try { try {
$service = $repository->loadAlmanacService(); $service = $repository->loadAlmanacService();
if (!$service) { if (!$service) {
throw new PhutilArgumentUsageException(
pht(
'Repository "%s" is not a cluster repository: it is not '.
'bound to an Almanac service.',
$repository->getDisplayName()));
}
if ($promote) {
// You can only promote active devices. (You may demote active or
// inactive devices.)
$bindings = $service->getActiveBindings();
$bindings = mpull($bindings, null, 'getDevicePHID');
if (empty($bindings[$device->getPHID()])) {
throw new PhutilArgumentUsageException( throw new PhutilArgumentUsageException(
pht( pht(
'Repository "%s" has no active binding to device "%s". Only '. 'Repository "%s" is not a cluster repository: it is not '.
'actively bound devices can be promoted.', 'bound to an Almanac service.',
$repository->getDisplayName(), $repository->getDisplayName()));
$device->getName()));
} }
$versions = PhabricatorRepositoryWorkingCopyVersion::loadVersions( if ($promote) {
$repository->getPHID()); // You can only promote active devices. (You may demote active or
$versions = mpull($versions, null, 'getDevicePHID'); // inactive devices.)
$bindings = $service->getActiveBindings();
// Before we promote, make sure there are no outstanding versions on $bindings = mpull($bindings, null, 'getDevicePHID');
// devices with inactive bindings. If there are, you need to demote if (empty($bindings[$device->getPHID()])) {
// these first. throw new PhutilArgumentUsageException(
$inactive = array(); pht(
foreach ($versions as $device_phid => $version) { 'Repository "%s" has no active binding to device "%s". '.
if (isset($bindings[$device_phid])) { 'Only actively bound devices can be promoted.',
continue; $repository->getDisplayName(),
$device->getName()));
} }
$inactive[$device_phid] = $version;
}
if ($inactive) { $versions = PhabricatorRepositoryWorkingCopyVersion::loadVersions(
$handles = $viewer->loadHandles(array_keys($inactive)); $repository->getPHID());
$versions = mpull($versions, null, 'getDevicePHID');
$handle_list = iterator_to_array($handles); // Before we promote, make sure there are no outstanding versions
$handle_list = mpull($handle_list, 'getName'); // on devices with inactive bindings. If there are, you need to
$handle_list = implode(', ', $handle_list); // demote these first.
$inactive = array();
foreach ($versions as $device_phid => $version) {
if (isset($bindings[$device_phid])) {
continue;
}
$inactive[$device_phid] = $version;
}
throw new PhutilArgumentUsageException( if ($inactive) {
$handles = $viewer->loadHandles(array_keys($inactive));
$handle_list = iterator_to_array($handles);
$handle_list = mpull($handle_list, 'getName');
$handle_list = implode(', ', $handle_list);
throw new PhutilArgumentUsageException(
pht(
'Repository "%s" has versions on inactive devices. Demote '.
'(or reactivate) these devices before promoting a new '.
'leader: %s.',
$repository->getDisplayName(),
$handle_list));
}
// Now, make sure there are no outstanding versions on devices with
// active bindings. These also need to be demoted (or promoting is
// a mistake or already happened).
$active = array_select_keys($versions, array_keys($bindings));
if ($active) {
$handles = $viewer->loadHandles(array_keys($active));
$handle_list = iterator_to_array($handles);
$handle_list = mpull($handle_list, 'getName');
$handle_list = implode(', ', $handle_list);
throw new PhutilArgumentUsageException(
pht(
'Unable to promote "%s" for repository "%s" because this '.
'cluster already has one or more unambiguous leaders: %s.',
$device->getName(),
$repository->getDisplayName(),
$handle_list));
}
PhabricatorRepositoryWorkingCopyVersion::updateVersion(
$repository->getPHID(),
$device->getPHID(),
0);
echo tsprintf(
"%s\n",
pht( pht(
'Repository "%s" has versions on inactive devices. Demote '. 'Promoted "%s" to become a leader for "%s".',
'(or reactivate) these devices before promoting a new '.
'leader: %s.',
$repository->getDisplayName(),
$handle_list));
}
// Now, make sure there are no outstanding versions on devices with
// active bindings. These also need to be demoted (or promoting is a
// mistake or already happened).
$active = array_select_keys($versions, array_keys($bindings));
if ($active) {
$handles = $viewer->loadHandles(array_keys($active));
$handle_list = iterator_to_array($handles);
$handle_list = mpull($handle_list, 'getName');
$handle_list = implode(', ', $handle_list);
throw new PhutilArgumentUsageException(
pht(
'Unable to promote "%s" for repository "%s" because this '.
'cluster already has one or more unambiguous leaders: %s.',
$device->getName(), $device->getName(),
$repository->getDisplayName(), $repository->getDisplayName()));
$handle_list));
} }
PhabricatorRepositoryWorkingCopyVersion::updateVersion( if ($demote) {
$repository->getPHID(), PhabricatorRepositoryWorkingCopyVersion::demoteDevice(
$device->getPHID(), $repository->getPHID(),
0); $device->getPHID());
echo tsprintf( echo tsprintf(
"%s\n", "%s\n",
pht( pht(
'Promoted "%s" to become a leader for "%s".', 'Demoted "%s" from leadership of repository "%s".',
$device->getName(), $device->getName(),
$repository->getDisplayName())); $repository->getDisplayName()));
}
} catch (Exception $ex) {
$write_lock->unlock();
throw $ex;
} }
if ($demote) {
PhabricatorRepositoryWorkingCopyVersion::demoteDevice(
$repository->getPHID(),
$device->getPHID());
echo tsprintf(
"%s\n",
pht(
'Demoted "%s" from leadership of repository "%s".',
$device->getName(),
$repository->getDisplayName()));
}
} catch (Exception $ex) {
$write_lock->unlock(); $write_lock->unlock();
throw $ex;
} }
$write_lock->unlock();
} }
return 0; return 0;

View file

@ -433,6 +433,18 @@ If you do this, **you will lose unreplicated data**. You will discard any
changes on the affected leaders which have not replicated to other devices changes on the affected leaders which have not replicated to other devices
in the cluster. in the cluster.
If you have lost an entire cluster and replaced it with new devices that you
have restored from backups, you can aggressively wipe all memory of the old
devices by using `--demote <service>` and `--all-repositories`. **This is
dangerous and discards all unreplicated data in any repository on any device.**
```
phabricator/ $ ./bin/repository thaw --demote repo.corp.net --all-repositories
```
After you do this, continue below to promote a leader and restore the cluster
to service.
Ambiguous Leaders Ambiguous Leaders
================= =================