Implement bin/repository thaw for unfreezing cluster repositories

Summary: Ref T10751. Add support tooling for manually prying your way out of trouble if disaster strikes. Refine documentation, try to refer to devices as "devices" more consistently instead of sometimes calling them "nodes". Test Plan: Promoted and demoted repository devices with `bin/repository thaw`. Reviewers: chad Reviewed By: chad Maniphest Tasks: T10751 Differential Revision: https://secure.phabricator.com/D15768
2024-11-30 10:42:41 +01:00 · 2016-04-20 06:54:53 -07:00 · 2016-04-20 06:54:53 -07:00 · bd4fb3c9fa
commit bd4fb3c9fa
parent 11aa902bd1
5 changed files with 311 additions and 52 deletions
--- a/src/__phutil_library_map__.php
+++ b/src/__phutil_library_map__.php
@ -3183,6 +3183,7 @@ phutil_register_library_map(array(
    'PhabricatorRepositoryManagementPullWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementPullWorkflow.php',
    'PhabricatorRepositoryManagementRefsWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementRefsWorkflow.php',
    'PhabricatorRepositoryManagementReparseWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementReparseWorkflow.php',
    'PhabricatorRepositoryManagementThawWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementThawWorkflow.php',
    'PhabricatorRepositoryManagementUpdateWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementUpdateWorkflow.php',
    'PhabricatorRepositoryManagementWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementWorkflow.php',
    'PhabricatorRepositoryMercurialCommitChangeParserWorker' => 'applications/repository/worker/commitchangeparser/PhabricatorRepositoryMercurialCommitChangeParserWorker.php',
@ -7834,6 +7835,7 @@ phutil_register_library_map(array(
    'PhabricatorRepositoryManagementPullWorkflow' => 'PhabricatorRepositoryManagementWorkflow',
    'PhabricatorRepositoryManagementRefsWorkflow' => 'PhabricatorRepositoryManagementWorkflow',
    'PhabricatorRepositoryManagementReparseWorkflow' => 'PhabricatorRepositoryManagementWorkflow',
    'PhabricatorRepositoryManagementThawWorkflow' => 'PhabricatorRepositoryManagementWorkflow',
    'PhabricatorRepositoryManagementUpdateWorkflow' => 'PhabricatorRepositoryManagementWorkflow',
    'PhabricatorRepositoryManagementWorkflow' => 'PhabricatorManagementWorkflow',
    'PhabricatorRepositoryMercurialCommitChangeParserWorker' => 'PhabricatorRepositoryCommitChangeParserWorker',
--- a/src/applications/repository/management/PhabricatorRepositoryManagementThawWorkflow.php
+++ b/src/applications/repository/management/PhabricatorRepositoryManagementThawWorkflow.php
@ -0,0 +1,186 @@
 <?php
 /**
  * Management workflow implementing `bin/repository thaw`.
  *
  * Promotes or demotes a single Almanac device for one or more repositories
  * by calling PhabricatorRepositoryWorkingCopyVersion::updateVersion() (for
  * `--promote`) or PhabricatorRepositoryWorkingCopyVersion::demoteDevice()
  * (for `--demote`). Both operations can discard repository data, so the
  * workflow asks for confirmation unless `--force` is given.
  */
 final class PhabricatorRepositoryManagementThawWorkflow
  extends PhabricatorRepositoryManagementWorkflow {

  /**
   * Declare the workflow name, usage example, synopsis and arguments.
   */
  protected function didConstruct() {
    $this
      ->setName('thaw')
      ->setExamples('**thaw** [options] __repository__ ...')
      ->setSynopsis(
        pht(
          'Resolve issues with frozen cluster repositories. Very advanced '.
          'and dangerous.'))
      ->setArguments(
        array(
          array(
            'name' => 'demote',
            'param' => 'device',
            'help' => pht(
              'Demote a device, discarding local changes. Clears stuck '.
              'write locks and recovers from lost leaders.'),
          ),
          array(
            'name' => 'promote',
            'param' => 'device',
            'help' => pht(
              'Promote a device, discarding changes on other devices. '.
              'Resolves ambiguous leadership and recovers from demotion '.
              'mistakes.'),
          ),
          array(
            'name' => 'force',
            'help' => pht('Run operations without asking for confirmation.'),
          ),
          array(
            'name' => 'repositories',
            'wildcard' => true,
          ),
        ));
  }

  /**
   * Promote or demote the named device for each listed repository.
   *
   * Exactly one of `--promote` or `--demote` must name an existing Almanac
   * device. Each repository is processed while holding its write lock, and
   * the lock is released whether the operation succeeds or fails.
   *
   * @param PhutilArgumentParser Parsed command-line arguments.
   * @return int Always 0; every failure is reported by throwing.
   * @throws PhutilArgumentUsageException If the arguments are invalid, the
   *   user declines the confirmation prompt, or a repository is not eligible
   *   for the requested operation.
   */
  public function execute(PhutilArgumentParser $args) {
    $viewer = $this->getViewer();

    $repositories = $this->loadRepositories($args, 'repositories');
    if (!$repositories) {
      throw new PhutilArgumentUsageException(
        pht('Specify one or more repositories to thaw.'));
    }

    $promote = $args->getArg('promote');
    $demote = $args->getArg('demote');

    if (!$promote && !$demote) {
      throw new PhutilArgumentUsageException(
        pht('You must choose a device to --promote or --demote.'));
    }

    if ($promote && $demote) {
      throw new PhutilArgumentUsageException(
        pht('Specify either --promote or --demote, but not both.'));
    }

    // Exactly one of the two flags is set at this point, so this selects
    // whichever device name the user provided.
    $device_name = nonempty($promote, $demote);

    $device = id(new AlmanacDeviceQuery())
      ->setViewer($viewer)
      ->withNames(array($device_name))
      ->executeOne();
    if (!$device) {
      throw new PhutilArgumentUsageException(
        pht('No device "%s" exists.', $device_name));
    }

    // The device does not change between repositories; look its PHID up once.
    $device_phid = $device->getPHID();

    if ($promote) {
      $risk_message = pht(
        'Promoting a device can cause the loss of any repository data which '.
        'only exists on other devices. The version of the repository on the '.
        'promoted device will become authoritative.');
    } else {
      $risk_message = pht(
        'Demoting a device can cause the loss of any repository data which '.
        'only exists on the demoted device. The version of the repository '.
        'on some other device will become authoritative.');
    }

    echo tsprintf(
      "**<bg:red> %s </bg>** %s\n",
      pht('DATA AT RISK'),
      $risk_message);

    // With --force the prompt is never shown: the short-circuit skips the
    // call to phutil_console_confirm().
    $is_force = $args->getArg('force');
    $prompt = pht('Accept the possibility of permanent data loss?');
    if (!$is_force && !phutil_console_confirm($prompt)) {
      throw new PhutilArgumentUsageException(
        pht('User aborted the workflow.'));
    }

    foreach ($repositories as $repository) {
      $repository_phid = $repository->getPHID();

      $write_lock = PhabricatorRepositoryWorkingCopyVersion::getWriteLock(
        $repository_phid);

      echo tsprintf(
        "%s\n",
        pht(
          'Waiting to acquire write lock for "%s"...',
          $repository->getDisplayName()));

      // lock() is given five minutes, expressed in seconds.
      $write_lock->lock(phutil_units('5 minutes in seconds'));
      try {
        $service = $repository->loadAlmanacService();
        if (!$service) {
          throw new PhutilArgumentUsageException(
            pht(
              'Repository "%s" is not a cluster repository: it is not '.
              'bound to an Almanac service.',
              $repository->getDisplayName()));
        }

        // Index the active bindings by device PHID so we can check that the
        // chosen device is actually part of this repository's cluster.
        $bindings = $service->getActiveBindings();
        $bindings = mpull($bindings, null, 'getDevicePHID');
        if (empty($bindings[$device_phid])) {
          throw new PhutilArgumentUsageException(
            pht(
              'Repository "%s" has no active binding to device "%s". Only '.
              'actively bound devices can be promoted or demoted.',
              $repository->getDisplayName(),
              $device->getName()));
        }

        // Keep only the version records which belong to actively bound
        // devices.
        $versions = PhabricatorRepositoryWorkingCopyVersion::loadVersions(
          $repository_phid);
        $versions = mpull($versions, null, 'getDevicePHID');
        $versions = array_select_keys($versions, array_keys($bindings));

        // Promotion is refused if any actively bound device still has a
        // version record.
        if ($versions && $promote) {
          throw new PhutilArgumentUsageException(
            pht(
              'Unable to promote "%s" for repository "%s": the leaders for '.
              'this cluster are not ambiguous.',
              $device->getName(),
              $repository->getDisplayName()));
        }

        if ($promote) {
          PhabricatorRepositoryWorkingCopyVersion::updateVersion(
            $repository_phid,
            $device_phid,
            0);

          echo tsprintf(
            "%s\n",
            pht(
              'Promoted "%s" to become a leader for "%s".',
              $device->getName(),
              $repository->getDisplayName()));
        }

        if ($demote) {
          PhabricatorRepositoryWorkingCopyVersion::demoteDevice(
            $repository_phid,
            $device_phid);

          echo tsprintf(
            "%s\n",
            pht(
              'Demoted "%s" from leadership of repository "%s".',
              $device->getName(),
              $repository->getDisplayName()));
        }
      } catch (Exception $ex) {
        // Release the lock before propagating the failure so it is not left
        // held by this process.
        $write_lock->unlock();
        throw $ex;
      }

      $write_lock->unlock();
    }

    return 0;
  }
 }
--- a/src/applications/repository/storage/PhabricatorRepository.php
+++ b/src/applications/repository/storage/PhabricatorRepository.php
@ -2397,11 +2397,12 @@ final class PhabricatorRepository extends PhabricatorRepositoryDAO
        continue;
      }
      // TODO: This should provide more help so users can resolve the issue.
      throw new Exception(
        pht(
-          'An incomplete write was previously performed to this repository; '.
+          'An previous write to this repository was interrupted; refusing '.
-          'refusing new writes.'));
+          'new writes. This issue resolves operator intervention to resolve, '.
          'see "Write Interruptions" in the "Cluster: Repositories" in the '.
          'documentation for instructions.'));
    }
    try {
@ -2566,7 +2567,7 @@ final class PhabricatorRepository extends PhabricatorRepositoryDAO
      ->setPath($path);
  }
-  private function loadAlmanacService() {
+  public function loadAlmanacService() {
    $service_phid = $this->getAlmanacServicePHID();
    if (!$service_phid) {
      // No service, so this is a local repository.
--- a/src/applications/repository/storage/PhabricatorRepositoryWorkingCopyVersion.php
+++ b/src/applications/repository/storage/PhabricatorRepositoryWorkingCopyVersion.php
@ -132,6 +132,7 @@ final class PhabricatorRepositoryWorkingCopyVersion
    $repository_phid,
    $device_phid,
    $new_version) {
    $version = new self();
    $conn_w = $version->establishConnection('w');
    $table = $version->getTableName();
@ -152,4 +153,23 @@ final class PhabricatorRepositoryWorkingCopyVersion
  }
  /**
   * Explicitly demote a device.
   *
   * Issues a `DELETE` against this object's table for the row matching the
   * given repository PHID and device PHID.
   *
   * @param string PHID of the repository.
   * @param string PHID of the device to demote.
   * @return void
   */
  public static function demoteDevice(
    $repository_phid,
    $device_phid) {

    $dao = new self();
    $conn = $dao->establishConnection('w');

    queryfx(
      $conn,
      'DELETE FROM %T WHERE repositoryPHID = %s AND devicePHID = %s',
      $dao->getTableName(),
      $repository_phid,
      $device_phid);
  }
 }
--- a/src/docs/user/cluster/cluster_repositories.diviner
+++ b/src/docs/user/cluster/cluster_repositories.diviner
@ -98,7 +98,7 @@ similar agents of other rogue nations is beyond the scope of this document.
 Monitoring Replication
 ======================
-You can review the current status of a repository on cluster nodes in
+You can review the current status of a repository on cluster devices in
 {nav Diffusion > (Repository) > Manage Repository > Cluster Configuration}.
 This screen shows all the configured devices which are hosting the repository
@ -106,20 +106,20 @@ and the available version.
 **Version**: When a repository is mutated by a push, Phabricator increases
 an internal version number for the repository. This column shows which version
-is on disk on the corresponding node.
+is on disk on the corresponding device.
-After a change is pushed, the node which received the change will have a larger
+After a change is pushed, the device which received the change will have a
-version number than the other nodes. The change should be passively replicated
+larger version number than the other devices. The change should be passively
-to the remaining nodes after a brief period of time, although this can take
+replicated to the remaining devices after a brief period of time, although this
-a while if the change was large or the network connection between nodes is
+can take a while if the change was large or the network connection between
-slow or unreliable.
+devices is slow or unreliable.
 You can click the version number to see the corresponding push logs for that
 change. The logs contain details about what was changed, and can help you
 identify if replication is slow because a change is large or for some other
 reason.
-**Writing**: This shows that the node is currently holding a write lock. This
+**Writing**: This shows that the device is currently holding a write lock. This
 normally means that it is actively receiving a push, but can also mean that
 there was a write interruption. See "Write Interruptions" below for details.
@ -131,43 +131,74 @@ the user whose change is holding the lock.
 currently held, this shows when the lock was acquired.
 Cluster Failure Modes
 =====================
 There are three major cluster failure modes:
  - **Write Interruptions**: A write started but did not complete, leaving
    the disk state and cluster state out of sync.
  - **Loss of Leaders**: None of the devices with the most up-to-date data
    are reachable.
  - **Ambiguous Leaders**: The internal state of the repository is unclear.
 Phabricator can detect these issues, and responds by freezing the repository
 (usually preventing all reads and writes) until the issue is resolved. These
 conditions are normally rare and very little data is at risk, but Phabricator
 errs on the side of caution and requires decisions which may result in data
 loss to be confirmed by a human.
 The next sections cover these failure modes and appropriate responses in
 more detail. In general, you will respond to these issues by assessing the
 situation and then possibly choosing to discard some data.
 Write Interruptions
 ===================
 A repository cluster can be put into an inconsistent state by an interruption
-in a brief window during and immediately after a write.
+in a brief window during and immediately after a write. This looks like this:
  - A change is pushed to a server.
  - The server acquires a write lock and begins writing the change.
  - During or immediately after the write, lightning strikes the server
    and destroys it.
 Phabricator can not commit changes to a working copy (stored on disk) and to
-the global state (stored in a database) atomically, so there is a narrow window
+the global state (stored in a database) atomically, so there is necessarily a
-between committing these two different states when some tragedy (like a
+narrow window between committing these two different states when some tragedy
-lightning strike) can befall a server, leaving the global and local views of
+can befall a server, leaving the global and local views of the repository state
-the repository state possibly divergent.
+possibly divergent.
 In these cases, Phabricator fails into a frozen state where further writes
-are not permitted until the failure is investigated and resolved.
+are not permitted until the failure is investigated and resolved. When a
 repository is frozen in this way it remains readable.
 You can use the monitoring console to review the state of a frozen repository
-with a held write lock. The **Writing** column will show which node is holding
+with a held write lock. The **Writing** column will show which device is
-the lock, and whoever is named in the **Last Writer** column may be able to
+holding the lock, and whoever is named in the **Last Writer** column may be
-help you figure out what happened by providing more information about what they
+able to help you figure out what happened by providing more information about
-were doing and what they observed.
+what they were doing and what they observed.
-Because the push was not acknowledged, it is normally safe to demote the node:
+Because the push was not acknowledged, it is normally safe to resolve this
-the user should have received an error anyway, and should not expect their push
+issue by demoting the device. Demoting the device will undo any changes
-to have worked. However, data is technically at risk and you may want to
+committed by the push, and they will be lost forever.
-investigate further and try to understand the issue in more detail before
+
 However, the user should have received an error anyway, and should not expect
 their push to have worked. Still, data is technically at risk and you may want
 to investigate further and try to understand the issue in more detail before
 continuing.
 There is no way to explicitly keep the write, but if it was committed to disk
-you can recover it manually from the working copy on the device and then push
+you can recover it manually from the working copy on the device (for example,
-it again.
+by using `git format-patch`) and then push it again after recovering.
-If you demote the node, the in-process write will be thrown away, even if it
+If you demote the device, the in-process write will be thrown away, even if it
-was complete on disk. To demote the node and release the write lock, run this
+was complete on disk. To demote the device and release the write lock, run this
 command:
 ```
-phabricator/ $ ./bin/repository thaw rXYZ --demote repo002.corp.net
+phabricator/ $ ./bin/repository thaw <repository> --demote <device>
 ```
 {icon exclamation-triangle, color="yellow"} Any committed but unacknowledged
@ -181,17 +212,18 @@ A more straightforward failure condition is the loss of all servers in a
 cluster which have the most up-to-date copy of a repository. This looks like
 this:
-  - There is a cluster setup with two nodes, X and Y.
+  - There is a cluster setup with two devices, X and Y.
  - A new change is pushed to server X.
  - Before the change can propagate to server Y, lightning strikes server X
    and destroys it.
-Here, all of the "leader" nodes with the most up-to-date copy of the repository
+Here, all of the "leader" devices with the most up-to-date copy of the
-have been lost. Phabricator will refuse to serve this repository because it
+repository have been lost. Phabricator will freeze the repository and refuse to
-can not serve it consistently, and can not accept writes without data loss.
+serve requests because it can not serve it consistently, and can not accept new
 writes without data loss.
 The most straightforward way to resolve this issue is to restore any leader to
-service. The change will be able to replicate to other nodes once a leader
+service. The change will be able to replicate to other devices once a leader
 comes back online.
 If you are unable to restore a leader or unsure that you can restore one
@ -201,13 +233,20 @@ push logs.
 If you are comfortable discarding these changes, you can instruct Phabricator
 that it can forget about the leaders in two ways: disable the service bindings
-to all of the leader nodes so they are no longer part of the cluster, or
+to all of the leader devices so they are no longer part of the cluster, or use
-use `bin/repository thaw` to `--demote` the leaders explicitly.
+`bin/repository thaw` to `--demote` the leaders explicitly.
 If you do this, **you will lose data**. Either action will discard any changes
-on the affected leaders which have not replicated to other nodes in the cluster.
+on the affected leaders which have not replicated to other devices in the
 cluster.
-To demote a device, run this command:
+To remove a device from the cluster, disable all of the bindings to it
 in Almanac, using the web UI.
 {icon exclamation-triangle, color="red"} Any data which is only present on
 the disabled device will be lost.
 To demote a device without removing it from the cluster, run this command:
 ```
 phabricator/ $ ./bin/repository thaw rXYZ --demote repo002.corp.net
@ -220,24 +259,35 @@ phabricator/ $ ./bin/repository thaw rXYZ --demote repo002.corp.net
 Ambiguous Leaders
 =================
-Repository clusters can also freeze if the leader nodes are ambiguous. This
+Repository clusters can also freeze if the leader devices are ambiguous. This
 can happen if you replace an entire cluster with new devices suddenly, or
-make a mistake with the `--demote` flag.
+make a mistake with the `--demote` flag. This generally arises from some kind
 of operator error, like this:
-When Phabricator can not tell which node in a cluster is a leader, it freezes
+  - Someone accidentally uses `bin/repository thaw ... --demote` to demote
-the cluster because it is possible that some nodes have less data and others
+    every device in a cluster.
  - Someone accidentally deletes all the version information for a repository
    from the database by making a mistake with a `DELETE` or `UPDATE` query.
  - Someone accidentally disables all of the devices in a cluster, then adds
    entirely new ones before repositories can propagate.
 When Phabricator can not tell which device in a cluster is a leader, it freezes
 the cluster because it is possible that some devices have less data and others
 have more, and if it chooses a leader arbitrarily it may destroy some data
 which you would prefer to retain.
-To resolve this, you need to tell Phabricator which node has the most
+To resolve this, you need to tell Phabricator which device has the most
-up-to-date data and promote that node to become a leader. If you do this,
+up-to-date data and promote that device to become a leader. If you know all
-**you may lose data** if you promote the wrong node, and some other node
+devices have the same data, you are free to promote any device.
 really had more up-to-date data. If you want to double check, you can examine
 the working copies on disk before promoting, by connecting to the machines and
 using commands like `git log` to inspect state.
-Once you have identified a node which has data you're happy with, use
+If you promote a device, **you may lose data** if you promote the wrong device
-`bin/repository thaw` to `--promote` the device:
+and some other device really had more up-to-date data. If you want to double
 check, you can examine the working copies on disk before promoting by
 connecting to the machines and using commands like `git log` to inspect state.
 Once you have identified a device which has data you're happy with, use
 `bin/repository thaw` to `--promote` the device. The data on the chosen
 device will become authoritative:
 ```
 phabricator/ $ ./bin/repository thaw rXYZ --promote repo002.corp.net