From ff3d1769b47538630de7bf180a360f8cf835ca86 Mon Sep 17 00:00:00 2001 From: epriestley Date: Tue, 3 Sep 2019 10:38:41 -0700 Subject: [PATCH] Instead of retrying safe reads 3 times, retry each eligible service once Summary: Ref T13286. When retrying a read request, keep retrying as long as we have canididate services. Since we consume a service with each attempt, there's no real reason to abort early, and trying every service allows reads to always succeed even if (for example) 8 nodes of a 16-node cluster are dead because of a severed network link between datacenters. Test Plan: Ran `git pull` in a clustered repository with an up node and a down node; saw retry count dynamically adjust to available node count. Maniphest Tasks: T13286 Differential Revision: https://secure.phabricator.com/D20777 --- .../ssh/DiffusionGitUploadPackSSHWorkflow.php | 21 ++++++------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/src/applications/diffusion/ssh/DiffusionGitUploadPackSSHWorkflow.php b/src/applications/diffusion/ssh/DiffusionGitUploadPackSSHWorkflow.php index 3e8186190a..5c0e2588b7 100644 --- a/src/applications/diffusion/ssh/DiffusionGitUploadPackSSHWorkflow.php +++ b/src/applications/diffusion/ssh/DiffusionGitUploadPackSSHWorkflow.php @@ -126,8 +126,6 @@ final class DiffusionGitUploadPackSSHWorkflow ->setCommandChannelFromExecFuture($future) ->execute(); - $err = 1; - // TODO: Currently, when proxying, we do not write an event log on the // proxy. Perhaps we should write a "proxy log". This is not very useful // for statistics or auditing, but could be useful for diagnostics. @@ -144,14 +142,7 @@ final class DiffusionGitUploadPackSSHWorkflow // the same host. array_shift($refs); - // Check if we have more services we can try. If we do, we'll make an - // effort to fall back to them below. If not, we can't do anything to - // recover so just bail out. - if (!$refs) { - return $err; - } - - $should_retry = $this->shouldRetryRequest(); + $should_retry = $this->shouldRetryRequest($refs); if (!$should_retry) { return $err; } @@ -168,7 +159,7 @@ final class DiffusionGitUploadPackSSHWorkflow return $this; } - private function shouldRetryRequest() { + private function shouldRetryRequest(array $remaining_refs) { $this->requestFailures++; if ($this->requestFailures > $this->requestAttempts) { @@ -178,11 +169,11 @@ final class DiffusionGitUploadPackSSHWorkflow "missing call to \"didBeginRequest()\".\n")); } - $max_failures = 3; - if ($this->requestFailures >= $max_failures) { + if (!$remaining_refs) { $this->writeClusterEngineLogMessage( pht( - "# Reached maximum number of retry attempts, giving up.\n")); + "# All available services failed to serve the request, ". + "giving up.\n")); return false; } @@ -208,7 +199,7 @@ final class DiffusionGitUploadPackSSHWorkflow pht( "# Service request failed, retrying (making attempt %s of %s).\n", new PhutilNumber($this->requestAttempts + 1), - new PhutilNumber($max_failures))); + new PhutilNumber($this->requestAttempts + count($remaining_refs)))); return true; }