mirror of
https://we.phorge.it/source/phorge.git
synced 2024-11-29 10:12:41 +01:00
Adjust the Drydock allocator to limit each pending lease to one allocating resource
Summary: Ref T13677. Currently, one lease may cause multiple resources to allocate simultaneously if it starts allocating one, then wakes up from a yield later on and still sees no available resources. This is never desired -- or, at least, produces desirable behavior only entirely by accident. Normally, it causes an excess of resources to allocate. This is not a catastrophic problem: the extra resources usually get used sooner or later or cleaned up; and the total amount of badness is limited by overall resource allocation limits. However, this behavior is also suppressed by an artificial "25% of current pool size" growth limit throttle which I intend to remove. Removing this throttle without fixing the allocator behavior could make this "too many resources" problem worse. Change the allocator so that a lease that has started allocating a resource won't allocate another resource until the first resource leaves the "pending" state. This also fixes some general oddness with the allocator and attempts to simplify the structure. Test Plan: - Ran 8 taskmasters. - Destroyed all resources and leases. - Leased 4 working copies. - Saw exactly 4 resources build and lease, all simultaneously. - Destroyed all resources and leases. - Leased 32 working copies. - Saw exactly 32 resources build and lease, approximately 8 at a time (limited by taskmasters). - Destroyed all leases (but not resources). - Leased 32 working copies, saw them satisfied by existing resources. Subscribers: PHID-OPKG-gm6ozazyms6q6i22gyam Maniphest Tasks: T13677 Differential Revision: https://secure.phabricator.com/D21807
This commit is contained in:
parent
13a37282bc
commit
62360ea406
2 changed files with 339 additions and 195 deletions
|
@ -188,35 +188,49 @@ final class DrydockLeaseUpdateWorker extends DrydockWorker {
|
|||
|
||||
// First, try to find a suitable open resource which we can acquire a new
|
||||
// lease on.
|
||||
$resources = $this->loadResourcesForAllocatingLease($blueprints, $lease);
|
||||
$resources = $this->loadAcquirableResourcesForLease($blueprints, $lease);
|
||||
|
||||
// If no resources exist yet, see if we can build one.
|
||||
if (!$resources) {
|
||||
$usable_blueprints = $this->removeOverallocatedBlueprints(
|
||||
$blueprints,
|
||||
$lease);
|
||||
list($free_resources, $used_resources) = $this->partitionResources(
|
||||
$lease,
|
||||
$resources);
|
||||
|
||||
// If we get nothing back here, some blueprint claims it can eventually
|
||||
// satisfy the lease, just not right now. This is a temporary failure,
|
||||
// and we expect allocation to succeed eventually.
|
||||
if (!$usable_blueprints) {
|
||||
$blueprints = $this->rankBlueprints($blueprints, $lease);
|
||||
|
||||
// Try to actively reclaim unused resources. If we succeed, jump back
|
||||
// into the queue in an effort to claim it.
|
||||
foreach ($blueprints as $blueprint) {
|
||||
$reclaimed = $this->reclaimResources($blueprint, $lease);
|
||||
if ($reclaimed) {
|
||||
$lease->logEvent(
|
||||
DrydockLeaseReclaimLogType::LOGCONST,
|
||||
array(
|
||||
'resourcePHIDs' => array($reclaimed->getPHID()),
|
||||
));
|
||||
|
||||
throw new PhabricatorWorkerYieldException(15);
|
||||
$resource = $this->leaseAnyResource($lease, $free_resources);
|
||||
if ($resource) {
|
||||
return $resource;
|
||||
}
|
||||
|
||||
// We're about to try creating a resource. If we're already creating
|
||||
// something, just yield until that resolves.
|
||||
|
||||
$this->yieldForPendingResources($lease);
|
||||
|
||||
// We haven't been able to lease an existing resource yet, so now we try to
|
||||
// create one. We may still have some less-desirable "used" resources that
|
||||
// we'll sometimes try to lease later if we fail to allocate a new resource.
|
||||
|
||||
$resource = $this->newLeasedResource($lease, $blueprints);
|
||||
if ($resource) {
|
||||
return $resource;
|
||||
}
|
||||
|
||||
// We haven't been able to lease a desirable "free" resource or create a
|
||||
// new resource. Try to lease a "used" resource.
|
||||
|
||||
$resource = $this->leaseAnyResource($lease, $used_resources);
|
||||
if ($resource) {
|
||||
return $resource;
|
||||
}
|
||||
|
||||
// If this lease has already triggered a reclaim, just yield and wait for
|
||||
// it to resolve.
|
||||
$this->yieldForReclaimingResources($lease);
|
||||
|
||||
// Try to reclaim a resource. This will yield if it reclaims something.
|
||||
$this->reclaimAnyResource($lease, $blueprints);
|
||||
|
||||
// We weren't able to lease, create, or reclaim any resources. We just have
|
||||
// to wait for resources to become available.
|
||||
|
||||
$lease->logEvent(
|
||||
DrydockLeaseWaitingForResourcesLogType::LOGCONST,
|
||||
array(
|
||||
|
@ -226,10 +240,212 @@ final class DrydockLeaseUpdateWorker extends DrydockWorker {
|
|||
throw new PhabricatorWorkerYieldException(15);
|
||||
}
|
||||
|
||||
private function reclaimAnyResource(DrydockLease $lease, array $blueprints) {
|
||||
assert_instances_of($blueprints, 'DrydockBlueprint');
|
||||
|
||||
$blueprints = $this->rankBlueprints($blueprints, $lease);
|
||||
|
||||
// Try to actively reclaim unused resources. If we succeed, jump back
|
||||
// into the queue in an effort to claim it.
|
||||
|
||||
foreach ($blueprints as $blueprint) {
|
||||
$reclaimed = $this->reclaimResources($blueprint, $lease);
|
||||
if ($reclaimed) {
|
||||
|
||||
$lease->logEvent(
|
||||
DrydockLeaseReclaimLogType::LOGCONST,
|
||||
array(
|
||||
'resourcePHIDs' => array($reclaimed->getPHID()),
|
||||
));
|
||||
|
||||
// Yield explicitly here: we'll be awakened when the resource is
|
||||
// reclaimed.
|
||||
|
||||
throw new PhabricatorWorkerYieldException(15);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private function yieldForPendingResources(DrydockLease $lease) {
|
||||
// See T13677. If this lease has already triggered the allocation of
|
||||
// one or more resources and they are still pending, just yield and
|
||||
// wait for them.
|
||||
|
||||
$viewer = $this->getViewer();
|
||||
|
||||
$phids = $lease->getAllocatedResourcePHIDs();
|
||||
if (!$phids) {
|
||||
return null;
|
||||
}
|
||||
|
||||
$resources = id(new DrydockResourceQuery())
|
||||
->setViewer($viewer)
|
||||
->withPHIDs($phids)
|
||||
->withStatuses(
|
||||
array(
|
||||
DrydockResourceStatus::STATUS_PENDING,
|
||||
))
|
||||
->setLimit(1)
|
||||
->execute();
|
||||
if (!$resources) {
|
||||
return;
|
||||
}
|
||||
|
||||
$lease->logEvent(
|
||||
DrydockLeaseWaitingForActivationLogType::LOGCONST,
|
||||
array(
|
||||
'resourcePHIDs' => mpull($resources, 'getPHID'),
|
||||
));
|
||||
|
||||
throw new PhabricatorWorkerYieldException(15);
|
||||
}
|
||||
|
||||
private function yieldForReclaimingResources(DrydockLease $lease) {
|
||||
$viewer = $this->getViewer();
|
||||
|
||||
$phids = $lease->getReclaimedResourcePHIDs();
|
||||
if (!$phids) {
|
||||
return;
|
||||
}
|
||||
|
||||
$resources = id(new DrydockResourceQuery())
|
||||
->setViewer($viewer)
|
||||
->withPHIDs($phids)
|
||||
->withStatuses(
|
||||
array(
|
||||
DrydockResourceStatus::STATUS_ACTIVE,
|
||||
))
|
||||
->setLimit(1)
|
||||
->execute();
|
||||
if (!$resources) {
|
||||
return;
|
||||
}
|
||||
|
||||
$lease->logEvent(
|
||||
DrydockLeaseWaitingForReclamationLogType::LOGCONST,
|
||||
array(
|
||||
'resourcePHIDs' => mpull($resources, 'getPHID'),
|
||||
));
|
||||
|
||||
throw new PhabricatorWorkerYieldException(15);
|
||||
}
|
||||
|
||||
private function newLeasedResource(
|
||||
DrydockLease $lease,
|
||||
array $blueprints) {
|
||||
assert_instances_of($blueprints, 'DrydockBlueprint');
|
||||
|
||||
$usable_blueprints = $this->removeOverallocatedBlueprints(
|
||||
$blueprints,
|
||||
$lease);
|
||||
|
||||
// If we get nothing back here, some blueprint claims it can eventually
|
||||
// satisfy the lease, just not right now. This is a temporary failure,
|
||||
// and we expect allocation to succeed eventually.
|
||||
|
||||
// Return, try to lease a "used" resource, and continue from there.
|
||||
|
||||
if (!$usable_blueprints) {
|
||||
return null;
|
||||
}
|
||||
|
||||
$usable_blueprints = $this->rankBlueprints($usable_blueprints, $lease);
|
||||
|
||||
$new_resources = $this->newResources($lease, $usable_blueprints);
|
||||
if (!$new_resources) {
|
||||
// If we were unable to create any new resources, return and
|
||||
// try to lease a "used" resource.
|
||||
return null;
|
||||
}
|
||||
|
||||
$new_resources = $this->removeUnacquirableResources(
|
||||
$new_resources,
|
||||
$lease);
|
||||
if (!$new_resources) {
|
||||
// If we make it here, we just built a resource but aren't allowed
|
||||
// to acquire it. We expect this to happen if the resource prevents
|
||||
// acquisition until it activates, which is common when a resource
|
||||
// needs to perform setup steps.
|
||||
|
||||
// Explicitly yield and wait for activation, since we don't want to
|
||||
// lease a "used" resource.
|
||||
|
||||
throw new PhabricatorWorkerYieldException(15);
|
||||
}
|
||||
|
||||
$resource = $this->leaseAnyResource($lease, $new_resources);
|
||||
if ($resource) {
|
||||
return $resource;
|
||||
}
|
||||
|
||||
// We may not be able to lease a resource even if we just built it:
|
||||
// another process may snatch it up before we can lease it. This should
|
||||
// be rare, but is not concerning. Just try to build another resource.
|
||||
|
||||
// We likely could try to build the next resource immediately, but err on
|
||||
// the side of caution and yield for now, at least until this code is
|
||||
// better vetted.
|
||||
|
||||
throw new PhabricatorWorkerYieldException(15);
|
||||
}
|
||||
|
||||
private function partitionResources(
|
||||
DrydockLease $lease,
|
||||
array $resources) {
|
||||
|
||||
assert_instances_of($resources, 'DrydockResource');
|
||||
$viewer = $this->getViewer();
|
||||
|
||||
$lease_statuses = array(
|
||||
DrydockLeaseStatus::STATUS_PENDING,
|
||||
DrydockLeaseStatus::STATUS_ACQUIRED,
|
||||
DrydockLeaseStatus::STATUS_ACTIVE,
|
||||
);
|
||||
|
||||
// Partition resources into "free" resources (which we can try to lease
|
||||
// immediately) and "used" resources, which we can only to lease after we
|
||||
// fail to allocate a new resource.
|
||||
|
||||
// "Free" resources are unleased and/or prefer reuse over allocation.
|
||||
// "Used" resources are leased and prefer allocation over reuse.
|
||||
|
||||
$free_resources = array();
|
||||
$used_resources = array();
|
||||
|
||||
foreach ($resources as $resource) {
|
||||
$blueprint = $resource->getBlueprint();
|
||||
|
||||
if (!$blueprint->shouldAllocateSupplementalResource($resource, $lease)) {
|
||||
$free_resources[] = $resource;
|
||||
continue;
|
||||
}
|
||||
|
||||
$leases = id(new DrydockLeaseQuery())
|
||||
->setViewer($viewer)
|
||||
->withResourcePHIDs(array($resource->getPHID()))
|
||||
->withStatuses($lease_statuses)
|
||||
->setLimit(1)
|
||||
->execute();
|
||||
if (!$leases) {
|
||||
$free_resources[] = $resource;
|
||||
continue;
|
||||
}
|
||||
|
||||
$used_resources[] = $resource;
|
||||
}
|
||||
|
||||
return array($free_resources, $used_resources);
|
||||
}
|
||||
|
||||
private function newResources(
|
||||
DrydockLease $lease,
|
||||
array $blueprints) {
|
||||
assert_instances_of($blueprints, 'DrydockBlueprint');
|
||||
|
||||
$resources = array();
|
||||
$exceptions = array();
|
||||
foreach ($usable_blueprints as $blueprint) {
|
||||
foreach ($blueprints as $blueprint) {
|
||||
$caught = null;
|
||||
try {
|
||||
$resources[] = $this->allocateResource($blueprint, $lease);
|
||||
|
||||
|
@ -237,6 +453,12 @@ final class DrydockLeaseUpdateWorker extends DrydockWorker {
|
|||
// this.
|
||||
break;
|
||||
} catch (Exception $ex) {
|
||||
$caught = $ex;
|
||||
} catch (Throwable $ex) {
|
||||
$caught = $ex;
|
||||
}
|
||||
|
||||
if ($caught) {
|
||||
// This failure is not normally expected, so log it. It can be
|
||||
// caused by something mundane and recoverable, however (see below
|
||||
// for discussion).
|
||||
|
@ -250,18 +472,18 @@ final class DrydockLeaseUpdateWorker extends DrydockWorker {
|
|||
$blueprint->logEvent(
|
||||
DrydockResourceAllocationFailureLogType::LOGCONST,
|
||||
array(
|
||||
'class' => get_class($ex),
|
||||
'message' => $ex->getMessage(),
|
||||
'class' => get_class($caught),
|
||||
'message' => $caught->getMessage(),
|
||||
));
|
||||
|
||||
$exceptions[] = $ex;
|
||||
$exceptions[] = $caught;
|
||||
}
|
||||
}
|
||||
|
||||
if (!$resources) {
|
||||
// If one or more blueprints claimed that they would be able to
|
||||
// allocate resources but none are actually able to allocate resources,
|
||||
// log the failure and yield so we try again soon.
|
||||
// If one or more blueprints claimed that they would be able to allocate
|
||||
// resources but none are actually able to allocate resources, log the
|
||||
// failure and yield so we try again soon.
|
||||
|
||||
// This can happen if some unexpected issue occurs during allocation
|
||||
// (for example, a call to build a VM fails for some reason) or if we
|
||||
|
@ -281,44 +503,45 @@ final class DrydockLeaseUpdateWorker extends DrydockWorker {
|
|||
'message' => $ex->getMessage(),
|
||||
));
|
||||
|
||||
throw new PhabricatorWorkerYieldException(15);
|
||||
return null;
|
||||
}
|
||||
|
||||
$resources = $this->removeUnacquirableResources($resources, $lease);
|
||||
return $resources;
|
||||
}
|
||||
|
||||
|
||||
private function leaseAnyResource(
|
||||
DrydockLease $lease,
|
||||
array $resources) {
|
||||
assert_instances_of($resources, 'DrydockResource');
|
||||
|
||||
if (!$resources) {
|
||||
// If we make it here, we just built a resource but aren't allowed
|
||||
// to acquire it. We expect this during routine operation if the
|
||||
// resource prevents acquisition until it activates. Yield and wait
|
||||
// for activation.
|
||||
throw new PhabricatorWorkerYieldException(15);
|
||||
}
|
||||
|
||||
// NOTE: We have not acquired the lease yet, so it is possible that the
|
||||
// resource we just built will be snatched up by some other lease before
|
||||
// we can acquire it. This is not problematic: we'll retry a little later
|
||||
// and should succeed eventually.
|
||||
return null;
|
||||
}
|
||||
|
||||
$resources = $this->rankResources($resources, $lease);
|
||||
|
||||
$exceptions = array();
|
||||
$yields = array();
|
||||
$allocated = false;
|
||||
|
||||
$allocated = null;
|
||||
foreach ($resources as $resource) {
|
||||
try {
|
||||
$resource = $this->newResourceForAcquisition($resource, $lease);
|
||||
$this->acquireLease($resource, $lease);
|
||||
$allocated = true;
|
||||
$allocated = $resource;
|
||||
break;
|
||||
} catch (DrydockResourceLockException $ex) {
|
||||
// We need to lock the resource to actually acquire it. If we aren't
|
||||
// able to acquire the lock quickly enough, we can yield and try again
|
||||
// later.
|
||||
$yields[] = $ex;
|
||||
} catch (DrydockSlotLockException $ex) {
|
||||
// This also just indicates we ran into some kind of contention,
|
||||
// probably from another lease. Just yield.
|
||||
$yields[] = $ex;
|
||||
} catch (DrydockAcquiredBrokenResourceException $ex) {
|
||||
// If a resource was reclaimed or destroyed by the time we actually
|
||||
// got around to acquiring it, we just got unlucky. We can yield and
|
||||
// try again later.
|
||||
// got around to acquiring it, we just got unlucky.
|
||||
$yields[] = $ex;
|
||||
} catch (PhabricatorWorkerYieldException $ex) {
|
||||
// We can be told to yield, particularly by the supplemental allocator
|
||||
|
@ -329,18 +552,20 @@ final class DrydockLeaseUpdateWorker extends DrydockWorker {
|
|||
}
|
||||
}
|
||||
|
||||
if (!$allocated) {
|
||||
if ($allocated) {
|
||||
return $allocated;
|
||||
}
|
||||
|
||||
if ($yields) {
|
||||
throw new PhabricatorWorkerYieldException(15);
|
||||
} else {
|
||||
}
|
||||
|
||||
throw new PhutilAggregateException(
|
||||
pht(
|
||||
'Unable to acquire lease "%s" on any resource.',
|
||||
$lease->getPHID()),
|
||||
$exceptions);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
|
@ -426,7 +651,7 @@ final class DrydockLeaseUpdateWorker extends DrydockWorker {
|
|||
* the lease.
|
||||
* @task allocator
|
||||
*/
|
||||
private function loadResourcesForAllocatingLease(
|
||||
private function loadAcquirableResourcesForLease(
|
||||
array $blueprints,
|
||||
DrydockLease $lease) {
|
||||
assert_instances_of($blueprints, 'DrydockBlueprint');
|
||||
|
@ -438,7 +663,6 @@ final class DrydockLeaseUpdateWorker extends DrydockWorker {
|
|||
->withTypes(array($lease->getResourceType()))
|
||||
->withStatuses(
|
||||
array(
|
||||
DrydockResourceStatus::STATUS_PENDING,
|
||||
DrydockResourceStatus::STATUS_ACTIVE,
|
||||
))
|
||||
->execute();
|
||||
|
@ -558,6 +782,13 @@ final class DrydockLeaseUpdateWorker extends DrydockWorker {
|
|||
// If this resource was allocated as a pending resource, queue a task to
|
||||
// activate it.
|
||||
if ($resource->getStatus() == DrydockResourceStatus::STATUS_PENDING) {
|
||||
|
||||
$lease->addAllocatedResourcePHIDs(
|
||||
array(
|
||||
$resource->getPHID(),
|
||||
));
|
||||
$lease->save();
|
||||
|
||||
PhabricatorWorker::scheduleTask(
|
||||
'DrydockResourceUpdateWorker',
|
||||
array(
|
||||
|
@ -636,26 +867,6 @@ final class DrydockLeaseUpdateWorker extends DrydockWorker {
|
|||
DrydockLease $lease) {
|
||||
$viewer = $this->getViewer();
|
||||
|
||||
// If this lease is marked as already in the process of reclaiming a
|
||||
// resource, don't let it reclaim another one until the first reclaim
|
||||
// completes. This stops one lease from reclaiming a large number of
|
||||
// resources if the reclaims take a while to complete.
|
||||
$reclaiming_phid = $lease->getAttribute('drydock.reclaimingPHID');
|
||||
if ($reclaiming_phid) {
|
||||
$reclaiming_resource = id(new DrydockResourceQuery())
|
||||
->setViewer($viewer)
|
||||
->withPHIDs(array($reclaiming_phid))
|
||||
->withStatuses(
|
||||
array(
|
||||
DrydockResourceStatus::STATUS_ACTIVE,
|
||||
DrydockResourceStatus::STATUS_RELEASED,
|
||||
))
|
||||
->executeOne();
|
||||
if ($reclaiming_resource) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
$resources = id(new DrydockResourceQuery())
|
||||
->setViewer($viewer)
|
||||
->withBlueprintPHIDs(array($blueprint->getPHID()))
|
||||
|
@ -754,73 +965,6 @@ final class DrydockLeaseUpdateWorker extends DrydockWorker {
|
|||
}
|
||||
}
|
||||
|
||||
private function newResourceForAcquisition(
|
||||
DrydockResource $resource,
|
||||
DrydockLease $lease) {
|
||||
|
||||
// If the resource has no leases against it, never build a new one. This is
|
||||
// likely already a new resource that just activated.
|
||||
$viewer = $this->getViewer();
|
||||
|
||||
$statuses = array(
|
||||
DrydockLeaseStatus::STATUS_PENDING,
|
||||
DrydockLeaseStatus::STATUS_ACQUIRED,
|
||||
DrydockLeaseStatus::STATUS_ACTIVE,
|
||||
);
|
||||
|
||||
$leases = id(new DrydockLeaseQuery())
|
||||
->setViewer($viewer)
|
||||
->withResourcePHIDs(array($resource->getPHID()))
|
||||
->withStatuses($statuses)
|
||||
->setLimit(1)
|
||||
->execute();
|
||||
if (!$leases) {
|
||||
return $resource;
|
||||
}
|
||||
|
||||
// If we're about to get a lease on a resource, check if the blueprint
|
||||
// wants to allocate a supplemental resource. If it does, try to perform a
|
||||
// new allocation instead.
|
||||
$blueprint = $resource->getBlueprint();
|
||||
if (!$blueprint->shouldAllocateSupplementalResource($resource, $lease)) {
|
||||
return $resource;
|
||||
}
|
||||
|
||||
// If the blueprint is already overallocated, we can't allocate a new
|
||||
// resource. Just return the existing resource.
|
||||
$remaining = $this->removeOverallocatedBlueprints(
|
||||
array($blueprint),
|
||||
$lease);
|
||||
if (!$remaining) {
|
||||
return $resource;
|
||||
}
|
||||
|
||||
// Try to build a new resource.
|
||||
try {
|
||||
$new_resource = $this->allocateResource($blueprint, $lease);
|
||||
} catch (Exception $ex) {
|
||||
$blueprint->logEvent(
|
||||
DrydockResourceAllocationFailureLogType::LOGCONST,
|
||||
array(
|
||||
'class' => get_class($ex),
|
||||
'message' => $ex->getMessage(),
|
||||
));
|
||||
|
||||
return $resource;
|
||||
}
|
||||
|
||||
// If we can't actually acquire the new resource yet, just yield.
|
||||
// (We could try to move forward with the original resource instead.)
|
||||
$acquirable = $this->removeUnacquirableResources(
|
||||
array($new_resource),
|
||||
$lease);
|
||||
if (!$acquirable) {
|
||||
throw new PhabricatorWorkerYieldException(15);
|
||||
}
|
||||
|
||||
return $new_resource;
|
||||
}
|
||||
|
||||
|
||||
/* -( Activating Leases )-------------------------------------------------- */
|
||||
|
||||
|
|
|
@ -260,7 +260,7 @@ abstract class DrydockWorker extends PhabricatorWorker {
|
|||
// Mark the lease as reclaiming this resource. It won't be allowed to start
|
||||
// another reclaim as long as this resource is still in the process of
|
||||
// being reclaimed.
|
||||
$lease->setAttribute('drydock.reclaimingPHID', $resource->getPHID());
|
||||
$lease->addReclaimedResourcePHIDs(array($resource->getPHID()));
|
||||
|
||||
// When the resource releases, we we want to reawaken this task since it
|
||||
// should (usually) be able to start building a new resource right away.
|
||||
|
|
Loading…
Reference in a new issue