From 22c64c67ffca605885a4ae00a325a1ab45e28139 Mon Sep 17 00:00:00 2001 From: epriestley Date: Tue, 22 Jan 2013 12:27:18 -0800 Subject: [PATCH] Fix performance problem for large task queues Summary: Some time ago, we added `ORDER BY id ASC` to the worker `UPDATE ...` query, because someone reported that their MySQL read slaves were complaining about the query (I can't find the exact error message, but something to the effect of the rows the query affected not being deterministic). This seemed harmless since it should be the same as the query's implicit order (I guess?), but actually made the query dramatically slower for large numbers of rows. On my local machine, this query takes about 2 seconds with ~1M rows. If I run `SELECT`, or run `UPDATE` without ORDER BY, the query takes < 0.01s. I don't understand exactly what's happening -- my guess is something to do with the ORDER BY implying that a lot of rows need to be locked? In T2372, a user is seeing 20-60s rumtimes on this query. I solved this by doing a SELECT, followed by an UPDATE. Each query runs quickly. This introduces the possibility of a race (two processes SELECT the same rows, then try to UPDATE), which we currently recover from by having the second UPDATE fail and then having that daemon try again 1 second later. This seems generally reasonable. Some alternatives I considered: - We could SELECT ... LOCK FOR UPDATE, but failing and retrying a little later seems at least as good as blocking. - We could select more rows than we need, and then try to lock some of them randomly. I think this would work well, but it's a bit more complex than what we're doing now so I left it until we have a clearer need. Test Plan: Inserted ~1M tasks into the queue. Ran `phd debug taskmaster`, saw ~2s task updates. Applied patch. Ran `phd debug taskmaster`, saw <1ms updates. Ran `phd launch 8 taskmaster`, saw rapid completion of tasks. This stuff also has fairly thorough unit test coverage. Reviewers: vrana, btrahan Reviewed By: vrana CC: aran Maniphest Tasks: T2372 Differential Revision: https://secure.phabricator.com/D4576 --- .../query/PhabricatorWorkerLeaseQuery.php | 82 +++++++++++++++++-- 1 file changed, 73 insertions(+), 9 deletions(-) diff --git a/src/infrastructure/daemon/workers/query/PhabricatorWorkerLeaseQuery.php b/src/infrastructure/daemon/workers/query/PhabricatorWorkerLeaseQuery.php index 8cbf4eaf8b..d4996e3e35 100644 --- a/src/infrastructure/daemon/workers/query/PhabricatorWorkerLeaseQuery.php +++ b/src/infrastructure/daemon/workers/query/PhabricatorWorkerLeaseQuery.php @@ -48,21 +48,43 @@ final class PhabricatorWorkerLeaseQuery extends PhabricatorQuery { $leased = 0; foreach ($phases as $phase) { - queryfx( + + // NOTE: If we issue `UPDATE ... WHERE ... ORDER BY id ASC`, the query + // goes very, very slowly. The `ORDER BY` triggers this, although we get + // the same apparent results without it. Without the ORDER BY, binary + // read slaves complain that the query isn't repeatable. To avoid both + // problems, do a SELECT and then an UPDATE. + + $rows = queryfx_all( $conn_w, - 'UPDATE %T task - SET leaseOwner = %s, leaseExpires = UNIX_TIMESTAMP() + %d - %Q %Q %Q', + 'SELECT id, leaseOwner FROM %T %Q %Q %Q', $task_table->getTableName(), - $lease_ownership_name, - self::DEFAULT_LEASE_DURATION, $this->buildWhereClause($conn_w, $phase), $this->buildOrderClause($conn_w), $this->buildLimitClause($conn_w, $limit - $leased)); - $leased += $conn_w->getAffectedRows(); - if ($leased == $limit) { - break; + // NOTE: Sometimes, we'll race with another worker and they'll grab + // this task before we do. We could reduce how often this happens by + // selecting more tasks than we need, then shuffling them and trying + // to lock only the number we're actually after. However, the amount + // of time workers spend here should be very small relative to their + // total runtime, so keep it simple for the moment. + + if ($rows) { + queryfx( + $conn_w, + 'UPDATE %T task + SET leaseOwner = %s, leaseExpires = UNIX_TIMESTAMP() + %d + %Q', + $task_table->getTableName(), + $lease_ownership_name, + self::DEFAULT_LEASE_DURATION, + $this->buildUpdateWhereClause($conn_w, $phase, $rows)); + + $leased += $conn_w->getAffectedRows(); + if ($leased == $limit) { + break; + } } } @@ -124,6 +146,48 @@ final class PhabricatorWorkerLeaseQuery extends PhabricatorQuery { return $this->formatWhereClause($where); } + private function buildUpdateWhereClause( + AphrontDatabaseConnection $conn_w, + $phase, + array $rows) { + + $where = array(); + + // NOTE: This is basically working around the MySQL behavior that + // `IN (NULL)` doesn't match NULL. + + switch ($phase) { + case self::PHASE_UNLEASED: + $where[] = qsprintf( + $conn_w, + 'leaseOwner IS NULL'); + $where[] = qsprintf( + $conn_w, + 'id IN (%Ld)', + ipull($rows, 'id')); + break; + case self::PHASE_EXPIRED: + $in = array(); + foreach ($rows as $row) { + $in[] = qsprintf( + $conn_w, + '(%d, %s)', + $row['id'], + $row['leaseOwner']); + } + $where[] = qsprintf( + $conn_w, + '(id, leaseOwner) IN (%Q)', + '('.implode(', ', $in).')'); + break; + default: + throw new Exception("Unknown phase '{$phase}'!"); + } + + return $this->formatWhereClause($where); + + } + private function buildOrderClause(AphrontDatabaseConnection $conn_w) { return qsprintf($conn_w, 'ORDER BY id ASC'); }