mirror of
https://we.phorge.it/source/phorge.git
synced 2025-01-05 12:21:02 +01:00
(stable) When repositories hit pull errors, stop updating them as frequently
Summary: Ref T11665. Currently, when a repository hits an error, we retry it after 15s. This is correct if the error was temporary/transient/config-related (e.g., bad network or administrator setting up credentials) but not so great if the error is long-lasting (completely bad authentication, invalid URI, etc), as it can pile up to a meaningful amount of unnecessary load over time. Instead, record how many times in a row we've hit an error and adjust backoff behavior: first error is 15s, then 30s, 45s, etc. Additionally, when computing the backoff for an empty repository, use the repository creation time as though it was the most recent commit. This is a good proxy which gives us reasonable backoff behavior. This required removing the `CODE_WORKING` messages, since they would have reset the error count. We could restore them (as a different type of message), but I think they aren't particularly useful since cloning usually doesn't take too long and there's more status information avilable now than there was when this stuff was written. Test Plan: - Ran `bin/phd debug pull`. - Saw sensible, increasing backoffs selected for repositories with errors. - Saw sensible backoffs selected for empty repositories. Reviewers: chad Maniphest Tasks: T11665 Differential Revision: https://secure.phabricator.com/D16575
This commit is contained in:
parent
554940b33f
commit
894d0dc51b
5 changed files with 73 additions and 37 deletions
2
resources/sql/autopatches/20160919.repo.messagecount.sql
Normal file
2
resources/sql/autopatches/20160919.repo.messagecount.sql
Normal file
|
@ -0,0 +1,2 @@
|
|||
ALTER TABLE {$NAMESPACE}_repository.repository_statusmessage
|
||||
ADD messageCount INT UNSIGNED NOT NULL;
|
|
@ -359,20 +359,13 @@ final class DiffusionRepositoryStatusManagementPanel
|
|||
return $view;
|
||||
}
|
||||
break;
|
||||
case PhabricatorRepositoryStatusMessage::CODE_WORKING:
|
||||
default:
|
||||
$view->addItem(
|
||||
id(new PHUIStatusItemView())
|
||||
->setIcon(PHUIStatusItemView::ICON_CLOCK, 'green')
|
||||
->setTarget(pht('Initializing Working Copy'))
|
||||
->setNote(pht('Daemons are initializing the working copy.')));
|
||||
return $view;
|
||||
default:
|
||||
$view->addItem(
|
||||
id(new PHUIStatusItemView())
|
||||
->setIcon(PHUIStatusItemView::ICON_WARNING, 'red')
|
||||
->setTarget(pht('Unknown Init Status'))
|
||||
->setNote($message->getStatusCode()));
|
||||
return $view;
|
||||
}
|
||||
} else {
|
||||
$view->addItem(
|
||||
|
|
|
@ -172,8 +172,6 @@ final class PhabricatorRepositoryPullEngine
|
|||
}
|
||||
|
||||
private function logPull($message) {
|
||||
$code_working = PhabricatorRepositoryStatusMessage::CODE_WORKING;
|
||||
$this->updateRepositoryInitStatus($code_working, $message);
|
||||
$this->log('%s', $message);
|
||||
}
|
||||
|
||||
|
|
|
@ -1649,21 +1649,33 @@ final class PhabricatorRepository extends PhabricatorRepositoryDAO
|
|||
$this->getID(),
|
||||
$status_type);
|
||||
} else {
|
||||
// If the existing message has the same code (e.g., we just hit an
|
||||
// error and also previously hit an error) we increment the message
|
||||
// count by 1. This allows us to determine how many times in a row
|
||||
// we've run into an error.
|
||||
|
||||
queryfx(
|
||||
$conn_w,
|
||||
'INSERT INTO %T
|
||||
(repositoryID, statusType, statusCode, parameters, epoch)
|
||||
VALUES (%d, %s, %s, %s, %d)
|
||||
(repositoryID, statusType, statusCode, parameters, epoch,
|
||||
messageCount)
|
||||
VALUES (%d, %s, %s, %s, %d, %d)
|
||||
ON DUPLICATE KEY UPDATE
|
||||
statusCode = VALUES(statusCode),
|
||||
parameters = VALUES(parameters),
|
||||
epoch = VALUES(epoch)',
|
||||
epoch = VALUES(epoch),
|
||||
messageCount =
|
||||
IF(
|
||||
statusCode = VALUES(statusCode),
|
||||
messageCount + 1,
|
||||
VALUES(messageCount))',
|
||||
$table_name,
|
||||
$this->getID(),
|
||||
$status_type,
|
||||
$status_code,
|
||||
json_encode($parameters),
|
||||
time());
|
||||
time(),
|
||||
1);
|
||||
}
|
||||
|
||||
return $this;
|
||||
|
@ -1738,6 +1750,33 @@ final class PhabricatorRepository extends PhabricatorRepositoryDAO
|
|||
* @return int Repository update interval, in seconds.
|
||||
*/
|
||||
public function loadUpdateInterval($minimum = 15) {
|
||||
// First, check if we've hit errors recently. If we have, wait one period
|
||||
// for each consecutive error. Normally, this corresponds to a backoff of
|
||||
// 15s, 30s, 45s, etc.
|
||||
|
||||
$message_table = new PhabricatorRepositoryStatusMessage();
|
||||
$conn = $message_table->establishConnection('r');
|
||||
$error_count = queryfx_one(
|
||||
$conn,
|
||||
'SELECT MAX(messageCount) error_count FROM %T
|
||||
WHERE repositoryID = %d
|
||||
AND statusType IN (%Ls)
|
||||
AND statusCode IN (%Ls)',
|
||||
$message_table->getTableName(),
|
||||
$this->getID(),
|
||||
array(
|
||||
PhabricatorRepositoryStatusMessage::TYPE_INIT,
|
||||
PhabricatorRepositoryStatusMessage::TYPE_FETCH,
|
||||
),
|
||||
array(
|
||||
PhabricatorRepositoryStatusMessage::CODE_ERROR,
|
||||
));
|
||||
|
||||
$error_count = (int)$error_count['error_count'];
|
||||
if ($error_count > 0) {
|
||||
return (int)($minimum * $error_count);
|
||||
}
|
||||
|
||||
// If a repository is still importing, always pull it as frequently as
|
||||
// possible. This prevents us from hanging for a long time at 99.9% when
|
||||
// importing an inactive repository.
|
||||
|
@ -1758,31 +1797,34 @@ final class PhabricatorRepository extends PhabricatorRepositoryDAO
|
|||
$window_start);
|
||||
if ($last_commit) {
|
||||
$time_since_commit = ($window_start - $last_commit['epoch']);
|
||||
|
||||
$last_few_days = phutil_units('3 days in seconds');
|
||||
|
||||
if ($time_since_commit <= $last_few_days) {
|
||||
// For repositories with activity in the recent past, we wait one
|
||||
// extra second for every 10 minutes since the last commit. This
|
||||
// shorter backoff is intended to handle weekends and other short
|
||||
// breaks from development.
|
||||
$smart_wait = ($time_since_commit / 600);
|
||||
} else {
|
||||
// For repositories without recent activity, we wait one extra second
|
||||
// for every 4 minutes since the last commit. This longer backoff
|
||||
// handles rarely used repositories, up to the maximum.
|
||||
$smart_wait = ($time_since_commit / 240);
|
||||
}
|
||||
|
||||
// We'll never wait more than 6 hours to pull a repository.
|
||||
$longest_wait = phutil_units('6 hours in seconds');
|
||||
$smart_wait = min($smart_wait, $longest_wait);
|
||||
|
||||
$smart_wait = max($minimum, $smart_wait);
|
||||
} else {
|
||||
$smart_wait = $minimum;
|
||||
// If the repository has no commits, treat the creation date as
|
||||
// though it were the date of the last commit. This makes empty
|
||||
// repositories update quickly at first but slow down over time
|
||||
// if they don't see any activity.
|
||||
$time_since_commit = ($window_start - $this->getDateCreated());
|
||||
}
|
||||
|
||||
$last_few_days = phutil_units('3 days in seconds');
|
||||
|
||||
if ($time_since_commit <= $last_few_days) {
|
||||
// For repositories with activity in the recent past, we wait one
|
||||
// extra second for every 10 minutes since the last commit. This
|
||||
// shorter backoff is intended to handle weekends and other short
|
||||
// breaks from development.
|
||||
$smart_wait = ($time_since_commit / 600);
|
||||
} else {
|
||||
// For repositories without recent activity, we wait one extra second
|
||||
// for every 4 minutes since the last commit. This longer backoff
|
||||
// handles rarely used repositories, up to the maximum.
|
||||
$smart_wait = ($time_since_commit / 240);
|
||||
}
|
||||
|
||||
// We'll never wait more than 6 hours to pull a repository.
|
||||
$longest_wait = phutil_units('6 hours in seconds');
|
||||
$smart_wait = min($smart_wait, $longest_wait);
|
||||
$smart_wait = max($minimum, $smart_wait);
|
||||
|
||||
return (int)$smart_wait;
|
||||
}
|
||||
|
||||
|
|
|
@ -8,7 +8,6 @@ final class PhabricatorRepositoryStatusMessage
|
|||
const TYPE_NEEDS_UPDATE = 'needs-update';
|
||||
|
||||
const CODE_ERROR = 'error';
|
||||
const CODE_WORKING = 'working';
|
||||
const CODE_OKAY = 'okay';
|
||||
|
||||
protected $repositoryID;
|
||||
|
@ -16,6 +15,7 @@ final class PhabricatorRepositoryStatusMessage
|
|||
protected $statusCode;
|
||||
protected $parameters = array();
|
||||
protected $epoch;
|
||||
protected $messageCount;
|
||||
|
||||
protected function getConfiguration() {
|
||||
return array(
|
||||
|
@ -26,6 +26,7 @@ final class PhabricatorRepositoryStatusMessage
|
|||
self::CONFIG_COLUMN_SCHEMA => array(
|
||||
'statusType' => 'text32',
|
||||
'statusCode' => 'text32',
|
||||
'messageCount' => 'uint32',
|
||||
),
|
||||
self::CONFIG_KEY_SCHEMA => array(
|
||||
'repositoryID' => array(
|
||||
|
|
Loading…
Reference in a new issue