1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2024-12-22 21:40:55 +01:00

When documents are indexed, record the indexer version (versus the object version) and index epoch

Summary:
Ref T13587. D21495 makes significant changes to the ngram indexer, which might contain bugs.

Make it easier to reindex a subset of documents (based on the date when the index was built, and/or the software version which generated the index).

This is in addition to the existing versioning, which is focused on object versions.

Test Plan: Ran `bin/search index` with various old and new arguments. Spot-checked the `IndexVersion` table.

Subscribers: PHID-OPKG-gm6ozazyms6q6i22gyam

Maniphest Tasks: T13587

Differential Revision: https://secure.phabricator.com/D21560
This commit is contained in:
epriestley 2021-02-16 15:29:17 -08:00
parent 4f647fb6be
commit 6703fec3e2
5 changed files with 280 additions and 86 deletions

View file

@ -0,0 +1,2 @@
-- Add a column recording which version of the indexing code wrote each row
-- (see T13587). The indexer currently writes 12-character markers such as
-- "2021-02-16-A", which is why the column is BINARY(12).
ALTER TABLE {$NAMESPACE}_search.search_indexversion
ADD indexVersion BINARY(12) NOT NULL;

View file

@ -0,0 +1,2 @@
-- Add a column recording the time at which each row was last written by the
-- indexer (see T13587), so documents can be selected for reindexing by the
-- date their index was built.
ALTER TABLE {$NAMESPACE}_search.search_indexversion
ADD indexEpoch INT UNSIGNED NOT NULL;

View file

@ -109,8 +109,10 @@ final class PhabricatorIndexEngine extends Phobject {
$rows = queryfx_all(
$conn_r,
'SELECT * FROM %T WHERE objectPHID = %s AND extensionKey IN (%Ls)',
$table->getTableName(),
'SELECT version, extensionKey
FROM %R
WHERE objectPHID = %s AND extensionKey IN (%Ls)',
$table,
$object_phid,
$extension_keys);
@ -128,22 +130,35 @@ final class PhabricatorIndexEngine extends Phobject {
$table = new PhabricatorSearchIndexVersion();
$conn_w = $table->establishConnection('w');
$now = PhabricatorTime::getNow();
// See T13587. For now, this is just a marker to make it easy to reindex
// documents if some version of the indexing code is later discovered to
// be questionable.
$index_version = '2021-02-16-A';
$sql = array();
foreach ($versions as $key => $version) {
$sql[] = qsprintf(
$conn_w,
'(%s, %s, %s)',
'(%s, %s, %s, %s, %d)',
$object_phid,
$key,
$version);
$version,
$index_version,
$now);
}
queryfx(
$conn_w,
'INSERT INTO %T (objectPHID, extensionKey, version)
'INSERT INTO %R (objectPHID, extensionKey, version,
indexVersion, indexEpoch)
VALUES %LQ
ON DUPLICATE KEY UPDATE version = VALUES(version)',
$table->getTableName(),
ON DUPLICATE KEY UPDATE
version = VALUES(version),
indexVersion = VALUES(indexVersion),
indexEpoch = VALUES(indexEpoch)',
$table,
$sql);
}

View file

@ -8,9 +8,13 @@ final class PhabricatorSearchManagementIndexWorkflow
->setName('index')
->setSynopsis(pht('Build or rebuild search indexes.'))
->setExamples(
"**index** D123\n".
"**index** --type task\n".
"**index** --all")
implode(
"\n",
array(
'**index** D123',
'**index** --all',
'**index** [--type __task__] [--version __version__] ...',
)))
->setArguments(
array(
array(
@ -20,6 +24,7 @@ final class PhabricatorSearchManagementIndexWorkflow
array(
'name' => 'type',
'param' => 'type',
'repeat' => true,
'help' => pht(
'Object types to reindex, like "task", "commit" or "revision".'),
),
@ -37,6 +42,28 @@ final class PhabricatorSearchManagementIndexWorkflow
'Force a complete rebuild of the entire index instead of an '.
'incremental update.'),
),
array(
'name' => 'version',
'param' => 'version',
'repeat' => true,
'help' => pht(
'Reindex objects previously indexed with a particular '.
'version of the indexer.'),
),
array(
'name' => 'min-index-date',
'param' => 'date',
'help' => pht(
'Reindex objects previously indexed on or after a '.
'given date.'),
),
array(
'name' => 'max-index-date',
'param' => 'date',
'help' => pht(
'Reindex objects previously indexed on or before a '.
'given date.'),
),
array(
'name' => 'objects',
'wildcard' => true,
@ -47,37 +74,46 @@ final class PhabricatorSearchManagementIndexWorkflow
public function execute(PhutilArgumentParser $args) {
$this->validateClusterSearchConfig();
$console = PhutilConsole::getConsole();
$is_all = $args->getArg('all');
$is_type = $args->getArg('type');
$is_force = $args->getArg('force');
$obj_names = $args->getArg('objects');
$object_types = $args->getArg('type');
$index_versions = $args->getArg('version');
if ($obj_names && ($is_all || $is_type)) {
$min_epoch = $args->getArg('min-index-date');
if ($min_epoch !== null) {
$min_epoch = $this->parseTimeArgument($min_epoch);
}
$max_epoch = $args->getArg('max-index-date');
if ($max_epoch !== null) {
$max_epoch = $this->parseTimeArgument($max_epoch);
}
$object_names = $args->getArg('objects');
$any_constraints =
($object_names) ||
($object_types) ||
($index_versions) ||
($min_epoch) ||
($max_epoch);
if ($is_all && $any_constraints) {
throw new PhutilArgumentUsageException(
pht(
"You can not name objects to index alongside the '%s' or '%s' flags.",
'--all',
'--type'));
} else if (!$obj_names && !($is_all || $is_type)) {
'You can not use query constraint flags (like "--version", '.
'"--type", or a list of specific objects) with "--all".'));
}
if (!$is_all && !$any_constraints) {
throw new PhutilArgumentUsageException(
pht(
"Provide one of '%s', '%s' or a list of object names.",
'--all',
'--type'));
'Provide a list of objects to index (like "D123"), or a set of '.
'query constraint flags (like "--type"), or "--all" to index '.
'all objects.'));
}
if ($obj_names) {
$phids = $this->loadPHIDsByNames($obj_names);
} else {
$phids = $this->loadPHIDsByTypes($is_type);
}
if (!$phids) {
throw new PhutilArgumentUsageException(pht('Nothing to index!'));
}
if ($args->getArg('background')) {
$is_background = true;
@ -87,21 +123,80 @@ final class PhabricatorSearchManagementIndexWorkflow
}
if (!$is_background) {
echo tsprintf(
"**<bg:blue> %s </bg>** %s\n",
$this->logInfo(
pht('NOTE'),
pht(
'Run this workflow with "%s" to queue tasks for the daemon workers.',
'--background'));
'Run this workflow with "--background" to queue tasks for the '.
'daemon workers.'));
}
$groups = phid_group_by_type($phids);
foreach ($groups as $group_type => $group) {
$console->writeOut(
"%s\n",
pht('Indexing %d object(s) of type %s.', count($group), $group_type));
$this->logInfo(
pht('SELECT'),
pht('Selecting objects to index...'));
$object_phids = null;
if ($object_names) {
$object_phids = $this->loadPHIDsByNames($object_names);
$object_phids = array_fuse($object_phids);
}
$type_phids = null;
if ($is_all || $object_types) {
$object_map = $this->getIndexableObjectsByTypes($object_types);
$type_phids = array();
foreach ($object_map as $object) {
$iterator = new LiskMigrationIterator($object);
foreach ($iterator as $o) {
$type_phids[] = $o->getPHID();
}
}
$type_phids = array_fuse($type_phids);
}
$index_phids = null;
if ($index_versions || $min_epoch || $max_epoch) {
$index_phids = $this->loadPHIDsByIndexConstraints(
$index_versions,
$min_epoch,
$max_epoch);
$index_phids = array_fuse($index_phids);
}
$working_set = null;
$filter_sets = array(
$object_phids,
$type_phids,
$index_phids,
);
foreach ($filter_sets as $filter_set) {
if ($filter_set === null) {
continue;
}
if ($working_set === null) {
$working_set = $filter_set;
continue;
}
$working_set = array_intersect_key($working_set, $filter_set);
}
$phids = array_keys($working_set);
if (!$phids) {
$this->logWarn(
pht('NO OBJECTS'),
pht('No objects selected to index.'));
return 0;
}
$this->logInfo(
pht('INDEXING'),
pht(
'Indexing %s object(s).',
phutil_count($phids)));
$bar = id(new PhutilConsoleProgressBar())
->setTotal(count($phids));
@ -166,8 +261,7 @@ final class PhabricatorSearchManagementIndexWorkflow
if ($track_skips) {
if ($count_updated) {
echo tsprintf(
"**<bg:green> %s </bg>** %s\n",
$this->logOkay(
pht('DONE'),
pht(
'Updated search indexes for %s document(s).',
@ -175,29 +269,25 @@ final class PhabricatorSearchManagementIndexWorkflow
}
if ($count_skipped) {
echo tsprintf(
"**<bg:yellow> %s </bg>** %s\n",
$this->logWarn(
pht('SKIP'),
pht(
'Skipped %s documents(s) which have not updated since they were '.
'last indexed.',
new PhutilNumber($count_skipped)));
echo tsprintf(
"**<bg:blue> %s </bg>** %s\n",
$this->logInfo(
pht('NOTE'),
pht(
'Use "--force" to force the index to update these documents.'));
}
} else if ($is_background) {
echo tsprintf(
"**<bg:green> %s </bg>** %s\n",
$this->logOkay(
pht('DONE'),
pht(
'Queued %s document(s) for background indexing.',
new PhutilNumber(count($phids))));
} else {
echo tsprintf(
"**<bg:green> %s </bg>** %s\n",
$this->logOkay(
pht('DONE'),
pht(
'Forced search index updates for %s document(s).',
@ -224,62 +314,100 @@ final class PhabricatorSearchManagementIndexWorkflow
return mpull($objects, 'getPHID');
}
private function loadPHIDsByTypes($type) {
private function getIndexableObjectsByTypes(array $types) {
$objects = id(new PhutilClassMapQuery())
->setAncestorClass('PhabricatorIndexableInterface')
->execute();
$normalized_type = phutil_utf8_strtolower($type);
$type_map = array();
$normal_map = array();
foreach ($types as $type) {
$normalized_type = phutil_utf8_strtolower($type);
$type_map[$type] = $normalized_type;
$matches = array();
if (isset($normal_map[$normalized_type])) {
$old_type = $normal_map[$normalized_type];
throw new PhutilArgumentUsageException(
pht(
'Type specification "%s" duplicates type specification "%s". '.
'Specify each type only once.',
$type,
$old_type));
}
$normal_map[$normalized_type] = $type;
}
$object_matches = array();
$matches_map = array();
$exact_map = array();
foreach ($objects as $object) {
$object_class = get_class($object);
if (!$types) {
$object_matches[$object_class] = $object;
continue;
}
$normalized_class = phutil_utf8_strtolower($object_class);
if ($normalized_class === $normalized_type) {
$matches = array($object_class => $object);
break;
// If a specified type is exactly the name of this class, match it.
if (isset($normal_map[$normalized_class])) {
$object_matches[$object_class] = $object;
$matching_type = $normal_map[$normalized_class];
$matches_map[$matching_type] = array($object_class);
$exact_map[$matching_type] = true;
continue;
}
if (!strlen($type) ||
strpos($normalized_class, $normalized_type) !== false) {
$matches[$object_class] = $object;
foreach ($type_map as $type => $normalized_type) {
// If we already have an exact match for this type, don't match it
// as a substring. An indexable "MothObject" should be selectable
// exactly without also selecting "MammothObject".
if (isset($exact_map[$type])) {
continue;
}
// If the selector isn't a substring of the class name, continue.
if (strpos($normalized_class, $normalized_type) === false) {
continue;
}
$matches_map[$type][] = $object_class;
$object_matches[$object_class] = $object;
}
}
if (!$matches) {
$all_types = array();
foreach ($objects as $object) {
$all_types[] = get_class($object);
$all_types = array();
foreach ($objects as $object) {
$all_types[] = get_class($object);
}
sort($all_types);
$type_list = implode(', ', $all_types);
foreach ($type_map as $type => $normalized_type) {
$matches = idx($matches_map, $type);
if (!$matches) {
throw new PhutilArgumentUsageException(
pht(
'Type "%s" matches no indexable objects. '.
'Supported types are: %s.',
$type,
$type_list));
}
sort($all_types);
throw new PhutilArgumentUsageException(
pht(
'Type "%s" matches no indexable objects. Supported types are: %s.',
$type,
implode(', ', $all_types)));
}
if ((count($matches) > 1) && strlen($type)) {
throw new PhutilArgumentUsageException(
pht(
'Type "%s" matches multiple indexable objects. Use a more '.
'specific string. Matching object types are: %s.',
$type,
implode(', ', array_keys($matches))));
}
$phids = array();
foreach ($matches as $match) {
$iterator = new LiskMigrationIterator($match);
foreach ($iterator as $object) {
$phids[] = $object->getPHID();
if (count($matches) > 1) {
throw new PhutilArgumentUsageException(
pht(
'Type "%s" matches multiple indexable objects. Use a more '.
'specific string. Matching objects are: %s.',
$type,
implode(', ', $matches)));
}
}
return $phids;
return $object_matches;
}
private function loadIndexVersions($phid) {
@ -294,4 +422,43 @@ final class PhabricatorSearchManagementIndexWorkflow
$phid);
}
/**
 * Select the PHIDs of objects whose index version rows satisfy a set of
 * index constraints.
 *
 * One clause is built for each constraint that was actually supplied, and
 * the clauses are handed to the query through the "%LA" conversion.
 *
 * NOTE(review): "%LA" is expected to join the clauses with AND, and callers
 * are expected to supply at least one constraint. Confirm both against the
 * query formatting code, which is not visible here.
 *
 * @param array $index_versions Values to match against "indexVersion". An
 *   empty list adds no clause.
 * @param mixed $min_date Inclusive lower bound compared against
 *   "indexEpoch" (formatted with "%d"), or null for no lower bound.
 * @param mixed $max_date Inclusive upper bound compared against
 *   "indexEpoch" (formatted with "%d"), or null for no upper bound.
 * @return array The "objectPHID" column of each matching row, as extracted
 *   by ipull().
 */
private function loadPHIDsByIndexConstraints(
  array $index_versions,
  $min_date,
  $max_date) {

  $version_table = new PhabricatorSearchIndexVersion();
  $conn_r = $version_table->establishConnection('r');

  $clauses = array();

  // Restrict to rows written by one of the named indexer versions.
  if (count($index_versions) > 0) {
    $clauses[] = qsprintf(
      $conn_r,
      'indexVersion IN (%Ls)',
      $index_versions);
  }

  // Restrict to rows indexed on or after the lower bound, if one was given.
  if (isset($min_date)) {
    $clauses[] = qsprintf(
      $conn_r,
      'indexEpoch >= %d',
      $min_date);
  }

  // Restrict to rows indexed on or before the upper bound, if one was given.
  if (isset($max_date)) {
    $clauses[] = qsprintf(
      $conn_r,
      'indexEpoch <= %d',
      $max_date);
  }

  // An object has one row per indexing extension, so collapse duplicates
  // in the query itself.
  $matching_rows = queryfx_all(
    $conn_r,
    'SELECT DISTINCT objectPHID FROM %R WHERE %LA',
    $version_table,
    $clauses);

  return ipull($matching_rows, 'objectPHID');
}
}

View file

@ -6,6 +6,8 @@ final class PhabricatorSearchIndexVersion
protected $objectPHID;
protected $extensionKey;
protected $version;
protected $indexVersion;
protected $indexEpoch;
protected function getConfiguration() {
return array(
@ -13,12 +15,18 @@ final class PhabricatorSearchIndexVersion
self::CONFIG_COLUMN_SCHEMA => array(
'extensionKey' => 'text64',
'version' => 'text128',
'indexVersion' => 'bytes12',
'indexEpoch' => 'epoch',
),
self::CONFIG_KEY_SCHEMA => array(
'key_object' => array(
'columns' => array('objectPHID', 'extensionKey'),
'unique' => true,
),
// NOTE: "bin/search index" may query this table by "indexVersion" or
// "indexEpoch", but this is rare and scanning the table seems fine.
),
) + parent::getConfiguration();
}