1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2024-11-26 08:42:41 +01:00

When documents are indexed, record the indexer version (versus the object version) and index epoch

Summary:
Ref T13587. D21495 has significant changes to the ngram indexer, which might contain bugs.

Make it easier to reindex a subset of documents (based on the date when the index was built, and/or the software version which generated the index).

This is in addition to the existing versioning, which is focused on object versions.

Test Plan: Ran `bin/search index` with various old and new arguments. Spot-checked the `IndexVersion` table.

Subscribers: PHID-OPKG-gm6ozazyms6q6i22gyam

Maniphest Tasks: T13587

Differential Revision: https://secure.phabricator.com/D21560
This commit is contained in:
epriestley 2021-02-16 15:29:17 -08:00
parent 4f647fb6be
commit 6703fec3e2
5 changed files with 280 additions and 86 deletions

View file

@ -0,0 +1,2 @@
ALTER TABLE {$NAMESPACE}_search.search_indexversion
ADD indexVersion BINARY(12) NOT NULL;

View file

@ -0,0 +1,2 @@
ALTER TABLE {$NAMESPACE}_search.search_indexversion
ADD indexEpoch INT UNSIGNED NOT NULL;

View file

@ -109,8 +109,10 @@ final class PhabricatorIndexEngine extends Phobject {
$rows = queryfx_all( $rows = queryfx_all(
$conn_r, $conn_r,
'SELECT * FROM %T WHERE objectPHID = %s AND extensionKey IN (%Ls)', 'SELECT version, extensionKey
$table->getTableName(), FROM %R
WHERE objectPHID = %s AND extensionKey IN (%Ls)',
$table,
$object_phid, $object_phid,
$extension_keys); $extension_keys);
@ -128,22 +130,35 @@ final class PhabricatorIndexEngine extends Phobject {
$table = new PhabricatorSearchIndexVersion(); $table = new PhabricatorSearchIndexVersion();
$conn_w = $table->establishConnection('w'); $conn_w = $table->establishConnection('w');
$now = PhabricatorTime::getNow();
// See T13587. For now, this is just a marker to make it easy to reindex
// documents if some version of the indexing code is later discovered to
// be questionable.
$index_version = '2021-02-16-A';
$sql = array(); $sql = array();
foreach ($versions as $key => $version) { foreach ($versions as $key => $version) {
$sql[] = qsprintf( $sql[] = qsprintf(
$conn_w, $conn_w,
'(%s, %s, %s)', '(%s, %s, %s, %s, %d)',
$object_phid, $object_phid,
$key, $key,
$version); $version,
$index_version,
$now);
} }
queryfx( queryfx(
$conn_w, $conn_w,
'INSERT INTO %T (objectPHID, extensionKey, version) 'INSERT INTO %R (objectPHID, extensionKey, version,
indexVersion, indexEpoch)
VALUES %LQ VALUES %LQ
ON DUPLICATE KEY UPDATE version = VALUES(version)', ON DUPLICATE KEY UPDATE
$table->getTableName(), version = VALUES(version),
indexVersion = VALUES(indexVersion),
indexEpoch = VALUES(indexEpoch)',
$table,
$sql); $sql);
} }

View file

@ -8,9 +8,13 @@ final class PhabricatorSearchManagementIndexWorkflow
->setName('index') ->setName('index')
->setSynopsis(pht('Build or rebuild search indexes.')) ->setSynopsis(pht('Build or rebuild search indexes.'))
->setExamples( ->setExamples(
"**index** D123\n". implode(
"**index** --type task\n". "\n",
"**index** --all") array(
'**index** D123',
'**index** --all',
'**index** [--type __task__] [--version __version__] ...',
)))
->setArguments( ->setArguments(
array( array(
array( array(
@ -20,6 +24,7 @@ final class PhabricatorSearchManagementIndexWorkflow
array( array(
'name' => 'type', 'name' => 'type',
'param' => 'type', 'param' => 'type',
'repeat' => true,
'help' => pht( 'help' => pht(
'Object types to reindex, like "task", "commit" or "revision".'), 'Object types to reindex, like "task", "commit" or "revision".'),
), ),
@ -37,6 +42,28 @@ final class PhabricatorSearchManagementIndexWorkflow
'Force a complete rebuild of the entire index instead of an '. 'Force a complete rebuild of the entire index instead of an '.
'incremental update.'), 'incremental update.'),
), ),
array(
'name' => 'version',
'param' => 'version',
'repeat' => true,
'help' => pht(
'Reindex objects previously indexed with a particular '.
'version of the indexer.'),
),
array(
'name' => 'min-index-date',
'param' => 'date',
'help' => pht(
'Reindex objects previously indexed on or after a '.
'given date.'),
),
array(
'name' => 'max-index-date',
'param' => 'date',
'help' => pht(
'Reindex objects previously indexed on or before a '.
'given date.'),
),
array( array(
'name' => 'objects', 'name' => 'objects',
'wildcard' => true, 'wildcard' => true,
@ -47,37 +74,46 @@ final class PhabricatorSearchManagementIndexWorkflow
public function execute(PhutilArgumentParser $args) { public function execute(PhutilArgumentParser $args) {
$this->validateClusterSearchConfig(); $this->validateClusterSearchConfig();
$console = PhutilConsole::getConsole();
$is_all = $args->getArg('all'); $is_all = $args->getArg('all');
$is_type = $args->getArg('type');
$is_force = $args->getArg('force'); $is_force = $args->getArg('force');
$obj_names = $args->getArg('objects'); $object_types = $args->getArg('type');
$index_versions = $args->getArg('version');
if ($obj_names && ($is_all || $is_type)) { $min_epoch = $args->getArg('min-index-date');
if ($min_epoch !== null) {
$min_epoch = $this->parseTimeArgument($min_epoch);
}
$max_epoch = $args->getArg('max-index-date');
if ($max_epoch !== null) {
$max_epoch = $this->parseTimeArgument($max_epoch);
}
$object_names = $args->getArg('objects');
$any_constraints =
($object_names) ||
($object_types) ||
($index_versions) ||
($min_epoch) ||
($max_epoch);
if ($is_all && $any_constraints) {
throw new PhutilArgumentUsageException( throw new PhutilArgumentUsageException(
pht( pht(
"You can not name objects to index alongside the '%s' or '%s' flags.", 'You can not use query constraint flags (like "--version", '.
'--all', '"--type", or a list of specific objects) with "--all".'));
'--type')); }
} else if (!$obj_names && !($is_all || $is_type)) {
if (!$is_all && !$any_constraints) {
throw new PhutilArgumentUsageException( throw new PhutilArgumentUsageException(
pht( pht(
"Provide one of '%s', '%s' or a list of object names.", 'Provide a list of objects to index (like "D123"), or a set of '.
'--all', 'query constraint flags (like "--type"), or "--all" to index '.
'--type')); 'all objects.'));
} }
if ($obj_names) {
$phids = $this->loadPHIDsByNames($obj_names);
} else {
$phids = $this->loadPHIDsByTypes($is_type);
}
if (!$phids) {
throw new PhutilArgumentUsageException(pht('Nothing to index!'));
}
if ($args->getArg('background')) { if ($args->getArg('background')) {
$is_background = true; $is_background = true;
@ -87,21 +123,80 @@ final class PhabricatorSearchManagementIndexWorkflow
} }
if (!$is_background) { if (!$is_background) {
echo tsprintf( $this->logInfo(
"**<bg:blue> %s </bg>** %s\n",
pht('NOTE'), pht('NOTE'),
pht( pht(
'Run this workflow with "%s" to queue tasks for the daemon workers.', 'Run this workflow with "--background" to queue tasks for the '.
'--background')); 'daemon workers.'));
} }
$groups = phid_group_by_type($phids); $this->logInfo(
foreach ($groups as $group_type => $group) { pht('SELECT'),
$console->writeOut( pht('Selecting objects to index...'));
"%s\n",
pht('Indexing %d object(s) of type %s.', count($group), $group_type)); $object_phids = null;
if ($object_names) {
$object_phids = $this->loadPHIDsByNames($object_names);
$object_phids = array_fuse($object_phids);
} }
$type_phids = null;
if ($is_all || $object_types) {
$object_map = $this->getIndexableObjectsByTypes($object_types);
$type_phids = array();
foreach ($object_map as $object) {
$iterator = new LiskMigrationIterator($object);
foreach ($iterator as $o) {
$type_phids[] = $o->getPHID();
}
}
$type_phids = array_fuse($type_phids);
}
$index_phids = null;
if ($index_versions || $min_epoch || $max_epoch) {
$index_phids = $this->loadPHIDsByIndexConstraints(
$index_versions,
$min_epoch,
$max_epoch);
$index_phids = array_fuse($index_phids);
}
$working_set = null;
$filter_sets = array(
$object_phids,
$type_phids,
$index_phids,
);
foreach ($filter_sets as $filter_set) {
if ($filter_set === null) {
continue;
}
if ($working_set === null) {
$working_set = $filter_set;
continue;
}
$working_set = array_intersect_key($working_set, $filter_set);
}
$phids = array_keys($working_set);
if (!$phids) {
$this->logWarn(
pht('NO OBJECTS'),
pht('No objects selected to index.'));
return 0;
}
$this->logInfo(
pht('INDEXING'),
pht(
'Indexing %s object(s).',
phutil_count($phids)));
$bar = id(new PhutilConsoleProgressBar()) $bar = id(new PhutilConsoleProgressBar())
->setTotal(count($phids)); ->setTotal(count($phids));
@ -166,8 +261,7 @@ final class PhabricatorSearchManagementIndexWorkflow
if ($track_skips) { if ($track_skips) {
if ($count_updated) { if ($count_updated) {
echo tsprintf( $this->logOkay(
"**<bg:green> %s </bg>** %s\n",
pht('DONE'), pht('DONE'),
pht( pht(
'Updated search indexes for %s document(s).', 'Updated search indexes for %s document(s).',
@ -175,29 +269,25 @@ final class PhabricatorSearchManagementIndexWorkflow
} }
if ($count_skipped) { if ($count_skipped) {
echo tsprintf( $this->logWarn(
"**<bg:yellow> %s </bg>** %s\n",
pht('SKIP'), pht('SKIP'),
pht( pht(
'Skipped %s documents(s) which have not updated since they were '. 'Skipped %s documents(s) which have not updated since they were '.
'last indexed.', 'last indexed.',
new PhutilNumber($count_skipped))); new PhutilNumber($count_skipped)));
echo tsprintf( $this->logInfo(
"**<bg:blue> %s </bg>** %s\n",
pht('NOTE'), pht('NOTE'),
pht( pht(
'Use "--force" to force the index to update these documents.')); 'Use "--force" to force the index to update these documents.'));
} }
} else if ($is_background) { } else if ($is_background) {
echo tsprintf( $this->logOkay(
"**<bg:green> %s </bg>** %s\n",
pht('DONE'), pht('DONE'),
pht( pht(
'Queued %s document(s) for background indexing.', 'Queued %s document(s) for background indexing.',
new PhutilNumber(count($phids)))); new PhutilNumber(count($phids))));
} else { } else {
echo tsprintf( $this->logOkay(
"**<bg:green> %s </bg>** %s\n",
pht('DONE'), pht('DONE'),
pht( pht(
'Forced search index updates for %s document(s).', 'Forced search index updates for %s document(s).',
@ -224,62 +314,100 @@ final class PhabricatorSearchManagementIndexWorkflow
return mpull($objects, 'getPHID'); return mpull($objects, 'getPHID');
} }
private function loadPHIDsByTypes($type) { private function getIndexableObjectsByTypes(array $types) {
$objects = id(new PhutilClassMapQuery()) $objects = id(new PhutilClassMapQuery())
->setAncestorClass('PhabricatorIndexableInterface') ->setAncestorClass('PhabricatorIndexableInterface')
->execute(); ->execute();
$type_map = array();
$normal_map = array();
foreach ($types as $type) {
$normalized_type = phutil_utf8_strtolower($type); $normalized_type = phutil_utf8_strtolower($type);
$type_map[$type] = $normalized_type;
$matches = array(); if (isset($normal_map[$normalized_type])) {
$old_type = $normal_map[$normalized_type];
throw new PhutilArgumentUsageException(
pht(
'Type specification "%s" duplicates type specification "%s". '.
'Specify each type only once.',
$type,
$old_type));
}
$normal_map[$normalized_type] = $type;
}
$object_matches = array();
$matches_map = array();
$exact_map = array();
foreach ($objects as $object) { foreach ($objects as $object) {
$object_class = get_class($object); $object_class = get_class($object);
if (!$types) {
$object_matches[$object_class] = $object;
continue;
}
$normalized_class = phutil_utf8_strtolower($object_class); $normalized_class = phutil_utf8_strtolower($object_class);
if ($normalized_class === $normalized_type) { // If a specified type is exactly the name of this class, match it.
$matches = array($object_class => $object); if (isset($normal_map[$normalized_class])) {
break; $object_matches[$object_class] = $object;
$matching_type = $normal_map[$normalized_class];
$matches_map[$matching_type] = array($object_class);
$exact_map[$matching_type] = true;
continue;
} }
if (!strlen($type) || foreach ($type_map as $type => $normalized_type) {
strpos($normalized_class, $normalized_type) !== false) { // If we already have an exact match for this type, don't match it
$matches[$object_class] = $object; // as a substring. An indexable "MothObject" should be selectable
// exactly without also selecting "MammothObject".
if (isset($exact_map[$type])) {
continue;
}
// If the selector isn't a substring of the class name, continue.
if (strpos($normalized_class, $normalized_type) === false) {
continue;
}
$matches_map[$type][] = $object_class;
$object_matches[$object_class] = $object;
} }
} }
if (!$matches) {
$all_types = array(); $all_types = array();
foreach ($objects as $object) { foreach ($objects as $object) {
$all_types[] = get_class($object); $all_types[] = get_class($object);
} }
sort($all_types); sort($all_types);
$type_list = implode(', ', $all_types);
foreach ($type_map as $type => $normalized_type) {
$matches = idx($matches_map, $type);
if (!$matches) {
throw new PhutilArgumentUsageException( throw new PhutilArgumentUsageException(
pht( pht(
'Type "%s" matches no indexable objects. Supported types are: %s.', 'Type "%s" matches no indexable objects. '.
'Supported types are: %s.',
$type, $type,
implode(', ', $all_types))); $type_list));
} }
if ((count($matches) > 1) && strlen($type)) { if (count($matches) > 1) {
throw new PhutilArgumentUsageException( throw new PhutilArgumentUsageException(
pht( pht(
'Type "%s" matches multiple indexable objects. Use a more '. 'Type "%s" matches multiple indexable objects. Use a more '.
'specific string. Matching object types are: %s.', 'specific string. Matching objects are: %s.',
$type, $type,
implode(', ', array_keys($matches)))); implode(', ', $matches)));
}
$phids = array();
foreach ($matches as $match) {
$iterator = new LiskMigrationIterator($match);
foreach ($iterator as $object) {
$phids[] = $object->getPHID();
} }
} }
return $phids; return $object_matches;
} }
private function loadIndexVersions($phid) { private function loadIndexVersions($phid) {
@ -294,4 +422,43 @@ final class PhabricatorSearchManagementIndexWorkflow
$phid); $phid);
} }
/**
 * Load the PHIDs of objects whose index version rows match a set of
 * constraints.
 *
 * Each constraint which is present contributes one clause to a query against
 * the PhabricatorSearchIndexVersion table. The distinct "objectPHID" values
 * of the matching rows are returned.
 *
 * @param list<string> Values to match against the "indexVersion" column. An
 *   empty list adds no clause.
 * @param int|null Inclusive lower bound for the "indexEpoch" column
 *   (formatted with "%d"), or null for no lower bound.
 * @param int|null Inclusive upper bound for the "indexEpoch" column
 *   (formatted with "%d"), or null for no upper bound.
 * @return list<string> The "objectPHID" value of each selected row.
 */
private function loadPHIDsByIndexConstraints(
array $index_versions,
$min_date,
$max_date) {
$table = new PhabricatorSearchIndexVersion();
$conn = $table->establishConnection('r');
// Collect one clause per constraint which was actually provided.
$where = array();
if ($index_versions) {
$where[] = qsprintf(
$conn,
'indexVersion IN (%Ls)',
$index_versions);
}
// The date bounds are tested against null (not truthiness), so a bound of
// "0" still produces a clause here.
if ($min_date !== null) {
$where[] = qsprintf(
$conn,
'indexEpoch >= %d',
$min_date);
}
if ($max_date !== null) {
$where[] = qsprintf(
$conn,
'indexEpoch <= %d',
$max_date);
}
// NOTE(review): "%LA" presumably joins the clauses with AND and may not
// accept an empty list -- confirm. If no constraint is provided, "$where"
// is empty when it reaches this query. The visible caller in execute() only
// invokes this method when at least one constraint is set.
$rows = queryfx_all(
$conn,
'SELECT DISTINCT objectPHID FROM %R WHERE %LA',
$table,
$where);
return ipull($rows, 'objectPHID');
}
} }

View file

@ -6,6 +6,8 @@ final class PhabricatorSearchIndexVersion
protected $objectPHID; protected $objectPHID;
protected $extensionKey; protected $extensionKey;
protected $version; protected $version;
protected $indexVersion;
protected $indexEpoch;
protected function getConfiguration() { protected function getConfiguration() {
return array( return array(
@ -13,12 +15,18 @@ final class PhabricatorSearchIndexVersion
self::CONFIG_COLUMN_SCHEMA => array( self::CONFIG_COLUMN_SCHEMA => array(
'extensionKey' => 'text64', 'extensionKey' => 'text64',
'version' => 'text128', 'version' => 'text128',
'indexVersion' => 'bytes12',
'indexEpoch' => 'epoch',
), ),
self::CONFIG_KEY_SCHEMA => array( self::CONFIG_KEY_SCHEMA => array(
'key_object' => array( 'key_object' => array(
'columns' => array('objectPHID', 'extensionKey'), 'columns' => array('objectPHID', 'extensionKey'),
'unique' => true, 'unique' => true,
), ),
// NOTE: "bin/search index" may query this table by "indexVersion" or
// "indexEpoch", but this is rare and scanning the table seems fine.
), ),
) + parent::getConfiguration(); ) + parent::getConfiguration();
} }