mirror of
https://we.phorge.it/source/phorge.git
synced 2024-12-22 21:40:55 +01:00
When documents are indexed, record the indexer version (versus the object version) and index epoch
Summary: Ref T13587. D21495 makes significant changes to the ngram indexer, which might contain bugs. Make it easier to reindex a subset of documents (based on the date when the index was built and/or the software version which generated the index). This is in addition to the existing versioning, which is focused on object versions. Test Plan: Ran `bin/search index` with various old and new arguments. Spot-checked the `IndexVersion` table. Subscribers: PHID-OPKG-gm6ozazyms6q6i22gyam Maniphest Tasks: T13587 Differential Revision: https://secure.phabricator.com/D21560
This commit is contained in:
parent
4f647fb6be
commit
6703fec3e2
5 changed files with 280 additions and 86 deletions
2
resources/sql/autopatches/20210216.index.01.version.sql
Normal file
2
resources/sql/autopatches/20210216.index.01.version.sql
Normal file
|
@ -0,0 +1,2 @@
|
|||
ALTER TABLE {$NAMESPACE}_search.search_indexversion
|
||||
ADD indexVersion BINARY(12) NOT NULL;
|
2
resources/sql/autopatches/20210216.index.02.epoch.sql
Normal file
2
resources/sql/autopatches/20210216.index.02.epoch.sql
Normal file
|
@ -0,0 +1,2 @@
|
|||
ALTER TABLE {$NAMESPACE}_search.search_indexversion
|
||||
ADD indexEpoch INT UNSIGNED NOT NULL;
|
|
@ -109,8 +109,10 @@ final class PhabricatorIndexEngine extends Phobject {
|
|||
|
||||
$rows = queryfx_all(
|
||||
$conn_r,
|
||||
'SELECT * FROM %T WHERE objectPHID = %s AND extensionKey IN (%Ls)',
|
||||
$table->getTableName(),
|
||||
'SELECT version, extensionKey
|
||||
FROM %R
|
||||
WHERE objectPHID = %s AND extensionKey IN (%Ls)',
|
||||
$table,
|
||||
$object_phid,
|
||||
$extension_keys);
|
||||
|
||||
|
@ -128,22 +130,35 @@ final class PhabricatorIndexEngine extends Phobject {
|
|||
$table = new PhabricatorSearchIndexVersion();
|
||||
$conn_w = $table->establishConnection('w');
|
||||
|
||||
$now = PhabricatorTime::getNow();
|
||||
|
||||
// See T13587. For now, this is just a marker to make it easy to reindex
|
||||
// documents if some version of the indexing code is later discovered to
|
||||
// be questionable.
|
||||
$index_version = '2021-02-16-A';
|
||||
|
||||
$sql = array();
|
||||
foreach ($versions as $key => $version) {
|
||||
$sql[] = qsprintf(
|
||||
$conn_w,
|
||||
'(%s, %s, %s)',
|
||||
'(%s, %s, %s, %s, %d)',
|
||||
$object_phid,
|
||||
$key,
|
||||
$version);
|
||||
$version,
|
||||
$index_version,
|
||||
$now);
|
||||
}
|
||||
|
||||
queryfx(
|
||||
$conn_w,
|
||||
'INSERT INTO %T (objectPHID, extensionKey, version)
|
||||
'INSERT INTO %R (objectPHID, extensionKey, version,
|
||||
indexVersion, indexEpoch)
|
||||
VALUES %LQ
|
||||
ON DUPLICATE KEY UPDATE version = VALUES(version)',
|
||||
$table->getTableName(),
|
||||
ON DUPLICATE KEY UPDATE
|
||||
version = VALUES(version),
|
||||
indexVersion = VALUES(indexVersion),
|
||||
indexEpoch = VALUES(indexEpoch)',
|
||||
$table,
|
||||
$sql);
|
||||
}
|
||||
|
||||
|
|
|
@ -8,9 +8,13 @@ final class PhabricatorSearchManagementIndexWorkflow
|
|||
->setName('index')
|
||||
->setSynopsis(pht('Build or rebuild search indexes.'))
|
||||
->setExamples(
|
||||
"**index** D123\n".
|
||||
"**index** --type task\n".
|
||||
"**index** --all")
|
||||
implode(
|
||||
"\n",
|
||||
array(
|
||||
'**index** D123',
|
||||
'**index** --all',
|
||||
'**index** [--type __task__] [--version __version__] ...',
|
||||
)))
|
||||
->setArguments(
|
||||
array(
|
||||
array(
|
||||
|
@ -20,6 +24,7 @@ final class PhabricatorSearchManagementIndexWorkflow
|
|||
array(
|
||||
'name' => 'type',
|
||||
'param' => 'type',
|
||||
'repeat' => true,
|
||||
'help' => pht(
|
||||
'Object types to reindex, like "task", "commit" or "revision".'),
|
||||
),
|
||||
|
@ -37,6 +42,28 @@ final class PhabricatorSearchManagementIndexWorkflow
|
|||
'Force a complete rebuild of the entire index instead of an '.
|
||||
'incremental update.'),
|
||||
),
|
||||
array(
|
||||
'name' => 'version',
|
||||
'param' => 'version',
|
||||
'repeat' => true,
|
||||
'help' => pht(
|
||||
'Reindex objects previously indexed with a particular '.
|
||||
'version of the indexer.'),
|
||||
),
|
||||
array(
|
||||
'name' => 'min-index-date',
|
||||
'param' => 'date',
|
||||
'help' => pht(
|
||||
'Reindex objects previously indexed on or after a '.
|
||||
'given date.'),
|
||||
),
|
||||
array(
|
||||
'name' => 'max-index-date',
|
||||
'param' => 'date',
|
||||
'help' => pht(
|
||||
'Reindex objects previously indexed on or before a '.
|
||||
'given date.'),
|
||||
),
|
||||
array(
|
||||
'name' => 'objects',
|
||||
'wildcard' => true,
|
||||
|
@ -47,37 +74,46 @@ final class PhabricatorSearchManagementIndexWorkflow
|
|||
public function execute(PhutilArgumentParser $args) {
|
||||
$this->validateClusterSearchConfig();
|
||||
|
||||
$console = PhutilConsole::getConsole();
|
||||
|
||||
$is_all = $args->getArg('all');
|
||||
$is_type = $args->getArg('type');
|
||||
$is_force = $args->getArg('force');
|
||||
|
||||
$obj_names = $args->getArg('objects');
|
||||
$object_types = $args->getArg('type');
|
||||
$index_versions = $args->getArg('version');
|
||||
|
||||
if ($obj_names && ($is_all || $is_type)) {
|
||||
$min_epoch = $args->getArg('min-index-date');
|
||||
if ($min_epoch !== null) {
|
||||
$min_epoch = $this->parseTimeArgument($min_epoch);
|
||||
}
|
||||
|
||||
$max_epoch = $args->getArg('max-index-date');
|
||||
if ($max_epoch !== null) {
|
||||
$max_epoch = $this->parseTimeArgument($max_epoch);
|
||||
}
|
||||
|
||||
$object_names = $args->getArg('objects');
|
||||
|
||||
$any_constraints =
|
||||
($object_names) ||
|
||||
($object_types) ||
|
||||
($index_versions) ||
|
||||
($min_epoch) ||
|
||||
($max_epoch);
|
||||
|
||||
if ($is_all && $any_constraints) {
|
||||
throw new PhutilArgumentUsageException(
|
||||
pht(
|
||||
"You can not name objects to index alongside the '%s' or '%s' flags.",
|
||||
'--all',
|
||||
'--type'));
|
||||
} else if (!$obj_names && !($is_all || $is_type)) {
|
||||
'You can not use query constraint flags (like "--version", '.
|
||||
'"--type", or a list of specific objects) with "--all".'));
|
||||
}
|
||||
|
||||
if (!$is_all && !$any_constraints) {
|
||||
throw new PhutilArgumentUsageException(
|
||||
pht(
|
||||
"Provide one of '%s', '%s' or a list of object names.",
|
||||
'--all',
|
||||
'--type'));
|
||||
'Provide a list of objects to index (like "D123"), or a set of '.
|
||||
'query constraint flags (like "--type"), or "--all" to index '.
|
||||
'all objects.'));
|
||||
}
|
||||
|
||||
if ($obj_names) {
|
||||
$phids = $this->loadPHIDsByNames($obj_names);
|
||||
} else {
|
||||
$phids = $this->loadPHIDsByTypes($is_type);
|
||||
}
|
||||
|
||||
if (!$phids) {
|
||||
throw new PhutilArgumentUsageException(pht('Nothing to index!'));
|
||||
}
|
||||
|
||||
if ($args->getArg('background')) {
|
||||
$is_background = true;
|
||||
|
@ -87,21 +123,80 @@ final class PhabricatorSearchManagementIndexWorkflow
|
|||
}
|
||||
|
||||
if (!$is_background) {
|
||||
echo tsprintf(
|
||||
"**<bg:blue> %s </bg>** %s\n",
|
||||
$this->logInfo(
|
||||
pht('NOTE'),
|
||||
pht(
|
||||
'Run this workflow with "%s" to queue tasks for the daemon workers.',
|
||||
'--background'));
|
||||
'Run this workflow with "--background" to queue tasks for the '.
|
||||
'daemon workers.'));
|
||||
}
|
||||
|
||||
$groups = phid_group_by_type($phids);
|
||||
foreach ($groups as $group_type => $group) {
|
||||
$console->writeOut(
|
||||
"%s\n",
|
||||
pht('Indexing %d object(s) of type %s.', count($group), $group_type));
|
||||
$this->logInfo(
|
||||
pht('SELECT'),
|
||||
pht('Selecting objects to index...'));
|
||||
|
||||
$object_phids = null;
|
||||
if ($object_names) {
|
||||
$object_phids = $this->loadPHIDsByNames($object_names);
|
||||
$object_phids = array_fuse($object_phids);
|
||||
}
|
||||
|
||||
$type_phids = null;
|
||||
if ($is_all || $object_types) {
|
||||
$object_map = $this->getIndexableObjectsByTypes($object_types);
|
||||
$type_phids = array();
|
||||
foreach ($object_map as $object) {
|
||||
$iterator = new LiskMigrationIterator($object);
|
||||
foreach ($iterator as $o) {
|
||||
$type_phids[] = $o->getPHID();
|
||||
}
|
||||
}
|
||||
$type_phids = array_fuse($type_phids);
|
||||
}
|
||||
|
||||
$index_phids = null;
|
||||
if ($index_versions || $min_epoch || $max_epoch) {
|
||||
$index_phids = $this->loadPHIDsByIndexConstraints(
|
||||
$index_versions,
|
||||
$min_epoch,
|
||||
$max_epoch);
|
||||
$index_phids = array_fuse($index_phids);
|
||||
}
|
||||
|
||||
$working_set = null;
|
||||
$filter_sets = array(
|
||||
$object_phids,
|
||||
$type_phids,
|
||||
$index_phids,
|
||||
);
|
||||
|
||||
foreach ($filter_sets as $filter_set) {
|
||||
if ($filter_set === null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ($working_set === null) {
|
||||
$working_set = $filter_set;
|
||||
continue;
|
||||
}
|
||||
|
||||
$working_set = array_intersect_key($working_set, $filter_set);
|
||||
}
|
||||
|
||||
$phids = array_keys($working_set);
|
||||
|
||||
if (!$phids) {
|
||||
$this->logWarn(
|
||||
pht('NO OBJECTS'),
|
||||
pht('No objects selected to index.'));
|
||||
return 0;
|
||||
}
|
||||
|
||||
$this->logInfo(
|
||||
pht('INDEXING'),
|
||||
pht(
|
||||
'Indexing %s object(s).',
|
||||
phutil_count($phids)));
|
||||
|
||||
$bar = id(new PhutilConsoleProgressBar())
|
||||
->setTotal(count($phids));
|
||||
|
||||
|
@ -166,8 +261,7 @@ final class PhabricatorSearchManagementIndexWorkflow
|
|||
|
||||
if ($track_skips) {
|
||||
if ($count_updated) {
|
||||
echo tsprintf(
|
||||
"**<bg:green> %s </bg>** %s\n",
|
||||
$this->logOkay(
|
||||
pht('DONE'),
|
||||
pht(
|
||||
'Updated search indexes for %s document(s).',
|
||||
|
@ -175,29 +269,25 @@ final class PhabricatorSearchManagementIndexWorkflow
|
|||
}
|
||||
|
||||
if ($count_skipped) {
|
||||
echo tsprintf(
|
||||
"**<bg:yellow> %s </bg>** %s\n",
|
||||
$this->logWarn(
|
||||
pht('SKIP'),
|
||||
pht(
|
||||
'Skipped %s documents(s) which have not updated since they were '.
|
||||
'last indexed.',
|
||||
new PhutilNumber($count_skipped)));
|
||||
echo tsprintf(
|
||||
"**<bg:blue> %s </bg>** %s\n",
|
||||
$this->logInfo(
|
||||
pht('NOTE'),
|
||||
pht(
|
||||
'Use "--force" to force the index to update these documents.'));
|
||||
}
|
||||
} else if ($is_background) {
|
||||
echo tsprintf(
|
||||
"**<bg:green> %s </bg>** %s\n",
|
||||
$this->logOkay(
|
||||
pht('DONE'),
|
||||
pht(
|
||||
'Queued %s document(s) for background indexing.',
|
||||
new PhutilNumber(count($phids))));
|
||||
} else {
|
||||
echo tsprintf(
|
||||
"**<bg:green> %s </bg>** %s\n",
|
||||
$this->logOkay(
|
||||
pht('DONE'),
|
||||
pht(
|
||||
'Forced search index updates for %s document(s).',
|
||||
|
@ -224,62 +314,100 @@ final class PhabricatorSearchManagementIndexWorkflow
|
|||
return mpull($objects, 'getPHID');
|
||||
}
|
||||
|
||||
private function loadPHIDsByTypes($type) {
|
||||
private function getIndexableObjectsByTypes(array $types) {
|
||||
$objects = id(new PhutilClassMapQuery())
|
||||
->setAncestorClass('PhabricatorIndexableInterface')
|
||||
->execute();
|
||||
|
||||
$normalized_type = phutil_utf8_strtolower($type);
|
||||
$type_map = array();
|
||||
$normal_map = array();
|
||||
foreach ($types as $type) {
|
||||
$normalized_type = phutil_utf8_strtolower($type);
|
||||
$type_map[$type] = $normalized_type;
|
||||
|
||||
$matches = array();
|
||||
if (isset($normal_map[$normalized_type])) {
|
||||
$old_type = $normal_map[$normalized_type];
|
||||
throw new PhutilArgumentUsageException(
|
||||
pht(
|
||||
'Type specification "%s" duplicates type specification "%s". '.
|
||||
'Specify each type only once.',
|
||||
$type,
|
||||
$old_type));
|
||||
}
|
||||
|
||||
$normal_map[$normalized_type] = $type;
|
||||
}
|
||||
|
||||
$object_matches = array();
|
||||
|
||||
$matches_map = array();
|
||||
$exact_map = array();
|
||||
foreach ($objects as $object) {
|
||||
$object_class = get_class($object);
|
||||
|
||||
if (!$types) {
|
||||
$object_matches[$object_class] = $object;
|
||||
continue;
|
||||
}
|
||||
|
||||
$normalized_class = phutil_utf8_strtolower($object_class);
|
||||
|
||||
if ($normalized_class === $normalized_type) {
|
||||
$matches = array($object_class => $object);
|
||||
break;
|
||||
// If a specified type is exactly the name of this class, match it.
|
||||
if (isset($normal_map[$normalized_class])) {
|
||||
$object_matches[$object_class] = $object;
|
||||
$matching_type = $normal_map[$normalized_class];
|
||||
$matches_map[$matching_type] = array($object_class);
|
||||
$exact_map[$matching_type] = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!strlen($type) ||
|
||||
strpos($normalized_class, $normalized_type) !== false) {
|
||||
$matches[$object_class] = $object;
|
||||
foreach ($type_map as $type => $normalized_type) {
|
||||
// If we already have an exact match for this type, don't match it
|
||||
// as a substring. An indexable "MothObject" should be selectable
|
||||
// exactly without also selecting "MammothObject".
|
||||
if (isset($exact_map[$type])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// If the selector isn't a substring of the class name, continue.
|
||||
if (strpos($normalized_class, $normalized_type) === false) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$matches_map[$type][] = $object_class;
|
||||
$object_matches[$object_class] = $object;
|
||||
}
|
||||
}
|
||||
|
||||
if (!$matches) {
|
||||
$all_types = array();
|
||||
foreach ($objects as $object) {
|
||||
$all_types[] = get_class($object);
|
||||
$all_types = array();
|
||||
foreach ($objects as $object) {
|
||||
$all_types[] = get_class($object);
|
||||
}
|
||||
sort($all_types);
|
||||
$type_list = implode(', ', $all_types);
|
||||
|
||||
foreach ($type_map as $type => $normalized_type) {
|
||||
$matches = idx($matches_map, $type);
|
||||
if (!$matches) {
|
||||
throw new PhutilArgumentUsageException(
|
||||
pht(
|
||||
'Type "%s" matches no indexable objects. '.
|
||||
'Supported types are: %s.',
|
||||
$type,
|
||||
$type_list));
|
||||
}
|
||||
sort($all_types);
|
||||
|
||||
throw new PhutilArgumentUsageException(
|
||||
pht(
|
||||
'Type "%s" matches no indexable objects. Supported types are: %s.',
|
||||
$type,
|
||||
implode(', ', $all_types)));
|
||||
}
|
||||
|
||||
if ((count($matches) > 1) && strlen($type)) {
|
||||
throw new PhutilArgumentUsageException(
|
||||
pht(
|
||||
'Type "%s" matches multiple indexable objects. Use a more '.
|
||||
'specific string. Matching object types are: %s.',
|
||||
$type,
|
||||
implode(', ', array_keys($matches))));
|
||||
}
|
||||
|
||||
$phids = array();
|
||||
foreach ($matches as $match) {
|
||||
$iterator = new LiskMigrationIterator($match);
|
||||
foreach ($iterator as $object) {
|
||||
$phids[] = $object->getPHID();
|
||||
if (count($matches) > 1) {
|
||||
throw new PhutilArgumentUsageException(
|
||||
pht(
|
||||
'Type "%s" matches multiple indexable objects. Use a more '.
|
||||
'specific string. Matching objects are: %s.',
|
||||
$type,
|
||||
implode(', ', $matches)));
|
||||
}
|
||||
}
|
||||
|
||||
return $phids;
|
||||
return $object_matches;
|
||||
}
|
||||
|
||||
private function loadIndexVersions($phid) {
|
||||
|
@ -294,4 +422,43 @@ final class PhabricatorSearchManagementIndexWorkflow
|
|||
$phid);
|
||||
}
|
||||
|
||||
private function loadPHIDsByIndexConstraints(
|
||||
array $index_versions,
|
||||
$min_date,
|
||||
$max_date) {
|
||||
|
||||
$table = new PhabricatorSearchIndexVersion();
|
||||
$conn = $table->establishConnection('r');
|
||||
|
||||
$where = array();
|
||||
if ($index_versions) {
|
||||
$where[] = qsprintf(
|
||||
$conn,
|
||||
'indexVersion IN (%Ls)',
|
||||
$index_versions);
|
||||
}
|
||||
|
||||
if ($min_date !== null) {
|
||||
$where[] = qsprintf(
|
||||
$conn,
|
||||
'indexEpoch >= %d',
|
||||
$min_date);
|
||||
}
|
||||
|
||||
if ($max_date !== null) {
|
||||
$where[] = qsprintf(
|
||||
$conn,
|
||||
'indexEpoch <= %d',
|
||||
$max_date);
|
||||
}
|
||||
|
||||
$rows = queryfx_all(
|
||||
$conn,
|
||||
'SELECT DISTINCT objectPHID FROM %R WHERE %LA',
|
||||
$table,
|
||||
$where);
|
||||
|
||||
return ipull($rows, 'objectPHID');
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -6,6 +6,8 @@ final class PhabricatorSearchIndexVersion
|
|||
protected $objectPHID;
|
||||
protected $extensionKey;
|
||||
protected $version;
|
||||
protected $indexVersion;
|
||||
protected $indexEpoch;
|
||||
|
||||
protected function getConfiguration() {
|
||||
return array(
|
||||
|
@ -13,12 +15,18 @@ final class PhabricatorSearchIndexVersion
|
|||
self::CONFIG_COLUMN_SCHEMA => array(
|
||||
'extensionKey' => 'text64',
|
||||
'version' => 'text128',
|
||||
'indexVersion' => 'bytes12',
|
||||
'indexEpoch' => 'epoch',
|
||||
),
|
||||
self::CONFIG_KEY_SCHEMA => array(
|
||||
'key_object' => array(
|
||||
'columns' => array('objectPHID', 'extensionKey'),
|
||||
'unique' => true,
|
||||
),
|
||||
|
||||
// NOTE: "bin/search index" may query this table by "indexVersion" or
|
||||
// "indexEpoch", but this is rare and scanning the table seems fine.
|
||||
|
||||
),
|
||||
) + parent::getConfiguration();
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue