mirror of
https://we.phorge.it/source/phorge.git
synced 2024-11-29 10:12:41 +01:00
Add a workflow for populating (or depopulating) the common ngrams table
Summary: Depends on D18672. Ref T13000. This does an on-demand build of the common ngrams table.

Plan here is:

- Push to `secure`.
- Build the common ngrams table here.
- See if stuff breaks?

If it looks okay on this dataset, we can build out the GC support and try it in production.

Test Plan:

- Locally, my dataset has a bunch of `bin/lipsum` tasks with similar, common words.
- Verified that ipsum terms now skip ngrams. For "lorem ipsum" search performance actually IMPROVED by skipping the ngrams table (12s to 9s).
- Queried for normal terms, got very fast results using the ngram table, as normal.

Reviewers: amckinley

Reviewed By: amckinley

Maniphest Tasks: T13000

Differential Revision: https://secure.phabricator.com/D18673
This commit is contained in:
parent
1de130c9f5
commit
3e589cdd73
2 changed files with 108 additions and 0 deletions
|
@ -3948,6 +3948,7 @@ phutil_register_library_map(array(
|
||||||
'PhabricatorSearchIndexVersionDestructionEngineExtension' => 'applications/search/engineextension/PhabricatorSearchIndexVersionDestructionEngineExtension.php',
|
'PhabricatorSearchIndexVersionDestructionEngineExtension' => 'applications/search/engineextension/PhabricatorSearchIndexVersionDestructionEngineExtension.php',
|
||||||
'PhabricatorSearchManagementIndexWorkflow' => 'applications/search/management/PhabricatorSearchManagementIndexWorkflow.php',
|
'PhabricatorSearchManagementIndexWorkflow' => 'applications/search/management/PhabricatorSearchManagementIndexWorkflow.php',
|
||||||
'PhabricatorSearchManagementInitWorkflow' => 'applications/search/management/PhabricatorSearchManagementInitWorkflow.php',
|
'PhabricatorSearchManagementInitWorkflow' => 'applications/search/management/PhabricatorSearchManagementInitWorkflow.php',
|
||||||
|
'PhabricatorSearchManagementNgramsWorkflow' => 'applications/search/management/PhabricatorSearchManagementNgramsWorkflow.php',
|
||||||
'PhabricatorSearchManagementWorkflow' => 'applications/search/management/PhabricatorSearchManagementWorkflow.php',
|
'PhabricatorSearchManagementWorkflow' => 'applications/search/management/PhabricatorSearchManagementWorkflow.php',
|
||||||
'PhabricatorSearchNgrams' => 'applications/search/ngrams/PhabricatorSearchNgrams.php',
|
'PhabricatorSearchNgrams' => 'applications/search/ngrams/PhabricatorSearchNgrams.php',
|
||||||
'PhabricatorSearchNgramsDestructionEngineExtension' => 'applications/search/engineextension/PhabricatorSearchNgramsDestructionEngineExtension.php',
|
'PhabricatorSearchNgramsDestructionEngineExtension' => 'applications/search/engineextension/PhabricatorSearchNgramsDestructionEngineExtension.php',
|
||||||
|
@ -9528,6 +9529,7 @@ phutil_register_library_map(array(
|
||||||
'PhabricatorSearchIndexVersionDestructionEngineExtension' => 'PhabricatorDestructionEngineExtension',
|
'PhabricatorSearchIndexVersionDestructionEngineExtension' => 'PhabricatorDestructionEngineExtension',
|
||||||
'PhabricatorSearchManagementIndexWorkflow' => 'PhabricatorSearchManagementWorkflow',
|
'PhabricatorSearchManagementIndexWorkflow' => 'PhabricatorSearchManagementWorkflow',
|
||||||
'PhabricatorSearchManagementInitWorkflow' => 'PhabricatorSearchManagementWorkflow',
|
'PhabricatorSearchManagementInitWorkflow' => 'PhabricatorSearchManagementWorkflow',
|
||||||
|
'PhabricatorSearchManagementNgramsWorkflow' => 'PhabricatorSearchManagementWorkflow',
|
||||||
'PhabricatorSearchManagementWorkflow' => 'PhabricatorManagementWorkflow',
|
'PhabricatorSearchManagementWorkflow' => 'PhabricatorManagementWorkflow',
|
||||||
'PhabricatorSearchNgrams' => 'PhabricatorSearchDAO',
|
'PhabricatorSearchNgrams' => 'PhabricatorSearchDAO',
|
||||||
'PhabricatorSearchNgramsDestructionEngineExtension' => 'PhabricatorDestructionEngineExtension',
|
'PhabricatorSearchNgramsDestructionEngineExtension' => 'PhabricatorDestructionEngineExtension',
|
||||||
|
|
|
@ -0,0 +1,106 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
/**
 * Search management workflow named "ngrams".
 *
 * For every class discovered through `PhabricatorFerretInterface`, this
 * either clears that class's common ngrams table (when the `reset` flag is
 * given) or inserts each ngram which appears often enough in the ngrams table
 * into the common ngrams table.
 */
final class PhabricatorSearchManagementNgramsWorkflow
  extends PhabricatorSearchManagementWorkflow {

  /**
   * Declare the workflow name, synopsis, and its single `reset` flag.
   */
  protected function didConstruct() {
    $reset_flag = array(
      'name' => 'reset',
      'help' => pht('Reset all common ngram records.'),
    );

    $this
      ->setName('ngrams')
      ->setSynopsis(pht('Recompute common ngrams.'))
      ->setArguments(array($reset_flag));
  }

  /**
   * Reset or rebuild the common ngrams table of every Ferret object type.
   *
   * @param PhutilArgumentParser $args Parsed arguments; only `reset` is read.
   * @return void
   */
  public function execute(PhutilArgumentParser $args) {
    $should_reset = $args->getArg('reset');

    // Types with fewer documents than this are skipped entirely.
    $document_floor = 4096;

    // Fraction of the document count an ngram's row count must reach before
    // the ngram is recorded as common.
    $common_ratio = 0.15;

    $ferret_objects = id(new PhutilClassMapQuery())
      ->setAncestorClass('PhabricatorFerretInterface')
      ->execute();

    foreach ($ferret_objects as $ferret_object) {
      $ferret_engine = $ferret_object->newFerretEngine();
      $conn_w = $ferret_object->establishConnection('w');
      $class_name = get_class($ferret_object);

      if ($should_reset) {
        $message = pht(
          'Resetting common ngrams for "%s".',
          $class_name);
        echo tsprintf("%s\n", $message);

        queryfx(
          $conn_w,
          'DELETE FROM %T',
          $ferret_engine->getCommonNgramsTableName());

        continue;
      }

      $count_row = queryfx_one(
        $conn_w,
        'SELECT COUNT(*) N FROM %T',
        $ferret_engine->getDocumentTableName());
      $total_documents = $count_row['N'];

      if ($total_documents < $document_floor) {
        $message = pht(
          'Too few documents of type "%s" for any ngrams to be common.',
          $class_name);
        echo tsprintf("%s\n", $message);

        continue;
      }

      // Minimum number of ngram table rows an ngram needs to count as common.
      $required_hits = (int)ceil($total_documents * $common_ratio);

      $frequent_rows = queryfx_all(
        $conn_w,
        'SELECT ngram, COUNT(*) N FROM %T
          GROUP BY ngram
          HAVING N >= %d',
        $ferret_engine->getNgramsTableName(),
        $required_hits);

      if (!$frequent_rows) {
        $message = pht(
          'No new common ngrams exist for "%s".',
          $class_name);
        echo tsprintf("%s\n", $message);

        continue;
      }

      // Each common ngram is written with needsCollection set to 1.
      $value_tuples = array();
      foreach ($frequent_rows as $frequent_row) {
        $value_tuples[] = qsprintf(
          $conn_w,
          '(%s, 1)',
          $frequent_row['ngram']);
      }

      // INSERT IGNORE leaves ngrams which are already recorded untouched.
      $tuple_chunks = PhabricatorLiskDAO::chunkSQL($value_tuples);
      foreach ($tuple_chunks as $tuple_chunk) {
        queryfx(
          $conn_w,
          'INSERT IGNORE INTO %T (ngram, needsCollection)
            VALUES %Q',
          $ferret_engine->getCommonNgramsTableName(),
          $tuple_chunk);
      }

      $message = pht(
        'Updated common ngrams for "%s".',
        $class_name);
      echo tsprintf("%s\n", $message);
    }
  }

}
|
Loading…
Reference in a new issue