mirror of
https://we.phorge.it/source/phorge.git
synced 2024-11-25 16:22:43 +01:00
Add a workflow for populating (or depopulating) the common ngrams table
Summary: Depends on D18672. Ref T13000. This does an on-demand build of the common ngrams table.

Plan here is:

- Push to `secure`.
- Build the common ngrams table here.
- See if stuff breaks?

If it looks okay on this dataset, we can build out the GC support and try it in production.

Test Plan:

- Locally, my dataset has a bunch of `bin/lipsum` tasks with similar, common words.
- Verified that ipsum terms now skip ngrams. For "lorem ipsum" search performance actually IMPROVED by skipping the ngrams table (12s to 9s).
- Queried for normal terms, got very fast results using the ngram table, as normal.

Reviewers: amckinley

Reviewed By: amckinley

Maniphest Tasks: T13000

Differential Revision: https://secure.phabricator.com/D18673
This commit is contained in:
parent
1de130c9f5
commit
3e589cdd73
2 changed files with 108 additions and 0 deletions
|
@ -3948,6 +3948,7 @@ phutil_register_library_map(array(
|
|||
'PhabricatorSearchIndexVersionDestructionEngineExtension' => 'applications/search/engineextension/PhabricatorSearchIndexVersionDestructionEngineExtension.php',
|
||||
'PhabricatorSearchManagementIndexWorkflow' => 'applications/search/management/PhabricatorSearchManagementIndexWorkflow.php',
|
||||
'PhabricatorSearchManagementInitWorkflow' => 'applications/search/management/PhabricatorSearchManagementInitWorkflow.php',
|
||||
'PhabricatorSearchManagementNgramsWorkflow' => 'applications/search/management/PhabricatorSearchManagementNgramsWorkflow.php',
|
||||
'PhabricatorSearchManagementWorkflow' => 'applications/search/management/PhabricatorSearchManagementWorkflow.php',
|
||||
'PhabricatorSearchNgrams' => 'applications/search/ngrams/PhabricatorSearchNgrams.php',
|
||||
'PhabricatorSearchNgramsDestructionEngineExtension' => 'applications/search/engineextension/PhabricatorSearchNgramsDestructionEngineExtension.php',
|
||||
|
@ -9528,6 +9529,7 @@ phutil_register_library_map(array(
|
|||
'PhabricatorSearchIndexVersionDestructionEngineExtension' => 'PhabricatorDestructionEngineExtension',
|
||||
'PhabricatorSearchManagementIndexWorkflow' => 'PhabricatorSearchManagementWorkflow',
|
||||
'PhabricatorSearchManagementInitWorkflow' => 'PhabricatorSearchManagementWorkflow',
|
||||
'PhabricatorSearchManagementNgramsWorkflow' => 'PhabricatorSearchManagementWorkflow',
|
||||
'PhabricatorSearchManagementWorkflow' => 'PhabricatorManagementWorkflow',
|
||||
'PhabricatorSearchNgrams' => 'PhabricatorSearchDAO',
|
||||
'PhabricatorSearchNgramsDestructionEngineExtension' => 'PhabricatorDestructionEngineExtension',
|
||||
|
|
|
@ -0,0 +1,106 @@
|
|||
<?php
|
||||
|
||||
/**
 * Management workflow ("ngrams") which populates, or with the "reset"
 * argument empties, the common ngrams table of every class found by
 * querying for descendants of PhabricatorFerretInterface.
 */
final class PhabricatorSearchManagementNgramsWorkflow
  extends PhabricatorSearchManagementWorkflow {

  /**
   * Declare the workflow name, synopsis, and its single "reset" argument.
   */
  protected function didConstruct() {
    $reset_argument = array(
      'name' => 'reset',
      'help' => pht('Reset all common ngram records.'),
    );

    $this
      ->setName('ngrams')
      ->setSynopsis(pht('Recompute common ngrams.'))
      ->setArguments(array($reset_argument));
  }

  /**
   * Walk every Ferret object type and either clear its common ngrams table
   * (when "reset" was given) or add newly common ngrams to it.
   *
   * @param PhutilArgumentParser Parsed command line arguments.
   * @return void
   */
  public function execute(PhutilArgumentParser $args) {
    $should_reset = $args->getArg('reset');

    $ferret_objects = id(new PhutilClassMapQuery())
      ->setAncestorClass('PhabricatorFerretInterface')
      ->execute();

    // A type with fewer documents than this is skipped entirely.
    $min_documents = 4096;

    // Fraction of a type's document count used to derive the minimum row
    // count at which an ngram is treated as common.
    $threshold = 0.15;

    foreach ($ferret_objects as $ferret_object) {
      $engine = $ferret_object->newFerretEngine();
      $conn = $ferret_object->establishConnection('w');
      $type_name = get_class($ferret_object);

      if (!$should_reset) {
        $outcome = $this->updateCommonNgrams(
          $conn,
          $engine,
          $type_name,
          $min_documents,
          $threshold);

        $this->writeLine($outcome);
        continue;
      }

      // The reset message is written before the rows are deleted.
      $this->writeLine(
        pht(
          'Resetting common ngrams for "%s".',
          $type_name));

      queryfx(
        $conn,
        'DELETE FROM %T',
        $engine->getCommonNgramsTableName());
    }
  }

  /**
   * Find ngrams which meet the frequency cutoff for one object type and
   * insert them into that type's common ngrams table.
   *
   * @param object Write connection for the object type.
   * @param object Ferret engine which supplies the table names.
   * @param string Class name used in the status message.
   * @param int Minimum number of documents required to do any work.
   * @param float Fraction of the document count used as the cutoff.
   * @return string Translated status message describing the outcome.
   */
  private function updateCommonNgrams(
    $conn,
    $engine,
    $type_name,
    $min_documents,
    $threshold) {

    $count_row = queryfx_one(
      $conn,
      'SELECT COUNT(*) N FROM %T',
      $engine->getDocumentTableName());
    $total_documents = $count_row['N'];

    if ($total_documents < $min_documents) {
      return pht(
        'Too few documents of type "%s" for any ngrams to be common.',
        $type_name);
    }

    $cutoff = (int)ceil($total_documents * $threshold);

    $frequent_rows = queryfx_all(
      $conn,
      'SELECT ngram, COUNT(*) N FROM %T
        GROUP BY ngram
        HAVING N >= %d',
      $engine->getNgramsTableName(),
      $cutoff);

    if (!$frequent_rows) {
      return pht(
        'No new common ngrams exist for "%s".',
        $type_name);
    }

    // One "(ngram, 1)" tuple per common ngram; the second column is
    // written to "needsCollection".
    $tuples = array();
    foreach ($frequent_rows as $row) {
      $tuples[] = qsprintf(
        $conn,
        '(%s, 1)',
        $row['ngram']);
    }

    // INSERT IGNORE is used so ngrams which already have a row do not
    // cause the insert to fail.
    foreach (PhabricatorLiskDAO::chunkSQL($tuples) as $batch) {
      queryfx(
        $conn,
        'INSERT IGNORE INTO %T (ngram, needsCollection)
          VALUES %Q',
        $engine->getCommonNgramsTableName(),
        $batch);
    }

    return pht(
      'Updated common ngrams for "%s".',
      $type_name);
  }

  /**
   * Echo one already-translated message followed by a newline.
   *
   * @param string Message to print.
   * @return void
   */
  private function writeLine($message) {
    echo tsprintf(
      "%s\n",
      $message);
  }

}
|
Loading…
Reference in a new issue