1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2024-11-14 02:42:40 +01:00

Add a workflow for populating (or depopulating) the common ngrams table

Summary:
Depends on D18672. Ref T13000. This does an on-demand build of the common ngrams table.

Plan here is:

  - Push to `secure`.
  - Build the common ngrams table here.
  - See if stuff breaks?

If it looks okay on this dataset, we can build out the GC support and try it in production.

Test Plan:
  - Locally, my dataset has a bunch of `bin/lipsum` tasks with similar, common words.
  - Verified that ipsum terms now skip ngrams. For "lorem ipsum", search performance actually IMPROVED by skipping the ngrams table (12s to 9s).
  - Queried for normal terms, got very fast results using the ngram table, as normal.

Reviewers: amckinley

Reviewed By: amckinley

Maniphest Tasks: T13000

Differential Revision: https://secure.phabricator.com/D18673
This commit is contained in:
epriestley 2017-10-02 16:14:46 -07:00
parent 1de130c9f5
commit 3e589cdd73
2 changed files with 108 additions and 0 deletions

View file

@ -3948,6 +3948,7 @@ phutil_register_library_map(array(
'PhabricatorSearchIndexVersionDestructionEngineExtension' => 'applications/search/engineextension/PhabricatorSearchIndexVersionDestructionEngineExtension.php', 'PhabricatorSearchIndexVersionDestructionEngineExtension' => 'applications/search/engineextension/PhabricatorSearchIndexVersionDestructionEngineExtension.php',
'PhabricatorSearchManagementIndexWorkflow' => 'applications/search/management/PhabricatorSearchManagementIndexWorkflow.php', 'PhabricatorSearchManagementIndexWorkflow' => 'applications/search/management/PhabricatorSearchManagementIndexWorkflow.php',
'PhabricatorSearchManagementInitWorkflow' => 'applications/search/management/PhabricatorSearchManagementInitWorkflow.php', 'PhabricatorSearchManagementInitWorkflow' => 'applications/search/management/PhabricatorSearchManagementInitWorkflow.php',
'PhabricatorSearchManagementNgramsWorkflow' => 'applications/search/management/PhabricatorSearchManagementNgramsWorkflow.php',
'PhabricatorSearchManagementWorkflow' => 'applications/search/management/PhabricatorSearchManagementWorkflow.php', 'PhabricatorSearchManagementWorkflow' => 'applications/search/management/PhabricatorSearchManagementWorkflow.php',
'PhabricatorSearchNgrams' => 'applications/search/ngrams/PhabricatorSearchNgrams.php', 'PhabricatorSearchNgrams' => 'applications/search/ngrams/PhabricatorSearchNgrams.php',
'PhabricatorSearchNgramsDestructionEngineExtension' => 'applications/search/engineextension/PhabricatorSearchNgramsDestructionEngineExtension.php', 'PhabricatorSearchNgramsDestructionEngineExtension' => 'applications/search/engineextension/PhabricatorSearchNgramsDestructionEngineExtension.php',
@ -9528,6 +9529,7 @@ phutil_register_library_map(array(
'PhabricatorSearchIndexVersionDestructionEngineExtension' => 'PhabricatorDestructionEngineExtension', 'PhabricatorSearchIndexVersionDestructionEngineExtension' => 'PhabricatorDestructionEngineExtension',
'PhabricatorSearchManagementIndexWorkflow' => 'PhabricatorSearchManagementWorkflow', 'PhabricatorSearchManagementIndexWorkflow' => 'PhabricatorSearchManagementWorkflow',
'PhabricatorSearchManagementInitWorkflow' => 'PhabricatorSearchManagementWorkflow', 'PhabricatorSearchManagementInitWorkflow' => 'PhabricatorSearchManagementWorkflow',
'PhabricatorSearchManagementNgramsWorkflow' => 'PhabricatorSearchManagementWorkflow',
'PhabricatorSearchManagementWorkflow' => 'PhabricatorManagementWorkflow', 'PhabricatorSearchManagementWorkflow' => 'PhabricatorManagementWorkflow',
'PhabricatorSearchNgrams' => 'PhabricatorSearchDAO', 'PhabricatorSearchNgrams' => 'PhabricatorSearchDAO',
'PhabricatorSearchNgramsDestructionEngineExtension' => 'PhabricatorDestructionEngineExtension', 'PhabricatorSearchNgramsDestructionEngineExtension' => 'PhabricatorDestructionEngineExtension',

View file

@ -0,0 +1,106 @@
<?php

/**
 * Search management workflow registered under the name "ngrams".
 *
 * For every class returned by the class map query on
 * PhabricatorFerretInterface, it either empties that type's common ngrams
 * table (with the "reset" argument) or inserts the ngrams which appear often
 * enough in the type's ngrams table to count as common.
 */
final class PhabricatorSearchManagementNgramsWorkflow
  extends PhabricatorSearchManagementWorkflow {

  /**
   * Declare the workflow name, its synopsis, and the single "reset" argument.
   */
  protected function didConstruct() {
    $argument_specs = array(
      array(
        'name' => 'reset',
        'help' => pht('Reset all common ngram records.'),
      ),
    );

    $this
      ->setName('ngrams')
      ->setSynopsis(pht('Recompute common ngrams.'))
      ->setArguments($argument_specs);
  }

  /**
   * Reset or rebuild the common ngrams table of each Ferret object type.
   *
   * One status line is printed per object type.
   *
   * @param PhutilArgumentParser Parsed arguments; only "reset" is read.
   * @return void
   */
  public function execute(PhutilArgumentParser $args) {
    $should_reset = $args->getArg('reset');

    $ferret_objects = id(new PhutilClassMapQuery())
      ->setAncestorClass('PhabricatorFerretInterface')
      ->execute();

    // An ngram counts as common once it has at least
    // ceil(document count * threshold) rows in the ngrams table.
    $threshold = 0.15;

    // Types with fewer documents than this are skipped entirely.
    $min_documents = 4096;

    foreach ($ferret_objects as $ferret_object) {
      $engine = $ferret_object->newFerretEngine();
      $conn = $ferret_object->establishConnection('w');
      $type_name = get_class($ferret_object);

      if ($should_reset) {
        // The notice is printed before the table is emptied.
        echo tsprintf(
          "%s\n",
          pht(
            'Resetting common ngrams for "%s".',
            $type_name));

        $common_table = $engine->getCommonNgramsTableName();
        queryfx($conn, 'DELETE FROM %T', $common_table);

        continue;
      }

      $count_row = queryfx_one(
        $conn,
        'SELECT COUNT(*) N FROM %T',
        $engine->getDocumentTableName());
      $total_documents = $count_row['N'];

      if ($total_documents < $min_documents) {
        $status = pht(
          'Too few documents of type "%s" for any ngrams to be common.',
          $type_name);
      } else {
        $min_frequency = (int)ceil($total_documents * $threshold);

        $frequent_rows = queryfx_all(
          $conn,
          'SELECT ngram, COUNT(*) N FROM %T
GROUP BY ngram
HAVING N >= %d',
          $engine->getNgramsTableName(),
          $min_frequency);

        if (!$frequent_rows) {
          $status = pht(
            'No new common ngrams exist for "%s".',
            $type_name);
        } else {
          // Build one "(ngram, 1)" tuple per common ngram; the 1 is the
          // value written to the needsCollection column.
          $tuples = array();
          foreach ($frequent_rows as $row) {
            $tuples[] = qsprintf($conn, '(%s, 1)', $row['ngram']);
          }

          // Insert in chunks; IGNORE leaves already-present rows alone.
          foreach (PhabricatorLiskDAO::chunkSQL($tuples) as $tuple_chunk) {
            queryfx(
              $conn,
              'INSERT IGNORE INTO %T (ngram, needsCollection)
VALUES %Q',
              $engine->getCommonNgramsTableName(),
              $tuple_chunk);
          }

          $status = pht(
            'Updated common ngrams for "%s".',
            $type_name);
        }
      }

      echo tsprintf("%s\n", $status);
    }
  }

}