mirror of
https://we.phorge.it/source/phorge.git
synced 2025-02-14 07:48:38 +01:00
Summary: Depends on D18672. Ref T13000. This does an on-demand build of the common ngrams table. Plan here is: - Push to `secure`. - Build the common ngrams table here. - See if stuff breaks? If it looks okay on this dataset, we can build out the GC support and try it in production. Test Plan: - Locally, my dataset has a bunch of `bin/lipsum` tasks with similar, common words. - Verified that ipsum terms now skip ngrams. For "lorem ipsum" search performance actually IMPROVED by skipping the ngrams table (12s to 9s). - Queried for normal terms, got very fast results using the ngram table, as normal. Reviewers: amckinley Reviewed By: amckinley Maniphest Tasks: T13000 Differential Revision: https://secure.phabricator.com/D18673
106 lines
2.5 KiB
PHP
106 lines
2.5 KiB
PHP
<?php
|
|
|
|
final class PhabricatorSearchManagementNgramsWorkflow
|
|
extends PhabricatorSearchManagementWorkflow {
|
|
|
|
protected function didConstruct() {
|
|
$this
|
|
->setName('ngrams')
|
|
->setSynopsis(pht('Recompute common ngrams.'))
|
|
->setArguments(
|
|
array(
|
|
array(
|
|
'name' => 'reset',
|
|
'help' => pht('Reset all common ngram records.'),
|
|
),
|
|
));
|
|
}
|
|
|
|
public function execute(PhutilArgumentParser $args) {
|
|
$is_reset = $args->getArg('reset');
|
|
|
|
$all_objects = id(new PhutilClassMapQuery())
|
|
->setAncestorClass('PhabricatorFerretInterface')
|
|
->execute();
|
|
|
|
$min_documents = 4096;
|
|
$threshold = 0.15;
|
|
|
|
foreach ($all_objects as $object) {
|
|
$engine = $object->newFerretEngine();
|
|
$conn = $object->establishConnection('w');
|
|
$display_name = get_class($object);
|
|
|
|
if ($is_reset) {
|
|
echo tsprintf(
|
|
"%s\n",
|
|
pht(
|
|
'Resetting common ngrams for "%s".',
|
|
$display_name));
|
|
|
|
queryfx(
|
|
$conn,
|
|
'DELETE FROM %T',
|
|
$engine->getCommonNgramsTableName());
|
|
continue;
|
|
}
|
|
|
|
$document_count = queryfx_one(
|
|
$conn,
|
|
'SELECT COUNT(*) N FROM %T',
|
|
$engine->getDocumentTableName());
|
|
$document_count = $document_count['N'];
|
|
|
|
if ($document_count < $min_documents) {
|
|
echo tsprintf(
|
|
"%s\n",
|
|
pht(
|
|
'Too few documents of type "%s" for any ngrams to be common.',
|
|
$display_name));
|
|
continue;
|
|
}
|
|
|
|
$min_frequency = (int)ceil($document_count * $threshold);
|
|
$common_ngrams = queryfx_all(
|
|
$conn,
|
|
'SELECT ngram, COUNT(*) N FROM %T
|
|
GROUP BY ngram
|
|
HAVING N >= %d',
|
|
$engine->getNgramsTableName(),
|
|
$min_frequency);
|
|
|
|
if (!$common_ngrams) {
|
|
echo tsprintf(
|
|
"%s\n",
|
|
pht(
|
|
'No new common ngrams exist for "%s".',
|
|
$display_name));
|
|
continue;
|
|
}
|
|
|
|
$sql = array();
|
|
foreach ($common_ngrams as $ngram) {
|
|
$sql[] = qsprintf(
|
|
$conn,
|
|
'(%s, 1)',
|
|
$ngram['ngram']);
|
|
}
|
|
|
|
foreach (PhabricatorLiskDAO::chunkSQL($sql) as $chunk) {
|
|
queryfx(
|
|
$conn,
|
|
'INSERT IGNORE INTO %T (ngram, needsCollection)
|
|
VALUES %Q',
|
|
$engine->getCommonNgramsTableName(),
|
|
$chunk);
|
|
}
|
|
|
|
echo tsprintf(
|
|
"%s\n",
|
|
pht(
|
|
'Updated common ngrams for "%s".',
|
|
$display_name));
|
|
}
|
|
}
|
|
|
|
}
|