mirror of
https://we.phorge.it/source/phorge.git
synced 2024-11-14 02:42:40 +01:00
Allow the Ferret engine to remove "common" ngrams from the index
Summary: Ref T13000. This adds support for tracking "common" ngrams, which occur in too many documents to be useful as part of the ngram index.

If an ngram is listed in the "common" table, it won't be written when indexing documents, or queried for when searching for them.

In this change, nothing actually writes to the "common" table. I'll start writing to the table in a followup change. Specifically, I plan to do this:

- A new GC process updates the "common" table periodically, by writing ngrams which appear in more than X% of documents to it, for some value of X, if there are at least a minimum number of documents (maybe like 4,000).
- A new GC process deletes ngrams that have been added to the common table from the existing indexes.

Hopefully, this will pare down the ngrams index to something reasonable over time without requiring any manual tuning.

Test Plan:
- Ran some queries and indexes.
- Manually inserted ngrams `xxx` and `yyy` into the "common" ngrams table, searched and indexed, saw them ignored as viable ngrams for search/index.

Reviewers: amckinley
Reviewed By: amckinley
Maniphest Tasks: T13000
Differential Revision: https://secure.phabricator.com/D18672
This commit is contained in:
parent
89fe84f978
commit
1de130c9f5
18 changed files with 200 additions and 12 deletions
|
@ -0,0 +1,7 @@
|
||||||
|
-- "Common" ngrams table for the maniphest_task Ferret index.
-- Ngrams listed here are skipped when indexing and when searching.
CREATE TABLE {$NAMESPACE}_maniphest.maniphest_task_fngrams_common (
  id INT UNSIGNED NOT NULL AUTO_INCREMENT,
  ngram CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
  needsCollection BOOL NOT NULL,
  PRIMARY KEY (id),
  UNIQUE KEY `key_ngram` (ngram),
  KEY `key_collect` (needsCollection)
) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
|
7
resources/sql/autopatches/20171002.cngram.02.event.sql
Normal file
7
resources/sql/autopatches/20171002.cngram.02.event.sql
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
-- "Common" ngrams table for the calendar_event Ferret index.
-- Ngrams listed here are skipped when indexing and when searching.
CREATE TABLE {$NAMESPACE}_calendar.calendar_event_fngrams_common (
  id INT UNSIGNED NOT NULL AUTO_INCREMENT,
  ngram CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
  needsCollection BOOL NOT NULL,
  PRIMARY KEY (id),
  UNIQUE KEY `key_ngram` (ngram),
  KEY `key_collect` (needsCollection)
) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
|
|
@ -0,0 +1,7 @@
|
||||||
|
-- "Common" ngrams table for the differential_revision Ferret index.
-- Ngrams listed here are skipped when indexing and when searching.
CREATE TABLE {$NAMESPACE}_differential.differential_revision_fngrams_common (
  id INT UNSIGNED NOT NULL AUTO_INCREMENT,
  ngram CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
  needsCollection BOOL NOT NULL,
  PRIMARY KEY (id),
  UNIQUE KEY `key_ngram` (ngram),
  KEY `key_collect` (needsCollection)
) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
|
7
resources/sql/autopatches/20171002.cngram.04.fund.sql
Normal file
7
resources/sql/autopatches/20171002.cngram.04.fund.sql
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
-- "Common" ngrams table for the fund_initiative Ferret index.
-- Ngrams listed here are skipped when indexing and when searching.
CREATE TABLE {$NAMESPACE}_fund.fund_initiative_fngrams_common (
  id INT UNSIGNED NOT NULL AUTO_INCREMENT,
  ngram CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
  needsCollection BOOL NOT NULL,
  PRIMARY KEY (id),
  UNIQUE KEY `key_ngram` (ngram),
  KEY `key_collect` (needsCollection)
) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
|
7
resources/sql/autopatches/20171002.cngram.05.owners.sql
Normal file
7
resources/sql/autopatches/20171002.cngram.05.owners.sql
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
-- "Common" ngrams table for the owners_package Ferret index.
-- Ngrams listed here are skipped when indexing and when searching.
CREATE TABLE {$NAMESPACE}_owners.owners_package_fngrams_common (
  id INT UNSIGNED NOT NULL AUTO_INCREMENT,
  ngram CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
  needsCollection BOOL NOT NULL,
  PRIMARY KEY (id),
  UNIQUE KEY `key_ngram` (ngram),
  KEY `key_collect` (needsCollection)
) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
|
|
@ -0,0 +1,7 @@
|
||||||
|
-- "Common" ngrams table for the passphrase_credential Ferret index.
-- Ngrams listed here are skipped when indexing and when searching.
CREATE TABLE {$NAMESPACE}_passphrase.passphrase_credential_fngrams_common (
  id INT UNSIGNED NOT NULL AUTO_INCREMENT,
  ngram CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
  needsCollection BOOL NOT NULL,
  PRIMARY KEY (id),
  UNIQUE KEY `key_ngram` (ngram),
  KEY `key_collect` (needsCollection)
) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
|
7
resources/sql/autopatches/20171002.cngram.07.blog.sql
Normal file
7
resources/sql/autopatches/20171002.cngram.07.blog.sql
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
-- "Common" ngrams table for the phame_blog Ferret index.
-- Ngrams listed here are skipped when indexing and when searching.
CREATE TABLE {$NAMESPACE}_phame.phame_blog_fngrams_common (
  id INT UNSIGNED NOT NULL AUTO_INCREMENT,
  ngram CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
  needsCollection BOOL NOT NULL,
  PRIMARY KEY (id),
  UNIQUE KEY `key_ngram` (ngram),
  KEY `key_collect` (needsCollection)
) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
|
7
resources/sql/autopatches/20171002.cngram.08.post.sql
Normal file
7
resources/sql/autopatches/20171002.cngram.08.post.sql
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
-- "Common" ngrams table for the phame_post Ferret index.
-- Ngrams listed here are skipped when indexing and when searching.
CREATE TABLE {$NAMESPACE}_phame.phame_post_fngrams_common (
  id INT UNSIGNED NOT NULL AUTO_INCREMENT,
  ngram CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
  needsCollection BOOL NOT NULL,
  PRIMARY KEY (id),
  UNIQUE KEY `key_ngram` (ngram),
  KEY `key_collect` (needsCollection)
) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
|
7
resources/sql/autopatches/20171002.cngram.09.pholio.sql
Normal file
7
resources/sql/autopatches/20171002.cngram.09.pholio.sql
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
-- "Common" ngrams table for the pholio_mock Ferret index.
-- Ngrams listed here are skipped when indexing and when searching.
CREATE TABLE {$NAMESPACE}_pholio.pholio_mock_fngrams_common (
  id INT UNSIGNED NOT NULL AUTO_INCREMENT,
  ngram CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
  needsCollection BOOL NOT NULL,
  PRIMARY KEY (id),
  UNIQUE KEY `key_ngram` (ngram),
  KEY `key_collect` (needsCollection)
) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
|
|
@ -0,0 +1,7 @@
|
||||||
|
-- "Common" ngrams table for the phriction_document Ferret index.
-- Ngrams listed here are skipped when indexing and when searching.
CREATE TABLE {$NAMESPACE}_phriction.phriction_document_fngrams_common (
  id INT UNSIGNED NOT NULL AUTO_INCREMENT,
  ngram CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
  needsCollection BOOL NOT NULL,
  PRIMARY KEY (id),
  UNIQUE KEY `key_ngram` (ngram),
  KEY `key_collect` (needsCollection)
) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
|
7
resources/sql/autopatches/20171002.cngram.11.project.sql
Normal file
7
resources/sql/autopatches/20171002.cngram.11.project.sql
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
-- "Common" ngrams table for the project_project Ferret index.
-- Ngrams listed here are skipped when indexing and when searching.
CREATE TABLE {$NAMESPACE}_project.project_project_fngrams_common (
  id INT UNSIGNED NOT NULL AUTO_INCREMENT,
  ngram CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
  needsCollection BOOL NOT NULL,
  PRIMARY KEY (id),
  UNIQUE KEY `key_ngram` (ngram),
  KEY `key_collect` (needsCollection)
) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
|
7
resources/sql/autopatches/20171002.cngram.12.user.sql
Normal file
7
resources/sql/autopatches/20171002.cngram.12.user.sql
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
-- "Common" ngrams table for the user_user Ferret index.
-- Ngrams listed here are skipped when indexing and when searching.
CREATE TABLE {$NAMESPACE}_user.user_user_fngrams_common (
  id INT UNSIGNED NOT NULL AUTO_INCREMENT,
  ngram CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
  needsCollection BOOL NOT NULL,
  PRIMARY KEY (id),
  UNIQUE KEY `key_ngram` (ngram),
  KEY `key_collect` (needsCollection)
) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
|
|
@ -0,0 +1,7 @@
|
||||||
|
-- "Common" ngrams table for the repository_repository Ferret index.
-- Ngrams listed here are skipped when indexing and when searching.
CREATE TABLE {$NAMESPACE}_repository.repository_repository_fngrams_common (
  id INT UNSIGNED NOT NULL AUTO_INCREMENT,
  ngram CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
  needsCollection BOOL NOT NULL,
  PRIMARY KEY (id),
  UNIQUE KEY `key_ngram` (ngram),
  KEY `key_collect` (needsCollection)
) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
|
7
resources/sql/autopatches/20171002.cngram.14.commit.sql
Normal file
7
resources/sql/autopatches/20171002.cngram.14.commit.sql
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
-- "Common" ngrams table for the repository_commit Ferret index.
-- Ngrams listed here are skipped when indexing and when searching.
CREATE TABLE {$NAMESPACE}_repository.repository_commit_fngrams_common (
  id INT UNSIGNED NOT NULL AUTO_INCREMENT,
  ngram CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
  needsCollection BOOL NOT NULL,
  PRIMARY KEY (id),
  UNIQUE KEY `key_ngram` (ngram),
  KEY `key_collect` (needsCollection)
) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
|
|
@ -73,6 +73,12 @@ abstract class PhabricatorConfigSchemaSpec extends Phobject {
|
||||||
$engine->getNgramsTableName(),
|
$engine->getNgramsTableName(),
|
||||||
$engine->getNgramsSchemaColumns(),
|
$engine->getNgramsSchemaColumns(),
|
||||||
$engine->getNgramsSchemaKeys());
|
$engine->getNgramsSchemaKeys());
|
||||||
|
|
||||||
|
$this->buildRawSchema(
|
||||||
|
$engine->getApplicationName(),
|
||||||
|
$engine->getCommonNgramsTableName(),
|
||||||
|
$engine->getCommonNgramsSchemaColumns(),
|
||||||
|
$engine->getCommonNgramsSchemaKeys());
|
||||||
}
|
}
|
||||||
|
|
||||||
protected function buildRawSchema(
|
protected function buildRawSchema(
|
||||||
|
|
|
@ -165,6 +165,30 @@ final class PhabricatorFerretFulltextEngineExtension
|
||||||
$ferret_field['normalCorpus']);
|
$ferret_field['normalCorpus']);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ($ngrams) {
  // Find which of the candidate ngrams are listed in the "common" ngrams
  // table. Those ngrams are removed from the list below so they are not
  // written to the ngram index.
  $common = queryfx_all(
    $conn,
    'SELECT ngram FROM %T WHERE ngram IN (%Ls)',
    $engine->getCommonNgramsTableName(),
    $ngrams);
  $common = ipull($common, 'ngram', 'ngram');

  foreach ($ngrams as $key => $ngram) {
    if (isset($common[$ngram])) {
      unset($ngrams[$key]);
      continue;
    }

    // NOTE: MySQL discards trailing whitespace in CHAR(X) columns, so an
    // ngram which ends in a space is returned without it. Test the trimmed
    // form as well, otherwise such an ngram is never matched against the
    // "common" list.
    $trim_ngram = rtrim($ngram, ' ');
    if (isset($common[$trim_ngram])) {
      unset($ngrams[$key]);
      continue;
    }
  }
}
|
||||||
|
|
||||||
|
if ($ngrams) {
|
||||||
$sql = array();
|
$sql = array();
|
||||||
foreach ($ngrams as $ngram) {
|
foreach ($ngrams as $ngram) {
|
||||||
$sql[] = qsprintf(
|
$sql[] = qsprintf(
|
||||||
|
@ -181,6 +205,7 @@ final class PhabricatorFerretFulltextEngineExtension
|
||||||
$engine->getNgramsTableName(),
|
$engine->getNgramsTableName(),
|
||||||
$chunk);
|
$chunk);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
} catch (Exception $ex) {
|
} catch (Exception $ex) {
|
||||||
$object->killTransaction();
|
$object->killTransaction();
|
||||||
throw $ex;
|
throw $ex;
|
||||||
|
|
|
@ -295,4 +295,35 @@ abstract class PhabricatorFerretEngine extends Phobject {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Get the name of the table which stores "common" ngrams for this engine.
 *
 * The name is the application name and the scope name joined with
 * underscores, followed by the "_fngrams_common" suffix.
 *
 * @return string Table name.
 */
public function getCommonNgramsTableName() {
  return
    $this->getApplicationName().'_'.
    $this->getScopeName().'_fngrams_common';
}
|
||||||
|
|
||||||
|
/**
 * Describe the columns of the "common" ngrams table.
 *
 * @return map<string, string> Map from column name to column type name.
 */
public function getCommonNgramsSchemaColumns() {
  $columns = array();

  $columns['id'] = 'auto';
  $columns['ngram'] = 'char3';
  $columns['needsCollection'] = 'bool';

  return $columns;
}
|
||||||
|
|
||||||
|
/**
 * Describe the keys of the "common" ngrams table.
 *
 * @return map<string, map<string, wild>> Map from key name to a key
 *   specification with a "columns" list and an optional "unique" flag.
 */
public function getCommonNgramsSchemaKeys() {
  $primary_key = array(
    'columns' => array('id'),
    'unique' => true,
  );

  // Each ngram may be listed at most once.
  $ngram_key = array(
    'columns' => array('ngram'),
    'unique' => true,
  );

  $collect_key = array(
    'columns' => array('needsCollection'),
  );

  return array(
    'PRIMARY' => $primary_key,
    'key_ngram' => $ngram_key,
    'key_collect' => $collect_key,
  );
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1700,6 +1700,34 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Drop ngrams which are listed in the "common" ngrams table (for example,
// "the"). They appear in too many documents to narrow the query. Rare
// ngrams, which appear in very few documents, constrain it best.

if ($flat) {
  $common_ngrams = queryfx_all(
    $conn,
    'SELECT ngram FROM %T WHERE ngram IN (%Ls)',
    $engine->getCommonNgramsTableName(),
    ipull($flat, 'ngram'));
  $common_ngrams = ipull($common_ngrams, 'ngram', 'ngram');

  foreach ($flat as $key => $spec) {
    $ngram = $spec['ngram'];

    if (!isset($common_ngrams[$ngram])) {
      // NOTE: MySQL discards trailing whitespace in CHAR(X) columns, so
      // also test the ngram with trailing spaces removed.
      $trim_ngram = rtrim($ngram, ' ');
      if (!isset($common_ngrams[$trim_ngram])) {
        // Not a common ngram in either form: keep it.
        continue;
      }
    }

    unset($flat[$key]);
  }
}
|
||||||
|
|
||||||
// MySQL only allows us to join a maximum of 61 tables per query. Each
|
// MySQL only allows us to join a maximum of 61 tables per query. Each
|
||||||
// ngram is going to cost us a join toward that limit, so if the user
|
// ngram is going to cost us a join toward that limit, so if the user
|
||||||
// specified a very long query string, just pick 16 of the ngrams
|
// specified a very long query string, just pick 16 of the ngrams
|
||||||
|
|
Loading…
Reference in a new issue