1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2025-01-23 21:18:19 +01:00

Implement basic ngram search for Owners Package names

Summary:
Ref T9979. This uses ngrams (specifically, trigrams) to build a reasonably efficient index for substring matching. Specifically, for a package like "Example", with ID 123, we store rows like this:

```
< ex, 123>
<exa, 123>
<xam, 123>
<amp, 123>
<mpl, 123>
<ple, 123>
<le , 123>
```

When the user searches for `exam`, we join this table for packages with tokens `exa` and `xam`. MySQL can do this a lot more efficiently than it can process a `LIKE "%exam%"` query against a huge table.

When the user searches for a one-letter or two-letter string, we only search the beginnings of words. This is probably what they want, the only thing we can do quickly, and a reasonable/expected behavior for typeaheads.

Test Plan:
  - Ran storage upgrades and search indexer.
  - Searched for stuff with "name contains".
  - Used typeahead and got sensible results.
  - Searched for `aabbccddeeffgghhiijjkkllmmnnooppqqrrssttuuvvwwxxyyzz` and saw only 16 joins.

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T9979

Differential Revision: https://secure.phabricator.com/D14846
This commit is contained in:
epriestley 2015-12-21 12:22:07 -08:00
parent 5c8025c41d
commit 96fe8c0b83
18 changed files with 457 additions and 28 deletions

View file

@ -0,0 +1,7 @@
-- Ngram index for Owners package names. Each row pairs one three-character
-- ngram with the ID of an object (objectID) whose name contains it.
-- `key_object` serves deletes of all ngrams for one object, and `key_ngram`
-- serves joins which look up objects by a specific ngram.
CREATE TABLE {$NAMESPACE}_owners.owners_name_ngrams (
id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
objectID INT UNSIGNED NOT NULL,
ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
KEY `key_object` (objectID),
KEY `key_ngram` (ngram, objectID)
) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};

View file

@ -0,0 +1,11 @@
<?php
// Queue every existing Owners package for a forced search reindex.
$index_options = array(
  'force' => true,
);

$packages = new LiskMigrationIterator(new PhabricatorOwnersPackage());
foreach ($packages as $package) {
  PhabricatorSearchWorker::queueDocumentForIndexing(
    $package->getPHID(),
    $index_options);
}

View file

@ -2548,6 +2548,8 @@ phutil_register_library_map(array(
'PhabricatorNamedQueryQuery' => 'applications/search/query/PhabricatorNamedQueryQuery.php',
'PhabricatorNavigationRemarkupRule' => 'infrastructure/markup/rule/PhabricatorNavigationRemarkupRule.php',
'PhabricatorNeverTriggerClock' => 'infrastructure/daemon/workers/clock/PhabricatorNeverTriggerClock.php',
'PhabricatorNgramsIndexEngineExtension' => 'applications/search/engineextension/PhabricatorNgramsIndexEngineExtension.php',
'PhabricatorNgramsInterface' => 'applications/search/interface/PhabricatorNgramsInterface.php',
'PhabricatorNotificationBuilder' => 'applications/notification/builder/PhabricatorNotificationBuilder.php',
'PhabricatorNotificationClearController' => 'applications/notification/controller/PhabricatorNotificationClearController.php',
'PhabricatorNotificationClient' => 'applications/notification/client/PhabricatorNotificationClient.php',
@ -2636,7 +2638,9 @@ phutil_register_library_map(array(
'PhabricatorOwnersPackage' => 'applications/owners/storage/PhabricatorOwnersPackage.php',
'PhabricatorOwnersPackageDatasource' => 'applications/owners/typeahead/PhabricatorOwnersPackageDatasource.php',
'PhabricatorOwnersPackageEditEngine' => 'applications/owners/editor/PhabricatorOwnersPackageEditEngine.php',
'PhabricatorOwnersPackageFulltextEngine' => 'applications/owners/query/PhabricatorOwnersPackageFulltextEngine.php',
'PhabricatorOwnersPackageFunctionDatasource' => 'applications/owners/typeahead/PhabricatorOwnersPackageFunctionDatasource.php',
'PhabricatorOwnersPackageNameNgrams' => 'applications/owners/storage/PhabricatorOwnersPackageNameNgrams.php',
'PhabricatorOwnersPackageOwnerDatasource' => 'applications/owners/typeahead/PhabricatorOwnersPackageOwnerDatasource.php',
'PhabricatorOwnersPackagePHIDType' => 'applications/owners/phid/PhabricatorOwnersPackagePHIDType.php',
'PhabricatorOwnersPackageQuery' => 'applications/owners/query/PhabricatorOwnersPackageQuery.php',
@ -3047,6 +3051,8 @@ phutil_register_library_map(array(
'PhabricatorSearchManagementIndexWorkflow' => 'applications/search/management/PhabricatorSearchManagementIndexWorkflow.php',
'PhabricatorSearchManagementInitWorkflow' => 'applications/search/management/PhabricatorSearchManagementInitWorkflow.php',
'PhabricatorSearchManagementWorkflow' => 'applications/search/management/PhabricatorSearchManagementWorkflow.php',
'PhabricatorSearchNgrams' => 'applications/search/ngrams/PhabricatorSearchNgrams.php',
'PhabricatorSearchNgramsDestructionEngineExtension' => 'applications/search/engineextension/PhabricatorSearchNgramsDestructionEngineExtension.php',
'PhabricatorSearchOrderController' => 'applications/search/controller/PhabricatorSearchOrderController.php',
'PhabricatorSearchOrderField' => 'applications/search/field/PhabricatorSearchOrderField.php',
'PhabricatorSearchPreferencesSettingsPanel' => 'applications/settings/panel/PhabricatorSearchPreferencesSettingsPanel.php',
@ -6802,6 +6808,7 @@ phutil_register_library_map(array(
'PhabricatorNamedQueryQuery' => 'PhabricatorCursorPagedPolicyAwareQuery',
'PhabricatorNavigationRemarkupRule' => 'PhutilRemarkupRule',
'PhabricatorNeverTriggerClock' => 'PhabricatorTriggerClock',
'PhabricatorNgramsIndexEngineExtension' => 'PhabricatorIndexEngineExtension',
'PhabricatorNotificationBuilder' => 'Phobject',
'PhabricatorNotificationClearController' => 'PhabricatorNotificationController',
'PhabricatorNotificationClient' => 'Phobject',
@ -6907,10 +6914,14 @@ phutil_register_library_map(array(
'PhabricatorCustomFieldInterface',
'PhabricatorDestructibleInterface',
'PhabricatorConduitResultInterface',
'PhabricatorFulltextInterface',
'PhabricatorNgramsInterface',
),
'PhabricatorOwnersPackageDatasource' => 'PhabricatorTypeaheadDatasource',
'PhabricatorOwnersPackageEditEngine' => 'PhabricatorEditEngine',
'PhabricatorOwnersPackageFulltextEngine' => 'PhabricatorFulltextEngine',
'PhabricatorOwnersPackageFunctionDatasource' => 'PhabricatorTypeaheadCompositeDatasource',
'PhabricatorOwnersPackageNameNgrams' => 'PhabricatorSearchNgrams',
'PhabricatorOwnersPackageOwnerDatasource' => 'PhabricatorTypeaheadCompositeDatasource',
'PhabricatorOwnersPackagePHIDType' => 'PhabricatorPHIDType',
'PhabricatorOwnersPackageQuery' => 'PhabricatorCursorPagedPolicyAwareQuery',
@ -7414,6 +7425,8 @@ phutil_register_library_map(array(
'PhabricatorSearchManagementIndexWorkflow' => 'PhabricatorSearchManagementWorkflow',
'PhabricatorSearchManagementInitWorkflow' => 'PhabricatorSearchManagementWorkflow',
'PhabricatorSearchManagementWorkflow' => 'PhabricatorManagementWorkflow',
'PhabricatorSearchNgrams' => 'PhabricatorSearchDAO',
'PhabricatorSearchNgramsDestructionEngineExtension' => 'PhabricatorDestructionEngineExtension',
'PhabricatorSearchOrderController' => 'PhabricatorSearchBaseController',
'PhabricatorSearchOrderField' => 'PhabricatorSearchField',
'PhabricatorSearchPreferencesSettingsPanel' => 'PhabricatorSettingsPanel',

View file

@ -201,7 +201,8 @@ abstract class PhabricatorConfigSchemaSpec extends Phobject {
$is_binary = ($this->getUTF8Charset() == 'binary');
$matches = null;
if (preg_match('/^(fulltext|sort|text)(\d+)?\z/', $data_type, $matches)) {
$pattern = '/^(fulltext|sort|text|char)(\d+)?\z/';
if (preg_match($pattern, $data_type, $matches)) {
// Limit the permitted column lengths under the theory that it would
// be nice to eventually reduce this to a small set of standard lengths.
@ -220,6 +221,7 @@ abstract class PhabricatorConfigSchemaSpec extends Phobject {
'text8' => true,
'text4' => true,
'text' => true,
'char3' => true,
'sort255' => true,
'sort128' => true,
'sort64' => true,
@ -266,10 +268,14 @@ abstract class PhabricatorConfigSchemaSpec extends Phobject {
// the majority of cases.
$column_type = 'longtext';
break;
case 'char':
$column_type = 'char('.$size.')';
break;
}
switch ($type) {
case 'text':
case 'char':
if ($is_binary) {
// We leave collation and character set unspecified in order to
// generate valid SQL.

View file

@ -334,4 +334,8 @@ final class PhabricatorOwnersPackageTransactionEditor
return $body;
}
/**
 * Report that this editor supports search.
 *
 * NOTE(review): presumably the base transaction editor uses this to decide
 * whether edited packages get queued for search indexing; confirm against
 * the parent class.
 *
 * @return bool Always true.
 */
protected function supportsSearch() {
return true;
}
}

View file

@ -0,0 +1,26 @@
<?php
/**
 * Fulltext engine which describes an Owners package as a search document.
 */
final class PhabricatorOwnersPackageFulltextEngine
  extends PhabricatorFulltextEngine {

  /**
   * Fill in the search document for a package.
   *
   * Sets the title from the package name, records placeholder timestamps,
   * and adds an open or closed relationship depending on whether the
   * package is archived.
   *
   * @param PhabricatorSearchAbstractDocument $document Document to populate.
   * @param object $object The package being indexed.
   * @return void
   */
  protected function buildAbstractDocument(
    PhabricatorSearchAbstractDocument $document,
    $object) {

    $document->setDocumentTitle($object->getName());

    // TODO: These are bogus, but not currently stored on packages.
    $document->setDocumentCreated(PhabricatorTime::getNow());
    $document->setDocumentModified(PhabricatorTime::getNow());

    if ($object->isArchived()) {
      $relationship = PhabricatorSearchRelationship::RELATIONSHIP_CLOSED;
    } else {
      $relationship = PhabricatorSearchRelationship::RELATIONSHIP_OPEN;
    }

    $document->addRelationship(
      $relationship,
      $object->getPHID(),
      PhabricatorOwnersPackagePHIDType::TYPECONST,
      PhabricatorTime::getNow());
  }

}

View file

@ -9,7 +9,6 @@ final class PhabricatorOwnersPackageQuery
private $authorityPHIDs;
private $repositoryPHIDs;
private $paths;
private $namePrefix;
private $statuses;
private $controlMap = array();
@ -78,9 +77,10 @@ final class PhabricatorOwnersPackageQuery
return $this;
}
public function withNamePrefix($prefix) {
$this->namePrefix = $prefix;
return $this;
/**
 * Constrain results to packages whose name matches a substring query, using
 * the package name ngram index.
 *
 * @param string $ngrams Raw query text to match against package names.
 * @return this
 */
public function withNameNgrams($ngrams) {
  $name_index = new PhabricatorOwnersPackageNameNgrams();
  return $this->withNgramsConstraint($name_index, $ngrams);
}
public function needPaths($need_paths) {
@ -208,15 +208,6 @@ final class PhabricatorOwnersPackageQuery
$this->statuses);
}
if (strlen($this->namePrefix)) {
// NOTE: This is a hacky mess, but this column is currently case
// sensitive and unique.
$where[] = qsprintf(
$conn,
'LOWER(p.name) LIKE %>',
phutil_utf8_strtolower($this->namePrefix));
}
if ($this->controlMap) {
$clauses = array();
foreach ($this->controlMap as $repository_phid => $paths) {

View file

@ -25,6 +25,10 @@ final class PhabricatorOwnersPackageSearchEngine
->setDescription(
pht('Search for packages with specific owners.'))
->setDatasource(new PhabricatorProjectOrUserDatasource()),
id(new PhabricatorSearchTextField())
->setLabel(pht('Name Contains'))
->setKey('name')
->setDescription(pht('Search for packages by name substrings.')),
id(new PhabricatorSearchDatasourceField())
->setLabel(pht('Repositories'))
->setKey('repositoryPHIDs')
@ -69,6 +73,10 @@ final class PhabricatorOwnersPackageSearchEngine
$query->withStatuses($map['statuses']);
}
if (strlen($map['name'])) {
$query->withNameNgrams($map['name']);
}
return $query;
}

View file

@ -7,7 +7,9 @@ final class PhabricatorOwnersPackage
PhabricatorApplicationTransactionInterface,
PhabricatorCustomFieldInterface,
PhabricatorDestructibleInterface,
PhabricatorConduitResultInterface {
PhabricatorConduitResultInterface,
PhabricatorFulltextInterface,
PhabricatorNgramsInterface {
protected $name;
protected $originalName;
@ -46,7 +48,7 @@ final class PhabricatorOwnersPackage
self::CONFIG_TIMESTAMPS => false,
self::CONFIG_AUX_PHID => true,
self::CONFIG_COLUMN_SCHEMA => array(
'name' => 'text128',
'name' => 'sort128',
'originalName' => 'text255',
'description' => 'text',
'primaryOwnerPHID' => 'phid?',
@ -54,17 +56,6 @@ final class PhabricatorOwnersPackage
'mailKey' => 'bytes20',
'status' => 'text32',
),
self::CONFIG_KEY_SCHEMA => array(
'key_phid' => null,
'phid' => array(
'columns' => array('phid'),
'unique' => true,
),
'name' => array(
'columns' => array('name'),
'unique' => true,
),
),
) + parent::getConfiguration();
}
@ -433,4 +424,23 @@ final class PhabricatorOwnersPackage
);
}
/* -( PhabricatorFulltextInterface )--------------------------------------- */
/**
 * Create the engine which builds the fulltext search document for this
 * package.
 *
 * @return PhabricatorOwnersPackageFulltextEngine A new engine instance.
 */
public function newFulltextEngine() {
return new PhabricatorOwnersPackageFulltextEngine();
}
/* -( PhabricatorNgramsInterface )----------------------------------------- */
/**
 * Build the ngram indexes for this package.
 *
 * @return list<PhabricatorSearchNgrams> One name index, carrying the
 *   current package name as its value.
 */
public function newNgrams() {
  $name_ngrams = new PhabricatorOwnersPackageNameNgrams();
  $name_ngrams->setValue($this->getName());

  return array($name_ngrams);
}
}

View file

@ -0,0 +1,18 @@
<?php
/**
 * Ngram index over Owners package names.
 *
 * The application name and ngram key below combine into the storage table
 * name `owners_name_ngrams`.
 */
final class PhabricatorOwnersPackageNameNgrams
extends PhabricatorSearchNgrams {
/**
 * Key identifying this index; it forms the middle part of the table name.
 */
public function getNgramKey() {
return 'name';
}
/**
 * Column on the indexed object's table which holds the indexed text. Queries
 * also apply a `LIKE` constraint against this column.
 */
public function getColumnName() {
return 'name';
}
/**
 * Application whose storage holds this index; it forms the first part of the
 * table name.
 */
public function getApplicationName() {
return 'owners';
}
}

View file

@ -27,6 +27,14 @@ final class PhabricatorOwnersPackageTransaction
switch ($this->getTransactionType()) {
case self::TYPE_OWNERS:
if (!is_array($old)) {
$old = array();
}
if (!is_array($new)) {
$new = array();
}
$add = array_diff($new, $old);
foreach ($add as $phid) {
$phids[] = $phid;

View file

@ -22,7 +22,7 @@ final class PhabricatorOwnersPackageDatasource
$results = array();
$query = id(new PhabricatorOwnersPackageQuery())
->withNamePrefix($raw_query)
->withNameNgrams($raw_query)
->setOrder('name');
$packages = $this->executeQuery($query);

View file

@ -65,6 +65,9 @@ final class PhabricatorFulltextIndexEngineExtension
try {
$comment = $xaction->getApplicationTransactionCommentObject();
if (!$comment) {
return 'none';
}
} catch (Exception $ex) {
return 'none';
}

View file

@ -0,0 +1,34 @@
<?php
/**
 * Index extension which writes ngram rows for objects implementing
 * @{interface:PhabricatorNgramsInterface}.
 */
final class PhabricatorNgramsIndexEngineExtension
  extends PhabricatorIndexEngineExtension {

  const EXTENSIONKEY = 'ngrams';

  public function getExtensionName() {
    return pht('Ngrams Engine');
  }

  /**
   * Compute a version digest from the values of all of the object's ngram
   * indexes, keyed and sorted by ngram key.
   *
   * @param object $object Object being indexed.
   * @return string Digest of the serialized key-to-value map.
   */
  public function getIndexVersion($object) {
    $values_by_key = array();
    foreach ($object->newNgrams() as $ngram) {
      $values_by_key[$ngram->getNgramKey()] = $ngram->getValue();
    }
    ksort($values_by_key);

    return PhabricatorHash::digestForIndex(serialize($values_by_key));
  }

  /**
   * Only objects which provide ngram indexes are handled by this extension.
   */
  public function shouldIndexObject($object) {
    return ($object instanceof PhabricatorNgramsInterface);
  }

  /**
   * Rewrite the stored ngrams for each of the object's ngram indexes.
   */
  public function indexObject(
    PhabricatorIndexEngine $engine,
    $object) {

    $indexes = $object->newNgrams();
    foreach ($indexes as $index) {
      $index->writeNgram($object->getID());
    }
  }

}

View file

@ -0,0 +1,31 @@
<?php
/**
 * Destruction extension which removes ngram index rows belonging to a
 * destroyed object.
 */
final class PhabricatorSearchNgramsDestructionEngineExtension
  extends PhabricatorDestructionEngineExtension {

  const EXTENSIONKEY = 'search.ngrams';

  public function getExtensionName() {
    return pht('Search Ngram');
  }

  /**
   * Only objects which provide ngram indexes have rows to clean up.
   */
  public function canDestroyObject(
    PhabricatorDestructionEngine $engine,
    $object) {
    return ($object instanceof PhabricatorNgramsInterface);
  }

  /**
   * Delete every row for the object from each of its ngram tables.
   */
  public function destroyObject(
    PhabricatorDestructionEngine $engine,
    $object) {

    $indexes = $object->newNgrams();
    foreach ($indexes as $index) {
      $conn_w = $index->establishConnection('w');
      $table_name = $index->getTableName();

      queryfx(
        $conn_w,
        'DELETE FROM %T WHERE objectID = %d',
        $table_name,
        $object->getID());
    }
  }

}

View file

@ -0,0 +1,7 @@
<?php
/**
 * Implemented by objects which expose one or more ngram indexes for
 * substring search.
 */
interface PhabricatorNgramsInterface {
/**
 * Build the ngram indexes for this object, each carrying the value which
 * should be indexed.
 *
 * @return list<PhabricatorSearchNgrams> Ngram index objects.
 */
public function newNgrams();
}

View file

@ -0,0 +1,113 @@
<?php
/**
 * Base storage class for an ngram (trigram) index over one text value of an
 * object.
 *
 * Each concrete subclass identifies the application, the ngram key, and the
 * column being indexed. Rows pair a three-character ngram with the ID of the
 * object whose value contains it.
 */
abstract class PhabricatorSearchNgrams
  extends PhabricatorSearchDAO {

  protected $objectID;
  protected $ngram;

  private $value;

  /**
   * Key identifying this index; used to build the table name.
   */
  abstract public function getNgramKey();

  /**
   * Name of the column on the indexed object's table which holds the text.
   */
  abstract public function getColumnName();

  /**
   * Set the text value which should be indexed.
   *
   * @param string $value Text to index.
   * @return this
   */
  final public function setValue($value) {
    $this->value = $value;
    return $this;
  }

  /**
   * Get the text value which should be indexed.
   *
   * @return string|null Text to index, or null if none has been set.
   */
  final public function getValue() {
    return $this->value;
  }

  protected function getConfiguration() {
    return array(
      self::CONFIG_TIMESTAMPS => false,
      self::CONFIG_COLUMN_SCHEMA => array(
        'objectID' => 'uint32',
        'ngram' => 'char3',
      ),
      self::CONFIG_KEY_SCHEMA => array(
        'key_ngram' => array(
          'columns' => array('ngram', 'objectID'),
        ),
        'key_object' => array(
          'columns' => array('objectID'),
        ),
      ),
    ) + parent::getConfiguration();
  }

  /**
   * Build the table name as `<application>_<key>_ngrams`.
   *
   * @return string Table name.
   */
  public function getTableName() {
    $application = $this->getApplicationName();
    $key = $this->getNgramKey();
    return "{$application}_{$key}_ngrams";
  }

  /**
   * Split a string into tokens on runs of spaces, ignoring leading and
   * trailing spaces.
   *
   * @param string $value Text to tokenize.
   * @return list<string> Tokens. An empty input produces one empty token.
   */
  final public function tokenizeString($value) {
    $value = trim($value, ' ');
    $value = preg_split('/ +/', $value);
    return $value;
  }

  /**
   * Extract the unique, lowercased three-character ngrams from a string.
   *
   * The mode controls how each token is padded before it is sliced:
   *
   *   - `query`: no padding, so ngrams match anywhere inside a word.
   *   - `index`: a space on both sides, so word boundaries are indexed.
   *   - `prefix`: a leading space only, so ngrams match word beginnings.
   *
   * Any other mode behaves like `query`.
   *
   * @param string $value Text to extract ngrams from.
   * @param string $mode One of `query`, `index` or `prefix`.
   * @return list<string> Sorted list of unique ngrams.
   */
  final public function getNgramsFromString($value, $mode) {
    $tokens = $this->tokenizeString($value);

    $ngrams = array();
    foreach ($tokens as $token) {
      $token = phutil_utf8_strtolower($token);

      switch ($mode) {
        case 'query':
          break;
        case 'index':
          $token = ' '.$token.' ';
          break;
        case 'prefix':
          $token = ' '.$token;
          break;
      }

      // Slice by character rather than by byte. Byte offsets would cut
      // multibyte UTF-8 characters in half, producing ngrams which are not
      // three characters long and may not be valid UTF-8.
      $characters = phutil_utf8v($token);

      $len = (count($characters) - 2);
      for ($ii = 0; $ii < $len; $ii++) {
        $ngram = implode('', array_slice($characters, $ii, 3));
        $ngrams[$ngram] = $ngram;
      }
    }

    ksort($ngrams);

    // Return the values rather than the keys: PHP converts numeric string
    // keys like "123" to integers, but the values are always strings.
    return array_values($ngrams);
  }

  /**
   * Replace the stored ngrams for an object with the ngrams of the current
   * value.
   *
   * Existing rows for the object are always deleted; new rows are inserted
   * only if the value produces at least one ngram.
   *
   * @param int $object_id ID of the object being indexed.
   * @return this
   */
  final public function writeNgram($object_id) {
    $ngrams = $this->getNgramsFromString($this->getValue(), 'index');
    $conn_w = $this->establishConnection('w');

    $sql = array();
    foreach ($ngrams as $ngram) {
      $sql[] = qsprintf(
        $conn_w,
        '(%d, %s)',
        $object_id,
        $ngram);
    }

    queryfx(
      $conn_w,
      'DELETE FROM %T WHERE objectID = %d',
      $this->getTableName(),
      $object_id);

    if ($sql) {
      queryfx(
        $conn_w,
        'INSERT INTO %T (objectID, ngram) VALUES %Q',
        $this->getTableName(),
        implode(', ', $sql));
    }

    return $this;
  }

}

View file

@ -26,6 +26,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
private $edgeLogicConstraintsAreValid = false;
private $spacePHIDs;
private $spaceIsArchived;
private $ngrams = array();
protected function getPageCursors(array $page) {
return array(
@ -253,6 +254,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
$joins = array();
$joins[] = $this->buildEdgeLogicJoinClause($conn);
$joins[] = $this->buildApplicationSearchJoinClause($conn);
$joins[] = $this->buildNgramsJoinClause($conn);
return $joins;
}
@ -274,6 +276,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
$where[] = $this->buildPagingClause($conn);
$where[] = $this->buildEdgeLogicWhereClause($conn);
$where[] = $this->buildSpacesWhereClause($conn);
$where[] = $this->buildNgramsWhereClause($conn);
return $where;
}
@ -324,6 +327,10 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
return true;
}
if ($this->shouldGroupNgramResultRows()) {
return true;
}
return false;
}
@ -1345,6 +1352,138 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
}
/* -( Ngrams )------------------------------------------------------------- */
/**
 * Add a substring constraint backed by an ngram index.
 *
 * Empty values are ignored. The length of the value is recorded in
 * characters, not bytes.
 *
 * @param PhabricatorSearchNgrams $index Ngram index to search.
 * @param string $value Raw query text.
 * @return this
 */
protected function withNgramsConstraint(
  PhabricatorSearchNgrams $index,
  $value) {

  if (!strlen($value)) {
    return $this;
  }

  $characters = phutil_utf8v($value);

  $this->ngrams[] = array(
    'index' => $index,
    'value' => $value,
    'length' => count($characters),
  );

  return $this;
}
/**
 * Build one JOIN against the ngram table for each ngram in the query.
 *
 * Queries of three or more characters join on every ngram found inside the
 * query tokens. Two-character queries join on the single word-prefix ngram.
 * One-character queries join on any ngram which starts with a space followed
 * by that character, so they match only the beginnings of words.
 *
 * @param AphrontDatabaseConnection $conn Connection used for escaping.
 * @return list<string> JOIN clauses, possibly empty.
 */
protected function buildNgramsJoinClause(AphrontDatabaseConnection $conn) {
  $flat = array();
  foreach ($this->ngrams as $spec) {
    $index = $spec['index'];
    $value = $spec['value'];
    $length = $spec['length'];

    if ($length >= 3) {
      $ngrams = $index->getNgramsFromString($value, 'query');
      $prefix = false;
    } else if ($length == 2) {
      $ngrams = $index->getNgramsFromString($value, 'prefix');
      $prefix = false;
    } else {
      // Indexed ngrams are always lowercased, so lowercase the query too.
      // Otherwise a single uppercase letter would never match under a
      // case-sensitive collation on the ngram column.
      $ngrams = array(' '.phutil_utf8_strtolower($value));
      $prefix = true;
    }

    foreach ($ngrams as $ngram) {
      $flat[] = array(
        'table' => $index->getTableName(),
        'ngram' => $ngram,
        'prefix' => $prefix,
      );
    }
  }

  // MySQL only allows us to join a maximum of 61 tables per query. Each
  // ngram is going to cost us a join toward that limit, so if the user
  // specified a very long query string, just pick 16 of the ngrams
  // at random.
  if (count($flat) > 16) {
    shuffle($flat);
    $flat = array_slice($flat, 0, 16);
  }

  $primary_alias = $this->getPrimaryTableAlias();
  if ($primary_alias) {
    $id_column = qsprintf($conn, '%T.%T', $primary_alias, 'id');
  } else {
    $id_column = qsprintf($conn, '%T', 'id');
  }

  $idx = 1;
  $joins = array();
  foreach ($flat as $spec) {
    $table = $spec['table'];
    $ngram = $spec['ngram'];
    $prefix = $spec['prefix'];

    // Each join needs its own alias because the same ngram table is joined
    // once per ngram.
    $join_alias = 'ngm'.$idx++;

    if ($prefix) {
      $joins[] = qsprintf(
        $conn,
        'JOIN %T %T ON %T.objectID = %Q AND %T.ngram LIKE %>',
        $table,
        $join_alias,
        $join_alias,
        $id_column,
        $join_alias,
        $ngram);
    } else {
      $joins[] = qsprintf(
        $conn,
        'JOIN %T %T ON %T.objectID = %Q AND %T.ngram = %s',
        $table,
        $join_alias,
        $join_alias,
        $id_column,
        $join_alias,
        $ngram);
    }
  }

  return $joins;
}
/**
 * Build `LIKE` constraints which require every token of each ngram query to
 * appear as a substring of the indexed column.
 *
 * @param AphrontDatabaseConnection $conn Connection used for escaping.
 * @return list<string> WHERE clauses, possibly empty.
 */
protected function buildNgramsWhereClause(AphrontDatabaseConnection $conn) {
  $clauses = array();

  foreach ($this->ngrams as $spec) {
    $index = $spec['index'];

    $column_name = $index->getColumnName();
    $table_alias = $this->getPrimaryTableAlias();

    // Qualify the column with the primary table alias when there is one.
    if (!$table_alias) {
      $column = qsprintf($conn, '%T', $column_name);
    } else {
      $column = qsprintf($conn, '%T.%T', $table_alias, $column_name);
    }

    foreach ($index->tokenizeString($spec['value']) as $token) {
      $clauses[] = qsprintf(
        $conn,
        '%Q LIKE %~',
        $column,
        $token);
    }
  }

  return $clauses;
}
/**
 * Result rows need grouping whenever at least one ngram constraint has been
 * added to the query.
 *
 * @return bool True if any ngram constraint is present.
 */
protected function shouldGroupNgramResultRows() {
  return !empty($this->ngrams);
}
/* -( Edge Logic )--------------------------------------------------------- */