1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2024-12-22 21:40:55 +01:00

Build a prototype fulltext engine ("Ferret") using only basic MySQL primitives

Summary:
Ref T12819. I gave this stuff a sweet code name because all the terms related to "fulltext" and "search" already mean 5 different things. It, uh, ferrets out documents for you?

I'm building this to work a lot like the existing ngram index, which seems to work pretty well. If this sticks, it will auto-resolve the join issue (in T12443) by letting us do the entire thing locally in a JOIN and thus dodge a lot of mess.

This index gets built alongside other indexes, but only shows up in the UI if you have prototypes enabled. If you do, it appears under the existing fulltext field in Maniphest. No existing functionality is affected or disrupted.

NOTE: The query engine half of this is still EXTREMELY primitive, and this probably performs worse than the existing field for now. If this doesn't show obvious signs of being awful on `secure` I'll improve that in followup changes.

Test Plan:
Indexed my tasks, ran some simple queries, got the results I wanted, even for queries "ko", "k", "v0.1".

{F5147746}

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T12819, T12443

Differential Revision: https://secure.phabricator.com/D18484
This commit is contained in:
epriestley 2017-08-28 13:04:56 -07:00
parent ed75250f1a
commit f97157e7ed
18 changed files with 571 additions and 0 deletions

View file

@ -0,0 +1,9 @@
-- Ferret fulltext index: one document row per indexed Maniphest task.
-- objectPHID is the PHID of the indexed object. The remaining columns hold
-- per-document metadata (closed flag, author and owner PHIDs, timestamps).
CREATE TABLE {$NAMESPACE}_maniphest.maniphest_task_fdocument (
id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
objectPHID VARBINARY(64) NOT NULL,
isClosed BOOL NOT NULL,
authorPHID VARBINARY(64),
ownerPHID VARBINARY(64),
epochCreated INT UNSIGNED NOT NULL,
epochModified INT UNSIGNED NOT NULL
) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};

View file

@ -0,0 +1,7 @@
-- Ferret fulltext index: the text fields which belong to a document row.
-- documentID refers to the id of a row in maniphest_task_fdocument.
-- rawCorpus holds the field text as indexed and normalCorpus holds the
-- stemmed form of the same text.
CREATE TABLE {$NAMESPACE}_maniphest.maniphest_task_ffield (
id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
documentID INT UNSIGNED NOT NULL,
fieldKey VARCHAR(4) NOT NULL COLLATE {$COLLATE_TEXT},
rawCorpus LONGTEXT NOT NULL COLLATE {$COLLATE_SORT},
normalCorpus LONGTEXT NOT NULL COLLATE {$COLLATE_SORT}
) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};

View file

@ -0,0 +1,5 @@
-- Ferret fulltext index: the distinct three-character ngrams of each document.
-- documentID refers to the id of a row in maniphest_task_fdocument.
CREATE TABLE {$NAMESPACE}_maniphest.maniphest_task_fngrams (
id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
documentID INT UNSIGNED NOT NULL,
ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT}
) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};

View file

@ -1533,6 +1533,10 @@ phutil_register_library_map(array(
'ManiphestTaskEditBulkJobType' => 'applications/maniphest/bulk/ManiphestTaskEditBulkJobType.php',
'ManiphestTaskEditController' => 'applications/maniphest/controller/ManiphestTaskEditController.php',
'ManiphestTaskEditEngineLock' => 'applications/maniphest/editor/ManiphestTaskEditEngineLock.php',
'ManiphestTaskFerretDocument' => 'applications/maniphest/storage/ManiphestTaskFerretDocument.php',
'ManiphestTaskFerretEngine' => 'applications/maniphest/search/ManiphestTaskFerretEngine.php',
'ManiphestTaskFerretField' => 'applications/maniphest/storage/ManiphestTaskFerretField.php',
'ManiphestTaskFerretNgrams' => 'applications/maniphest/storage/ManiphestTaskFerretNgrams.php',
'ManiphestTaskFulltextEngine' => 'applications/maniphest/search/ManiphestTaskFulltextEngine.php',
'ManiphestTaskGraph' => 'infrastructure/graph/ManiphestTaskGraph.php',
'ManiphestTaskHasCommitEdgeType' => 'applications/maniphest/edge/ManiphestTaskHasCommitEdgeType.php',
@ -2828,6 +2832,12 @@ phutil_register_library_map(array(
'PhabricatorFeedStoryNotification' => 'applications/notification/storage/PhabricatorFeedStoryNotification.php',
'PhabricatorFeedStoryPublisher' => 'applications/feed/PhabricatorFeedStoryPublisher.php',
'PhabricatorFeedStoryReference' => 'applications/feed/storage/PhabricatorFeedStoryReference.php',
'PhabricatorFerretDocument' => 'applications/search/ferret/PhabricatorFerretDocument.php',
'PhabricatorFerretEngine' => 'applications/search/ferret/PhabricatorFerretEngine.php',
'PhabricatorFerretField' => 'applications/search/ferret/PhabricatorFerretField.php',
'PhabricatorFerretFulltextEngineExtension' => 'applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php',
'PhabricatorFerretInterface' => 'applications/search/ferret/PhabricatorFerretInterface.php',
'PhabricatorFerretNgrams' => 'applications/search/ferret/PhabricatorFerretNgrams.php',
'PhabricatorFile' => 'applications/files/storage/PhabricatorFile.php',
'PhabricatorFileAES256StorageFormat' => 'applications/files/format/PhabricatorFileAES256StorageFormat.php',
'PhabricatorFileBundleLoader' => 'applications/files/query/PhabricatorFileBundleLoader.php',
@ -3195,6 +3205,7 @@ phutil_register_library_map(array(
'PhabricatorNamedQueryQuery' => 'applications/search/query/PhabricatorNamedQueryQuery.php',
'PhabricatorNavigationRemarkupRule' => 'infrastructure/markup/rule/PhabricatorNavigationRemarkupRule.php',
'PhabricatorNeverTriggerClock' => 'infrastructure/daemon/workers/clock/PhabricatorNeverTriggerClock.php',
'PhabricatorNgramEngine' => 'applications/search/ngrams/PhabricatorNgramEngine.php',
'PhabricatorNgramsIndexEngineExtension' => 'applications/search/engineextension/PhabricatorNgramsIndexEngineExtension.php',
'PhabricatorNgramsInterface' => 'applications/search/interface/PhabricatorNgramsInterface.php',
'PhabricatorNotificationBuilder' => 'applications/notification/builder/PhabricatorNotificationBuilder.php',
@ -6659,6 +6670,7 @@ phutil_register_library_map(array(
'PhabricatorSpacesInterface',
'PhabricatorConduitResultInterface',
'PhabricatorFulltextInterface',
'PhabricatorFerretInterface',
'DoorkeeperBridgedObjectInterface',
'PhabricatorEditEngineSubtypeInterface',
'PhabricatorEditEngineLockableInterface',
@ -6682,6 +6694,10 @@ phutil_register_library_map(array(
'ManiphestTaskEditBulkJobType' => 'PhabricatorWorkerBulkJobType',
'ManiphestTaskEditController' => 'ManiphestController',
'ManiphestTaskEditEngineLock' => 'PhabricatorEditEngineLock',
'ManiphestTaskFerretDocument' => 'PhabricatorFerretDocument',
'ManiphestTaskFerretEngine' => 'PhabricatorFerretEngine',
'ManiphestTaskFerretField' => 'PhabricatorFerretField',
'ManiphestTaskFerretNgrams' => 'PhabricatorFerretNgrams',
'ManiphestTaskFulltextEngine' => 'PhabricatorFulltextEngine',
'ManiphestTaskGraph' => 'PhabricatorObjectGraph',
'ManiphestTaskHasCommitEdgeType' => 'PhabricatorEdgeType',
@ -8147,6 +8163,11 @@ phutil_register_library_map(array(
'PhabricatorFeedStoryNotification' => 'PhabricatorFeedDAO',
'PhabricatorFeedStoryPublisher' => 'Phobject',
'PhabricatorFeedStoryReference' => 'PhabricatorFeedDAO',
'PhabricatorFerretDocument' => 'PhabricatorSearchDAO',
'PhabricatorFerretEngine' => 'Phobject',
'PhabricatorFerretField' => 'PhabricatorSearchDAO',
'PhabricatorFerretFulltextEngineExtension' => 'PhabricatorFulltextEngineExtension',
'PhabricatorFerretNgrams' => 'PhabricatorSearchDAO',
'PhabricatorFile' => array(
'PhabricatorFileDAO',
'PhabricatorApplicationTransactionInterface',
@ -8565,6 +8586,7 @@ phutil_register_library_map(array(
'PhabricatorNamedQueryQuery' => 'PhabricatorCursorPagedPolicyAwareQuery',
'PhabricatorNavigationRemarkupRule' => 'PhutilRemarkupRule',
'PhabricatorNeverTriggerClock' => 'PhabricatorTriggerClock',
'PhabricatorNgramEngine' => 'Phobject',
'PhabricatorNgramsIndexEngineExtension' => 'PhabricatorIndexEngineExtension',
'PhabricatorNgramsInterface' => 'PhabricatorIndexableInterface',
'PhabricatorNotificationBuilder' => 'Phobject',

View file

@ -49,6 +49,8 @@ final class ManiphestTaskSearchEngine
$subtype_map = id(new ManiphestTask())->newEditEngineSubtypeMap();
$hide_subtypes = (count($subtype_map) == 1);
$hide_ferret = !PhabricatorEnv::getEnvConfig('phabricator.show-prototypes');
return array(
id(new PhabricatorOwnersSearchField())
->setLabel(pht('Assigned To'))
@ -89,6 +91,10 @@ final class ManiphestTaskSearchEngine
id(new PhabricatorSearchTextField())
->setLabel(pht('Contains Words'))
->setKey('fulltext'),
id(new PhabricatorSearchTextField())
->setLabel(pht('Matches (Prototype)'))
->setKey('ferret')
->setIsHidden($hide_ferret),
id(new PhabricatorSearchThreeStateField())
->setLabel(pht('Open Parents'))
->setKey('hasParents')
@ -145,6 +151,7 @@ final class ManiphestTaskSearchEngine
'priorities',
'subtypes',
'fulltext',
'ferret',
'hasParents',
'hasSubtasks',
'parentIDs',
@ -224,6 +231,12 @@ final class ManiphestTaskSearchEngine
$query->withFullTextSearch($map['fulltext']);
}
if (strlen($map['ferret'])) {
$query->withFerretConstraint(
id(new ManiphestTask())->newFerretEngine(),
$map['ferret']);
}
if ($map['parentIDs']) {
$query->withParentTaskIDs($map['parentIDs']);
}

View file

@ -0,0 +1,18 @@
<?php
/**
 * Ferret engine for Maniphest tasks.
 *
 * Supplies the task-specific storage objects (ngrams, document and field)
 * which the generic Ferret indexing and query code operates on.
 */
final class ManiphestTaskFerretEngine
  extends PhabricatorFerretEngine {

  /**
   * @return ManiphestTaskFerretNgrams Fresh ngrams storage object.
   */
  public function newNgramsObject() {
    $ngrams = new ManiphestTaskFerretNgrams();
    return $ngrams;
  }

  /**
   * @return ManiphestTaskFerretDocument Fresh document storage object.
   */
  public function newDocumentObject() {
    $document = new ManiphestTaskFerretDocument();
    return $document;
  }

  /**
   * @return ManiphestTaskFerretField Fresh field storage object.
   */
  public function newFieldObject() {
    $field = new ManiphestTaskFerretField();
    return $field;
  }

}

View file

@ -16,6 +16,7 @@ final class ManiphestTask extends ManiphestDAO
PhabricatorSpacesInterface,
PhabricatorConduitResultInterface,
PhabricatorFulltextInterface,
PhabricatorFerretInterface,
DoorkeeperBridgedObjectInterface,
PhabricatorEditEngineSubtypeInterface,
PhabricatorEditEngineLockableInterface {
@ -603,4 +604,12 @@ final class ManiphestTask extends ManiphestDAO
return new ManiphestTaskEditEngineLock();
}
/* -( PhabricatorFerretInterface )----------------------------------------- */
/**
 * Build the Ferret engine which indexes and searches tasks.
 *
 * @return ManiphestTaskFerretEngine New engine instance.
 */
public function newFerretEngine() {
  $engine = new ManiphestTaskFerretEngine();
  return $engine;
}
}

View file

@ -0,0 +1,14 @@
<?php
/**
 * Ferret document storage for Maniphest tasks.
 *
 * The application name and index key returned here are the two pieces the
 * parent class combines to build the table name.
 */
final class ManiphestTaskFerretDocument
  extends PhabricatorFerretDocument {

  /**
   * @return string Application component of the table name.
   */
  public function getApplicationName() {
    $application = 'maniphest';
    return $application;
  }

  /**
   * @return string Index component of the table name.
   */
  public function getIndexKey() {
    $index_key = 'task';
    return $index_key;
  }

}

View file

@ -0,0 +1,14 @@
<?php
/**
 * Ferret field storage for Maniphest tasks.
 *
 * The application name and index key returned here are the two pieces the
 * parent class combines to build the table name.
 */
final class ManiphestTaskFerretField
  extends PhabricatorFerretField {

  /**
   * @return string Application component of the table name.
   */
  public function getApplicationName() {
    $application = 'maniphest';
    return $application;
  }

  /**
   * @return string Index component of the table name.
   */
  public function getIndexKey() {
    $index_key = 'task';
    return $index_key;
  }

}

View file

@ -0,0 +1,14 @@
<?php
/**
 * Ferret ngrams storage for Maniphest tasks.
 *
 * The application name and index key returned here are the two pieces the
 * parent class combines to build the table name.
 */
final class ManiphestTaskFerretNgrams
  extends PhabricatorFerretNgrams {

  /**
   * @return string Application component of the table name.
   */
  public function getApplicationName() {
    $application = 'maniphest';
    return $application;
  }

  /**
   * @return string Index component of the table name.
   */
  public function getIndexKey() {
    $index_key = 'task';
    return $index_key;
  }

}

View file

@ -0,0 +1,126 @@
<?php
/**
 * Fulltext engine extension which maintains the "Ferret" index tables
 * (document, fields and ngrams) for objects implementing
 * @{interface:PhabricatorFerretInterface}.
 */
final class PhabricatorFerretFulltextEngineExtension
  extends PhabricatorFulltextEngineExtension {

  const EXTENSIONKEY = 'ferret';

  public function getExtensionName() {
    return pht('Ferret Fulltext Engine');
  }

  /**
   * Only objects which can provide a Ferret engine are indexed.
   */
  public function shouldIndexFulltextObject($object) {
    return ($object instanceof PhabricatorFerretInterface);
  }

  /**
   * Rebuild the Ferret index rows for an object.
   *
   * Any existing document for the object is deleted and replaced with a new
   * document row, one field row per nonempty field of the abstract document,
   * and one ngram row per distinct ngram of the combined raw field text.
   *
   * All writes happen inside one transaction. If any write fails, the
   * transaction is killed and the original exception is rethrown, so a
   * failure cannot leave an open transaction behind.
   *
   * @param PhabricatorFerretInterface Object being indexed.
   * @param PhabricatorSearchAbstractDocument Abstract document with the
   *   field data to index.
   * @return void
   */
  public function indexFulltextObject(
    $object,
    PhabricatorSearchAbstractDocument $document) {

    $phid = $document->getPHID();
    $engine = $object->newFerretEngine();

    // NOTE: The closed flag and the epochs are written as 0 here; nothing in
    // this method derives them from the object.
    $ferret_document = $engine->newDocumentObject()
      ->setObjectPHID($phid)
      ->setIsClosed(0)
      ->setEpochCreated(0)
      ->setEpochModified(0);

    $stemmer = new PhutilSearchStemmer();
    $ferret_fields = array();
    $ngrams_source = array();
    foreach ($document->getFieldData() as $field) {
      list($key, $raw_corpus) = $field;

      // Fields with no text produce no field row and no ngrams.
      if (!strlen($raw_corpus)) {
        continue;
      }

      $normal_corpus = $stemmer->stemCorpus($raw_corpus);

      $ferret_fields[] = $engine->newFieldObject()
        ->setFieldKey($key)
        ->setRawCorpus($raw_corpus)
        ->setNormalCorpus($normal_corpus);

      $ngrams_source[] = $raw_corpus;
    }
    $ngrams_source = implode(' ', $ngrams_source);

    $ngrams = id(new PhabricatorNgramEngine())
      ->getNgramsFromString($ngrams_source, 'index');

    $ferret_document->openTransaction();
    try {
      $this->deleteOldDocument($engine, $object, $document);

      $ferret_document->save();
      $document_id = $ferret_document->getID();

      foreach ($ferret_fields as $ferret_field) {
        $ferret_field
          ->setDocumentID($document_id)
          ->save();
      }

      $this->insertNgrams($engine, $document_id, $ngrams);
    } catch (Exception $ex) {
      $ferret_document->killTransaction();
      throw $ex;
    } catch (Throwable $ex) {
      // On PHP 7 and newer, engine errors are not instances of Exception.
      $ferret_document->killTransaction();
      throw $ex;
    }
    $ferret_document->saveTransaction();
  }

  /**
   * Bulk insert the ngram rows for a document, in chunks.
   *
   * @param PhabricatorFerretEngine Engine which provides the ngrams table.
   * @param int ID of the document row the ngrams belong to.
   * @param list<string> Ngrams to insert.
   * @return void
   */
  private function insertNgrams(
    PhabricatorFerretEngine $engine,
    $document_id,
    array $ngrams) {

    $ferret_ngrams = $engine->newNgramsObject();
    $conn = $ferret_ngrams->establishConnection('w');

    $sql = array();
    foreach ($ngrams as $ngram) {
      $sql[] = qsprintf(
        $conn,
        '(%d, %s)',
        $document_id,
        $ngram);
    }

    foreach (PhabricatorLiskDAO::chunkSQL($sql) as $chunk) {
      queryfx(
        $conn,
        'INSERT INTO %T (documentID, ngram) VALUES %Q',
        $ferret_ngrams->getTableName(),
        $chunk);
    }
  }

  /**
   * Delete the existing document row for an object, if there is one, along
   * with the field and ngram rows which reference it.
   *
   * @param PhabricatorFerretEngine Engine which provides the index tables.
   * @param PhabricatorFerretInterface Object being indexed (unused here).
   * @param PhabricatorSearchAbstractDocument Abstract document whose PHID
   *   identifies the old index document.
   * @return void
   */
  private function deleteOldDocument(
    PhabricatorFerretEngine $engine,
    $object,
    PhabricatorSearchAbstractDocument $document) {

    $old_document = $engine->newDocumentObject()->loadOneWhere(
      'objectPHID = %s',
      $document->getPHID());
    if (!$old_document) {
      return;
    }

    $conn = $old_document->establishConnection('w');
    $old_id = $old_document->getID();

    queryfx(
      $conn,
      'DELETE FROM %T WHERE id = %d',
      $engine->newDocumentObject()->getTableName(),
      $old_id);

    queryfx(
      $conn,
      'DELETE FROM %T WHERE documentID = %d',
      $engine->newFieldObject()->getTableName(),
      $old_id);

    queryfx(
      $conn,
      'DELETE FROM %T WHERE documentID = %d',
      $engine->newNgramsObject()->getTableName(),
      $old_id);
  }

}

View file

@ -0,0 +1,40 @@
<?php
/**
 * Base storage class for Ferret index documents.
 *
 * Each row describes one indexed object, identified by `objectPHID`.
 * Concrete subclasses choose the table through the application name and
 * @{method:getIndexKey}.
 */
abstract class PhabricatorFerretDocument
  extends PhabricatorSearchDAO {

  protected $objectPHID;
  protected $isClosed;
  protected $authorPHID;
  protected $ownerPHID;
  protected $epochCreated;
  protected $epochModified;

  /**
   * @return string Index component of the table name.
   */
  abstract public function getIndexKey();

  protected function getConfiguration() {
    $columns = array(
      'isClosed' => 'bool',
      'authorPHID' => 'phid?',
      'ownerPHID' => 'phid?',
      'epochCreated' => 'epoch',
      'epochModified' => 'epoch',
    );

    // At most one document row may exist per indexed object.
    $keys = array(
      'key_object' => array(
        'columns' => array('objectPHID'),
        'unique' => true,
      ),
    );

    $config = array(
      self::CONFIG_TIMESTAMPS => false,
      self::CONFIG_COLUMN_SCHEMA => $columns,
      self::CONFIG_KEY_SCHEMA => $keys,
    );

    return $config + parent::getConfiguration();
  }

  /**
   * @return string Table name, of the form `<application>_<key>_fdocument`.
   */
  public function getTableName() {
    return sprintf(
      '%s_%s_fdocument',
      $this->getApplicationName(),
      $this->getIndexKey());
  }

}

View file

@ -0,0 +1,9 @@
<?php
/**
 * Base class for Ferret fulltext engines.
 *
 * A concrete engine provides the three storage objects (ngrams, document
 * and field) which the indexing and query code uses to find table names and
 * database connections.
 */
abstract class PhabricatorFerretEngine extends Phobject {
/**
 * @return PhabricatorFerretNgrams New ngrams storage object.
 */
abstract public function newNgramsObject();
/**
 * @return PhabricatorFerretDocument New document storage object.
 */
abstract public function newDocumentObject();
/**
 * @return PhabricatorFerretField New field storage object.
 */
abstract public function newFieldObject();
}

View file

@ -0,0 +1,36 @@
<?php
/**
 * Base storage class for the text fields of a Ferret index document.
 *
 * Each row holds one field of a document: the raw text and a normalized
 * version of the same text, tied to the document by `documentID`.
 */
abstract class PhabricatorFerretField
  extends PhabricatorSearchDAO {

  protected $documentID;
  protected $fieldKey;
  protected $rawCorpus;
  protected $normalCorpus;

  /**
   * @return string Index component of the table name.
   */
  abstract public function getIndexKey();

  protected function getConfiguration() {
    $columns = array(
      'documentID' => 'uint32',
      'fieldKey' => 'text4',
      'rawCorpus' => 'sort',
      'normalCorpus' => 'sort',
    );

    $keys = array(
      'key_document' => array(
        'columns' => array('documentID', 'fieldKey'),
      ),
    );

    $config = array(
      self::CONFIG_TIMESTAMPS => false,
      self::CONFIG_COLUMN_SCHEMA => $columns,
      self::CONFIG_KEY_SCHEMA => $keys,
    );

    return $config + parent::getConfiguration();
  }

  /**
   * @return string Table name, of the form `<application>_<key>_ffield`.
   */
  public function getTableName() {
    return sprintf(
      '%s_%s_ffield',
      $this->getApplicationName(),
      $this->getIndexKey());
  }

}

View file

@ -0,0 +1,7 @@
<?php
/**
 * Implemented by objects which can be indexed and searched with a Ferret
 * fulltext engine.
 */
interface PhabricatorFerretInterface {
/**
 * @return PhabricatorFerretEngine Engine used to index and query this
 *   type of object.
 */
public function newFerretEngine();
}

View file

@ -0,0 +1,35 @@
<?php
/**
 * Base storage class for the ngrams of a Ferret index document.
 *
 * Each row associates one three-character ngram with a document.
 */
abstract class PhabricatorFerretNgrams
  extends PhabricatorSearchDAO {

  protected $documentID;
  protected $ngram;

  /**
   * @return string Index component of the table name.
   */
  abstract public function getIndexKey();

  protected function getConfiguration() {
    $columns = array(
      'documentID' => 'uint32',
      'ngram' => 'char3',
    );

    $keys = array(
      'key_ngram' => array(
        'columns' => array('ngram', 'documentID'),
      ),
      'key_object' => array(
        'columns' => array('documentID'),
      ),
    );

    $config = array(
      self::CONFIG_TIMESTAMPS => false,
      self::CONFIG_COLUMN_SCHEMA => $columns,
      self::CONFIG_KEY_SCHEMA => $keys,
    );

    return $config + parent::getConfiguration();
  }

  /**
   * @return string Table name, of the form `<application>_<key>_fngrams`.
   */
  public function getTableName() {
    return sprintf(
      '%s_%s_fngrams',
      $this->getApplicationName(),
      $this->getIndexKey());
  }

}

View file

@ -0,0 +1,41 @@
<?php
/**
 * Splits strings into tokens and tokens into three-character ngrams.
 */
final class PhabricatorNgramEngine extends Phobject {

  /**
   * Split a string into tokens on runs of spaces.
   *
   * Only the space character separates tokens; other whitespace is left
   * inside the tokens.
   *
   * @param string String to tokenize.
   * @return list<string> Tokens. An empty input produces a single empty
   *   token.
   */
  public function tokenizeString($value) {
    $value = trim($value, ' ');
    $value = preg_split('/ +/', $value);
    return $value;
  }

  /**
   * Extract the distinct, lowercased three-character ngrams of a string.
   *
   * Ngrams are built from UTF-8 characters rather than bytes, so a multibyte
   * character is never split across two ngrams and every ngram is exactly
   * three characters long.
   *
   * @param string String to extract ngrams from.
   * @param string One of:
   *   - `query`: use each token as-is;
   *   - `index`: pad each token with a space on both sides, so ngrams at
   *     word boundaries are also produced;
   *   - `prefix`: pad each token with a leading space only.
   *   Any other value behaves like `query`.
   * @return list<string> Distinct ngrams, in sorted order.
   */
  public function getNgramsFromString($value, $mode) {
    $tokens = $this->tokenizeString($value);

    $ngrams = array();
    foreach ($tokens as $token) {
      $token = phutil_utf8_strtolower($token);

      switch ($mode) {
        case 'query':
          break;
        case 'index':
          $token = ' '.$token.' ';
          break;
        case 'prefix':
          $token = ' '.$token;
          break;
      }

      // Work on characters, not bytes: byte offsets would cut multibyte
      // characters apart and produce ngrams which are not valid UTF-8.
      $characters = phutil_utf8v($token);

      $len = (count($characters) - 2);
      for ($ii = 0; $ii < $len; $ii++) {
        $ngram = $characters[$ii].$characters[$ii + 1].$characters[$ii + 2];
        $ngrams[$ngram] = $ngram;
      }
    }

    ksort($ngrams);

    // Return the values rather than the keys: PHP converts keys like "123"
    // to integers, and callers expect strings.
    return array_values($ngrams);
  }

}

View file

@ -27,6 +27,8 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
private $spacePHIDs;
private $spaceIsArchived;
private $ngrams = array();
private $ferretEngine;
private $ferretConstraints;
protected function getPageCursors(array $page) {
return array(
@ -270,6 +272,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
$joins[] = $this->buildEdgeLogicJoinClause($conn);
$joins[] = $this->buildApplicationSearchJoinClause($conn);
$joins[] = $this->buildNgramsJoinClause($conn);
$joins[] = $this->buildFerretJoinClause($conn);
return $joins;
}
@ -292,6 +295,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
$where[] = $this->buildEdgeLogicWhereClause($conn);
$where[] = $this->buildSpacesWhereClause($conn);
$where[] = $this->buildNgramsWhereClause($conn);
$where[] = $this->buildFerretWhereClause($conn);
return $where;
}
@ -346,6 +350,10 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
return true;
}
if ($this->shouldGroupFerretResultRows()) {
return true;
}
return false;
}
@ -1373,6 +1381,150 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
}
/* -( Ferret )------------------------------------------------------------- */
/**
 * Constrain the query to objects matching a Ferret fulltext query.
 *
 * The raw query is split on whitespace into terms. Empty terms (from leading
 * or trailing whitespace, or a query which is only whitespace) are dropped,
 * because an empty term would turn into a `LIKE` pattern that matches every
 * row. A query with no terms leaves this query unconstrained.
 *
 * @param PhabricatorFerretEngine Engine which provides the index tables.
 * @param string Raw query string entered by the user.
 * @return this
 * @throws Exception If a Ferret constraint has already been applied.
 */
public function withFerretConstraint(
  PhabricatorFerretEngine $engine,
  $raw_query) {

  if ($this->ferretEngine) {
    throw new Exception(
      pht(
        'Query may not have multiple fulltext constraints.'));
  }

  if (!strlen($raw_query)) {
    return $this;
  }

  $terms = preg_split('/\s+/', $raw_query, -1, PREG_SPLIT_NO_EMPTY);
  if (!$terms) {
    return $this;
  }

  $this->ferretEngine = $engine;
  $this->ferretConstraints = $terms;

  return $this;
}
/**
 * Build the JOIN clauses which apply the Ferret constraint, if one is set.
 *
 * The result joins the document table (alias `ftdoc`) to the primary table
 * by PHID, then joins the ngrams table once per selected ngram (aliases
 * `ft1`, `ft2`, ...), and finally joins the field table (alias `ftfield`).
 *
 * Terms of three or more characters are broken into ngrams. Two-character
 * terms become a single word-prefix ngram. One-character terms are matched
 * with a prefix `LIKE` against ngrams which begin a word.
 *
 * @param AphrontDatabaseConnection Connection used for escaping.
 * @return list<string> JOIN clauses; empty if no Ferret constraint is set.
 */
protected function buildFerretJoinClause(AphrontDatabaseConnection $conn) {
  if (!$this->ferretEngine) {
    return array();
  }

  $engine = $this->ferretEngine;
  $ngram_engine = new PhabricatorNgramEngine();

  $ngram_table = $engine->newNgramsObject();
  $ngram_table_name = $ngram_table->getTableName();

  $flat = array();
  foreach ($this->ferretConstraints as $term) {
    $value = $term;
    $length = count(phutil_utf8v($term));

    // An empty term carries no information; joining on it would only match
    // every ngram which begins a word.
    if (!$length) {
      continue;
    }

    if ($length >= 3) {
      $ngrams = $ngram_engine->getNgramsFromString($value, 'query');
      $prefix = false;
    } else if ($length == 2) {
      $ngrams = $ngram_engine->getNgramsFromString($value, 'prefix');
      $prefix = false;
    } else {
      // This branch does not go through the ngram engine, so lowercase the
      // term here: indexed ngrams are always lowercased, and the match is
      // presumably case-sensitive (depends on the column collation).
      $ngrams = array(' '.phutil_utf8_strtolower($value));
      $prefix = true;
    }

    foreach ($ngrams as $ngram) {
      $flat[] = array(
        'table' => $ngram_table_name,
        'ngram' => $ngram,
        'prefix' => $prefix,
      );
    }
  }

  // MySQL only allows us to join a maximum of 61 tables per query. Each
  // ngram is going to cost us a join toward that limit, so if the user
  // specified a very long query string, just pick 16 of the ngrams
  // at random.
  if (count($flat) > 16) {
    shuffle($flat);
    $flat = array_slice($flat, 0, 16);
  }

  $alias = $this->getPrimaryTableAlias();
  if ($alias) {
    $phid_column = qsprintf($conn, '%T.%T', $alias, 'phid');
  } else {
    $phid_column = qsprintf($conn, '%T', 'phid');
  }

  $document_table = $engine->newDocumentObject();
  $field_table = $engine->newFieldObject();

  $joins = array();
  $joins[] = qsprintf(
    $conn,
    'JOIN %T ftdoc ON ftdoc.objectPHID = %Q',
    $document_table->getTableName(),
    $phid_column);

  $idx = 1;
  foreach ($flat as $spec) {
    $table = $spec['table'];
    $ngram = $spec['ngram'];
    $prefix = $spec['prefix'];

    $ngram_alias = 'ft'.$idx++;

    if ($prefix) {
      $joins[] = qsprintf(
        $conn,
        'JOIN %T %T ON %T.documentID = ftdoc.id AND %T.ngram LIKE %>',
        $table,
        $ngram_alias,
        $ngram_alias,
        $ngram_alias,
        $ngram);
    } else {
      $joins[] = qsprintf(
        $conn,
        'JOIN %T %T ON %T.documentID = ftdoc.id AND %T.ngram = %s',
        $table,
        $ngram_alias,
        $ngram_alias,
        $ngram_alias,
        $ngram);
    }
  }

  $joins[] = qsprintf(
    $conn,
    'JOIN %T ftfield ON ftdoc.id = ftfield.documentID',
    $field_table->getTableName());

  return $joins;
}
/**
 * Build the WHERE clauses which apply the Ferret constraint, if one is set.
 *
 * One clause is produced per query term. Each clause requires the term to
 * appear as a substring of either the raw or the normalized corpus of the
 * joined `ftfield` row.
 *
 * @param AphrontDatabaseConnection Connection used for escaping.
 * @return list<string> WHERE clauses; empty if no Ferret constraint is set.
 */
protected function buildFerretWhereClause(AphrontDatabaseConnection $conn) {
  $clauses = array();

  if ($this->ferretEngine) {
    foreach ($this->ferretConstraints as $term) {
      $clauses[] = qsprintf(
        $conn,
        '(ftfield.rawCorpus LIKE %~ OR ftfield.normalCorpus LIKE %~)',
        $term,
        $term);
    }
  }

  return $clauses;
}
/**
 * Test whether result rows need grouping because Ferret constraints are set.
 *
 * @return bool True if this query has any Ferret constraints.
 */
protected function shouldGroupFerretResultRows() {
  if ($this->ferretConstraints) {
    return true;
  }

  return false;
}
/* -( Ngrams )------------------------------------------------------------- */