mirror of
https://we.phorge.it/source/phorge.git
synced 2024-11-22 14:52:41 +01:00
Build a prototype fulltext engine ("Ferret") using only basic MySQL primitives
Summary: Ref T12819. I gave this stuff a sweet code name because all the terms related to "fulltext" and "search" already mean 5 different things. It, uh, ferrets out documents for you? I'm building this to work a lot like the existing ngram index, which seems to work pretty well. If this sticks, it will auto-resolve the join issue (in T12443) by letting us do the entire thing locally in a JOIN and thus dodge a lot of mess. This index gets built alongside other indexes, but only shows up in the UI if you have prototypes enabled. If you do, it appears under the existing fulltext field in Maniphest. No existing functionality is affected or disrupted. NOTE: The query engine half of this is still EXTREMELY primitive, and this probably performs worse than the existing field for now. If this doesn't show obvious signs of being awful on `secure` I'll improve that in followup changes. Test Plan: Indexed my tasks, ran some simple queries, got the results I wanted, even for queries "ko", "k", "v0.1". {F5147746} Reviewers: chad Reviewed By: chad Maniphest Tasks: T12819, T12443 Differential Revision: https://secure.phabricator.com/D18484
This commit is contained in:
parent
ed75250f1a
commit
f97157e7ed
18 changed files with 571 additions and 0 deletions
9
resources/sql/autopatches/20170828.ferret.01.taskdoc.sql
Normal file
9
resources/sql/autopatches/20170828.ferret.01.taskdoc.sql
Normal file
|
@ -0,0 +1,9 @@
|
|||
-- Document table of the Ferret index for Maniphest tasks.
CREATE TABLE {$NAMESPACE}_maniphest.maniphest_task_fdocument (
  id INT UNSIGNED NOT NULL AUTO_INCREMENT,
  objectPHID VARBINARY(64) NOT NULL,
  isClosed BOOL NOT NULL,
  authorPHID VARBINARY(64),
  ownerPHID VARBINARY(64),
  epochCreated INT UNSIGNED NOT NULL,
  epochModified INT UNSIGNED NOT NULL,
  PRIMARY KEY (id)
) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
|
|
@ -0,0 +1,7 @@
|
|||
-- Field table of the Ferret index for Maniphest tasks. Each row holds the
-- raw and normalized corpus of one field of one document.
CREATE TABLE {$NAMESPACE}_maniphest.maniphest_task_ffield (
  id INT UNSIGNED NOT NULL AUTO_INCREMENT,
  documentID INT UNSIGNED NOT NULL,
  fieldKey VARCHAR(4) NOT NULL COLLATE {$COLLATE_TEXT},
  rawCorpus LONGTEXT NOT NULL COLLATE {$COLLATE_SORT},
  normalCorpus LONGTEXT NOT NULL COLLATE {$COLLATE_SORT},
  PRIMARY KEY (id)
) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
|
|
@ -0,0 +1,5 @@
|
|||
-- Ngram table of the Ferret index for Maniphest tasks. Each row associates
-- one three-character ngram with a document.
CREATE TABLE {$NAMESPACE}_maniphest.maniphest_task_fngrams (
  id INT UNSIGNED NOT NULL AUTO_INCREMENT,
  documentID INT UNSIGNED NOT NULL,
  ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
  PRIMARY KEY (id)
) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
|
|
@ -1533,6 +1533,10 @@ phutil_register_library_map(array(
|
|||
'ManiphestTaskEditBulkJobType' => 'applications/maniphest/bulk/ManiphestTaskEditBulkJobType.php',
|
||||
'ManiphestTaskEditController' => 'applications/maniphest/controller/ManiphestTaskEditController.php',
|
||||
'ManiphestTaskEditEngineLock' => 'applications/maniphest/editor/ManiphestTaskEditEngineLock.php',
|
||||
'ManiphestTaskFerretDocument' => 'applications/maniphest/storage/ManiphestTaskFerretDocument.php',
|
||||
'ManiphestTaskFerretEngine' => 'applications/maniphest/search/ManiphestTaskFerretEngine.php',
|
||||
'ManiphestTaskFerretField' => 'applications/maniphest/storage/ManiphestTaskFerretField.php',
|
||||
'ManiphestTaskFerretNgrams' => 'applications/maniphest/storage/ManiphestTaskFerretNgrams.php',
|
||||
'ManiphestTaskFulltextEngine' => 'applications/maniphest/search/ManiphestTaskFulltextEngine.php',
|
||||
'ManiphestTaskGraph' => 'infrastructure/graph/ManiphestTaskGraph.php',
|
||||
'ManiphestTaskHasCommitEdgeType' => 'applications/maniphest/edge/ManiphestTaskHasCommitEdgeType.php',
|
||||
|
@ -2828,6 +2832,12 @@ phutil_register_library_map(array(
|
|||
'PhabricatorFeedStoryNotification' => 'applications/notification/storage/PhabricatorFeedStoryNotification.php',
|
||||
'PhabricatorFeedStoryPublisher' => 'applications/feed/PhabricatorFeedStoryPublisher.php',
|
||||
'PhabricatorFeedStoryReference' => 'applications/feed/storage/PhabricatorFeedStoryReference.php',
|
||||
'PhabricatorFerretDocument' => 'applications/search/ferret/PhabricatorFerretDocument.php',
|
||||
'PhabricatorFerretEngine' => 'applications/search/ferret/PhabricatorFerretEngine.php',
|
||||
'PhabricatorFerretField' => 'applications/search/ferret/PhabricatorFerretField.php',
|
||||
'PhabricatorFerretFulltextEngineExtension' => 'applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php',
|
||||
'PhabricatorFerretInterface' => 'applications/search/ferret/PhabricatorFerretInterface.php',
|
||||
'PhabricatorFerretNgrams' => 'applications/search/ferret/PhabricatorFerretNgrams.php',
|
||||
'PhabricatorFile' => 'applications/files/storage/PhabricatorFile.php',
|
||||
'PhabricatorFileAES256StorageFormat' => 'applications/files/format/PhabricatorFileAES256StorageFormat.php',
|
||||
'PhabricatorFileBundleLoader' => 'applications/files/query/PhabricatorFileBundleLoader.php',
|
||||
|
@ -3195,6 +3205,7 @@ phutil_register_library_map(array(
|
|||
'PhabricatorNamedQueryQuery' => 'applications/search/query/PhabricatorNamedQueryQuery.php',
|
||||
'PhabricatorNavigationRemarkupRule' => 'infrastructure/markup/rule/PhabricatorNavigationRemarkupRule.php',
|
||||
'PhabricatorNeverTriggerClock' => 'infrastructure/daemon/workers/clock/PhabricatorNeverTriggerClock.php',
|
||||
'PhabricatorNgramEngine' => 'applications/search/ngrams/PhabricatorNgramEngine.php',
|
||||
'PhabricatorNgramsIndexEngineExtension' => 'applications/search/engineextension/PhabricatorNgramsIndexEngineExtension.php',
|
||||
'PhabricatorNgramsInterface' => 'applications/search/interface/PhabricatorNgramsInterface.php',
|
||||
'PhabricatorNotificationBuilder' => 'applications/notification/builder/PhabricatorNotificationBuilder.php',
|
||||
|
@ -6659,6 +6670,7 @@ phutil_register_library_map(array(
|
|||
'PhabricatorSpacesInterface',
|
||||
'PhabricatorConduitResultInterface',
|
||||
'PhabricatorFulltextInterface',
|
||||
'PhabricatorFerretInterface',
|
||||
'DoorkeeperBridgedObjectInterface',
|
||||
'PhabricatorEditEngineSubtypeInterface',
|
||||
'PhabricatorEditEngineLockableInterface',
|
||||
|
@ -6682,6 +6694,10 @@ phutil_register_library_map(array(
|
|||
'ManiphestTaskEditBulkJobType' => 'PhabricatorWorkerBulkJobType',
|
||||
'ManiphestTaskEditController' => 'ManiphestController',
|
||||
'ManiphestTaskEditEngineLock' => 'PhabricatorEditEngineLock',
|
||||
'ManiphestTaskFerretDocument' => 'PhabricatorFerretDocument',
|
||||
'ManiphestTaskFerretEngine' => 'PhabricatorFerretEngine',
|
||||
'ManiphestTaskFerretField' => 'PhabricatorFerretField',
|
||||
'ManiphestTaskFerretNgrams' => 'PhabricatorFerretNgrams',
|
||||
'ManiphestTaskFulltextEngine' => 'PhabricatorFulltextEngine',
|
||||
'ManiphestTaskGraph' => 'PhabricatorObjectGraph',
|
||||
'ManiphestTaskHasCommitEdgeType' => 'PhabricatorEdgeType',
|
||||
|
@ -8147,6 +8163,11 @@ phutil_register_library_map(array(
|
|||
'PhabricatorFeedStoryNotification' => 'PhabricatorFeedDAO',
|
||||
'PhabricatorFeedStoryPublisher' => 'Phobject',
|
||||
'PhabricatorFeedStoryReference' => 'PhabricatorFeedDAO',
|
||||
'PhabricatorFerretDocument' => 'PhabricatorSearchDAO',
|
||||
'PhabricatorFerretEngine' => 'Phobject',
|
||||
'PhabricatorFerretField' => 'PhabricatorSearchDAO',
|
||||
'PhabricatorFerretFulltextEngineExtension' => 'PhabricatorFulltextEngineExtension',
|
||||
'PhabricatorFerretNgrams' => 'PhabricatorSearchDAO',
|
||||
'PhabricatorFile' => array(
|
||||
'PhabricatorFileDAO',
|
||||
'PhabricatorApplicationTransactionInterface',
|
||||
|
@ -8565,6 +8586,7 @@ phutil_register_library_map(array(
|
|||
'PhabricatorNamedQueryQuery' => 'PhabricatorCursorPagedPolicyAwareQuery',
|
||||
'PhabricatorNavigationRemarkupRule' => 'PhutilRemarkupRule',
|
||||
'PhabricatorNeverTriggerClock' => 'PhabricatorTriggerClock',
|
||||
'PhabricatorNgramEngine' => 'Phobject',
|
||||
'PhabricatorNgramsIndexEngineExtension' => 'PhabricatorIndexEngineExtension',
|
||||
'PhabricatorNgramsInterface' => 'PhabricatorIndexableInterface',
|
||||
'PhabricatorNotificationBuilder' => 'Phobject',
|
||||
|
|
|
@ -49,6 +49,8 @@ final class ManiphestTaskSearchEngine
|
|||
$subtype_map = id(new ManiphestTask())->newEditEngineSubtypeMap();
|
||||
$hide_subtypes = (count($subtype_map) == 1);
|
||||
|
||||
$hide_ferret = !PhabricatorEnv::getEnvConfig('phabricator.show-prototypes');
|
||||
|
||||
return array(
|
||||
id(new PhabricatorOwnersSearchField())
|
||||
->setLabel(pht('Assigned To'))
|
||||
|
@ -89,6 +91,10 @@ final class ManiphestTaskSearchEngine
|
|||
id(new PhabricatorSearchTextField())
|
||||
->setLabel(pht('Contains Words'))
|
||||
->setKey('fulltext'),
|
||||
id(new PhabricatorSearchTextField())
|
||||
->setLabel(pht('Matches (Prototype)'))
|
||||
->setKey('ferret')
|
||||
->setIsHidden($hide_ferret),
|
||||
id(new PhabricatorSearchThreeStateField())
|
||||
->setLabel(pht('Open Parents'))
|
||||
->setKey('hasParents')
|
||||
|
@ -145,6 +151,7 @@ final class ManiphestTaskSearchEngine
|
|||
'priorities',
|
||||
'subtypes',
|
||||
'fulltext',
|
||||
'ferret',
|
||||
'hasParents',
|
||||
'hasSubtasks',
|
||||
'parentIDs',
|
||||
|
@ -224,6 +231,12 @@ final class ManiphestTaskSearchEngine
|
|||
$query->withFullTextSearch($map['fulltext']);
|
||||
}
|
||||
|
||||
if (strlen($map['ferret'])) {
|
||||
$query->withFerretConstraint(
|
||||
id(new ManiphestTask())->newFerretEngine(),
|
||||
$map['ferret']);
|
||||
}
|
||||
|
||||
if ($map['parentIDs']) {
|
||||
$query->withParentTaskIDs($map['parentIDs']);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,18 @@
|
|||
<?php
|
||||
|
||||
/**
 * Ferret engine for Maniphest tasks.
 *
 * Supplies the task-specific storage objects (document, field and ngrams)
 * that the Ferret index reads from and writes to.
 */
final class ManiphestTaskFerretEngine
  extends PhabricatorFerretEngine {

  /**
   * @return ManiphestTaskFerretDocument New document storage object.
   */
  public function newDocumentObject() {
    $document = new ManiphestTaskFerretDocument();
    return $document;
  }

  /**
   * @return ManiphestTaskFerretField New field storage object.
   */
  public function newFieldObject() {
    $field = new ManiphestTaskFerretField();
    return $field;
  }

  /**
   * @return ManiphestTaskFerretNgrams New ngrams storage object.
   */
  public function newNgramsObject() {
    $ngrams = new ManiphestTaskFerretNgrams();
    return $ngrams;
  }

}
|
|
@ -16,6 +16,7 @@ final class ManiphestTask extends ManiphestDAO
|
|||
PhabricatorSpacesInterface,
|
||||
PhabricatorConduitResultInterface,
|
||||
PhabricatorFulltextInterface,
|
||||
PhabricatorFerretInterface,
|
||||
DoorkeeperBridgedObjectInterface,
|
||||
PhabricatorEditEngineSubtypeInterface,
|
||||
PhabricatorEditEngineLockableInterface {
|
||||
|
@ -603,4 +604,12 @@ final class ManiphestTask extends ManiphestDAO
|
|||
return new ManiphestTaskEditEngineLock();
|
||||
}
|
||||
|
||||
|
||||
/* -( PhabricatorFerretInterface )----------------------------------------- */
|
||||
|
||||
|
||||
/**
 * Create the Ferret engine used for tasks.
 *
 * @return ManiphestTaskFerretEngine A new engine instance.
 */
public function newFerretEngine() {
  $engine = new ManiphestTaskFerretEngine();
  return $engine;
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
<?php
|
||||
|
||||
/**
 * Ferret document storage for Maniphest tasks.
 *
 * The application name and index key returned here are combined by the
 * parent class into the table name "maniphest_task_fdocument".
 */
final class ManiphestTaskFerretDocument
  extends PhabricatorFerretDocument {

  /**
   * @return string Index key component of the table name.
   */
  public function getIndexKey() {
    return 'task';
  }

  /**
   * @return string Application component of the table name.
   */
  public function getApplicationName() {
    return 'maniphest';
  }

}
|
|
@ -0,0 +1,14 @@
|
|||
<?php
|
||||
|
||||
/**
 * Ferret field storage for Maniphest tasks.
 *
 * The application name and index key returned here are combined by the
 * parent class into the table name "maniphest_task_ffield".
 */
final class ManiphestTaskFerretField
  extends PhabricatorFerretField {

  /**
   * @return string Index key component of the table name.
   */
  public function getIndexKey() {
    return 'task';
  }

  /**
   * @return string Application component of the table name.
   */
  public function getApplicationName() {
    return 'maniphest';
  }

}
|
|
@ -0,0 +1,14 @@
|
|||
<?php
|
||||
|
||||
/**
 * Ferret ngram storage for Maniphest tasks.
 *
 * The application name and index key returned here are combined by the
 * parent class into the table name "maniphest_task_fngrams".
 */
final class ManiphestTaskFerretNgrams
  extends PhabricatorFerretNgrams {

  /**
   * @return string Index key component of the table name.
   */
  public function getIndexKey() {
    return 'task';
  }

  /**
   * @return string Application component of the table name.
   */
  public function getApplicationName() {
    return 'maniphest';
  }

}
|
|
@ -0,0 +1,126 @@
|
|||
<?php
|
||||
|
||||
/**
 * Fulltext engine extension which writes indexable objects into the
 * prototype "Ferret" index tables.
 *
 * For each indexed object this stores one document row, one field row per
 * nonempty field of the abstract search document, and one row per distinct
 * ngram found in the combined raw field text.
 */
final class PhabricatorFerretFulltextEngineExtension
  extends PhabricatorFulltextEngineExtension {

  const EXTENSIONKEY = 'ferret';


  public function getExtensionName() {
    return pht('Ferret Fulltext Engine');
  }


  /**
   * Only objects implementing PhabricatorFerretInterface are indexed.
   *
   * @param wild Candidate object.
   * @return bool True if this extension should index the object.
   */
  public function shouldIndexFulltextObject($object) {
    return ($object instanceof PhabricatorFerretInterface);
  }


  /**
   * Rebuild the Ferret index rows for an object.
   *
   * Any previously indexed rows for the same PHID are removed and replaced
   * inside a single transaction. If a write fails, the transaction is killed
   * and the exception is rethrown.
   *
   * @param PhabricatorFerretInterface Object being indexed.
   * @param PhabricatorSearchAbstractDocument Abstract document which provides
   *   the object PHID and the field data to index.
   * @return void
   */
  public function indexFulltextObject(
    $object,
    PhabricatorSearchAbstractDocument $document) {

    $phid = $document->getPHID();
    $engine = $object->newFerretEngine();

    // NOTE: The closed flag and both epochs are currently always written
    // as 0.
    $ferret_document = $engine->newDocumentObject()
      ->setObjectPHID($phid)
      ->setIsClosed(0)
      ->setEpochCreated(0)
      ->setEpochModified(0);

    $stemmer = new PhutilSearchStemmer();

    $ferret_fields = array();
    $ngrams_source = array();
    foreach ($document->getFieldData() as $field) {
      list($key, $raw_corpus) = $field;

      // Skip fields with no text. The explicit null check avoids passing
      // null to strlen().
      if ($raw_corpus === null || !strlen($raw_corpus)) {
        continue;
      }

      $normal_corpus = $stemmer->stemCorpus($raw_corpus);

      $ferret_fields[] = $engine->newFieldObject()
        ->setFieldKey($key)
        ->setRawCorpus($raw_corpus)
        ->setNormalCorpus($normal_corpus);

      $ngrams_source[] = $raw_corpus;
    }
    $ngrams_source = implode(' ', $ngrams_source);

    $ngrams = id(new PhabricatorNgramEngine())
      ->getNgramsFromString($ngrams_source, 'index');

    $ferret_document->openTransaction();
    try {
      $this->deleteOldDocument($engine, $object, $document);

      $ferret_document->save();

      $document_id = $ferret_document->getID();
      foreach ($ferret_fields as $ferret_field) {
        $ferret_field
          ->setDocumentID($document_id)
          ->save();
      }

      $ferret_ngrams = $engine->newNgramsObject();
      $conn = $ferret_ngrams->establishConnection('w');

      $sql = array();
      foreach ($ngrams as $ngram) {
        $sql[] = qsprintf(
          $conn,
          '(%d, %s)',
          $document_id,
          $ngram);
      }

      foreach (PhabricatorLiskDAO::chunkSQL($sql) as $chunk) {
        queryfx(
          $conn,
          'INSERT INTO %T (documentID, ngram) VALUES %Q',
          $ferret_ngrams->getTableName(),
          $chunk);
      }
    } catch (Exception $ex) {
      // Don't leave the transaction open if any of the writes failed.
      $ferret_document->killTransaction();
      throw $ex;
    }
    $ferret_document->saveTransaction();
  }


  /**
   * Remove the existing index rows (document, fields and ngrams) for the
   * PHID of the given abstract document, if a document row exists.
   *
   * @param PhabricatorFerretEngine Engine which provides the storage objects.
   * @param wild Object being indexed (currently unused).
   * @param PhabricatorSearchAbstractDocument Document which provides the PHID.
   * @return void
   */
  private function deleteOldDocument(
    PhabricatorFerretEngine $engine,
    $object,
    PhabricatorSearchAbstractDocument $document) {

    $old_document = $engine->newDocumentObject()->loadOneWhere(
      'objectPHID = %s',
      $document->getPHID());
    if (!$old_document) {
      return;
    }

    $conn = $old_document->establishConnection('w');
    $old_id = $old_document->getID();

    queryfx(
      $conn,
      'DELETE FROM %T WHERE id = %d',
      $engine->newDocumentObject()->getTableName(),
      $old_id);

    queryfx(
      $conn,
      'DELETE FROM %T WHERE documentID = %d',
      $engine->newFieldObject()->getTableName(),
      $old_id);

    queryfx(
      $conn,
      'DELETE FROM %T WHERE documentID = %d',
      $engine->newNgramsObject()->getTableName(),
      $old_id);
  }

}
|
40
src/applications/search/ferret/PhabricatorFerretDocument.php
Normal file
40
src/applications/search/ferret/PhabricatorFerretDocument.php
Normal file
|
@ -0,0 +1,40 @@
|
|||
<?php
|
||||
|
||||
/**
 * Base storage class for Ferret index documents.
 *
 * Subclasses supply an application name and an index key, which together
 * select the table "<application>_<indexKey>_fdocument".
 */
abstract class PhabricatorFerretDocument
  extends PhabricatorSearchDAO {

  protected $objectPHID;
  protected $isClosed;
  protected $authorPHID;
  protected $ownerPHID;
  protected $epochCreated;
  protected $epochModified;

  /**
   * @return string Index key used as the middle part of the table name.
   */
  abstract public function getIndexKey();

  /**
   * Declare the column and key schema. Settings given here take precedence
   * over same-named settings from the parent configuration.
   */
  protected function getConfiguration() {
    $column_schema = array(
      'isClosed' => 'bool',
      'authorPHID' => 'phid?',
      'ownerPHID' => 'phid?',
      'epochCreated' => 'epoch',
      'epochModified' => 'epoch',
    );

    // Declared as a unique key on the object PHID.
    $key_schema = array(
      'key_object' => array(
        'columns' => array('objectPHID'),
        'unique' => true,
      ),
    );

    $configuration = array(
      self::CONFIG_TIMESTAMPS => false,
      self::CONFIG_COLUMN_SCHEMA => $column_schema,
      self::CONFIG_KEY_SCHEMA => $key_schema,
    );

    return $configuration + parent::getConfiguration();
  }

  /**
   * @return string Table name, "<application>_<indexKey>_fdocument".
   */
  public function getTableName() {
    $application = $this->getApplicationName();
    $key = $this->getIndexKey();

    return $application.'_'.$key.'_fdocument';
  }

}
|
|
@ -0,0 +1,9 @@
|
|||
<?php
|
||||
|
||||
/**
 * Abstract factory for the storage objects of one Ferret index.
 *
 * Concrete engines return the document, field and ngrams storage objects
 * for a particular kind of indexed object.
 */
abstract class PhabricatorFerretEngine extends Phobject {

  /**
   * @return PhabricatorFerretDocument New document storage object.
   */
  abstract public function newDocumentObject();

  /**
   * @return PhabricatorFerretField New field storage object.
   */
  abstract public function newFieldObject();

  /**
   * @return PhabricatorFerretNgrams New ngrams storage object.
   */
  abstract public function newNgramsObject();

}
|
36
src/applications/search/ferret/PhabricatorFerretField.php
Normal file
36
src/applications/search/ferret/PhabricatorFerretField.php
Normal file
|
@ -0,0 +1,36 @@
|
|||
<?php
|
||||
|
||||
/**
 * Base storage class for the fields of a Ferret index document.
 *
 * Each row carries the raw and normalized corpus for one field key of one
 * document. Subclasses select the table "<application>_<indexKey>_ffield".
 */
abstract class PhabricatorFerretField
  extends PhabricatorSearchDAO {

  protected $documentID;
  protected $fieldKey;
  protected $rawCorpus;
  protected $normalCorpus;

  /**
   * @return string Index key used as the middle part of the table name.
   */
  abstract public function getIndexKey();

  /**
   * Declare the column and key schema. Settings given here take precedence
   * over same-named settings from the parent configuration.
   */
  protected function getConfiguration() {
    $column_schema = array(
      'documentID' => 'uint32',
      'fieldKey' => 'text4',
      'rawCorpus' => 'sort',
      'normalCorpus' => 'sort',
    );

    $key_schema = array(
      'key_document' => array(
        'columns' => array('documentID', 'fieldKey'),
      ),
    );

    $configuration = array(
      self::CONFIG_TIMESTAMPS => false,
      self::CONFIG_COLUMN_SCHEMA => $column_schema,
      self::CONFIG_KEY_SCHEMA => $key_schema,
    );

    return $configuration + parent::getConfiguration();
  }

  /**
   * @return string Table name, "<application>_<indexKey>_ffield".
   */
  public function getTableName() {
    $application = $this->getApplicationName();
    $key = $this->getIndexKey();

    return $application.'_'.$key.'_ffield';
  }

}
|
|
@ -0,0 +1,7 @@
|
|||
<?php
|
||||
|
||||
/**
 * Implemented by objects which can be indexed by the Ferret engine.
 */
interface PhabricatorFerretInterface {

  /**
   * Create the engine which provides this object's Ferret storage objects.
   *
   * @return PhabricatorFerretEngine New engine instance.
   */
  public function newFerretEngine();

}
|
35
src/applications/search/ferret/PhabricatorFerretNgrams.php
Normal file
35
src/applications/search/ferret/PhabricatorFerretNgrams.php
Normal file
|
@ -0,0 +1,35 @@
|
|||
<?php
|
||||
|
||||
/**
 * Base storage class for the ngrams of a Ferret index document.
 *
 * Each row associates one ngram with a document ID. Subclasses select the
 * table "<application>_<indexKey>_fngrams".
 */
abstract class PhabricatorFerretNgrams
  extends PhabricatorSearchDAO {

  protected $documentID;
  protected $ngram;

  /**
   * @return string Index key used as the middle part of the table name.
   */
  abstract public function getIndexKey();

  /**
   * Declare the column and key schema. Settings given here take precedence
   * over same-named settings from the parent configuration.
   */
  protected function getConfiguration() {
    $column_schema = array(
      'documentID' => 'uint32',
      'ngram' => 'char3',
    );

    $key_schema = array(
      'key_ngram' => array(
        'columns' => array('ngram', 'documentID'),
      ),
      'key_object' => array(
        'columns' => array('documentID'),
      ),
    );

    $configuration = array(
      self::CONFIG_TIMESTAMPS => false,
      self::CONFIG_COLUMN_SCHEMA => $column_schema,
      self::CONFIG_KEY_SCHEMA => $key_schema,
    );

    return $configuration + parent::getConfiguration();
  }

  /**
   * @return string Table name, "<application>_<indexKey>_fngrams".
   */
  public function getTableName() {
    $application = $this->getApplicationName();
    $key = $this->getIndexKey();

    return $application.'_'.$key.'_fngrams';
  }

}
|
41
src/applications/search/ngrams/PhabricatorNgramEngine.php
Normal file
41
src/applications/search/ngrams/PhabricatorNgramEngine.php
Normal file
|
@ -0,0 +1,41 @@
|
|||
<?php
|
||||
|
||||
/**
 * Splits text into lowercase three-character ngrams for the Ferret index.
 */
final class PhabricatorNgramEngine extends Phobject {

  /**
   * Split a string into whitespace-delimited tokens.
   *
   * Tokens are separated by any run of whitespace (spaces, tabs, newlines),
   * which matches how query strings are split into terms. Splitting only on
   * spaces would glue together words separated by a newline, so the second
   * word would never receive its leading-space ngram.
   *
   * @param string Text to tokenize.
   * @return list<string> Tokens. An empty string yields one empty token.
   */
  public function tokenizeString($value) {
    $value = trim($value);

    $tokens = preg_split('/\s+/u', $value);
    if ($tokens === false) {
      // The "u" modifier fails on invalid UTF-8. Fall back to splitting on
      // plain spaces instead of returning a non-array.
      $tokens = preg_split('/ +/', $value);
    }

    return $tokens;
  }

  /**
   * Extract the distinct, sorted ngrams of a string.
   *
   * Each token is lowercased and then padded according to the mode:
   *
   *   - `query`: no padding.
   *   - `index`: one space before and after the token.
   *   - `prefix`: one space before the token.
   *
   * Ngrams are windows of three characters, not three bytes, so multibyte
   * UTF-8 characters are never cut in half.
   *
   * @param string Text to extract ngrams from.
   * @param string One of `query`, `index` or `prefix`.
   * @return list<string> Distinct ngrams, sorted by key.
   */
  public function getNgramsFromString($value, $mode) {
    $tokens = $this->tokenizeString($value);

    $ngrams = array();
    foreach ($tokens as $token) {
      $token = phutil_utf8_strtolower($token);

      switch ($mode) {
        case 'query':
          break;
        case 'index':
          $token = ' '.$token.' ';
          break;
        case 'prefix':
          $token = ' '.$token;
          break;
      }

      $characters = preg_split('//u', $token, -1, PREG_SPLIT_NO_EMPTY);
      if ($characters === false) {
        // Invalid UTF-8: degrade to the byte-wise windows rather than fail.
        $characters = str_split($token);
      }

      $window_count = count($characters) - 2;
      for ($ii = 0; $ii < $window_count; $ii++) {
        $ngram = implode('', array_slice($characters, $ii, 3));
        $ngrams[$ngram] = $ngram;
      }
    }

    ksort($ngrams);

    // Return the values rather than the keys: PHP converts numeric string
    // keys like "123" to integers, but the values are always strings.
    return array_values($ngrams);
  }

}
|
|
@ -27,6 +27,8 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
|||
private $spacePHIDs;
|
||||
private $spaceIsArchived;
|
||||
private $ngrams = array();
|
||||
private $ferretEngine;
|
||||
private $ferretConstraints;
|
||||
|
||||
protected function getPageCursors(array $page) {
|
||||
return array(
|
||||
|
@ -270,6 +272,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
|||
$joins[] = $this->buildEdgeLogicJoinClause($conn);
|
||||
$joins[] = $this->buildApplicationSearchJoinClause($conn);
|
||||
$joins[] = $this->buildNgramsJoinClause($conn);
|
||||
$joins[] = $this->buildFerretJoinClause($conn);
|
||||
return $joins;
|
||||
}
|
||||
|
||||
|
@ -292,6 +295,7 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
|||
$where[] = $this->buildEdgeLogicWhereClause($conn);
|
||||
$where[] = $this->buildSpacesWhereClause($conn);
|
||||
$where[] = $this->buildNgramsWhereClause($conn);
|
||||
$where[] = $this->buildFerretWhereClause($conn);
|
||||
return $where;
|
||||
}
|
||||
|
||||
|
@ -346,6 +350,10 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
|||
return true;
|
||||
}
|
||||
|
||||
if ($this->shouldGroupFerretResultRows()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -1373,6 +1381,150 @@ abstract class PhabricatorCursorPagedPolicyAwareQuery
|
|||
}
|
||||
|
||||
|
||||
/* -( Ferret )------------------------------------------------------------- */
|
||||
|
||||
|
||||
/**
 * Constrain this query to objects matching a Ferret fulltext query.
 *
 * The raw query is split on whitespace into terms. Empty and
 * whitespace-only queries add no constraint.
 *
 * @param PhabricatorFerretEngine Engine which provides the index tables.
 * @param string Raw query string entered by the user.
 * @return this
 * @throws Exception If a Ferret constraint has already been applied.
 */
public function withFerretConstraint(
  PhabricatorFerretEngine $engine,
  $raw_query) {

  if ($this->ferretEngine) {
    throw new Exception(
      pht(
        'Query may not have multiple fulltext constraints.'));
  }

  if ($raw_query === null || !strlen($raw_query)) {
    return $this;
  }

  // Discard empty pieces so leading or trailing whitespace does not produce
  // empty terms, which would match every document.
  $terms = preg_split('/\s+/', $raw_query, -1, PREG_SPLIT_NO_EMPTY);
  if (!$terms) {
    return $this;
  }

  $this->ferretEngine = $engine;
  $this->ferretConstraints = $terms;

  return $this;
}
|
||||
|
||||
/**
 * Build the JOIN clauses which restrict results using the Ferret index.
 *
 * The result joins the document table (as `ftdoc`) on the object PHID, then
 * one ngram table alias (`ft1`, `ft2`, ...) per selected ngram, and finally
 * the field table (as `ftfield`).
 *
 * Terms of three or more characters are expanded into their ngrams, a
 * two-character term becomes a single space-prefixed ngram, and a
 * one-character term is matched as an ngram prefix.
 *
 * @param AphrontDatabaseConnection Connection used for escaping.
 * @return list<string> JOIN clauses, or an empty list if no Ferret
 *   constraint is active.
 */
protected function buildFerretJoinClause(AphrontDatabaseConnection $conn) {
  if (!$this->ferretEngine) {
    return array();
  }

  $engine = $this->ferretEngine;
  $ngram_engine = new PhabricatorNgramEngine();

  $ngram_table = $engine->newNgramsObject();
  $ngram_table_name = $ngram_table->getTableName();

  $flat = array();
  foreach ($this->ferretConstraints as $term) {
    $length = count(phutil_utf8v($term));

    if (!$length) {
      // An empty term cannot narrow the results, so don't spend a join on it.
      continue;
    }

    if ($length >= 3) {
      $ngrams = $ngram_engine->getNgramsFromString($term, 'query');
      $prefix = false;
    } else if ($length == 2) {
      $ngrams = $ngram_engine->getNgramsFromString($term, 'prefix');
      $prefix = false;
    } else {
      // Indexed ngrams are always lowercased by the ngram engine, so
      // lowercase the term here too. Otherwise an uppercase one-character
      // query could fail to match if the ngram column compares
      // case-sensitively.
      $ngrams = array(' '.phutil_utf8_strtolower($term));
      $prefix = true;
    }

    foreach ($ngrams as $ngram) {
      $flat[] = array(
        'table' => $ngram_table_name,
        'ngram' => $ngram,
        'prefix' => $prefix,
      );
    }
  }

  // MySQL only allows us to join a maximum of 61 tables per query. Each
  // ngram is going to cost us a join toward that limit, so if the user
  // specified a very long query string, just pick 16 of the ngrams
  // at random.
  if (count($flat) > 16) {
    shuffle($flat);
    $flat = array_slice($flat, 0, 16);
  }

  $primary_alias = $this->getPrimaryTableAlias();
  if ($primary_alias) {
    $phid_column = qsprintf($conn, '%T.%T', $primary_alias, 'phid');
  } else {
    $phid_column = qsprintf($conn, '%T', 'phid');
  }

  $document_table = $engine->newDocumentObject();
  $field_table = $engine->newFieldObject();

  $joins = array();
  $joins[] = qsprintf(
    $conn,
    'JOIN %T ftdoc ON ftdoc.objectPHID = %Q',
    $document_table->getTableName(),
    $phid_column);

  $idx = 1;
  foreach ($flat as $spec) {
    $table = $spec['table'];
    $ngram = $spec['ngram'];
    $prefix = $spec['prefix'];

    // Each ngram join gets its own alias: ft1, ft2, ...
    $ngram_alias = 'ft'.$idx++;

    if ($prefix) {
      $joins[] = qsprintf(
        $conn,
        'JOIN %T %T ON %T.documentID = ftdoc.id AND %T.ngram LIKE %>',
        $table,
        $ngram_alias,
        $ngram_alias,
        $ngram_alias,
        $ngram);
    } else {
      $joins[] = qsprintf(
        $conn,
        'JOIN %T %T ON %T.documentID = ftdoc.id AND %T.ngram = %s',
        $table,
        $ngram_alias,
        $ngram_alias,
        $ngram_alias,
        $ngram);
    }
  }

  $joins[] = qsprintf(
    $conn,
    'JOIN %T ftfield ON ftdoc.id = ftfield.documentID',
    $field_table->getTableName());

  return $joins;
}
|
||||
|
||||
/**
 * Build the WHERE clauses for the active Ferret constraint.
 *
 * Emits one clause per term, requiring the term to appear as a substring of
 * either the raw or the normalized corpus of the joined `ftfield` row.
 *
 * @param AphrontDatabaseConnection Connection used for escaping.
 * @return list<string> WHERE clauses, or an empty list if no Ferret
 *   constraint is active.
 */
protected function buildFerretWhereClause(AphrontDatabaseConnection $conn) {
  $clauses = array();

  if ($this->ferretEngine) {
    foreach ($this->ferretConstraints as $term) {
      $clauses[] = qsprintf(
        $conn,
        '(ftfield.rawCorpus LIKE %~ OR ftfield.normalCorpus LIKE %~)',
        $term,
        $term);
    }
  }

  return $clauses;
}
|
||||
|
||||
/**
 * Report whether the result rows need grouping because Ferret constraints
 * are set on this query.
 *
 * @return bool True if any Ferret constraints are set.
 */
protected function shouldGroupFerretResultRows() {
  if ($this->ferretConstraints) {
    return true;
  }

  return false;
}
|
||||
|
||||
|
||||
/* -( Ngrams )------------------------------------------------------------- */
|
||||
|
||||
|
||||
|
|
Loading…
Reference in a new issue