1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2024-12-18 19:40:55 +01:00

Use stemming in the MySQL fulltext search engine

Summary:
Ref T6740. When we index a document, also save a copy of the stemmed version.

When querying, search the combined corpus for the terms.

(We may need to tune this a bit later since it's possible for literal, quoted terms to match in the stemmed section, but I think this will rarely cause issues in practice.)

A downside here is that search sort of breaks if you upgrade into this and don't reindex. I wasn't able to find a way to issue the query that remained compatible with older indexes and didn't have awful performance, so my plan is:

  - Put this on `secure`.
  - Rebuild the index.
  - If things look good after a couple of days, add a setup warning that tells people they need to rebuild the search index.

We might get some reports between now and then, but if this is super awful we should know by the end of the weekend.

Test Plan:
WOW AMAZING

{F2021466}

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T6740

Differential Revision: https://secure.phabricator.com/D16947
This commit is contained in:
epriestley 2016-11-25 13:52:45 -08:00
parent d54c14c644
commit 7c5b5327c8
3 changed files with 29 additions and 11 deletions

View file

@ -0,0 +1,2 @@
ALTER TABLE {$NAMESPACE}_search.search_documentfield
ADD stemmedCorpus LONGTEXT COLLATE {$COLLATE_FULLTEXT};

View file

@ -33,6 +33,8 @@ final class PhabricatorMySQLFulltextStorageEngine
$conn_w = $store->establishConnection('w'); $conn_w = $store->establishConnection('w');
$stemmer = new PhutilSearchStemmer();
$field_dao = new PhabricatorSearchDocumentField(); $field_dao = new PhabricatorSearchDocumentField();
queryfx( queryfx(
$conn_w, $conn_w,
@ -41,16 +43,21 @@ final class PhabricatorMySQLFulltextStorageEngine
$phid); $phid);
foreach ($doc->getFieldData() as $field) { foreach ($doc->getFieldData() as $field) {
list($ftype, $corpus, $aux_phid) = $field; list($ftype, $corpus, $aux_phid) = $field;
$stemmed_corpus = $stemmer->stemCorpus($corpus);
queryfx( queryfx(
$conn_w, $conn_w,
'INSERT INTO %T (phid, phidType, field, auxPHID, corpus) '. 'INSERT INTO %T
'VALUES (%s, %s, %s, %ns, %s)', (phid, phidType, field, auxPHID, corpus, stemmedCorpus) '.
'VALUES (%s, %s, %s, %ns, %s, %s)',
$field_dao->getTableName(), $field_dao->getTableName(),
$phid, $phid,
$doc->getDocumentType(), $doc->getDocumentType(),
$ftype, $ftype,
$aux_phid, $aux_phid,
$corpus); $corpus,
$stemmed_corpus);
} }
@ -205,8 +212,9 @@ final class PhabricatorMySQLFulltextStorageEngine
if (strlen($compiled_query)) { if (strlen($compiled_query)) {
$select[] = qsprintf( $select[] = qsprintf(
$conn, $conn,
'IF(field.field = %s, %d, 0) + '. 'IF(field.field = %s, %d, 0) +
'MATCH(corpus) AGAINST (%s IN BOOLEAN MODE) AS fieldScore', MATCH(corpus, stemmedCorpus) AGAINST (%s IN BOOLEAN MODE)
AS fieldScore',
$title_field, $title_field,
$title_boost, $title_boost,
$compiled_query); $compiled_query);
@ -218,7 +226,7 @@ final class PhabricatorMySQLFulltextStorageEngine
$where[] = qsprintf( $where[] = qsprintf(
$conn, $conn,
'MATCH(corpus) AGAINST (%s IN BOOLEAN MODE)', 'MATCH(corpus, stemmedCorpus) AGAINST (%s IN BOOLEAN MODE)',
$compiled_query); $compiled_query);
if ($query->getParameter('field')) { if ($query->getParameter('field')) {
@ -380,11 +388,17 @@ final class PhabricatorMySQLFulltextStorageEngine
} }
private function compileQuery($raw_query) { private function compileQuery($raw_query) {
$compiler = PhabricatorSearchDocument::newQueryCompiler(); $stemmer = new PhutilSearchStemmer();
return $compiler $compiler = PhabricatorSearchDocument::newQueryCompiler()
->setQuery($raw_query) ->setQuery($raw_query)
->compileQuery(); ->setStemmer($stemmer);
$queries = array();
$queries[] = $compiler->compileLiteralQuery();
$queries[] = $compiler->compileStemmedQuery();
return implode(' ', array_filter($queries));
} }
public function indexExists() { public function indexExists() {

View file

@ -6,6 +6,7 @@ final class PhabricatorSearchDocumentField extends PhabricatorSearchDAO {
protected $field; protected $field;
protected $auxPHID; protected $auxPHID;
protected $corpus; protected $corpus;
protected $stemmedCorpus;
protected function getConfiguration() { protected function getConfiguration() {
return array( return array(
@ -16,14 +17,15 @@ final class PhabricatorSearchDocumentField extends PhabricatorSearchDAO {
'field' => 'text4', 'field' => 'text4',
'auxPHID' => 'phid?', 'auxPHID' => 'phid?',
'corpus' => 'fulltext?', 'corpus' => 'fulltext?',
'stemmedCorpus' => 'fulltext?',
), ),
self::CONFIG_KEY_SCHEMA => array( self::CONFIG_KEY_SCHEMA => array(
'key_phid' => null, 'key_phid' => null,
'phid' => array( 'phid' => array(
'columns' => array('phid'), 'columns' => array('phid'),
), ),
'corpus' => array( 'key_corpus' => array(
'columns' => array('corpus'), 'columns' => array('corpus', 'stemmedCorpus'),
'type' => 'FULLTEXT', 'type' => 'FULLTEXT',
), ),
), ),