1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2024-12-18 11:30:55 +01:00

Use stemming in the MySQL fulltext search engine

Summary:
Ref T6740. When we index a document, also save a copy of the stemmed version.

When querying, search the combined corpus for the terms.

(We may need to tune this a bit later since it's possible for literal, quoted terms to match in the stemmed section, but I think this will rarely cause issues in practice.)

A downside here is that search sort of breaks if you upgrade into this and don't reindex. I wasn't able to find a way to issue the query that remained compatible with older indexes and didn't have awful performance, so my plan is:

  - Put this on `secure`.
  - Rebuild the index.
  - If things look good after a couple of days, add a setup warning that tells people they need to rebuild the search index.

We might get some reports between now and then, but if this is super awful we should know by the end of the weekend.

Test Plan:
WOW AMAZING

{F2021466}

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T6740

Differential Revision: https://secure.phabricator.com/D16947
This commit is contained in:
epriestley 2016-11-25 13:52:45 -08:00
parent d54c14c644
commit 7c5b5327c8
3 changed files with 29 additions and 11 deletions

View file

@ -0,0 +1,2 @@
ALTER TABLE {$NAMESPACE}_search.search_documentfield
ADD stemmedCorpus LONGTEXT COLLATE {$COLLATE_FULLTEXT};

View file

@ -33,6 +33,8 @@ final class PhabricatorMySQLFulltextStorageEngine
$conn_w = $store->establishConnection('w');
$stemmer = new PhutilSearchStemmer();
$field_dao = new PhabricatorSearchDocumentField();
queryfx(
$conn_w,
@ -41,16 +43,21 @@ final class PhabricatorMySQLFulltextStorageEngine
$phid);
foreach ($doc->getFieldData() as $field) {
list($ftype, $corpus, $aux_phid) = $field;
$stemmed_corpus = $stemmer->stemCorpus($corpus);
queryfx(
$conn_w,
'INSERT INTO %T (phid, phidType, field, auxPHID, corpus) '.
'VALUES (%s, %s, %s, %ns, %s)',
'INSERT INTO %T
(phid, phidType, field, auxPHID, corpus, stemmedCorpus) '.
'VALUES (%s, %s, %s, %ns, %s, %s)',
$field_dao->getTableName(),
$phid,
$doc->getDocumentType(),
$ftype,
$aux_phid,
$corpus);
$corpus,
$stemmed_corpus);
}
@ -205,8 +212,9 @@ final class PhabricatorMySQLFulltextStorageEngine
if (strlen($compiled_query)) {
$select[] = qsprintf(
$conn,
'IF(field.field = %s, %d, 0) + '.
'MATCH(corpus) AGAINST (%s IN BOOLEAN MODE) AS fieldScore',
'IF(field.field = %s, %d, 0) +
MATCH(corpus, stemmedCorpus) AGAINST (%s IN BOOLEAN MODE)
AS fieldScore',
$title_field,
$title_boost,
$compiled_query);
@ -218,7 +226,7 @@ final class PhabricatorMySQLFulltextStorageEngine
$where[] = qsprintf(
$conn,
'MATCH(corpus) AGAINST (%s IN BOOLEAN MODE)',
'MATCH(corpus, stemmedCorpus) AGAINST (%s IN BOOLEAN MODE)',
$compiled_query);
if ($query->getParameter('field')) {
@ -380,11 +388,17 @@ final class PhabricatorMySQLFulltextStorageEngine
}
private function compileQuery($raw_query) {
$compiler = PhabricatorSearchDocument::newQueryCompiler();
$stemmer = new PhutilSearchStemmer();
return $compiler
$compiler = PhabricatorSearchDocument::newQueryCompiler()
->setQuery($raw_query)
->compileQuery();
->setStemmer($stemmer);
$queries = array();
$queries[] = $compiler->compileLiteralQuery();
$queries[] = $compiler->compileStemmedQuery();
return implode(' ', array_filter($queries));
}
public function indexExists() {

View file

@ -6,6 +6,7 @@ final class PhabricatorSearchDocumentField extends PhabricatorSearchDAO {
protected $field;
protected $auxPHID;
protected $corpus;
protected $stemmedCorpus;
protected function getConfiguration() {
return array(
@ -16,14 +17,15 @@ final class PhabricatorSearchDocumentField extends PhabricatorSearchDAO {
'field' => 'text4',
'auxPHID' => 'phid?',
'corpus' => 'fulltext?',
'stemmedCorpus' => 'fulltext?',
),
self::CONFIG_KEY_SCHEMA => array(
'key_phid' => null,
'phid' => array(
'columns' => array('phid'),
),
'corpus' => array(
'columns' => array('corpus'),
'key_corpus' => array(
'columns' => array('corpus', 'stemmedCorpus'),
'type' => 'FULLTEXT',
),
),