mirror of
https://we.phorge.it/source/phorge.git
synced 2024-12-18 19:40:55 +01:00
Use stemming in the MySQL fulltext search engine
Summary: Ref T6740. When we index a document, also save a copy of the stemmed version. When querying, search the combined corpus for the terms. (We may need to tune this a bit later since it's possible for literal, quoted terms to match in the stemmed section, but I think this will rarely cause issues in practice.) A downside here is that search sort of breaks if you upgrade into this and don't reindex. I wasn't able to find a way to issue the query that remained compatible with older indexes and didn't have awful performance, so my plan is: - Put this on `secure`. - Rebuild the index. - If things look good after a couple of days, add a way that we can tell people they need to rebuild the search index with a setup warning. We might get some reports between now and then, but if this is super awful we should know by the end of the weekend. Test Plan: WOW AMAZING {F2021466} Reviewers: chad Reviewed By: chad Maniphest Tasks: T6740 Differential Revision: https://secure.phabricator.com/D16947
This commit is contained in:
parent
d54c14c644
commit
7c5b5327c8
3 changed files with 29 additions and 11 deletions
2
resources/sql/autopatches/20161125.search.01.stemmed.sql
Normal file
2
resources/sql/autopatches/20161125.search.01.stemmed.sql
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
ALTER TABLE {$NAMESPACE}_search.search_documentfield
|
||||||
|
ADD stemmedCorpus LONGTEXT COLLATE {$COLLATE_FULLTEXT};
|
|
@ -33,6 +33,8 @@ final class PhabricatorMySQLFulltextStorageEngine
|
||||||
|
|
||||||
$conn_w = $store->establishConnection('w');
|
$conn_w = $store->establishConnection('w');
|
||||||
|
|
||||||
|
$stemmer = new PhutilSearchStemmer();
|
||||||
|
|
||||||
$field_dao = new PhabricatorSearchDocumentField();
|
$field_dao = new PhabricatorSearchDocumentField();
|
||||||
queryfx(
|
queryfx(
|
||||||
$conn_w,
|
$conn_w,
|
||||||
|
@ -41,16 +43,21 @@ final class PhabricatorMySQLFulltextStorageEngine
|
||||||
$phid);
|
$phid);
|
||||||
foreach ($doc->getFieldData() as $field) {
|
foreach ($doc->getFieldData() as $field) {
|
||||||
list($ftype, $corpus, $aux_phid) = $field;
|
list($ftype, $corpus, $aux_phid) = $field;
|
||||||
|
|
||||||
|
$stemmed_corpus = $stemmer->stemCorpus($corpus);
|
||||||
|
|
||||||
queryfx(
|
queryfx(
|
||||||
$conn_w,
|
$conn_w,
|
||||||
'INSERT INTO %T (phid, phidType, field, auxPHID, corpus) '.
|
'INSERT INTO %T
|
||||||
'VALUES (%s, %s, %s, %ns, %s)',
|
(phid, phidType, field, auxPHID, corpus, stemmedCorpus) '.
|
||||||
|
'VALUES (%s, %s, %s, %ns, %s, %s)',
|
||||||
$field_dao->getTableName(),
|
$field_dao->getTableName(),
|
||||||
$phid,
|
$phid,
|
||||||
$doc->getDocumentType(),
|
$doc->getDocumentType(),
|
||||||
$ftype,
|
$ftype,
|
||||||
$aux_phid,
|
$aux_phid,
|
||||||
$corpus);
|
$corpus,
|
||||||
|
$stemmed_corpus);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -205,8 +212,9 @@ final class PhabricatorMySQLFulltextStorageEngine
|
||||||
if (strlen($compiled_query)) {
|
if (strlen($compiled_query)) {
|
||||||
$select[] = qsprintf(
|
$select[] = qsprintf(
|
||||||
$conn,
|
$conn,
|
||||||
'IF(field.field = %s, %d, 0) + '.
|
'IF(field.field = %s, %d, 0) +
|
||||||
'MATCH(corpus) AGAINST (%s IN BOOLEAN MODE) AS fieldScore',
|
MATCH(corpus, stemmedCorpus) AGAINST (%s IN BOOLEAN MODE)
|
||||||
|
AS fieldScore',
|
||||||
$title_field,
|
$title_field,
|
||||||
$title_boost,
|
$title_boost,
|
||||||
$compiled_query);
|
$compiled_query);
|
||||||
|
@ -218,7 +226,7 @@ final class PhabricatorMySQLFulltextStorageEngine
|
||||||
|
|
||||||
$where[] = qsprintf(
|
$where[] = qsprintf(
|
||||||
$conn,
|
$conn,
|
||||||
'MATCH(corpus) AGAINST (%s IN BOOLEAN MODE)',
|
'MATCH(corpus, stemmedCorpus) AGAINST (%s IN BOOLEAN MODE)',
|
||||||
$compiled_query);
|
$compiled_query);
|
||||||
|
|
||||||
if ($query->getParameter('field')) {
|
if ($query->getParameter('field')) {
|
||||||
|
@ -380,11 +388,17 @@ final class PhabricatorMySQLFulltextStorageEngine
|
||||||
}
|
}
|
||||||
|
|
||||||
private function compileQuery($raw_query) {
|
private function compileQuery($raw_query) {
|
||||||
$compiler = PhabricatorSearchDocument::newQueryCompiler();
|
$stemmer = new PhutilSearchStemmer();
|
||||||
|
|
||||||
return $compiler
|
$compiler = PhabricatorSearchDocument::newQueryCompiler()
|
||||||
->setQuery($raw_query)
|
->setQuery($raw_query)
|
||||||
->compileQuery();
|
->setStemmer($stemmer);
|
||||||
|
|
||||||
|
$queries = array();
|
||||||
|
$queries[] = $compiler->compileLiteralQuery();
|
||||||
|
$queries[] = $compiler->compileStemmedQuery();
|
||||||
|
|
||||||
|
return implode(' ', array_filter($queries));
|
||||||
}
|
}
|
||||||
|
|
||||||
public function indexExists() {
|
public function indexExists() {
|
||||||
|
|
|
@ -6,6 +6,7 @@ final class PhabricatorSearchDocumentField extends PhabricatorSearchDAO {
|
||||||
protected $field;
|
protected $field;
|
||||||
protected $auxPHID;
|
protected $auxPHID;
|
||||||
protected $corpus;
|
protected $corpus;
|
||||||
|
protected $stemmedCorpus;
|
||||||
|
|
||||||
protected function getConfiguration() {
|
protected function getConfiguration() {
|
||||||
return array(
|
return array(
|
||||||
|
@ -16,14 +17,15 @@ final class PhabricatorSearchDocumentField extends PhabricatorSearchDAO {
|
||||||
'field' => 'text4',
|
'field' => 'text4',
|
||||||
'auxPHID' => 'phid?',
|
'auxPHID' => 'phid?',
|
||||||
'corpus' => 'fulltext?',
|
'corpus' => 'fulltext?',
|
||||||
|
'stemmedCorpus' => 'fulltext?',
|
||||||
),
|
),
|
||||||
self::CONFIG_KEY_SCHEMA => array(
|
self::CONFIG_KEY_SCHEMA => array(
|
||||||
'key_phid' => null,
|
'key_phid' => null,
|
||||||
'phid' => array(
|
'phid' => array(
|
||||||
'columns' => array('phid'),
|
'columns' => array('phid'),
|
||||||
),
|
),
|
||||||
'corpus' => array(
|
'key_corpus' => array(
|
||||||
'columns' => array('corpus'),
|
'columns' => array('corpus', 'stemmedCorpus'),
|
||||||
'type' => 'FULLTEXT',
|
'type' => 'FULLTEXT',
|
||||||
),
|
),
|
||||||
),
|
),
|
||||||
|
|
Loading…
Reference in a new issue