From 7c5b5327c8cc4e6bba0f255688164035cefbb4a2 Mon Sep 17 00:00:00 2001 From: epriestley Date: Fri, 25 Nov 2016 13:52:45 -0800 Subject: [PATCH] Use stemming in the MySQL fulltext search engine Summary: Ref T6740. When we index a document, also save a copy of the stemmed version. When querying, search the combined corpus for the terms. (We may need to tune this a bit later since it's possible for literal, quoted terms to match in the stemmed section, but I think this will rarely cause issues in practice.) A downside here is that search sort of breaks if you upgrade into this and don't reindex. I wasn't able to find a way to issue the query that remained compatible with older indexes and didn't have awful performance, so my plan is: - Put this on `secure`. - Rebuild the index. - If things look good after a couple of days, add a way that we can tell people they need to rebuild the search index with a setup warning. We might get some reports between now and then, but if this is super awful we should know by the end of the weekend. 
Test Plan: WOW AMAZING {F2021466} Reviewers: chad Reviewed By: chad Maniphest Tasks: T6740 Differential Revision: https://secure.phabricator.com/D16947 --- .../20161125.search.01.stemmed.sql | 2 ++ .../PhabricatorMySQLFulltextStorageEngine.php | 32 +++++++++++++------ .../PhabricatorSearchDocumentField.php | 6 ++-- 3 files changed, 29 insertions(+), 11 deletions(-) create mode 100644 resources/sql/autopatches/20161125.search.01.stemmed.sql diff --git a/resources/sql/autopatches/20161125.search.01.stemmed.sql b/resources/sql/autopatches/20161125.search.01.stemmed.sql new file mode 100644 index 0000000000..acb0ac58aa --- /dev/null +++ b/resources/sql/autopatches/20161125.search.01.stemmed.sql @@ -0,0 +1,2 @@ +ALTER TABLE {$NAMESPACE}_search.search_documentfield + ADD stemmedCorpus LONGTEXT COLLATE {$COLLATE_FULLTEXT}; diff --git a/src/applications/search/fulltextstorage/PhabricatorMySQLFulltextStorageEngine.php b/src/applications/search/fulltextstorage/PhabricatorMySQLFulltextStorageEngine.php index 5703efb075..3e02c78077 100644 --- a/src/applications/search/fulltextstorage/PhabricatorMySQLFulltextStorageEngine.php +++ b/src/applications/search/fulltextstorage/PhabricatorMySQLFulltextStorageEngine.php @@ -33,6 +33,8 @@ final class PhabricatorMySQLFulltextStorageEngine $conn_w = $store->establishConnection('w'); + $stemmer = new PhutilSearchStemmer(); + $field_dao = new PhabricatorSearchDocumentField(); queryfx( $conn_w, @@ -41,16 +43,21 @@ final class PhabricatorMySQLFulltextStorageEngine $phid); foreach ($doc->getFieldData() as $field) { list($ftype, $corpus, $aux_phid) = $field; + + $stemmed_corpus = $stemmer->stemCorpus($corpus); + queryfx( $conn_w, - 'INSERT INTO %T (phid, phidType, field, auxPHID, corpus) '. - 'VALUES (%s, %s, %s, %ns, %s)', + 'INSERT INTO %T + (phid, phidType, field, auxPHID, corpus, stemmedCorpus) '. 
+ 'VALUES (%s, %s, %s, %ns, %s, %s)', $field_dao->getTableName(), $phid, $doc->getDocumentType(), $ftype, $aux_phid, - $corpus); + $corpus, + $stemmed_corpus); } @@ -205,8 +212,9 @@ final class PhabricatorMySQLFulltextStorageEngine if (strlen($compiled_query)) { $select[] = qsprintf( $conn, - 'IF(field.field = %s, %d, 0) + '. - 'MATCH(corpus) AGAINST (%s IN BOOLEAN MODE) AS fieldScore', + 'IF(field.field = %s, %d, 0) + + MATCH(corpus, stemmedCorpus) AGAINST (%s IN BOOLEAN MODE) + AS fieldScore', $title_field, $title_boost, $compiled_query); @@ -218,7 +226,7 @@ final class PhabricatorMySQLFulltextStorageEngine $where[] = qsprintf( $conn, - 'MATCH(corpus) AGAINST (%s IN BOOLEAN MODE)', + 'MATCH(corpus, stemmedCorpus) AGAINST (%s IN BOOLEAN MODE)', $compiled_query); if ($query->getParameter('field')) { @@ -380,11 +388,17 @@ final class PhabricatorMySQLFulltextStorageEngine } private function compileQuery($raw_query) { - $compiler = PhabricatorSearchDocument::newQueryCompiler(); + $stemmer = new PhutilSearchStemmer(); - return $compiler + $compiler = PhabricatorSearchDocument::newQueryCompiler() ->setQuery($raw_query) - ->compileQuery(); + ->setStemmer($stemmer); + + $queries = array(); + $queries[] = $compiler->compileLiteralQuery(); + $queries[] = $compiler->compileStemmedQuery(); + + return implode(' ', array_filter($queries)); } public function indexExists() { diff --git a/src/applications/search/storage/document/PhabricatorSearchDocumentField.php b/src/applications/search/storage/document/PhabricatorSearchDocumentField.php index 49529963b0..9c5b839f1a 100644 --- a/src/applications/search/storage/document/PhabricatorSearchDocumentField.php +++ b/src/applications/search/storage/document/PhabricatorSearchDocumentField.php @@ -6,6 +6,7 @@ final class PhabricatorSearchDocumentField extends PhabricatorSearchDAO { protected $field; protected $auxPHID; protected $corpus; + protected $stemmedCorpus; protected function getConfiguration() { return array( @@ -16,14 +17,15 @@ 
final class PhabricatorSearchDocumentField extends PhabricatorSearchDAO { 'field' => 'text4', 'auxPHID' => 'phid?', 'corpus' => 'fulltext?', + 'stemmedCorpus' => 'fulltext?', ), self::CONFIG_KEY_SCHEMA => array( 'key_phid' => null, 'phid' => array( 'columns' => array('phid'), ), - 'corpus' => array( - 'columns' => array('corpus'), + 'key_corpus' => array( + 'columns' => array('corpus', 'stemmedCorpus'), 'type' => 'FULLTEXT', ), ),