From 7ea6de6e9c9da3ce818ac65c78da8065b73e6d67 Mon Sep 17 00:00:00 2001
From: epriestley <git@epriestley.com>
Date: Fri, 8 Sep 2017 08:06:16 -0700
Subject: [PATCH] Split Ferret engine strings for tokenization on any sequence
 of whitespace

Summary:
Ref T12819. Currently, strings are split only on spaces, but newlines (and tabs, if present) should also act as split points.

Without this, we can fail to get the proper term boundary tokens for words which begin at the start of a line or end at the end of a line.

Test Plan: Reindexed a document containing "xyz\nabc" and saw that the `"yz "` and `" ab"` term boundary tokens were generated properly.

Reviewers: chad

Reviewed By: chad

Maniphest Tasks: T12819

Differential Revision: https://secure.phabricator.com/D18579
---
 src/applications/search/ferret/PhabricatorFerretEngine.php | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/applications/search/ferret/PhabricatorFerretEngine.php b/src/applications/search/ferret/PhabricatorFerretEngine.php
index 219130c02c..3c8098c54f 100644
--- a/src/applications/search/ferret/PhabricatorFerretEngine.php
+++ b/src/applications/search/ferret/PhabricatorFerretEngine.php
@@ -75,7 +75,7 @@ abstract class PhabricatorFerretEngine extends Phobject {
 
   public function tokenizeString($value) {
     $value = trim($value, ' ');
-    $value = preg_split('/ +/', $value);
+    $value = preg_split('/\s+/u', $value);
     return $value;
   }