From 152ddf57092e3c73d9dcc60f57cdedb1625b679d Mon Sep 17 00:00:00 2001 From: epriestley Date: Sun, 8 Nov 2015 05:36:42 -0800 Subject: [PATCH] Use unicode mode when tokenizing strings like user realnames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Fixes T9732. We currently tokenize strings (like user realnames) in the default non-unicode mode, which can cause patterns like `\s` to work incorrectly. Use `/u` to use unicode-aware tokenization instead. Test Plan: The behavior of "\s" depends upon environmental settings like LC_ALL. With LC_ALL set to "C", `\xA0` is not considered a whitespace character. With LC_ALL set to "en_US", it is: ``` $ php -r 'setlocale(LC_ALL, "C"); echo count(preg_split("/\s/", "\xE5\xBF\xA0")) . "\n";' 1 $ php -r 'setlocale(LC_ALL, "en_US"); echo count(preg_split("/\s/", "\xE5\xBF\xA0")) . "\n";' 2 ``` To reproduce the original issue, I added an explicit: ``` setlocale(LC_ALL, "en_US"); ``` ...call before the `preg_split()` call. This caused "忠" to be improperly split. I then added "/u", and observed proper tokenization. Reviewers: chad Reviewed By: chad Subscribers: qiu8310 Maniphest Tasks: T9732 Differential Revision: https://secure.phabricator.com/D14441 --- .../typeahead/datasource/PhabricatorTypeaheadDatasource.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/applications/typeahead/datasource/PhabricatorTypeaheadDatasource.php b/src/applications/typeahead/datasource/PhabricatorTypeaheadDatasource.php index 1514f46d5b..66d8357341 100644 --- a/src/applications/typeahead/datasource/PhabricatorTypeaheadDatasource.php +++ b/src/applications/typeahead/datasource/PhabricatorTypeaheadDatasource.php @@ -107,7 +107,7 @@ abstract class PhabricatorTypeaheadDatasource extends Phobject { return array(); } - $tokens = preg_split('/\s+|[-\[\]]/', $string); + $tokens = preg_split('/\s+|[-\[\]]/u', $string); return array_unique($tokens); }