From 9e82cfcc214bf604773558c197e869a2247308c8 Mon Sep 17 00:00:00 2001 From: epriestley Date: Thu, 26 Feb 2015 10:18:54 -0800 Subject: [PATCH] Use utf8_general_ci for "sort" columns in old MySQL Summary: Fixes T7287. This trades off 4-byte character support for case insensitivity in these columns, which is a much better trade on the balance. Also adds more warnings about old MySQL. Note that we already issue a warning when you run "storage adjust" (which I've made stronger) and already "strongly recommend" MySQL 5.5 or newer in the install documentation. Test Plan: - Ran `storage adjust --disable-utf8mb4` to go to old definitions, then ran `storage adjust` to get back to the new ones. Everything seemed OK in both cases. - Verified that utf8mb4 data can be migrated out of these colums with `--unsafe` (which will truncate). - Verified that manual explains this. - Faked my way into the setup warning. Reviewers: btrahan Reviewed By: btrahan Subscribers: epriestley Maniphest Tasks: T7287 Differential Revision: https://secure.phabricator.com/D11893 --- .../check/PhabricatorMySQLSetupCheck.php | 20 ++++++++++++ .../PhabricatorStorageManagementAPI.php | 31 +++++++++++++------ .../PhabricatorStorageManagementWorkflow.php | 8 +++-- 3 files changed, 47 insertions(+), 12 deletions(-) diff --git a/src/applications/config/check/PhabricatorMySQLSetupCheck.php b/src/applications/config/check/PhabricatorMySQLSetupCheck.php index 053f31394b..44d9cb3cd4 100644 --- a/src/applications/config/check/PhabricatorMySQLSetupCheck.php +++ b/src/applications/config/check/PhabricatorMySQLSetupCheck.php @@ -315,6 +315,26 @@ final class PhabricatorMySQLSetupCheck extends PhabricatorSetupCheck { ->addMySQLConfig('innodb_buffer_pool_size'); } + $ok = PhabricatorStorageManagementAPI::isCharacterSetAvailableOnConnection( + 'utf8mb4', + id(new PhabricatorUser())->establishConnection('w')); + if (!$ok) { + $summary = pht( + 'You are using an old version of MySQL, and should upgrade.'); + + $message = pht( + 'You are using an old version of MySQL which has poor unicode '. + 'support (it does not support the "utf8mb4" collation set). You will '. + 'encounter limitations when working with some unicode data.'. + "\n\n". + 'We strongly recommend you upgrade to MySQL 5.5 or newer.'); + + $this->newIssue('mysql.utf8mb4') + ->setName(pht('Old MySQL Version')) + ->setSummary($summary) + ->setMessage($message); + } + } } diff --git a/src/infrastructure/storage/management/PhabricatorStorageManagementAPI.php b/src/infrastructure/storage/management/PhabricatorStorageManagementAPI.php index 16d4c5674d..9244f647af 100644 --- a/src/infrastructure/storage/management/PhabricatorStorageManagementAPI.php +++ b/src/infrastructure/storage/management/PhabricatorStorageManagementAPI.php @@ -233,7 +233,12 @@ final class PhabricatorStorageManagementAPI { } $conn = $this->getConn(null); + return self::isCharacterSetAvailableOnConnection($character_set, $conn); + } + public static function isCharacterSetAvailableOnConnection( + $character_set, + AphrontDatabaseConnection $conn) { $result = queryfx_one( $conn, 'SELECT CHARACTER_SET_NAME FROM INFORMATION_SCHEMA.CHARACTER_SETS @@ -254,24 +259,32 @@ final class PhabricatorStorageManagementAPI { $collate_sort = 'utf8mb4_unicode_ci'; $collate_full = 'utf8mb4_unicode_ci'; } else { - // If utf8mb4 is not available, we use binary. This allows us to store - // 4-byte unicode characters. This has some tradeoffs: - // - // Unicode characters won't sort correctly. There's nothing we can do - // about this while still supporting 4-byte characters. + // If utf8mb4 is not available, we use binary for most data. This allows + // us to store 4-byte unicode characters. // // It's possible that strings will be truncated in the middle of a // character on insert. We encourage users to set STRICT_ALL_TABLES // to prevent this. // - // There's no valid collation we can use to get a fulltext index on - // 4-byte unicode characters: we can't add a fulltext key to a binary - // column. + // For "fulltext" and "sort" columns, we don't use binary. + // + // With "fulltext", we can not use binary because MySQL won't let us. + // We use 3-byte utf8 instead and accept being unable to index 4-byte + // characters. + // + // With "sort", if we use binary we lose case insensitivity (for + // example, "ALincoln@logcabin.com" and "alincoln@logcabin.com" would no + // longer be identified as the same email address). This can be very + // confusing and is far worse overall than not supporting 4-byte unicode + // characters, so we use 3-byte utf8 and accept limited 4-byte support as + // a tradeoff to get sensible collation behavior. Many columns where + // collation is important rarely contain 4-byte characters anyway, so we + // are not giving up too much. $charset = 'binary'; $charset_full = 'utf8'; $collate_text = 'binary'; - $collate_sort = 'binary'; + $collate_sort = 'utf8_general_ci'; $collate_full = 'utf8_general_ci'; } diff --git a/src/infrastructure/storage/management/workflow/PhabricatorStorageManagementWorkflow.php b/src/infrastructure/storage/management/workflow/PhabricatorStorageManagementWorkflow.php index 35a02bff38..f8da379616 100644 --- a/src/infrastructure/storage/management/workflow/PhabricatorStorageManagementWorkflow.php +++ b/src/infrastructure/storage/management/workflow/PhabricatorStorageManagementWorkflow.php @@ -57,9 +57,11 @@ abstract class PhabricatorStorageManagementWorkflow if (!$force && !$api->isCharacterSetAvailable('utf8mb4')) { $message = pht( "You have an old version of MySQL (older than 5.5) which does not ". - "support the utf8mb4 character set. If you apply adjustments now ". - "and later update MySQL to 5.5 or newer, you'll need to apply ". - "adjustments again (and they will take a long time).\n\n". + "support the utf8mb4 character set. We strongly recomend upgrading to ". + "5.5 or newer.\n\n". + "If you apply adjustments now and later update MySQL to 5.5 or newer, ". + "you'll need to apply adjustments again (and they will take a long ". + "time).\n\n". "You can exit this workflow, update MySQL now, and then run this ". "workflow again. This is recommended, but may cause a lot of downtime ". "right now.\n\n".