From b1921850452c6354354f208b971e3e71011ae4a6 Mon Sep 17 00:00:00 2001 From: epriestley Date: Tue, 2 Oct 2018 10:43:20 -0700 Subject: [PATCH] [Wilds] Fix phutil_is_utf8_slowly() to reject reserved UTF16 surrogate character ranges Summary: Ref T13209. See T11525. We want to reject certain 3-byte characters as "invalid" unicode, primarily because `json_decode()` does not accept them. We currently reject them correctly if we go down the fast path in `phutil_is_utf8()` via `mb_check_encoding()`, but incorrectly accept them if we go down the slow path. Add test coverage that the slow path has the same behavior as the fast path, and then make the slow path reject these byte sequences. Test Plan: - Added failing tests. - Made them pass on OSX and Windows 10. Reviewers: amckinley Reviewed By: amckinley Maniphest Tasks: T13209 Differential Revision: https://secure.phabricator.com/D19724 --- src/utils/__tests__/PhutilUTF8TestCase.php | 14 +++++++++++ src/utils/utf8.php | 28 ++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/src/utils/__tests__/PhutilUTF8TestCase.php b/src/utils/__tests__/PhutilUTF8TestCase.php index 84c35cb3..7e85f89c 100644 --- a/src/utils/__tests__/PhutilUTF8TestCase.php +++ b/src/utils/__tests__/PhutilUTF8TestCase.php @@ -61,6 +61,13 @@ final class PhutilUTF8TestCase extends PhutilTestCase { ); foreach ($map as $input => $expect) { + if ($input !== $expect) { + $this->assertEqual( + false, + phutil_is_utf8_slowly($input), + pht('Slowly reject overlong form of: %s', $input)); + } + $actual = phutil_utf8ize($input); $this->assertEqual( $expect, @@ -77,6 +84,13 @@ final class PhutilUTF8TestCase extends PhutilTestCase { ); foreach ($map as $input => $expect) { + if ($input !== $expect) { + $this->assertEqual( + false, + phutil_is_utf8_slowly($input), + pht('Slowly reject surrogate: %s', $input)); + } + $actual = phutil_utf8ize($input); $this->assertEqual( $expect, diff --git a/src/utils/utf8.php b/src/utils/utf8.php index 6f8af083..7aff7f76 100644 --- a/src/utils/utf8.php +++ b/src/utils/utf8.php @@ -149,6 +149,34 @@ function phutil_is_utf8_slowly($string, $only_bmp = false) { continue; } return false; + } else if ($chr == 0xED) { + // See T11525. Some sequences in this block are surrogate codepoints + // that are reserved for use in UTF16. We should reject them. + $codepoint = ($chr & 0x0F) << 12; + ++$ii; + if ($ii >= $len) { + return false; + } + $chr = ord($string[$ii]); + $codepoint += ($chr & 0x3F) << 6; + if ($chr >= 0x80 && $chr <= 0xBF) { + ++$ii; + if ($ii >= $len) { + return false; + } + $chr = ord($string[$ii]); + $codepoint += ($chr & 0x3F); + + if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF) { + // Reject these surrogate codepoints. + return false; + } + + if ($chr >= 0x80 && $chr <= 0xBF) { + continue; + } + } + return false; } else if ($chr > 0xE0 && $chr <= 0xEF) { ++$ii; if ($ii >= $len) {