1
0
Fork 0
mirror of https://we.phorge.it/source/arcanist.git synced 2025-01-15 17:21:09 +01:00

[Wilds] Fix phutil_is_utf8_slowly() to reject reserved UTF-16 surrogate character ranges

Summary:
Ref T13209. See T11525. We want to reject certain 3-byte sequences (those that encode the UTF-16 surrogate codepoints U+D800 through U+DFFF) as "invalid" Unicode, primarily because `json_decode()` does not accept them.

We currently reject them correctly if we go down the fast path in `phutil_is_utf8()` via `mb_check_encoding()`, but incorrectly accept them if we go down the slow path.

Add test coverage that the slow path has the same behavior as the fast path, and then make the slow path reject these byte sequences.

Test Plan:
- Added failing tests.
- Made them pass on OSX and Windows 10.

Reviewers: amckinley

Reviewed By: amckinley

Maniphest Tasks: T13209

Differential Revision: https://secure.phabricator.com/D19724
This commit is contained in:
epriestley 2018-10-02 10:43:20 -07:00
parent ee756592af
commit b192185045
2 changed files with 42 additions and 0 deletions

View file

@ -61,6 +61,13 @@ final class PhutilUTF8TestCase extends PhutilTestCase {
); );
foreach ($map as $input => $expect) { foreach ($map as $input => $expect) {
if ($input !== $expect) {
$this->assertEqual(
false,
phutil_is_utf8_slowly($input),
pht('Slowly reject overlong form of: %s', $input));
}
$actual = phutil_utf8ize($input); $actual = phutil_utf8ize($input);
$this->assertEqual( $this->assertEqual(
$expect, $expect,
@ -77,6 +84,13 @@ final class PhutilUTF8TestCase extends PhutilTestCase {
); );
foreach ($map as $input => $expect) { foreach ($map as $input => $expect) {
if ($input !== $expect) {
$this->assertEqual(
false,
phutil_is_utf8_slowly($input),
pht('Slowly reject surrogate: %s', $input));
}
$actual = phutil_utf8ize($input); $actual = phutil_utf8ize($input);
$this->assertEqual( $this->assertEqual(
$expect, $expect,

View file

@ -149,6 +149,34 @@ function phutil_is_utf8_slowly($string, $only_bmp = false) {
continue; continue;
} }
return false; return false;
} else if ($chr == 0xED) {
// See T11525. Some sequences in this block are surrogate codepoints
// that are reserved for use in UTF16. We should reject them.
$codepoint = ($chr & 0x0F) << 12;
++$ii;
if ($ii >= $len) {
return false;
}
$chr = ord($string[$ii]);
$codepoint += ($chr & 0x3F) << 6;
if ($chr >= 0x80 && $chr <= 0xBF) {
++$ii;
if ($ii >= $len) {
return false;
}
$chr = ord($string[$ii]);
$codepoint += ($chr & 0x3F);
if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF) {
// Reject these surrogate codepoints.
return false;
}
if ($chr >= 0x80 && $chr <= 0xBF) {
continue;
}
}
return false;
} else if ($chr > 0xE0 && $chr <= 0xEF) { } else if ($chr > 0xE0 && $chr <= 0xEF) {
++$ii; ++$ii;
if ($ii >= $len) { if ($ii >= $len) {