[Wilds] Fix phutil_is_utf8_slowly() to reject reserved UTF16 surrogate character ranges

Summary: Ref T13209. See T11525. We want to reject certain 3-byte characters as "invalid" Unicode, primarily because `json_decode()` does not accept them. We currently reject them correctly if we go down the fast path in `phutil_is_utf8()` via `mb_check_encoding()`, but incorrectly accept them if we go down the slow path. Add test coverage that the slow path has the same behavior as the fast path, and then make the slow path reject these byte sequences.

Test Plan:
- Added failing tests.
- Made them pass on OSX and Windows 10.

Reviewers: amckinley

Reviewed By: amckinley

Maniphest Tasks: T13209

Differential Revision: https://secure.phabricator.com/D19724
2025-01-15 09:11:06 +01:00 · 2018-10-02 10:43:20 -07:00 · 2018-10-02 10:43:20 -07:00 · b192185045
commit b192185045
parent ee756592af
2 changed files with 42 additions and 0 deletions
--- a/src/utils/tests/PhutilUTF8TestCase.php
+++ b/src/utils/tests/PhutilUTF8TestCase.php
@@ -61,6 +61,13 @@ final class PhutilUTF8TestCase extends PhutilTestCase {
    );
    foreach ($map as $input => $expect) {
      if ($input !== $expect) {
        $this->assertEqual(
          false,
          phutil_is_utf8_slowly($input),
          pht('Slowly reject overlong form of: %s', $input));
      }
      $actual = phutil_utf8ize($input);
      $this->assertEqual(
        $expect,
@@ -77,6 +84,13 @@ final class PhutilUTF8TestCase extends PhutilTestCase {
    );
    foreach ($map as $input => $expect) {
      if ($input !== $expect) {
        $this->assertEqual(
          false,
          phutil_is_utf8_slowly($input),
          pht('Slowly reject surrogate: %s', $input));
      }
      $actual = phutil_utf8ize($input);
      $this->assertEqual(
        $expect,
--- a/src/utils/utf8.php
+++ b/src/utils/utf8.php
@@ -149,6 +149,34 @@ function phutil_is_utf8_slowly($string, $only_bmp = false) {
        continue;
      }
      return false;
    } else if ($chr == 0xED) {
      // See T11525. Some sequences in this block are surrogate codepoints
      // that are reserved for use in UTF16. We should reject them.
      $codepoint = ($chr & 0x0F) << 12;
      ++$ii;
      if ($ii >= $len) {
        return false;
      }
      $chr = ord($string[$ii]);
      $codepoint += ($chr & 0x3F) << 6;
      if ($chr >= 0x80 && $chr <= 0xBF) {
        ++$ii;
        if ($ii >= $len) {
          return false;
        }
        $chr = ord($string[$ii]);
        $codepoint += ($chr & 0x3F);
        if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF) {
          // Reject these surrogate codepoints.
          return false;
        }
        if ($chr >= 0x80 && $chr <= 0xBF) {
          continue;
        }
      }
      return false;
    } else if ($chr > 0xE0 && $chr <= 0xEF) {
      ++$ii;
      if ($ii >= $len) {