Fix handling of surrogates on decoding

This implements surrogate handling on decoding so that it matches the behaviour of the `json` module in Python's standard library. Lone escaped surrogates and any raw surrogates in the input result in surrogates in the output, and escaped surrogate pairs get decoded into non-BMP characters. Note that raw surrogate pairs get treated differently on platforms/compilers with 16-bit `wchar_t`, e.g. Microsoft Windows: there they are combined into a single non-BMP character instead of being kept as two separate surrogates.
2024-12-04 06:38:23 +01:00 · 2022-06-09 17:23:15 +00:00 · 2022-06-09 17:23:15 +00:00 · e0e5db9a46
commit e0e5db9a46
parent 4ac30c9b76
3 changed files with 57 additions and 27 deletions
--- a/lib/ultrajsondec.c
+++ b/lib/ultrajsondec.c
@ -357,13 +357,15 @@ static const JSUINT8 g_decoderLookup[256] =

 static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
 {
-  JSUTF16 sur[2] = { 0 };
-  int iSur = 0;
  int index;
  wchar_t *escOffset;
  wchar_t *escStart;
  size_t escLen = (ds->escEnd - ds->escStart);
  JSUINT8 *inputOffset;
+  JSUTF16 ch = 0;
+#if WCHAR_MAX >= 0x10FFFF
+  JSUINT8 *lastHighSurrogate = NULL;
+#endif
  JSUINT8 oct;
  JSUTF32 ucs;
  ds->lastType = JT_INVALID;
@ -464,7 +466,7 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds
                case '7':
                case '8':
                case '9':
-                  sur[iSur] = (sur[iSur] << 4) + (JSUTF16) (*inputOffset - '0');
+                  ch = (ch << 4) + (JSUTF16) (*inputOffset - '0');
                  break;

                case 'a':
@ -473,7 +475,7 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds
                case 'd':
                case 'e':
                case 'f':
-                  sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'a');
+                  ch = (ch << 4) + 10 + (JSUTF16) (*inputOffset - 'a');
                  break;

                case 'A':
@ -482,39 +484,31 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds
                case 'D':
                case 'E':
                case 'F':
-                  sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'A');
+                  ch = (ch << 4) + 10 + (JSUTF16) (*inputOffset - 'A');
                  break;
              }

              inputOffset ++;
            }

-            if (iSur == 0)
+#if WCHAR_MAX >= 0x10FFFF
+            if ((ch & 0xfc00) == 0xdc00 && lastHighSurrogate == inputOffset - 6 * sizeof(*inputOffset))
            {
-              if((sur[iSur] & 0xfc00) == 0xd800)
-              {
-                // First of a surrogate pair, continue parsing
-                iSur ++;
-                break;
-              }
-              (*escOffset++) = (wchar_t) sur[iSur];
-              iSur = 0;
+              // Low surrogate immediately following a high surrogate
+              // Overwrite existing high surrogate with combined character
+              *(escOffset-1) = (((*(escOffset-1) - 0xd800) <<10) | (ch - 0xdc00)) + 0x10000;
            }
            else
-            {
-              // Decode pair
-              if ((sur[1] & 0xfc00) != 0xdc00)
-              {
-                return SetError (ds, -1, "Unpaired high surrogate when decoding 'string'");
-              }
-#if WCHAR_MAX == 0xffff
-              (*escOffset++) = (wchar_t) sur[0];
-              (*escOffset++) = (wchar_t) sur[1];
-#else
-              (*escOffset++) = (wchar_t) 0x10000 + (((sur[0] - 0xd800) << 10) | (sur[1] - 0xdc00));
 #endif
-              iSur = 0;
+            {
+              *(escOffset++) = (wchar_t) ch;
            }
+#if WCHAR_MAX >= 0x10FFFF
+            if ((ch & 0xfc00) == 0xd800)
+            {
+              lastHighSurrogate = inputOffset;
+            }
+#endif
            break;
          }

--- a/python/JSONtoObj.c
+++ b/python/JSONtoObj.c
@ -173,7 +173,7 @@ PyObject* JSONToObj(PyObject* self, PyObject *args, PyObject *kwargs)
  else
  if (PyUnicode_Check(arg))
  {
-    sarg = PyUnicode_AsUTF8String(arg);
+    sarg = PyUnicode_AsEncodedString(arg, NULL, "surrogatepass");
    if (sarg == NULL)
    {
      //Exception raised above us by codec according to docs
--- a/tests/test_ujson.py
+++ b/tests/test_ujson.py
@ -1,3 +1,4 @@
+import ctypes
 import datetime as dt
 import decimal
 import io
@ -512,6 +513,41 @@ def test_encode_surrogate_characters():
    assert ujson.dumps({"\ud800": "\udfff"}, ensure_ascii=False, sort_keys=True) == out2


+@pytest.mark.parametrize(
+    "test_input, expected",
+    [
+        # Normal cases
+        (r'"\uD83D\uDCA9"', "\U0001F4A9"),
+        (r'"a\uD83D\uDCA9b"', "a\U0001F4A9b"),
+        # Unpaired surrogates
+        (r'"\uD800"', "\uD800"),
+        (r'"a\uD800b"', "a\uD800b"),
+        (r'"\uDEAD"', "\uDEAD"),
+        (r'"a\uDEADb"', "a\uDEADb"),
+        (r'"\uD83D\uD83D\uDCA9"', "\uD83D\U0001F4A9"),
+        (r'"\uDCA9\uD83D\uDCA9"', "\uDCA9\U0001F4A9"),
+        (r'"\uD83D\uDCA9\uD83D"', "\U0001F4A9\uD83D"),
+        (r'"\uD83D\uDCA9\uDCA9"', "\U0001F4A9\uDCA9"),
+        (r'"\uD83D \uDCA9"', "\uD83D \uDCA9"),
+        # No decoding of actual surrogate characters (rather than escaped ones)
+        ('"\uD800"', "\uD800"),
+        ('"\uDEAD"', "\uDEAD"),
+        ('"\uD800a\uDEAD"', "\uD800a\uDEAD"),
+        ('"\uD83D\uDCA9"', "\uD83D\uDCA9"),
+    ],
+)
+def test_decode_surrogate_characters(test_input, expected):
+    # FIXME Wrong output (combined char) on platforms with 16-bit wchar_t
+    if test_input == '"\uD83D\uDCA9"' and ctypes.sizeof(ctypes.c_wchar) == 2:
+        pytest.skip("Raw surrogate pairs are not supported with 16-bit wchar_t")
+
+    assert ujson.loads(test_input) == expected
+    assert ujson.loads(test_input.encode("utf-8", "surrogatepass")) == expected
+
+    # Ensure that this matches stdlib's behaviour
+    assert json.loads(test_input) == expected
+
+
 def test_sort_keys():
    data = {"a": 1, "c": 1, "b": 1, "e": 1, "f": 1, "d": 1}
    sorted_keys = ujson.dumps(data, sort_keys=True)