/*
 * Reviewer summary. This text sits before the first "diff --git" header, where
 * patch tools conventionally skip free-form text.
 *
 * NOTE(review): in this copy the line breaks and indentation inside the hunks
 * appear collapsed to single spaces; presumably the patch has to be restored
 * from its origin before it can be applied - confirm.
 *
 * lib/ultrajsondec.c (decode_string)
 *   - Removes the sur[2] / iSur pair buffer. Hex digit characters are folded
 *     into a single JSUTF16 named ch via ch = (ch << 4) + digit value.
 *   - Removes the "Unpaired high surrogate" SetError return together with the
 *     WCHAR_MAX == 0xffff and #else pair-emission branches.
 *   - Only when WCHAR_MAX >= 0x10FFFF: after a unit has been stored, if
 *     (ch & 0xfc00) == 0xd800 then lastHighSurrogate is set to inputOffset.
 *     If a decoded unit satisfies (ch & 0xfc00) == 0xdc00 and
 *     lastHighSurrogate equals inputOffset - 6 * sizeof(*inputOffset), the
 *     wchar_t previously stored at escOffset-1 is overwritten with
 *     (((previous - 0xd800) << 10) | (ch - 0xdc00)) + 0x10000.
 *   - In every other case ch is appended unchanged as one wchar_t.
 *   - NOTE(review): inputOffset is a JSUINT8 pointer, so the subtraction is
 *     already scaled by the element size; the extra sizeof factor keeps the
 *     distance at six elements only if sizeof(JSUINT8) is 1 - TODO confirm.
 *
 * python/JSONtoObj.c (JSONToObj)
 *   - In the PyUnicode_Check(arg) branch, sarg is now produced by
 *     PyUnicode_AsEncodedString(arg, NULL, "surrogatepass") instead of
 *     PyUnicode_AsUTF8String(arg).
 *
 * tests/test_ujson.py
 *   - Adds "import ctypes" and the parametrized test
 *     test_decode_surrogate_characters. Each case asserts that ujson.loads of
 *     the str input, ujson.loads of the same input encoded as UTF-8 with
 *     surrogatepass, and json.loads of the str input all equal the expected
 *     value.
 *   - One raw surrogate pair case is skipped through pytest.skip when
 *     ctypes.sizeof(ctypes.c_wchar) == 2 (marked FIXME in the test).
 */
diff --git a/lib/ultrajsondec.c b/lib/ultrajsondec.c index 6c59bf0..1d647b2 100644 --- a/lib/ultrajsondec.c +++ b/lib/ultrajsondec.c @@ -357,13 +357,15 @@ static const JSUINT8 g_decoderLookup[256] = static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds) { - JSUTF16 sur[2] = { 0 }; - int iSur = 0; int index; wchar_t *escOffset; wchar_t *escStart; size_t escLen = (ds->escEnd - ds->escStart); JSUINT8 *inputOffset; + JSUTF16 ch = 0; +#if WCHAR_MAX >= 0x10FFFF + JSUINT8 *lastHighSurrogate = NULL; +#endif JSUINT8 oct; JSUTF32 ucs; ds->lastType = JT_INVALID; @@ -464,7 +466,7 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds case '7': case '8': case '9': - sur[iSur] = (sur[iSur] << 4) + (JSUTF16) (*inputOffset - '0'); + ch = (ch << 4) + (JSUTF16) (*inputOffset - '0'); break; case 'a': @@ -473,7 +475,7 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds case 'd': case 'e': case 'f': - sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'a'); + ch = (ch << 4) + 10 + (JSUTF16) (*inputOffset - 'a'); break; case 'A': @@ -482,39 +484,31 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds case 'D': case 'E': case 'F': - sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'A'); + ch = (ch << 4) + 10 + (JSUTF16) (*inputOffset - 'A'); break; } inputOffset ++; } - if (iSur == 0) +#if WCHAR_MAX >= 0x10FFFF + if ((ch & 0xfc00) == 0xdc00 && lastHighSurrogate == inputOffset - 6 * sizeof(*inputOffset)) { - if((sur[iSur] & 0xfc00) == 0xd800) - { - // First of a surrogate pair, continue parsing - iSur ++; - break; - } - (*escOffset++) = (wchar_t) sur[iSur]; - iSur = 0; + // Low surrogate immediately following a high surrogate + // Overwrite existing high surrogate with combined character + *(escOffset-1) = (((*(escOffset-1) - 0xd800) <<10) | (ch - 0xdc00)) + 0x10000; } else - { - // Decode pair - if ((sur[1] & 0xfc00) != 0xdc00) - { - return 
SetError (ds, -1, "Unpaired high surrogate when decoding 'string'"); - } -#if WCHAR_MAX == 0xffff - (*escOffset++) = (wchar_t) sur[0]; - (*escOffset++) = (wchar_t) sur[1]; -#else - (*escOffset++) = (wchar_t) 0x10000 + (((sur[0] - 0xd800) << 10) | (sur[1] - 0xdc00)); #endif - iSur = 0; + { + *(escOffset++) = (wchar_t) ch; } +#if WCHAR_MAX >= 0x10FFFF + if ((ch & 0xfc00) == 0xd800) + { + lastHighSurrogate = inputOffset; + } +#endif break; } diff --git a/python/JSONtoObj.c b/python/JSONtoObj.c index f370bef..5b94dc3 100644 --- a/python/JSONtoObj.c +++ b/python/JSONtoObj.c @@ -173,7 +173,7 @@ PyObject* JSONToObj(PyObject* self, PyObject *args, PyObject *kwargs) else if (PyUnicode_Check(arg)) { - sarg = PyUnicode_AsUTF8String(arg); + sarg = PyUnicode_AsEncodedString(arg, NULL, "surrogatepass"); if (sarg == NULL) { //Exception raised above us by codec according to docs diff --git a/tests/test_ujson.py b/tests/test_ujson.py index e4772e8..4a1d230 100644 --- a/tests/test_ujson.py +++ b/tests/test_ujson.py @@ -1,3 +1,4 @@ +import ctypes import datetime as dt import decimal import io @@ -512,6 +513,41 @@ def test_encode_surrogate_characters(): assert ujson.dumps({"\ud800": "\udfff"}, ensure_ascii=False, sort_keys=True) == out2 +@pytest.mark.parametrize( + "test_input, expected", + [ + # Normal cases + (r'"\uD83D\uDCA9"', "\U0001F4A9"), + (r'"a\uD83D\uDCA9b"', "a\U0001F4A9b"), + # Unpaired surrogates + (r'"\uD800"', "\uD800"), + (r'"a\uD800b"', "a\uD800b"), + (r'"\uDEAD"', "\uDEAD"), + (r'"a\uDEADb"', "a\uDEADb"), + (r'"\uD83D\uD83D\uDCA9"', "\uD83D\U0001F4A9"), + (r'"\uDCA9\uD83D\uDCA9"', "\uDCA9\U0001F4A9"), + (r'"\uD83D\uDCA9\uD83D"', "\U0001F4A9\uD83D"), + (r'"\uD83D\uDCA9\uDCA9"', "\U0001F4A9\uDCA9"), + (r'"\uD83D \uDCA9"', "\uD83D \uDCA9"), + # No decoding of actual surrogate characters (rather than escaped ones) + ('"\uD800"', "\uD800"), + ('"\uDEAD"', "\uDEAD"), + ('"\uD800a\uDEAD"', "\uD800a\uDEAD"), + ('"\uD83D\uDCA9"', "\uD83D\uDCA9"), + ], +) +def 
test_decode_surrogate_characters(test_input, expected): + # FIXME Wrong output (combined char) on platforms with 16-bit wchar_t + if test_input == '"\uD83D\uDCA9"' and ctypes.sizeof(ctypes.c_wchar) == 2: + pytest.skip("Raw surrogate pairs are not supported with 16-bit wchar_t") + + assert ujson.loads(test_input) == expected + assert ujson.loads(test_input.encode("utf-8", "surrogatepass")) == expected + + # Ensure that this matches stdlib's behaviour + assert json.loads(test_input) == expected + + def test_sort_keys(): data = {"a": 1, "c": 1, "b": 1, "e": 1, "f": 1, "d": 1} sorted_keys = ujson.dumps(data, sort_keys=True)