mirror of
https://github.com/ultrajson/ultrajson.git
synced 2024-12-04 19:08:21 +01:00
Fix handling of surrogates on decoding
This implements surrogate handling on decoding the same way Python's standard library `json` module does. Lone escaped surrogates and any raw surrogates in the input are passed through as surrogates in the output, while escaped surrogate pairs are decoded into non-BMP characters. Note that raw surrogate pairs are treated differently on platforms/compilers with a 16-bit `wchar_t`, e.g. Microsoft Windows: there they are decoded into a single combined character instead of being kept as two separate surrogates.
This commit is contained in:
parent
4ac30c9b76
commit
e0e5db9a46
@ -357,13 +357,15 @@ static const JSUINT8 g_decoderLookup[256] =
|
||||
|
||||
static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
|
||||
{
|
||||
JSUTF16 sur[2] = { 0 };
|
||||
int iSur = 0;
|
||||
int index;
|
||||
wchar_t *escOffset;
|
||||
wchar_t *escStart;
|
||||
size_t escLen = (ds->escEnd - ds->escStart);
|
||||
JSUINT8 *inputOffset;
|
||||
JSUTF16 ch = 0;
|
||||
#if WCHAR_MAX >= 0x10FFFF
|
||||
JSUINT8 *lastHighSurrogate = NULL;
|
||||
#endif
|
||||
JSUINT8 oct;
|
||||
JSUTF32 ucs;
|
||||
ds->lastType = JT_INVALID;
|
||||
@ -464,7 +466,7 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds
|
||||
case '7':
|
||||
case '8':
|
||||
case '9':
|
||||
sur[iSur] = (sur[iSur] << 4) + (JSUTF16) (*inputOffset - '0');
|
||||
ch = (ch << 4) + (JSUTF16) (*inputOffset - '0');
|
||||
break;
|
||||
|
||||
case 'a':
|
||||
@ -473,7 +475,7 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds
|
||||
case 'd':
|
||||
case 'e':
|
||||
case 'f':
|
||||
sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'a');
|
||||
ch = (ch << 4) + 10 + (JSUTF16) (*inputOffset - 'a');
|
||||
break;
|
||||
|
||||
case 'A':
|
||||
@ -482,39 +484,31 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds
|
||||
case 'D':
|
||||
case 'E':
|
||||
case 'F':
|
||||
sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'A');
|
||||
ch = (ch << 4) + 10 + (JSUTF16) (*inputOffset - 'A');
|
||||
break;
|
||||
}
|
||||
|
||||
inputOffset ++;
|
||||
}
|
||||
|
||||
if (iSur == 0)
|
||||
#if WCHAR_MAX >= 0x10FFFF
|
||||
if ((ch & 0xfc00) == 0xdc00 && lastHighSurrogate == inputOffset - 6 * sizeof(*inputOffset))
|
||||
{
|
||||
if((sur[iSur] & 0xfc00) == 0xd800)
|
||||
{
|
||||
// First of a surrogate pair, continue parsing
|
||||
iSur ++;
|
||||
break;
|
||||
}
|
||||
(*escOffset++) = (wchar_t) sur[iSur];
|
||||
iSur = 0;
|
||||
// Low surrogate immediately following a high surrogate
|
||||
// Overwrite existing high surrogate with combined character
|
||||
*(escOffset-1) = (((*(escOffset-1) - 0xd800) <<10) | (ch - 0xdc00)) + 0x10000;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Decode pair
|
||||
if ((sur[1] & 0xfc00) != 0xdc00)
|
||||
{
|
||||
return SetError (ds, -1, "Unpaired high surrogate when decoding 'string'");
|
||||
}
|
||||
#if WCHAR_MAX == 0xffff
|
||||
(*escOffset++) = (wchar_t) sur[0];
|
||||
(*escOffset++) = (wchar_t) sur[1];
|
||||
#else
|
||||
(*escOffset++) = (wchar_t) 0x10000 + (((sur[0] - 0xd800) << 10) | (sur[1] - 0xdc00));
|
||||
#endif
|
||||
iSur = 0;
|
||||
{
|
||||
*(escOffset++) = (wchar_t) ch;
|
||||
}
|
||||
#if WCHAR_MAX >= 0x10FFFF
|
||||
if ((ch & 0xfc00) == 0xd800)
|
||||
{
|
||||
lastHighSurrogate = inputOffset;
|
||||
}
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -173,7 +173,7 @@ PyObject* JSONToObj(PyObject* self, PyObject *args, PyObject *kwargs)
|
||||
else
|
||||
if (PyUnicode_Check(arg))
|
||||
{
|
||||
sarg = PyUnicode_AsUTF8String(arg);
|
||||
sarg = PyUnicode_AsEncodedString(arg, NULL, "surrogatepass");
|
||||
if (sarg == NULL)
|
||||
{
|
||||
//Exception raised above us by codec according to docs
|
||||
|
@ -1,3 +1,4 @@
|
||||
import ctypes
|
||||
import datetime as dt
|
||||
import decimal
|
||||
import io
|
||||
@ -512,6 +513,41 @@ def test_encode_surrogate_characters():
|
||||
assert ujson.dumps({"\ud800": "\udfff"}, ensure_ascii=False, sort_keys=True) == out2
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"test_input, expected",
|
||||
[
|
||||
# Normal cases
|
||||
(r'"\uD83D\uDCA9"', "\U0001F4A9"),
|
||||
(r'"a\uD83D\uDCA9b"', "a\U0001F4A9b"),
|
||||
# Unpaired surrogates
|
||||
(r'"\uD800"', "\uD800"),
|
||||
(r'"a\uD800b"', "a\uD800b"),
|
||||
(r'"\uDEAD"', "\uDEAD"),
|
||||
(r'"a\uDEADb"', "a\uDEADb"),
|
||||
(r'"\uD83D\uD83D\uDCA9"', "\uD83D\U0001F4A9"),
|
||||
(r'"\uDCA9\uD83D\uDCA9"', "\uDCA9\U0001F4A9"),
|
||||
(r'"\uD83D\uDCA9\uD83D"', "\U0001F4A9\uD83D"),
|
||||
(r'"\uD83D\uDCA9\uDCA9"', "\U0001F4A9\uDCA9"),
|
||||
(r'"\uD83D \uDCA9"', "\uD83D \uDCA9"),
|
||||
# No decoding of actual surrogate characters (rather than escaped ones)
|
||||
('"\uD800"', "\uD800"),
|
||||
('"\uDEAD"', "\uDEAD"),
|
||||
('"\uD800a\uDEAD"', "\uD800a\uDEAD"),
|
||||
('"\uD83D\uDCA9"', "\uD83D\uDCA9"),
|
||||
],
|
||||
)
|
||||
def test_decode_surrogate_characters(test_input, expected):
|
||||
# FIXME Wrong output (combined char) on platforms with 16-bit wchar_t
|
||||
if test_input == '"\uD83D\uDCA9"' and ctypes.sizeof(ctypes.c_wchar) == 2:
|
||||
pytest.skip("Raw surrogate pairs are not supported with 16-bit wchar_t")
|
||||
|
||||
assert ujson.loads(test_input) == expected
|
||||
assert ujson.loads(test_input.encode("utf-8", "surrogatepass")) == expected
|
||||
|
||||
# Ensure that this matches stdlib's behaviour
|
||||
assert json.loads(test_input) == expected
|
||||
|
||||
|
||||
def test_sort_keys():
|
||||
data = {"a": 1, "c": 1, "b": 1, "e": 1, "f": 1, "d": 1}
|
||||
sorted_keys = ujson.dumps(data, sort_keys=True)
|
||||
|
Loading…
Reference in New Issue
Block a user