mirror of
https://github.com/ultrajson/ultrajson.git
synced 2024-05-18 20:46:02 +02:00
Fix handling of surrogates on encoding
This allows surrogates anywhere in the input, for compatibility with the json module from the standard library.

This also refactors two interfaces:

- The `PyUnicode` to `char*` conversion is moved into its own function, separated from the `JSONTypeContext` handling, so that it can be reused in the future for other things that don't have a type context (e.g. indentation and separators).
- Converting the `char*` output to a Python string with surrogates intact requires the string length for `PyUnicode_Decode` & Co. While `strlen` could be used, the length is already known inside the encoder, so the encoder function now also takes an extra `size_t` pointer argument through which it returns that length, and it no longer NUL-terminates the string. This also permits output that contains NUL bytes (even though that would be invalid JSON), e.g. if the return value of an object's `__json__` method were to contain them.

Fixes #156
Fixes #447
Fixes #537
Supersedes #284
This commit is contained in:
parent
b300d642f6
commit
9b9af1ab70
|
@ -300,9 +300,10 @@ obj - An anonymous type representing the object
|
|||
enc - Function definitions for querying JSOBJ type
|
||||
buffer - Preallocated buffer to store result in. If NULL function allocates own buffer
|
||||
cbBuffer - Length of buffer (ignored if buffer is NULL)
|
||||
outLen - Will store the length of the encoded string
|
||||
|
||||
Returns:
|
||||
Encoded JSON object as a null terminated char string.
|
||||
Encoded JSON object as a char string.
|
||||
|
||||
NOTE:
|
||||
If the supplied buffer wasn't enough to hold the result the function will allocate a new buffer.
|
||||
|
@ -310,8 +311,10 @@ Life cycle of the provided buffer must still be handled by caller.
|
|||
|
||||
If the return value doesn't equal the specified buffer caller must release the memory using
|
||||
JSONObjectEncoder.free or free() as specified when calling this function.
|
||||
|
||||
If an error occurs during encoding, NULL is returned and no outLen is stored.
|
||||
*/
|
||||
EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *buffer, size_t cbBuffer);
|
||||
EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *buffer, size_t cbBuffer, size_t *outLen);
|
||||
|
||||
typedef struct __JSONObjectDecoder
|
||||
{
|
||||
|
|
|
@ -948,7 +948,7 @@ static void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t c
|
|||
enc->level--;
|
||||
}
|
||||
|
||||
char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, size_t _cbBuffer)
|
||||
char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, size_t _cbBuffer, size_t *_outLen)
|
||||
{
|
||||
enc->malloc = enc->malloc ? enc->malloc : malloc;
|
||||
enc->free = enc->free ? enc->free : free;
|
||||
|
@ -984,12 +984,11 @@ char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, size_t
|
|||
|
||||
encode (obj, enc, NULL, 0);
|
||||
|
||||
Buffer_Reserve(enc, 1);
|
||||
if (enc->errorMsg)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
Buffer_AppendCharUnchecked(enc, '\0');
|
||||
|
||||
*_outLen = enc->offset - enc->start;
|
||||
return enc->start;
|
||||
}
|
||||
|
|
|
@ -114,10 +114,16 @@ static void *PyStringToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, siz
|
|||
return PyBytes_AsString(obj);
|
||||
}
|
||||
|
||||
static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen)
|
||||
static char *PyUnicodeToUTF8Raw(JSOBJ _obj, size_t *_outLen, PyObject *bytesObj)
|
||||
{
|
||||
/*
|
||||
Converts the PyUnicode object to char* whose size is stored in _outLen.
|
||||
This conversion may require the creation of an intermediate PyBytes object.
|
||||
In that case, the returned char* is in fact the internal buffer of that PyBytes object,
|
||||
and when the char* buffer is no longer needed, the bytesObj must be DECREF'd.
|
||||
*/
|
||||
PyObject *obj = (PyObject *) _obj;
|
||||
PyObject *newObj;
|
||||
|
||||
#ifndef Py_LIMITED_API
|
||||
if (PyUnicode_IS_COMPACT_ASCII(obj))
|
||||
{
|
||||
|
@ -127,16 +133,20 @@ static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, si
|
|||
return data;
|
||||
}
|
||||
#endif
|
||||
newObj = PyUnicode_AsUTF8String(obj);
|
||||
if(!newObj)
|
||||
|
||||
bytesObj = PyUnicode_AsEncodedString (obj, "utf-8", "surrogatepass");
|
||||
if (!bytesObj)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
GET_TC(tc)->newObj = newObj;
|
||||
*_outLen = PyBytes_Size(bytesObj);
|
||||
return PyBytes_AsString(bytesObj);
|
||||
}
|
||||
|
||||
*_outLen = PyBytes_Size(newObj);
|
||||
return PyBytes_AsString(newObj);
|
||||
static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen)
|
||||
{
|
||||
return PyUnicodeToUTF8Raw(_obj, _outLen, GET_TC(tc)->newObj);
|
||||
}
|
||||
|
||||
static void *PyRawJSONToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen)
|
||||
|
@ -240,7 +250,7 @@ static int Dict_iterNext(JSOBJ obj, JSONTypeContext *tc)
|
|||
if (PyUnicode_Check(GET_TC(tc)->itemName))
|
||||
{
|
||||
itemNameTmp = GET_TC(tc)->itemName;
|
||||
GET_TC(tc)->itemName = PyUnicode_AsUTF8String (GET_TC(tc)->itemName);
|
||||
GET_TC(tc)->itemName = PyUnicode_AsEncodedString (GET_TC(tc)->itemName, "utf-8", "surrogatepass");
|
||||
Py_DECREF(itemNameTmp);
|
||||
}
|
||||
else
|
||||
|
@ -263,7 +273,7 @@ static int Dict_iterNext(JSOBJ obj, JSONTypeContext *tc)
|
|||
return -1;
|
||||
}
|
||||
itemNameTmp = GET_TC(tc)->itemName;
|
||||
GET_TC(tc)->itemName = PyUnicode_AsUTF8String (GET_TC(tc)->itemName);
|
||||
GET_TC(tc)->itemName = PyUnicode_AsEncodedString (GET_TC(tc)->itemName, "utf-8", "surrogatepass");
|
||||
Py_DECREF(itemNameTmp);
|
||||
}
|
||||
PRINTMARK();
|
||||
|
@ -332,7 +342,7 @@ static int SortedDict_iterNext(JSOBJ obj, JSONTypeContext *tc)
|
|||
// Subject the key to the same type restrictions and conversions as in Dict_iterGetValue.
|
||||
if (PyUnicode_Check(key))
|
||||
{
|
||||
key = PyUnicode_AsUTF8String(key);
|
||||
key = PyUnicode_AsEncodedString(key, "utf-8", "surrogatepass");
|
||||
}
|
||||
else if (!PyBytes_Check(key))
|
||||
{
|
||||
|
@ -342,7 +352,7 @@ static int SortedDict_iterNext(JSOBJ obj, JSONTypeContext *tc)
|
|||
goto error;
|
||||
}
|
||||
keyTmp = key;
|
||||
key = PyUnicode_AsUTF8String(key);
|
||||
key = PyUnicode_AsEncodedString(key, "utf-8", "surrogatepass");
|
||||
Py_DECREF(keyTmp);
|
||||
}
|
||||
else
|
||||
|
@ -777,6 +787,7 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs)
|
|||
PyObject *odefaultFn = NULL;
|
||||
int allowNan = -1;
|
||||
int orejectBytes = -1;
|
||||
size_t retLen;
|
||||
|
||||
JSONObjectEncoder encoder =
|
||||
{
|
||||
|
@ -860,7 +871,7 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs)
|
|||
csInf, csNan, 'e', DCONV_DECIMAL_IN_SHORTEST_LOW, DCONV_DECIMAL_IN_SHORTEST_HIGH, 0, 0);
|
||||
|
||||
PRINTMARK();
|
||||
ret = JSON_EncodeObject (oinput, &encoder, buffer, sizeof (buffer));
|
||||
ret = JSON_EncodeObject (oinput, &encoder, buffer, sizeof (buffer), &retLen);
|
||||
PRINTMARK();
|
||||
|
||||
dconv_d2s_free(&encoder.d2s);
|
||||
|
@ -881,7 +892,7 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
newobj = PyUnicode_FromString (ret);
|
||||
newobj = PyUnicode_DecodeUTF8(ret, retLen, "surrogatepass");
|
||||
|
||||
if (ret != buffer)
|
||||
{
|
||||
|
|
|
@ -498,10 +498,18 @@ def test_decode_array_empty():
|
|||
assert [] == obj
|
||||
|
||||
|
||||
def test_encoding_invalid_unicode_character():
|
||||
s = "\udc7f"
|
||||
with pytest.raises(UnicodeEncodeError):
|
||||
ujson.dumps(s)
|
||||
def test_encode_surrogate_characters():
|
||||
assert ujson.dumps("\udc7f") == r'"\udc7f"'
|
||||
out = r'{"\ud800":"\udfff"}'
|
||||
assert ujson.dumps({"\ud800": "\udfff"}) == out
|
||||
assert ujson.dumps({"\ud800": "\udfff"}, sort_keys=True) == out
|
||||
o = {b"\xed\xa0\x80": b"\xed\xbf\xbf"}
|
||||
assert ujson.dumps(o, reject_bytes=False) == out
|
||||
assert ujson.dumps(o, reject_bytes=False, sort_keys=True) == out
|
||||
|
||||
out2 = '{"\ud800":"\udfff"}'
|
||||
assert ujson.dumps({"\ud800": "\udfff"}, ensure_ascii=False) == out2
|
||||
assert ujson.dumps({"\ud800": "\udfff"}, ensure_ascii=False, sort_keys=True) == out2
|
||||
|
||||
|
||||
def test_sort_keys():
|
||||
|
|
Loading…
Reference in New Issue