From 8a946e58301205f99f60a671644ea347ddc2aeb2 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Mon, 27 Jun 2022 22:54:49 +0000 Subject: [PATCH] Add separators encoding parameter Closes #283 --- lib/ultrajson.h | 7 ++++ lib/ultrajsonenc.c | 21 +++++------- python/objToJSON.c | 83 +++++++++++++++++++++++++++++++++++++++++++-- tests/test_ujson.py | 36 ++++++++++++++++++++ 4 files changed, 132 insertions(+), 15 deletions(-) diff --git a/lib/ultrajson.h b/lib/ultrajson.h index eda221a..d4fec61 100644 --- a/lib/ultrajson.h +++ b/lib/ultrajson.h @@ -268,6 +268,13 @@ typedef struct __JSONObjectEncoder If true, bytes are rejected. */ int rejectBytes; + /* + Configuration for item and key separators, e.g. "," and ":" for a compact representation or ", " and ": " to match the Python standard library's defaults. */ + size_t itemSeparatorLength; + const char *itemSeparatorChars; + size_t keySeparatorLength; + const char *keySeparatorChars; + /* Private pointer to be used by the caller. Passed as encoder_prv in JSONTypeContext */ void *prv; diff --git a/lib/ultrajsonenc.c b/lib/ultrajsonenc.c index 9f72f9b..dfc692d 100644 --- a/lib/ultrajsonenc.c +++ b/lib/ultrajsonenc.c @@ -677,8 +677,7 @@ static void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t c if (name) { - // 2 extra for the colon and optional space after it - Buffer_Reserve(enc, RESERVE_STRING(cbName) + 2); + Buffer_Reserve(enc, RESERVE_STRING(cbName) + enc->keySeparatorLength); Buffer_AppendCharUnchecked(enc, '\"'); if (enc->forceASCII) @@ -698,11 +697,7 @@ static void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t c Buffer_AppendCharUnchecked(enc, '\"'); - Buffer_AppendCharUnchecked (enc, ':'); - if (enc->indent) - { - Buffer_AppendCharUnchecked (enc, ' '); - } + Buffer_memcpy(enc, enc->keySeparatorChars, enc->keySeparatorLength); } tc.encoder_prv = enc->prv; @@ -741,12 +736,12 @@ static void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t c while (enc->iterNext(obj, &tc)) { - // The extra 2 bytes cover the comma and (optional) newline. - Buffer_Reserve (enc, enc->indent * (enc->level + 1) + 2); + // The extra 1 byte covers the optional newline. + Buffer_Reserve (enc, enc->indent * (enc->level + 1) + enc->itemSeparatorLength + 1); if (count > 0) { - Buffer_AppendCharUnchecked (enc, ','); + Buffer_memcpy(enc, enc->itemSeparatorChars, enc->itemSeparatorLength); } Buffer_AppendIndentNewlineUnchecked (enc); @@ -786,8 +781,8 @@ static void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t c while ((res = enc->iterNext(obj, &tc))) { - // The extra 2 bytes cover the comma and optional newline. - Buffer_Reserve (enc, enc->indent * (enc->level + 1) + 2); + // The extra 1 byte covers the optional newline. + Buffer_Reserve (enc, enc->indent * (enc->level + 1) + enc->itemSeparatorLength + 1); if(res < 0) { @@ -799,7 +794,7 @@ static void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t c if (count > 0) { - Buffer_AppendCharUnchecked (enc, ','); + Buffer_memcpy(enc, enc->itemSeparatorChars, enc->itemSeparatorLength); } Buffer_AppendIndentNewlineUnchecked (enc); diff --git a/python/objToJSON.c b/python/objToJSON.c index fd0d6c1..b051b8b 100644 --- a/python/objToJSON.c +++ b/python/objToJSON.c @@ -794,7 +794,7 @@ static char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs) { - static char *kwlist[] = { "obj", "ensure_ascii", "encode_html_chars", "escape_forward_slashes", "sort_keys", "indent", "allow_nan", "reject_bytes", "default", NULL }; + static char *kwlist[] = { "obj", "ensure_ascii", "encode_html_chars", "escape_forward_slashes", "sort_keys", "indent", "allow_nan", "reject_bytes", "default", "separators", NULL }; char buffer[65536]; char *ret; @@ -806,6 +806,11 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs) PyObject *oescapeForwardSlashes = NULL; PyObject *osortKeys = NULL; PyObject *odefaultFn = NULL; + PyObject *oseparators = NULL; + PyObject *oseparatorsItem = NULL; + PyObject *separatorsItemBytes = NULL; + PyObject *oseparatorsKey = NULL; + PyObject *separatorsKeyBytes = NULL; int allowNan = -1; int orejectBytes = -1; size_t retLen; @@ -834,13 +839,17 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs) 0, //indent 1, //allowNan 1, //rejectBytes + 0, //itemSeparatorLength + NULL, //itemSeparatorChars + 0, //keySeparatorLength + NULL, //keySeparatorChars NULL, //prv }; PRINTMARK(); - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OOOOiiiO", kwlist, &oinput, &oensureAscii, &oencodeHTMLChars, &oescapeForwardSlashes, &osortKeys, &encoder.indent, &allowNan, &orejectBytes, &odefaultFn)) + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OOOOiiiOO", kwlist, &oinput, &oensureAscii, &oencodeHTMLChars, &oescapeForwardSlashes, &osortKeys, &encoder.indent, &allowNan, &orejectBytes, &odefaultFn, &oseparators)) { return NULL; } @@ -887,6 +896,69 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs) encoder.rejectBytes = orejectBytes; } + if (oseparators != NULL && oseparators != Py_None) + { + if (!PyTuple_Check(oseparators)) + { + PyErr_SetString(PyExc_TypeError, "expected tuple or None as separator"); + return NULL; + } + if (PyTuple_Size (oseparators) != 2) + { + PyErr_SetString(PyExc_ValueError, "expected tuple of size 2 as separator"); + return NULL; + } + oseparatorsItem = PyTuple_GetItem(oseparators, 0); + if (PyErr_Occurred()) + { + return NULL; + } + if (!PyUnicode_Check(oseparatorsItem)) + { + PyErr_SetString(PyExc_TypeError, "expected str as item separator"); + return NULL; + } + oseparatorsKey = PyTuple_GetItem(oseparators, 1); + if (PyErr_Occurred()) + { + return NULL; + } + if (!PyUnicode_Check(oseparatorsKey)) + { + PyErr_SetString(PyExc_TypeError, "expected str as key separator"); + return NULL; + } + encoder.itemSeparatorChars = PyUnicodeToUTF8Raw(oseparatorsItem, &encoder.itemSeparatorLength, &separatorsItemBytes); + if (encoder.itemSeparatorChars == NULL) + { + PyErr_SetString(PyExc_ValueError, "item separator malformed"); + goto ERROR; + } + encoder.keySeparatorChars = PyUnicodeToUTF8Raw(oseparatorsKey, &encoder.keySeparatorLength, &separatorsKeyBytes); + if (encoder.keySeparatorChars == NULL) + { + PyErr_SetString(PyExc_ValueError, "key separator malformed"); + goto ERROR; + } + } + else + { + // Default to most compact representation + encoder.itemSeparatorChars = ","; + encoder.itemSeparatorLength = 1; + if (encoder.indent) + { + // Extra space when indentation is in use + encoder.keySeparatorChars = ": "; + encoder.keySeparatorLength = 2; + } + else + { + encoder.keySeparatorChars = ":"; + encoder.keySeparatorLength = 1; + } + } + encoder.d2s = NULL; dconv_d2s_init(&encoder.d2s, DCONV_D2S_EMIT_TRAILING_DECIMAL_POINT | DCONV_D2S_EMIT_TRAILING_ZERO_AFTER_POINT | DCONV_D2S_EMIT_POSITIVE_EXPONENT_SIGN, csInf, csNan, 'e', DCONV_DECIMAL_IN_SHORTEST_LOW, DCONV_DECIMAL_IN_SHORTEST_HIGH, 0, 0); @@ -896,6 +968,8 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs) PRINTMARK(); dconv_d2s_free(&encoder.d2s); + Py_XDECREF(separatorsItemBytes); + Py_XDECREF(separatorsKeyBytes); if (encoder.errorMsg && !PyErr_Occurred()) { @@ -923,6 +997,11 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs) PRINTMARK(); return newobj; + +ERROR: + Py_XDECREF(separatorsItemBytes); + Py_XDECREF(separatorsKeyBytes); + return NULL; } PyObject* objToJSONFile(PyObject* self, PyObject *args, PyObject *kwargs) diff --git a/tests/test_ujson.py b/tests/test_ujson.py index d50ebbf..61dfaa0 100644 --- a/tests/test_ujson.py +++ b/tests/test_ujson.py @@ -1088,6 +1088,42 @@ def test_no_memory_leak_encoding_errors(input): no_memory_leak(f"functools.partial(ujson.dumps, {input})") +@pytest.mark.parametrize( + "separators, expected", + [ + (None, '{"a":0,"b":1}'), + ((",", ":"), '{"a":0,"b":1}'), + ((", ", ": "), '{"a": 0, "b": 1}'), + # And some weird values, even though they produce invalid JSON + (("\u203d", "\u00a1"), '{"a"\u00a10\u203d"b"\u00a11}'), + (("i\x00", "k\x00"), '{"a"k\x000i\x00"b"k\x001}'), + (("\udc80", "\udc81"), '{"a"\udc810\udc80"b"\udc811}'), + ], +) +def test_separators(separators, expected): + assert ujson.dumps({"a": 0, "b": 1}, separators=separators) == expected + + +@pytest.mark.parametrize( + "separators, expected_exception", + [ + (True, TypeError), + (0, TypeError), + (b"", TypeError), + ((), ValueError), + ((",",), ValueError), + ((",", ":", "x"), ValueError), + ((True, 0), TypeError), + ((",", True), TypeError), + ((True, ":"), TypeError), + ((b",", b":"), TypeError), + ], +) +def test_separators_errors(separators, expected_exception): + with pytest.raises(expected_exception): + ujson.dumps({"a": 0, "b": 1}, separators=separators) + + """ def test_decode_numeric_int_frc_overflow(): input = "X.Y"