From 8a946e58301205f99f60a671644ea347ddc2aeb2 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist <JustAnotherArchivist@users.noreply.github.com>
Date: Mon, 27 Jun 2022 22:54:49 +0000
Subject: [PATCH] Add separators encoding parameter

Closes #283
---
 lib/ultrajson.h     |  7 ++++
 lib/ultrajsonenc.c  | 21 +++++-------
 python/objToJSON.c  | 83 +++++++++++++++++++++++++++++++++++++++++++--
 tests/test_ujson.py | 36 ++++++++++++++++++++
 4 files changed, 132 insertions(+), 15 deletions(-)

diff --git a/lib/ultrajson.h b/lib/ultrajson.h
index eda221a..d4fec61 100644
--- a/lib/ultrajson.h
+++ b/lib/ultrajson.h
@@ -268,6 +268,13 @@ typedef struct __JSONObjectEncoder
   If true, bytes are rejected. */
   int rejectBytes;
 
+  /*
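+  Configuration for item and key separators, e.g. "," and ":" for a compact representation or ", " and ": " to match the Python standard library's defaults. Each separator is a byte string of the given length; the encoder copies exactly that many bytes, so embedded NUL bytes are allowed. */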
+  size_t itemSeparatorLength;
+  const char *itemSeparatorChars;
+  size_t keySeparatorLength;
+  const char *keySeparatorChars;
+
   /*
   Private pointer to be used by the caller. Passed as encoder_prv in JSONTypeContext */
   void *prv;
diff --git a/lib/ultrajsonenc.c b/lib/ultrajsonenc.c
index 9f72f9b..dfc692d 100644
--- a/lib/ultrajsonenc.c
+++ b/lib/ultrajsonenc.c
@@ -677,8 +677,7 @@ static void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t c
 
   if (name)
   {
-    // 2 extra for the colon and optional space after it
-    Buffer_Reserve(enc, RESERVE_STRING(cbName) + 2);
+    Buffer_Reserve(enc, RESERVE_STRING(cbName) + enc->keySeparatorLength);
     Buffer_AppendCharUnchecked(enc, '\"');
 
     if (enc->forceASCII)
@@ -698,11 +697,7 @@ static void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t c
 
     Buffer_AppendCharUnchecked(enc, '\"');
 
-    Buffer_AppendCharUnchecked (enc, ':');
-    if (enc->indent)
-    {
-      Buffer_AppendCharUnchecked (enc, ' ');
-    }
+    Buffer_memcpy(enc, enc->keySeparatorChars, enc->keySeparatorLength);
   }
 
   tc.encoder_prv = enc->prv;
@@ -741,12 +736,12 @@ static void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t c
 
       while (enc->iterNext(obj, &tc))
       {
-        // The extra 2 bytes cover the comma and (optional) newline.
-        Buffer_Reserve (enc, enc->indent * (enc->level + 1) + 2);
+        // Reserve room for the item separator plus one extra byte for the optional newline.
+        Buffer_Reserve (enc, enc->indent * (enc->level + 1) + enc->itemSeparatorLength + 1);
 
         if (count > 0)
         {
-          Buffer_AppendCharUnchecked (enc, ',');
+          Buffer_memcpy(enc, enc->itemSeparatorChars, enc->itemSeparatorLength);
         }
         Buffer_AppendIndentNewlineUnchecked (enc);
 
@@ -786,8 +781,8 @@ static void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t c
 
       while ((res = enc->iterNext(obj, &tc)))
       {
-        // The extra 2 bytes cover the comma and optional newline.
-        Buffer_Reserve (enc, enc->indent * (enc->level + 1) + 2);
+        // Reserve room for the item separator plus one extra byte for the optional newline.
+        Buffer_Reserve (enc, enc->indent * (enc->level + 1) + enc->itemSeparatorLength + 1);
 
         if(res < 0)
         {
@@ -799,7 +794,7 @@ static void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t c
 
         if (count > 0)
         {
-          Buffer_AppendCharUnchecked (enc, ',');
+          Buffer_memcpy(enc, enc->itemSeparatorChars, enc->itemSeparatorLength);
         }
         Buffer_AppendIndentNewlineUnchecked (enc);
 
diff --git a/python/objToJSON.c b/python/objToJSON.c
index fd0d6c1..b051b8b 100644
--- a/python/objToJSON.c
+++ b/python/objToJSON.c
@@ -794,7 +794,7 @@ static char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen)
 
 PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs)
 {
-  static char *kwlist[] = { "obj", "ensure_ascii", "encode_html_chars", "escape_forward_slashes", "sort_keys", "indent", "allow_nan", "reject_bytes", "default", NULL };
+  static char *kwlist[] = { "obj", "ensure_ascii", "encode_html_chars", "escape_forward_slashes", "sort_keys", "indent", "allow_nan", "reject_bytes", "default", "separators", NULL };
 
   char buffer[65536];
   char *ret;
@@ -806,6 +806,11 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs)
   PyObject *oescapeForwardSlashes = NULL;
   PyObject *osortKeys = NULL;
   PyObject *odefaultFn = NULL;
+  PyObject *oseparators = NULL;
+  PyObject *oseparatorsItem = NULL;
+  PyObject *separatorsItemBytes = NULL;
+  PyObject *oseparatorsKey = NULL;
+  PyObject *separatorsKeyBytes = NULL;
   int allowNan = -1;
   int orejectBytes = -1;
   size_t retLen;
@@ -834,13 +839,17 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs)
     0, //indent
     1, //allowNan
     1, //rejectBytes
+    0, //itemSeparatorLength
+    NULL, //itemSeparatorChars
+    0, //keySeparatorLength
+    NULL, //keySeparatorChars
     NULL, //prv
   };
 
 
   PRINTMARK();
 
-  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OOOOiiiO", kwlist, &oinput, &oensureAscii, &oencodeHTMLChars, &oescapeForwardSlashes, &osortKeys, &encoder.indent, &allowNan, &orejectBytes, &odefaultFn))
+  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OOOOiiiOO", kwlist, &oinput, &oensureAscii, &oencodeHTMLChars, &oescapeForwardSlashes, &osortKeys, &encoder.indent, &allowNan, &orejectBytes, &odefaultFn, &oseparators))
   {
     return NULL;
   }
@@ -887,6 +896,69 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs)
     encoder.rejectBytes = orejectBytes;
   }
 
+  if (oseparators != NULL && oseparators != Py_None)
+  {
+    if (!PyTuple_Check(oseparators))
+    {
+      PyErr_SetString(PyExc_TypeError, "expected tuple or None as separator");
+      return NULL;
+    }
+    if (PyTuple_Size (oseparators) != 2)
+    {
+      PyErr_SetString(PyExc_ValueError, "expected tuple of size 2 as separator");
+      return NULL;
+    }
+    oseparatorsItem = PyTuple_GetItem(oseparators, 0);
+    if (PyErr_Occurred())
+    {
+      return NULL;
+    }
+    if (!PyUnicode_Check(oseparatorsItem))
+    {
+      PyErr_SetString(PyExc_TypeError, "expected str as item separator");
+      return NULL;
+    }
+    oseparatorsKey = PyTuple_GetItem(oseparators, 1);
+    if (PyErr_Occurred())
+    {
+      return NULL;
+    }
+    if (!PyUnicode_Check(oseparatorsKey))
+    {
+      PyErr_SetString(PyExc_TypeError, "expected str as key separator");
+      return NULL;
+    }
+    encoder.itemSeparatorChars = PyUnicodeToUTF8Raw(oseparatorsItem, &encoder.itemSeparatorLength, &separatorsItemBytes);
+    if (encoder.itemSeparatorChars == NULL)
+    {
+      PyErr_SetString(PyExc_ValueError, "item separator malformed");
+      goto ERROR;
+    }
+    encoder.keySeparatorChars = PyUnicodeToUTF8Raw(oseparatorsKey, &encoder.keySeparatorLength, &separatorsKeyBytes);
+    if (encoder.keySeparatorChars == NULL)
+    {
+      PyErr_SetString(PyExc_ValueError, "key separator malformed");
+      goto ERROR;
+    }
+  }
+  else
+  {
+    // Default to most compact representation
+    encoder.itemSeparatorChars = ",";
+    encoder.itemSeparatorLength = 1;
+    if (encoder.indent)
+    {
+      // Extra space when indentation is in use
+      encoder.keySeparatorChars = ": ";
+      encoder.keySeparatorLength = 2;
+    }
+    else
+    {
+      encoder.keySeparatorChars = ":";
+      encoder.keySeparatorLength = 1;
+    }
+  }
+
   encoder.d2s = NULL;
   dconv_d2s_init(&encoder.d2s, DCONV_D2S_EMIT_TRAILING_DECIMAL_POINT | DCONV_D2S_EMIT_TRAILING_ZERO_AFTER_POINT | DCONV_D2S_EMIT_POSITIVE_EXPONENT_SIGN,
                  csInf, csNan, 'e', DCONV_DECIMAL_IN_SHORTEST_LOW, DCONV_DECIMAL_IN_SHORTEST_HIGH, 0, 0);
@@ -896,6 +968,8 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs)
   PRINTMARK();
 
   dconv_d2s_free(&encoder.d2s);
+  Py_XDECREF(separatorsItemBytes);
+  Py_XDECREF(separatorsKeyBytes);
 
   if (encoder.errorMsg && !PyErr_Occurred())
   {
@@ -923,6 +997,11 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs)
   PRINTMARK();
 
   return newobj;
+
+ERROR:
+  Py_XDECREF(separatorsItemBytes);
+  Py_XDECREF(separatorsKeyBytes);
+  return NULL;
 }
 
 PyObject* objToJSONFile(PyObject* self, PyObject *args, PyObject *kwargs)
diff --git a/tests/test_ujson.py b/tests/test_ujson.py
index d50ebbf..61dfaa0 100644
--- a/tests/test_ujson.py
+++ b/tests/test_ujson.py
@@ -1088,6 +1088,42 @@ def test_no_memory_leak_encoding_errors(input):
     no_memory_leak(f"functools.partial(ujson.dumps, {input})")
 
 
+@pytest.mark.parametrize(
+    "separators, expected",
+    [
+        (None, '{"a":0,"b":1}'),
+        ((",", ":"), '{"a":0,"b":1}'),
+        ((", ", ": "), '{"a": 0, "b": 1}'),
+        # Unusual separators are emitted verbatim, even though the output is then not valid JSON
+        (("\u203d", "\u00a1"), '{"a"\u00a10\u203d"b"\u00a11}'),
+        (("i\x00", "k\x00"), '{"a"k\x000i\x00"b"k\x001}'),
+        (("\udc80", "\udc81"), '{"a"\udc810\udc80"b"\udc811}'),
+    ],
+)
+def test_separators(separators, expected):
+    assert ujson.dumps({"a": 0, "b": 1}, separators=separators) == expected
+
+
+@pytest.mark.parametrize(
+    "separators, expected_exception",
+    [
+        (True, TypeError),
+        (0, TypeError),
+        (b"", TypeError),
+        ((), ValueError),
+        ((",",), ValueError),
+        ((",", ":", "x"), ValueError),
+        ((True, 0), TypeError),
+        ((",", True), TypeError),
+        ((True, ":"), TypeError),
+        ((b",", b":"), TypeError),
+    ],
+)
+def test_separators_errors(separators, expected_exception):
+    with pytest.raises(expected_exception):
+        ujson.dumps({"a": 0, "b": 1}, separators=separators)
+
+
 """
 def test_decode_numeric_int_frc_overflow():
 input = "X.Y"