- Added forceASCII option to the encoder to control whether output is forced to be ASCII (all bytes < 128) or not.

- Added ensure_ascii keyword argument to encode/dumps (default: True). Use ensure_ascii=False to allow UTF-8 strings to be output unescaped, which should be faster and more space efficient. - Bumped version
2024-11-24 01:04:19 +01:00 · 2011-09-09 16:00:23 +02:00 · 2011-09-09 16:00:23 +02:00 · e962ae68e2
commit e962ae68e2
parent 6738e59af1
6 changed files with 145 additions and 23 deletions
--- a/python/objToJSON.c
+++ b/python/objToJSON.c
@ -662,11 +662,15 @@ char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen)
 }


-PyObject* objToJSON(PyObject* self, PyObject *arg)
+PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs)
 {
+	static char *kwlist[] = { "ensure_ascii", NULL};
+
 	char buffer[65536];
 	char *ret;
 	PyObject *newobj;
+	PyObject *oinput = NULL;
+	PyObject *oensureAscii = NULL;

 	JSONObjectEncoder encoder = 
 	{
@ -687,9 +691,24 @@ PyObject* objToJSON(PyObject* self, PyObject *arg)
 		PyObject_Free, //JSPFN_FREE free;
 		-1, //recursionMax
 		5, //default decimal precision
+		1, //forceAscii
 	};
+	
+	PRINTMARK();

-	ret = JSON_EncodeObject (arg, &encoder, buffer, sizeof (buffer));
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|O", kwlist, &oinput, &oensureAscii))
+	{
+		return PyErr_Format(PyExc_TypeError, "Expected object, **kw ensure_ascii true/false");
+	}
+
+	if (oensureAscii != NULL && !PyObject_IsTrue(oensureAscii))
+	{
+		encoder.forceASCII = 0;
+	}
+
+	PRINTMARK();
+	ret = JSON_EncodeObject (oinput, &encoder, buffer, sizeof (buffer));
+	PRINTMARK();


 	if (encoder.errorMsg)
@ -710,10 +729,12 @@ PyObject* objToJSON(PyObject* self, PyObject *arg)
 		encoder.free (ret);
 	}

+	PRINTMARK();
+
 	return newobj;
 }

-PyObject* objToJSONFile(PyObject* self, PyObject *args)
+PyObject* objToJSONFile(PyObject* self, PyObject *args, PyObject *kwargs)
 {
 	PyObject *data;
 	PyObject *file;
@ -721,6 +742,8 @@ PyObject* objToJSONFile(PyObject* self, PyObject *args)
 	PyObject *write;
 	PyObject *argtuple;

+	PRINTMARK();
+
 	if (!PyArg_ParseTuple (args, "OO", &data, &file)) {
 		return NULL;
 	}
@ -739,11 +762,14 @@ PyObject* objToJSONFile(PyObject* self, PyObject *args)
 		return NULL;
 	}

-	string = objToJSON (self, data);
+	argtuple = PyTuple_Pack(1, data);
+
+	string = objToJSON (self, argtuple, kwargs);

 	if (string == NULL)
 	{
 		Py_XDECREF(write);
+		Py_XDECREF(argtuple);
 		return NULL;
 	}

@ -763,6 +789,11 @@ PyObject* objToJSONFile(PyObject* self, PyObject *args)
 	Py_XDECREF(write);
 	Py_XDECREF(argtuple);
 	Py_XDECREF(string);
+
+	PRINTMARK();
+
 	Py_RETURN_NONE;
+	
+
 }

--- a/python/tests.py
+++ b/python/tests.py
@ -214,6 +214,14 @@ class UltraJSONTests(TestCase):
        self.assertEquals(int(expected), json.loads(output))
        self.assertEquals(int(expected), ujson.decode(output))
        pass
+        
+    def test_encodeToUTF8(self):
+        input = "\xe6\x97\xa5\xd1\x88"
+        enc = ujson.encode(input, ensure_ascii=false)
+        dec = ujson.decode(enc)
+        self.assertEquals(enc, json.dumps(input, encoding="utf-8"))
+        self.assertEquals(dec, json.loads(enc))
+        

    def test_encodeRecursionMax(self):
        # 8 is the max recursion depth
--- a/python/ujson.c
+++ b/python/ujson.c
@ -16,11 +16,11 @@ PyObject* JSONFileToObj(PyObject* self, PyObject *file);


 static PyMethodDef ujsonMethods[] = {
-	{"encode", objToJSON, METH_O, "Converts arbitrary object recursivly into JSON"},
+	{"encode", objToJSON, METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursivly into JSON. Use ensure_ascii=false to output UTF-8"},
 	{"decode", JSONToObj, METH_O, "Converts JSON as string to dict object structure"},
-	{"dumps", objToJSON, METH_O,  "Converts arbitrary object recursivly into JSON"},
+	{"dumps", objToJSON, METH_VARARGS | METH_KEYWORDS,  "Converts arbitrary object recursivly into JSON. Use ensure_ascii=false to output UTF-8"},
 	{"loads", JSONToObj, METH_O,  "Converts JSON as string to dict object structure"},
-	{"dump", objToJSONFile, METH_VARARGS, "Converts arbitrary object recursively into JSON file"},
+	{"dump", objToJSONFile, METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursively into JSON file. Use ensure_ascii=false to output UTF-8"},
 	{"load", JSONFileToObj, METH_O, "Converts JSON as file to dict object structure"},
 	{NULL, NULL, 0, NULL}		/* Sentinel */
 };
--- a/python/version.h
+++ b/python/version.h
@ -1 +1 @@
-#define UJSON_VERSION "1.6"
+#define UJSON_VERSION "1.7"
--- a/ultrajson.h
+++ b/ultrajson.h
@ -165,6 +165,10 @@ typedef	void *(*JSPFN_MALLOC)(size_t size);
 typedef void (*JSPFN_FREE)(void *pptr);
 typedef void *(*JSPFN_REALLOC)(void *base, size_t size);

+
+
+typedef int (*JSPFN_ESCAPESTRING)(JSOBJ obj, void *enc, const char *io, const char *end);
+
 typedef struct __JSONObjectEncoder
 {
 	void (*beginTypeContext)(JSOBJ obj, JSONTypeContext *tc);
@ -224,6 +228,10 @@ typedef struct __JSONObjectEncoder
 	Configuration for max decimals of double floating poiunt numbers to encode (0-9) */
 	int doublePrecision;

+	/*
+	If true, output will be ASCII with all characters above 127 encoded as \uXXXX. If false, output will be UTF-8 or whatever charset the strings are supplied in */
+	int forceASCII;
+

 	/*
 	Set to an error message if error occured */
@ -237,6 +245,8 @@ typedef struct __JSONObjectEncoder
 	int heap;
 	int level;

+	JSPFN_ESCAPESTRING EscapeString;
+
 } JSONObjectEncoder;


--- a/ultrajsonenc.c
+++ b/ultrajsonenc.c
@ -52,17 +52,20 @@ Copyright (c) 2007  Nick Galbreath -- nickg [at] modp [dot] com. All rights rese

 static const double g_pow10[] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000};
 static const char g_hexChars[] = "0123456789abcdef";
-static const char g_escapeChars[] = "0123456789\\b\\t\\n\\f\\r\\\"\\\\";
+static const char g_escapeChars[] = "0123456789\\b\\t\\n\\f\\r\\\"\\\\\\/";


 /*
 FIXME: While this is fine dandy and working it's a magic value mess which probably only the author understands.
 Needs a cleanup and more documentation */
-static const JSUINT8 g_utf8LengthLookup[256] = 
+
+/*
+Table for pure ASCII output, escaping all characters above 127 to \uXXXX */
+static const JSUINT8 g_asciiOutputTable[256] = 
 {
 /* 0x00 */ 0, 30, 30, 30, 30, 30, 30, 30, 10, 12, 14, 30, 16, 18, 30, 30, 
 /* 0x10 */ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
-/* 0x20 */ 1, 1, 20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
+/* 0x20 */ 1, 1, 20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 24, 
 /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
 /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 22, 1, 1, 1,
@ -78,6 +81,7 @@ static const JSUINT8 g_utf8LengthLookup[256] =
 /* 0xf0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
 };

+
 static void SetError (JSOBJ obj, JSONObjectEncoder *enc, const char *message)
 {
 	enc->errorMsg = message;
@ -121,6 +125,71 @@ FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC Buffer_AppendShortHexUnchecked (c
 	*(outputOffset++) = g_hexChars[(value & 0x000f) >> 0];
 }

+int Buffer_EscapeStringUnvalidated (JSOBJ obj, JSONObjectEncoder *enc, const char *io, const char *end) // Copies the NUL-terminated string at io to enc->offset, JSON-escaping quote, backslash and control bytes; returns TRUE at the terminator.
+{
+	char *of = (char *) enc->offset; // local write cursor into the encoder's output buffer
+
+	while (1) // obj and end are never read here; the only exit is the 0x00 case below
+	{
+		switch (*io)
+		{
+		case 0x00: // terminator byte: publish the write cursor and stop
+			enc->offset += (of - enc->offset); // equivalent to enc->offset = of
+			return TRUE;
+
+		case '\"': (*of++) = '\\'; (*of++) = '\"'; break; // two-character escapes
+		case '\\': (*of++) = '\\'; (*of++) = '\\'; break;
+		//case '/':  (*of++) = '\\'; (*of++) = '/'; break;
+		case '\b': (*of++) = '\\'; (*of++) = 'b'; break;
+		case '\f': (*of++) = '\\'; (*of++) = 'f'; break;
+		case '\n': (*of++) = '\\'; (*of++) = 'n'; break;
+		case '\r': (*of++) = '\\'; (*of++) = 'r'; break;
+		case '\t': (*of++) = '\\'; (*of++) = 't'; break;
+
+		case 0x01: // remaining bytes below 0x20 are written as \u00XX
+		case 0x02:
+		case 0x03:
+		case 0x04:
+		case 0x05:
+		case 0x06:
+		case 0x07:
+		case 0x0b:
+		case 0x0e:
+		case 0x0f:
+		case 0x10:
+		case 0x11:
+		case 0x12:
+		case 0x13:
+		case 0x14:
+		case 0x15:
+		case 0x16:
+		case 0x17:
+		case 0x18:
+		case 0x19:
+		case 0x1a:
+		case 0x1b:
+		case 0x1c:
+		case 0x1d:
+		case 0x1e:
+		case 0x1f:
+			*(of++) = '\\'; // six output characters: backslash, 'u', "00", then two hex digits
+			*(of++) = 'u';
+			*(of++) = '0';
+			*(of++) = '0';
+			*(of++) = g_hexChars[ (unsigned char) (((*io) & 0xf0) >> 4)]; // high nibble
+			*(of++) = g_hexChars[ (unsigned char) ((*io) & 0x0f)]; // low nibble
+			break;
+
+		default: (*of++) = (*io); break; // every other byte, including those >= 0x80, is copied unchanged
+		}
+
+		*io++; // advances io; the dereferenced value is discarded
+	}
+
+	return FALSE; // not reachable: the loop above only leaves via return TRUE
+} // NOTE(review): no output-capacity check is made in this function; presumably the caller reserves enough space first - confirm
+
+
+
 /*
 FIXME:
 This code only works with Little and Big Endian
@ -128,24 +197,17 @@ This code only works with Little and Big Endian
 FIXME: The JSON spec says escape "/" but non of the others do and we don't 
 want to be left alone doing it so we don't :)

-FIXME: It should be faster to do SHIFT and then AND instead of AND and SHIFT.
-Example:
-(x & 0x3f00) >> 8) => Longer/more opcodes than below
-(x >> 8) & 0x3f)   => Probably faster/smaller
-Seems that atleast MSVC9 does this optimization by itself from time to time. Not sure really
-
 */
-
-int Buffer_EscapeString (JSOBJ obj, JSONObjectEncoder *enc, const char *io, const char *end)
+int Buffer_EscapeStringValidated (JSOBJ obj, JSONObjectEncoder *enc, const char *io, const char *end)
 {
 	JSUTF32 ucs;
 	char *of = (char *) enc->offset;
-	
+
 	while (1)
 	{

 		//JSUINT8 chr = (unsigned char) *io;
-		JSUINT8 utflen = g_utf8LengthLookup[(unsigned char) *io];
+		JSUINT8 utflen = g_asciiOutputTable[(unsigned char) *io];

 		switch (utflen)
 		{
@ -277,6 +339,7 @@ int Buffer_EscapeString (JSOBJ obj, JSONObjectEncoder *enc, const char *io, cons
 			case 18:
 			case 20:
 			case 22:
+			//case 24: (enable for / escaping)
 				*(of++) = *( (char *) (g_escapeChars + utflen + 0));
 				*(of++) = *( (char *) (g_escapeChars + utflen + 1));
 				io ++;
@ -539,7 +602,7 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t cbName)
 	if (name)
 	{
 		Buffer_AppendCharUnchecked(enc, '\"');
-		if (!Buffer_EscapeString(obj, enc, name, name + cbName))
+		if (!enc->EscapeString(obj, enc, name, name + cbName))
 		{
 			return;
 		}
@ -680,7 +743,7 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t cbName)
 			Buffer_Reserve(enc, ((szlen / 4) + 1) * 12);
 			Buffer_AppendCharUnchecked (enc, '\"');

-			if (!Buffer_EscapeString(obj, enc, value, value + szlen))
+			if (!enc->EscapeString(obj, enc, value, value + szlen))
 			{
 				enc->endTypeContext(obj, &tc);
 				enc->level --;
@ -717,6 +780,16 @@ char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, size_t
 		enc->doublePrecision = JSON_DOUBLE_MAX_DECIMALS;
 	}

+	if (enc->forceASCII)
+	{
+		enc->EscapeString = Buffer_EscapeStringValidated;
+	}
+	else
+	{
+		enc->EscapeString = Buffer_EscapeStringUnvalidated;
+	}
+
+
 	if (_buffer == NULL)
 	{
 		_cbBuffer = 32768;