From e962ae68e22cb1d60927cbdd02bc0a24324b807a Mon Sep 17 00:00:00 2001 From: Jonas Tarnstrom Date: Fri, 9 Sep 2011 16:00:23 +0200 Subject: [PATCH] - Added forceAscii option to encoder to control if output is forced to be ascii (<128) or not. - Added ensure_ascii kwargs to encode/dumps (true is default). Use ensure_ascii=false to allow UTF_8 strings to be outputted, should be faster and more space efficient. - Bumped version --- python/objToJSON.c | 39 +++++++++++++++-- python/tests.py | 8 ++++ python/ujson.c | 6 +-- python/version.h | 2 +- ultrajson.h | 10 +++++ ultrajsonenc.c | 103 ++++++++++++++++++++++++++++++++++++++------- 6 files changed, 145 insertions(+), 23 deletions(-) diff --git a/python/objToJSON.c b/python/objToJSON.c index 34e7c8b..dec3420 100644 --- a/python/objToJSON.c +++ b/python/objToJSON.c @@ -662,11 +662,15 @@ char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) } -PyObject* objToJSON(PyObject* self, PyObject *arg) +PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs) { + static char *kwlist[] = { "ensure_ascii", NULL}; + char buffer[65536]; char *ret; PyObject *newobj; + PyObject *oinput = NULL; + PyObject *oensureAscii = NULL; JSONObjectEncoder encoder = { @@ -687,9 +691,24 @@ PyObject* objToJSON(PyObject* self, PyObject *arg) PyObject_Free, //JSPFN_FREE free; -1, //recursionMax 5, //default decimal precision + 1, //forceAscii }; + + PRINTMARK(); - ret = JSON_EncodeObject (arg, &encoder, buffer, sizeof (buffer)); + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|O", kwlist, &oinput, &oensureAscii)) + { + return PyErr_Format(PyExc_TypeError, "Expected object, **kw ensure_ascii true/false"); + } + + if (oensureAscii != NULL && !PyObject_IsTrue(oensureAscii)) + { + encoder.forceASCII = 0; + } + + PRINTMARK(); + ret = JSON_EncodeObject (oinput, &encoder, buffer, sizeof (buffer)); + PRINTMARK(); if (encoder.errorMsg) @@ -710,10 +729,12 @@ PyObject* objToJSON(PyObject* self, PyObject *arg) encoder.free (ret); } + PRINTMARK(); + return newobj; } -PyObject* objToJSONFile(PyObject* self, PyObject *args) +PyObject* objToJSONFile(PyObject* self, PyObject *args, PyObject *kwargs) { PyObject *data; PyObject *file; @@ -721,6 +742,8 @@ PyObject* objToJSONFile(PyObject* self, PyObject *args) PyObject *write; PyObject *argtuple; + PRINTMARK(); + if (!PyArg_ParseTuple (args, "OO", &data, &file)) { return NULL; } @@ -739,11 +762,14 @@ PyObject* objToJSONFile(PyObject* self, PyObject *args) return NULL; } - string = objToJSON (self, data); + argtuple = PyTuple_Pack(1, data); + + string = objToJSON (self, argtuple, kwargs); if (string == NULL) { Py_XDECREF(write); + Py_XDECREF(argtuple); return NULL; } @@ -763,6 +789,11 @@ PyObject* objToJSONFile(PyObject* self, PyObject *args) Py_XDECREF(write); Py_XDECREF(argtuple); Py_XDECREF(string); + + PRINTMARK(); + Py_RETURN_NONE; + + } diff --git a/python/tests.py b/python/tests.py index ecc51a8..7958be8 100644 --- a/python/tests.py +++ b/python/tests.py @@ -214,6 +214,14 @@ class UltraJSONTests(TestCase): self.assertEquals(int(expected), json.loads(output)) self.assertEquals(int(expected), ujson.decode(output)) pass + + def test_encodeToUTF8(self): + input = "\xe6\x97\xa5\xd1\x88" + enc = ujson.encode(input, ensure_ascii=false) + dec = ujson.decode(enc) + self.assertEquals(enc, json.dumps(input, encoding="utf-8")) + self.assertEquals(dec, json.loads(enc)) + def test_encodeRecursionMax(self): # 8 is the max recursion depth diff --git a/python/ujson.c b/python/ujson.c index f59b9b9..e3d8827 100644 --- a/python/ujson.c +++ b/python/ujson.c @@ -16,11 +16,11 @@ PyObject* JSONFileToObj(PyObject* self, PyObject *file); static PyMethodDef ujsonMethods[] = { - {"encode", objToJSON, METH_O, "Converts arbitrary object recursivly into JSON"}, + {"encode", objToJSON, METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursivly into JSON. Use ensure_ascii=false to output UTF-8"}, {"decode", JSONToObj, METH_O, "Converts JSON as string to dict object structure"}, - {"dumps", objToJSON, METH_O, "Converts arbitrary object recursivly into JSON"}, + {"dumps", objToJSON, METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursivly into JSON. Use ensure_ascii=false to output UTF-8"}, {"loads", JSONToObj, METH_O, "Converts JSON as string to dict object structure"}, - {"dump", objToJSONFile, METH_VARARGS, "Converts arbitrary object recursively into JSON file"}, + {"dump", objToJSONFile, METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursively into JSON file. Use ensure_ascii=false to output UTF-8"}, {"load", JSONFileToObj, METH_O, "Converts JSON as file to dict object structure"}, {NULL, NULL, 0, NULL} /* Sentinel */ }; diff --git a/python/version.h b/python/version.h index 2cdc585..017dfcb 100644 --- a/python/version.h +++ b/python/version.h @@ -1 +1 @@ -#define UJSON_VERSION "1.6" +#define UJSON_VERSION "1.7" diff --git a/ultrajson.h b/ultrajson.h index 665d221..badce27 100644 --- a/ultrajson.h +++ b/ultrajson.h @@ -165,6 +165,10 @@ typedef void *(*JSPFN_MALLOC)(size_t size); typedef void (*JSPFN_FREE)(void *pptr); typedef void *(*JSPFN_REALLOC)(void *base, size_t size); + + +typedef int (*JSPFN_ESCAPESTRING)(JSOBJ obj, void *enc, const char *io, const char *end); + typedef struct __JSONObjectEncoder { void (*beginTypeContext)(JSOBJ obj, JSONTypeContext *tc); @@ -224,6 +228,10 @@ typedef struct __JSONObjectEncoder Configuration for max decimals of double floating poiunt numbers to encode (0-9) */ int doublePrecision; + /* + If true output will be ASCII with all characters above 127 encoded as \uXXXX. If false output will be UTF-8 or what ever charset strings are brought as */ + int forceASCII; + /* Set to an error message if error occured */ @@ -237,6 +245,8 @@ typedef struct __JSONObjectEncoder int heap; int level; + JSPFN_ESCAPESTRING EscapeString; + } JSONObjectEncoder; diff --git a/ultrajsonenc.c b/ultrajsonenc.c index b4f83d4..21d4efb 100644 --- a/ultrajsonenc.c +++ b/ultrajsonenc.c @@ -52,17 +52,20 @@ Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights rese static const double g_pow10[] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000}; static const char g_hexChars[] = "0123456789abcdef"; -static const char g_escapeChars[] = "0123456789\\b\\t\\n\\f\\r\\\"\\\\"; +static const char g_escapeChars[] = "0123456789\\b\\t\\n\\f\\r\\\"\\\\\\/"; /* FIXME: While this is fine dandy and working it's a magic value mess which probably only the author understands. Needs a cleanup and more documentation */ -static const JSUINT8 g_utf8LengthLookup[256] = + +/* +Table for pure ascii output escaping all characters above 127 to \u00XXX */ +static const JSUINT8 g_asciiOutputTable[256] = { /* 0x00 */ 0, 30, 30, 30, 30, 30, 30, 30, 10, 12, 14, 30, 16, 18, 30, 30, /* 0x10 */ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, -/* 0x20 */ 1, 1, 20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x20 */ 1, 1, 20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 24, /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 22, 1, 1, 1, @@ -78,6 +81,7 @@ static const JSUINT8 g_utf8LengthLookup[256] = /* 0xf0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1 }; + static void SetError (JSOBJ obj, JSONObjectEncoder *enc, const char *message) { enc->errorMsg = message; @@ -121,6 +125,71 @@ FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC Buffer_AppendShortHexUnchecked (c *(outputOffset++) = g_hexChars[(value & 0x000f) >> 0]; } +int Buffer_EscapeStringUnvalidated (JSOBJ obj, JSONObjectEncoder *enc, const char *io, const char *end) +{ + char *of = (char *) enc->offset; + + while (1) + { + switch (*io) + { + case 0x00: + enc->offset += (of - enc->offset); + return TRUE; + + case '\"': (*of++) = '\\'; (*of++) = '\"'; break; + case '\\': (*of++) = '\\'; (*of++) = '\\'; break; + //case '/': (*of++) = '\\'; (*of++) = '/'; break; + case '\b': (*of++) = '\\'; (*of++) = 'b'; break; + case '\f': (*of++) = '\\'; (*of++) = 'f'; break; + case '\n': (*of++) = '\\'; (*of++) = 'n'; break; + case '\r': (*of++) = '\\'; (*of++) = 'r'; break; + case '\t': (*of++) = '\\'; (*of++) = 't'; break; + + case 0x01: + case 0x02: + case 0x03: + case 0x04: + case 0x05: + case 0x06: + case 0x07: + case 0x0b: + case 0x0e: + case 0x0f: + case 0x10: + case 0x11: + case 0x12: + case 0x13: + case 0x14: + case 0x15: + case 0x16: + case 0x17: + case 0x18: + case 0x19: + case 0x1a: + case 0x1b: + case 0x1c: + case 0x1d: + case 0x1e: + case 0x1f: + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = g_hexChars[ (unsigned char) (((*io) & 0xf0) >> 4)]; + *(of++) = g_hexChars[ (unsigned char) ((*io) & 0x0f)]; + break; + + default: (*of++) = (*io); break; + } + + *io++; + } + + return FALSE; +} + + /* FIXME: This code only works with Little and Big Endian @@ -128,24 +197,17 @@ This code only works with Little and Big Endian FIXME: The JSON spec says escape "/" but non of the others do and we don't want to be left alone doing it so we don't :) -FIXME: It should be faster to do SHIFT and then AND instead of AND and SHIFT. -Example: -(x & 0x3f00) >> 8) => Longer/more opcodes than below -(x >> 8) & 0x3f) => Probably faster/smaller -Seems that atleast MSVC9 does this optimization by itself from time to time. Not sure really - */ - -int Buffer_EscapeString (JSOBJ obj, JSONObjectEncoder *enc, const char *io, const char *end) +int Buffer_EscapeStringValidated (JSOBJ obj, JSONObjectEncoder *enc, const char *io, const char *end) { JSUTF32 ucs; char *of = (char *) enc->offset; - + while (1) { //JSUINT8 chr = (unsigned char) *io; - JSUINT8 utflen = g_utf8LengthLookup[(unsigned char) *io]; + JSUINT8 utflen = g_asciiOutputTable[(unsigned char) *io]; switch (utflen) { @@ -277,6 +339,7 @@ int Buffer_EscapeString (JSOBJ obj, JSONObjectEncoder *enc, const char *io, cons case 18: case 20: case 22: + //case 24: (enable for / escaping) *(of++) = *( (char *) (g_escapeChars + utflen + 0)); *(of++) = *( (char *) (g_escapeChars + utflen + 1)); io ++; @@ -539,7 +602,7 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t cbName) if (name) { Buffer_AppendCharUnchecked(enc, '\"'); - if (!Buffer_EscapeString(obj, enc, name, name + cbName)) + if (!enc->EscapeString(obj, enc, name, name + cbName)) { return; } @@ -680,7 +743,7 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t cbName) Buffer_Reserve(enc, ((szlen / 4) + 1) * 12); Buffer_AppendCharUnchecked (enc, '\"'); - if (!Buffer_EscapeString(obj, enc, value, value + szlen)) + if (!enc->EscapeString(obj, enc, value, value + szlen)) { enc->endTypeContext(obj, &tc); enc->level --; @@ -717,6 +780,16 @@ char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, size_t enc->doublePrecision = JSON_DOUBLE_MAX_DECIMALS; } + if (enc->forceASCII) + { + enc->EscapeString = Buffer_EscapeStringValidated; + } + else + { + enc->EscapeString = Buffer_EscapeStringUnvalidated; + } + + if (_buffer == NULL) { _cbBuffer = 32768;