1
0
mirror of https://github.com/ultrajson/ultrajson.git synced 2024-11-24 01:04:19 +01:00

- Added forceAscii option to encoder to control if output is forced to be ascii (<128) or not.

- Added ensure_ascii keyword argument to encode/dumps (true is the default). Use ensure_ascii=false to allow UTF-8 strings to be output as-is, which should be faster and more space efficient.

- Bumped version
This commit is contained in:
Jonas Tarnstrom 2011-09-09 16:00:23 +02:00
parent 6738e59af1
commit e962ae68e2
6 changed files with 145 additions and 23 deletions

@ -662,11 +662,15 @@ char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen)
}
PyObject* objToJSON(PyObject* self, PyObject *arg)
PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs)
{
static char *kwlist[] = { "ensure_ascii", NULL};
char buffer[65536];
char *ret;
PyObject *newobj;
PyObject *oinput = NULL;
PyObject *oensureAscii = NULL;
JSONObjectEncoder encoder =
{
@ -687,9 +691,24 @@ PyObject* objToJSON(PyObject* self, PyObject *arg)
PyObject_Free, //JSPFN_FREE free;
-1, //recursionMax
5, //default decimal precision
1, //forceAscii
};
PRINTMARK();
ret = JSON_EncodeObject (arg, &encoder, buffer, sizeof (buffer));
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|O", kwlist, &oinput, &oensureAscii))
{
return PyErr_Format(PyExc_TypeError, "Expected object, **kw ensure_ascii true/false");
}
if (oensureAscii != NULL && !PyObject_IsTrue(oensureAscii))
{
encoder.forceASCII = 0;
}
PRINTMARK();
ret = JSON_EncodeObject (oinput, &encoder, buffer, sizeof (buffer));
PRINTMARK();
if (encoder.errorMsg)
@ -710,10 +729,12 @@ PyObject* objToJSON(PyObject* self, PyObject *arg)
encoder.free (ret);
}
PRINTMARK();
return newobj;
}
PyObject* objToJSONFile(PyObject* self, PyObject *args)
PyObject* objToJSONFile(PyObject* self, PyObject *args, PyObject *kwargs)
{
PyObject *data;
PyObject *file;
@ -721,6 +742,8 @@ PyObject* objToJSONFile(PyObject* self, PyObject *args)
PyObject *write;
PyObject *argtuple;
PRINTMARK();
if (!PyArg_ParseTuple (args, "OO", &data, &file)) {
return NULL;
}
@ -739,11 +762,14 @@ PyObject* objToJSONFile(PyObject* self, PyObject *args)
return NULL;
}
string = objToJSON (self, data);
argtuple = PyTuple_Pack(1, data);
string = objToJSON (self, argtuple, kwargs);
if (string == NULL)
{
Py_XDECREF(write);
Py_XDECREF(argtuple);
return NULL;
}
@ -763,6 +789,11 @@ PyObject* objToJSONFile(PyObject* self, PyObject *args)
Py_XDECREF(write);
Py_XDECREF(argtuple);
Py_XDECREF(string);
PRINTMARK();
Py_RETURN_NONE;
}

@ -214,6 +214,14 @@ class UltraJSONTests(TestCase):
self.assertEquals(int(expected), json.loads(output))
self.assertEquals(int(expected), ujson.decode(output))
pass
def test_encodeToUTF8(self):
    # Raw UTF-8 bytes for U+65E5 followed by U+0448; with ensure_ascii
    # disabled the encoder is expected to pass them through unescaped.
    input = "\xe6\x97\xa5\xd1\x88"
    # Python spells the boolean "False"; a bare "false" is an undefined name.
    enc = ujson.encode(input, ensure_ascii=False)
    dec = ujson.decode(enc)
    # The reference encoder must also run with ensure_ascii=False, otherwise
    # it emits \uXXXX escapes and can never match the raw UTF-8 output.
    self.assertEquals(enc, json.dumps(input, encoding="utf-8", ensure_ascii=False))
    self.assertEquals(dec, json.loads(enc))
def test_encodeRecursionMax(self):
# 8 is the max recursion depth

@ -16,11 +16,11 @@ PyObject* JSONFileToObj(PyObject* self, PyObject *file);
static PyMethodDef ujsonMethods[] = {
{"encode", objToJSON, METH_O, "Converts arbitrary object recursivly into JSON"},
{"encode", objToJSON, METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursivly into JSON. Use ensure_ascii=false to output UTF-8"},
{"decode", JSONToObj, METH_O, "Converts JSON as string to dict object structure"},
{"dumps", objToJSON, METH_O, "Converts arbitrary object recursivly into JSON"},
{"dumps", objToJSON, METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursivly into JSON. Use ensure_ascii=false to output UTF-8"},
{"loads", JSONToObj, METH_O, "Converts JSON as string to dict object structure"},
{"dump", objToJSONFile, METH_VARARGS, "Converts arbitrary object recursively into JSON file"},
{"dump", objToJSONFile, METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursively into JSON file. Use ensure_ascii=false to output UTF-8"},
{"load", JSONFileToObj, METH_O, "Converts JSON as file to dict object structure"},
{NULL, NULL, 0, NULL} /* Sentinel */
};

@ -1 +1 @@
#define UJSON_VERSION "1.6"
#define UJSON_VERSION "1.7"

@ -165,6 +165,10 @@ typedef void *(*JSPFN_MALLOC)(size_t size);
/*
Callback type that takes a pointer and returns nothing. */
typedef void (*JSPFN_FREE)(void *pptr);
/*
Callback type that takes an existing pointer plus a size and returns a pointer. */
typedef void *(*JSPFN_REALLOC)(void *base, size_t size);
/*
Callback type for the string escaping routine stored in the encoder's
EscapeString member. It receives the object, the encoder (typed as void *),
the start of the input (io) and its end (end), and returns an int; callers
treat a zero return as failure.
NOTE(review): enc is presumably void * because the JSONObjectEncoder struct is
only declared after this typedef, while the functions assigned to it take a
JSONObjectEncoder * - confirm the pointer types are meant to differ. */
typedef int (*JSPFN_ESCAPESTRING)(JSOBJ obj, void *enc, const char *io, const char *end);
typedef struct __JSONObjectEncoder
{
void (*beginTypeContext)(JSOBJ obj, JSONTypeContext *tc);
@ -224,6 +228,10 @@ typedef struct __JSONObjectEncoder
Configuration for max decimals of double floating point numbers to encode (0-9) */
int doublePrecision;
/*
If true, output will be ASCII with all characters above 127 encoded as \uXXXX. If false, output will be UTF-8 or whatever charset the strings are supplied in */
int forceASCII;
/*
Set to an error message if an error occurred */
@ -237,6 +245,8 @@ typedef struct __JSONObjectEncoder
int heap;
int level;
JSPFN_ESCAPESTRING EscapeString;
} JSONObjectEncoder;

@ -52,17 +52,20 @@ Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights rese
static const double g_pow10[] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000};
static const char g_hexChars[] = "0123456789abcdef";
static const char g_escapeChars[] = "0123456789\\b\\t\\n\\f\\r\\\"\\\\";
static const char g_escapeChars[] = "0123456789\\b\\t\\n\\f\\r\\\"\\\\\\/";
/*
FIXME: While this is fine dandy and working it's a magic value mess which probably only the author understands.
Needs a cleanup and more documentation */
static const JSUINT8 g_utf8LengthLookup[256] =
/*
Table for pure ASCII output, escaping all characters above 127 as \uXXXX */
static const JSUINT8 g_asciiOutputTable[256] =
{
/* 0x00 */ 0, 30, 30, 30, 30, 30, 30, 30, 10, 12, 14, 30, 16, 18, 30, 30,
/* 0x10 */ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
/* 0x20 */ 1, 1, 20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 0x20 */ 1, 1, 20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 24,
/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 22, 1, 1, 1,
@ -78,6 +81,7 @@ static const JSUINT8 g_utf8LengthLookup[256] =
/* 0xf0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};
static void SetError (JSOBJ obj, JSONObjectEncoder *enc, const char *message)
{
enc->errorMsg = message;
@ -121,6 +125,71 @@ FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC Buffer_AppendShortHexUnchecked (c
*(outputOffset++) = g_hexChars[(value & 0x000f) >> 0];
}
/*
Appends the bytes in [io, end) to the encoder's output as the body of a JSON
string, without validating or transcoding them.

'"' and '\\' are backslash-escaped, \b \f \n \r \t use their short escapes,
every other byte below 0x20 is written as \u00XX, and all remaining bytes
(including those above 127) are copied through unchanged. '/' is deliberately
left unescaped.

obj  - unused by this function.
enc  - encoder whose write cursor (enc->offset) is read on entry and advanced
       past the written bytes on return.
io   - first input byte.
end  - one past the last input byte; the byte at end must be 0x00, since that
       terminator is what stops the loop.

Returns TRUE once the terminator at end has been reached.

No bounds checks are done on the output: the caller must already have reserved
enough room, and one input byte can expand to as many as six output bytes. */
int Buffer_EscapeStringUnvalidated (JSOBJ obj, JSONObjectEncoder *enc, const char *io, const char *end)
{
    char *of = (char *) enc->offset;

    for (;;)
    {
        /* Work on the unsigned value so bytes above 127 never compare as negative */
        const unsigned char chr = (unsigned char) *io;

        switch (chr)
        {
            case 0x00:
                if (io >= end)
                {
                    /* Terminator reached: publish the new write position */
                    enc->offset = of;
                    return TRUE;
                }

                /* A NUL before end is data, not the terminator; escape it
                instead of silently truncating the string */
                *(of++) = '\\';
                *(of++) = 'u';
                *(of++) = '0';
                *(of++) = '0';
                *(of++) = '0';
                *(of++) = '0';
                break;

            case '\"': *(of++) = '\\'; *(of++) = '\"'; break;
            case '\\': *(of++) = '\\'; *(of++) = '\\'; break;
            case '\b': *(of++) = '\\'; *(of++) = 'b'; break;
            case '\f': *(of++) = '\\'; *(of++) = 'f'; break;
            case '\n': *(of++) = '\\'; *(of++) = 'n'; break;
            case '\r': *(of++) = '\\'; *(of++) = 'r'; break;
            case '\t': *(of++) = '\\'; *(of++) = 't'; break;

            default:
                if (chr < 0x20)
                {
                    /* Remaining control characters have no short escape */
                    *(of++) = '\\';
                    *(of++) = 'u';
                    *(of++) = '0';
                    *(of++) = '0';
                    *(of++) = g_hexChars[chr >> 4];
                    *(of++) = g_hexChars[chr & 0x0f];
                }
                else
                {
                    *(of++) = *io;
                }
                break;
        }

        io++;
    }
}
/*
FIXME:
This code only works with Little and Big Endian
@ -128,24 +197,17 @@ This code only works with Little and Big Endian
FIXME: The JSON spec says escape "/" but none of the others do and we don't
want to be left alone doing it so we don't :)
FIXME: It should be faster to do SHIFT and then AND instead of AND and SHIFT.
Example:
(x & 0x3f00) >> 8) => Longer/more opcodes than below
(x >> 8) & 0x3f) => Probably faster/smaller
Seems that atleast MSVC9 does this optimization by itself from time to time. Not sure really
*/
int Buffer_EscapeString (JSOBJ obj, JSONObjectEncoder *enc, const char *io, const char *end)
int Buffer_EscapeStringValidated (JSOBJ obj, JSONObjectEncoder *enc, const char *io, const char *end)
{
JSUTF32 ucs;
char *of = (char *) enc->offset;
while (1)
{
//JSUINT8 chr = (unsigned char) *io;
JSUINT8 utflen = g_utf8LengthLookup[(unsigned char) *io];
JSUINT8 utflen = g_asciiOutputTable[(unsigned char) *io];
switch (utflen)
{
@ -277,6 +339,7 @@ int Buffer_EscapeString (JSOBJ obj, JSONObjectEncoder *enc, const char *io, cons
case 18:
case 20:
case 22:
//case 24: (enable for / escaping)
*(of++) = *( (char *) (g_escapeChars + utflen + 0));
*(of++) = *( (char *) (g_escapeChars + utflen + 1));
io ++;
@ -539,7 +602,7 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t cbName)
if (name)
{
Buffer_AppendCharUnchecked(enc, '\"');
if (!Buffer_EscapeString(obj, enc, name, name + cbName))
if (!enc->EscapeString(obj, enc, name, name + cbName))
{
return;
}
@ -680,7 +743,7 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t cbName)
Buffer_Reserve(enc, ((szlen / 4) + 1) * 12);
Buffer_AppendCharUnchecked (enc, '\"');
if (!Buffer_EscapeString(obj, enc, value, value + szlen))
if (!enc->EscapeString(obj, enc, value, value + szlen))
{
enc->endTypeContext(obj, &tc);
enc->level --;
@ -717,6 +780,16 @@ char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, size_t
enc->doublePrecision = JSON_DOUBLE_MAX_DECIMALS;
}
if (enc->forceASCII)
{
enc->EscapeString = Buffer_EscapeStringValidated;
}
else
{
enc->EscapeString = Buffer_EscapeStringUnvalidated;
}
if (_buffer == NULL)
{
_cbBuffer = 32768;