1
0
mirror of https://github.com/ultrajson/ultrajson.git synced 2024-11-24 05:12:02 +01:00

- Added forceASCII option to the encoder to control whether output is forced to be ASCII (all bytes < 128) or not.

- Added ensure_ascii keyword argument to encode/dumps (True is the default). Use ensure_ascii=False to allow UTF-8 strings to be output unescaped, which should be faster and more space efficient.

- Bumped version
This commit is contained in:
Jonas Tarnstrom 2011-09-09 16:00:23 +02:00
parent 6738e59af1
commit e962ae68e2
6 changed files with 145 additions and 23 deletions

@ -662,11 +662,15 @@ char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen)
} }
PyObject* objToJSON(PyObject* self, PyObject *arg) PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs)
{ {
static char *kwlist[] = { "ensure_ascii", NULL};
char buffer[65536]; char buffer[65536];
char *ret; char *ret;
PyObject *newobj; PyObject *newobj;
PyObject *oinput = NULL;
PyObject *oensureAscii = NULL;
JSONObjectEncoder encoder = JSONObjectEncoder encoder =
{ {
@ -687,9 +691,24 @@ PyObject* objToJSON(PyObject* self, PyObject *arg)
PyObject_Free, //JSPFN_FREE free; PyObject_Free, //JSPFN_FREE free;
-1, //recursionMax -1, //recursionMax
5, //default decimal precision 5, //default decimal precision
1, //forceAscii
}; };
PRINTMARK();
ret = JSON_EncodeObject (arg, &encoder, buffer, sizeof (buffer)); if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|O", kwlist, &oinput, &oensureAscii))
{
return PyErr_Format(PyExc_TypeError, "Expected object, **kw ensure_ascii true/false");
}
if (oensureAscii != NULL && !PyObject_IsTrue(oensureAscii))
{
encoder.forceASCII = 0;
}
PRINTMARK();
ret = JSON_EncodeObject (oinput, &encoder, buffer, sizeof (buffer));
PRINTMARK();
if (encoder.errorMsg) if (encoder.errorMsg)
@ -710,10 +729,12 @@ PyObject* objToJSON(PyObject* self, PyObject *arg)
encoder.free (ret); encoder.free (ret);
} }
PRINTMARK();
return newobj; return newobj;
} }
PyObject* objToJSONFile(PyObject* self, PyObject *args) PyObject* objToJSONFile(PyObject* self, PyObject *args, PyObject *kwargs)
{ {
PyObject *data; PyObject *data;
PyObject *file; PyObject *file;
@ -721,6 +742,8 @@ PyObject* objToJSONFile(PyObject* self, PyObject *args)
PyObject *write; PyObject *write;
PyObject *argtuple; PyObject *argtuple;
PRINTMARK();
if (!PyArg_ParseTuple (args, "OO", &data, &file)) { if (!PyArg_ParseTuple (args, "OO", &data, &file)) {
return NULL; return NULL;
} }
@ -739,11 +762,14 @@ PyObject* objToJSONFile(PyObject* self, PyObject *args)
return NULL; return NULL;
} }
string = objToJSON (self, data); argtuple = PyTuple_Pack(1, data);
string = objToJSON (self, argtuple, kwargs);
if (string == NULL) if (string == NULL)
{ {
Py_XDECREF(write); Py_XDECREF(write);
Py_XDECREF(argtuple);
return NULL; return NULL;
} }
@ -763,6 +789,11 @@ PyObject* objToJSONFile(PyObject* self, PyObject *args)
Py_XDECREF(write); Py_XDECREF(write);
Py_XDECREF(argtuple); Py_XDECREF(argtuple);
Py_XDECREF(string); Py_XDECREF(string);
PRINTMARK();
Py_RETURN_NONE; Py_RETURN_NONE;
} }

@ -214,6 +214,14 @@ class UltraJSONTests(TestCase):
self.assertEquals(int(expected), json.loads(output)) self.assertEquals(int(expected), json.loads(output))
self.assertEquals(int(expected), ujson.decode(output)) self.assertEquals(int(expected), ujson.decode(output))
pass pass
def test_encodeToUTF8(self):
    """Encoding with ensure_ascii=False must emit the UTF-8 bytes unescaped.

    The result is compared against the standard json module configured the
    same way, and must also round-trip through both decoders.
    """
    # UTF-8 byte sequences for U+65E5 followed by U+0448.
    utf8_input = "\xe6\x97\xa5\xd1\x88"
    # Python spells the boolean constant `False`; lowercase `false` is an
    # undefined name and would raise NameError before ujson is even called.
    enc = ujson.encode(utf8_input, ensure_ascii=False)
    dec = ujson.decode(enc)
    # The reference encoder has to be told not to escape either, otherwise it
    # produces \uXXXX sequences that can never equal the raw UTF-8 output.
    self.assertEquals(enc, json.dumps(utf8_input, encoding="utf-8", ensure_ascii=False))
    self.assertEquals(dec, json.loads(enc))
def test_encodeRecursionMax(self): def test_encodeRecursionMax(self):
# 8 is the max recursion depth # 8 is the max recursion depth

@ -16,11 +16,11 @@ PyObject* JSONFileToObj(PyObject* self, PyObject *file);
static PyMethodDef ujsonMethods[] = { static PyMethodDef ujsonMethods[] = {
{"encode", objToJSON, METH_O, "Converts arbitrary object recursivly into JSON"}, {"encode", objToJSON, METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursivly into JSON. Use ensure_ascii=false to output UTF-8"},
{"decode", JSONToObj, METH_O, "Converts JSON as string to dict object structure"}, {"decode", JSONToObj, METH_O, "Converts JSON as string to dict object structure"},
{"dumps", objToJSON, METH_O, "Converts arbitrary object recursivly into JSON"}, {"dumps", objToJSON, METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursivly into JSON. Use ensure_ascii=false to output UTF-8"},
{"loads", JSONToObj, METH_O, "Converts JSON as string to dict object structure"}, {"loads", JSONToObj, METH_O, "Converts JSON as string to dict object structure"},
{"dump", objToJSONFile, METH_VARARGS, "Converts arbitrary object recursively into JSON file"}, {"dump", objToJSONFile, METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursively into JSON file. Use ensure_ascii=false to output UTF-8"},
{"load", JSONFileToObj, METH_O, "Converts JSON as file to dict object structure"}, {"load", JSONFileToObj, METH_O, "Converts JSON as file to dict object structure"},
{NULL, NULL, 0, NULL} /* Sentinel */ {NULL, NULL, 0, NULL} /* Sentinel */
}; };

@ -1 +1 @@
#define UJSON_VERSION "1.6" #define UJSON_VERSION "1.7"

@ -165,6 +165,10 @@ typedef void *(*JSPFN_MALLOC)(size_t size);
typedef void (*JSPFN_FREE)(void *pptr); typedef void (*JSPFN_FREE)(void *pptr);
typedef void *(*JSPFN_REALLOC)(void *base, size_t size); typedef void *(*JSPFN_REALLOC)(void *base, size_t size);
typedef int (*JSPFN_ESCAPESTRING)(JSOBJ obj, void *enc, const char *io, const char *end);
typedef struct __JSONObjectEncoder typedef struct __JSONObjectEncoder
{ {
void (*beginTypeContext)(JSOBJ obj, JSONTypeContext *tc); void (*beginTypeContext)(JSOBJ obj, JSONTypeContext *tc);
@ -224,6 +228,10 @@ typedef struct __JSONObjectEncoder
Configuration for max decimals of double floating poiunt numbers to encode (0-9) */ Configuration for max decimals of double floating poiunt numbers to encode (0-9) */
int doublePrecision; int doublePrecision;
/*
If true, output will be ASCII with all characters above 127 encoded as \uXXXX. If false, output will be UTF-8 or whatever charset the input strings are supplied in */
int forceASCII;
/* /*
Set to an error message if error occured */ Set to an error message if error occured */
@ -237,6 +245,8 @@ typedef struct __JSONObjectEncoder
int heap; int heap;
int level; int level;
JSPFN_ESCAPESTRING EscapeString;
} JSONObjectEncoder; } JSONObjectEncoder;

@ -52,17 +52,20 @@ Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights rese
static const double g_pow10[] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000}; static const double g_pow10[] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000};
static const char g_hexChars[] = "0123456789abcdef"; static const char g_hexChars[] = "0123456789abcdef";
static const char g_escapeChars[] = "0123456789\\b\\t\\n\\f\\r\\\"\\\\"; static const char g_escapeChars[] = "0123456789\\b\\t\\n\\f\\r\\\"\\\\\\/";
/* /*
FIXME: While this is fine dandy and working it's a magic value mess which probably only the author understands. FIXME: While this is fine dandy and working it's a magic value mess which probably only the author understands.
Needs a cleanup and more documentation */ Needs a cleanup and more documentation */
static const JSUINT8 g_utf8LengthLookup[256] =
/*
Table for pure ASCII output, escaping all characters above 127 as \uXXXX */
static const JSUINT8 g_asciiOutputTable[256] =
{ {
/* 0x00 */ 0, 30, 30, 30, 30, 30, 30, 30, 10, 12, 14, 30, 16, 18, 30, 30, /* 0x00 */ 0, 30, 30, 30, 30, 30, 30, 30, 10, 12, 14, 30, 16, 18, 30, 30,
/* 0x10 */ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, /* 0x10 */ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
/* 0x20 */ 1, 1, 20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20 */ 1, 1, 20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 24,
/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 22, 1, 1, 1, /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 22, 1, 1, 1,
@ -78,6 +81,7 @@ static const JSUINT8 g_utf8LengthLookup[256] =
/* 0xf0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1 /* 0xf0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
}; };
static void SetError (JSOBJ obj, JSONObjectEncoder *enc, const char *message) static void SetError (JSOBJ obj, JSONObjectEncoder *enc, const char *message)
{ {
enc->errorMsg = message; enc->errorMsg = message;
@ -121,6 +125,71 @@ FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC Buffer_AppendShortHexUnchecked (c
*(outputOffset++) = g_hexChars[(value & 0x000f) >> 0]; *(outputOffset++) = g_hexChars[(value & 0x000f) >> 0];
} }
/*
Appends the bytes in [io, end) to the output at enc->offset as the contents of
a JSON string, without validating or transcoding the input.

- '"' and '\\' are backslash-escaped.
- \b, \f, \n, \r and \t use their two-character escapes.
- Every other byte below 0x20, including an embedded NUL, becomes \u00XX.
- All remaining bytes (0x20 and up, including bytes >= 0x80) are copied verbatim.

The string length is taken from `end`; the input does not need to be NUL
terminated and embedded NUL bytes no longer cut the output short.

No bounds checking is done here: up to six output bytes are written per input
byte, so the caller must already have reserved that much room at enc->offset.

obj is unused. Always returns TRUE, after advancing enc->offset past the
bytes written. */
int Buffer_EscapeStringUnvalidated (JSOBJ obj, JSONObjectEncoder *enc, const char *io, const char *end)
{
  char *of = (char *) enc->offset;

  for (; io < end; io++)
  {
    /* Work on the unsigned value so bytes >= 0x80 never compare as negative. */
    const unsigned char chr = (unsigned char) *io;

    switch (chr)
    {
      case '\"': (*of++) = '\\'; (*of++) = '\"'; break;
      case '\\': (*of++) = '\\'; (*of++) = '\\'; break;
      /* '/' is deliberately left unescaped. */
      case '\b': (*of++) = '\\'; (*of++) = 'b'; break;
      case '\f': (*of++) = '\\'; (*of++) = 'f'; break;
      case '\n': (*of++) = '\\'; (*of++) = 'n'; break;
      case '\r': (*of++) = '\\'; (*of++) = 'r'; break;
      case '\t': (*of++) = '\\'; (*of++) = 't'; break;

      default:
        if (chr < 0x20)
        {
          /* Remaining control characters (0x00-0x1f) have no short escape. */
          *(of++) = '\\';
          *(of++) = 'u';
          *(of++) = '0';
          *(of++) = '0';
          *(of++) = g_hexChars[(chr & 0xf0) >> 4];
          *(of++) = g_hexChars[chr & 0x0f];
        }
        else
        {
          (*of++) = (char) chr;
        }
        break;
    }
  }

  enc->offset += (of - enc->offset);
  return TRUE;
}
/* /*
FIXME: FIXME:
This code only works with Little and Big Endian This code only works with Little and Big Endian
@ -128,24 +197,17 @@ This code only works with Little and Big Endian
FIXME: The JSON spec says escape "/" but non of the others do and we don't FIXME: The JSON spec says escape "/" but non of the others do and we don't
want to be left alone doing it so we don't :) want to be left alone doing it so we don't :)
FIXME: It should be faster to do SHIFT and then AND instead of AND and SHIFT.
Example:
(x & 0x3f00) >> 8) => Longer/more opcodes than below
(x >> 8) & 0x3f) => Probably faster/smaller
Seems that atleast MSVC9 does this optimization by itself from time to time. Not sure really
*/ */
int Buffer_EscapeStringValidated (JSOBJ obj, JSONObjectEncoder *enc, const char *io, const char *end)
int Buffer_EscapeString (JSOBJ obj, JSONObjectEncoder *enc, const char *io, const char *end)
{ {
JSUTF32 ucs; JSUTF32 ucs;
char *of = (char *) enc->offset; char *of = (char *) enc->offset;
while (1) while (1)
{ {
//JSUINT8 chr = (unsigned char) *io; //JSUINT8 chr = (unsigned char) *io;
JSUINT8 utflen = g_utf8LengthLookup[(unsigned char) *io]; JSUINT8 utflen = g_asciiOutputTable[(unsigned char) *io];
switch (utflen) switch (utflen)
{ {
@ -277,6 +339,7 @@ int Buffer_EscapeString (JSOBJ obj, JSONObjectEncoder *enc, const char *io, cons
case 18: case 18:
case 20: case 20:
case 22: case 22:
//case 24: (enable for / escaping)
*(of++) = *( (char *) (g_escapeChars + utflen + 0)); *(of++) = *( (char *) (g_escapeChars + utflen + 0));
*(of++) = *( (char *) (g_escapeChars + utflen + 1)); *(of++) = *( (char *) (g_escapeChars + utflen + 1));
io ++; io ++;
@ -539,7 +602,7 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t cbName)
if (name) if (name)
{ {
Buffer_AppendCharUnchecked(enc, '\"'); Buffer_AppendCharUnchecked(enc, '\"');
if (!Buffer_EscapeString(obj, enc, name, name + cbName)) if (!enc->EscapeString(obj, enc, name, name + cbName))
{ {
return; return;
} }
@ -680,7 +743,7 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t cbName)
Buffer_Reserve(enc, ((szlen / 4) + 1) * 12); Buffer_Reserve(enc, ((szlen / 4) + 1) * 12);
Buffer_AppendCharUnchecked (enc, '\"'); Buffer_AppendCharUnchecked (enc, '\"');
if (!Buffer_EscapeString(obj, enc, value, value + szlen)) if (!enc->EscapeString(obj, enc, value, value + szlen))
{ {
enc->endTypeContext(obj, &tc); enc->endTypeContext(obj, &tc);
enc->level --; enc->level --;
@ -717,6 +780,16 @@ char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, size_t
enc->doublePrecision = JSON_DOUBLE_MAX_DECIMALS; enc->doublePrecision = JSON_DOUBLE_MAX_DECIMALS;
} }
if (enc->forceASCII)
{
enc->EscapeString = Buffer_EscapeStringValidated;
}
else
{
enc->EscapeString = Buffer_EscapeStringUnvalidated;
}
if (_buffer == NULL) if (_buffer == NULL)
{ {
_cbBuffer = 32768; _cbBuffer = 32768;