From 503ad6ba9921d0c301d2234492e274351077e58b Mon Sep 17 00:00:00 2001
From: kindy lin
Date: Fri, 18 May 2012 16:24:10 +0800
Subject: [PATCH 1/3] Add encode_escape_utf8() method for escape utf8 to \uxxxx

---
 lua_cjson.c    | 89 +++++++++++++++++++++++++++++++++++++++++++++-----
 utf8_decoder.h | 55 +++++++++++++++++++++++++++++++
 2 files changed, 135 insertions(+), 9 deletions(-)
 create mode 100644 utf8_decoder.h

diff --git a/lua_cjson.c b/lua_cjson.c
index c14a1c5c..48995fdc 100644
--- a/lua_cjson.c
+++ b/lua_cjson.c
@@ -45,6 +45,7 @@
 
 #include "strbuf.h"
 #include "fpconv.h"
+#include "utf8_decoder.h"
 
 #ifndef CJSON_MODNAME
 #define CJSON_MODNAME "cjson"
@@ -124,6 +125,7 @@ typedef struct {
     int encode_invalid_numbers; /* 2 => Encode as "null" */
     int encode_number_precision;
     int encode_keep_buffer;
+    int encode_escape_utf8; /* 0 => off, 1 => on, -c => on, invalid bytes become char c */
 
     int decode_invalid_numbers;
     int decode_max_depth;
@@ -300,6 +302,29 @@ static int json_cfg_encode_number_precision(lua_State *l)
     return json_integer_option(l, 1, &cfg->encode_number_precision, 1, 14);
 }
 
+/* Configures escaping of UTF-8 text as \uXXXX when encoding. Accepts false,
+ * true, or a string whose first byte becomes the replacement character
+ * (stored negated); other types are ignored. Returns the current setting. */
+static int json_cfg_encode_escape_utf8(lua_State *l)
+{
+    json_config_t *cfg = json_arg_init(l, 1);
+    size_t len = 0;
+    const char *str = lua_type(l, 1) == LUA_TSTRING ? lua_tolstring(l, 1, &len) : NULL;
+
+    if (lua_type(l, 1) == LUA_TBOOLEAN)
+        cfg->encode_escape_utf8 = lua_toboolean(l, 1) ? 1 : 0;
+    else if (len > 0)
+        cfg->encode_escape_utf8 = -(int)(unsigned char)str[0];
+
+    /* Negative values hold the replacement byte: report it as a string */
+    if (cfg->encode_escape_utf8 < 0)
+        lua_pushfstring(l, "%c", -cfg->encode_escape_utf8);
+    else
+        lua_pushboolean(l, cfg->encode_escape_utf8);
+
+    return 1;
+}
+
 /* Configures JSON encoding buffer persistence */
 static int json_cfg_encode_keep_buffer(lua_State *l)
 {
@@ -454,17 +479,25 @@ static void json_encode_exception(lua_State *l, json_config_t *cfg, strbuf_t *js
 
 /* json_append_string args:
  * - lua_State
+ * - JSON config
  * - JSON strbuf
  * - String (Lua stack index)
  *
  * Returns nothing.
Doesn't remove string from Lua stack */
-static void json_append_string(lua_State *l, strbuf_t *json, int lindex)
+static void json_append_string(lua_State *l, json_config_t *cfg, strbuf_t *json, int lindex)
 {
     const char *escstr;
     int i;
     const char *str;
     size_t len;
 
+    /* UTF-8 escaping: decoder states, code point, replacement byte, \u text */
+    uint32_t prev_state = 0;
+    uint32_t state = 0;
+    uint32_t codepoint = 0;
+    unsigned char utf8_place_char;
+    char utf8_buf[13];
+
     str = lua_tolstring(l, lindex, &len);
 
     /* Worst case is len * 6 (all unicode escapes).
@@ -474,13 +507,84 @@ static void json_append_string(lua_State *l, strbuf_t *json, int lindex)
     strbuf_ensure_empty_length(json, len * 6 + 2);
 
     strbuf_append_char_unsafe(json, '\"');
-    for (i = 0; i < len; i++) {
-        escstr = char2escape[(unsigned char)str[i]];
-        if (escstr)
-            strbuf_append_string(json, escstr);
-        else
-            strbuf_append_char_unsafe(json, str[i]);
+
+    if (cfg->encode_escape_utf8 < 0 || cfg->encode_escape_utf8 == 1) {
+        /* Escape mode: decode the string as UTF-8 and emit every non-ASCII
+         * character as \uXXXX. A setting of 1 leaves no replacement byte, so
+         * invalid input raises an error; a negative setting is the negated
+         * replacement byte. */
+        char place_raw[2] = { 0, 0 };
+        const char *place_str;
+
+        utf8_place_char = cfg->encode_escape_utf8 == 1 ?
+            0 : (unsigned char)(-cfg->encode_escape_utf8);
+        /* Write the replacement escaped when JSON requires it (e.g. '"') */
+        place_raw[0] = (char)utf8_place_char;
+        place_str = char2escape[utf8_place_char] ?
+            char2escape[utf8_place_char] : place_raw;
+
+        for (i = 0; i < len; prev_state = state, ++i) {
+            switch (utf8_decode(&state, &codepoint, (unsigned char)str[i])) {
+            case UTF8_ACCEPT:
+                /* A complete, well-formed character has been decoded. */
+                if (codepoint <= 127) {
+                    escstr = char2escape[codepoint];
+                    if (escstr)
+                        strbuf_append_string(json, escstr);
+                    else
+                        strbuf_append_char_unsafe(json, (char)codepoint);
+                } else if (codepoint <= 0xFFFF) {
+                    snprintf(utf8_buf, sizeof(utf8_buf), "\\u%04x",
+                             (unsigned int)codepoint);
+                    strbuf_append_string(json, utf8_buf);
+                } else {
+                    /* A \u escape holds 16 bits: characters beyond the BMP
+                     * are written as a UTF-16 surrogate pair. */
+                    codepoint -= 0x10000;
+                    snprintf(utf8_buf, sizeof(utf8_buf), "\\u%04x\\u%04x",
+                             (unsigned int)(0xD800 + (codepoint >> 10)),
+                             (unsigned int)(0xDC00 + (codepoint & 0x3FF)));
+                    strbuf_append_string(json, utf8_buf);
+                }
+                break;
+
+            case UTF8_REJECT:
+                /* The byte is invalid, replace it and restart. */
+                if (!utf8_place_char) {
+                    if (!cfg->encode_keep_buffer) /* NOTE(review): assumed to match json_encode_exception() cleanup - confirm */
+                        strbuf_free(json);
+                    luaL_error(l, "utf8 string (%s) invalid at (%d)", str, i);
+                }
+
+                strbuf_append_string(json, place_str);
+                state = UTF8_ACCEPT;
+                /* The byte that broke a multi-byte sequence may itself start
+                 * a character: step back so it is decoded again. */
+                if (prev_state != UTF8_ACCEPT)
+                    --i;
+                break;
+            }
+        }
+
+        /* A sequence cut short by the end of the string is invalid too. */
+        if (state != UTF8_ACCEPT) {
+            if (!utf8_place_char) {
+                if (!cfg->encode_keep_buffer)
+                    strbuf_free(json);
+                luaL_error(l, "utf8 string (%s) invalid at (%d)", str, i);
+            }
+            strbuf_append_string(json, place_str);
+        }
+    } else {
+        for (i = 0; i < len; i++) {
+            escstr = char2escape[(unsigned char)str[i]];
+            if (escstr)
+                strbuf_append_string(json, escstr);
+            else
+                strbuf_append_char_unsafe(json, str[i]);
+        }
     }
+
     strbuf_append_char_unsafe(json, '\"');
 }
 
@@ -645,7 +749,7 @@ static void json_append_object(lua_State *l, json_config_t *cfg,
             json_append_number(l, cfg, json, -2);
             strbuf_append_mem(json, "\":", 2);
         } else if (keytype == LUA_TSTRING) {
-            json_append_string(l, json, -2);
+            json_append_string(l, cfg, json, -2);
             strbuf_append_char(json, ':');
         } else {
             json_encode_exception(l, cfg, json, -2,
@@ -670,7 +774,7 @@ static void json_append_data(lua_State *l, json_config_t *cfg,
 
     switch (lua_type(l, -1)) {
     case LUA_TSTRING:
-        json_append_string(l, json, -1);
+        json_append_string(l, cfg, json, -1);
         break;
     case LUA_TNUMBER:
         json_append_number(l, cfg, json, -1);
@@ -1357,6 +1461,7 @@ static int lua_cjson_new(lua_State *l)
         { "decode_max_depth", json_cfg_decode_max_depth },
         { "encode_number_precision", json_cfg_encode_number_precision },
         { "encode_keep_buffer", json_cfg_encode_keep_buffer },
+        { "encode_escape_utf8", json_cfg_encode_escape_utf8 },
         { "encode_invalid_numbers", json_cfg_encode_invalid_numbers },
         { "decode_invalid_numbers", json_cfg_decode_invalid_numbers },
         { "new", lua_cjson_new },
diff --git a/utf8_decoder.h b/utf8_decoder.h
new file mode 100644
index 00000000..07a1a353
--- /dev/null
+++ b/utf8_decoder.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2008-2010 Bjoern Hoehrmann
+ * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+ * */
+
+#include <stdint.h>
+
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 12
+
+static const uint8_t utf8d[] = {
+  /* The first part of the table maps bytes to character classes, which
+   * reduces the size of the transition table and creates bitmasks.
+   * */
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+
+  /* The second part is a transition table that maps a combination
+   * of a state of the automaton and a character class to a state.
+   * */
+  0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+  12,36,12,12,12,12,12,12,12,12,12,12
+};
+
+/* Feeds one byte (0..255) to the DFA; updates *state and *codep, returns *state. */
+static inline uint32_t utf8_decode(uint32_t *state, uint32_t *codep, uint32_t byte) {
+  uint32_t type = utf8d[byte];
+  /* Mid-sequence: shift in 6 payload bits. At a start: keep the bits the class leaves. */
+  *codep = (*state != UTF8_ACCEPT) ?
+    (byte & 0x3fu) | (*codep << 6) :
+    (0xff >> type) & (byte);
+  /* UTF8_ACCEPT: *codep is complete. UTF8_REJECT: invalid. Otherwise more bytes are needed. */
+  *state = utf8d[256 + *state + type];
+  return *state;
+}
+
+/* Returns 1 when the NUL-terminated string s is well-formed UTF-8, else 0. */
+static inline int utf8_valide(const uint8_t *s) {
+  uint32_t codepoint = 0, state = UTF8_ACCEPT;
+
+  while (*s)
+    utf8_decode(&state, &codepoint, *s++);
+
+  return state == UTF8_ACCEPT;
+}
+
From 18bcb78679ac07525e4a4a865971f4d34874e253 Mon Sep 17 00:00:00 2001
From: kindy lin
Date: Wed, 23 May 2012 11:34:21 +0800
Subject: [PATCH 2/3] Init encode_escape_utf8 in config init

---
 lua_cjson.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lua_cjson.c b/lua_cjson.c
index 48995fdc..b30717b4 100644
--- a/lua_cjson.c
+++ b/lua_cjson.c
@@ -68,6 +68,7 @@
 #define DEFAULT_ENCODE_INVALID_NUMBERS 0
 #define DEFAULT_DECODE_INVALID_NUMBERS 1
 #define DEFAULT_ENCODE_KEEP_BUFFER 1
+#define DEFAULT_ENCODE_ESCAPE_UTF8 0
 #define DEFAULT_ENCODE_NUMBER_PRECISION 14
 
 #ifdef DISABLE_INVALID_NUMBERS
@@ -414,6 +415,7 @@ static void json_create_config(lua_State *l)
     cfg->encode_invalid_numbers = DEFAULT_ENCODE_INVALID_NUMBERS;
     cfg->decode_invalid_numbers = DEFAULT_DECODE_INVALID_NUMBERS;
     cfg->encode_keep_buffer = DEFAULT_ENCODE_KEEP_BUFFER;
+    cfg->encode_escape_utf8 = DEFAULT_ENCODE_ESCAPE_UTF8;
     cfg->encode_number_precision = DEFAULT_ENCODE_NUMBER_PRECISION;
 
 #if DEFAULT_ENCODE_KEEP_BUFFER > 0
From 8317e466bce60b8e73c5c510d1da6b6bda0d2b04 Mon Sep 17 00:00:00 2001
From: Kindy Lin
Date: Thu, 13 Dec 2012 11:49:03 +0800
Subject: [PATCH 3/3] add manual for encode_escape_utf8

---
 manual.txt | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/manual.txt b/manual.txt
index a12e3785..5665c262 100644
--- a/manual.txt
+++ b/manual.txt
@@ -185,6 +185,7 @@ setting = cjson.encode_invalid_numbers([setting])
 keep = cjson.encode_keep_buffer([keep])
 depth = cjson.encode_max_depth([depth])
 depth = cjson.decode_max_depth([depth])
+escape = cjson.encode_escape_utf8([escape])
 convert, ratio, safe = cjson.encode_sparse_array([convert[, ratio[, safe]]])
 ------------
 
@@ -328,6 +329,28 @@ The current
setting is always returned, and is only updated when an
 argument is provided.
 
 
+[[encode_escape_utf8]]
+encode_escape_utf8
+~~~~~~~~~~~~~~~~~~
+
+[source,lua]
+------------
+escape = cjson.encode_escape_utf8([escape])
+-- "escape" must be true, false, or a single character. Default: false.
+------------
+
+Encodes every non-ASCII UTF-8 character as a `\uXXXX` escape sequence.
+
+When `escape` is a character, it replaces invalid UTF-8 bytes in the output; when `escape` is `true`, invalid UTF-8 raises an error.
+
+.Example: Escape UTF-8 characters
+[source,lua]
+value = { "中文" }
+cjson.encode_escape_utf8(true)
+json_text = cjson.encode(value)
+-- Returns: '["\u4e2d\u6587"]'
+
+
 [[encode]]
 encode
 ~~~~~~