Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 82 additions & 9 deletions lua_cjson.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@

#include "strbuf.h"
#include "fpconv.h"
#include "utf8_decoder.h"

#ifndef CJSON_MODNAME
#define CJSON_MODNAME "cjson"
Expand All @@ -67,6 +68,7 @@
#define DEFAULT_ENCODE_INVALID_NUMBERS 0
#define DEFAULT_DECODE_INVALID_NUMBERS 1
#define DEFAULT_ENCODE_KEEP_BUFFER 1
#define DEFAULT_ENCODE_ESCAPE_UTF8 0
#define DEFAULT_ENCODE_NUMBER_PRECISION 14

#ifdef DISABLE_INVALID_NUMBERS
Expand Down Expand Up @@ -124,6 +126,7 @@ typedef struct {
int encode_invalid_numbers; /* 2 => Encode as "null" */
int encode_number_precision;
int encode_keep_buffer;
int encode_escape_utf8; /* 0, 1, -i -> char i */

int decode_invalid_numbers;
int decode_max_depth;
Expand Down Expand Up @@ -300,6 +303,29 @@ static int json_cfg_encode_number_precision(lua_State *l)
return json_integer_option(l, 1, &cfg->encode_number_precision, 1, 14);
}

/* Configures JSON encoding converting utf-8 text to \uxxxx
 *
 * Lua argument 1 (optional):
 * - false:  disable escaping (stored as 0)
 * - true:   escape non-ASCII characters (stored as 1)
 * - string: escape non-ASCII characters and use the first byte of the string
 *           as the replacement for invalid input (stored as the negated
 *           byte value; a leading NUL byte therefore stores 0, i.e. disabled)
 * - none/nil: leave the setting untouched
 *
 * Any other argument type, or an empty string, raises a Lua argument error
 * instead of being silently ignored.
 *
 * Returns 1 Lua value: the setting now in effect (false, true, or a
 * one-byte string holding the replacement character). */
static int json_cfg_encode_escape_utf8(lua_State *l)
{
    json_config_t *cfg = json_arg_init(l, 1);
    const char *str;
    size_t len;
    unsigned char place_char;

    switch (lua_type(l, 1)) {
    case LUA_TNONE:
    case LUA_TNIL:
        /* Query only: keep the current setting */
        break;
    case LUA_TBOOLEAN:
        cfg->encode_escape_utf8 = lua_toboolean(l, 1) ? 1 : 0;
        break;
    case LUA_TSTRING:
        str = lua_tolstring(l, 1, &len);
        luaL_argcheck(l, len > 0, 1, "replacement string must not be empty");
        /* Negate in signed arithmetic; negating an unsigned value would
         * wrap and rely on an implementation-defined conversion to int */
        cfg->encode_escape_utf8 = -(int)(unsigned char)str[0];
        break;
    default:
        return luaL_argerror(l, 1, "expected boolean or string");
    }

    /* Push the effective setting so the caller actually receives the
     * value announced by "return 1" */
    if (cfg->encode_escape_utf8 < 0) {
        place_char = (unsigned char)(-cfg->encode_escape_utf8);
        lua_pushlstring(l, (const char *)&place_char, 1);
    } else {
        lua_pushboolean(l, cfg->encode_escape_utf8);
    }

    return 1;
}

/* Configures JSON encoding buffer persistence */
static int json_cfg_encode_keep_buffer(lua_State *l)
{
Expand Down Expand Up @@ -389,6 +415,7 @@ static void json_create_config(lua_State *l)
cfg->encode_invalid_numbers = DEFAULT_ENCODE_INVALID_NUMBERS;
cfg->decode_invalid_numbers = DEFAULT_DECODE_INVALID_NUMBERS;
cfg->encode_keep_buffer = DEFAULT_ENCODE_KEEP_BUFFER;
cfg->encode_escape_utf8 = DEFAULT_ENCODE_ESCAPE_UTF8;
cfg->encode_number_precision = DEFAULT_ENCODE_NUMBER_PRECISION;

#if DEFAULT_ENCODE_KEEP_BUFFER > 0
Expand Down Expand Up @@ -454,17 +481,25 @@ static void json_encode_exception(lua_State *l, json_config_t *cfg, strbuf_t *js

/* json_append_string args:
* - lua_State
* - JSON config
* - JSON strbuf
* - String (Lua stack index)
*
* Returns nothing. Doesn't remove string from Lua stack */
static void json_append_string(lua_State *l, strbuf_t *json, int lindex)
static void json_append_string(lua_State *l, json_config_t *cfg, strbuf_t *json, int lindex)
{
const char *escstr;
int i;
const char *str;
size_t len;

/* for utf8 escape */
uint32_t prev_state = 0;
uint32_t state = 0;
uint32_t codepoint;
unsigned char utf8_place_char;
char utf8_buf[7];

str = lua_tolstring(l, lindex, &len);

/* Worst case is len * 6 (all unicode escapes).
Expand All @@ -474,13 +509,50 @@ static void json_append_string(lua_State *l, strbuf_t *json, int lindex)
strbuf_ensure_empty_length(json, len * 6 + 2);

strbuf_append_char_unsafe(json, '\"');
for (i = 0; i < len; i++) {
escstr = char2escape[(unsigned char)str[i]];
if (escstr)
strbuf_append_string(json, escstr);
else
strbuf_append_char_unsafe(json, str[i]);

if (cfg->encode_escape_utf8 < 0 || cfg->encode_escape_utf8 == 1) {
utf8_place_char = cfg->encode_escape_utf8 == 1 ? 0 : ((unsigned char) (-1 * cfg->encode_escape_utf8));
/* fprintf(stderr, "%c", utf8_place_char); */

for (i = 0; i < len; prev_state = state, ++i) {
switch (utf8_decode(&state, &codepoint, (unsigned char) str[i])) {
case UTF8_ACCEPT:
/* A properly encoded character has been found. */
if (codepoint >= 0 && codepoint <= 127) {
escstr = char2escape[(unsigned char) codepoint];
if (escstr)
strbuf_append_string(json, escstr);
else
strbuf_append_char_unsafe(json, codepoint);
} else {
snprintf(utf8_buf, 7, "\\u%04x", codepoint);
strbuf_append_string(json, utf8_buf);
}
break;

case UTF8_REJECT:
/* The byte is invalid, replace it and restart. */
if (! utf8_place_char) {
luaL_error(l, "utf8 string (%s) invalid at (%d)", str, i);
}

strbuf_append_char_unsafe(json, utf8_place_char);
state = UTF8_ACCEPT;
if (prev_state != UTF8_ACCEPT)
--i;
break;
}
}
} else {
for (i = 0; i < len; i++) {
escstr = char2escape[(unsigned char)str[i]];
if (escstr)
strbuf_append_string(json, escstr);
else
strbuf_append_char_unsafe(json, str[i]);
}
}

strbuf_append_char_unsafe(json, '\"');
}

Expand Down Expand Up @@ -645,7 +717,7 @@ static void json_append_object(lua_State *l, json_config_t *cfg,
json_append_number(l, cfg, json, -2);
strbuf_append_mem(json, "\":", 2);
} else if (keytype == LUA_TSTRING) {
json_append_string(l, json, -2);
json_append_string(l, cfg, json, -2);
strbuf_append_char(json, ':');
} else {
json_encode_exception(l, cfg, json, -2,
Expand All @@ -670,7 +742,7 @@ static void json_append_data(lua_State *l, json_config_t *cfg,

switch (lua_type(l, -1)) {
case LUA_TSTRING:
json_append_string(l, json, -1);
json_append_string(l, cfg, json, -1);
break;
case LUA_TNUMBER:
json_append_number(l, cfg, json, -1);
Expand Down Expand Up @@ -1357,6 +1429,7 @@ static int lua_cjson_new(lua_State *l)
{ "decode_max_depth", json_cfg_decode_max_depth },
{ "encode_number_precision", json_cfg_encode_number_precision },
{ "encode_keep_buffer", json_cfg_encode_keep_buffer },
{ "encode_escape_utf8", json_cfg_encode_escape_utf8 },
{ "encode_invalid_numbers", json_cfg_encode_invalid_numbers },
{ "decode_invalid_numbers", json_cfg_decode_invalid_numbers },
{ "new", lua_cjson_new },
Expand Down
23 changes: 23 additions & 0 deletions manual.txt
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ setting = cjson.encode_invalid_numbers([setting])
keep = cjson.encode_keep_buffer([keep])
depth = cjson.encode_max_depth([depth])
depth = cjson.decode_max_depth([depth])
escape = cjson.encode_escape_utf8([escape])
convert, ratio, safe = cjson.encode_sparse_array([convert[, ratio[, safe]]])
------------

Expand Down Expand Up @@ -328,6 +329,28 @@ The current setting is always returned, and is only updated when an
argument is provided.


[[encode_escape_utf8]]
encode_escape_utf8
~~~~~~~~~~~~~~~~~~

[source,lua]
------------
escape = cjson.encode_escape_utf8([escape])
-- escape can be true, false, or any char. Default: false.
------------

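When enabled, every non-ASCII UTF-8 character in a string is encoded as a
'\uXXXX' escape sequence instead of being emitted as raw bytes.

`escape` may be one of:

`false`:: Do not escape; strings are emitted as raw bytes (default).
`true`:: Escape non-ASCII characters, and raise an error if a string is not
valid UTF-8.
a single character:: Escape non-ASCII characters, and substitute the given
character for each invalid byte instead of raising an error.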

.Example: Escape utf-8 char
[source,lua]
value = { "中文" }
cjson.encode_escape_utf8(true)
json_text = cjson.encode(value)
-- Returns: '["\u4e2d\u6587"]'


[[encode]]
encode
~~~~~~
Expand Down
55 changes: 55 additions & 0 deletions utf8_decoder.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/*
* Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
* See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
* */

#include <stdint.h>

/* Guard the definitions below so the header can be included more than once
 * in the same translation unit. */
#ifndef UTF8_DECODER_H
#define UTF8_DECODER_H

/* Decoder states reported by utf8_decode(): UTF8_ACCEPT means a complete
 * code point has just been decoded (it is also the initial state),
 * UTF8_REJECT means the input is not valid UTF-8. Any other value means
 * more bytes are needed. */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12

static const uint8_t utf8d[] = {
/* The first part of the table maps bytes to character classes, which
 * reduces the size of the transition table and yields the bitmasks used
 * to extract the payload bits of a leading byte.
 * */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

/* The second part is a transition table that maps a combination
 * of a state of the automaton and a character class to a state.
 * */
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
12,36,12,12,12,12,12,12,12,12,12,12
};

/* Feeds one byte to the decoder.
 *
 * - state: decoder state, UTF8_ACCEPT before the first byte; updated in place
 * - codep: code point accumulator; holds the decoded code point whenever the
 *          returned state is UTF8_ACCEPT
 * - byte:  next input byte; only the low 8 bits are used
 *
 * Returns the new state (also stored in *state).
 *
 * Declared static inline: a plain "inline" definition in a header provides
 * no external definition in C99/C11, so calls the compiler chooses not to
 * inline would fail to link. */
static inline uint32_t
utf8_decode(uint32_t *state, uint32_t *codep, uint32_t byte) {
    uint32_t type;

    /* Keep the class lookup inside the first 256 table entries even when a
     * caller passes a sign-extended char */
    byte &= 0xffu;
    type = utf8d[byte];

    *codep = (*state != UTF8_ACCEPT) ?
        (byte & 0x3fu) | (*codep << 6) :
        (0xffu >> type) & (byte);

    *state = utf8d[256 + *state + type];
    return *state;
}

/* Checks whether the NUL-terminated byte string s is valid UTF-8.
 *
 * Returns 1 when every byte up to the terminator forms complete, valid
 * sequences (an empty string is valid), 0 otherwise.
 *
 * Declared static inline so that including this header from several
 * translation units does not produce duplicate definitions. */
static inline int
utf8_valide(const uint8_t *s) {
    uint32_t codepoint = 0;
    uint32_t state = UTF8_ACCEPT;

    /* UTF8_REJECT is a sink state in the transition table, so stopping at
     * the first rejection cannot change the result */
    while (*s && state != UTF8_REJECT)
        utf8_decode(&state, &codepoint, *s++);

    return state == UTF8_ACCEPT;
}

#endif /* UTF8_DECODER_H */