From 21a8a295cb30eadb3d490aa706ee3d35818b80e0 Mon Sep 17 00:00:00 2001
From: Harry Roberts
Date: Tue, 27 Nov 2012 18:43:54 +0000
Subject: [PATCH] UTF-8 encoding stol... ahem, *borrowed* from akheron/jansson

I realized it is no fun to write this stuff, so I re-used what is presumably a
well tested library.
---
 parson.c | 260 +++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 247 insertions(+), 13 deletions(-)

diff --git a/parson.c b/parson.c
index f7c009f..7a8d318 100644
--- a/parson.c
+++ b/parson.c
@@ -651,6 +651,250 @@ void json_value_free(JSON_Value *value) {
 
 /* -------------------------------------------------------------------------- */
 
+/*
+ * Copyright (c) 2009-2012 Petri Lehtinen
+ *
+ * Jansson is free software; you can redistribute it and/or modify
+ * it under the terms of the MIT license. See LICENSE for details.
+ */
+
+/*
+ * Classify the lead byte of a UTF-8 sequence.
+ * Returns the sequence length (1-4), or 0 if the byte cannot start a valid
+ * sequence (continuation byte, overlong lead 0xC0/0xC1, or 0xF5 and above).
+ */
+int utf8_check_first(char byte)
+{
+    unsigned char u = (unsigned char)byte;
+
+    if(u < 0x80)
+        return 1;
+
+    if(0x80 <= u && u <= 0xBF) {
+        /* second, third or fourth byte of a multi-byte
+           sequence, i.e. a "continuation byte" */
+        return 0;
+    }
+    else if(u == 0xC0 || u == 0xC1) {
+        /* overlong encoding of an ASCII byte */
+        return 0;
+    }
+    else if(0xC2 <= u && u <= 0xDF) {
+        /* 2-byte sequence */
+        return 2;
+    }
+
+    else if(0xE0 <= u && u <= 0xEF) {
+        /* 3-byte sequence */
+        return 3;
+    }
+    else if(0xF0 <= u && u <= 0xF4) {
+        /* 4-byte sequence */
+        return 4;
+    }
+    else { /* u >= 0xF5 */
+        /* Restricted (start of 4-, 5- or 6-byte sequence) or invalid
+           UTF-8 */
+        return 0;
+    }
+}
+
+/*
+ * Validate a multi-byte UTF-8 sequence of `size` (2-4) bytes at `buffer`.
+ * Returns 1 and stores the decoded value in *codepoint (if non-NULL) on
+ * success; returns 0 for any other size, a bad continuation byte, a value
+ * above 0x10FFFF, a UTF-16 surrogate, or an overlong encoding.
+ */
+int utf8_check_full(const char *buffer, int size, int *codepoint)
+{
+    int i;
+    int value = 0;
+    unsigned char u = (unsigned char)buffer[0];
+
+    if(size == 2)
+    {
+        value = u & 0x1F;
+    }
+    else if(size == 3)
+    {
+        value = u & 0xF;
+    }
+    else if(size == 4)
+    {
+        value = u & 0x7;
+    }
+    else
+        return 0;
+
+    for(i = 1; i < size; i++)
+    {
+        u = (unsigned char)buffer[i];
+
+        if(u < 0x80 || u > 0xBF) {
+            /* not a continuation byte */
+            return 0;
+        }
+
+        value = (value << 6) + (u & 0x3F);
+    }
+
+    if(value > 0x10FFFF) {
+        /* not in Unicode range */
+        return 0;
+    }
+
+    else if(0xD800 <= value && value <= 0xDFFF) {
+        /* invalid code point (UTF-16 surrogate halves) */
+        return 0;
+    }
+
+    else if((size == 2 && value < 0x80) ||
+            (size == 3 && value < 0x800) ||
+            (size == 4 && value < 0x10000)) {
+        /* overlong encoding */
+        return 0;
+    }
+
+    if(codepoint)
+        *codepoint = value;
+
+    return 1;
+}
+
+/*
+ * Decode one code point from the NUL-terminated string `buffer`.
+ * Returns a pointer just past the decoded sequence, `buffer` itself when it
+ * points at the terminating NUL, or NULL on invalid UTF-8. *codepoint (if
+ * non-NULL) is written only when a code point was actually decoded.
+ */
+const char *utf8_iterate(const char *buffer, int *codepoint)
+{
+    int count;
+    int value;
+
+    if(!*buffer)
+        return buffer;
+
+    count = utf8_check_first(buffer[0]);
+    if(count <= 0)
+        return NULL;
+
+    if(count == 1)
+        value = (unsigned char)buffer[0];
+    else
+    {
+        if(!utf8_check_full(buffer, count, &value))
+            return NULL;
+    }
+
+    if(codepoint)
+        *codepoint = value;
+
+    return buffer + count;
+}
+
+
+/*
+ * Write `str` as a JSON string literal through the `dump` callback.
+ * Backslash, double quote, slash and control characters are escaped, and
+ * every non-ASCII code point is written as a 4-hex-digit Unicode escape (a
+ * UTF-16 surrogate pair when outside the BMP). If `str` contains invalid
+ * UTF-8, output stops at the first bad byte and the literal is still closed.
+ */
+static void json_serialize_string(const char *str, json_print_cb dump, void *data)
+{
+    const char *pos, *end;
+    int codepoint;
+
+    dump("\"", 1, data);
+
+    end = pos = str;
+    while(1)
+    {
+        const char *text;
+        char seq[13];
+        int length;
+
+        while(*end)
+        {
+            end = utf8_iterate(pos, &codepoint);
+            if(!end) {
+                /* invalid UTF-8: stop here so the closing quote is still written */
+                end = pos;
+                break;
+            }
+
+            /* mandatory escape or control char */
+            if(codepoint == '\\' || codepoint == '"' || codepoint < 0x20)
+                break;
+
+            /* slash */
+            if(codepoint == '/')
+                break;
+
+            /* non-ASCII */
+            if(codepoint > 0x7F)
+                break;
+
+            pos = end;
+        }
+
+        if(pos != str) {
+            dump(str, pos - str, data);
+        }
+
+        if(end == pos)
+            break;
+
+        /* handle \, /, ", and control codes */
+        length = 2;
+        switch(codepoint)
+        {
+            case '\\': text = "\\\\"; break;
+            case '\"': text = "\\\""; break;
+            case '\b': text = "\\b"; break;
+            case '\f': text = "\\f"; break;
+            case '\n': text = "\\n"; break;
+            case '\r': text = "\\r"; break;
+            case '\t': text = "\\t"; break;
+            case '/':  text = "\\/"; break;
+            default:
+            {
+                /* codepoint is in BMP */
+                if(codepoint < 0x10000)
+                {
+                    sprintf(seq, "\\u%04x", codepoint);
+                    length = 6;
+                }
+
+                /* not in BMP -> construct a UTF-16 surrogate pair */
+                else
+                {
+                    int first, last;
+
+                    codepoint -= 0x10000;
+                    first = 0xD800 | ((codepoint & 0xffc00) >> 10);
+                    last = 0xDC00 | (codepoint & 0x003ff);
+
+                    sprintf(seq, "\\u%04x\\u%04x", first, last);
+                    length = 12;
+                }
+
+                text = seq;
+                break;
+            }
+        }
+
+        dump(text, length, data);
+
+        str = pos = end;
+    }
+
+    dump("\"", 1, data);
+}
+
+/* -------------------------------------------------------------------------- */
+
 struct _serialize_str {
     char *str;
     size_t len;
@@ -659,7 +903,8 @@ struct _serialize_str {
 /*
  * Callback which appends data to the string
  */
-void _serialize_str_cb( const char *str, size_t len, void *data ) {
+static void
+_serialize_str_cb( const char *str, size_t len, void *data ) {
     struct _serialize_str *self = (struct _serialize_str*)data;
     if( self->str == NULL ) {
         self->str = parson_malloc(len + 1);
@@ -677,16 +922,6 @@ void _serialize_str_cb( const char *str, size_t len, void *data ) {
     }
 }
 
-static void
-json_serialize_string( const char *str, json_print_cb cb, void *arg ) {
-    cb("\"", 1, arg);
-    /* TODO: handle unicode encoding of string?
-     * XXX: need to be aware of unicode escape sequences so that we don't mess up
-     */
-    cb(str, strlen(str), arg);
-    cb("\"", 1, arg);
-}
-
 static void
 json_serialize_number( const JSON_Value *value, json_print_cb cb, void *arg ) {
     char buf[64];
@@ -771,8 +1006,7 @@ json_serialize_cb( const JSON_Value *value, json_print_cb cb, void *arg ) {
         break;
 
     default:
-        printf("Got unknown type!\n");
-        /* XXX: this shouldn't ever happen! */
+        assert( 0 ); /* XXX: this shouldn't ever happen! */
         break;
     }
 }