From f08b4a7f7d8c6a99330cdada290ed2d0f248fed1 Mon Sep 17 00:00:00 2001 From: Michael R Sweet Date: Sun, 21 Dec 2003 15:01:15 +0000 Subject: [PATCH] More character entity fun, along with a reworking of UTF-8 support. --- CHANGES | 5 +- README | 2 +- index.html | 2 +- mxml-file.c | 801 ++++++++++++++++++++++++++-------------------------- 4 files changed, 410 insertions(+), 400 deletions(-) diff --git a/CHANGES b/CHANGES index 965d83d..3b00a94 100644 --- a/CHANGES +++ b/CHANGES @@ -1,4 +1,4 @@ -README - 12/18/2003 +README - 12/21/2003 ------------------- CHANGES IN Mini-XML 1.3 @@ -6,7 +6,8 @@ CHANGES IN Mini-XML 1.3 - Fixes for mxmldoc. - Added support for reading standard HTML entity names. - mxmlLoadString/File() did not decode character - entities in attribute names or values. + entities in element names, attribute names, or + attribute values. - mxmlLoadString/File() would crash when loading non- conformant XML data under an existing parent (top) node. diff --git a/README b/README index ee153eb..78cb4c2 100644 --- a/README +++ b/README @@ -1,4 +1,4 @@ -README - 12/01/2003 +README - 12/21/2003 ------------------- diff --git a/index.html b/index.html index 3fbe165..4d46962 100644 --- a/index.html +++ b/index.html @@ -16,7 +16,7 @@ href="../index.html">Back to Home Page ]

Mini-XML Home Page

-

Current Release: v1.3, December 1, 2003
+

Current Release: v1.3, December 21, 2003
Download Source (.tar.gz 77k) | = (*buffer + *bufsize - 1)) + if (*bufptr >= (*buffer + *bufsize - 4)) { /* * Increase the size of the buffer... @@ -315,12 +324,102 @@ mxml_add_char(int ch, /* I - Character to add */ *buffer = newbuffer; } - *(*bufptr)++ = ch; + if (ch < 128) + { + /* + * Single byte ASCII... + */ + + *(*bufptr)++ = ch; + } + else if (ch < 2048) + { + /* + * Two-byte UTF-8... + */ + + *(*bufptr)++ = 0xc0 | (ch >> 6); + *(*bufptr)++ = 0x80 | (ch & 0x3f); + } + else if (ch < 65536) + { + /* + * Three-byte UTF-8... + */ + + *(*bufptr)++ = 0xe0 | (ch >> 12); + *(*bufptr)++ = 0x80 | ((ch >> 6) & 0x3f); + *(*bufptr)++ = 0x80 | (ch & 0x3f); + } + else + { + /* + * Four-byte UTF-8... + */ + + *(*bufptr)++ = 0xf0 | (ch >> 18); + *(*bufptr)++ = 0x80 | ((ch >> 12) & 0x3f); + *(*bufptr)++ = 0x80 | ((ch >> 6) & 0x3f); + *(*bufptr)++ = 0x80 | (ch & 0x3f); + } return (0); } +/* + * 'mxml_get_entity()' - Get the character corresponding to an entity... + */ + +static int /* O - Character value or EOF on error */ +mxml_get_entity(mxml_node_t *parent, /* I - Parent node */ + void *p, /* I - Pointer to source */ + int (*getc_cb)(void *)) + /* I - Get character function */ +{ + int ch; /* Current character */ + char entity[64], /* Entity string */ + *entptr; /* Pointer into entity */ + + + entptr = entity; + + while ((ch = (*getc_cb)(p)) != EOF) + if (!isalnum(ch) && ch != '#') + break; + else if (entptr < (entity + sizeof(entity) - 1)) + *entptr++ = ch; + else + { + mxml_error("Entity name too long under parent <%s>!", + parent ? parent->value.element.name : "null"); + break; + } + + *entptr = '\0'; + + if (ch != ';') + { + mxml_error("Character entity \"%s\" not terminated under parent <%s>!", + entity, parent ? parent->value.element.name : "null"); + return (EOF); + } + + if (entity[1] == '#') + { + if (entity[2] == 'x') + ch = strtol(entity + 3, NULL, 16); + else + ch = strtol(entity + 2, NULL, 10); + } + else if ((ch = mxmlEntityGetValue(entity)) < 0) + mxml_error("Entity name \"%s;\" not supported under parent <%s>!", + entity, parent ? parent->value.element.name : "null"); + + return (ch); +} + + /* * 'mxml_file_getc()' - Get a character from a file. */ @@ -328,7 +427,77 @@ mxml_add_char(int ch, /* I - Character to add */ static int /* O - Character or EOF */ mxml_file_getc(void *p) /* I - Pointer to file */ { - return (getc((FILE *)p)); + int ch, /* Character from file */ + temp; /* Temporary character */ + FILE *fp; /* Pointer to file */ + + + /* + * Read a character from the file and see if it is EOF or ASCII... + */ + + fp = (FILE *)p; + ch = getc(fp); + + if (ch == EOF || !(ch & 0x80)) + return (ch); + + /* + * Got a UTF-8 character; convert UTF-8 to Unicode and return... + */ + + if ((ch & 0xe0) == 0xc0) + { + /* + * Two-byte value... + */ + + if ((temp = getc(fp)) == EOF || (temp & 0xc0) != 0x80) + return (EOF); + + ch = ((ch & 0x1f) << 6) | (temp & 0x3f); + } + else if ((ch & 0xf0) == 0xe0) + { + /* + * Three-byte value... + */ + + if ((temp = getc(fp)) == EOF || (temp & 0xc0) != 0x80) + return (EOF); + + ch = ((ch & 0x0f) << 6) | (temp & 0x3f); + + if ((temp = getc(fp)) == EOF || (temp & 0xc0) != 0x80) + return (EOF); + + ch = (ch << 6) | (temp & 0x3f); + } + else if ((ch & 0xf8) == 0xf0) + { + /* + * Four-byte value... + */ + + if ((temp = getc(fp)) == EOF || (temp & 0xc0) != 0x80) + return (EOF); + + ch = ((ch & 0x07) << 6) | (temp & 0x3f); + + if ((temp = getc(fp)) == EOF || (temp & 0xc0) != 0x80) + return (EOF); + + ch = (ch << 6) | (temp & 0x3f); + + if ((temp = getc(fp)) == EOF || (temp & 0xc0) != 0x80) + return (EOF); + + ch = (ch << 6) | (temp & 0x3f); + } + else + return (EOF); + + return (ch); } @@ -340,7 +509,47 @@ static int /* O - 0 on success, -1 on failure */ mxml_file_putc(int ch, /* I - Character to write */ void *p) /* I - Pointer to file */ { - return (putc(ch, (FILE *)p)); + if (ch < 128) + return (putc(ch, (FILE *)p)); + else if (ch < 2048) + { + /* + * Two-byte UTF-8 character... + */ + + if (putc(0xc0 | (ch >> 6), (FILE *)p) < 0) + return (-1); + else + return (putc(0x80 | (ch & 0x3f), (FILE *)p)); + } + else if (ch < 65536) + { + /* + * Three-byte UTF-8 character... + */ + + if (putc(0xe0 | (ch >> 12), (FILE *)p) < 0) + return (-1); + else if (putc(0x80 | ((ch >> 6) & 0x3f), (FILE *)p) < 0) + return (-1); + else + return (putc(0x80 | (ch & 0x3f), (FILE *)p)); + } + else + { + /* + * Four-byte UTF-8 character... + */ + + if (putc(0xf0 | (ch >> 18), (FILE *)p) < 0) + return (-1); + else if (putc(0x80 | ((ch >> 12) & 0x3f), (FILE *)p) < 0) + return (-1); + else if (putc(0x80 | ((ch >> 6) & 0x3f), (FILE *)p) < 0) + return (-1); + else + return (putc(0x80 | (ch & 0x3f), (FILE *)p)); + } } @@ -470,11 +679,16 @@ mxml_load_data(mxml_node_t *top, /* I - Top node */ while ((ch = (*getc_cb)(p)) != EOF) if (isspace(ch) || ch == '>' || (ch == '/' && bufptr > buffer)) break; - else if (mxml_add_char(ch, &bufptr, &buffer, &bufsize)) + else if (ch == '&') { - free(buffer); - return (NULL); + if ((ch = mxml_get_entity(parent, p, getc_cb)) == EOF) + goto error; + + if (mxml_add_char(ch, &bufptr, &buffer, &bufsize)) + goto error; } + else if (mxml_add_char(ch, &bufptr, &buffer, &bufsize)) + goto error; else if ((bufptr - buffer) == 3 && !strncmp(buffer, "!--", 3)) break; @@ -491,10 +705,14 @@ mxml_load_data(mxml_node_t *top, /* I - Top node */ if (ch == '>' && bufptr > (buffer + 4) && !strncmp(bufptr - 2, "--", 2)) break; - else if (mxml_add_char(ch, &bufptr, &buffer, &bufsize)) + else { - free(buffer); - return (NULL); + if (ch == '&') + if ((ch = mxml_get_entity(parent, p, getc_cb)) == EOF) + goto error; + + if (mxml_add_char(ch, &bufptr, &buffer, &bufsize)) + goto error; } } @@ -532,10 +750,14 @@ mxml_load_data(mxml_node_t *top, /* I - Top node */ { if (ch == '>') break; - else if (mxml_add_char(ch, &bufptr, &buffer, &bufsize)) + else { - free(buffer); - return (NULL); + if (ch == '&') + if ((ch = mxml_get_entity(parent, p, getc_cb)) == EOF) + goto error; + + if (mxml_add_char(ch, &bufptr, &buffer, &bufsize)) + goto error; } } while ((ch = (*getc_cb)(p)) != EOF); @@ -661,125 +883,14 @@ mxml_load_data(mxml_node_t *top, /* I - Top node */ else if (ch == '&') { /* - * Add character entity to current buffer... Currently we only - * support <, &, >,  , ", &#nnn;, and &#xXXXX;... + * Add character entity to current buffer... */ - char entity[64], /* Entity string */ - *entptr; /* Pointer into entity */ - - - entity[0] = ch; - entptr = entity + 1; - - while ((ch = (*getc_cb)(p)) != EOF) - if (!isalnum(ch) && ch != '#') - break; - else if (entptr < (entity + sizeof(entity) - 1)) - *entptr++ = ch; - else - { - mxml_error("Entity name too long under parent <%s>!", - parent ? parent->value.element.name : "null"); - break; - } - - *entptr = '\0'; - - if (ch != ';') - { - mxml_error("Entity name \"%s\" not terminated under parent <%s>!", - entity, parent ? parent->value.element.name : "null"); - break; - } - - if (entity[1] == '#') - { - if (entity[2] == 'x') - ch = strtol(entity + 3, NULL, 16); - else - ch = strtol(entity + 2, NULL, 10); - } - else if ((ch = mxmlEntityGetValue(entity)) < 0) - { - mxml_error("Entity name \"%s;\" not supported under parent <%s>!", - entity, parent ? parent->value.element.name : "null"); - break; - } - - if (ch < 128) - { - /* - * Plain ASCII doesn't need special encoding... - */ - - if (mxml_add_char(ch, &bufptr, &buffer, &bufsize)) - { - free(buffer); - return (NULL); - } - } - else - { - /* - * Use UTF-8 encoding for the Unicode char... - */ + if ((ch = mxml_get_entity(parent, p, getc_cb)) == EOF) + goto error; - if (ch < 2048) - { - if (mxml_add_char(0xc0 | (ch >> 6), &bufptr, &buffer, &bufsize)) - { - free(buffer); - return (NULL); - } - if (mxml_add_char(0x80 | (ch & 63), &bufptr, &buffer, &bufsize)) - { - free(buffer); - return (NULL); - } - } - else if (ch < 65536) - { - if (mxml_add_char(0xe0 | (ch >> 12), &bufptr, &buffer, &bufsize)) - { - free(buffer); - return (NULL); - } - if (mxml_add_char(0x80 | ((ch >> 6) & 63), &bufptr, &buffer, &bufsize)) - { - free(buffer); - return (NULL); - } - if (mxml_add_char(0x80 | (ch & 63), &bufptr, &buffer, &bufsize)) - { - free(buffer); - return (NULL); - } - } - else - { - if (mxml_add_char(0xf0 | (ch >> 18), &bufptr, &buffer, &bufsize)) - { - free(buffer); - return (NULL); - } - if (mxml_add_char(0x80 | ((ch >> 12) & 63), &bufptr, &buffer, &bufsize)) - { - free(buffer); - return (NULL); - } - if (mxml_add_char(0x80 | ((ch >> 6) & 63), &bufptr, &buffer, &bufsize)) - { - free(buffer); - return (NULL); - } - if (mxml_add_char(0x80 | (ch & 63), &bufptr, &buffer, &bufsize)) - { - free(buffer); - return (NULL); - } - } - } + if (mxml_add_char(ch, &bufptr, &buffer, &bufsize)) + goto error; } else if (type == MXML_OPAQUE || !isspace(ch)) { @@ -788,10 +899,7 @@ mxml_load_data(mxml_node_t *top, /* I - Top node */ */ if (mxml_add_char(ch, &bufptr, &buffer, &bufsize)) - { - free(buffer); - return (NULL); - } + goto error; } } @@ -812,6 +920,16 @@ mxml_load_data(mxml_node_t *top, /* I - Top node */ } return (parent); + + /* + * Common error return... + */ + +error: + + free(buffer); + + return (NULL); } @@ -832,8 +950,6 @@ mxml_parse_element(mxml_node_t *node, /* I - Element node */ *ptr; /* Pointer into name/value */ int namesize, /* Size of name string */ valsize; /* Size of value string */ - char entity[64], /* Entity string */ - *entptr; /* Pointer into entity */ @@ -917,12 +1033,12 @@ mxml_parse_element(mxml_node_t *node, /* I - Element node */ while ((ch = (*getc_cb)(p)) != EOF) { + if (ch == '&') + if ((ch = mxml_get_entity(node, p, getc_cb)) == EOF) + goto error; + if (mxml_add_char(ch, &ptr, &name, &namesize)) - { - free(name); - free(value); - return (EOF); - } + goto error; if (ch == quote) break; @@ -937,11 +1053,14 @@ mxml_parse_element(mxml_node_t *node, /* I - Element node */ while ((ch = (*getc_cb)(p)) != EOF) if (isspace(ch) || ch == '=' || ch == '/' || ch == '>' || ch == '?') break; - else if (mxml_add_char(ch, &ptr, &name, &namesize)) + else { - free(name); - free(value); - return (EOF); + if (ch == '&') + if ((ch = mxml_get_entity(node, p, getc_cb)) == EOF) + goto error; + + if (mxml_add_char(ch, &ptr, &name, &namesize)) + goto error; } } @@ -972,129 +1091,14 @@ mxml_parse_element(mxml_node_t *node, /* I - Element node */ while ((ch = (*getc_cb)(p)) != EOF) if (ch == quote) break; - else if (ch == '&') - { - /* - * Add character entity to current buffer... - */ - - entity[0] = ch; - entptr = entity + 1; - - while ((ch = (*getc_cb)(p)) != EOF) - if (!isalnum(ch) && ch != '#') - break; - else if (entptr < (entity + sizeof(entity) - 1)) - *entptr++ = ch; - else - { - mxml_error("Entity name too long under node <%s>!", - node ? node->value.element.name : "null"); - break; - } - - *entptr = '\0'; - - if (ch != ';') - { - mxml_error("Entity name \"%s\" not terminated under node <%s>!", - entity, node ? node->value.element.name : "null"); - break; - } - - if (entity[1] == '#') - { - if (entity[2] == 'x') - ch = strtol(entity + 3, NULL, 16); - else - ch = strtol(entity + 2, NULL, 10); - } - else if ((ch = mxmlEntityGetValue(entity)) < 0) - { - mxml_error("Entity name \"%s;\" not supported under node <%s>!", - entity, node ? node->value.element.name : "null"); - break; - } - - if (ch < 128) - { - /* - * Plain ASCII doesn't need special encoding... - */ - - if (mxml_add_char(ch, &ptr, &value, &valsize)) - { - free(value); - return (NULL); - } - } - else - { - /* - * Use UTF-8 encoding for the Unicode char... - */ - - if (ch < 2048) - { - if (mxml_add_char(0xc0 | (ch >> 6), &ptr, &value, &valsize)) - { - free(value); - return (NULL); - } - if (mxml_add_char(0x80 | (ch & 63), &ptr, &value, &valsize)) - { - free(value); - return (NULL); - } - } - else if (ch < 65536) - { - if (mxml_add_char(0xe0 | (ch >> 12), &ptr, &value, &valsize)) - { - free(value); - return (NULL); - } - if (mxml_add_char(0x80 | ((ch >> 6) & 63), &ptr, &value, &valsize)) - { - free(value); - return (NULL); - } - if (mxml_add_char(0x80 | (ch & 63), &ptr, &value, &valsize)) - { - free(value); - return (NULL); - } - } - else - { - if (mxml_add_char(0xf0 | (ch >> 18), &ptr, &value, &valsize)) - { - free(value); - return (NULL); - } - if (mxml_add_char(0x80 | ((ch >> 12) & 63), &ptr, &value, &valsize)) - { - free(value); - return (NULL); - } - if (mxml_add_char(0x80 | ((ch >> 6) & 63), &ptr, &value, &valsize)) - { - free(value); - return (NULL); - } - if (mxml_add_char(0x80 | (ch & 63), &ptr, &value, &valsize)) - { - free(value); - return (NULL); - } - } - } - } - else if (mxml_add_char(ch, &ptr, &value, &valsize)) + else { - free(name); - free(value); - return (EOF); + if (ch == '&') + if ((ch = mxml_get_entity(node, p, getc_cb)) == EOF) + goto error; + + if (mxml_add_char(ch, &ptr, &value, &valsize)) + goto error; } *ptr = '\0'; @@ -1111,129 +1115,14 @@ mxml_parse_element(mxml_node_t *node, /* I - Element node */ while ((ch = (*getc_cb)(p)) != EOF) if (isspace(ch) || ch == '=' || ch == '/' || ch == '>') break; - else if (ch == '&') - { - /* - * Add character entity to current buffer... - */ - - entity[0] = ch; - entptr = entity + 1; - - while ((ch = (*getc_cb)(p)) != EOF) - if (!isalnum(ch) && ch != '#') - break; - else if (entptr < (entity + sizeof(entity) - 1)) - *entptr++ = ch; - else - { - mxml_error("Entity name too long under node <%s>!", - node ? node->value.element.name : "null"); - break; - } - - *entptr = '\0'; - - if (ch != ';') - { - mxml_error("Entity name \"%s\" not terminated under node <%s>!", - entity, node ? node->value.element.name : "null"); - break; - } - - if (entity[1] == '#') - { - if (entity[2] == 'x') - ch = strtol(entity + 3, NULL, 16); - else - ch = strtol(entity + 2, NULL, 10); - } - else if ((ch = mxmlEntityGetValue(entity)) < 0) - { - mxml_error("Entity name \"%s;\" not supported under node <%s>!", - entity, node ? node->value.element.name : "null"); - break; - } - - if (ch < 128) - { - /* - * Plain ASCII doesn't need special encoding... - */ - - if (mxml_add_char(ch, &ptr, &value, &valsize)) - { - free(value); - return (NULL); - } - } - else - { - /* - * Use UTF-8 encoding for the Unicode char... - */ - - if (ch < 2048) - { - if (mxml_add_char(0xc0 | (ch >> 6), &ptr, &value, &valsize)) - { - free(value); - return (NULL); - } - if (mxml_add_char(0x80 | (ch & 63), &ptr, &value, &valsize)) - { - free(value); - return (NULL); - } - } - else if (ch < 65536) - { - if (mxml_add_char(0xe0 | (ch >> 12), &ptr, &value, &valsize)) - { - free(value); - return (NULL); - } - if (mxml_add_char(0x80 | ((ch >> 6) & 63), &ptr, &value, &valsize)) - { - free(value); - return (NULL); - } - if (mxml_add_char(0x80 | (ch & 63), &ptr, &value, &valsize)) - { - free(value); - return (NULL); - } - } - else - { - if (mxml_add_char(0xf0 | (ch >> 18), &ptr, &value, &valsize)) - { - free(value); - return (NULL); - } - if (mxml_add_char(0x80 | ((ch >> 12) & 63), &ptr, &value, &valsize)) - { - free(value); - return (NULL); - } - if (mxml_add_char(0x80 | ((ch >> 6) & 63), &ptr, &value, &valsize)) - { - free(value); - return (NULL); - } - if (mxml_add_char(0x80 | (ch & 63), &ptr, &value, &valsize)) - { - free(value); - return (NULL); - } - } - } - } - else if (mxml_add_char(ch, &ptr, &value, &valsize)) + else { - free(name); - free(value); - return (EOF); + if (ch == '&') + if ((ch = mxml_get_entity(node, p, getc_cb)) == EOF) + goto error; + + if (mxml_add_char(ch, &ptr, &value, &valsize)) + goto error; } *ptr = '\0'; @@ -1287,6 +1176,17 @@ mxml_parse_element(mxml_node_t *node, /* I - Element node */ free(value); return (ch); + + /* + * Common error return point... + */ + +error: + + free(name); + free(value); + + return (EOF); } @@ -1305,8 +1205,65 @@ mxml_string_getc(void *p) /* I - Pointer to file */ if ((ch = *s[0]) != 0) { + /* + * Got character; convert UTF-8 to integer and return... + */ + (*s)++; - return (ch); + + if (!(ch & 0x80)) + return (ch); + else if ((ch & 0xe0) == 0xc0) + { + /* + * Two-byte value... + */ + + if ((*s[0] & 0xc0) != 0x80) + return (EOF); + + ch = ((ch & 0x1f) << 6) | (*s[0] & 0x3f); + + (*s)++; + + return (ch); + } + else if ((ch & 0xf0) == 0xe0) + { + /* + * Three-byte value... + */ + + if ((*s[0] & 0xc0) != 0x80 || + (*s[1] & 0xc0) != 0x80) + return (EOF); + + ch = ((((ch & 0x0f) << 6) | (*s[0] & 0x3f)) << 6) | (*s[1] & 0x3f); + + (*s) += 2; + + return (ch); + } + else if ((ch & 0xf8) == 0xf0) + { + /* + * Four-byte value... + */ + + if ((*s[0] & 0xc0) != 0x80 || + (*s[1] & 0xc0) != 0x80 || + (*s[2] & 0xc0) != 0x80) + return (EOF); + + ch = ((((((ch & 0x07) << 6) | (*s[0] & 0x3f)) << 6) | + (*s[1] & 0x3f)) << 6) | (*s[2] & 0x3f); + + (*s) += 3; + + return (ch); + } + else + return (EOF); } else return (EOF); @@ -1326,10 +1283,62 @@ mxml_string_putc(int ch, /* I - Character to write */ pp = (char **)p; - if (pp[0] < pp[1]) - pp[0][0] = ch; + if (ch < 128) + { + /* + * Plain ASCII doesn't need special encoding... + */ + + if (pp[0] < pp[1]) + pp[0][0] = ch; + + pp[0] ++; + } + else if (ch < 2048) + { + /* + * Two-byte UTF-8 character... + */ - pp[0] ++; + if ((pp[0] + 1) < pp[1]) + { + pp[0][0] = 0xc0 | (ch >> 6); + pp[0][1] = 0x80 | (ch & 0x3f); + } + + pp[0] += 2; + } + else if (ch < 65536) + { + /* + * Three-byte UTF-8 character... + */ + + if ((pp[0] + 2) < pp[1]) + { + pp[0][0] = 0xe0 | (ch >> 12); + pp[0][1] = 0x80 | ((ch >> 6) & 0x3f); + pp[0][2] = 0x80 | (ch & 0x3f); + } + + pp[0] += 3; + } + else + { + /* + * Four-byte UTF-8 character... + */ + + if ((pp[0] + 2) < pp[1]) + { + pp[0][0] = 0xf0 | (ch >> 18); + pp[0][1] = 0x80 | ((ch >> 12) & 0x3f); + pp[0][2] = 0x80 | ((ch >> 6) & 0x3f); + pp[0][3] = 0x80 | (ch & 0x3f); + } + + pp[0] += 4; + } return (0); } @@ -1720,5 +1729,5 @@ mxml_write_ws(mxml_node_t *node, /* I - Current node */ /* - * End of "$Id: mxml-file.c,v 1.26 2003/12/19 02:56:11 mike Exp $". + * End of "$Id: mxml-file.c,v 1.27 2003/12/21 15:01:15 mike Exp $". */