diff options
-rw-r--r-- | LICENSE | 15 | ||||
-rw-r--r-- | xml.c | 454 | ||||
-rw-r--r-- | xml.h | 47 | ||||
-rw-r--r-- | xml2tsv.c | 228 |
4 files changed, 744 insertions, 0 deletions
@@ -0,0 +1,15 @@ +ISC License + +Copyright (c) 2020 Vincenzo "KatolaZ" Nicosia <katolaz@freaknet.org> + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. @@ -0,0 +1,454 @@ +#include <sys/types.h> + +#include <ctype.h> +#include <errno.h> +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "xml.h" + +static void +xml_parseattrs(XMLParser *x) +{ + size_t namelen = 0, valuelen; + int c, endsep, endname = 0, valuestart = 0; + + while ((c = GETNEXT()) != EOF) { + if (isspace(c)) { + if (namelen) + endname = 1; + continue; + } else if (c == '?') + ; /* ignore */ + else if (c == '=') { + x->name[namelen] = '\0'; + valuestart = 1; + endname = 1; + } else if (namelen && ((endname && !valuestart && isalpha(c)) || (c == '>' || c == '/'))) { + /* attribute without value */ + x->name[namelen] = '\0'; + if (x->xmlattrstart) + x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen); + if (x->xmlattr) + x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0); + if (x->xmlattrend) + x->xmlattrend(x, x->tag, x->taglen, x->name, namelen); + endname = 0; + x->name[0] = c; + namelen = 1; + } else if (namelen && valuestart) { + /* attribute with value */ + if (x->xmlattrstart) + x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen); + + valuelen = 0; + if (c == '\'' || c == '"') { + endsep = c; + } else { + endsep = ' '; /* isspace() */ + goto startvalue; + } + + while ((c = GETNEXT()) != EOF) { +startvalue: + if (c == '&') { /* entities */ + x->data[valuelen] = '\0'; + /* call data function with data before entity if there is data */ + if (valuelen && x->xmlattr) + x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + x->data[0] = c; + valuelen = 1; + while ((c = GETNEXT()) != EOF) { + if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) + break; + if (valuelen < sizeof(x->data) - 1) + x->data[valuelen++] = c; + else { + /* entity too long for buffer, handle as normal data */ + x->data[valuelen] = '\0'; + if (x->xmlattr) + x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + x->data[0] = c; + valuelen = 1; + break; + } + if (c == ';') { + x->data[valuelen] = '\0'; + if (x->xmlattrentity) + x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + valuelen = 0; + break; + } + } + } else if (c != endsep && !(endsep == ' ' && (c == '>' || isspace(c)))) { + if (valuelen < sizeof(x->data) - 1) { + x->data[valuelen++] = c; + } else { + x->data[valuelen] = '\0'; + if (x->xmlattr) + x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + x->data[0] = c; + valuelen = 1; + } + } + if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) { + x->data[valuelen] = '\0'; + if (x->xmlattr) + x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + if (x->xmlattrend) + x->xmlattrend(x, x->tag, x->taglen, x->name, namelen); + break; + } + } + namelen = endname = valuestart = 0; + } else if (namelen < sizeof(x->name) - 1) { + x->name[namelen++] = c; + } + if (c == '>') { + break; + } else if (c == '/') { + x->isshorttag = 1; + x->name[0] = '\0'; + namelen = 0; + } + } +} + +static void +xml_parsecomment(XMLParser *x) +{ + size_t datalen = 0, i = 0; + int c; + + if (x->xmlcommentstart) + x->xmlcommentstart(x); + while ((c = GETNEXT()) != EOF) { + if (c == '-' || c == '>') { + if (x->xmlcomment && datalen) { + x->data[datalen] = '\0'; + x->xmlcomment(x, x->data, datalen); + datalen = 0; + } + } + + if (c == '-') { + if (++i > 2) { + if (x->xmlcomment) + for (; i > 2; i--) + x->xmlcomment(x, "-", 1); + i = 2; + } + continue; + } else if (c == '>' && i == 2) { + if (x->xmlcommentend) + x->xmlcommentend(x); + return; + } else if (i) { + if (x->xmlcomment) { + for (; i > 0; i--) + x->xmlcomment(x, "-", 1); + } + i = 0; + } + + if (datalen < sizeof(x->data) - 1) { + x->data[datalen++] = c; + } else { + x->data[datalen] = '\0'; + if (x->xmlcomment) + x->xmlcomment(x, x->data, datalen); + x->data[0] = c; + datalen = 1; + } + } +} + +static void +xml_parsecdata(XMLParser *x) +{ + size_t datalen = 0, i = 0; + int c; + + if (x->xmlcdatastart) + x->xmlcdatastart(x); + while ((c = GETNEXT()) != EOF) { + if (c == ']' || c == '>') { + if (x->xmlcdata && datalen) { + x->data[datalen] = '\0'; + x->xmlcdata(x, x->data, datalen); + datalen = 0; + } + } + + if (c == ']') { + if (++i > 2) { + if (x->xmlcdata) + for (; i > 2; i--) + x->xmlcdata(x, "]", 1); + i = 2; + } + continue; + } else if (c == '>' && i == 2) { + if (x->xmlcdataend) + x->xmlcdataend(x); + return; + } else if (i) { + if (x->xmlcdata) + for (; i > 0; i--) + x->xmlcdata(x, "]", 1); + i = 0; + } + + if (datalen < sizeof(x->data) - 1) { + x->data[datalen++] = c; + } else { + x->data[datalen] = '\0'; + if (x->xmlcdata) + x->xmlcdata(x, x->data, datalen); + x->data[0] = c; + datalen = 1; + } + } +} + +static int +codepointtoutf8(long r, char *s) +{ + if (r == 0) { + return 0; /* NUL byte */ + } else if (r <= 0x7F) { + /* 1 byte: 0aaaaaaa */ + s[0] = r; + return 1; + } else if (r <= 0x07FF) { + /* 2 bytes: 00000aaa aabbbbbb */ + s[0] = 0xC0 | ((r & 0x0007C0) >> 6); /* 110aaaaa */ + s[1] = 0x80 | (r & 0x00003F); /* 10bbbbbb */ + return 2; + } else if (r <= 0xFFFF) { + /* 3 bytes: aaaabbbb bbcccccc */ + s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */ + s[1] = 0x80 | ((r & 0x000FC0) >> 6); /* 10bbbbbb */ + s[2] = 0x80 | (r & 0x00003F); /* 10cccccc */ + return 3; + } else { + /* 4 bytes: 000aaabb bbbbcccc ccdddddd */ + s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */ + s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */ + s[2] = 0x80 | ((r & 0x000FC0) >> 6); /* 10cccccc */ + s[3] = 0x80 | (r & 0x00003F); /* 10dddddd */ + return 4; + } +} + +static int +namedentitytostr(const char *e, char *buf, size_t bufsiz) +{ + static const struct { + const char *entity; + int c; + } entities[] = { + { "amp;", '&' }, + { "lt;", '<' }, + { "gt;", '>' }, + { "apos;", '\'' }, + { "quot;", '"' }, + }; + size_t i; + + /* buffer is too small */ + if (bufsiz < 2) + return -1; + + for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) { + if (!strcmp(e, entities[i].entity)) { + buf[0] = entities[i].c; + buf[1] = '\0'; + return 1; + } + } + return 0; +} + +static int +numericentitytostr(const char *e, char *buf, size_t bufsiz) +{ + long l; + int len; + char *end; + + /* buffer is too small */ + if (bufsiz < 5) + return -1; + + errno = 0; + /* hex (16) or decimal (10) */ + if (*e == 'x') + l = strtoul(e + 1, &end, 16); + else + l = strtoul(e, &end, 10); + /* invalid value or not a well-formed entity or too high codepoint */ + if (errno || *end != ';' || l > 0x10FFFF) + return 0; + len = codepointtoutf8(l, buf); + buf[len] = '\0'; + + return len; +} + +/* convert named- or numeric entity string to buffer string + * returns byte-length of string. */ +int +xml_entitytostr(const char *e, char *buf, size_t bufsiz) +{ + /* doesn't start with & */ + if (e[0] != '&') + return 0; + /* numeric entity */ + if (e[1] == '#') + return numericentitytostr(e + 2, buf, bufsiz); + else /* named entity */ + return namedentitytostr(e + 1, buf, bufsiz); +} + +void +xml_parse(XMLParser *x) +{ + size_t datalen, tagdatalen; + int c, isend; + + while ((c = GETNEXT()) != EOF && c != '<') + ; /* skip until < */ + + while (c != EOF) { + if (c == '<') { /* parse tag */ + if ((c = GETNEXT()) == EOF) + return; + + if (c == '!') { /* cdata and comments */ + for (tagdatalen = 0; (c = GETNEXT()) != EOF;) { + /* NOTE: sizeof(x->data) must be atleast sizeof("[CDATA[") */ + if (tagdatalen <= sizeof("[CDATA[") - 1) + x->data[tagdatalen++] = c; + if (c == '>') + break; + else if (c == '-' && tagdatalen == sizeof("--") - 1 && + (x->data[0] == '-')) { + xml_parsecomment(x); + break; + } else if (c == '[') { + if (tagdatalen == sizeof("[CDATA[") - 1 && + !strncmp(x->data, "[CDATA[", tagdatalen)) { + xml_parsecdata(x); + break; + } + } + } + } else { + /* normal tag (open, short open, close), processing instruction. */ + x->tag[0] = c; + x->taglen = 1; + x->isshorttag = isend = 0; + + /* treat processing instruction as shorttag, don't strip "?" prefix. */ + if (c == '?') { + x->isshorttag = 1; + } else if (c == '/') { + if ((c = GETNEXT()) == EOF) + return; + x->tag[0] = c; + isend = 1; + } + + while ((c = GETNEXT()) != EOF) { + if (c == '/') + x->isshorttag = 1; /* short tag */ + else if (c == '>' || isspace(c)) { + x->tag[x->taglen] = '\0'; + if (isend) { /* end tag, starts with </ */ + if (x->xmltagend) + x->xmltagend(x, x->tag, x->taglen, x->isshorttag); + x->tag[0] = '\0'; + x->taglen = 0; + } else { + /* start tag */ + if (x->xmltagstart) + x->xmltagstart(x, x->tag, x->taglen); + if (isspace(c)) + xml_parseattrs(x); + if (x->xmltagstartparsed) + x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag); + } + /* call tagend for shortform or processing instruction */ + if (x->isshorttag) { + if (x->xmltagend) + x->xmltagend(x, x->tag, x->taglen, x->isshorttag); + x->tag[0] = '\0'; + x->taglen = 0; + } + break; + } else if (x->taglen < sizeof(x->tag) - 1) + x->tag[x->taglen++] = c; /* NOTE: tag name truncation */ + } + } + } else { + /* parse tag data */ + datalen = 0; + if (x->xmldatastart) + x->xmldatastart(x); + while ((c = GETNEXT()) != EOF) { + if (c == '&') { + if (datalen) { + x->data[datalen] = '\0'; + if (x->xmldata) + x->xmldata(x, x->data, datalen); + } + x->data[0] = c; + datalen = 1; + while ((c = GETNEXT()) != EOF) { + if (c == '<') + break; + if (datalen < sizeof(x->data) - 1) + x->data[datalen++] = c; + else { + /* entity too long for buffer, handle as normal data */ + x->data[datalen] = '\0'; + if (x->xmldata) + x->xmldata(x, x->data, datalen); + x->data[0] = c; + datalen = 1; + break; + } + if (c == ';') { + x->data[datalen] = '\0'; + if (x->xmldataentity) + x->xmldataentity(x, x->data, datalen); + datalen = 0; + break; + } + } + } else if (c != '<') { + if (datalen < sizeof(x->data) - 1) { + x->data[datalen++] = c; + } else { + x->data[datalen] = '\0'; + if (x->xmldata) + x->xmldata(x, x->data, datalen); + x->data[0] = c; + datalen = 1; + } + } + if (c == '<') { + x->data[datalen] = '\0'; + if (x->xmldata && datalen) + x->xmldata(x, x->data, datalen); + if (x->xmldataend) + x->xmldataend(x); + break; + } + } + } + } +} @@ -0,0 +1,47 @@ +#ifndef _XML_H +#define _XML_H + +typedef struct xmlparser { + /* handlers */ + void (*xmlattr)(struct xmlparser *, const char *, size_t, + const char *, size_t, const char *, size_t); + void (*xmlattrend)(struct xmlparser *, const char *, size_t, + const char *, size_t); + void (*xmlattrstart)(struct xmlparser *, const char *, size_t, + const char *, size_t); + void (*xmlattrentity)(struct xmlparser *, const char *, size_t, + const char *, size_t, const char *, size_t); + void (*xmlcdatastart)(struct xmlparser *); + void (*xmlcdata)(struct xmlparser *, const char *, size_t); + void (*xmlcdataend)(struct xmlparser *); + void (*xmlcommentstart)(struct xmlparser *); + void (*xmlcomment)(struct xmlparser *, const char *, size_t); + void (*xmlcommentend)(struct xmlparser *); + void (*xmldata)(struct xmlparser *, const char *, size_t); + void (*xmldataend)(struct xmlparser *); + void (*xmldataentity)(struct xmlparser *, const char *, size_t); + void (*xmldatastart)(struct xmlparser *); + void (*xmltagend)(struct xmlparser *, const char *, size_t, int); + void (*xmltagstart)(struct xmlparser *, const char *, size_t); + void (*xmltagstartparsed)(struct xmlparser *, const char *, + size_t, int); + +#ifndef GETNEXT + #define GETNEXT (x)->getnext + int (*getnext)(void); +#endif + + /* current tag */ + char tag[1024]; + size_t taglen; + /* current tag is in short form ? <tag /> */ + int isshorttag; + /* current attribute name */ + char name[1024]; + /* data buffer used for tag data, cdata and attribute data */ + char data[BUFSIZ]; +} XMLParser; + +int xml_entitytostr(const char *, char *, size_t); +void xml_parse(XMLParser *); +#endif diff --git a/xml2tsv.c b/xml2tsv.c new file mode 100644 index 0000000..9418d66 --- /dev/null +++ b/xml2tsv.c @@ -0,0 +1,228 @@ +/* +* (c) 2020 Vincenzo "KatolaZ" Nicosia <katolaz@freaknet.org> +* +* A simple xml-to-rsv converter, based on xmlparser by Hiltjo Posthuma +* http://codemadness.org/git/xmlparser/ +* +* You can use, distribute, modify, and/or redistribute this program under +* the terms of the ISC LICENSE. See LICENSE for details. +* +*/ + + +#include <sys/types.h> + +#include <stdio.h> +#include <string.h> + +#include "xml.h" + +#define STR_MAX 128 +#define DEPTH_MAX 50 + + +/* tag stack */ + +typedef struct { + int top; + char st[DEPTH_MAX][STR_MAX]; +} tstack_t; + +int stack_push(tstack_t *t, const char *c){ + if (t->top < DEPTH_MAX){ + t->top ++; + strncpy(t->st[t->top], c, STR_MAX); + return 0; + } + return -1; +} + +char* stack_pop(tstack_t *t){ + if (t->top >= 0) + return t->st[t->top--]; + return NULL; +} + +char* stack_peek(tstack_t *t){ + if (t->top >= 0) + return t->st[t->top]; + return NULL; +} + +int stack_empty(tstack_t *t){ + return (t->top < 0); +} + +void stack_init(tstack_t *t){ + t->top = -1; +} + + +/* utility functions */ + +void print_no_cr(FILE *f, const char *c){ + char *tmp = c; + while (c != NULL){ + tmp = strchr(c, '\n'); + if (tmp != NULL) + *tmp = '\0'; + fprintf(f, "%s", c); + if (tmp != NULL) + c = tmp + 1; + else + c = NULL; + } +} + +void print_cur_str(FILE *f, tstack_t *t){ + int i; + for (i=0; i<=t->top; i++){ + fprintf(f, "/%s", t->st[i]); + } +} + +/* global variables */ + +tstack_t st; + + +/* xml callbacks */ + +void +xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al, + const char *v, size_t vl) +{ + printf("\t%s=%s", a, v); +} + +void +xmlattrentity(XMLParser *x, const char *t, size_t tl, const char *a, size_t al, + const char *v, size_t vl) +{ + printf("attrentity: %s\n", a); +} + +void +xmlattrend(XMLParser *x, const char *t, size_t tl, const char *a, size_t al) +{ +} + +void +xmlattrstart(XMLParser *x, const char *t, size_t tl, const char *a, size_t al) +{ +} + +void +xmlcdatastart(XMLParser *x) +{ +} + +void +xmlcdata(XMLParser *x, const char *d, size_t dl) +{ + printf("\t%s", d); +} + +void +xmlcdataend(XMLParser *x) +{ +} + +void +xmlcommentstart(XMLParser *x) +{ +} + +void +xmlcomment(XMLParser *x, const char *c, size_t cl) +{ +} + +void +xmlcommentend(XMLParser *x) +{ +} + +void +xmldata(XMLParser *x, const char *d, size_t dl) +{ + printf("\t"); + print_no_cr(stdout, d); +} + +void +xmldataend(XMLParser *x) +{ +} + +void +xmldataentity(XMLParser *x, const char *d, size_t dl) +{ +} + +void +xmldatastart(XMLParser *x) +{ +} + +void +xmltagend(XMLParser *x, const char *t, size_t tl, int isshort) +{ + char *tag; + if (stack_empty(&st)){ + fprintf(stderr, "Error: tag-end '%s' before any open tag", t); + } + tag = stack_pop(&st); + if (strcmp(t, tag)){ + fprintf(stderr, "Error: tag-end '%s' closes tag '%s'", t, tag); + } + /* printf("\n"); */ +} + +void +xmltagstart(XMLParser *x, const char *t, size_t tl) +{ + if (stack_push(&st, t)){ + fprintf(stderr, "Error: stack full. Ignoring tag '%s' (parent tag: '%s')\n", t, stack_peek(&st)); + return; + } + printf("\n"); + print_cur_str(stdout, &st); +} + +void +xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort) +{ + /* printf("inside tagstartparsed\n"); */ +} + +int +main(void) +{ + stack_init(&st); + XMLParser x = { 0 }; + + x.xmlattr = xmlattr; + x.xmlattrend = xmlattrend; + x.xmlattrstart = xmlattrstart; + x.xmlattrentity = xmlattrentity; + x.xmlcdatastart = xmlcdatastart; + x.xmlcdata = xmlcdata; + x.xmlcdataend = xmlcdataend; + x.xmlcommentstart = xmlcommentstart; + x.xmlcomment = xmlcomment; + x.xmlcommentend = xmlcommentend; + x.xmldata = xmldata; + x.xmldataend = xmldataend; + x.xmldataentity = xmldataentity; + x.xmldatastart = xmldatastart; + x.xmltagend = xmltagend; + x.xmltagstart = xmltagstart; + x.xmltagstartparsed = xmltagstartparsed; + + x.getnext = getchar; + + xml_parse(&x); + printf("\n"); + return 0; +} |