diff options
| -rw-r--r-- | LICENSE | 15 | ||||
| -rw-r--r-- | xml.c | 500 | ||||
| -rw-r--r-- | xml.h | 59 | ||||
| -rw-r--r-- | xml2tsv.c | 285 |
4 files changed, 859 insertions, 0 deletions
@@ -0,0 +1,15 @@
+ISC License
+
+Copyright (c) 2020 Vincenzo "KatolaZ" Nicosia <katolaz@freaknet.org>
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
@@ -0,0 +1,500 @@
+#include <sys/types.h>
+
+#include <ctype.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "xml.h"
+
+/*
+ * Parse the attributes of the current tag, up to and including its closing
+ * '>' (or EOF). For every attribute xmlattrstart, xmlattr or xmlattrentity
+ * (possibly several times per value) and xmlattrend are called. A '/' read
+ * outside a value sets x->isshorttag. Over-long names are truncated.
+ */
+static void
+xml_parseattrs(XMLParser *x)
+{
+	size_t namelen = 0, valuelen;
+	int c, endsep, endname = 0, valuestart = 0;
+
+	while ((c = GETNEXT()) != EOF) {
+		if (isspace(c)) {
+			if (namelen)
+				endname = 1;
+			continue;
+		} else if (c == '?')
+			; /* ignore */
+		else if (c == '=') {
+			x->name[namelen] = '\0';
+			valuestart = 1;
+			endname = 1;
+		} else if (namelen && ((endname && !valuestart && isalpha(c)) || (c == '>' || c == '/'))) {
+			/* attribute without value */
+			x->name[namelen] = '\0';
+			if (x->xmlattrstart)
+				x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
+			if (x->xmlattr)
+				x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
+			if (x->xmlattrend)
+				x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
+			endname = 0;
+			x->name[0] = c;
+			namelen = 1;
+		} else if (namelen && valuestart) {
+			/* attribute with value */
+			if (x->xmlattrstart)
+				x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
+
+			valuelen = 0;
+			if (c == '\'' || c == '"') {
+				endsep = c;
+			} else {
+				endsep = ' '; /* isspace() */
+				goto startvalue;
+			}
+
+			while ((c = GETNEXT()) != EOF) {
+startvalue:
+				if (c == '&') { /* entities */
+					x->data[valuelen] = '\0';
+					/* call data function with data before entity if there is data */
+					if (valuelen && x->xmlattr)
+						x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+					x->data[0] = c;
+					valuelen = 1;
+					while ((c = GETNEXT()) != EOF) {
+						if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c))))
+							break;
+						if (valuelen < sizeof(x->data) - 1)
+							x->data[valuelen++] = c;
+						else {
+							/* entity too long for buffer, handle as normal data */
+							x->data[valuelen] = '\0';
+							if (x->xmlattr)
+								x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+							x->data[0] = c;
+							valuelen = 1;
+							break;
+						}
+						if (c == ';') {
+							x->data[valuelen] = '\0';
+							if (x->xmlattrentity)
+								x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+							valuelen = 0;
+							break;
+						}
+					}
+				} else if (c != endsep && !(endsep == ' ' && (c == '>' || isspace(c)))) {
+					if (valuelen < sizeof(x->data) - 1) {
+						x->data[valuelen++] = c;
+					} else {
+						x->data[valuelen] = '\0';
+						if (x->xmlattr)
+							x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+						x->data[0] = c;
+						valuelen = 1;
+					}
+				}
+				if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) {
+					x->data[valuelen] = '\0';
+					if (x->xmlattr)
+						x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+					if (x->xmlattrend)
+						x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
+					break;
+				}
+			}
+			namelen = endname = valuestart = 0;
+		} else if (namelen < sizeof(x->name) - 1) {
+			x->name[namelen++] = c;
+		}
+		if (c == '>') {
+			break;
+		} else if (c == '/') {
+			x->isshorttag = 1;
+			x->name[0] = '\0';
+			namelen = 0;
+		}
+	}
+}
+
+/*
+ * Parse a comment after its opening "<!--", up to and including the closing
+ * "-->". The text is passed to xmlcomment in chunks of at most
+ * sizeof(x->data) - 1 bytes; dashes are passed one byte at a time.
+ */
+static void
+xml_parsecomment(XMLParser *x)
+{
+	size_t datalen = 0, i = 0;
+	int c;
+
+	if (x->xmlcommentstart)
+		x->xmlcommentstart(x);
+	while ((c = GETNEXT()) != EOF) {
+		if (c == '-' || c == '>') {
+			if (x->xmlcomment && datalen) {
+				x->data[datalen] = '\0';
+				x->xmlcomment(x, x->data, datalen);
+				datalen = 0;
+			}
+		}
+
+		if (c == '-') {
+			if (++i > 2) {
+				if (x->xmlcomment)
+					for (; i > 2; i--)
+						x->xmlcomment(x, "-", 1);
+				i = 2;
+			}
+			continue;
+		} else if (c == '>' && i == 2) {
+			if (x->xmlcommentend)
+				x->xmlcommentend(x);
+			return;
+		} else if (i) {
+			if (x->xmlcomment) {
+				for (; i > 0; i--)
+					x->xmlcomment(x, "-", 1);
+			}
+			i = 0;
+		}
+
+		if (datalen < sizeof(x->data) - 1) {
+			x->data[datalen++] = c;
+		} else {
+			x->data[datalen] = '\0';
+			if (x->xmlcomment)
+				x->xmlcomment(x, x->data, datalen);
+			x->data[0] = c;
+			datalen = 1;
+		}
+	}
+}
+
+/*
+ * Parse a CDATA section after its opening "<![CDATA[", up to and including
+ * the closing "]]>". The text is passed to xmlcdata in chunks of at most
+ * sizeof(x->data) - 1 bytes; ']' characters are passed one byte at a time.
+ */
+static void
+xml_parsecdata(XMLParser *x)
+{
+	size_t datalen = 0, i = 0;
+	int c;
+
+	if (x->xmlcdatastart)
+		x->xmlcdatastart(x);
+	while ((c = GETNEXT()) != EOF) {
+		if (c == ']' || c == '>') {
+			if (x->xmlcdata && datalen) {
+				x->data[datalen] = '\0';
+				x->xmlcdata(x, x->data, datalen);
+				datalen = 0;
+			}
+		}
+
+		if (c == ']') {
+			if (++i > 2) {
+				if (x->xmlcdata)
+					for (; i > 2; i--)
+						x->xmlcdata(x, "]", 1);
+				i = 2;
+			}
+			continue;
+		} else if (c == '>' && i == 2) {
+			if (x->xmlcdataend)
+				x->xmlcdataend(x);
+			return;
+		} else if (i) {
+			if (x->xmlcdata)
+				for (; i > 0; i--)
+					x->xmlcdata(x, "]", 1);
+			i = 0;
+		}
+
+		if (datalen < sizeof(x->data) - 1) {
+			x->data[datalen++] = c;
+		} else {
+			x->data[datalen] = '\0';
+			if (x->xmlcdata)
+				x->xmlcdata(x, x->data, datalen);
+			x->data[0] = c;
+			datalen = 1;
+		}
+	}
+}
+
+/*
+ * Encode code point r as UTF-8 in s, which needs room for up to 4 bytes.
+ * No NUL terminator is written. Returns the number of bytes stored, which
+ * is 0 for the NUL code point.
+ */
+static int
+codepointtoutf8(long r, char *s)
+{
+	if (r == 0) {
+		return 0; /* NUL byte */
+	} else if (r <= 0x7F) {
+		/* 1 byte: 0aaaaaaa */
+		s[0] = r;
+		return 1;
+	} else if (r <= 0x07FF) {
+		/* 2 bytes: 00000aaa aabbbbbb */
+		s[0] = 0xC0 | ((r & 0x0007C0) >>  6); /* 110aaaaa */
+		s[1] = 0x80 |  (r & 0x00003F);        /* 10bbbbbb */
+		return 2;
+	} else if (r <= 0xFFFF) {
+		/* 3 bytes: aaaabbbb bbcccccc */
+		s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
+		s[1] = 0x80 | ((r & 0x000FC0) >>  6); /* 10bbbbbb */
+		s[2] = 0x80 |  (r & 0x00003F);        /* 10cccccc */
+		return 3;
+	} else {
+		/* 4 bytes: 000aaabb bbbbcccc ccdddddd */
+		s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
+		s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
+		s[2] = 0x80 | ((r & 0x000FC0) >>  6); /* 10cccccc */
+		s[3] = 0x80 |  (r & 0x00003F);        /* 10dddddd */
+		return 4;
+	}
+}
+
+/*
+ * Decode one of the five predefined XML entities. e is the entity without
+ * its leading '&', e.g. "amp;". Returns 1 with the character stored NUL-
+ * terminated in buf, 0 for an unknown entity, -1 if bufsiz is below 2.
+ */
+static int
+namedentitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	static const struct {
+		const char *entity;
+		int c;
+	} entities[] = {
+		{ "amp;",  '&'  },
+		{ "lt;",   '<'  },
+		{ "gt;",   '>'  },
+		{ "apos;", '\'' },
+		{ "quot;", '"'  },
+	};
+	size_t i;
+
+	/* buffer is too small */
+	if (bufsiz < 2)
+		return -1;
+
+	for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
+		if (!strcmp(e, entities[i].entity)) {
+			buf[0] = entities[i].c;
+			buf[1] = '\0';
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Decode a numeric character reference. e is the text after "&#", e.g.
+ * "8364;" or "x20AC;". Returns the byte length of the UTF-8 sequence stored
+ * NUL-terminated in buf, 0 if the reference is malformed or not a valid
+ * code point, -1 if bufsiz is below 5.
+ */
+static int
+numericentitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	long l;
+	int len;
+	char *end;
+
+	/* buffer is too small */
+	if (bufsiz < 5)
+		return -1;
+
+	errno = 0;
+	/* hex (16) or decimal (10) */
+	if (*e == 'x')
+		l = strtol(++e, &end, 16);
+	else
+		l = strtol(e, &end, 10);
+	/*
+	 * reject: conversion error, no digits, missing ';', negative or too
+	 * high value, UTF-16 surrogate (not encodable as UTF-8)
+	 */
+	if (errno || end == e || *end != ';' || l < 0 || l > 0x10FFFF ||
+	    (l >= 0xD800 && l <= 0xDFFF))
+		return 0;
+	len = codepointtoutf8(l, buf);
+	buf[len] = '\0';
+
+	return len;
+}
+
+/*
+ * Convert a named or numeric entity string, e.g. "&amp;" or "&#x41;", to
+ * the text it stands for, stored NUL-terminated in buf. Returns the byte
+ * length of that text, 0 if e is not a valid entity, -1 if bufsiz is too
+ * small (below 5 for numeric entities, below 2 for named ones).
+ */
+int
+xml_entitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	/* doesn't start with & */
+	if (e[0] != '&')
+		return 0;
+	/* numeric entity */
+	if (e[1] == '#')
+		return numericentitytostr(e + 2, buf, bufsiz);
+	else /* named entity */
+		return namedentitytostr(e + 1, buf, bufsiz);
+}
+
+/*
+ * Parse the input delivered by GETNEXT() until EOF and report tags,
+ * attributes, character data, CDATA sections and comments through the
+ * handlers set in x. Anything before the first '<' is skipped and
+ * over-long tag names are truncated.
+ */
+void
+xml_parse(XMLParser *x)
+{
+	size_t datalen, tagdatalen;
+	int c, isend;
+
+	while ((c = GETNEXT()) != EOF && c != '<')
+		; /* skip until < */
+
+	while (c != EOF) {
+		if (c == '<') { /* parse tag */
+			if ((c = GETNEXT()) == EOF)
+				return;
+
+			if (c == '!') { /* cdata and comments */
+				for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
+					/* NOTE: sizeof(x->data) must be atleast sizeof("[CDATA[") */
+					if (tagdatalen <= sizeof("[CDATA[") - 1)
+						x->data[tagdatalen++] = c;
+					if (c == '>')
+						break;
+					else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
+							(x->data[0] == '-')) {
+						xml_parsecomment(x);
+						break;
+					} else if (c == '[') {
+						if (tagdatalen == sizeof("[CDATA[") - 1 &&
+						    !strncmp(x->data, "[CDATA[", tagdatalen)) {
+							xml_parsecdata(x);
+							break;
+						}
+					}
+				}
+			} else {
+				/* normal tag (open, short open, close), processing instruction. */
+				x->tag[0] = c;
+				x->taglen = 1;
+				x->isshorttag = isend = 0;
+
+				/* treat processing instruction as shorttag, don't strip "?" prefix. */
+				if (c == '?') {
+					x->isshorttag = 1;
+				} else if (c == '/') {
+					if ((c = GETNEXT()) == EOF)
+						return;
+					x->tag[0] = c;
+					isend = 1;
+				}
+
+				while ((c = GETNEXT()) != EOF) {
+					if (c == '/')
+						x->isshorttag = 1; /* short tag */
+					else if (c == '>' || isspace(c)) {
+						x->tag[x->taglen] = '\0';
+						if (isend) { /* end tag, starts with </ */
+							if (x->xmltagend)
+								x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
+							x->tag[0] = '\0';
+							x->taglen = 0;
+						} else {
+							/* start tag */
+							if (x->xmltagstart)
+								x->xmltagstart(x, x->tag, x->taglen);
+							if (isspace(c))
+								xml_parseattrs(x);
+							if (x->xmltagstartparsed)
+								x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
+						}
+						/* call tagend for shortform or processing instruction */
+						if (x->isshorttag) {
+							if (x->xmltagend)
+								x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
+							x->tag[0] = '\0';
+							x->taglen = 0;
+						}
+						break;
+					} else if (x->taglen < sizeof(x->tag) - 1)
+						x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
+				}
+			}
+		} else {
+			/* parse tag data */
+			datalen = 0;
+			if (x->xmldatastart)
+				x->xmldatastart(x);
+			while ((c = GETNEXT()) != EOF) {
+				if (c == '&') {
+					if (datalen) {
+						x->data[datalen] = '\0';
+						if (x->xmldata)
+							x->xmldata(x, x->data, datalen);
+					}
+					x->data[0] = c;
+					datalen = 1;
+					while ((c = GETNEXT()) != EOF) {
+						if (c == '<')
+							break;
+						if (datalen < sizeof(x->data) - 1)
+							x->data[datalen++] = c;
+						else {
+							/* entity too long for buffer, handle as normal data */
+							x->data[datalen] = '\0';
+							if (x->xmldata)
+								x->xmldata(x, x->data, datalen);
+							x->data[0] = c;
+							datalen = 1;
+							break;
+						}
+						if (c == ';') {
+							x->data[datalen] = '\0';
+							if (x->xmldataentity)
+								x->xmldataentity(x, x->data, datalen);
+							datalen = 0;
+							break;
+						}
+					}
+				} else if (c != '<') {
+					if (datalen < sizeof(x->data) - 1) {
+						x->data[datalen++] = c;
+					} else {
+						x->data[datalen] = '\0';
+						if (x->xmldata)
+							x->xmldata(x, x->data, datalen);
+						x->data[0] = c;
+						datalen = 1;
+					}
+				}
+				if (c == '<') {
+					x->data[datalen] = '\0';
+					if (x->xmldata && datalen)
+						x->xmldata(x, x->data, datalen);
+					if (x->xmldataend)
+						x->xmldataend(x);
+					break;
+				}
+			}
+		}
+	}
+}
@@ -0,0 +1,59 @@
+#ifndef XML_H
+#define XML_H
+
+#include <stdio.h> /* BUFSIZ, size_t */
+
+typedef struct xmlparser {
+	/*
+	 * handlers; each is optional, a NULL handler is not called. The
+	 * strings passed in live in the buffers below or are constants:
+	 * they are NUL-terminated and overwritten as parsing continues
+	 */
+	void (*xmlattr)(struct xmlparser *, const char *, size_t,
+	      const char *, size_t, const char *, size_t);
+	void (*xmlattrend)(struct xmlparser *, const char *, size_t,
+	      const char *, size_t);
+	void (*xmlattrstart)(struct xmlparser *, const char *, size_t,
+	      const char *, size_t);
+	void (*xmlattrentity)(struct xmlparser *, const char *, size_t,
+	      const char *, size_t, const char *, size_t);
+	void (*xmlcdatastart)(struct xmlparser *);
+	void (*xmlcdata)(struct xmlparser *, const char *, size_t);
+	void (*xmlcdataend)(struct xmlparser *);
+	void (*xmlcommentstart)(struct xmlparser *);
+	void (*xmlcomment)(struct xmlparser *, const char *, size_t);
+	void (*xmlcommentend)(struct xmlparser *);
+	void (*xmldata)(struct xmlparser *, const char *, size_t);
+	void (*xmldataend)(struct xmlparser *);
+	void (*xmldataentity)(struct xmlparser *, const char *, size_t);
+	void (*xmldatastart)(struct xmlparser *);
+	void (*xmltagend)(struct xmlparser *, const char *, size_t, int);
+	void (*xmltagstart)(struct xmlparser *, const char *, size_t);
+	void (*xmltagstartparsed)(struct xmlparser *, const char *,
+	      size_t, int);
+
+	/*
+	 * input: define GETNEXT before including this header to read from a
+	 * fixed function (it is invoked as GETNEXT()), else set getnext
+	 */
+#ifndef GETNEXT
+	#define GETNEXT (x)->getnext
+	int (*getnext)(void);
+#endif
+
+	/* current tag */
+	char tag[1024];
+	size_t taglen;
+	/* current tag is in short form ? <tag /> */
+	int isshorttag;
+	/* current attribute name */
+	char name[1024];
+	/* data buffer used for tag data, cdata and attribute data */
+	char data[BUFSIZ];
+} XMLParser;
+
+/* decode the entity string e (e.g. "&amp;") into buf; returns its byte length */
+int xml_entitytostr(const char *, char *, size_t);
+/* parse the input until EOF, calling the handlers set in the parser */
+void xml_parse(XMLParser *);
+#endif /* XML_H */
diff --git a/xml2tsv.c b/xml2tsv.c
new file mode 100644
index 0000000..9418d66
--- /dev/null
+++ b/xml2tsv.c
@@ -0,0 +1,285 @@
+/*
+* (c) 2020 Vincenzo "KatolaZ" Nicosia <katolaz@freaknet.org>
+*
+* A simple xml-to-tsv converter, based on xmlparser by Hiltjo Posthuma
+* http://codemadness.org/git/xmlparser/
+*
+* Reads XML from stdin. For every start tag a new line is written to
+* stdout: the path of the tag ("/root/child"), followed by tab-separated
+* attributes ("name=value") and character data.
+*
+* You can use, distribute, modify, and/or redistribute this program under
+* the terms of the ISC LICENSE. See LICENSE for details.
+*
+*/
+
+
+#include <sys/types.h>
+
+#include <stdio.h>
+#include <string.h>
+
+#include "xml.h"
+
+#define STR_MAX 128
+#define DEPTH_MAX 50
+
+
+/* tag stack */
+
+typedef struct {
+	int top; /* index of the topmost entry, -1 when empty */
+	char st[DEPTH_MAX][STR_MAX];
+} tstack_t;
+
+/*
+ * Push a copy of c, truncated to STR_MAX - 1 bytes, onto t.
+ * Returns 0 on success, -1 if the stack is full.
+ */
+int stack_push(tstack_t *t, const char *c){
+	/* the last usable slot is DEPTH_MAX - 1 */
+	if (t->top < DEPTH_MAX - 1){
+		t->top ++;
+		/* unlike strncpy, snprintf always NUL-terminates */
+		snprintf(t->st[t->top], STR_MAX, "%s", c);
+		return 0;
+	}
+	return -1;
+}
+
+/* Remove and return the topmost tag, or NULL if the stack is empty. */
+char* stack_pop(tstack_t *t){
+	if (t->top >= 0)
+		return t->st[t->top--];
+	return NULL;
+}
+
+/* Return the topmost tag without removing it, or NULL if the stack is empty. */
+char* stack_peek(tstack_t *t){
+	if (t->top >= 0)
+		return t->st[t->top];
+	return NULL;
+}
+
+/* Return non-zero if the stack holds no tags. */
+int stack_empty(tstack_t *t){
+	return (t->top < 0);
+}
+
+/* Make t an empty stack. */
+void stack_init(tstack_t *t){
+	t->top = -1;
+}
+
+
+/* utility functions */
+
+/* Write c to f, leaving out all newline characters; c may be NULL. */
+void print_no_cr(FILE *f, const char *c){
+	size_t n;
+
+	if (c == NULL)
+		return;
+	while (*c != '\0'){
+		/* length of the run up to the next newline or the end of c */
+		n = strcspn(c, "\n");
+		fwrite(c, 1, n, f);
+		c += n;
+		if (*c == '\n')
+			c++;
+	}
+}
+
+/* Write the path of the open tags, root first, as "/tag/tag/...". */
+void print_cur_str(FILE *f, tstack_t *t){
+	int i;
+	for (i=0; i<=t->top; i++){
+		fprintf(f, "/%s", t->st[i]);
+	}
+}
+
+/* global variables */
+
+tstack_t st;
+/* number of open tags that were ignored because the stack was full */
+static int ignored;
+/* non-zero until the tab that starts the current data field is written */
+static int fieldstart;
+
+/* Write the tab separating a data field from the previous one, once per field. */
+static void
+print_field_tab(void)
+{
+	if (fieldstart){
+		putchar('\t');
+		fieldstart = 0;
+	}
+}
+
+
+/* xml callbacks */
+
+/* Write (a chunk of) an attribute value; the name is written by xmlattrstart(). */
+void
+xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
+        const char *v, size_t vl)
+{
+	fputs(v, stdout);
+}
+
+/* Write an entity in an attribute value, decoded when it is a known one. */
+void
+xmlattrentity(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
+              const char *v, size_t vl)
+{
+	char buf[8];
+
+	fputs(xml_entitytostr(v, buf, sizeof(buf)) > 0 ? buf : v, stdout);
+}
+
+void
+xmlattrend(XMLParser *x, const char *t, size_t tl, const char *a, size_t al)
+{
+}
+
+/* Start a new "name=value" field. */
+void
+xmlattrstart(XMLParser *x, const char *t, size_t tl, const char *a, size_t al)
+{
+	printf("\t%s=", a);
+}
+
+void
+xmlcdatastart(XMLParser *x)
+{
+}
+
+void
+xmlcdata(XMLParser *x, const char *d, size_t dl)
+{
+	printf("\t%s", d);
+}
+
+void
+xmlcdataend(XMLParser *x)
+{
+}
+
+void
+xmlcommentstart(XMLParser *x)
+{
+}
+
+void
+xmlcomment(XMLParser *x, const char *c, size_t cl)
+{
+}
+
+void
+xmlcommentend(XMLParser *x)
+{
+}
+
+/* Write (a chunk of) character data, without its newlines. */
+void
+xmldata(XMLParser *x, const char *d, size_t dl)
+{
+	print_field_tab();
+	print_no_cr(stdout, d);
+}
+
+void
+xmldataend(XMLParser *x)
+{
+}
+
+/* Write an entity in character data, decoded when it is a known one. */
+void
+xmldataentity(XMLParser *x, const char *d, size_t dl)
+{
+	char buf[8];
+
+	print_field_tab();
+	print_no_cr(stdout, xml_entitytostr(d, buf, sizeof(buf)) > 0 ? buf : d);
+}
+
+/* Character data follows: its first chunk starts a new field. */
+void
+xmldatastart(XMLParser *x)
+{
+	fieldstart = 1;
+}
+
+/* Pop the tag closed by t and report end tags that do not match. */
+void
+xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
+{
+	char *tag;
+
+	/* end of a tag that was never pushed: keep the stack in sync */
+	if (ignored > 0){
+		ignored--;
+		return;
+	}
+	/* stack_pop() returns NULL on an empty stack: do not compare */
+	if (stack_empty(&st)){
+		fprintf(stderr, "Error: tag-end '%s' before any open tag\n", t);
+		return;
+	}
+	tag = stack_pop(&st);
+	/* names on the stack are truncated to STR_MAX - 1 bytes */
+	if (strncmp(t, tag, STR_MAX - 1)){
+		fprintf(stderr, "Error: tag-end '%s' closes tag '%s'\n", t, tag);
+	}
+}
+
+/* Push tag t and start a new output line holding the path to it. */
+void
+xmltagstart(XMLParser *x, const char *t, size_t tl)
+{
+	if (stack_push(&st, t)){
+		ignored++;
+		fprintf(stderr, "Error: stack full. Ignoring tag '%s' (parent tag: '%s')\n", t, stack_peek(&st));
+		return;
+	}
+	printf("\n");
+	print_cur_str(stdout, &st);
+}
+
+void
+xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
+{
+}
+
+/* Convert the XML document on stdin to TSV on stdout. */
+int
+main(void)
+{
+	XMLParser x = { 0 };
+
+	stack_init(&st);
+
+	x.xmlattr = xmlattr;
+	x.xmlattrend = xmlattrend;
+	x.xmlattrstart = xmlattrstart;
+	x.xmlattrentity = xmlattrentity;
+	x.xmlcdatastart = xmlcdatastart;
+	x.xmlcdata = xmlcdata;
+	x.xmlcdataend = xmlcdataend;
+	x.xmlcommentstart = xmlcommentstart;
+	x.xmlcomment = xmlcomment;
+	x.xmlcommentend = xmlcommentend;
+	x.xmldata = xmldata;
+	x.xmldataend = xmldataend;
+	x.xmldataentity = xmldataentity;
+	x.xmldatastart = xmldatastart;
+	x.xmltagend = xmltagend;
+	x.xmltagstart = xmltagstart;
+	x.xmltagstartparsed = xmltagstartparsed;
+
+	x.getnext = getchar;
+
+	xml_parse(&x);
+	printf("\n");
+	return 0;
+}
