Expose lower-level parsing API.

The new functions cmark_new_doc_parser, cmark_free_doc_parser, cmark_process_line, and cmark_finish allow you to feed lines one by one (possibly from several files) to the parser and call finish when you're done. This is now used in main for multiple files.
author: John MacFarlane <jgm@berkeley.edu> 2014-11-14 23:07:28 -0800
committer: John MacFarlane <jgm@berkeley.edu> 2014-11-15 18:34:31 -0800
commit: 8f523b0149f250f733d80357320f92ffbe2a8b8f (patch)
tree: 1df9572e60d88795ee992089eaf46b0fd7987ce1 /src/blocks.c
parent: 694fa11266741aa061477aaca627e0445ba20723 (diff)
1 files changed, 181 insertions, 156 deletions
diff --git a/src/blocks.c b/src/blocks.c
index 604b933..568af0a 100644
--- a/src/blocks.c
+++ b/src/blocks.c
@@ -10,13 +10,10 @@
 #include "inlines.h"
 #include "html/houdini.h"
 #include "buffer.h"
-#include "bench.h"
+#include "debug.h"
 
 #define peek_at(i, n) (i)->data[n]
 
-static void incorporate_line(strbuf *ln, int line_number, node_block** curptr);
-static void finalize(node_block* b, int line_number);
-
 static node_block* make_block(int tag, int start_line, int start_column)
 {
 	node_block* e;
@@ -44,18 +41,42 @@ static node_block* make_document()
 	return e;
 }
 
+cmark_doc_parser *cmark_new_doc_parser() // Allocates a parser with a fresh document root and a reusable line buffer.
+{
+	cmark_doc_parser *parser = (cmark_doc_parser*)malloc(sizeof(cmark_doc_parser));
+	node_block *document = make_document();
+	strbuf *line = (strbuf*)malloc(sizeof(strbuf));
+	cmark_strbuf_init(line, 256);
+	// NOTE(review): neither malloc result is checked before it is used.
+	parser->head = document;
+	parser->current = document;
+	parser->line_number = 0; // cmark_process_line increments this before parsing each line
+	parser->curline = line;
+	// The caller releases the parser with cmark_free_doc_parser().
+	return parser;
+}
+
+void cmark_free_doc_parser(cmark_doc_parser *parser) // Frees the line buffer and the parser struct; parser->head is not freed here.
+{
+	cmark_strbuf_free(parser->curline); // tear down the buffer's own state first
+	free(parser->curline); // then the strbuf struct malloc'ed in cmark_new_doc_parser
+	free(parser);
+}
+
+static void finalize(node_block* b, int line_number);
+
 // Returns true if line has only space characters, else false.
 static bool is_blank(strbuf *s, int offset)
 {
 	while (offset < s->size) {
 		switch (s->ptr[offset]) {
-		case '\n':
-			return true;
-		case ' ':
-			offset++;
-			break;
-		default:
-			return false;
+			case '\n':
+				return true;
+			case ' ':
+				offset++;
+				break;
+			default:
+				return false;
 		}
 	}
 
@@ -65,17 +86,17 @@ static bool is_blank(strbuf *s, int offset)
 static inline bool can_contain(int parent_type, int child_type)
 {
 	return ( parent_type == BLOCK_DOCUMENT ||
-		 parent_type == BLOCK_BQUOTE ||
-		 parent_type == BLOCK_LIST_ITEM ||
-		 (parent_type == BLOCK_LIST && child_type == BLOCK_LIST_ITEM) );
+			parent_type == BLOCK_BQUOTE ||
+			parent_type == BLOCK_LIST_ITEM ||
+			(parent_type == BLOCK_LIST && child_type == BLOCK_LIST_ITEM) );
 }
 
 static inline bool accepts_lines(int block_type)
 {
 	return (block_type == BLOCK_PARAGRAPH ||
-		block_type == BLOCK_ATX_HEADER ||
-		block_type == BLOCK_INDENTED_CODE ||
-		block_type == BLOCK_FENCED_CODE);
+			block_type == BLOCK_ATX_HEADER ||
+			block_type == BLOCK_INDENTED_CODE ||
+			block_type == BLOCK_FENCED_CODE);
 }
 
 static void add_line(node_block* node_block, chunk *ch, int offset)
@@ -158,77 +179,77 @@ static void finalize(node_block* b, int line_number)
 	}
 
 	switch (b->tag) {
-	case BLOCK_PARAGRAPH:
-		pos = 0;
-		while (strbuf_at(&b->string_content, 0) == '[' &&
-		       (pos = parse_reference_inline(&b->string_content, b->top->as.document.refmap))) {
+		case BLOCK_PARAGRAPH:
+			pos = 0;
+			while (strbuf_at(&b->string_content, 0) == '[' &&
+					(pos = parse_reference_inline(&b->string_content, b->top->as.document.refmap))) {
 
-			strbuf_drop(&b->string_content, pos);
-		}
-		if (is_blank(&b->string_content, 0)) {
-			b->tag = BLOCK_REFERENCE_DEF;
-		}
-		break;
-
-	case BLOCK_INDENTED_CODE:
-		remove_trailing_blank_lines(&b->string_content);
-		strbuf_putc(&b->string_content, '\n');
-		break;
-
-	case BLOCK_FENCED_CODE:
-		// first line of contents becomes info
-		firstlinelen = strbuf_strchr(&b->string_content, '\n', 0);
-
-		strbuf_init(&b->as.code.info, 0);
-		houdini_unescape_html_f(
-			&b->as.code.info,
-			b->string_content.ptr,
-			firstlinelen
-			);
-
-		strbuf_drop(&b->string_content, firstlinelen + 1);
-
-		strbuf_trim(&b->as.code.info);
-		strbuf_unescape(&b->as.code.info);
-		break;
-
-	case BLOCK_LIST: // determine tight/loose status
-		b->as.list.tight = true; // tight by default
-		item = b->children;
-
-		while (item) {
-			// check for non-final non-empty list item ending with blank line:
-			if (item->last_line_blank && item->next) {
-				b->as.list.tight = false;
-				break;
+				strbuf_drop(&b->string_content, pos);
 			}
-			// recurse into children of list item, to see if there are
-			// spaces between them:
-			subitem = item->children;
-			while (subitem) {
-				if (ends_with_blank_line(subitem) &&
-				    (item->next || subitem->next)) {
+			if (is_blank(&b->string_content, 0)) {
+				b->tag = BLOCK_REFERENCE_DEF;
+			}
+			break;
+
+		case BLOCK_INDENTED_CODE:
+			remove_trailing_blank_lines(&b->string_content);
+			strbuf_putc(&b->string_content, '\n');
+			break;
+
+		case BLOCK_FENCED_CODE:
+			// first line of contents becomes info
+			firstlinelen = strbuf_strchr(&b->string_content, '\n', 0);
+
+			strbuf_init(&b->as.code.info, 0);
+			houdini_unescape_html_f(
+					&b->as.code.info,
+					b->string_content.ptr,
+					firstlinelen
+					);
+
+			strbuf_drop(&b->string_content, firstlinelen + 1);
+
+			strbuf_trim(&b->as.code.info);
+			strbuf_unescape(&b->as.code.info);
+			break;
+
+		case BLOCK_LIST: // determine tight/loose status
+			b->as.list.tight = true; // tight by default
+			item = b->children;
+
+			while (item) {
+				// check for non-final non-empty list item ending with blank line:
+				if (item->last_line_blank && item->next) {
 					b->as.list.tight = false;
 					break;
 				}
-				subitem = subitem->next;
-			}
-			if (!(b->as.list.tight)) {
-				break;
+				// recurse into children of list item, to see if there are
+				// spaces between them:
+				subitem = item->children;
+				while (subitem) {
+					if (ends_with_blank_line(subitem) &&
+							(item->next || subitem->next)) {
+						b->as.list.tight = false;
+						break;
+					}
+					subitem = subitem->next;
+				}
+				if (!(b->as.list.tight)) {
+					break;
+				}
+				item = item->next;
 			}
-			item = item->next;
-		}
 
-		break;
+			break;
 
-	default:
-		break;
+		default:
+			break;
 	}
 }
 
 // Add a node_block as child of another.  Return pointer to child.
 static node_block* add_child(node_block* parent,
-			     int block_type, int start_line, int start_column)
+		int block_type, int start_line, int start_column)
 {
 	assert(parent);
 
@@ -269,14 +290,14 @@ static void process_inlines(node_block* cur, reference_map *refmap)
 
 	while (cur != NULL) {
 		switch (cur->tag) {
-		case BLOCK_PARAGRAPH:
-		case BLOCK_ATX_HEADER:
-		case BLOCK_SETEXT_HEADER:
-			cur->inline_content = parse_inlines(&cur->string_content, refmap);
-			break;
+			case BLOCK_PARAGRAPH:
+			case BLOCK_ATX_HEADER:
+			case BLOCK_SETEXT_HEADER:
+				cur->inline_content = parse_inlines(&cur->string_content, refmap);
+				break;
 
-		default:
-			break;
+			default:
+				break;
 		}
 
 		if (cur->children) {
@@ -373,14 +394,13 @@ static int parse_list_marker(chunk *input, int pos, struct ListData ** dataptr)
 static int lists_match(struct ListData *list_data, struct ListData *item_data)
 {
 	return (list_data->list_type == item_data->list_type &&
-		list_data->delimiter == item_data->delimiter &&
-		// list_data->marker_offset == item_data.marker_offset &&
-		list_data->bullet_char == item_data->bullet_char);
+			list_data->delimiter == item_data->delimiter &&
+			// list_data->marker_offset == item_data.marker_offset &&
+			list_data->bullet_char == item_data->bullet_char);
 }
 
 static node_block *finalize_document(node_block *document, int linenum)
 {
-	start_timer();
 	while (document != document->top) {
 		finalize(document, linenum);
 		document = document->parent;
@@ -388,56 +408,46 @@ static node_block *finalize_document(node_block *document, int linenum)
 
 	finalize(document, linenum);
 	process_inlines(document, document->as.document.refmap);
-	end_timer("finalize_document");
 
 	return document;
 }
 
 extern node_block *cmark_parse_file(FILE *f)
 {
-	strbuf line = GH_BUF_INIT;
 	unsigned char buffer[4096];
-	int linenum = 1;
-	node_block *document = make_document();
+	cmark_doc_parser *parser = cmark_new_doc_parser();
+	size_t offset;
+	node_block *document;
 
-	start_timer();
 	while (fgets((char *)buffer, sizeof(buffer), f)) {
-		utf8proc_detab(&line, buffer, strlen((char *)buffer));
-		incorporate_line(&line, linenum, &document);
-		strbuf_clear(&line);
-		linenum++;
+		offset = strlen((char *)buffer);
+		cmark_process_line(parser, buffer, offset);
 	}
-	end_timer("incorporate_line(s)");
 
-	strbuf_free(&line);
-	return finalize_document(document, linenum);
+	document = cmark_finish(parser);
+	cmark_free_doc_parser(parser);
+	return document;
 }
 
 extern node_block *cmark_parse_document(const unsigned char *buffer, size_t len)
 {
-	strbuf line = GH_BUF_INIT;
 	int linenum = 1;
 	const unsigned char *end = buffer + len;
-	node_block *document = make_document();
+	size_t offset; // byte length of the line currently being fed to the parser
+	cmark_doc_parser *parser = cmark_new_doc_parser();
+	node_block *document;
 
 	while (buffer < end) {
 		const unsigned char *eol = memchr(buffer, '\n', end - buffer);
-
-		if (!eol) {
-			utf8proc_detab(&line, buffer, end - buffer);
-			buffer = end;
-		} else {
-			utf8proc_detab(&line, buffer, (eol - buffer) + 1);
-			buffer += (eol - buffer) + 1;
-		}
-
-		incorporate_line(&line, linenum, &document);
-		strbuf_clear(&line);
+		offset = eol ? (eol - buffer) + 1 : end - buffer; // no '\n' left: take the rest of the input (eol is NULL here)
+		cmark_process_line(parser, buffer, offset); // line includes its trailing '\n' when one was found
+		buffer += offset;
 		linenum++;
 	}
 
-	strbuf_free(&line);
-	return finalize_document(document, linenum);
+	document = cmark_finish(parser); // finalize the tree; the root outlives the parser
+	cmark_free_doc_parser(parser);
+	return document;
 }
 
 static void chop_trailing_hashtags(chunk *ch)
@@ -458,8 +468,8 @@ static void chop_trailing_hashtags(chunk *ch)
 	}
 }
 
-// Process one line at a time, modifying a node_block.
-static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
+void cmark_process_line(cmark_doc_parser *parser, const unsigned char *buffer,
+		 size_t bytes)
 {
 	node_block* last_matched_container;
 	int offset = 0;
@@ -469,22 +479,27 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
 	struct ListData * data = NULL;
 	bool all_matched = true;
 	node_block* container;
-	node_block* cur = *curptr;
+	node_block* cur = parser->current;
 	bool blank = false;
 	int first_nonspace;
 	int indent;
 	chunk input;
 
+	utf8proc_detab(parser->curline, buffer, bytes);
+
 	// Add a newline to the end if not present:
-	if (line->ptr[line->size - 1] != '\n') {
-		strbuf_putc(line, '\n');
+	// TODO this breaks abstraction:
+	if (parser->curline->ptr[parser->curline->size - 1] != '\n') {
+		strbuf_putc(parser->curline, '\n');
 	}
-	input.data = line->ptr;
-	input.len = line->size;
+	input.data = parser->curline->ptr;
+	input.len = parser->curline->size;
 
 	// container starts at the document root.
 	container = cur->top;
 
+	parser->line_number++;
+
 	// for each containing node_block, try to parse the associated line start.
 	// bail out on failure:  container will point to the last matching node_block.
 
@@ -512,7 +527,7 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
 		} else if (container->tag == BLOCK_LIST_ITEM) {
 
 			if (indent >= container->as.list.marker_offset +
-			    container->as.list.padding) {
+					container->as.list.padding) {
 				offset += container->as.list.marker_offset +
 					container->as.list.padding;
 			} else if (blank) {
@@ -532,7 +547,7 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
 			}
 
 		} else if (container->tag == BLOCK_ATX_HEADER ||
-			   container->tag == BLOCK_SETEXT_HEADER) {
+				container->tag == BLOCK_SETEXT_HEADER) {
 
 			// a header can never contain more than one line
 			all_matched = false;
@@ -571,12 +586,12 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
 
 	// check to see if we've hit 2nd blank line, break out of list:
 	if (blank && container->last_line_blank) {
-		break_out_of_lists(&container, line_number);
+		break_out_of_lists(&container, parser->line_number);
 	}
 
 	// unless last matched container is code node_block, try new container starts:
 	while (container->tag != BLOCK_FENCED_CODE && container->tag != BLOCK_INDENTED_CODE &&
-	       container->tag != BLOCK_HTML) {
+			container->tag != BLOCK_HTML) {
 
 		first_nonspace = offset;
 		while (peek_at(&input, first_nonspace) == ' ')
@@ -588,7 +603,7 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
 		if (indent >= CODE_INDENT) {
 			if (cur->tag != BLOCK_PARAGRAPH && !blank) {
 				offset += CODE_INDENT;
-				container = add_child(container, BLOCK_INDENTED_CODE, line_number, offset + 1);
+				container = add_child(container, BLOCK_INDENTED_CODE, parser->line_number, offset + 1);
 			} else { // indent > 4 in lazy line
 				break;
 			}
@@ -599,12 +614,12 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
 			// optional following character
 			if (peek_at(&input, offset) == ' ')
 				offset++;
-			container = add_child(container, BLOCK_BQUOTE, line_number, offset + 1);
+			container = add_child(container, BLOCK_BQUOTE, parser->line_number, offset + 1);
 
 		} else if ((matched = scan_atx_header_start(&input, first_nonspace))) {
 
 			offset = first_nonspace + matched;
-			container = add_child(container, BLOCK_ATX_HEADER, line_number, offset + 1);
+			container = add_child(container, BLOCK_ATX_HEADER, parser->line_number, offset + 1);
 
 			int hashpos = chunk_strchr(&input, '#', first_nonspace);
 			int level = 0;
@@ -617,7 +632,7 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
 
 		} else if ((matched = scan_open_code_fence(&input, first_nonspace))) {
 
-			container = add_child(container, BLOCK_FENCED_CODE, line_number, first_nonspace + 1);
+			container = add_child(container, BLOCK_FENCED_CODE, parser->line_number, first_nonspace + 1);
 			container->as.code.fence_char = peek_at(&input, first_nonspace);
 			container->as.code.fence_length = matched;
 			container->as.code.fence_offset = first_nonspace - offset;
@@ -625,25 +640,25 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
 
 		} else if ((matched = scan_html_block_tag(&input, first_nonspace))) {
 
-			container = add_child(container, BLOCK_HTML, line_number, first_nonspace + 1);
+			container = add_child(container, BLOCK_HTML, parser->line_number, first_nonspace + 1);
 			// note, we don't adjust offset because the tag is part of the text
 
 		} else if (container->tag == BLOCK_PARAGRAPH &&
-			   (lev = scan_setext_header_line(&input, first_nonspace)) &&
-			   // check that there is only one line in the paragraph:
-			   strbuf_strrchr(&container->string_content, '\n',
-					  strbuf_len(&container->string_content) - 2) < 0) {
+				(lev = scan_setext_header_line(&input, first_nonspace)) &&
+				// check that there is only one line in the paragraph:
+				strbuf_strrchr(&container->string_content, '\n',
+					strbuf_len(&container->string_content) - 2) < 0) {
 
 			container->tag = BLOCK_SETEXT_HEADER;
 			container->as.header.level = lev;
 			offset = input.len - 1;
 
 		} else if (!(container->tag == BLOCK_PARAGRAPH && !all_matched) &&
-			   (matched = scan_hrule(&input, first_nonspace))) {
+				(matched = scan_hrule(&input, first_nonspace))) {
 
 			// it's only now that we know the line is not part of a setext header:
-			container = add_child(container, BLOCK_HRULE, line_number, first_nonspace + 1);
-			finalize(container, line_number);
+			container = add_child(container, BLOCK_HRULE, parser->line_number, first_nonspace + 1);
+			finalize(container, parser->line_number);
 			container = container->parent;
 			offset = input.len - 1;
 
@@ -672,16 +687,16 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
 			data->marker_offset = indent;
 
 			if (container->tag != BLOCK_LIST ||
-			    !lists_match(&container->as.list, data)) {
-				container = add_child(container, BLOCK_LIST, line_number,
-						      first_nonspace + 1);
+					!lists_match(&container->as.list, data)) {
+				container = add_child(container, BLOCK_LIST, parser->line_number,
+						first_nonspace + 1);
 
 				memcpy(&container->as.list, data, sizeof(*data));
 			}
 
 			// add the list item
-			container = add_child(container, BLOCK_LIST_ITEM, line_number,
-					      first_nonspace + 1);
+			container = add_child(container, BLOCK_LIST_ITEM, parser->line_number,
+					first_nonspace + 1);
 			/* TODO: static */
 			memcpy(&container->as.list, data, sizeof(*data));
 			free(data);
@@ -710,11 +725,11 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
 	// lists or breaking out of lists.  we also don't set last_line_blank
 	// on an empty list item.
 	container->last_line_blank = (blank &&
-				      container->tag != BLOCK_BQUOTE &&
-				      container->tag != BLOCK_FENCED_CODE &&
-				      !(container->tag == BLOCK_LIST_ITEM &&
-					container->children == NULL &&
-					container->start_line == line_number));
+			container->tag != BLOCK_BQUOTE &&
+			container->tag != BLOCK_FENCED_CODE &&
+			!(container->tag == BLOCK_LIST_ITEM &&
+				container->children == NULL &&
+				container->start_line == parser->line_number));
 
 	node_block *cont = container;
 	while (cont->parent) {
@@ -723,10 +738,10 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
 	}
 
 	if (cur != last_matched_container &&
-	    container == last_matched_container &&
-	    !blank &&
-	    cur->tag == BLOCK_PARAGRAPH &&
-	    strbuf_len(&cur->string_content) > 0) {
+			container == last_matched_container &&
+			!blank &&
+			cur->tag == BLOCK_PARAGRAPH &&
+			strbuf_len(&cur->string_content) > 0) {
 
 		add_line(cur, &input, offset);
 
@@ -734,7 +749,7 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
 
 		// finalize any blocks that were not matched and set cur to container:
 		while (cur != last_matched_container) {
-			finalize(cur, line_number);
+			finalize(cur, parser->line_number);
 			cur = cur->parent;
 			assert(cur != NULL);
 		}
@@ -747,7 +762,7 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
 			matched = 0;
 
 			if (indent <= 3 &&
-			    peek_at(&input, first_nonspace) == container->as.code.fence_char) {
+					peek_at(&input, first_nonspace) == container->as.code.fence_char) {
 				int fence_len = scan_close_code_fence(&input, first_nonspace);
 				if (fence_len > container->as.code.fence_length)
 					matched = 1;
@@ -755,7 +770,7 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
 
 			if (matched) {
 				// if closing fence, don't add line to container; instead, close it:
-				finalize(container, line_number);
+				finalize(container, parser->line_number);
 				container = container->parent; // back up to parent
 			} else {
 				add_line(container, &input, offset);
@@ -773,7 +788,7 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
 
 			chop_trailing_hashtags(&input);
 			add_line(container, &input, first_nonspace);
-			finalize(container, line_number);
+			finalize(container, parser->line_number);
 			container = container->parent;
 
 		} else if (accepts_lines(container->tag)) {
@@ -783,13 +798,23 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
 		} else if (container->tag != BLOCK_HRULE && container->tag != BLOCK_SETEXT_HEADER) {
 
 			// create paragraph container for line
-			container = add_child(container, BLOCK_PARAGRAPH, line_number, first_nonspace + 1);
+			container = add_child(container, BLOCK_PARAGRAPH, parser->line_number, first_nonspace + 1);
 			add_line(container, &input, first_nonspace);
 
 		} else {
 			assert(false);
 		}
 
-		*curptr = container;
+		parser->current = container;
 	}
+	strbuf_clear(parser->curline);
+
 }
+
+node_block *cmark_finish(cmark_doc_parser *parser) // Finalizes the document once all lines have been fed and returns its root.
+{
+	finalize_document(parser->current, parser->line_number); // finalize open blocks up to the root, then process inlines
+	strbuf_free(parser->curline); // NOTE(review): cmark_free_doc_parser frees curline again; confirm strbuf_free is safe to repeat
+	return parser->head; // the parser itself is not freed here
+}
+
author	John MacFarlane <jgm@berkeley.edu>	2014-11-14 23:07:28 -0800
committer	John MacFarlane <jgm@berkeley.edu>	2014-11-15 18:34:31 -0800
commit	8f523b0149f250f733d80357320f92ffbe2a8b8f (patch)
tree	1df9572e60d88795ee992089eaf46b0fd7987ce1 /src/blocks.c
parent	694fa11266741aa061477aaca627e0445ba20723 (diff)