From 7f491b0bdf8e206458d284938efa8a0890c9d352 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Thu, 11 Jun 2015 14:32:22 -0700 Subject: Preliminary changes for new tab handling. We no longer preprocess tabs to spaces before parsing. Instead, we keep track of both the byte offset and the (virtual) column as we parse block starts. This allows us to handle tabs without converting to spaces first. Tabs are left as tabs in the output. Added `column` and `first_nonspace_column` fields to `parser`. Added utility function to advance the offset, computing the virtual column too. Note that we don't need to deal with UTF-8 here at all. Only ASCII occurs in block starts. Significant performance improvement due to the fact that we're not doing UTF-8 validation -- though we might want to add that back in. --- src/blocks.c | 92 +++++++++++++++++++++++++++++++++++++++++++++--------------- src/parser.h | 2 ++ 2 files changed, 71 insertions(+), 23 deletions(-) (limited to 'src') diff --git a/src/blocks.c b/src/blocks.c index 8c3e18c..06f6dcb 100644 --- a/src/blocks.c +++ b/src/blocks.c @@ -16,6 +16,8 @@ #include "debug.h" #define CODE_INDENT 4 +#define TAB_STOP 4 + #define peek_at(i, n) (i)->data[n] static inline bool @@ -70,7 +72,9 @@ cmark_parser *cmark_parser_new(int options) parser->current = document; parser->line_number = 0; parser->offset = 0; + parser->column = 0; parser->first_nonspace = 0; + parser->first_nonspace_column = 0; parser->indent = 0; parser->blank = false; parser->curline = line; @@ -555,15 +559,52 @@ static void chop_trailing_hashtags(cmark_chunk *ch) static void S_find_first_nonspace(cmark_parser *parser, cmark_chunk *input) { + char c; + int chars_to_tab = TAB_STOP - (parser->column % TAB_STOP); + parser->first_nonspace = parser->offset; - while (peek_at(input, parser->first_nonspace) == ' ') { - parser->first_nonspace++; + parser->first_nonspace_column = parser->column; + while ((c = peek_at(input, parser->first_nonspace))) { + if (c == ' ') { + 
parser->first_nonspace += 1; + parser->first_nonspace_column += 1; + chars_to_tab = chars_to_tab - 1; + if (chars_to_tab == 0) { + chars_to_tab = TAB_STOP; + } + } else if (c == '\t') { + parser->first_nonspace += 1; + parser->first_nonspace_column += chars_to_tab; + chars_to_tab = TAB_STOP; + } else { + break; + } } - parser->indent = parser->first_nonspace - parser->offset; + parser->indent = parser->first_nonspace_column - parser->column; parser->blank = S_is_line_end_char(peek_at(input, parser->first_nonspace)); } +static void +S_advance_offset(cmark_parser *parser, cmark_chunk *input, bufsize_t count, bool columns) +{ + char c; + int chars_to_tab; + while (count > 0 && (c = peek_at(input, parser->offset))) { + if (c == '\t') { + chars_to_tab = TAB_STOP - (parser->column % TAB_STOP); /* columns remaining until the next tab stop */ + parser->column += chars_to_tab; + parser->offset += 1; + count -= (columns ? chars_to_tab : 1); /* a tab uses up its full width when counting columns, one unit otherwise */ + } else { + parser->offset += 1; + parser->column += 1; // assume ascii; block starts are ascii + count -= 1; + } + } +} + + static void S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t bytes) { @@ -578,8 +619,9 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte cmark_chunk input; bool maybe_lazy; - utf8proc_detab(parser->curline, buffer, bytes); + cmark_strbuf_put(parser->curline, buffer, bytes); parser->offset = 0; + parser->column = 0; parser->blank = false; input.data = parser->curline->ptr; @@ -601,7 +643,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte if (container->type == NODE_BLOCK_QUOTE) { matched = parser->indent <= 3 && peek_at(&input, parser->first_nonspace) == '>'; if (matched) { - parser->offset = parser->first_nonspace + 1; + S_advance_offset(parser, &input, parser->indent + 1, true); if (peek_at(&input, parser->offset) == ' ') parser->offset++; } else { @@ -609,13 +651,14 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte } } else if 
(container->type == NODE_ITEM) { - if (parser->indent >= container->as.list.marker_offset + container->as.list.padding) { - parser->offset += container->as.list.marker_offset + - container->as.list.padding; + S_advance_offset(parser, &input, + container->as.list.marker_offset + + container->as.list.padding, true); } else if (parser->blank) { - parser->offset = parser->first_nonspace; + S_advance_offset(parser, &input, + parser->first_nonspace - parser->offset, false); } else { all_matched = false; } @@ -624,9 +667,11 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte if (!container->as.code.fenced) { // indented if (parser->indent >= CODE_INDENT) { - parser->offset += CODE_INDENT; + S_advance_offset(parser, &input, CODE_INDENT, true); } else if (parser->blank) { - parser->offset = parser->first_nonspace; + S_advance_offset(parser, &input, + parser->first_nonspace - parser->offset, + false); } else { all_matched = false; } @@ -642,7 +687,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte // closing fence - and since we're at // the end of a line, we can return: all_matched = false; - parser->offset += matched; + S_advance_offset(parser, &input, matched, false); parser->current = finalize(parser, container); goto finished; } else { @@ -650,7 +695,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte i = container->as.code.fence_offset; while (i > 0 && peek_at(&input, parser->offset) == ' ') { - parser->offset++; + S_advance_offset(parser, &input, 1, false); i--; } } @@ -697,15 +742,16 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte if (!indented && peek_at(&input, parser->first_nonspace) == '>') { - parser->offset = parser->first_nonspace + 1; + S_advance_offset(parser, &input, parser->first_nonspace + 1 - parser->offset, false); // optional following character if (peek_at(&input, parser->offset) == ' ') - parser->offset++; + 
S_advance_offset(parser, &input, 1, false); container = add_child(parser, container, NODE_BLOCK_QUOTE, parser->offset + 1); } else if (!indented && (matched = scan_atx_header_start(&input, parser->first_nonspace))) { - parser->offset = parser->first_nonspace + matched; + S_advance_offset(parser, &input, + parser->first_nonspace + matched - parser->offset, false); container = add_child(parser, container, NODE_HEADER, parser->offset + 1); bufsize_t hashpos = cmark_chunk_strchr(&input, '#', parser->first_nonspace); @@ -726,7 +772,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte container->as.code.fence_length = matched; container->as.code.fence_offset = parser->first_nonspace - parser->offset; container->as.code.info = cmark_chunk_literal(""); - parser->offset = parser->first_nonspace + matched; + S_advance_offset(parser, &input, parser->first_nonspace + matched - parser->offset, false); } else if (!indented && (matched = scan_html_block_tag(&input, parser->first_nonspace))) { @@ -743,7 +789,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte container->type = NODE_HEADER; container->as.header.level = lev; container->as.header.setext = true; - parser->offset = input.len - 1; + S_advance_offset(parser, &input, input.len - 1 - parser->offset, false); } else if (!indented && !(container->type == NODE_PARAGRAPH && @@ -753,7 +799,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte // it's only now that we know the line is not part of a setext header: container = add_child(parser, container, NODE_HRULE, parser->first_nonspace + 1); container = finalize(parser, container); - parser->offset = input.len - 1; + S_advance_offset(parser, &input, input.len - 1 - parser->offset, false); } else if ((matched = parse_list_marker(&input, parser->first_nonspace, &data)) && (!indented || container->type == NODE_LIST)) { @@ -761,7 +807,7 @@ S_process_line(cmark_parser *parser, const 
unsigned char *buffer, bufsize_t byte // spaces indent, as long as the list container is still open. // compute padding: - parser->offset = parser->first_nonspace + matched; + S_advance_offset(parser, &input, parser->first_nonspace + matched - parser->offset, false); i = 0; while (i <= 5 && peek_at(&input, parser->offset + i) == ' ') { i++; @@ -771,11 +817,11 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte S_is_line_end_char(peek_at(&input, parser->offset))) { data->padding = matched + 1; if (i > 0) { - parser->offset += 1; + S_advance_offset(parser, &input, 1, false); } } else { data->padding = matched + i; - parser->offset += i; + S_advance_offset(parser, &input, i, true); } // check container; if it's a list, see if this list item @@ -799,7 +845,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte free(data); } else if (indented && !maybe_lazy && !parser->blank) { - parser->offset += CODE_INDENT; + S_advance_offset(parser, &input, CODE_INDENT, true); container = add_child(parser, container, NODE_CODE_BLOCK, parser->offset + 1); container->as.code.fenced = false; container->as.code.fence_char = 0; diff --git a/src/parser.h b/src/parser.h index 6e18c67..01a7aeb 100644 --- a/src/parser.h +++ b/src/parser.h @@ -17,7 +17,9 @@ struct cmark_parser { struct cmark_node* current; int line_number; bufsize_t offset; + bufsize_t column; bufsize_t first_nonspace; + bufsize_t first_nonspace_column; int indent; bool blank; cmark_strbuf *curline; -- cgit v1.2.3