diff options
author | Nick Wellnhofer <wellnhofer@aevum.de> | 2020-01-19 00:51:02 +0100 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2020-01-23 08:25:54 -0800 |
commit | b0a4cfa36e99c27dd2b20be8f8888fa7721bad58 (patch) | |
tree | 528ace24d0526b0dd647bcd774f348e677b78a9f | |
parent | 75b48c5938f5984dbcf79a579d15c9cbd6447d12 (diff) |
Use C string instead of chunk for literal text
Use zero-terminated C strings and a separate length field instead of
cmark_chunks. Literal inline text will now be copied from the parent
block's content buffer, slowing the benchmark down by 10-15%.
The node struct never references memory of other nodes now, fixing #309.
Node accessors don't have to check for delayed creation of C strings,
so parsing and iterating all literals using the public API should
actually be faster than before.
-rw-r--r-- | api_test/main.c | 19 | ||||
-rw-r--r-- | src/blocks.c | 3 | ||||
-rw-r--r-- | src/commonmark.c | 3 | ||||
-rw-r--r-- | src/inlines.c | 59 | ||||
-rw-r--r-- | src/iterator.c | 5 | ||||
-rw-r--r-- | src/node.c | 7 | ||||
-rw-r--r-- | src/node.h | 8 |
7 files changed, 71 insertions, 33 deletions
diff --git a/api_test/main.c b/api_test/main.c index e7fccbd..994ee39 100644 --- a/api_test/main.c +++ b/api_test/main.c @@ -243,6 +243,21 @@ static void accessors(test_batch_runner *runner) { cmark_node_free(doc); } +static void free_parent(test_batch_runner *runner) { + static const char markdown[] = "text\n"; + + cmark_node *doc = + cmark_parse_document(markdown, sizeof(markdown) - 1, CMARK_OPT_DEFAULT); + + cmark_node *para = cmark_node_first_child(doc); + cmark_node *text = cmark_node_first_child(para); + cmark_node_unlink(text); + cmark_node_free(doc); + STR_EQ(runner, cmark_node_get_literal(text), "text", + "inline content after freeing parent block"); + cmark_node_free(text); +} + static void node_check(test_batch_runner *runner) { // Construct an incomplete tree. cmark_node *doc = cmark_node_new(CMARK_NODE_DOCUMENT); @@ -381,9 +396,6 @@ static void create_tree(test_batch_runner *runner) { free(html); cmark_node_free(doc); - - // TODO: Test that the contents of an unlinked inline are valid - // after the parent block was destroyed. This doesn't work so far. cmark_node_free(emph); } @@ -1031,6 +1043,7 @@ int main() { version(runner); constructor(runner); accessors(runner); + free_parent(runner); node_check(runner); iterator(runner); iterator_delete(runner); diff --git a/src/blocks.c b/src/blocks.c index 5214f47..9970cc9 100644 --- a/src/blocks.c +++ b/src/blocks.c @@ -322,7 +322,8 @@ static cmark_node *finalize(cmark_parser *parser, cmark_node *b) { break; case CMARK_NODE_HTML_BLOCK: - b->as.literal = cmark_chunk_buf_detach(node_content); + b->as.literal.len = node_content->size; + b->as.literal.data = cmark_strbuf_detach(node_content); break; case CMARK_NODE_LIST: // determine tight/loose status diff --git a/src/commonmark.c b/src/commonmark.c index 89aef5b..41bfa52 100644 --- a/src/commonmark.c +++ b/src/commonmark.c @@ -146,8 +146,7 @@ static bool is_autolink(cmark_node *node) { if (strcmp((const char *)url, "mailto:") == 0) { url += 7; } - return strncmp((const char *)url, (char *)link_text->as.literal.data, - link_text->as.literal.len) == 0; + return strcmp((const char *)url, (char *)link_text->as.literal.data) == 0; } // if node is a block node, returns node. diff --git a/src/inlines.c b/src/inlines.c index 7d584ca..2c13546 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -22,9 +22,6 @@ static const char *LEFTSINGLEQUOTE = "\xE2\x80\x98"; static const char *RIGHTSINGLEQUOTE = "\xE2\x80\x99"; // Macros for creating various kinds of simple. -#define make_str(subj, sc, ec, s) make_literal(subj, CMARK_NODE_TEXT, sc, ec, s) -#define make_code(subj, sc, ec, s) make_literal(subj, CMARK_NODE_CODE, sc, ec, s) -#define make_raw_html(subj, sc, ec, s) make_literal(subj, CMARK_NODE_HTML_INLINE, sc, ec, s) #define make_linebreak(mem) make_simple(mem, CMARK_NODE_LINEBREAK) #define make_softbreak(mem) make_simple(mem, CMARK_NODE_SOFTBREAK) #define make_emph(mem) make_simple(mem, CMARK_NODE_EMPH) @@ -81,12 +78,10 @@ static bufsize_t subject_find_special_char(subject *subj, int options); // Create an inline with a literal string value. static CMARK_INLINE cmark_node *make_literal(subject *subj, cmark_node_type t, - int start_column, int end_column, - cmark_chunk s) { + int start_column, int end_column) { cmark_node *e = (cmark_node *)subj->mem->calloc(1, sizeof(*e)); cmark_strbuf_init(subj->mem, &e->content, 0); e->type = (uint16_t)t; - e->as.literal = s; e->start_line = e->end_line = subj->line; // columns are 1 based. e->start_column = start_column + 1 + subj->column_offset + subj->block_offset; @@ -102,6 +97,23 @@ static CMARK_INLINE cmark_node *make_simple(cmark_mem *mem, cmark_node_type t) { return e; } +static cmark_node *make_str(subject *subj, int sc, int ec, cmark_chunk s) { + cmark_node *e = make_literal(subj, CMARK_NODE_TEXT, sc, ec); + e->as.literal.data = (unsigned char *)subj->mem->realloc(NULL, s.len + 1); + memcpy(e->as.literal.data, s.data, s.len); + e->as.literal.data[s.len] = 0; + e->as.literal.len = s.len; + return e; +} + +static cmark_node *make_str_from_buf(subject *subj, int sc, int ec, + cmark_strbuf *buf) { + cmark_node *e = make_literal(subj, CMARK_NODE_TEXT, sc, ec); + e->as.literal.len = buf->size; + e->as.literal.data = cmark_strbuf_detach(buf); + return e; +} + // Like make_str, but parses entities. static cmark_node *make_str_with_entities(subject *subj, int start_column, int end_column, @@ -109,7 +121,7 @@ static cmark_node *make_str_with_entities(subject *subj, cmark_strbuf unescaped = CMARK_BUF_INIT(subj->mem); if (houdini_unescape_html(&unescaped, content->data, content->len)) { - return make_str(subj, start_column, end_column, cmark_chunk_buf_detach(&unescaped)); + return make_str_from_buf(subj, start_column, end_column, &unescaped); } else { return make_str(subj, start_column, end_column, *content); } @@ -368,7 +380,10 @@ static cmark_node *handle_backticks(subject *subj, int options) { endpos - startpos - openticks.len); S_normalize_code(&buf); - cmark_node *node = make_code(subj, startpos, endpos - openticks.len - 1, cmark_chunk_buf_detach(&buf)); + cmark_node *node = make_literal(subj, CMARK_NODE_CODE, startpos, + endpos - openticks.len - 1); + node->as.literal.len = buf.size; + node->as.literal.data = cmark_strbuf_detach(&buf); adjust_subj_node_newlines(subj, node, endpos - startpos, openticks.len, options); return node; } @@ -579,7 +594,7 @@ static cmark_node *handle_hyphen(subject *subj, bool smart) { cmark_strbuf_puts(&buf, ENDASH); } - return make_str(subj, startpos, subj->pos - 1, cmark_chunk_buf_detach(&buf)); + return make_str_from_buf(subj, startpos, subj->pos - 1, &buf); } // Assumes we have a period at the current position. @@ -656,19 +671,15 @@ static void process_emphasis(subject *subj, delimiter *stack_bottom) { closer = closer->next; } } else if (closer->delim_char == '\'') { - cmark_chunk_free(subj->mem, &closer->inl_text->as.literal); - closer->inl_text->as.literal = cmark_chunk_literal(RIGHTSINGLEQUOTE); + cmark_node_set_literal(closer->inl_text, RIGHTSINGLEQUOTE); if (opener_found) { - cmark_chunk_free(subj->mem, &opener->inl_text->as.literal); - opener->inl_text->as.literal = cmark_chunk_literal(LEFTSINGLEQUOTE); + cmark_node_set_literal(opener->inl_text, LEFTSINGLEQUOTE); } closer = closer->next; } else if (closer->delim_char == '"') { - cmark_chunk_free(subj->mem, &closer->inl_text->as.literal); - closer->inl_text->as.literal = cmark_chunk_literal(RIGHTDOUBLEQUOTE); + cmark_node_set_literal(closer->inl_text, RIGHTDOUBLEQUOTE); if (opener_found) { - cmark_chunk_free(subj->mem, &opener->inl_text->as.literal); - opener->inl_text->as.literal = cmark_chunk_literal(LEFTDOUBLEQUOTE); + cmark_node_set_literal(opener->inl_text, LEFTDOUBLEQUOTE); } closer = closer->next; } @@ -709,7 +720,9 @@ static delimiter *S_insert_emph(subject *subj, delimiter *opener, opener_num_chars -= use_delims; closer_num_chars -= use_delims; opener_inl->as.literal.len = opener_num_chars; + opener_inl->as.literal.data[opener_num_chars] = 0; closer_inl->as.literal.len = closer_num_chars; + closer_inl->as.literal.data[closer_num_chars] = 0; // free delimiters between opener and closer delim = closer->previous; @@ -785,7 +798,7 @@ static cmark_node *handle_entity(subject *subj) { return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("&")); subj->pos += len; - return make_str(subj, subj->pos - 1 - len, subj->pos - 1, cmark_chunk_buf_detach(&ent)); + return make_str_from_buf(subj, subj->pos - 1 - len, subj->pos - 1, &ent); } // Clean a URL: remove surrounding whitespace, and remove \ that escape @@ -853,9 +866,15 @@ static cmark_node *handle_pointy_brace(subject *subj, int options) { // finally, try to match an html tag matchlen = scan_html_tag(&subj->input, subj->pos); if (matchlen > 0) { - contents = cmark_chunk_dup(&subj->input, subj->pos - 1, matchlen + 1); + unsigned char *src = subj->input.data + subj->pos - 1; + bufsize_t len = matchlen + 1; subj->pos += matchlen; - cmark_node *node = make_raw_html(subj, subj->pos - matchlen - 1, subj->pos - 1, contents); + cmark_node *node = make_literal(subj, CMARK_NODE_HTML_INLINE, + subj->pos - matchlen - 1, subj->pos - 1); + node->as.literal.data = (unsigned char *)subj->mem->realloc(NULL, len + 1); + memcpy(node->as.literal.data, src, len); + node->as.literal.data[len] = 0; + node->as.literal.len = len; adjust_subj_node_newlines(subj, node, matchlen, 1, options); return node; } diff --git a/src/iterator.c b/src/iterator.c index f5cd802..cd7db8e 100644 --- a/src/iterator.c +++ b/src/iterator.c @@ -111,8 +111,9 @@ void cmark_consolidate_text_nodes(cmark_node *root) { cmark_node_free(tmp); tmp = next; } - cmark_chunk_free(iter->mem, &cur->as.literal); - cur->as.literal = cmark_chunk_buf_detach(&buf); + iter->mem->free(cur->as.literal.data); + cur->as.literal.len = buf.size; + cur->as.literal.data = cmark_strbuf_detach(&buf); } } @@ -116,7 +116,7 @@ static void S_free_nodes(cmark_node *e) { case CMARK_NODE_HTML_INLINE: case CMARK_NODE_CODE: case CMARK_NODE_HTML_BLOCK: - cmark_chunk_free(NODE_MEM(e), &e->as.literal); + NODE_MEM(e)->free(e->as.literal.data); break; case CMARK_NODE_LINK: case CMARK_NODE_IMAGE: @@ -295,7 +295,7 @@ const char *cmark_node_get_literal(cmark_node *node) { case CMARK_NODE_TEXT: case CMARK_NODE_HTML_INLINE: case CMARK_NODE_CODE: - return cmark_chunk_to_cstr(NODE_MEM(node), &node->as.literal); + return node->as.literal.data ? (char *)node->as.literal.data : ""; case CMARK_NODE_CODE_BLOCK: return (char *)node->as.code.literal; @@ -317,7 +317,8 @@ int cmark_node_set_literal(cmark_node *node, const char *content) { case CMARK_NODE_TEXT: case CMARK_NODE_HTML_INLINE: case CMARK_NODE_CODE: - cmark_chunk_set_cstr(NODE_MEM(node), &node->as.literal, content); + node->as.literal.len = cmark_set_cstr(NODE_MEM(node), + &node->as.literal.data, content); return 1; case CMARK_NODE_CODE_BLOCK: @@ -10,7 +10,11 @@ extern "C" { #include "cmark.h" #include "buffer.h" -#include "chunk.h" + +typedef struct { + unsigned char *data; + bufsize_t len; +} cmark_literal; typedef struct { cmark_list_type list_type; @@ -72,7 +76,7 @@ struct cmark_node { uint16_t flags; union { - cmark_chunk literal; + cmark_literal literal; cmark_list list; cmark_code code; cmark_heading heading; |