diff options
-rwxr-xr-x | CMakeLists.txt | 4 | ||||
-rw-r--r-- | README.md | 4 | ||||
-rw-r--r-- | api_test/main.c | 97 | ||||
-rw-r--r-- | changelog.txt | 149 | ||||
-rw-r--r-- | src/CMakeLists.txt | 12 | ||||
-rw-r--r-- | src/blocks.c | 13 | ||||
-rw-r--r-- | src/inlines.c | 262 | ||||
-rw-r--r-- | src/inlines.h | 2 | ||||
-rw-r--r-- | src/iterator.c | 1 | ||||
-rw-r--r-- | src/latex.c | 22 | ||||
-rw-r--r-- | src/libcmark.pc.in | 2 | ||||
-rw-r--r-- | src/main.c | 1 | ||||
-rw-r--r-- | src/node.h | 1 | ||||
-rw-r--r-- | src/scanners.c | 58 | ||||
-rw-r--r-- | src/scanners.re | 2 | ||||
-rw-r--r-- | test/regression.txt | 2 | ||||
-rw-r--r-- | test/smart_punct.txt | 9 | ||||
-rw-r--r-- | test/spec.txt | 80 |
18 files changed, 565 insertions, 156 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 33180e5..4eb0541 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,8 +17,8 @@ endif() set(PROJECT_NAME "cmark") set(PROJECT_VERSION_MAJOR 0) -set(PROJECT_VERSION_MINOR 27) -set(PROJECT_VERSION_PATCH 1) +set(PROJECT_VERSION_MINOR 28) +set(PROJECT_VERSION_PATCH 3) set(PROJECT_VERSION ${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH} ) option(CMARK_TESTS "Build cmark tests and enable testing" ON) @@ -110,9 +110,9 @@ To run a test for memory leaks using `valgrind`: make leakcheck -To reformat source code using `astyle`: +To reformat source code using `clang-format`: - make astyle + make format To run a "fuzz test" against ten long randomly generated inputs: diff --git a/api_test/main.c b/api_test/main.c index d720234..c30dc71 100644 --- a/api_test/main.c +++ b/api_test/main.c @@ -552,9 +552,9 @@ static void render_xml(test_batch_runner *runner) { STR_EQ(runner, xml, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" "<!DOCTYPE document SYSTEM \"CommonMark.dtd\">\n" "<paragraph sourcepos=\"1:1-1:9\">\n" - " <text>foo </text>\n" - " <emph>\n" - " <text>bar</text>\n" + " <text sourcepos=\"1:1-1:4\">foo </text>\n" + " <emph sourcepos=\"1:5-1:9\">\n" + " <text sourcepos=\"1:6-1:8\">bar</text>\n" " </emph>\n" "</paragraph>\n", "render first paragraph with source pos"); @@ -883,6 +883,95 @@ static void test_feed_across_line_ending(test_batch_runner *runner) { cmark_node_free(document); } +static void source_pos(test_batch_runner *runner) { + static const char markdown[] = + "# Hi *there*.\n" + "\n" + "Hello “ <http://www.google.com>\n" + "there `hi` -- [okay](www.google.com (ok)).\n" + "\n" + "> 1. Okay.\n" + "> Sure.\n" + ">\n" + "> 2. 
Yes, okay.\n" + "> ![ok](hi \"yes\")\n"; + + cmark_node *doc = cmark_parse_document(markdown, sizeof(markdown) - 1, CMARK_OPT_DEFAULT); + char *xml = cmark_render_xml(doc, CMARK_OPT_DEFAULT | CMARK_OPT_SOURCEPOS); + STR_EQ(runner, xml, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + "<!DOCTYPE document SYSTEM \"CommonMark.dtd\">\n" + "<document sourcepos=\"1:1-10:20\" xmlns=\"http://commonmark.org/xml/1.0\">\n" + " <heading sourcepos=\"1:1-1:13\" level=\"1\">\n" + " <text sourcepos=\"1:3-1:5\">Hi </text>\n" + " <emph sourcepos=\"1:6-1:12\">\n" + " <text sourcepos=\"1:7-1:11\">there</text>\n" + " </emph>\n" + " <text sourcepos=\"1:13-1:13\">.</text>\n" + " </heading>\n" + " <paragraph sourcepos=\"3:1-4:42\">\n" + " <text sourcepos=\"3:1-3:14\">Hello “ </text>\n" + " <link sourcepos=\"3:15-3:37\" destination=\"http://www.google.com\" title=\"\">\n" + " <text sourcepos=\"3:16-3:36\">http://www.google.com</text>\n" + " </link>\n" + " <softbreak />\n" + " <text sourcepos=\"4:1-4:6\">there </text>\n" + " <code sourcepos=\"4:8-4:9\">hi</code>\n" + " <text sourcepos=\"4:11-4:14\"> -- </text>\n" + " <link sourcepos=\"4:15-4:41\" destination=\"www.google.com\" title=\"ok\">\n" + " <text sourcepos=\"4:16-4:19\">okay</text>\n" + " </link>\n" + " <text sourcepos=\"4:42-4:42\">.</text>\n" + " </paragraph>\n" + " <block_quote sourcepos=\"6:1-10:20\">\n" + " <list sourcepos=\"6:3-10:20\" type=\"ordered\" start=\"1\" delim=\"period\" tight=\"false\">\n" + " <item sourcepos=\"6:3-8:1\">\n" + " <paragraph sourcepos=\"6:6-7:10\">\n" + " <text sourcepos=\"6:6-6:10\">Okay.</text>\n" + " <softbreak />\n" + " <text sourcepos=\"7:6-7:10\">Sure.</text>\n" + " </paragraph>\n" + " </item>\n" + " <item sourcepos=\"9:3-10:20\">\n" + " <paragraph sourcepos=\"9:6-10:20\">\n" + " <text sourcepos=\"9:6-9:15\">Yes, okay.</text>\n" + " <softbreak />\n" + " <image sourcepos=\"10:6-10:20\" destination=\"hi\" title=\"yes\">\n" + " <text sourcepos=\"10:8-10:9\">ok</text>\n" + " </image>\n" + " 
</paragraph>\n" + " </item>\n" + " </list>\n" + " </block_quote>\n" + "</document>\n", + "sourcepos are as expected"); + free(xml); + cmark_node_free(doc); +} + +static void ref_source_pos(test_batch_runner *runner) { + static const char markdown[] = + "Let's try [reference] links.\n" + "\n" + "[reference]: https://github.com (GitHub)\n"; + + cmark_node *doc = cmark_parse_document(markdown, sizeof(markdown) - 1, CMARK_OPT_DEFAULT); + char *xml = cmark_render_xml(doc, CMARK_OPT_DEFAULT | CMARK_OPT_SOURCEPOS); + STR_EQ(runner, xml, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + "<!DOCTYPE document SYSTEM \"CommonMark.dtd\">\n" + "<document sourcepos=\"1:1-3:40\" xmlns=\"http://commonmark.org/xml/1.0\">\n" + " <paragraph sourcepos=\"1:1-1:28\">\n" + " <text sourcepos=\"1:1-1:10\">Let's try </text>\n" + " <link sourcepos=\"1:11-1:21\" destination=\"https://github.com\" title=\"GitHub\">\n" + " <text sourcepos=\"1:12-1:20\">reference</text>\n" + " </link>\n" + " <text sourcepos=\"1:22-1:28\"> links.</text>\n" + " </paragraph>\n" + "</document>\n", + "sourcepos are as expected"); + free(xml); + cmark_node_free(doc); +} + int main() { int retval; test_batch_runner *runner = test_batch_runner_new(); @@ -908,6 +997,8 @@ int main() { test_cplusplus(runner); test_safe(runner); test_feed_across_line_ending(runner); + source_pos(runner); + ref_source_pos(runner); test_print_summary(runner); retval = test_ok(runner) ? 0 : 1; diff --git a/changelog.txt b/changelog.txt index 883ef6c..33cff54 100644 --- a/changelog.txt +++ b/changelog.txt @@ -1,3 +1,152 @@ +[0.28.3] + + * Include GNUInstallDirs in src/CMakeLists.txt (Nick Wellnhofer, #240). + This fixes build problems on some cmake versions (#241). + +[0.28.2] + + * Fixed regression in install dest for static library (#238). + Due to a mistake, 0.28.1 installed libcmark.a into include/. + +[0.28.1] + + * `--smart`: open quote can never occur right after `]` or `)` (#227). + * Fix quadratic behavior in `finalize` (Vicent Marti). 
+ * Don't use `CMAKE_INSTALL_LIBDIR` to create `libcmark.pc` (#236). + This wasn't getting set in processing `libcmark.pc.in`, and we + were getting the wrong entry in `libcmark.pc`. + The new approach sets an internal `libdir` variable to + `lib${LIB_SUFFIX}`. This variable is used both to set the + install destination and in the libcmark.pc.in template. + * Update README.md, replace `make astyle` with `make format` + (Nguyễn Thái Ngọc Duy). + +[0.28.0] + + * Update spec. + * Use unsigned integer when shifting (Phil Turnbull). + Avoids a UBSAN warning which can be triggered when handling a + long sequence of backticks. + * Avoid memcpy'ing NULL pointers (Phil Turnbull). + Avoids a UBSAN warning when link title is empty string. + The length of the memcpy is zero so the NULL pointer is not + dereferenced but it is still undefined behaviour. + * DeMorgan simplification of some tests in emphasis parser. + This also brings the code into closer alignment with the wording + of the spec (see jgm/CommonMark#467). + * Fixed undefined shift in commonmark writer (#211). + Found by google/oss-fuzz: + <https://oss-fuzz.com/v2/testcase-detail/4686992824598528>. + * latex writer: fix memory overflow (#210). + We got an array overflow in enumerated lists nested more than + 10 deep with start number =/= 1. + This commit also ensures that we don't try to set `enum_` counters + that aren't defined by LaTeX (generally up to enumv). + Found by google/oss-fuzz: + <https://oss-fuzz.com/v2/testcase-detail/5546760854306816>. + * Check for NULL pointer in get_link_type (Phil Turnbull). + `echo '[](xx:)' | ./build/src/cmark -t latex` gave a + segfault. + * Move fuzzing dictionary into single file (Phil Turnbull). + This allows AFL and libFuzzer to use the same dictionary + * Reset bytes after UTF8 proc (Yuki Izumi, #206). + * Don't scan past an EOL (Yuki Izumi). 
+ The existing negated character classes (`[^…]`) are careful to + always include` \x00` in the characters excluded, but these `.` + catch-alls can scan right past the terminating NUL placed + at the end of the buffer by `_scan_at`. As such, buffer + overruns can occur. Also, don't scan past a newline in HTML + block end scanners. + * Document cases where `get_` functions return `NULL` (#155). + E.g. `cmark_node_get_url` on a non-link or image. + * Properly handle backslashes in link destinations (#192). + Only ascii punctuation characters are escapable, per the spec. + * Fixed `cmark_node_get_list_start` to return 0 for bullet lists, + as documented (#202). + * Use `CMARK_NO_DELIM` for bullet lists (#201). + * Fixed code for freeing delimiter stack (#189). + * Removed abort outside of conditional (typo). + * Removed coercion in error message when aborting from buffer. + * Print message to stderr when we abort due to memory demands (#188). + * `libcmark.pc`: use `CMAKE_INSTALL_LIBDIR` (#185, Jens Petersen). + Needed for multilib distros like Fedora. + * Fixed buffer overflow error in `S_parser_feed` (#184). + The overflow could occur in the following condition: + the buffer ends with `\r` and the next memory address + contains `\n`. + * Update emphasis parsing for spec change. + Strong now goes inside Emph rather than the reverse, + when both scopes are possible. The code is much simpler. + This also avoids a spec inconsistency that cmark had previously: + `***hi***` became Strong (Emph "hi")) but + `***hi****` became Emph (Strong "hi")) "*" + * Fixes for the LaTeX renderer (#182, Doeme) + + Don't double-output the link in latex-rendering. + + Prevent ligatures in dashes sensibly when rendering latex. + `\-` is a hyphenation, so it doesn't get displayed at all. + * Added a test for NULL when freeing `subj->last_delim`. + * Cleaned up setting of lower bounds for openers. + We now use a much smaller array. + * Fix #178, quadratic parsing bug. Add pathological test. 
+ * Slight improvement of clarity of logic in emph matching. + * Fix "multiple of 3" determination in emph/strong parsing. + We need to store the length of the original delimiter run, + instead of using the length of the remaining delimiters + after some have been subtracted. Test case: + `a***b* c*`. Thanks to Raph Levin for reporting. + * Correctly initialize chunk in S_process_line (Nick Wellnhofer, #170). + The `alloc` member wasn't initialized. This also allows to add an + assertion in `chunk_rtrim` which doesn't work for alloced chunks. + * Added 'make newbench'. + * `scanners.c` generated with re2c 0.16 (68K smaller!). + * `scanners.re` - fixed warnings; use `*` for fallback. + * Fixed some warnings in `scanners.re`. + * Update CaseFolding to latest (Kevin Wojniak, #168). + * Allow balanced nested parens in link destinations (Yuki Izumi, #166) + * Allocate enough bytes for backticks array. + * Inlines: Ensure that the delimiter stack is freed in subject. + * Fixed pathological cases with backtick code spans: + + - Removed recursion in scan_to_closing_backticks + - Added an array of pointers to potential backtick closers + to subject + - This array is used to avoid traversing the subject again + when we've already seen all the potential backtick closers. + - Added a max bound of 1000 for backtick code span delimiters. + - This helps with pathological cases like: + + x + x ` + x `` + x ``` + x ```` + ... + + - Added pathological test case. + + Thanks to Martin Mitáš for identifying the problem and for + discussion of solutions. + * Remove redundant cmake_minimum_required (#163, @kainjow). + * Make shared and static libraries optional (Azamat H. Hackimov). + Now you can enable/disable compilation and installation targets for + shared and static libraries via `-DCMARK_SHARED=ON/OFF` and + `-DCMARK_STATIC=ON/OFF`. + * Added support for built-in `${LIB_SUFFIX}` feature (Azamat H. + Hackimov). 
Replaced `${LIB_INSTALL_DIR}` option with built-in + `${LIB_SUFFIX}` for installing for 32/64-bit systems. Normally, + CMake will set `${LIB_SUFFIX}` automatically for required enviroment. + If you have any issues with it, you can override this option with + `-DLIB_SUFFIX=64` or `-DLIB_SUFFIX=""` during configuration. + * Add Makefile target and harness to fuzz with libFuzzer (Phil Turnbull). + This can be run locally with `make libFuzzer` but the harness will be + integrated into oss-fuzz for large-scale fuzzing. + * Advertise `--validate-utf8` in usage information + (Nguyễn Thái Ngọc Duy). + * Makefile: use warnings with re2c. + * README: Add link to Python wrapper, prettify languages list + (Pavlo Kapyshin). + * README: Add link to cmark-scala (Tim Nieradzik, #196) + [0.27.1] * Set policy for CMP0063 to avoid a warning (#162). diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3197196..d5a1936 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,6 +2,8 @@ if(${CMAKE_VERSION} VERSION_GREATER "3.3") cmake_policy(SET CMP0063 NEW) endif() +include(GNUInstallDirs) + set(LIBRARY "libcmark") set(STATICLIBRARY "libcmark_static") set(HEADERS @@ -123,19 +125,21 @@ endif(MSVC) set(CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS_NO_WARNINGS ON) +set(libdir lib${LIB_SUFFIX}) + include (InstallRequiredSystemLibraries) install(TARGETS ${PROGRAM} ${CMARK_INSTALL} EXPORT cmark RUNTIME DESTINATION bin - LIBRARY DESTINATION lib${LIB_SUFFIX} - ARCHIVE DESTINATION lib${LIB_SUFFIX} + LIBRARY DESTINATION ${libdir} + ARCHIVE DESTINATION ${libdir} ) if(CMARK_SHARED OR CMARK_STATIC) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/libcmark.pc.in ${CMAKE_CURRENT_BINARY_DIR}/libcmark.pc @ONLY) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libcmark.pc - DESTINATION lib${LIB_SUFFIX}/pkgconfig) + DESTINATION ${libdir}/pkgconfig) install(FILES cmark.h @@ -144,7 +148,7 @@ if(CMARK_SHARED OR CMARK_STATIC) DESTINATION include ) - install(EXPORT cmark DESTINATION lib${LIB_SUFFIX}/cmake) + install(EXPORT 
cmark DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake) endif() # Feature tests diff --git a/src/blocks.c b/src/blocks.c index 5a293b2..7f58ffd 100644 --- a/src/blocks.c +++ b/src/blocks.c @@ -255,17 +255,21 @@ static cmark_node *finalize(cmark_parser *parser, cmark_node *b) { switch (S_type(b)) { case CMARK_NODE_PARAGRAPH: - while (cmark_strbuf_at(node_content, 0) == '[' && - (pos = cmark_parse_reference_inline(parser->mem, node_content, - parser->refmap))) { + { + cmark_chunk chunk = {node_content->ptr, node_content->size, 0}; + while (chunk.len && chunk.data[0] == '[' && + (pos = cmark_parse_reference_inline(parser->mem, &chunk, parser->refmap))) { - cmark_strbuf_drop(node_content, pos); + chunk.data += pos; + chunk.len -= pos; } + cmark_strbuf_drop(node_content, (node_content->size - chunk.len)); if (is_blank(node_content, 0)) { // remove blank node (former reference def) cmark_node_free(b); } break; + } case CMARK_NODE_CODE_BLOCK: if (!b->as.code.fenced) { // indented code @@ -900,6 +904,7 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container, (*container)->as.heading.level = level; (*container)->as.heading.setext = false; + (*container)->internal_offset = matched; } else if (!indented && (matched = scan_open_code_fence( input, parser->first_nonspace))) { diff --git a/src/inlines.c b/src/inlines.c index c95809c..d0ab253 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -22,9 +22,9 @@ static const char *LEFTSINGLEQUOTE = "\xE2\x80\x98"; static const char *RIGHTSINGLEQUOTE = "\xE2\x80\x99"; // Macros for creating various kinds of simple. 
-#define make_str(mem, s) make_literal(mem, CMARK_NODE_TEXT, s) -#define make_code(mem, s) make_literal(mem, CMARK_NODE_CODE, s) -#define make_raw_html(mem, s) make_literal(mem, CMARK_NODE_HTML_INLINE, s) +#define make_str(subj, sc, ec, s) make_literal(subj, CMARK_NODE_TEXT, sc, ec, s) +#define make_code(subj, sc, ec, s) make_literal(subj, CMARK_NODE_CODE, sc, ec, s) +#define make_raw_html(subj, sc, ec, s) make_literal(subj, CMARK_NODE_HTML_INLINE, sc, ec, s) #define make_linebreak(mem) make_simple(mem, CMARK_NODE_LINEBREAK) #define make_softbreak(mem) make_simple(mem, CMARK_NODE_SOFTBREAK) #define make_emph(mem) make_simple(mem, CMARK_NODE_EMPH) @@ -55,7 +55,10 @@ typedef struct bracket { typedef struct { cmark_mem *mem; cmark_chunk input; + int line; bufsize_t pos; + int block_offset; + int column_offset; cmark_reference_map *refmap; delimiter *last_delim; bracket *last_bracket; @@ -72,17 +75,22 @@ static delimiter *S_insert_emph(subject *subj, delimiter *opener, static int parse_inline(subject *subj, cmark_node *parent, int options); -static void subject_from_buf(cmark_mem *mem, subject *e, cmark_strbuf *buffer, - cmark_reference_map *refmap); +static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset, subject *e, + cmark_chunk *chunk, cmark_reference_map *refmap); static bufsize_t subject_find_special_char(subject *subj, int options); // Create an inline with a literal string value. -static CMARK_INLINE cmark_node *make_literal(cmark_mem *mem, cmark_node_type t, +static CMARK_INLINE cmark_node *make_literal(subject *subj, cmark_node_type t, + int start_column, int end_column, cmark_chunk s) { - cmark_node *e = (cmark_node *)mem->calloc(1, sizeof(*e)); - cmark_strbuf_init(mem, &e->content, 0); - e->type = t; + cmark_node *e = (cmark_node *)subj->mem->calloc(1, sizeof(*e)); + cmark_strbuf_init(subj->mem, &e->content, 0); + e->type = (uint16_t)t; e->as.literal = s; + e->start_line = e->end_line = subj->line; + // columns are 1 based. 
+ e->start_column = start_column + 1 + subj->column_offset + subj->block_offset; + e->end_column = end_column + 1 + subj->column_offset + subj->block_offset; return e; } @@ -95,14 +103,15 @@ static CMARK_INLINE cmark_node *make_simple(cmark_mem *mem, cmark_node_type t) { } // Like make_str, but parses entities. -static cmark_node *make_str_with_entities(cmark_mem *mem, +static cmark_node *make_str_with_entities(subject *subj, + int start_column, int end_column, cmark_chunk *content) { - cmark_strbuf unescaped = CMARK_BUF_INIT(mem); + cmark_strbuf unescaped = CMARK_BUF_INIT(subj->mem); if (houdini_unescape_html(&unescaped, content->data, content->len)) { - return make_str(mem, cmark_chunk_buf_detach(&unescaped)); + return make_str(subj, start_column, end_column, cmark_chunk_buf_detach(&unescaped)); } else { - return make_str(mem, *content); + return make_str(subj, start_column, end_column, *content); } } @@ -140,23 +149,28 @@ static cmark_chunk cmark_clean_autolink(cmark_mem *mem, cmark_chunk *url, return cmark_chunk_buf_detach(&buf); } -static CMARK_INLINE cmark_node *make_autolink(cmark_mem *mem, cmark_chunk url, - int is_email) { - cmark_node *link = make_simple(mem, CMARK_NODE_LINK); - link->as.link.url = cmark_clean_autolink(mem, &url, is_email); +static CMARK_INLINE cmark_node *make_autolink(subject *subj, + int start_column, int end_column, + cmark_chunk url, int is_email) { + cmark_node *link = make_simple(subj->mem, CMARK_NODE_LINK); + link->as.link.url = cmark_clean_autolink(subj->mem, &url, is_email); link->as.link.title = cmark_chunk_literal(""); - cmark_node_append_child(link, make_str_with_entities(mem, &url)); + link->start_line = link->end_line = subj->line; + link->start_column = start_column + 1; + link->end_column = end_column + 1; + cmark_node_append_child(link, make_str_with_entities(subj, start_column + 1, end_column - 1, &url)); return link; } -static void subject_from_buf(cmark_mem *mem, subject *e, cmark_strbuf *buffer, - cmark_reference_map 
*refmap) { +static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset, subject *e, + cmark_chunk *chunk, cmark_reference_map *refmap) { int i; e->mem = mem; - e->input.data = buffer->ptr; - e->input.len = buffer->size; - e->input.alloc = 0; + e->input = *chunk; + e->line = line_number; e->pos = 0; + e->block_offset = block_offset; + e->column_offset = 0; e->refmap = refmap; e->last_delim = NULL; e->last_bracket = NULL; @@ -223,6 +237,47 @@ static CMARK_INLINE cmark_chunk take_while(subject *subj, int (*f)(int)) { return cmark_chunk_dup(&subj->input, startpos, len); } +// Return the number of newlines in a given span of text in a subject. If +// the number is greater than zero, also return the number of characters +// between the last newline and the end of the span in `since_newline`. +static int count_newlines(subject *subj, bufsize_t from, bufsize_t len, int *since_newline) { + int nls = 0; + int since_nl = 0; + + while (len--) { + if (subj->input.data[from++] == '\n') { + ++nls; + since_nl = 0; + } else { + ++since_nl; + } + } + + if (!nls) + return 0; + + *since_newline = since_nl; + return nls; +} + +// Adjust `node`'s `end_line`, `end_column`, and `subj`'s `line` and +// `column_offset` according to the number of newlines in a just-matched span +// of text in `subj`. +static void adjust_subj_node_newlines(subject *subj, cmark_node *node, int matchlen, int extra, int options) { + if (!(options & CMARK_OPT_SOURCEPOS)) { + return; + } + + int since_newline; + int newlines = count_newlines(subj, subj->pos - matchlen - extra, matchlen, &since_newline); + if (newlines) { + subj->line += newlines; + node->end_line += newlines; + node->end_column = since_newline; + subj->column_offset = -subj->pos + since_newline + extra; + } +} + // Try to process a backtick code span that began with a // span of ticks of length openticklength length (already // parsed). 
Return 0 if you don't find matching closing @@ -270,14 +325,14 @@ static bufsize_t scan_to_closing_backticks(subject *subj, // Parse backtick code section or raw backticks, return an inline. // Assumes that the subject has a backtick at the current position. -static cmark_node *handle_backticks(subject *subj) { +static cmark_node *handle_backticks(subject *subj, int options) { cmark_chunk openticks = take_while(subj, isbacktick); bufsize_t startpos = subj->pos; bufsize_t endpos = scan_to_closing_backticks(subj, openticks.len); if (endpos == 0) { // not found subj->pos = startpos; // rewind - return make_str(subj->mem, openticks); + return make_str(subj, subj->pos, subj->pos, openticks); } else { cmark_strbuf buf = CMARK_BUF_INIT(subj->mem); @@ -286,7 +341,9 @@ static cmark_node *handle_backticks(subject *subj) { cmark_strbuf_trim(&buf); cmark_strbuf_normalize_whitespace(&buf); - return make_code(subj->mem, cmark_chunk_buf_detach(&buf)); + cmark_node *node = make_code(subj, startpos, endpos - openticks.len - 1, cmark_chunk_buf_detach(&buf)); + adjust_subj_node_newlines(subj, node, endpos - startpos, openticks.len, options); + return node; } } @@ -345,7 +402,8 @@ static int scan_delims(subject *subj, unsigned char c, bool *can_open, *can_close = right_flanking && (!left_flanking || cmark_utf8proc_is_punctuation(after_char)); } else if (c == '\'' || c == '"') { - *can_open = left_flanking && !right_flanking; + *can_open = left_flanking && !right_flanking && + before_char != ']' && before_char != ')'; *can_close = right_flanking; } else { *can_open = left_flanking; @@ -443,7 +501,7 @@ static cmark_node *handle_delim(subject *subj, unsigned char c, bool smart) { contents = cmark_chunk_dup(&subj->input, subj->pos - numdelims, numdelims); } - inl_text = make_str(subj->mem, contents); + inl_text = make_str(subj, subj->pos - numdelims, subj->pos - 1, contents); if ((can_open || can_close) && (!(c == '\'' || c == '"') || smart)) { push_delimiter(subj, c, can_open, can_close, 
inl_text); @@ -459,7 +517,7 @@ static cmark_node *handle_hyphen(subject *subj, bool smart) { advance(subj); if (!smart || peek_char(subj) != '-') { - return make_str(subj->mem, cmark_chunk_literal("-")); + return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("-")); } while (smart && peek_char(subj) == '-') { @@ -492,7 +550,7 @@ static cmark_node *handle_hyphen(subject *subj, bool smart) { cmark_strbuf_puts(&buf, ENDASH); } - return make_str(subj->mem, cmark_chunk_buf_detach(&buf)); + return make_str(subj, startpos, subj->pos - 1, cmark_chunk_buf_detach(&buf)); } // Assumes we have a period at the current position. @@ -502,12 +560,12 @@ static cmark_node *handle_period(subject *subj, bool smart) { advance(subj); if (peek_char(subj) == '.') { advance(subj); - return make_str(subj->mem, cmark_chunk_literal(ELLIPSES)); + return make_str(subj, subj->pos - 3, subj->pos - 1, cmark_chunk_literal(ELLIPSES)); } else { - return make_str(subj->mem, cmark_chunk_literal("..")); + return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_literal("..")); } } else { - return make_str(subj->mem, cmark_chunk_literal(".")); + return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal(".")); } } @@ -615,7 +673,7 @@ static delimiter *S_insert_emph(subject *subj, delimiter *opener, cmark_node *tmp, *tmpnext, *emph; // calculate the actual number of characters used from this closer - use_delims = (closer_num_chars >= 2 && opener_num_chars >=2) ? 2 : 1; + use_delims = (closer_num_chars >= 2 && opener_num_chars >= 2) ? 2 : 1; // remove used characters from associated inlines. 
opener_num_chars -= use_delims; @@ -643,6 +701,10 @@ static delimiter *S_insert_emph(subject *subj, delimiter *opener, } cmark_node_insert_after(opener_inl, emph); + emph->start_line = emph->end_line = subj->line; + emph->start_column = opener_inl->start_column + subj->column_offset; + emph->end_column = closer_inl->end_column + subj->column_offset; + // if opener has 0 characters, remove it and its associated inline if (opener_num_chars == 0) { cmark_node_free(opener_inl); @@ -669,11 +731,11 @@ static cmark_node *handle_backslash(subject *subj) { if (cmark_ispunct( nextchar)) { // only ascii symbols and newline can be escaped advance(subj); - return make_str(subj->mem, cmark_chunk_dup(&subj->input, subj->pos - 1, 1)); + return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_dup(&subj->input, subj->pos - 1, 1)); } else if (!is_eof(subj) && skip_line_end(subj)) { return make_linebreak(subj->mem); } else { - return make_str(subj->mem, cmark_chunk_literal("\\")); + return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("\\")); } } @@ -689,14 +751,14 @@ static cmark_node *handle_entity(subject *subj) { subj->input.len - subj->pos); if (len == 0) - return make_str(subj->mem, cmark_chunk_literal("&")); + return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("&")); subj->pos += len; - return make_str(subj->mem, cmark_chunk_buf_detach(&ent)); + return make_str(subj, subj->pos - 1 - len, subj->pos - 1, cmark_chunk_buf_detach(&ent)); } -// Clean a URL: remove surrounding whitespace and surrounding <>, -// and remove \ that escape punctuation. +// Clean a URL: remove surrounding whitespace, and remove \ that escape +// punctuation. 
cmark_chunk cmark_clean_url(cmark_mem *mem, cmark_chunk *url) { cmark_strbuf buf = CMARK_BUF_INIT(mem); @@ -707,11 +769,7 @@ cmark_chunk cmark_clean_url(cmark_mem *mem, cmark_chunk *url) { return result; } - if (url->data[0] == '<' && url->data[url->len - 1] == '>') { - houdini_unescape_html_f(&buf, url->data + 1, url->len - 2); - } else { houdini_unescape_html_f(&buf, url->data, url->len); - } cmark_strbuf_unescape(&buf); return cmark_chunk_buf_detach(&buf); @@ -743,7 +801,7 @@ cmark_chunk cmark_clean_title(cmark_mem *mem, cmark_chunk *title) { // Parse an autolink or HTML tag. // Assumes the subject has a '<' character at the current position. -static cmark_node *handle_pointy_brace(subject *subj) { +static cmark_node *handle_pointy_brace(subject *subj, int options) { bufsize_t matchlen = 0; cmark_chunk contents; @@ -755,7 +813,7 @@ static cmark_node *handle_pointy_brace(subject *subj) { contents = cmark_chunk_dup(&subj->input, subj->pos, matchlen - 1); subj->pos += matchlen; - return make_autolink(subj->mem, contents, 0); + return make_autolink(subj, subj->pos - 1 - matchlen, subj->pos - 1, contents, 0); } // next try to match an email autolink @@ -764,7 +822,7 @@ static cmark_node *handle_pointy_brace(subject *subj) { contents = cmark_chunk_dup(&subj->input, subj->pos, matchlen - 1); subj->pos += matchlen; - return make_autolink(subj->mem, contents, 1); + return make_autolink(subj, subj->pos - 1 - matchlen, subj->pos - 1, contents, 1); } // finally, try to match an html tag @@ -772,11 +830,13 @@ static cmark_node *handle_pointy_brace(subject *subj) { if (matchlen > 0) { contents = cmark_chunk_dup(&subj->input, subj->pos - 1, matchlen + 1); subj->pos += matchlen; - return make_raw_html(subj->mem, contents); + cmark_node *node = make_raw_html(subj, subj->pos - matchlen - 1, subj->pos - 1, contents); + adjust_subj_node_newlines(subj, node, matchlen, 1, options); + return node; } // if nothing matches, just return the opening <: - return make_str(subj->mem, 
cmark_chunk_literal("<")); + return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("<")); } // Parse a link label. Returns 1 if successful. @@ -824,24 +884,12 @@ noMatch: subj->pos = startpos; // rewind return 0; } -static bufsize_t manual_scan_link_url(cmark_chunk *input, bufsize_t offset) { + +static bufsize_t manual_scan_link_url_2(cmark_chunk *input, bufsize_t offset, + cmark_chunk *output) { bufsize_t i = offset; size_t nb_p = 0; - if (i < input->len && input->data[i] == '<') { - ++i; - while (i < input->len) { - if (input->data[i] == '>') { - ++i; - break; - } else if (input->data[i] == '\\') - i += 2; - else if (cmark_isspace(input->data[i])) - return -1; - else - ++i; - } - } else { while (i < input->len) { if (input->data[i] == '\\' && i + 1 < input-> len && @@ -862,18 +910,53 @@ static bufsize_t manual_scan_link_url(cmark_chunk *input, bufsize_t offset) { else ++i; } + + if (i >= input->len) + return -1; + + { + cmark_chunk result = {input->data + offset, i - offset, 0}; + *output = result; + } + return i - offset; +} + +static bufsize_t manual_scan_link_url(cmark_chunk *input, bufsize_t offset, + cmark_chunk *output) { + bufsize_t i = offset; + + if (i < input->len && input->data[i] == '<') { + ++i; + while (i < input->len) { + if (input->data[i] == '>') { + ++i; + break; + } else if (input->data[i] == '\\') + i += 2; + else if (cmark_isspace(input->data[i]) || input->data[i] == '<') + return manual_scan_link_url_2(input, offset, output); + else + ++i; + } + } else { + return manual_scan_link_url_2(input, offset, output); } if (i >= input->len) return -1; + + { + cmark_chunk result = {input->data + offset + 1, i - 2 - offset, 0}; + *output = result; + } return i - offset; } + // Return a link, an image, or a literal close bracket. 
static cmark_node *handle_close_bracket(subject *subj) { bufsize_t initial_pos, after_link_text_pos; - bufsize_t starturl, endurl, starttitle, endtitle, endall; - bufsize_t n; - bufsize_t sps; + bufsize_t endurl, starttitle, endtitle, endall; + bufsize_t sps, n; cmark_reference *ref = NULL; cmark_chunk url_chunk, title_chunk; cmark_chunk url, title; @@ -891,13 +974,13 @@ static cmark_node *handle_close_bracket(subject *subj) { opener = subj->last_bracket; if (opener == NULL) { - return make_str(subj->mem, cmark_chunk_literal("]")); + return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]")); } if (!opener->active) { // take delimiter off stack pop_bracket(subj); - return make_str(subj->mem, cmark_chunk_literal("]")); + return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]")); } // If we got here, we matched a potential link/image text. @@ -909,11 +992,11 @@ static cmark_node *handle_close_bracket(subject *subj) { // First, look for an inline link. 
if (peek_char(subj) == '(' && ((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) && - ((n = manual_scan_link_url(&subj->input, subj->pos + 1 + sps)) > -1)) { + ((n = manual_scan_link_url(&subj->input, subj->pos + 1 + sps, + &url_chunk)) > -1)) { // try to parse an explicit link: - starturl = subj->pos + 1 + sps; // after ( - endurl = starturl + n; + endurl = subj->pos + 1 + sps + n; starttitle = endurl + scan_spacechars(&subj->input, endurl); // ensure there are spaces btw url and title @@ -926,7 +1009,6 @@ static cmark_node *handle_close_bracket(subject *subj) { if (peek_at(subj, endall) == ')') { subj->pos = endall + 1; - url_chunk = cmark_chunk_dup(&subj->input, starturl, endurl - starturl); title_chunk = cmark_chunk_dup(&subj->input, starttitle, endtitle - starttitle); url = cmark_clean_url(subj->mem, &url_chunk); @@ -975,12 +1057,15 @@ noMatch: // If we fall through to here, it means we didn't match a link: pop_bracket(subj); // remove this opener from delimiter list subj->pos = initial_pos; - return make_str(subj->mem, cmark_chunk_literal("]")); + return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]")); match: inl = make_simple(subj->mem, is_image ? 
CMARK_NODE_IMAGE : CMARK_NODE_LINK); inl->as.link.url = url; inl->as.link.title = title; + inl->start_line = inl->end_line = subj->line; + inl->start_column = opener->inl_text->start_column; + inl->end_column = subj->pos + subj->column_offset + subj->block_offset; cmark_node_insert_before(opener->inl_text, inl); // Add link text: tmp = opener->inl_text->next; @@ -1027,6 +1112,8 @@ static cmark_node *handle_newline(subject *subj) { if (peek_at(subj, subj->pos) == '\n') { advance(subj); } + ++subj->line; + subj->column_offset = -subj->pos; // skip spaces at beginning of line skip_spaces(subj); if (nlpos > 1 && peek_at(subj, nlpos - 1) == ' ' && @@ -1086,7 +1173,7 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { cmark_node *new_inl = NULL; cmark_chunk contents; unsigned char c; - bufsize_t endpos; + bufsize_t startpos, endpos; c = peek_char(subj); if (c == 0) { return 0; @@ -1097,7 +1184,7 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { new_inl = handle_newline(subj); break; case '`': - new_inl = handle_backticks(subj); + new_inl = handle_backticks(subj, options); break; case '\\': new_inl = handle_backslash(subj); @@ -1106,7 +1193,7 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { new_inl = handle_entity(subj); break; case '<': - new_inl = handle_pointy_brace(subj); + new_inl = handle_pointy_brace(subj, options); break; case '*': case '_': @@ -1122,7 +1209,7 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { break; case '[': advance(subj); - new_inl = make_str(subj->mem, cmark_chunk_literal("[")); + new_inl = make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("[")); push_bracket(subj, false, new_inl); break; case ']': @@ -1132,15 +1219,16 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { advance(subj); if (peek_char(subj) == '[') { advance(subj); - new_inl = make_str(subj->mem, cmark_chunk_literal("![")); + new_inl = 
make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_literal("![")); push_bracket(subj, true, new_inl); } else { - new_inl = make_str(subj->mem, cmark_chunk_literal("!")); + new_inl = make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("!")); } break; default: endpos = subject_find_special_char(subj, options); contents = cmark_chunk_dup(&subj->input, subj->pos, endpos - subj->pos); + startpos = subj->pos; subj->pos = endpos; // if we're at a newline, strip trailing spaces. @@ -1148,7 +1236,7 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { cmark_chunk_rtrim(&contents); } - new_inl = make_str(subj->mem, contents); + new_inl = make_str(subj, startpos, endpos - 1, contents); } if (new_inl != NULL) { cmark_node_append_child(parent, new_inl); @@ -1161,7 +1249,8 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { extern void cmark_parse_inlines(cmark_mem *mem, cmark_node *parent, cmark_reference_map *refmap, int options) { subject subj; - subject_from_buf(mem, &subj, &parent->content, refmap); + cmark_chunk content = {parent->content.ptr, parent->content.size, 0}; + subject_from_buf(mem, parent->start_line, parent->start_column - 1 + parent->internal_offset, &subj, &content, refmap); cmark_chunk_rtrim(&subj.input); while (!is_eof(&subj) && parse_inline(&subj, parent, options)) @@ -1189,7 +1278,7 @@ static void spnl(subject *subj) { // Modify refmap if a reference is encountered. // Return 0 if no reference found, otherwise position of subject // after reference is parsed. 
-bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_strbuf *input, +bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_chunk *input, cmark_reference_map *refmap) { subject subj; @@ -1200,7 +1289,7 @@ bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_strbuf *input, bufsize_t matchlen = 0; bufsize_t beforetitle; - subject_from_buf(mem, &subj, input, NULL); + subject_from_buf(mem, -1, 0, &subj, input, NULL); // parse label: if (!link_label(&subj, &lab) || lab.len == 0) @@ -1215,9 +1304,8 @@ bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_strbuf *input, // parse link url: spnl(&subj); - matchlen = manual_scan_link_url(&subj.input, subj.pos); - if (matchlen > 0) { - url = cmark_chunk_dup(&subj.input, subj.pos, matchlen); + if ((matchlen = manual_scan_link_url(&subj.input, subj.pos, &url)) > -1 && + url.len > 0) { subj.pos += matchlen; } else { return 0; diff --git a/src/inlines.h b/src/inlines.h index 52be768..39d3363 100644 --- a/src/inlines.h +++ b/src/inlines.h @@ -11,7 +11,7 @@ cmark_chunk cmark_clean_title(cmark_mem *mem, cmark_chunk *title); void cmark_parse_inlines(cmark_mem *mem, cmark_node *parent, cmark_reference_map *refmap, int options); -bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_strbuf *input, +bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_chunk *input, cmark_reference_map *refmap); #ifdef __cplusplus diff --git a/src/iterator.c b/src/iterator.c index 24423a2..f5cd802 100644 --- a/src/iterator.c +++ b/src/iterator.c @@ -106,6 +106,7 @@ void cmark_consolidate_text_nodes(cmark_node *root) { while (tmp && tmp->type == CMARK_NODE_TEXT) { cmark_iter_next(iter); // advance pointer cmark_strbuf_put(&buf, tmp->as.literal.data, tmp->as.literal.len); + cur->end_column = tmp->end_column; next = tmp->next; cmark_node_free(tmp); tmp = next; diff --git a/src/latex.c b/src/latex.c index f372a13..0d9517d 100644 --- a/src/latex.c +++ b/src/latex.c @@ -252,24 +252,24 @@ static int 
S_render_node(cmark_renderer *renderer, cmark_node *node, CR(); list_number = cmark_node_get_list_start(node); if (list_number > 1) { - enumlevel = S_get_enumlevel(node); - // latex normally supports only five levels - if (enumlevel >= 1 && enumlevel <= 5) { + enumlevel = S_get_enumlevel(node); + // latex normally supports only five levels + if (enumlevel >= 1 && enumlevel <= 5) { snprintf(list_number_string, LIST_NUMBER_STRING_SIZE, "%d", list_number); LIT("\\setcounter{enum"); - switch(enumlevel) { - case 1: LIT("i"); break; - case 2: LIT("ii"); break; - case 3: LIT("iii"); break; - case 4: LIT("iv"); break; - case 5: LIT("v"); break; - default: LIT("i"); break; + switch (enumlevel) { + case 1: LIT("i"); break; + case 2: LIT("ii"); break; + case 3: LIT("iii"); break; + case 4: LIT("iv"); break; + case 5: LIT("v"); break; + default: LIT("i"); break; } LIT("}{"); OUT(list_number_string, false, NORMAL); LIT("}"); - } + } CR(); } } else { diff --git a/src/libcmark.pc.in b/src/libcmark.pc.in index 024ae48..0f87c30 100644 --- a/src/libcmark.pc.in +++ b/src/libcmark.pc.in @@ -1,6 +1,6 @@ prefix=@CMAKE_INSTALL_PREFIX@ exec_prefix=@CMAKE_INSTALL_PREFIX@ -libdir=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@ +libdir=@CMAKE_INSTALL_PREFIX@/@libdir@ includedir=@CMAKE_INSTALL_PREFIX@/include Name: libcmark @@ -32,6 +32,7 @@ void print_usage() { printf(" --nobreaks Render soft line breaks as spaces\n"); printf(" --safe Suppress raw HTML and dangerous URLs\n"); printf(" --smart Use smart punctuation\n"); + printf(" --validate-utf8 Replace UTF-8 invalid sequences with U+FFFD\n"); printf(" --help, -h Print usage information\n"); printf(" --version Print version\n"); } @@ -66,6 +66,7 @@ struct cmark_node { int start_column; int end_line; int end_column; + int internal_offset; uint16_t type; uint16_t flags; diff --git a/src/scanners.c b/src/scanners.c index c96490d..b312f66 100644 --- a/src/scanners.c +++ b/src/scanners.c @@ -752,7 +752,7 @@ bufsize_t _scan_autolink_uri(const 
unsigned char *p) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 0, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 0, 128, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, @@ -839,7 +839,7 @@ bufsize_t _scan_autolink_uri(const unsigned char *p) { } if (yych <= 0xEC) { if (yych <= 0xC1) { - if (yych <= ' ') + if (yych <= '<') goto yy45; if (yych <= '>') goto yy85; @@ -7887,35 +7887,45 @@ bufsize_t _scan_html_tag(const unsigned char *p) { unsigned char yych; static const unsigned char yybm[] = { /* table 1 .. 8: 0 */ - 0, 239, 239, 239, 239, 239, 239, 239, 239, 238, 238, 238, 238, 238, 239, + 0, 239, 239, 239, 239, 239, 239, 239, 239, 238, 238, 238, 238, 238, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, - 239, 239, 239, 238, 239, 234, 239, 239, 239, 239, 236, 239, 239, 239, - 239, 239, 207, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, - 239, 239, 239, 238, 238, 174, 231, 239, 255, 255, 255, 255, 255, 255, + 239, 239, 239, 239, 238, 239, 234, 239, 239, 239, 239, 236, 239, 239, + 239, 239, 239, 207, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, + 239, 239, 239, 239, 238, 238, 174, 231, 239, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 239, 239, 111, 239, 239, 238, 239, 239, + 255, 255, 255, 255, 255, 255, 255, 239, 239, 111, 239, 239, 238, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, - 239, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 239, 239, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, /* table 9 .. 11: 256 */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 64, 64, 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 160, 128, 0, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 128, 0, - 0, 0, 0, 0, 0, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 64, 64, 64, 64, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 160, 128, 0, 160, 160, 160, 160, 160, 160, 160, 160, + 160, 160, 128, 0, 0, 0, 0, 0, 0, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, - 160, 0, 0, 0, 0, 128, 0, 160, 160, 160, 160, 160, 160, 160, 160, 160, + 160, 160, 160, 160, 160, 160, 160, 0, 0, 0, 0, 128, 0, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, - 160, 160, 160, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 160, 160, 
160, 160, 160, 160, 160, 160, 160, 160, 160, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, }; yych = *p; if (yych <= '>') { diff --git a/src/scanners.re b/src/scanners.re index a0650f2..b20a954 100644 --- a/src/scanners.re +++ b/src/scanners.re @@ -91,7 +91,7 @@ bufsize_t _scan_autolink_uri(const unsigned char *p) const unsigned char *marker = NULL; const unsigned char *start = p; /*!re2c - scheme [:][^\x00-\x20>]*[>] { return (bufsize_t)(p - start); } + scheme [:][^\x00-\x20<>]*[>] { return (bufsize_t)(p - start); } * { return 0; } */ } diff --git a/test/regression.txt b/test/regression.txt index 18b7d79..a6669d0 100644 --- a/test/regression.txt +++ b/test/regression.txt @@ -81,7 +81,7 @@ Issue #193 - unescaped left angle brackets in link destination [a]: <te<st> . -<p><a href="te%3Cst">a</a></p> +<p><a href="%3Cte%3Cst%3E">a</a></p> ```````````````````````````````` Issue #192 - escaped spaces in link destination diff --git a/test/smart_punct.txt b/test/smart_punct.txt index 3522c94..fd55e62 100644 --- a/test/smart_punct.txt +++ b/test/smart_punct.txt @@ -78,6 +78,15 @@ left double quote, to facilitate this style: <p>“Second paragraph by same speaker, in fiction.”</p> ```````````````````````````````` +A quote following a `]` or `)` character cannot +be an open quote: + +```````````````````````````````` example +[a]'s b' +. 
+<p>[a]’s b’</p> +```````````````````````````````` + Quotes that are escaped come out as literal straight quotes: diff --git a/test/spec.txt b/test/spec.txt index 64a60b1..9fd5841 100644 --- a/test/spec.txt +++ b/test/spec.txt @@ -1,8 +1,8 @@ --- title: CommonMark Spec author: John MacFarlane -version: 0.27 -date: '2016-11-18' +version: 0.28 +date: '2017-08-01' license: '[CC-BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/)' ... @@ -1645,6 +1645,15 @@ With tildes: </code></pre> ```````````````````````````````` +Fewer than three backticks is not enough: + +```````````````````````````````` example +`` +foo +`` +. +<p><code>foo</code></p> +```````````````````````````````` The closing code fence must use the same character as the opening fence: @@ -2033,6 +2042,37 @@ or [closing tag] (with any [tag name] other than `script`, or the end of the line.\ **End condition:** line is followed by a [blank line]. +HTML blocks continue until they are closed by their appropriate +[end condition], or the last line of the document or other [container block]. +This means any HTML **within an HTML block** that might otherwise be recognised +as a start condition will be ignored by the parser and passed through as-is, +without changing the parser's state. + +For instance, `<pre>` within an HTML block started by `<table>` will not affect +the parser state; as the HTML block was started by start condition 6, it +will end at any blank line. This can be surprising: + +```````````````````````````````` example +<table><tr><td> +<pre> +**Hello**, + +_world_. +</pre> +</td></tr></table> +. +<table><tr><td> +<pre> +**Hello**, +<p><em>world</em>. +</pre></p> +</td></tr></table> +```````````````````````````````` + +In this case, the HTML block is terminated by the blank line — the `**Hello**` +text remains verbatim — and regular parsing resumes, with a paragraph, +emphasised `world` and inline and block HTML following. + All types of [HTML blocks] except type 7 may interrupt a paragraph.
Blocks of type 7 may not interrupt a paragraph. (This restriction is intended to prevent unwanted interpretation @@ -3639,11 +3679,15 @@ The following rules define [list items]: If the list item is ordered, then it is also assigned a start number, based on the ordered list marker. - Exceptions: When the first list item in a [list] interrupts - a paragraph---that is, when it starts on a line that would - otherwise count as [paragraph continuation text]---then (a) - the lines *Ls* must not begin with a blank line, and (b) if - the list item is ordered, the start number must be 1. + Exceptions: + + 1. When the first list item in a [list] interrupts + a paragraph---that is, when it starts on a line that would + otherwise count as [paragraph continuation text]---then (a) + the lines *Ls* must not begin with a blank line, and (b) if + the list item is ordered, the start number must be 1. + 2. If any line is a [thematic break][thematic breaks] then + that line is not a list item. For example, let *Ls* be the lines @@ -5856,8 +5900,9 @@ for efficient parsing strategies that do not backtrack. First, some definitions. A [delimiter run](@) is either a sequence of one or more `*` characters that is not preceded or -followed by a `*` character, or a sequence of one or more `_` -characters that is not preceded or followed by a `_` character. +followed by a non-backslash-escaped `*` character, or a sequence +of one or more `_` characters that is not preceded or followed by +a non-backslash-escaped `_` character. A [left-flanking delimiter run](@) is a [delimiter run] that is (a) not followed by [Unicode whitespace], @@ -7159,7 +7204,9 @@ A [link destination](@) consists of either - a nonempty sequence of characters that does not include ASCII space or control characters, and includes parentheses only if (a) they are backslash-escaped or (b) they are part of - a balanced pair of unescaped parentheses. + a balanced pair of unescaped parentheses. 
(Implementations + may impose limits on parentheses nesting to avoid performance + issues, but at least three levels of nesting should be supported.) A [link title](@) consists of either @@ -7265,7 +7312,7 @@ Parentheses inside the link destination may be escaped: <p><a href="(foo)">link</a></p> ```````````````````````````````` -Any number parentheses are allowed without escaping, as long as they are +Any number of parentheses are allowed without escaping, as long as they are balanced: ```````````````````````````````` example @@ -7571,13 +7618,16 @@ that [matches] a [link reference definition] elsewhere in the document. A [link label](@) begins with a left bracket (`[`) and ends with the first right bracket (`]`) that is not backslash-escaped. Between these brackets there must be at least one [non-whitespace character]. -Unescaped square bracket characters are not allowed in -[link labels]. A link label can have at most 999 -characters inside the square brackets. +Unescaped square bracket characters are not allowed inside the +opening and closing square brackets of [link labels]. A link +label can have at most 999 characters inside the square +brackets. One label [matches](@) another just in case their normalized forms are equal. To normalize a -label, perform the *Unicode case fold* and collapse consecutive internal +label, strip off the opening and closing brackets, +perform the *Unicode case fold*, strip leading and trailing +[whitespace] and collapse consecutive internal [whitespace] to a single space. If there are multiple matching reference link definitions, the one that comes first in the document is used. (It is desirable in such cases to emit a warning.) |