diff options
-rw-r--r-- | .editorconfig | 1 | ||||
-rw-r--r-- | Makefile | 2 | ||||
-rw-r--r-- | README.md | 33 | ||||
-rw-r--r-- | js/.npmignore | 3 | ||||
-rw-r--r-- | js/README.md | 24 | ||||
-rw-r--r-- | js/lib/html-renderer.js | 2 | ||||
-rw-r--r-- | js/lib/inlines.js | 40 | ||||
-rw-r--r-- | js/package.json | 27 | ||||
-rwxr-xr-x | make_site_index.sh | 12 | ||||
-rw-r--r-- | runtests.pl | 2 | ||||
-rw-r--r-- | spec.txt | 712 | ||||
-rw-r--r-- | src/blocks.c | 124 | ||||
-rw-r--r-- | src/buffer.c | 25 | ||||
-rw-r--r-- | src/buffer.h | 1 | ||||
-rw-r--r-- | src/cmark.h | 1 | ||||
-rw-r--r-- | src/html/html.c | 466 | ||||
-rw-r--r-- | src/inlines.c | 272 | ||||
-rw-r--r-- | src/references.c | 42 |
18 files changed, 1077 insertions, 712 deletions
diff --git a/.editorconfig b/.editorconfig index f0f2032..946396e 100644 --- a/.editorconfig +++ b/.editorconfig @@ -15,3 +15,4 @@ indent_size = 2 [{*.c,Makefile}] trim_trailing_whitespace = true indent_style = tab +indent_size = 8 @@ -110,7 +110,7 @@ operf: $(PROG) fuzztest: for i in `seq 1 10`; do \ - time cat /dev/urandom | head -c 100000 | iconv -f latin1 -t utf-8 | $(PROG) >/dev/null; done + time cat /dev/urandom | head -c 500000 | iconv -f latin1 -t utf-8 | tee fuzz-$$i.txt | $(PROG) > /dev/null && rm fuzz-$$i.txt ; done $(SITE)/index.html: spec.txt ./make_site_index.sh $(SPECVERSION) | \ @@ -17,14 +17,18 @@ developers, since `scanners.c` can be provided in a released source tarball.) The parser is very fast, on par with -[sundown](https://github.com/vmg/sundown). Some benchmarks: - -|Implementation | Time to parse a 500K book | -|---------------|---------------------------| -| Markdown.pl | 3.99s | -| discount | 0.089s | -| sundown | 0.015s | -| cmark | 0.019s | +[sundown](https://github.com/vmg/sundown). Some benchmarks (on +a 1.6 GHz Intel Core i5, measured using `time`, and parsing a +~500K book, the English version of [*Pro +Git*](https://github.com/progit/progit/tree/master/en) by +Scott Chacon and Ben Straub): + +|Implementation | Time | +|---------------|-------| +| Markdown.pl | 3.990s| +| discount | 0.089s| +| sundown | 0.015s| +| cmark | 0.019s| Usage: cmark [FILE*] Options: --help, -h Print usage information @@ -32,12 +36,13 @@ The parser is very fast, on par with --version Print version The JavaScript implementation is a single JavaScript file, with -no dependencies, that can be linked to in an HTML page. (To build, -it, do `make js/commonmark.js`---this requires `browserify`, which you -can get using `npm install -g browserify`.) A command-line -version (using `node.js`) is also provided (`js/bin/commonmark`), and -there is a "dingus" for playing with it interactively. (`make dingus` -will start this.) 
+no dependencies, that can be linked to in an HTML page. To build +it, do `make js/commonmark.js` (this requires `browserify`, which you +can get using `npm install -g browserify`). You can also fetch +a pre-built copy from `http://spec.commonmark.org/js/commonmark.js`. +A command-line version (using `node.js`) is also provided +(`js/bin/commonmark`), and there is a "dingus" for playing with it +interactively. (`make dingus` will start this.) [Try it now!](http://jgm.github.io/CommonMark/js/) diff --git a/js/.npmignore b/js/.npmignore new file mode 100644 index 0000000..d73e230 --- /dev/null +++ b/js/.npmignore @@ -0,0 +1,3 @@ +commonmark.js +*.tgz +index.html diff --git a/js/README.md b/js/README.md new file mode 100644 index 0000000..9f1e043 --- /dev/null +++ b/js/README.md @@ -0,0 +1,24 @@ +CommonMark +========== + +CommonMark is a rationalized version of Markdown syntax, +with a [spec][the spec] and BSD3-licensed reference +implementations in C and JavaScript. + +For more information, see <http://commonmark.org>. + +To play with this library without installing it, see +the live dingus at <http://spec.commonmark.org/dingus.html>. + +This package includes the commonmark library and a +command-line executable, `commonmark`. 
+ +Basic usage example: + + var reader = new commonmark.DocParser(); + var writer = new commonmark.HtmlRenderer(); + var parsed = reader.parse("Hello *world*"); + var result = writer.render(parsed); + + [the spec]: http://spec.commonmark.org + diff --git a/js/lib/html-renderer.js b/js/lib/html-renderer.js index e1a6063..a676e3a 100644 --- a/js/lib/html-renderer.js +++ b/js/lib/html-renderer.js @@ -43,7 +43,7 @@ var renderInline = function(inline) { return inTags('a', attrs, this.renderInlines(inline.label)); case 'Image': attrs = [['src', this.escape(inline.destination, true)], - ['alt', this.escape(this.renderInlines(inline.label))]]; + ['alt', this.renderInlines(inline.label).replace(/\<[^>]*\>/g,'')]]; if (inline.title) { attrs.push(['title', this.escape(inline.title, true)]); } diff --git a/js/lib/inlines.js b/js/lib/inlines.js index 5fde099..4f1f16a 100644 --- a/js/lib/inlines.js +++ b/js/lib/inlines.js @@ -235,8 +235,8 @@ var scanDelims = function(cc) { char_after = fromCodePoint(cc_after); } - var can_open = numdelims > 0 && numdelims <= 3 && !(/\s/.test(char_after)); - var can_close = numdelims > 0 && numdelims <= 3 && !(/\s/.test(char_before)); + var can_open = numdelims > 0 && !(/\s/.test(char_after)); + var can_close = numdelims > 0 && !(/\s/.test(char_before)); if (cc === C_UNDERSCORE) { can_open = can_open && !((/[a-z0-9]/i).test(char_before)); can_close = can_close && !((/[a-z0-9]/i).test(char_after)); @@ -265,6 +265,7 @@ var parseEmphasis = function(cc,inlines) { var res = this.scanDelims(cc); var numdelims = res.numdelims; + var usedelims; if (numdelims === 0) { this.pos = startpos; @@ -279,41 +280,36 @@ var parseEmphasis = function(cc,inlines) { if (opener.cc === cc) { // we have a match! 
- if (opener.numdelims <= numdelims) { // all openers used - - this.pos += opener.numdelims; - var X; - switch (opener.numdelims) { - case 3: - X = function(x) { return Strong([Emph(x)]); }; - break; - case 2: - X = Strong; - break; - case 1: - default: - X = Emph; - break; - } + if (numdelims < 3 || opener.numdelims < 3) { + usedelims = numdelims <= opener.numdelims ? numdelims : opener.numdelims; + } else { // numdelims >= 3 && opener.numdelims >= 3 + usedelims = numdelims % 2 === 0 ? 2 : 1; + } + var X = usedelims === 1 ? Emph : Strong; + + if (opener.numdelims == usedelims) { // all openers used + + this.pos += usedelims; inlines[opener.pos] = X(inlines.slice(opener.pos + 1)); inlines.splice(opener.pos + 1, inlines.length - (opener.pos + 1)); // Remove entries after this, to prevent overlapping nesting: this.emphasis_openers = opener.previous; return true; - } else if (opener.numdelims > numdelims) { // only some openers used + } else if (opener.numdelims > usedelims) { // only some openers used - this.pos += numdelims; - opener.numdelims -= numdelims; + this.pos += usedelims; + opener.numdelims -= usedelims; inlines[opener.pos].c = inlines[opener.pos].c.slice(0, opener.numdelims); - var X = numdelims === 2 ? 
Strong : Emph; inlines[opener.pos + 1] = X(inlines.slice(opener.pos + 1)); inlines.splice(opener.pos + 2, inlines.length - (opener.pos + 2)); // Remove entries after this, to prevent overlapping nesting: this.emphasis_openers = opener; return true; + } else { // usedelims > opener.numdelims, should never happen + throw new Error("Logic error: usedelims > opener.numdelims"); } } diff --git a/js/package.json b/js/package.json new file mode 100644 index 0000000..4ae8bca --- /dev/null +++ b/js/package.json @@ -0,0 +1,27 @@ +{ "name": "commonmark", + "description": "a strongly specified, highly compatible variant of Markdown", + "dist-tags": { "latest": "0.9.0" }, + "versions": "0.9.0", + "homepage": "http://commonmark.org", + "keywords": + [ "markdown", + "commonmark", + "md", + "stmd" ], + "repository": + { "type": "git", + "url": "https://github.com/jgm/CommonMark.git" }, + "author": "John MacFarlane", + "bugs": { "url": "https://github.com/jgm/CommonMark/issues" }, + "license": "BSD-3-Clause", + "version": "0.9.0", + "main": "./lib/index.js", + "bin": { "commonmark": "./bin/commonmark" }, + "scripts": { "test": "node ./test.js" }, + "directories": { + "lib": "./lib" + }, + "engines": { + "node": "*" + } +} diff --git a/make_site_index.sh b/make_site_index.sh index d11dbe0..8a00018 100755 --- a/make_site_index.sh +++ b/make_site_index.sh @@ -4,13 +4,17 @@ SPECVERSION=$1 SITE=_site VERSIONS=`cd $SITE; ls -d -1 0.* | sort -r -g` -echo "% CommonMark Spec\n" +echo "% CommonMark Spec" +echo "" date=`grep '<div class="version">' $SITE/$SPECVERSION/index.html | perl -pe 's/^.*(\d\d\d\d-\d\d-\d\d).*$/\1/'` -echo "[**Latest version ($SPECVERSION)**](/$SPECVERSION/) ($date)\n" +echo "[**Latest version ($SPECVERSION)**](/$SPECVERSION/) ($date)" +echo "" echo "[discussion forum](http://talk.commonmark.org/) | " echo "[interactive dingus](/dingus.html) | " -echo "[repository](https://github.com/jgm/CommonMark/)\n" -echo "Older versions:\n" +echo 
"[repository](https://github.com/jgm/CommonMark/)" +echo "" +echo "Older versions:" +echo "" for vers in $VERSIONS do date=`grep '<div class="version">' $SITE/$vers/index.html | perl -pe 's/^.*(\d\d\d\d-\d\d-\d\d).*$/\1/'` diff --git a/runtests.pl b/runtests.pl index 0bff360..ae1195e 100644 --- a/runtests.pl +++ b/runtests.pl @@ -81,7 +81,7 @@ sub dotest waitpid($pid, 0); $html = &tidy($html); $actual = &tidy($actual); - $actual =~ s/\'/'/; + $actual =~ s/\'/'/g; if ($actual eq $html) { print colored("✓", "green"); @@ -2,8 +2,8 @@ title: CommonMark Spec author: - John MacFarlane -version: 0.7 -date: 2014-10-28 +version: 0.10 +date: 2014-11-06 ... # Introduction @@ -4248,87 +4248,71 @@ The following rules capture all of these patterns, while allowing for efficient parsing strategies that do not backtrack: 1. A single `*` character [can open emphasis](#can-open-emphasis) - <a id="can-open-emphasis"></a> iff - - (a) it is not part of a sequence of four or more unescaped `*`s, - (b) it is not followed by whitespace, and - (c) either it is not followed by a `*` character or it is - followed immediately by emphasis or strong emphasis. + <a id="can-open-emphasis"></a> iff it is not followed by + whitespace. 2. A single `_` character [can open emphasis](#can-open-emphasis) iff - - (a) it is not part of a sequence of four or more unescaped `_`s, - (b) it is not followed by whitespace, - (c) it is not preceded by an ASCII alphanumeric character, and - (d) either it is not followed by a `_` character or it is - followed immediately by emphasis or strong emphasis. + it is not followed by whitespace and it is not preceded by an + ASCII alphanumeric character. 3. A single `*` character [can close emphasis](#can-close-emphasis) - <a id="can-close-emphasis"></a> iff - - (a) it is not part of a sequence of four or more unescaped `*`s, and - (b) it is not preceded by whitespace. + <a id="can-close-emphasis"></a> iff it is not preceded by whitespace. 4. 
A single `_` character [can close emphasis](#can-close-emphasis) iff - - (a) it is not part of a sequence of four or more unescaped `_`s, - (b) it is not preceded by whitespace, and - (c) it is not followed by an ASCII alphanumeric character. + it is not preceded by whitespace and it is not followed by an + ASCII alphanumeric character. 5. A double `**` [can open strong emphasis](#can-open-strong-emphasis) - <a id="can-open-strong-emphasis" ></a> iff - - (a) it is not part of a sequence of four or more unescaped `*`s, - (b) it is not followed by whitespace, and - (c) either it is not followed by a `*` character or it is - followed immediately by emphasis. + <a id="can-open-strong-emphasis" ></a> iff it is not followed by + whitespace. 6. A double `__` [can open strong emphasis](#can-open-strong-emphasis) - iff - - (a) it is not part of a sequence of four or more unescaped `_`s, - (b) it is not followed by whitespace, and - (c) it is not preceded by an ASCII alphanumeric character, and - (d) either it is not followed by a `_` character or it is - followed immediately by emphasis. + iff it is not followed by whitespace and it is not preceded by an + ASCII alphanumeric character. 7. A double `**` [can close strong emphasis](#can-close-strong-emphasis) - <a id="can-close-strong-emphasis" ></a> iff - - (a) it is not part of a sequence of four or more unescaped `*`s, and - (b) it is not preceded by whitespace. + <a id="can-close-strong-emphasis" ></a> iff it is not preceded by + whitespace. 8. A double `__` [can close strong emphasis](#can-close-strong-emphasis) - iff - - (a) it is not part of a sequence of four or more unescaped `_`s, - (b) it is not preceded by whitespace, and - (c) it is not followed by an ASCII alphanumeric character. + iff it is not preceded by whitespace and it is not followed by an + ASCII alphanumeric character. 9. 
Emphasis begins with a delimiter that [can open emphasis](#can-open-emphasis) and ends with a delimiter that [can close emphasis](#can-close-emphasis), and that uses the same - character (`_` or `*`) as the opening delimiter. The inlines - between the open delimiter and the closing delimiter are the - contents of the emphasis inline. + character (`_` or `*`) as the opening delimiter. There must + be a nonempty sequence of inlines between the open delimiter + and the closing delimiter; these form the contents of the emphasis + inline. 10. Strong emphasis begins with a delimiter that [can open strong emphasis](#can-open-strong-emphasis) and ends with a delimiter that - [can close strong emphasis](#can-close-strong-emphasis), and that uses the - same character (`_` or `*`) as the opening delimiter. The inlines - between the open delimiter and the closing delimiter are the - contents of the strong emphasis inline. + [can close strong emphasis](#can-close-strong-emphasis), and that + uses the same character (`_` or `*`) as the opening delimiter. + There must be a nonempty sequence of inlines between the open + delimiter and the closing delimiter; these form the contents of + the strong emphasis inline. + +11. A literal `*` character cannot occur at the beginning or end of + `*`-delimited emphasis or `**`-delimited strong emphasis, unless it + is backslash-escaped. + +12. A literal `_` character cannot occur at the beginning or end of + `_`-delimited emphasis or `__`-delimited strong emphasis, unless it + is backslash-escaped. -Where rules 1--10 above are compatible with multiple parsings, +Where rules 1--12 above are compatible with multiple parsings, the following principles resolve ambiguity: -11. An interpretation `<strong>...</strong>` is always preferred to +13. The number of nestings should be minimized. Thus, for example, + an interpretation `<strong>...</strong>` is always preferred to `<em><em>...</em></em>`. -12. 
An interpretation `<strong><em>...</em></strong>` is always +14. An interpretation `<strong><em>...</em></strong>` is always preferred to `<em><strong>..</strong></em>`. -13. When two potential emphasis or strong emphasis spans overlap, +15. When two potential emphasis or strong emphasis spans overlap, so that the second begins before the first ends and ends after the first ends, the first is preferred. Thus, for example, `*foo _bar* baz_` is parsed as `<em>foo _bar</em> baz_` rather @@ -4336,13 +4320,13 @@ the following principles resolve ambiguity: `**foo*bar**` is parsed as `<em><em>foo</em>bar</em>*` rather than `<strong>foo*bar</strong>`. -14. When there are two potential emphasis or strong emphasis spans +16. When there are two potential emphasis or strong emphasis spans with the same closing delimiter, the shorter one (the one that opens later) is preferred. Thus, for example, `**foo **bar baz**` is parsed as `**foo <strong>bar baz</strong>` rather than `<strong>foo **bar baz</strong>`. -15. Inline code spans, links, images, and HTML tags group more tightly +17. Inline code spans, links, images, and HTML tags group more tightly than emphasis. So, when there is a choice between an interpretation that contains one of these elements and one that does not, the former always wins. Thus, for example, `*[foo*](bar)` is @@ -4351,7 +4335,7 @@ the following principles resolve ambiguity: These rules can be illustrated through a series of examples. -Simple emphasis: +Rule 1: . *foo bar* @@ -4359,347 +4343,420 @@ Simple emphasis: <p><em>foo bar</em></p> . +This is not emphasis, because the opening `*` is followed by +whitespace: + . -_foo bar_ +a * foo bar* . -<p><em>foo bar</em></p> +<p>a * foo bar*</p> . -Simple strong emphasis: +Intraword emphasis with `*` is permitted: . -**foo bar** +foo*bar* . -<p><strong>foo bar</strong></p> +<p>foo<em>bar</em></p> . . -__foo bar__ +5*6*78 . -<p><strong>foo bar</strong></p> +<p>5<em>6</em>78</p> . 
-Emphasis can continue over line breaks: +Rule 2: . -*foo -bar* +_foo bar_ . -<p><em>foo -bar</em></p> +<p><em>foo bar</em></p> . +This is not emphasis, because the opening `_` is followed by +whitespace: + . -_foo -bar_ +_ foo bar_ . -<p><em>foo -bar</em></p> +<p>_ foo bar_</p> . +Emphasis with `_` is not allowed inside ASCII words: + . -**foo -bar** +foo_bar_ . -<p><strong>foo -bar</strong></p> +<p>foo_bar_</p> . . -__foo -bar__ +5_6_78 . -<p><strong>foo -bar</strong></p> +<p>5_6_78</p> . -Emphasis can contain other inline constructs: +But it is permitted inside non-ASCII words: . -*foo [bar](/url)* +пристаням_стремятся_ . -<p><em>foo <a href="/url">bar</a></em></p> +<p>пристаням<em>стремятся</em></p> . +Rule 3: + +This is not emphasis, because the closing `*` is preceded by +whitespace: + . -_foo [bar](/url)_ +*foo bar * . -<p><em>foo <a href="/url">bar</a></em></p> +<p>*foo bar *</p> . +Intraword emphasis with `*` is allowed: + . -**foo [bar](/url)** +*foo*bar . -<p><strong>foo <a href="/url">bar</a></strong></p> +<p><em>foo</em>bar</p> . + +Rule 4: + +This is not emphasis, because the closing `_` is preceded by +whitespace: + . -__foo [bar](/url)__ +_foo bar _ . -<p><strong>foo <a href="/url">bar</a></strong></p> +<p>_foo bar _</p> . -Symbols contained in other inline constructs will not -close emphasis: +Intraword emphasis: . -*foo [bar*](/url) +_foo_bar . -<p>*foo <a href="/url">bar*</a></p> +<p>_foo_bar</p> . . -_foo [bar_](/url) +_пристаням_стремятся . -<p>_foo <a href="/url">bar_</a></p> +<p><em>пристаням</em>стремятся</p> . . -**<a href="**"> +_foo_bar_baz_ . -<p>**<a href="**"></p> +<p><em>foo_bar_baz</em></p> . +Rule 5: + . -__<a href="__"> +**foo bar** . -<p>__<a href="__"></p> +<p><strong>foo bar</strong></p> . +This is not strong emphasis, because the opening delimiter is +followed by whitespace: + . -*a `*`* +** foo bar** . -<p><em>a <code>*</code></em></p> +<p>** foo bar**</p> . +Intraword strong emphasis with `**` is permitted: + . 
-_a `_`_ +foo**bar** . -<p><em>a <code>_</code></em></p> +<p>foo<strong>bar</strong></p> . +Rule 6: + . -**a<http://foo.bar?q=**> +__foo bar__ . -<p>**a<a href="http://foo.bar?q=**">http://foo.bar?q=**</a></p> +<p><strong>foo bar</strong></p> . +This is not strong emphasis, because the opening delimiter is +followed by whitespace: + . -__a<http://foo.bar?q=__> +__ foo bar__ . -<p>__a<a href="http://foo.bar?q=__">http://foo.bar?q=__</a></p> +<p>__ foo bar__</p> . -This is not emphasis, because the opening delimiter is -followed by white space: +Intraword emphasis examples: . -and * foo bar* +foo__bar__ . -<p>and * foo bar*</p> +<p>foo__bar__</p> . . -_ foo bar_ +5__6__78 . -<p>_ foo bar_</p> +<p>5__6__78</p> . . -and ** foo bar** +пристаням__стремятся__ . -<p>and ** foo bar**</p> +<p>пристаням<strong>стремятся</strong></p> . . -__ foo bar__ +__foo, __bar__, baz__ . -<p>__ foo bar__</p> +<p><strong>foo, <strong>bar</strong>, baz</strong></p> . -This is not emphasis, because the closing delimiter is -preceded by white space: +Rule 7: + +This is not strong emphasis, because the closing delimiter is preceded +by whitespace: . -and *foo bar * +**foo bar ** . -<p>and *foo bar *</p> +<p>**foo bar **</p> . +(Nor can it be interpreted as an emphasized `*foo bar *`, because of +Rule 11.) + +Intraword emphasis: + . -and _foo bar _ +**foo**bar . -<p>and _foo bar _</p> +<p><strong>foo</strong>bar</p> . +Rule 8: + +This is not strong emphasis, because the closing delimiter is +preceded by whitespace: + . -and **foo bar ** +__foo bar __ . -<p>and **foo bar **</p> +<p>__foo bar __</p> . +Intraword strong emphasis examples: + . -and __foo bar __ +__foo__bar . -<p>and __foo bar __</p> +<p>__foo__bar</p> . -The rules imply that a sequence of four or more unescaped `*` or -`_` characters will always be parsed as a literal string: +. +__пристаням__стремятся +. +<p><strong>пристаням</strong>стремятся</p> +. . -****hi**** +__foo__bar__baz__ . 
-<p>****hi****</p> +<p><strong>foo__bar__baz</strong></p> . +Rule 9: + +Any nonempty sequence of inline elements can be the contents of an +emphasized span. + . -_____hi_____ +*foo [bar](/url)* . -<p>_____hi_____</p> +<p><em>foo <a href="/url">bar</a></em></p> . . -Sign here: _________ +*foo +bar* . -<p>Sign here: _________</p> +<p><em>foo +bar</em></p> . -The rules also imply that there can be no empty emphasis or strong -emphasis: +In particular, emphasis and strong emphasis can be nested +inside emphasis: . -** is not an empty emphasis +_foo __bar__ baz_ . -<p>** is not an empty emphasis</p> +<p><em>foo <strong>bar</strong> baz</em></p> . . -**** is not an empty strong emphasis +_foo _bar_ baz_ . -<p>**** is not an empty strong emphasis</p> +<p><em>foo <em>bar</em> baz</em></p> . -To include `*` or `_` in emphasized sections, use backslash escapes -or code spans: +. +__foo_ bar_ +. +<p><em><em>foo</em> bar</em></p> +. . -*here is a \** +*foo *bar** . -<p><em>here is a *</em></p> +<p><em>foo <em>bar</em></em></p> . . -__this is a double underscore (`__`)__ +*foo **bar** baz* . -<p><strong>this is a double underscore (<code>__</code>)</strong></p> +<p><em>foo <strong>bar</strong> baz</em></p> . -Or use the other emphasis character: +But note: . -*_* +*foo**bar**baz* . -<p><em>_</em></p> +<p><em>foo</em><em>bar</em><em>baz</em></p> . +The difference is that in the preceding case, +the internal delimiters [can close emphasis](#can-close-emphasis), +while in the cases with spaces, they cannot. + . -_*_ +***foo** bar* . -<p><em>*</em></p> +<p><em><strong>foo</strong> bar</em></p> . . -*__* +*foo **bar*** . -<p><em>__</em></p> +<p><em>foo <strong>bar</strong></em></p> . +Note, however, that in the following case we get no strong +emphasis, because the opening delimiter is closed by the first +`*` before `bar`: + . -_**_ +*foo**bar*** . -<p><em>**</em></p> +<p><em>foo</em><em>bar</em>**</p> . 
-`*` delimiters allow intra-word emphasis; `_` delimiters do not: + +Indefinite levels of nesting are possible: . -foo*bar*baz +*foo **bar *baz* bim** bop* . -<p>foo<em>bar</em>baz</p> +<p><em>foo <strong>bar <em>baz</em> bim</strong> bop</em></p> . . -foo_bar_baz +*foo [*bar*](/url)* . -<p>foo_bar_baz</p> +<p><em>foo <a href="/url"><em>bar</em></a></em></p> . +There can be no empty emphasis or strong emphasis: + . -foo__bar__baz +** is not an empty emphasis . -<p>foo__bar__baz</p> +<p>** is not an empty emphasis</p> . . -_foo_bar_baz_ +**** is not an empty strong emphasis . -<p><em>foo_bar_baz</em></p> +<p>**** is not an empty strong emphasis</p> . + +Rule 10: + +Any nonempty sequence of inline elements can be the contents of a +strongly emphasized span. + . -11*15*32 +**foo [bar](/url)** . -<p>11<em>15</em>32</p> +<p><strong>foo <a href="/url">bar</a></strong></p> . . -11_15_32 +**foo +bar** . -<p>11_15_32</p> +<p><strong>foo +bar</strong></p> . -Internal underscores will be ignored in underscore-delimited -emphasis: +In particular, emphasis and strong emphasis can be nested +inside strong emphasis: . -_foo_bar_baz_ +__foo _bar_ baz__ . -<p><em>foo_bar_baz</em></p> +<p><strong>foo <em>bar</em> baz</strong></p> . . -__foo__bar__baz__ +__foo __bar__ baz__ . -<p><strong>foo__bar__baz</strong></p> +<p><strong>foo <strong>bar</strong> baz</strong></p> . -The rules are sufficient for the following nesting patterns: - . -***foo bar*** +____foo__ bar__ . -<p><strong><em>foo bar</em></strong></p> +<p><strong><strong>foo</strong> bar</strong></p> . . -___foo bar___ +**foo **bar**** . -<p><strong><em>foo bar</em></strong></p> +<p><strong>foo <strong>bar</strong></strong></p> . . -***foo** bar* +**foo *bar* baz** . -<p><em><strong>foo</strong> bar</em></p> +<p><strong>foo <em>bar</em> baz</strong></p> . +But note: + . -___foo__ bar_ +**foo*bar*baz** . -<p><em><strong>foo</strong> bar</em></p> +<p><em><em>foo</em>bar</em>baz**</p> . 
+The difference is that in the preceding case, +the internal delimiters [can close emphasis](#can-close-emphasis), +while in the cases with spaces, they cannot. + . ***foo* bar** . @@ -4707,259 +4764,266 @@ ___foo__ bar_ . . -___foo_ bar__ +**foo *bar*** . -<p><strong><em>foo</em> bar</strong></p> +<p><strong>foo <em>bar</em></strong></p> . +Indefinite levels of nesting are possible: + . -*foo **bar*** +**foo *bar **baz** +bim* bop** . -<p><em>foo <strong>bar</strong></em></p> +<p><strong>foo <em>bar <strong>baz</strong> +bim</em> bop</strong></p> . . -_foo __bar___ +**foo [*bar*](/url)** . -<p><em>foo <strong>bar</strong></em></p> +<p><strong>foo <a href="/url"><em>bar</em></a></strong></p> . +There can be no empty emphasis or strong emphasis: + . -**foo *bar*** +__ is not an empty emphasis . -<p><strong>foo <em>bar</em></strong></p> +<p>__ is not an empty emphasis</p> . . -__foo _bar___ +____ is not an empty strong emphasis . -<p><strong>foo <em>bar</em></strong></p> +<p>____ is not an empty strong emphasis</p> . + +Rule 11: + . -*foo **bar*** +foo *** . -<p><em>foo <strong>bar</strong></em></p> +<p>foo ***</p> . . -_foo __bar___ +foo *\** . -<p><em>foo <strong>bar</strong></em></p> +<p>foo <em>*</em></p> . . -*foo *bar* baz* +foo *_* . -<p><em>foo <em>bar</em> baz</em></p> +<p>foo <em>_</em></p> . . -_foo _bar_ baz_ +foo ***** . -<p><em>foo <em>bar</em> baz</em></p> +<p>foo *****</p> . . -**foo **bar** baz** +foo **\*** . -<p><strong>foo <strong>bar</strong> baz</strong></p> +<p>foo <strong>*</strong></p> . . -__foo __bar__ baz__ +foo **_** . -<p><strong>foo <strong>bar</strong> baz</strong></p> +<p>foo <strong>_</strong></p> . +Note that when delimiters do not match evenly, Rule 11 determines +that the excess literal `*` characters will appear outside of the +emphasis, rather than inside it: + . -*foo **bar** baz* +**foo* . -<p><em>foo <strong>bar</strong> baz</em></p> +<p>*<em>foo</em></p> . . -_foo __bar__ baz_ +*foo** . 
-<p><em>foo <strong>bar</strong> baz</em></p> +<p><em>foo</em>*</p> . . -**foo *bar* baz** +***foo** . -<p><strong>foo <em>bar</em> baz</strong></p> +<p>*<strong>foo</strong></p> . . -__foo _bar_ baz__ +****foo* . -<p><strong>foo <em>bar</em> baz</strong></p> +<p>***<em>foo</em></p> . . -**foo, *bar*, baz** +**foo*** . -<p><strong>foo, <em>bar</em>, baz</strong></p> +<p><strong>foo</strong>*</p> . . -__foo, _bar_, baz__ +*foo**** . -<p><strong>foo, <em>bar</em>, baz</strong></p> +<p><em>foo</em>***</p> . -But note: + +Rule 12: . -*foo**bar**baz* +foo ___ . -<p><em>foo</em><em>bar</em><em>baz</em></p> +<p>foo ___</p> . . -**foo*bar*baz** +foo _\__ . -<p><em><em>foo</em>bar</em>baz**</p> +<p>foo <em>_</em></p> . -The difference is that in the two preceding cases, -the internal delimiters [can close emphasis](#can-close-emphasis), -while in the cases with spaces, they cannot. - -Note that you cannot nest emphasis directly inside emphasis -using the same delimeter, or strong emphasis directly inside -strong emphasis: - . -**foo** +foo _*_ . -<p><strong>foo</strong></p> +<p>foo <em>*</em></p> . . -****foo**** +foo _____ . -<p>****foo****</p> +<p>foo _____</p> . -For these nestings, you need to switch delimiters: +. +foo __\___ +. +<p>foo <strong>_</strong></p> +. . -*_foo_* +foo __*__ . -<p><em><em>foo</em></em></p> +<p>foo <strong>*</strong></p> . . -**__foo__** +__foo_ . -<p><strong><strong>foo</strong></strong></p> +<p>_<em>foo</em></p> . -Note that a `*` followed by a `*` can close emphasis, and -a `**` followed by a `*` can close strong emphasis (and -similarly for `_` and `__`): +Note that when delimiters do not match evenly, Rule 12 determines +that the excess literal `_` characters will appear outside of the +emphasis, rather than inside it: . -*foo** +_foo__ . -<p><em>foo</em>*</p> +<p><em>foo</em>_</p> . . -*foo *bar** +___foo__ . -<p><em>foo <em>bar</em></em></p> +<p>_<strong>foo</strong></p> . . -**foo*** +____foo_ . 
-<p><strong>foo</strong>*</p> +<p>___<em>foo</em></p> . . -***foo* bar*** +__foo___ . -<p><strong><em>foo</em> bar</strong>*</p> +<p><strong>foo</strong>_</p> . . -***foo** bar*** +_foo____ . -<p><em><strong>foo</strong> bar</em>**</p> +<p><em>foo</em>___</p> . -The following contains no strong emphasis, because the opening -delimiter is closed by the first `*` before `bar`: +Rule 13 implies that if you want emphasis nested directly inside +emphasis, you must use different delimiters: . -*foo**bar*** +**foo** . -<p><em>foo</em><em>bar</em>**</p> +<p><strong>foo</strong></p> . -However, a string of four or more `****` can never close emphasis: - . -*foo**** +*_foo_* . -<p>*foo****</p> +<p><em><em>foo</em></em></p> . -We retain symmetry in these cases: - . -*foo** - -**foo* +__foo__ . -<p><em>foo</em>*</p> -<p>*<em>foo</em></p> +<p><strong>foo</strong></p> . . -*foo *bar** - -**foo* bar* +_*foo*_ . -<p><em>foo <em>bar</em></em></p> -<p><em><em>foo</em> bar</em></p> +<p><em><em>foo</em></em></p> . -More cases with mismatched delimiters: +However, strong emphasis within strong emphasis is possible without +switching delimiters: . -*bar*** +****foo**** . -<p><em>bar</em>**</p> +<p><strong><strong>foo</strong></strong></p> . . -***foo* +____foo____ . -<p>**<em>foo</em></p> +<p><strong><strong>foo</strong></strong></p> . + +Rule 13 can be applied to arbitrarily long sequences of +delimiters: + . -**bar*** +******foo****** . -<p><strong>bar</strong>*</p> +<p><strong><strong><strong>foo</strong></strong></strong></p> . +Rule 14: + . -***foo** +***foo*** . -<p>*<strong>foo</strong></p> +<p><strong><em>foo</em></strong></p> . . -***foo *bar* +_____foo_____ . -<p>***foo <em>bar</em></p> +<p><strong><strong><em>foo</em></strong></strong></p> . -The following cases illustrate rule 13: +Rule 15: . *foo _bar* baz_ @@ -4968,12 +5032,13 @@ The following cases illustrate rule 13: . . -**foo bar* baz** +**foo*bar** . 
-<p><em><em>foo bar</em> baz</em>*</p> +<p><em><em>foo</em>bar</em>*</p> . -The following cases illustrate rule 14: + +Rule 16: . **foo **bar baz** @@ -4987,18 +5052,18 @@ The following cases illustrate rule 14: <p>*foo <em>bar baz</em></p> . -The following cases illustrate rule 15: +Rule 17: . -*[foo*](bar) +*[bar*](/url) . -<p>*<a href="bar">foo*</a></p> +<p>*<a href="/url">bar*</a></p> . . -*![foo*](bar) +_foo [bar_](/url) . -<p>*<img src="bar" alt="foo*" /></p> +<p>_foo <a href="/url">bar_</a></p> . . @@ -5008,11 +5073,42 @@ The following cases illustrate rule 15: . . -*a`a*` +**<a href="**"> +. +<p>**<a href="**"></p> +. + +. +__<a href="__"> +. +<p>__<a href="__"></p> +. + +. +*a `*`* +. +<p><em>a <code>*</code></em></p> +. + +. +_a `_`_ +. +<p><em>a <code>_</code></em></p> +. + +. +**a<http://foo.bar?q=**> +. +<p>**a<a href="http://foo.bar?q=**">http://foo.bar?q=**</a></p> +. + +. +__a<http://foo.bar?q=__> . -<p>*a<code>a*</code></p> +<p>__a<a href="http://foo.bar?q=__">http://foo.bar?q=__</a></p> . + ## Links A link contains a [link label](#link-label) (the visible text), @@ -5590,9 +5686,9 @@ is followed by a link label (even though `[bar]` is not defined): ## Images An (unescaped) exclamation mark (`!`) followed by a reference or -inline link will be parsed as an image. The link label will be -used as the image's alt text, and the link title, if any, will -be used as the image's title. +inline link will be parsed as an image. The plain string content +of the link label will be used as the image's alt text, and the link +title, if any, will be used as the image's title. . ![foo](/url "title") @@ -5605,15 +5701,19 @@ be used as the image's title. [foo *bar*]: train.jpg "train & tracks" . -<p><img src="train.jpg" alt="foo <em>bar</em>" title="train & tracks" /></p> +<p><img src="train.jpg" alt="foo bar" title="train & tracks" /></p> . +Note that in the above example, the alt text is `foo bar`, not `foo +*bar*` or `foo <em>bar</em>` or `foo <em>bar</em>`. 
Only +the plain string content is rendered, without formatting. + . ![foo *bar*][] [foo *bar*]: train.jpg "train & tracks" . -<p><img src="train.jpg" alt="foo <em>bar</em>" title="train & tracks" /></p> +<p><img src="train.jpg" alt="foo bar" title="train & tracks" /></p> . . @@ -5621,7 +5721,7 @@ be used as the image's title. [FOOBAR]: train.jpg "train & tracks" . -<p><img src="train.jpg" alt="foo <em>bar</em>" title="train & tracks" /></p> +<p><img src="train.jpg" alt="foo bar" title="train & tracks" /></p> . . @@ -5681,7 +5781,7 @@ Collapsed: [*foo* bar]: /url "title" . -<p><img src="/url" alt="<em>foo</em> bar" title="title" /></p> +<p><img src="/url" alt="foo bar" title="title" /></p> . The labels are case-insensitive: @@ -5721,7 +5821,7 @@ Shortcut: [*foo* bar]: /url "title" . -<p><img src="/url" alt="<em>foo</em> bar" title="title" /></p> +<p><img src="/url" alt="foo bar" title="title" /></p> . . diff --git a/src/blocks.c b/src/blocks.c index 7613c82..ccb84a7 100644 --- a/src/blocks.c +++ b/src/blocks.c @@ -20,14 +20,14 @@ static node_block* make_block(int tag, int start_line, int start_column) node_block* e; e = calloc(1, sizeof(*e)); - if(e != NULL) { - e->tag = tag; - e->open = true; - e->start_line = start_line; - e->start_column = start_column; - e->end_line = start_line; - strbuf_init(&e->string_content, 32); - } + if(e != NULL) { + e->tag = tag; + e->open = true; + e->start_line = start_line; + e->start_column = start_column; + e->end_line = start_line; + strbuf_init(&e->string_content, 32); + } return e; } @@ -252,12 +252,12 @@ static node_block* add_child(node_block* parent, return child; } + // Free a node_block list and any children. 
void cmark_free_nodes(node_block *e) { node_block * next; while (e != NULL) { - next = e->next; free_inlines(e->inline_content); strbuf_free(&e->string_content); if (e->tag == BLOCK_FENCED_CODE) { @@ -265,31 +265,63 @@ void cmark_free_nodes(node_block *e) } else if (e->tag == BLOCK_DOCUMENT) { reference_map_free(e->as.document.refmap); } - cmark_free_nodes(e->children); + if (e->last_child) { + // Splice children into list + e->last_child->next = e->next; + e->next = e->children; + } + next = e->next; free(e); e = next; } } +typedef struct BlockStack { + struct BlockStack *previous; + node_block *next_sibling; +} block_stack; + // Walk through node_block and all children, recursively, parsing // string content into inline content where appropriate. void process_inlines(node_block* cur, reference_map *refmap) { - switch (cur->tag) { - case BLOCK_PARAGRAPH: - case BLOCK_ATX_HEADER: - case BLOCK_SETEXT_HEADER: - cur->inline_content = parse_inlines(&cur->string_content, refmap); - break; + block_stack* stack = NULL; + block_stack* newstack = NULL; + + while (cur != NULL) { + switch (cur->tag) { + case BLOCK_PARAGRAPH: + case BLOCK_ATX_HEADER: + case BLOCK_SETEXT_HEADER: + cur->inline_content = parse_inlines(&cur->string_content, refmap); + break; - default: - break; - } + default: + break; + } - node_block *child = cur->children; - while (child != NULL) { - process_inlines(child, refmap); - child = child->next; + if (cur->children) { + newstack = (block_stack*)malloc(sizeof(block_stack)); + if (newstack == NULL) return; + newstack->previous = stack; + stack = newstack; + stack->next_sibling = cur->next; + cur = cur->children; + } else { + cur = cur->next; + } + + while (cur == NULL && stack != NULL) { + cur = stack->next_sibling; + newstack = stack->previous; + free(stack); + stack = newstack; + } + } + while (stack != NULL) { + newstack = stack->previous; + free(stack); + stack = newstack; } } @@ -311,16 +343,16 @@ static int parse_list_marker(chunk *input, int pos, 
struct ListData ** dataptr) return 0; } data = calloc(1, sizeof(*data)); - if(data == NULL) { - return 0; - } else { - data->marker_offset = 0; // will be adjusted later - data->list_type = bullet; - data->bullet_char = c; - data->start = 1; - data->delimiter = period; - data->tight = false; - } + if(data == NULL) { + return 0; + } else { + data->marker_offset = 0; // will be adjusted later + data->list_type = bullet; + data->bullet_char = c; + data->start = 1; + data->delimiter = period; + data->tight = false; + } } else if (isdigit(c)) { int start = 0; @@ -336,16 +368,16 @@ static int parse_list_marker(chunk *input, int pos, struct ListData ** dataptr) return 0; } data = calloc(1, sizeof(*data)); - if(data == NULL) { - return 0; - } else { - data->marker_offset = 0; // will be adjusted later - data->list_type = ordered; - data->bullet_char = 0; - data->start = start; - data->delimiter = (c == '.' ? period : parens); - data->tight = false; - } + if(data == NULL) { + return 0; + } else { + data->marker_offset = 0; // will be adjusted later + data->list_type = ordered; + data->bullet_char = 0; + data->start = start; + data->delimiter = (c == '.' ? 
period : parens); + data->tight = false; + } } else { return 0; } @@ -438,8 +470,8 @@ static void chop_trailing_hashtags(chunk *ch) // Check for a be a space before the final #s: if (n != orig_n && n >= 0 && peek_at(ch, n) == ' ') { - ch->len = n; - chunk_rtrim(ch); + ch->len = n; + chunk_rtrim(ch); } } @@ -462,7 +494,7 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr) // Add a newline to the end if not present: if (line->ptr[line->size - 1] != '\n') { - strbuf_putc(line, '\n'); + strbuf_putc(line, '\n'); } input.data = line->ptr; input.len = line->size; diff --git a/src/buffer.c b/src/buffer.c index 1cdcae8..a5139fa 100644 --- a/src/buffer.c +++ b/src/buffer.c @@ -291,19 +291,11 @@ void strbuf_drop(strbuf *buf, int n) } } -void strbuf_trim(strbuf *buf) +void strbuf_rtrim(strbuf *buf) { - int i = 0; - if (!buf->size) return; - while (i < buf->size && isspace(buf->ptr[i])) - i++; - - strbuf_drop(buf, i); - - /* rtrim */ while (buf->size > 0) { if (!isspace(buf->ptr[buf->size - 1])) break; @@ -314,6 +306,21 @@ void strbuf_trim(strbuf *buf) buf->ptr[buf->size] = '\0'; } +void strbuf_trim(strbuf *buf) +{ + int i = 0; + + if (!buf->size) + return; + + while (i < buf->size && isspace(buf->ptr[i])) + i++; + + strbuf_drop(buf, i); + + strbuf_rtrim(buf); +} + // Destructively modify string, collapsing consecutive // space and newline characters into a single space. 
void strbuf_normalize_whitespace(strbuf *s) diff --git a/src/buffer.h b/src/buffer.h index 1bc1eee..63d6202 100644 --- a/src/buffer.h +++ b/src/buffer.h @@ -107,6 +107,7 @@ int strbuf_strchr(const strbuf *buf, int c, int pos); int strbuf_strrchr(const strbuf *buf, int c, int pos); void strbuf_drop(strbuf *buf, int n); void strbuf_truncate(strbuf *buf, int len); +void strbuf_rtrim(strbuf *buf); void strbuf_trim(strbuf *buf); void strbuf_normalize_whitespace(strbuf *s); void strbuf_unescape(strbuf *s); diff --git a/src/cmark.h b/src/cmark.h index e34df72..ff2f9a2 100644 --- a/src/cmark.h +++ b/src/cmark.h @@ -9,7 +9,6 @@ #define VERSION "0.1" #define CODE_INDENT 4 -#define STACK_LIMIT 1000 struct node_inl { enum { diff --git a/src/html/html.c b/src/html/html.c index fde1cb4..5f08506 100644 --- a/src/html/html.c +++ b/src/html/html.c @@ -8,6 +8,76 @@ #include "debug.h" #include "html/houdini.h" +typedef struct RenderStack { + struct RenderStack *previous; + char* literal; + union { + node_inl *inl; + node_block *block; + } next_sibling; + bool tight; + bool trim; +} render_stack; + +static void free_render_stack(render_stack * rstack) +{ + render_stack * tempstack; + while (rstack) { + tempstack = rstack; + rstack = rstack->previous; + free(tempstack); + } +} + +static render_stack* push_inline(render_stack* rstack, + node_inl* inl, + char* literal) +{ + render_stack* newstack; + newstack = (render_stack*)malloc(sizeof(render_stack)); + if (newstack == NULL) { + return NULL; + } + newstack->previous = rstack; + newstack->next_sibling.inl = inl; + newstack->literal = literal; + newstack->tight = false; + newstack->trim = false; + return newstack; +} + +static render_stack* push_block(render_stack* rstack, + node_block* block, + char* literal, + bool tight, + bool trim) +{ + render_stack* newstack; + newstack = (render_stack*)malloc(sizeof(render_stack)); + if (newstack == NULL) { + return NULL; + } + newstack->previous = rstack; + newstack->next_sibling.block = block; 
+ newstack->literal = literal; + newstack->tight = tight; + newstack->trim = trim; + return newstack; +} + +static render_stack* pop_render_stack(render_stack* rstack) +{ + render_stack* top = rstack; + + if (rstack == NULL) { + return NULL; + } + rstack = rstack->previous; + top->previous = NULL; + free_render_stack(top); + return rstack; +} + // Functions to convert node_block and inline lists to HTML strings. static void escape_html(strbuf *dest, const unsigned char *source, int length) @@ -33,196 +103,276 @@ static inline void cr(strbuf *html) } // Convert an inline list to HTML. Returns 0 on success, and sets result. -static void inlines_to_html(strbuf *html, node_inl* ils) +static void inlines_to_plain_html(strbuf *html, node_inl* ils) { - strbuf scrap = GH_BUF_INIT; + node_inl* children; + bool visit_children; + render_stack* rstack = NULL; while(ils != NULL) { + visit_children = false; switch(ils->tag) { - case INL_STRING: - escape_html(html, ils->content.literal.data, ils->content.literal.len); - break; - - case INL_LINEBREAK: - strbuf_puts(html, "<br />\n"); - break; - - case INL_SOFTBREAK: - strbuf_putc(html, '\n'); - break; - - case INL_CODE: - strbuf_puts(html, "<code>"); - escape_html(html, ils->content.literal.data, ils->content.literal.len); - strbuf_puts(html, "</code>"); - break; - - case INL_RAW_HTML: - strbuf_put(html, - ils->content.literal.data, - ils->content.literal.len); - break; - - case INL_LINK: - strbuf_puts(html, "<a href=\""); - if (ils->content.linkable.url) - escape_href(html, ils->content.linkable.url, -1); - - if (ils->content.linkable.title) { - strbuf_puts(html, "\" title=\""); - escape_html(html, ils->content.linkable.title, -1); - } + case INL_STRING: + case INL_CODE: + case INL_RAW_HTML: + escape_html(html, ils->content.literal.data, ils->content.literal.len); + break; + + case INL_LINEBREAK: + case INL_SOFTBREAK: + strbuf_putc(html, '\n'); + break; + + case INL_LINK: + case INL_IMAGE: + children = ils->content.inlines; + 
visit_children = true; + rstack = push_inline(rstack, ils->next, ""); + break; + + case INL_STRONG: + case INL_EMPH: + children = ils->content.inlines; + visit_children = true; + rstack = push_inline(rstack, ils->next, ""); + break; + } + if (visit_children) { + ils = children; + } else { + ils = ils->next; + } + while (ils == NULL && rstack != NULL) { + strbuf_puts(html, rstack->literal); + ils = rstack->next_sibling.inl; + rstack = pop_render_stack(rstack); + } + } - strbuf_puts(html, "\">"); - inlines_to_html(html, ils->content.inlines); - strbuf_puts(html, "</a>"); - break; - - case INL_IMAGE: - strbuf_puts(html, "<img src=\""); - if (ils->content.linkable.url) - escape_href(html, ils->content.linkable.url, -1); - - inlines_to_html(&scrap, ils->content.inlines); - strbuf_puts(html, "\" alt=\""); - if (scrap.size) - escape_html(html, scrap.ptr, scrap.size); - strbuf_clear(&scrap); - - if (ils->content.linkable.title) { - strbuf_puts(html, "\" title=\""); - escape_html(html, ils->content.linkable.title, -1); - } + free_render_stack(rstack); +} - strbuf_puts(html, "\"/>"); - break; - case INL_STRONG: - strbuf_puts(html, "<strong>"); - inlines_to_html(html, ils->content.inlines); - strbuf_puts(html, "</strong>"); - break; +// Convert an inline list to HTML. Returns 0 on success, and sets result. 
+static void inlines_to_html(strbuf *html, node_inl* ils) +{ + node_inl* children; + render_stack* rstack = NULL; - case INL_EMPH: - strbuf_puts(html, "<em>"); - inlines_to_html(html, ils->content.inlines); - strbuf_puts(html, "</em>"); - break; + while(ils != NULL) { + children = NULL; + switch(ils->tag) { + case INL_STRING: + escape_html(html, ils->content.literal.data, ils->content.literal.len); + break; + + case INL_LINEBREAK: + strbuf_puts(html, "<br />\n"); + break; + + case INL_SOFTBREAK: + strbuf_putc(html, '\n'); + break; + + case INL_CODE: + strbuf_puts(html, "<code>"); + escape_html(html, ils->content.literal.data, ils->content.literal.len); + strbuf_puts(html, "</code>"); + break; + + case INL_RAW_HTML: + strbuf_put(html, + ils->content.literal.data, + ils->content.literal.len); + break; + + case INL_LINK: + strbuf_puts(html, "<a href=\""); + if (ils->content.linkable.url) + escape_href(html, ils->content.linkable.url, -1); + + if (ils->content.linkable.title) { + strbuf_puts(html, "\" title=\""); + escape_html(html, ils->content.linkable.title, -1); + } + + strbuf_puts(html, "\">"); + children = ils->content.inlines; + rstack = push_inline(rstack, ils->next, "</a>"); + break; + + case INL_IMAGE: + strbuf_puts(html, "<img src=\""); + if (ils->content.linkable.url) + escape_href(html, ils->content.linkable.url, -1); + + strbuf_puts(html, "\" alt=\""); + inlines_to_plain_html(html, ils->content.inlines); + + if (ils->content.linkable.title) { + strbuf_puts(html, "\" title=\""); + escape_html(html, ils->content.linkable.title, -1); + } + + strbuf_puts(html, "\"/>"); + break; + + case INL_STRONG: + strbuf_puts(html, "<strong>"); + children = ils->content.inlines; + rstack = push_inline(rstack, ils->next, "</strong>"); + break; + + case INL_EMPH: + strbuf_puts(html, "<em>"); + children = ils->content.inlines; + rstack = push_inline(rstack, ils->next, "</em>"); + break; + } + if (children) { + ils = children; + } else { + ils = ils->next; + } + while (ils == 
NULL && rstack != NULL) { + strbuf_puts(html, rstack->literal); + ils = rstack->next_sibling.inl; + rstack = pop_render_stack(rstack); } - ils = ils->next; } - strbuf_free(&scrap); + free_render_stack(rstack); } // Convert a node_block list to HTML. Returns 0 on success, and sets result. -static void blocks_to_html(strbuf *html, node_block *b, bool tight) +static void blocks_to_html(strbuf *html, node_block *b) { struct ListData *data; + render_stack* rstack = NULL; + bool visit_children = false; + bool tight = false; while(b != NULL) { + visit_children = false; switch(b->tag) { - case BLOCK_DOCUMENT: - blocks_to_html(html, b->children, false); - break; - - case BLOCK_PARAGRAPH: - if (tight) { - inlines_to_html(html, b->inline_content); - } else { - cr(html); - strbuf_puts(html, "<p>"); - inlines_to_html(html, b->inline_content); - strbuf_puts(html, "</p>\n"); - } - break; - - case BLOCK_BQUOTE: - cr(html); - strbuf_puts(html, "<blockquote>\n"); - blocks_to_html(html, b->children, false); - strbuf_puts(html, "</blockquote>\n"); - break; - - case BLOCK_LIST_ITEM: - cr(html); - strbuf_puts(html, "<li>"); - blocks_to_html(html, b->children, tight); - strbuf_trim(html); /* TODO: rtrim */ - strbuf_puts(html, "</li>\n"); - break; - - case BLOCK_LIST: - // make sure a list starts at the beginning of the line: - cr(html); - data = &(b->as.list); - - if (data->start > 1) { - strbuf_printf(html, "<%s start=\"%d\">\n", - data->list_type == bullet ? "ul" : "ol", - data->start); - } else { - strbuf_puts(html, data->list_type == bullet ? "<ul>\n" : "<ol>\n"); - } + case BLOCK_DOCUMENT: + rstack = push_block(rstack, b->next, "", false, false); + visit_children = true; + break; - blocks_to_html(html, b->children, data->tight); - strbuf_puts(html, data->list_type == bullet ? 
"</ul>" : "</ol>"); - strbuf_putc(html, '\n'); - break; - - case BLOCK_ATX_HEADER: - case BLOCK_SETEXT_HEADER: - cr(html); - strbuf_printf(html, "<h%d>", b->as.header.level); + case BLOCK_PARAGRAPH: + if (tight) { inlines_to_html(html, b->inline_content); - strbuf_printf(html, "</h%d>\n", b->as.header.level); - break; - - case BLOCK_INDENTED_CODE: - case BLOCK_FENCED_CODE: + } else { cr(html); - - strbuf_puts(html, "<pre><code"); - - if (b->tag == BLOCK_FENCED_CODE) { - strbuf *info = &b->as.code.info; - - if (strbuf_len(info) > 0) { - int first_tag = strbuf_strchr(info, ' ', 0); - if (first_tag < 0) - first_tag = strbuf_len(info); - - strbuf_puts(html, " class=\"language-"); - escape_html(html, info->ptr, first_tag); - strbuf_putc(html, '"'); - } + strbuf_puts(html, "<p>"); + inlines_to_html(html, b->inline_content); + strbuf_puts(html, "</p>\n"); + } + break; + + case BLOCK_BQUOTE: + cr(html); + strbuf_puts(html, "<blockquote>\n"); + rstack = push_block(rstack, b->next, "</blockquote>\n", tight, false); + tight = false; + visit_children = true; + break; + + case BLOCK_LIST_ITEM: + cr(html); + strbuf_puts(html, "<li>"); + rstack = push_block(rstack, b->next, "</li>\n", tight, true); + visit_children = true; + break; + + case BLOCK_LIST: + // make sure a list starts at the beginning of the line: + cr(html); + data = &(b->as.list); + + if (data->start > 1) { + strbuf_printf(html, "<%s start=\"%d\">\n", + data->list_type == bullet ? "ul" : "ol", + data->start); + } else { + strbuf_puts(html, data->list_type == bullet ? "<ul>\n" : "<ol>\n"); + } + + rstack = push_block(rstack, b->next, + data->list_type == bullet ? 
+ "\n</ul>\n" : "\n</ol>\n", tight, false); + tight = data->tight; + visit_children = true; + break; + + case BLOCK_ATX_HEADER: + case BLOCK_SETEXT_HEADER: + cr(html); + strbuf_printf(html, "<h%d>", b->as.header.level); + inlines_to_html(html, b->inline_content); + strbuf_printf(html, "</h%d>\n", b->as.header.level); + break; + + case BLOCK_INDENTED_CODE: + case BLOCK_FENCED_CODE: + cr(html); + + strbuf_puts(html, "<pre><code"); + + if (b->tag == BLOCK_FENCED_CODE) { + strbuf *info = &b->as.code.info; + + if (strbuf_len(info) > 0) { + int first_tag = strbuf_strchr(info, ' ', 0); + if (first_tag < 0) + first_tag = strbuf_len(info); + + strbuf_puts(html, " class=\"language-"); + escape_html(html, info->ptr, first_tag); + strbuf_putc(html, '"'); } + } - strbuf_putc(html, '>'); - escape_html(html, b->string_content.ptr, b->string_content.size); - strbuf_puts(html, "</code></pre>\n"); - break; + strbuf_putc(html, '>'); + escape_html(html, b->string_content.ptr, b->string_content.size); + strbuf_puts(html, "</code></pre>\n"); + break; - case BLOCK_HTML: - strbuf_put(html, b->string_content.ptr, b->string_content.size); - break; + case BLOCK_HTML: + strbuf_put(html, b->string_content.ptr, b->string_content.size); + break; - case BLOCK_HRULE: - strbuf_puts(html, "<hr />\n"); - break; + case BLOCK_HRULE: + strbuf_puts(html, "<hr />\n"); + break; - case BLOCK_REFERENCE_DEF: - break; + case BLOCK_REFERENCE_DEF: + break; - default: - assert(false); + default: + assert(false); + } + if (visit_children) { + b = b->children; + } else { + b = b->next; + } + while (b == NULL && rstack != NULL) { + strbuf_puts(html, rstack->literal); + if (rstack->trim) { + strbuf_rtrim(html); + } + tight = rstack->tight; + b = rstack->next_sibling.block; + rstack = pop_render_stack(rstack); } - - b = b->next; } + + free_render_stack(rstack); } void cmark_render_html(strbuf *html, node_block *root) { - blocks_to_html(html, root, false); + blocks_to_html(html, root); } diff --git a/src/inlines.c 
b/src/inlines.c index 9216979..810230c 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -9,6 +9,7 @@ #include "utf8.h" #include "scanners.h" #include "inlines.h" +#include "debug.h" typedef struct InlineStack { struct InlineStack *previous; @@ -41,9 +42,9 @@ static unsigned char *bufdup(const unsigned char *buf) if (buf) { int len = strlen((char *)buf); new = calloc(len + 1, sizeof(*new)); - if(new != NULL) { - memcpy(new, buf, len + 1); - } + if(new != NULL) { + memcpy(new, buf, len + 1); + } } return new; @@ -52,13 +53,13 @@ static unsigned char *bufdup(const unsigned char *buf) static inline node_inl *make_link_(node_inl *label, unsigned char *url, unsigned char *title) { node_inl* e = calloc(1, sizeof(*e)); - if(e != NULL) { - e->tag = INL_LINK; - e->content.linkable.label = label; - e->content.linkable.url = url; - e->content.linkable.title = title; - e->next = NULL; - } + if(e != NULL) { + e->tag = INL_LINK; + e->content.linkable.label = label; + e->content.linkable.url = url; + e->content.linkable.title = title; + e->next = NULL; + } return e; } @@ -81,11 +82,11 @@ inline static node_inl* make_link(node_inl* label, chunk url, chunk title) inline static node_inl* make_inlines(int t, node_inl* contents) { node_inl * e = calloc(1, sizeof(*e)); - if(e != NULL) { - e->tag = t; - e->content.inlines = contents; - e->next = NULL; - } + if(e != NULL) { + e->tag = t; + e->content.inlines = contents; + e->next = NULL; + } return e; } @@ -93,11 +94,11 @@ inline static node_inl* make_inlines(int t, node_inl* contents) inline static node_inl* make_literal(int t, chunk s) { node_inl * e = calloc(1, sizeof(*e)); - if(e != NULL) { - e->tag = t; - e->content.literal = s; - e->next = NULL; - } + if(e != NULL) { + e->tag = t; + e->content.literal = s; + e->next = NULL; + } return e; } @@ -105,10 +106,10 @@ inline static node_inl* make_literal(int t, chunk s) inline static node_inl* make_simple(int t) { node_inl* e = calloc(1, sizeof(*e)); - if(e != NULL) { - e->tag = t; - 
e->next = NULL; - } + if(e != NULL) { + e->tag = t; + e->next = NULL; + } return e; } @@ -121,10 +122,28 @@ inline static node_inl* make_simple(int t) #define make_emph(contents) make_inlines(INL_EMPH, contents) #define make_strong(contents) make_inlines(INL_STRONG, contents) -// Free an inline list. +// Utility function used by free_inlines +void splice_into_list(node_inl* e, node_inl* children) { + node_inl * tmp; + if (children) { + tmp = children; + // Find last child + while (tmp->next) { + tmp = tmp->next; + } + // Splice children into list + tmp->next = e->next; + e->next = children; + } + return ; +} + +// Free an inline list. Avoid recursion to prevent stack overflows +// on deeply nested structures. extern void free_inlines(node_inl* e) { node_inl * next; + while (e != NULL) { switch (e->tag){ case INL_STRING: @@ -139,13 +158,14 @@ extern void free_inlines(node_inl* e) case INL_IMAGE: free(e->content.linkable.url); free(e->content.linkable.title); - free_inlines(e->content.linkable.label); + splice_into_list(e, e->content.linkable.label); break; case INL_EMPH: case INL_STRONG: - free_inlines(e->content.inlines); + splice_into_list(e, e->content.inlines); break; default: + log_warn("Unknown inline tag %d", e->tag); break; } next = e->next; @@ -297,8 +317,8 @@ static int scan_delims(subject* subj, unsigned char c, bool * can_open, bool * c advance(subj); } char_after = peek_char(subj); - *can_open = numdelims > 0 && numdelims <= 3 && !isspace(char_after); - *can_close = numdelims > 0 && numdelims <= 3 && !isspace(char_before); + *can_open = numdelims > 0 && !isspace(char_after); + *can_close = numdelims > 0 && !isspace(char_before); if (c == '_') { *can_open = *can_open && !isalnum(char_before); *can_close = *can_close && !isalnum(char_after); @@ -308,13 +328,13 @@ static int scan_delims(subject* subj, unsigned char c, bool * can_open, bool * c static void free_openers(subject* subj, inline_stack* istack) { - inline_stack * tempstack; - while 
(subj->emphasis_openers != istack) { - tempstack = subj->emphasis_openers; - subj->emphasis_openers = subj->emphasis_openers->previous; - subj->emphasis_nestlevel--; - free(tempstack); - } + inline_stack * tempstack; + while (subj->emphasis_openers != istack) { + tempstack = subj->emphasis_openers; + subj->emphasis_openers = subj->emphasis_openers->previous; + subj->emphasis_nestlevel--; + free(tempstack); + } } // Parse strong/emph or a fallback. @@ -324,6 +344,7 @@ static node_inl* handle_strong_emph(subject* subj, unsigned char c, node_inl **l bool can_open, can_close; int numdelims; int useDelims; + int openerDelims; inline_stack * istack; node_inl * inl; node_inl * emph; @@ -332,81 +353,84 @@ static node_inl* handle_strong_emph(subject* subj, unsigned char c, node_inl **l numdelims = scan_delims(subj, c, &can_open, &can_close); if (can_close) - { - // walk the stack and find a matching opener, if there is one - istack = subj->emphasis_openers; - while (true) { - if (istack == NULL) - goto cannotClose; - - if (istack->delim_char == c) - break; + // walk the stack and find a matching opener, if there is one + istack = subj->emphasis_openers; + while (true) + { + if (istack == NULL) + goto cannotClose; + + if (istack->delim_char == c) + break; + + istack = istack->previous; + } + + // calculate the actual number of delimeters used from this closer + openerDelims = istack->delim_count; + if (numdelims < 3 || openerDelims < 3) { + useDelims = numdelims <= openerDelims ? numdelims : openerDelims; + } else { // (numdelims >= 3 && openerDelims >= 3) + useDelims = numdelims % 2 == 0 ? 2 : 1; + } - istack = istack->previous; + if (istack->delim_count == useDelims) + { + // the opener is completely used up - remove the stack entry and reuse the inline element + inl = istack->first_inline; + inl->tag = useDelims == 1 ? 
INL_EMPH : INL_STRONG; + chunk_free(&inl->content.literal); + inl->content.inlines = inl->next; + inl->next = NULL; + + // remove this opener and all later ones from stack: + free_openers(subj, istack->previous); + *last = inl; + } + else + { + // the opener will only partially be used - stack entry remains (truncated) and a new inline is added. + inl = istack->first_inline; + istack->delim_count -= useDelims; + inl->content.literal.len = istack->delim_count; + + emph = useDelims == 1 ? make_emph(inl->next) : make_strong(inl->next); + inl->next = emph; + + // remove all later openers from stack: + free_openers(subj, istack); + + *last = emph; + } + + // if the closer was not fully used, move back a char or two and try again. + if (useDelims < numdelims) + { + subj->pos = subj->pos - numdelims + useDelims; + return NULL; + } + + return NULL; // make_str(chunk_literal("")); } - // calculate the actual number of delimeters used from this closer - useDelims = istack->delim_count; - if (useDelims == 3) useDelims = numdelims == 3 ? 1 : numdelims; - else if (useDelims > numdelims) useDelims = 1; - - if (istack->delim_count == useDelims) - { - // the opener is completely used up - remove the stack entry and reuse the inline element - inl = istack->first_inline; - inl->tag = useDelims == 1 ? INL_EMPH : INL_STRONG; - chunk_free(&inl->content.literal); - inl->content.inlines = inl->next; - inl->next = NULL; - - // remove this opener and all later ones from stack: - free_openers(subj, istack->previous); - *last = inl; - } - else - { - // the opener will only partially be used - stack entry remains (truncated) and a new inline is added. - inl = istack->first_inline; - istack->delim_count -= useDelims; - inl->content.literal.len = istack->delim_count; - - emph = useDelims == 1 ? 
make_emph(inl->next) : make_strong(inl->next); - inl->next = emph; - - // remove all later openers from stack: - free_openers(subj, istack); - - *last = emph; - } + cannotClose: + inl_text = make_str(chunk_dup(&subj->input, subj->pos - numdelims, numdelims)); - // if the closer was not fully used, move back a char or two and try again. - if (useDelims < numdelims) + if (can_open) { - subj->pos = subj->pos - numdelims + useDelims; - return handle_strong_emph(subj, c, last); + istack = (inline_stack*)malloc(sizeof(inline_stack)); + if (istack == NULL) { + return NULL; + } + istack->delim_count = numdelims; + istack->delim_char = c; + istack->first_inline = inl_text; + istack->previous = subj->emphasis_openers; + subj->emphasis_openers = istack; + subj->emphasis_nestlevel++; } - return NULL; // make_str(chunk_literal("")); - } - -cannotClose: - inl_text = make_str(chunk_dup(&subj->input, subj->pos - numdelims, numdelims)); - - if (can_open && subj->emphasis_nestlevel < STACK_LIMIT) - { - istack = (inline_stack*)malloc(sizeof(inline_stack)); - if (istack == NULL) { - return NULL; - } - istack->delim_count = numdelims; - istack->delim_char = c; - istack->first_inline = inl_text; - istack->previous = subj->emphasis_openers; - subj->emphasis_openers = istack; - subj->emphasis_nestlevel++; - } - return inl_text; } @@ -438,7 +462,7 @@ static node_inl* handle_entity(subject* subj) len = houdini_unescape_ent(&ent, subj->input.data + subj->pos, subj->input.len - subj->pos - ); + ); if (len == 0) return make_str(chunk_literal("&")); @@ -538,9 +562,9 @@ static node_inl* handle_pointy_brace(subject* subj) subj->pos += matchlen; return make_autolink( - make_str_with_entities(&contents), - contents, 0 - ); + make_str_with_entities(&contents), + contents, 0 + ); } // next try to match an email autolink @@ -550,9 +574,9 @@ static node_inl* handle_pointy_brace(subject* subj) subj->pos += matchlen; return make_autolink( - make_str_with_entities(&contents), - contents, 1 - ); + 
make_str_with_entities(&contents), + contents, 1 + ); } // finally, try to match an html tag @@ -594,8 +618,7 @@ static int link_label(subject* subj, chunk *raw_label) advance(subj); // advance past [ unsigned char c; - while ((c = peek_char(subj)) && - (c != ']' || (nestlevel > 0 && nestlevel < STACK_LIMIT))) { + while ((c = peek_char(subj)) && (c != ']' || nestlevel > 0)) { switch (c) { case '`': tmp = handle_backticks(subj); @@ -646,11 +669,12 @@ static node_inl* handle_left_bracket(subject* subj) int n; int sps; int found_label; - int endlabel, starturl, endurl, starttitle, endtitle, endall; + int endlabel, startpos, starturl, endurl, starttitle, endtitle, endall; chunk rawlabel; chunk url, title; + startpos = subj->pos; found_label = link_label(subj, &rawlabel); endlabel = subj->pos; @@ -679,13 +703,7 @@ static node_inl* handle_left_bracket(subject* subj) return make_link(lab, url, title); } else { - // if we get here, we matched a label but didn't get further: - subj->pos = endlabel; - lab = parse_chunk_inlines(&rawlabel, subj->refmap); - result = append_inlines(make_str(chunk_literal("[")), - append_inlines(lab, - make_str(chunk_literal("]")))); - return result; + goto noMatch; } } else { chunk rawlabel_tmp; @@ -710,16 +728,14 @@ static node_inl* handle_left_bracket(subject* subj) lab = parse_chunk_inlines(&rawlabel, NULL); result = make_ref_link(lab, ref); } else { - subj->pos = endlabel; - lab = parse_chunk_inlines(&rawlabel, subj->refmap); - result = append_inlines(make_str(chunk_literal("[")), - append_inlines(lab, make_str(chunk_literal("]")))); + goto noMatch; } return result; } } +noMatch: // If we fall through to here, it means we didn't match a link: - advance(subj); // advance past [ + subj->pos = startpos + 1; // advance past [ return make_str(chunk_literal("[")); } @@ -755,9 +771,9 @@ extern node_inl* parse_inlines_while(subject* subj, int (*f)(subject*)) node_inl** last = &result; node_inl* first = NULL; while ((*f)(subj) && parse_inline(subj, 
last)) { - if (!first) { - first = *last; - } + if (!first) { + first = *last; + } } inline_stack* istack = subj->emphasis_openers; diff --git a/src/references.c b/src/references.c index 04b9025..5ba4b24 100644 --- a/src/references.c +++ b/src/references.c @@ -16,12 +16,12 @@ refhash(const unsigned char *link_ref) static void reference_free(reference *ref) { - if(ref != NULL) { - free(ref->label); - free(ref->url); - free(ref->title); - free(ref); - } + if(ref != NULL) { + free(ref->label); + free(ref->url); + free(ref->title); + free(ref); + } } // normalize reference: collapse internal whitespace to single space, @@ -33,8 +33,8 @@ static unsigned char *normalize_reference(chunk *ref) strbuf normalized = GH_BUF_INIT; unsigned char *result; - if(ref == NULL) - return NULL; + if(ref == NULL) + return NULL; if (ref->len == 0) return NULL; @@ -50,7 +50,7 @@ static unsigned char *normalize_reference(chunk *ref) free(result); return NULL; } - + return result; } @@ -81,15 +81,15 @@ extern void reference_create(reference_map *map, chunk *label, chunk *url, chunk return; ref = calloc(1, sizeof(*ref)); - if(ref != NULL) { - ref->label = reflabel; - ref->hash = refhash(ref->label); - ref->url = clean_url(url); - ref->title = clean_title(title); - ref->next = NULL; - - add_reference(map, ref); - } + if(ref != NULL) { + ref->label = reflabel; + ref->hash = refhash(ref->label); + ref->url = clean_url(url); + ref->title = clean_title(title); + ref->next = NULL; + + add_reference(map, ref); + } } // Returns reference if refmap contains a reference with matching @@ -125,8 +125,8 @@ void reference_map_free(reference_map *map) { unsigned int i; - if(map == NULL) - return; + if(map == NULL) + return; for (i = 0; i < REFMAP_SIZE; ++i) { reference *ref = map->table[i]; @@ -144,5 +144,5 @@ void reference_map_free(reference_map *map) reference_map *reference_map_new(void) { - return calloc(1, sizeof(reference_map)); + return calloc(1, sizeof(reference_map)); } |