diff options
author | Jason A. Donenfeld <Jason@zx2c4.com> | 2015-10-09 15:13:35 +0200 |
---|---|---|
committer | Jason A. Donenfeld <Jason@zx2c4.com> | 2015-10-09 15:13:35 +0200 |
commit | 525c815cc400bc49881144bcd7e7b717bbc1af5d (patch) | |
tree | 1d4ed0d11a950c45cc1fceb26cd3aa20ed6c2300 /filters/html-converters/resources/markdown.pl | |
parent | 6edfc1672cdc5eb0dfb0ff5db0ec1de1ec53415e (diff) |
filters: Simplify converters
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Diffstat (limited to 'filters/html-converters/resources/markdown.pl')
-rwxr-xr-x | filters/html-converters/resources/markdown.pl | 1727 |
1 files changed, 0 insertions, 1727 deletions
diff --git a/filters/html-converters/resources/markdown.pl b/filters/html-converters/resources/markdown.pl deleted file mode 100755 index 4c39808..0000000 --- a/filters/html-converters/resources/markdown.pl +++ /dev/null @@ -1,1727 +0,0 @@ -#!/usr/bin/perl - -# -# Markdown -- A text-to-HTML conversion tool for web writers -# -# Copyright (c) 2004 John Gruber -# <http://daringfireball.net/projects/markdown/> -# - - -package Markdown; -require 5.006_000; -use strict; -use warnings; - -use Digest::MD5 qw(md5_hex); -use vars qw($VERSION); -$VERSION = '1.0.1'; -# Tue 14 Dec 2004 - - -# -# Global default settings: -# -my $g_empty_element_suffix = " />"; # Change to ">" for HTML output -my $g_tab_width = 4; - - -# -# Globals: -# - -# Regex to match balanced [brackets]. See Friedl's -# "Mastering Regular Expressions", 2nd Ed., pp. 328-331. -my $g_nested_brackets; -$g_nested_brackets = qr{ - (?> # Atomic matching - [^\[\]]+ # Anything other than brackets - | - \[ - (??{ $g_nested_brackets }) # Recursive set of nested brackets - \] - )* -}x; - - -# Table of hash values for escaped characters: -my %g_escape_table; -foreach my $char (split //, '\\`*_{}[]()>#+-.!') { - $g_escape_table{$char} = md5_hex($char); -} - - -# Global hashes, used by various utility routines -my %g_urls; -my %g_titles; -my %g_html_blocks; - -# Used to track when we're inside an ordered or unordered list -# (see _ProcessListItems() for details): -my $g_list_level = 0; - - -#### Blosxom plug-in interface ########################################## - -# Set $g_blosxom_use_meta to 1 to use Blosxom's meta plug-in to determine -# which posts Markdown should process, using a "meta-markup: markdown" -# header. If it's set to 0 (the default), Markdown will process all -# entries. -my $g_blosxom_use_meta = 0; - -sub start { 1; } -sub story { - my($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_; - - if ( (! 
$g_blosxom_use_meta) or - (defined($meta::markup) and ($meta::markup =~ /^\s*markdown\s*$/i)) - ){ - $$body_ref = Markdown($$body_ref); - } - 1; -} - - -#### Movable Type plug-in interface ##################################### -eval {require MT}; # Test to see if we're running in MT. -unless ($@) { - require MT; - import MT; - require MT::Template::Context; - import MT::Template::Context; - - eval {require MT::Plugin}; # Test to see if we're running >= MT 3.0. - unless ($@) { - require MT::Plugin; - import MT::Plugin; - my $plugin = new MT::Plugin({ - name => "Markdown", - description => "A plain-text-to-HTML formatting plugin. (Version: $VERSION)", - doc_link => 'http://daringfireball.net/projects/markdown/' - }); - MT->add_plugin( $plugin ); - } - - MT::Template::Context->add_container_tag(MarkdownOptions => sub { - my $ctx = shift; - my $args = shift; - my $builder = $ctx->stash('builder'); - my $tokens = $ctx->stash('tokens'); - - if (defined ($args->{'output'}) ) { - $ctx->stash('markdown_output', lc $args->{'output'}); - } - - defined (my $str = $builder->build($ctx, $tokens) ) - or return $ctx->error($builder->errstr); - $str; # return value - }); - - MT->add_text_filter('markdown' => { - label => 'Markdown', - docs => 'http://daringfireball.net/projects/markdown/', - on_format => sub { - my $text = shift; - my $ctx = shift; - my $raw = 0; - if (defined $ctx) { - my $output = $ctx->stash('markdown_output'); - if (defined $output && $output =~ m/^html/i) { - $g_empty_element_suffix = ">"; - $ctx->stash('markdown_output', ''); - } - elsif (defined $output && $output eq 'raw') { - $raw = 1; - $ctx->stash('markdown_output', ''); - } - else { - $raw = 0; - $g_empty_element_suffix = " />"; - } - } - $text = $raw ? 
$text : Markdown($text); - $text; - }, - }); - - # If SmartyPants is loaded, add a combo Markdown/SmartyPants text filter: - my $smartypants; - - { - no warnings "once"; - $smartypants = $MT::Template::Context::Global_filters{'smarty_pants'}; - } - - if ($smartypants) { - MT->add_text_filter('markdown_with_smartypants' => { - label => 'Markdown With SmartyPants', - docs => 'http://daringfireball.net/projects/markdown/', - on_format => sub { - my $text = shift; - my $ctx = shift; - if (defined $ctx) { - my $output = $ctx->stash('markdown_output'); - if (defined $output && $output eq 'html') { - $g_empty_element_suffix = ">"; - } - else { - $g_empty_element_suffix = " />"; - } - } - $text = Markdown($text); - $text = $smartypants->($text, '1'); - }, - }); - } -} -else { -#### BBEdit/command-line text filter interface ########################## -# Needs to be hidden from MT (and Blosxom when running in static mode). - - # We're only using $blosxom::version once; tell Perl not to warn us: - no warnings 'once'; - unless ( defined($blosxom::version) ) { - use warnings; - - #### Check for command-line switches: ################# - my %cli_opts; - use Getopt::Long; - Getopt::Long::Configure('pass_through'); - GetOptions(\%cli_opts, - 'version', - 'shortversion', - 'html4tags', - ); - if ($cli_opts{'version'}) { # Version info - print "\nThis is Markdown, version $VERSION.\n"; - print "Copyright 2004 John Gruber\n"; - print "http://daringfireball.net/projects/markdown/\n\n"; - exit 0; - } - if ($cli_opts{'shortversion'}) { # Just the version number string. 
- print $VERSION; - exit 0; - } - if ($cli_opts{'html4tags'}) { # Use HTML tag style instead of XHTML - $g_empty_element_suffix = ">"; - } - - - #### Process incoming text: ########################### - my $text; - { - local $/; # Slurp the whole file - $text = <>; - } - print <<'EOT'; -<style> -.markdown-body { - font-size: 14px; - line-height: 1.6; - overflow: hidden; -} -.markdown-body>*:first-child { - margin-top: 0 !important; -} -.markdown-body>*:last-child { - margin-bottom: 0 !important; -} -.markdown-body a.absent { - color: #c00; -} -.markdown-body a.anchor { - display: block; - padding-left: 30px; - margin-left: -30px; - cursor: pointer; - position: absolute; - top: 0; - left: 0; - bottom: 0; -} -.markdown-body h1, .markdown-body h2, .markdown-body h3, .markdown-body h4, .markdown-body h5, .markdown-body h6 { - margin: 20px 0 10px; - padding: 0; - font-weight: bold; - -webkit-font-smoothing: antialiased; - cursor: text; - position: relative; -} -.markdown-body h1 .mini-icon-link, .markdown-body h2 .mini-icon-link, .markdown-body h3 .mini-icon-link, .markdown-body h4 .mini-icon-link, .markdown-body h5 .mini-icon-link, .markdown-body h6 .mini-icon-link { - display: none; - color: #000; -} -.markdown-body h1:hover a.anchor, .markdown-body h2:hover a.anchor, .markdown-body h3:hover a.anchor, .markdown-body h4:hover a.anchor, .markdown-body h5:hover a.anchor, .markdown-body h6:hover a.anchor { - text-decoration: none; - line-height: 1; - padding-left: 0; - margin-left: -22px; - top: 15%} -.markdown-body h1:hover a.anchor .mini-icon-link, .markdown-body h2:hover a.anchor .mini-icon-link, .markdown-body h3:hover a.anchor .mini-icon-link, .markdown-body h4:hover a.anchor .mini-icon-link, .markdown-body h5:hover a.anchor .mini-icon-link, .markdown-body h6:hover a.anchor .mini-icon-link { - display: inline-block; -} -.markdown-body h1 tt, .markdown-body h1 code, .markdown-body h2 tt, .markdown-body h2 code, .markdown-body h3 tt, .markdown-body h3 code, 
.markdown-body h4 tt, .markdown-body h4 code, .markdown-body h5 tt, .markdown-body h5 code, .markdown-body h6 tt, .markdown-body h6 code { - font-size: inherit; -} -.markdown-body h1 { - font-size: 28px; - color: #000; -} -.markdown-body h2 { - font-size: 24px; - border-bottom: 1px solid #ccc; - color: #000; -} -.markdown-body h3 { - font-size: 18px; -} -.markdown-body h4 { - font-size: 16px; -} -.markdown-body h5 { - font-size: 14px; -} -.markdown-body h6 { - color: #777; - font-size: 14px; -} -.markdown-body p, .markdown-body blockquote, .markdown-body ul, .markdown-body ol, .markdown-body dl, .markdown-body table, .markdown-body pre { - margin: 15px 0; -} -.markdown-body hr { - background: transparent url("/dirty-shade.png") repeat-x 0 0; - border: 0 none; - color: #ccc; - height: 4px; - padding: 0; -} -.markdown-body>h2:first-child, .markdown-body>h1:first-child, .markdown-body>h1:first-child+h2, .markdown-body>h3:first-child, .markdown-body>h4:first-child, .markdown-body>h5:first-child, .markdown-body>h6:first-child { - margin-top: 0; - padding-top: 0; -} -.markdown-body a:first-child h1, .markdown-body a:first-child h2, .markdown-body a:first-child h3, .markdown-body a:first-child h4, .markdown-body a:first-child h5, .markdown-body a:first-child h6 { - margin-top: 0; - padding-top: 0; -} -.markdown-body h1+p, .markdown-body h2+p, .markdown-body h3+p, .markdown-body h4+p, .markdown-body h5+p, .markdown-body h6+p { - margin-top: 0; -} -.markdown-body li p.first { - display: inline-block; -} -.markdown-body ul, .markdown-body ol { - padding-left: 30px; -} -.markdown-body ul.no-list, .markdown-body ol.no-list { - list-style-type: none; - padding: 0; -} -.markdown-body ul li>:first-child, .markdown-body ul li ul:first-of-type, .markdown-body ul li ol:first-of-type, .markdown-body ol li>:first-child, .markdown-body ol li ul:first-of-type, .markdown-body ol li ol:first-of-type { - margin-top: 0px; -} -.markdown-body ul li p:last-of-type, .markdown-body ol li 
p:last-of-type { - margin-bottom: 0; -} -.markdown-body ul ul, .markdown-body ul ol, .markdown-body ol ol, .markdown-body ol ul { - margin-bottom: 0; -} -.markdown-body dl { - padding: 0; -} -.markdown-body dl dt { - font-size: 14px; - font-weight: bold; - font-style: italic; - padding: 0; - margin: 15px 0 5px; -} -.markdown-body dl dt:first-child { - padding: 0; -} -.markdown-body dl dt>:first-child { - margin-top: 0px; -} -.markdown-body dl dt>:last-child { - margin-bottom: 0px; -} -.markdown-body dl dd { - margin: 0 0 15px; - padding: 0 15px; -} -.markdown-body dl dd>:first-child { - margin-top: 0px; -} -.markdown-body dl dd>:last-child { - margin-bottom: 0px; -} -.markdown-body blockquote { - border-left: 4px solid #DDD; - padding: 0 15px; - color: #777; -} -.markdown-body blockquote>:first-child { - margin-top: 0px; -} -.markdown-body blockquote>:last-child { - margin-bottom: 0px; -} -.markdown-body table th { - font-weight: bold; -} -.markdown-body table th, .markdown-body table td { - border: 1px solid #ccc; - padding: 6px 13px; -} -.markdown-body table tr { - border-top: 1px solid #ccc; - background-color: #fff; -} -.markdown-body table tr:nth-child(2n) { - background-color: #f8f8f8; -} -.markdown-body img { - max-width: 100%; - -moz-box-sizing: border-box; - box-sizing: border-box; -} -.markdown-body span.frame { - display: block; - overflow: hidden; -} -.markdown-body span.frame>span { - border: 1px solid #ddd; - display: block; - float: left; - overflow: hidden; - margin: 13px 0 0; - padding: 7px; - width: auto; -} -.markdown-body span.frame span img { - display: block; - float: left; -} -.markdown-body span.frame span span { - clear: both; - color: #333; - display: block; - padding: 5px 0 0; -} -.markdown-body span.align-center { - display: block; - overflow: hidden; - clear: both; -} -.markdown-body span.align-center>span { - display: block; - overflow: hidden; - margin: 13px auto 0; - text-align: center; -} -.markdown-body span.align-center span img { 
- margin: 0 auto; - text-align: center; -} -.markdown-body span.align-right { - display: block; - overflow: hidden; - clear: both; -} -.markdown-body span.align-right>span { - display: block; - overflow: hidden; - margin: 13px 0 0; - text-align: right; -} -.markdown-body span.align-right span img { - margin: 0; - text-align: right; -} -.markdown-body span.float-left { - display: block; - margin-right: 13px; - overflow: hidden; - float: left; -} -.markdown-body span.float-left span { - margin: 13px 0 0; -} -.markdown-body span.float-right { - display: block; - margin-left: 13px; - overflow: hidden; - float: right; -} -.markdown-body span.float-right>span { - display: block; - overflow: hidden; - margin: 13px auto 0; - text-align: right; -} -.markdown-body code, .markdown-body tt { - margin: 0 2px; - padding: 0px 5px; - border: 1px solid #eaeaea; - background-color: #f8f8f8; - border-radius: 3px; -} -.markdown-body code { - white-space: nowrap; -} -.markdown-body pre>code { - margin: 0; - padding: 0; - white-space: pre; - border: none; - background: transparent; -} -.markdown-body .highlight pre, .markdown-body pre { - background-color: #f8f8f8; - border: 1px solid #ccc; - font-size: 13px; - line-height: 19px; - overflow: auto; - padding: 6px 10px; - border-radius: 3px; -} -.markdown-body pre code, .markdown-body pre tt { - margin: 0; - padding: 0; - background-color: transparent; - border: none; -} -</style> -EOT - print "<div class='markdown-body'>"; - print Markdown($text); - print "</div>"; - } -} - - - -sub Markdown { -# -# Main function. The order in which other subs are called here is -# essential. Link and image substitutions need to happen before -# _EscapeSpecialChars(), so that any *'s or _'s in the <a> -# and <img> tags get encoded. -# - my $text = shift; - - # Clear the global hashes. If we don't clear these, you get conflicts - # from other articles when generating a page which contains more than - # one article (e.g. 
an index page that shows the N most recent - # articles): - %g_urls = (); - %g_titles = (); - %g_html_blocks = (); - - - # Standardize line endings: - $text =~ s{\r\n}{\n}g; # DOS to Unix - $text =~ s{\r}{\n}g; # Mac to Unix - - # Make sure $text ends with a couple of newlines: - $text .= "\n\n"; - - # Convert all tabs to spaces. - $text = _Detab($text); - - # Strip any lines consisting only of spaces and tabs. - # This makes subsequent regexen easier to write, because we can - # match consecutive blank lines with /\n+/ instead of something - # contorted like /[ \t]*\n+/ . - $text =~ s/^[ \t]+$//mg; - - # Turn block-level HTML blocks into hash entries - $text = _HashHTMLBlocks($text); - - # Strip link definitions, store in hashes. - $text = _StripLinkDefinitions($text); - - $text = _RunBlockGamut($text); - - $text = _UnescapeSpecialChars($text); - - return $text . "\n"; -} - - -sub _StripLinkDefinitions { -# -# Strips link definitions from text, stores the URLs and titles in -# hash references. -# - my $text = shift; - my $less_than_tab = $g_tab_width - 1; - - # Link defs are in the form: ^[id]: url "optional title" - while ($text =~ s{ - ^[ ]{0,$less_than_tab}\[(.+)\]: # id = $1 - [ \t]* - \n? # maybe *one* newline - [ \t]* - <?(\S+?)>? # url = $2 - [ \t]* - \n? # maybe one newline - [ \t]* - (?: - (?<=\s) # lookbehind for whitespace - ["(] - (.+?) # title = $3 - [")] - [ \t]* - )? # title is optional - (?:\n+|\Z) - } - {}mx) { - $g_urls{lc $1} = _EncodeAmpsAndAngles( $2 ); # Link IDs are case-insensitive - if ($3) { - $g_titles{lc $1} = $3; - $g_titles{lc $1} =~ s/"/&quot;/g; - } - } - - return $text; -} - - -sub _HashHTMLBlocks { - my $text = shift; - my $less_than_tab = $g_tab_width - 1; - - # Hashify HTML blocks: - # We only want to do this for block-level HTML tags, such as headers, - # lists, and tables. That's because we still want to wrap <p>s around - # "paragraphs" that are wrapped in non-block-level tags, such as anchors, - # phrase emphasis, and spans. 
The list of tags we're looking for is - # hard-coded: - my $block_tags_a = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/; - my $block_tags_b = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/; - - # First, look for nested blocks, e.g.: - # <div> - # <div> - # tags for inner block must be indented. - # </div> - # </div> - # - # The outermost tags must start at the left margin for this to match, and - # the inner nested divs must be indented. - # We need to do this before the next, more liberal match, because the next - # match will start at the first `<div>` and stop at the first `</div>`. - $text =~ s{ - ( # save in $1 - ^ # start of line (with /m) - <($block_tags_a) # start tag = $2 - \b # word break - (.*\n)*? # any number of lines, minimally matching - </\2> # the matching end tag - [ \t]* # trailing spaces/tabs - (?=\n+|\Z) # followed by a newline or end of document - ) - }{ - my $key = md5_hex($1); - $g_html_blocks{$key} = $1; - "\n\n" . $key . "\n\n"; - }egmx; - - - # - # Now match more liberally, simply from `\n<tag>` to `</tag>\n` - # - $text =~ s{ - ( # save in $1 - ^ # start of line (with /m) - <($block_tags_b) # start tag = $2 - \b # word break - (.*\n)*? # any number of lines, minimally matching - .*</\2> # the matching end tag - [ \t]* # trailing spaces/tabs - (?=\n+|\Z) # followed by a newline or end of document - ) - }{ - my $key = md5_hex($1); - $g_html_blocks{$key} = $1; - "\n\n" . $key . "\n\n"; - }egmx; - # Special case just for <hr />. It was easier to make a special case than - # to make the other regex more complicated. - $text =~ s{ - (?: - (?<=\n\n) # Starting after a blank line - | # or - \A\n? # the beginning of the doc - ) - ( # save in $1 - [ ]{0,$less_than_tab} - <(hr) # start tag = $2 - \b # word break - ([^<>])*? 
# - /?> # the matching end tag - [ \t]* - (?=\n{2,}|\Z) # followed by a blank line or end of document - ) - }{ - my $key = md5_hex($1); - $g_html_blocks{$key} = $1; - "\n\n" . $key . "\n\n"; - }egx; - - # Special case for standalone HTML comments: - $text =~ s{ - (?: - (?<=\n\n) # Starting after a blank line - | # or - \A\n? # the beginning of the doc - ) - ( # save in $1 - [ ]{0,$less_than_tab} - (?s: - <! - (--.*?--\s*)+ - > - ) - [ \t]* - (?=\n{2,}|\Z) # followed by a blank line or end of document - ) - }{ - my $key = md5_hex($1); - $g_html_blocks{$key} = $1; - "\n\n" . $key . "\n\n"; - }egx; - - - return $text; -} - - -sub _RunBlockGamut { -# -# These are all the transformations that form block-level -# tags like paragraphs, headers, and list items. -# - my $text = shift; - - $text = _DoHeaders($text); - - # Do Horizontal Rules: - $text =~ s{^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx; - $text =~ s{^[ ]{0,2}([ ]? -[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx; - $text =~ s{^[ ]{0,2}([ ]? _[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx; - - $text = _DoLists($text); - - $text = _DoCodeBlocks($text); - - $text = _DoBlockQuotes($text); - - # We already ran _HashHTMLBlocks() before, in Markdown(), but that - # was to escape raw HTML in the original Markdown source. This time, - # we're escaping the markup we've just created, so that we don't wrap - # <p> tags around block-level tags. - $text = _HashHTMLBlocks($text); - - $text = _FormParagraphs($text); - - return $text; -} - - -sub _RunSpanGamut { -# -# These are all the transformations that occur *within* block-level -# tags like paragraphs, headers, and list items. -# - my $text = shift; - - $text = _DoCodeSpans($text); - - $text = _EscapeSpecialChars($text); - - # Process anchor and image tags. Images must come first, - # because ![foo][f] looks like an anchor. 
- $text = _DoImages($text); - $text = _DoAnchors($text); - - # Make links out of things like `<http://example.com/>` - # Must come after _DoAnchors(), because you can use < and > - # delimiters in inline links like [this](<url>). - $text = _DoAutoLinks($text); - - $text = _EncodeAmpsAndAngles($text); - - $text = _DoItalicsAndBold($text); - - # Do hard breaks: - $text =~ s/ {2,}\n/ <br$g_empty_element_suffix\n/g; - - return $text; -} - - -sub _EscapeSpecialChars { - my $text = shift; - my $tokens ||= _TokenizeHTML($text); - - $text = ''; # rebuild $text from the tokens -# my $in_pre = 0; # Keep track of when we're inside <pre> or <code> tags. -# my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!; - - foreach my $cur_token (@$tokens) { - if ($cur_token->[0] eq "tag") { - # Within tags, encode * and _ so they don't conflict - # with their use in Markdown for italics and strong. - # We're replacing each such character with its - # corresponding MD5 checksum value; this is likely - # overkill, but it should prevent us from colliding - # with the escape values by accident. - $cur_token->[1] =~ s! \* !$g_escape_table{'*'}!gx; - $cur_token->[1] =~ s! _ !$g_escape_table{'_'}!gx; - $text .= $cur_token->[1]; - } else { - my $t = $cur_token->[1]; - $t = _EncodeBackslashEscapes($t); - $text .= $t; - } - } - return $text; -} - - -sub _DoAnchors { -# -# Turn Markdown link shortcuts into XHTML <a> tags. -# - my $text = shift; - - # - # First, handle reference-style links: [link text] [id] - # - $text =~ s{ - ( # wrap whole match in $1 - \[ - ($g_nested_brackets) # link text = $2 - \] - - [ ]? # one optional space - (?:\n[ ]*)? # one optional newline followed by spaces - - \[ - (.*?) # id = $3 - \] - ) - }{ - my $result; - my $whole_match = $1; - my $link_text = $2; - my $link_id = lc $3; - - if ($link_id eq "") { - $link_id = lc $link_text; # for shortcut links like [this][]. - } - - if (defined $g_urls{$link_id}) { - my $url = $g_urls{$link_id}; - $url =~ s! 
\* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid - $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold. - $result = "<a href=\"$url\""; - if ( defined $g_titles{$link_id} ) { - my $title = $g_titles{$link_id}; - $title =~ s! \* !$g_escape_table{'*'}!gx; - $title =~ s! _ !$g_escape_table{'_'}!gx; - $result .= " title=\"$title\""; - } - $result .= ">$link_text</a>"; - } - else { - $result = $whole_match; - } - $result; - }xsge; - - # - # Next, inline-style links: [link text](url "optional title") - # - $text =~ s{ - ( # wrap whole match in $1 - \[ - ($g_nested_brackets) # link text = $2 - \] - \( # literal paren - [ \t]* - <?(.*?)>? # href = $3 - [ \t]* - ( # $4 - (['"]) # quote char = $5 - (.*?) # Title = $6 - \5 # matching quote - )? # title is optional - \) - ) - }{ - my $result; - my $whole_match = $1; - my $link_text = $2; - my $url = $3; - my $title = $6; - - $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid - $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold. - $result = "<a href=\"$url\""; - - if (defined $title) { - $title =~ s/"/&quot;/g; - $title =~ s! \* !$g_escape_table{'*'}!gx; - $title =~ s! _ !$g_escape_table{'_'}!gx; - $result .= " title=\"$title\""; - } - - $result .= ">$link_text</a>"; - - $result; - }xsge; - - return $text; -} - - -sub _DoImages { -# -# Turn Markdown image shortcuts into <img> tags. -# - my $text = shift; - - # - # First, handle reference-style labeled images: ![alt text][id] - # - $text =~ s{ - ( # wrap whole match in $1 - !\[ - (.*?) # alt text = $2 - \] - - [ ]? # one optional space - (?:\n[ ]*)? # one optional newline followed by spaces - - \[ - (.*?) # id = $3 - \] - - ) - }{ - my $result; - my $whole_match = $1; - my $alt_text = $2; - my $link_id = lc $3; - - if ($link_id eq "") { - $link_id = lc $alt_text; # for shortcut links like ![this][]. 
- } - - $alt_text =~ s/"/&quot;/g; - if (defined $g_urls{$link_id}) { - my $url = $g_urls{$link_id}; - $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid - $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold. - $result = "<img src=\"$url\" alt=\"$alt_text\""; - if (defined $g_titles{$link_id}) { - my $title = $g_titles{$link_id}; - $title =~ s! \* !$g_escape_table{'*'}!gx; - $title =~ s! _ !$g_escape_table{'_'}!gx; - $result .= " title=\"$title\""; - } - $result .= $g_empty_element_suffix; - } - else { - # If there's no such link ID, leave intact: - $result = $whole_match; - } - - $result; - }xsge; - - # - # Next, handle inline images: ![alt text](url "optional title") - # Don't forget: encode * and _ - - $text =~ s{ - ( # wrap whole match in $1 - !\[ - (.*?) # alt text = $2 - \] - \( # literal paren - [ \t]* - <?(\S+?)>? # src url = $3 - [ \t]* - ( # $4 - (['"]) # quote char = $5 - (.*?) # title = $6 - \5 # matching quote - [ \t]* - )? # title is optional - \) - ) - }{ - my $result; - my $whole_match = $1; - my $alt_text = $2; - my $url = $3; - my $title = ''; - if (defined($6)) { - $title = $6; - } - - $alt_text =~ s/"/&quot;/g; - $title =~ s/"/&quot;/g; - $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid - $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold. - $result = "<img src=\"$url\" alt=\"$alt_text\""; - if (defined $title) { - $title =~ s! \* !$g_escape_table{'*'}!gx; - $title =~ s! _ !$g_escape_table{'_'}!gx; - $result .= " title=\"$title\""; - } - $result .= $g_empty_element_suffix; - - $result; - }xsge; - - return $text; -} - - -sub _DoHeaders { - my $text = shift; - - # Setext-style headers: - # Header 1 - # ======== - # - # Header 2 - # -------- - # - $text =~ s{ ^(.+)[ \t]*\n=+[ \t]*\n+ }{ - "<h1>" . _RunSpanGamut($1) . "</h1>\n\n"; - }egmx; - - $text =~ s{ ^(.+)[ \t]*\n-+[ \t]*\n+ }{ - "<h2>" . _RunSpanGamut($1) . 
"</h2>\n\n"; - }egmx; - - - # atx-style headers: - # # Header 1 - # ## Header 2 - # ## Header 2 with closing hashes ## - # ... - # ###### Header 6 - # - $text =~ s{ - ^(\#{1,6}) # $1 = string of #'s - [ \t]* - (.+?) # $2 = Header text - [ \t]* - \#* # optional closing #'s (not counted) - \n+ - }{ - my $h_level = length($1); - "<h$h_level>" . _RunSpanGamut($2) . "</h$h_level>\n\n"; - }egmx; - - return $text; -} - - -sub _DoLists { -# -# Form HTML ordered (numbered) and unordered (bulleted) lists. -# - my $text = shift; - my $less_than_tab = $g_tab_width - 1; - - # Re-usable patterns to match list item bullets and number markers: - my $marker_ul = qr/[*+-]/; - my $marker_ol = qr/\d+[.]/; - my $marker_any = qr/(?:$marker_ul|$marker_ol)/; - - # Re-usable pattern to match any entirel ul or ol list: - my $whole_list = qr{ - ( # $1 = whole list - ( # $2 - [ ]{0,$less_than_tab} - (${marker_any}) # $3 = first list item marker - [ \t]+ - ) - (?s:.+?) - ( # $4 - \z - | - \n{2,} - (?=\S) - (?! # Negative lookahead for another list item marker - [ \t]* - ${marker_any}[ \t]+ - ) - ) - ) - }mx; - - # We use a different prefix before nested lists than top-level lists. - # See extended comment in _ProcessListItems(). - # - # Note: There's a bit of duplication here. My original implementation - # created a scalar regex pattern as the conditional result of the test on - # $g_list_level, and then only ran the $text =~ s{...}{...}egmx - # substitution once, using the scalar as the pattern. This worked, - # everywhere except when running under MT on my hosting account at Pair - # Networks. There, this caused all rebuilds to be killed by the reaper (or - # perhaps they crashed, but that seems incredibly unlikely given that the - # same script on the same server ran fine *except* under MT. I've spent - # more time trying to figure out why this is happening than I'd like to - # admit. 
My only guess, backed up by the fact that this workaround works, - # is that Perl optimizes the substition when it can figure out that the - # pattern will never change, and when this optimization isn't on, we run - # afoul of the reaper. Thus, the slightly redundant code to that uses two - # static s/// patterns rather than one conditional pattern. - - if ($g_list_level) { - $text =~ s{ - ^ - $whole_list - }{ - my $list = $1; - my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol"; - # Turn double returns into triple returns, so that we can make a - # paragraph for the last item in a list, if necessary: - $list =~ s/\n{2,}/\n\n\n/g; - my $result = _ProcessListItems($list, $marker_any); - $result = "<$list_type>\n" . $result . "</$list_type>\n"; - $result; - }egmx; - } - else { - $text =~ s{ - (?:(?<=\n\n)|\A\n?) - $whole_list - }{ - my $list = $1; - my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol"; - # Turn double returns into triple returns, so that we can make a - # paragraph for the last item in a list, if necessary: - $list =~ s/\n{2,}/\n\n\n/g; - my $result = _ProcessListItems($list, $marker_any); - $result = "<$list_type>\n" . $result . "</$list_type>\n"; - $result; - }egmx; - } - - - return $text; -} - - -sub _ProcessListItems { -# -# Process the contents of a single ordered or unordered list, splitting it -# into individual list items. -# - - my $list_str = shift; - my $marker_any = shift; - - - # The $g_list_level global keeps track of when we're inside a list. - # Each time we enter a list, we increment it; when we leave a list, - # we decrement. If it's zero, we're not in a list anymore. - # - # We do this because when we're not inside a list, we want to treat - # something like this: - # - # I recommend upgrading to version - # 8. Oops, now this line is treated - # as a sub-list. - # - # As a single paragraph, despite the fact that the second line starts - # with a digit-period-space sequence. 
- # - # Whereas when we're inside a list (or sub-list), that line will be - # treated as the start of a sub-list. What a kludge, huh? This is - # an aspect of Markdown's syntax that's hard to parse perfectly - # without resorting to mind-reading. Perhaps the solution is to - # change the syntax rules such that sub-lists must start with a - # starting cardinal number; e.g. "1." or "a.". - - $g_list_level++; - - # trim trailing blank lines: - $list_str =~ s/\n{2,}\z/\n/; - - - $list_str =~ s{ - (\n)? # leading line = $1 - (^[ \t]*) # leading whitespace = $2 - ($marker_any) [ \t]+ # list marker = $3 - ((?s:.+?) # list item text = $4 - (\n{1,2})) - (?= \n* (\z | \2 ($marker_any) [ \t]+)) - }{ - my $item = $4; - my $leading_line = $1; - my $leading_space = $2; - - if ($leading_line or ($item =~ m/\n{2,}/)) { - $item = _RunBlockGamut(_Outdent($item)); - } - else { - # Recursion for sub-lists: - $item = _DoLists(_Outdent($item)); - chomp $item; - $item = _RunSpanGamut($item); - } - - "<li>" . $item . "</li>\n"; - }egmx; - - $g_list_level--; - return $list_str; -} - - - -sub _DoCodeBlocks { -# -# Process Markdown `<pre><code>` blocks. -# - - my $text = shift; - - $text =~ s{ - (?:\n\n|\A) - ( # $1 = the code block -- one or more lines, starting with a space/tab - (?: - (?:[ ]{$g_tab_width} | \t) # Lines must start with a tab or a tab-width of spaces - .*\n+ - )+ - ) - ((?=^[ ]{0,$g_tab_width}\S)|\Z) # Lookahead for non-space at line-start, or end of doc - }{ - my $codeblock = $1; - my $result; # return value - - $codeblock = _EncodeCode(_Outdent($codeblock)); - $codeblock = _Detab($codeblock); - $codeblock =~ s/\A\n+//; # trim leading newlines - $codeblock =~ s/\s+\z//; # trim trailing whitespace - - $result = "\n\n<pre><code>" . $codeblock . "\n</code></pre>\n\n"; - - $result; - }egmx; - - return $text; -} - - -sub _DoCodeSpans { -# -# * Backtick quotes are used for <code></code> spans. 
-# -# * You can use multiple backticks as the delimiters if you want to -# include literal backticks in the code span. So, this input: -# -# Just type ``foo `bar` baz`` at the prompt. -# -# Will translate to: -# -# <p>Just type <code>foo `bar` baz</code> at the prompt.</p> -# -# There's no arbitrary limit to the number of backticks you -# can use as delimters. If you need three consecutive backticks -# in your code, use four for delimiters, etc. -# -# * You can use spaces to get literal backticks at the edges: -# -# ... type `` `bar` `` ... -# -# Turns to: -# -# ... type <code>`bar`</code> ... -# - - my $text = shift; - - $text =~ s@ - (`+) # $1 = Opening run of ` - (.+?) # $2 = The code block - (?<!`) - \1 # Matching closer - (?!`) - @ - my $c = "$2"; - $c =~ s/^[ \t]*//g; # leading whitespace - $c =~ s/[ \t]*$//g; # trailing whitespace - $c = _EncodeCode($c); - "<code>$c</code>"; - @egsx; - - return $text; -} - - -sub _EncodeCode { -# -# Encode/escape certain characters inside Markdown code runs. -# The point is that in code, these characters are literals, -# and lose their special Markdown meanings. -# - local $_ = shift; - - # Encode all ampersands; HTML entities are not - # entities within a Markdown code span. - s/&/&amp;/g; - - # Encode $'s, but only if we're running under Blosxom. - # (Blosxom interpolates Perl variables in article bodies.) - { - no warnings 'once'; - if (defined($blosxom::version)) { - s/\$/&#036;/g; - } - } - - - # Do the angle bracket song and dance: - s! < !&lt;!gx; - s! > !&gt;!gx; - - # Now, escape characters that are magic in Markdown: - s! \* !$g_escape_table{'*'}!gx; - s! _ !$g_escape_table{'_'}!gx; - s! { !$g_escape_table{'{'}!gx; - s! } !$g_escape_table{'}'}!gx; - s! \[ !$g_escape_table{'['}!gx; - s! \] !$g_escape_table{']'}!gx; - s! 
\\ !$g_escape_table{'\\'}!gx; - - return $_; -} - - -sub _DoItalicsAndBold { - my $text = shift; - - # <strong> must go first: - $text =~ s{ (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 } - {<strong>$2</strong>}gsx; - - $text =~ s{ (\*|_) (?=\S) (.+?) (?<=\S) \1 } - {<em>$2</em>}gsx; - - return $text; -} - - -sub _DoBlockQuotes { - my $text = shift; - - $text =~ s{ - ( # Wrap whole match in $1 - ( - ^[ \t]*>[ \t]? # '>' at the start of a line - .+\n # rest of the first line - (.+\n)* # subsequent consecutive lines - \n* # blanks - )+ - ) - }{ - my $bq = $1; - $bq =~ s/^[ \t]*>[ \t]?//gm; # trim one level of quoting - $bq =~ s/^[ \t]+$//mg; # trim whitespace-only lines - $bq = _RunBlockGamut($bq); # recurse - - $bq =~ s/^/ /g; - # These leading spaces screw with <pre> content, so we need to fix that: - $bq =~ s{ - (\s*<pre>.+?</pre>) - }{ - my $pre = $1; - $pre =~ s/^ //mg; - $pre; - }egsx; - - "<blockquote>\n$bq\n</blockquote>\n\n"; - }egmx; - - - return $text; -} - - -sub _FormParagraphs { -# -# Params: -# $text - string to process with html <p> tags -# - my $text = shift; - - # Strip leading and trailing lines: - $text =~ s/\A\n+//; - $text =~ s/\n+\z//; - - my @grafs = split(/\n{2,}/, $text); - - # - # Wrap <p> tags. - # - foreach (@grafs) { - unless (defined( $g_html_blocks{$_} )) { - $_ = _RunSpanGamut($_); - s/^([ \t]*)/<p>/; - $_ .= "</p>"; - } - } - - # - # Unhashify HTML blocks - # - foreach (@grafs) { - if (defined( $g_html_blocks{$_} )) { - $_ = $g_html_blocks{$_}; - } - } - - return join "\n\n", @grafs; -} - - -sub _EncodeAmpsAndAngles { -# Smart processing for ampersands and angle brackets that need to be encoded. - - my $text = shift; - - # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin: - # http://bumppo.net/projects/amputator/ - $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&/g; - - # Encode naked <'s - $text =~ s{<(?![a-z/?\$!])}{<}gi; - - return $text; -} - - -sub _EncodeBackslashEscapes { -# -# Parameter: String. 
-# Returns: The string, with after processing the following backslash -# escape sequences. -# - local $_ = shift; - - s! \\\\ !$g_escape_table{'\\'}!gx; # Must process escaped backslashes first. - s! \\` !$g_escape_table{'`'}!gx; - s! \\\* !$g_escape_table{'*'}!gx; - s! \\_ !$g_escape_table{'_'}!gx; - s! \\\{ !$g_escape_table{'{'}!gx; - s! \\\} !$g_escape_table{'}'}!gx; - s! \\\[ !$g_escape_table{'['}!gx; - s! \\\] !$g_escape_table{']'}!gx; - s! \\\( !$g_escape_table{'('}!gx; - s! \\\) !$g_escape_table{')'}!gx; - s! \\> !$g_escape_table{'>'}!gx; - s! \\\# !$g_escape_table{'#'}!gx; - s! \\\+ !$g_escape_table{'+'}!gx; - s! \\\- !$g_escape_table{'-'}!gx; - s! \\\. !$g_escape_table{'.'}!gx; - s{ \\! }{$g_escape_table{'!'}}gx; - - return $_; -} - - -sub _DoAutoLinks { - my $text = shift; - - $text =~ s{<((https?|ftp):[^'">\s]+)>}{<a href="$1">$1</a>}gi; - - # Email addresses: <address@domain.foo> - $text =~ s{ - < - (?:mailto:)? - ( - [-.\w]+ - \@ - [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+ - ) - > - }{ - _EncodeEmailAddress( _UnescapeSpecialChars($1) ); - }egix; - - return $text; -} - - -sub _EncodeEmailAddress { -# -# Input: an email address, e.g. "foo@example.com" -# -# Output: the email address as a mailto link, with each character -# of the address encoded as either a decimal or hex entity, in -# the hopes of foiling most address harvesting spam bots. E.g.: -# -# <a href="mailto:foo@e -# xample.com">foo -# @example.com</a> -# -# Based on a filter by Matthew Wickline, posted to the BBEdit-Talk -# mailing list: <http://tinyurl.com/yu7ue> -# - - my $addr = shift; - - srand; - my @encode = ( - sub { '&#' . ord(shift) . ';' }, - sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' }, - sub { shift }, - ); - - $addr = "mailto:" . $addr; - - $addr =~ s{(.)}{ - my $char = $1; - if ( $char eq '@' ) { - # this *must* be encoded. I insist. 
- $char = $encode[int rand 1]->($char); - } elsif ( $char ne ':' ) { - # leave ':' alone (to spot mailto: later) - my $r = rand; - # roughly 10% raw, 45% hex, 45% dec - $char = ( - $r > .9 ? $encode[2]->($char) : - $r < .45 ? $encode[1]->($char) : - $encode[0]->($char) - ); - } - $char; - }gex; - - $addr = qq{<a href="$addr">$addr</a>}; - $addr =~ s{">.+?:}{">}; # strip the mailto: from the visible part - - return $addr; -} - - -sub _UnescapeSpecialChars { -# -# Swap back in all the special characters we've hidden. -# - my $text = shift; - - while( my($char, $hash) = each(%g_escape_table) ) { - $text =~ s/$hash/$char/g; - } - return $text; -} - - -sub _TokenizeHTML { -# -# Parameter: String containing HTML markup. -# Returns: Reference to an array of the tokens comprising the input -# string. Each token is either a tag (possibly with nested, -# tags contained therein, such as <a href="<MTFoo>">, or a -# run of text between tags. Each element of the array is a -# two-element array; the first is either 'tag' or 'text'; -# the second is the actual value. -# -# -# Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin. -# <http://www.bradchoate.com/past/mtregex.php> -# - - my $str = shift; - my $pos = 0; - my $len = length $str; - my @tokens; - - my $depth = 6; - my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x $depth); - my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) | # comment - (?s: <\? .*? 
\?> ) | # processing instruction - $nested_tags/ix; # nested tags - - while ($str =~ m/($match)/g) { - my $whole_tag = $1; - my $sec_start = pos $str; - my $tag_start = $sec_start - length $whole_tag; - if ($pos < $tag_start) { - push @tokens, ['text', substr($str, $pos, $tag_start - $pos)]; - } - push @tokens, ['tag', $whole_tag]; - $pos = pos $str; - } - push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len; - \@tokens; -} - - -sub _Outdent { -# -# Remove one level of line-leading tabs or spaces -# - my $text = shift; - - $text =~ s/^(\t|[ ]{1,$g_tab_width})//gm; - return $text; -} - - -sub _Detab { -# -# Cribbed from a post by Bart Lateur: -# <http://www.nntp.perl.org/group/perl.macperl.anyperl/154> -# - my $text = shift; - - $text =~ s{(.*?)\t}{$1.(' ' x ($g_tab_width - length($1) % $g_tab_width))}ge; - return $text; -} - - -1; - -__END__ - - -=pod - -=head1 NAME - -B<Markdown> - - -=head1 SYNOPSIS - -B<Markdown.pl> [ B<--html4tags> ] [ B<--version> ] [ B<-shortversion> ] - [ I<file> ... ] - - -=head1 DESCRIPTION - -Markdown is a text-to-HTML filter; it translates an easy-to-read / -easy-to-write structured text format into HTML. Markdown's text format -is most similar to that of plain text email, and supports features such -as headers, *emphasis*, code blocks, blockquotes, and links. - -Markdown's syntax is designed not as a generic markup language, but -specifically to serve as a front-end to (X)HTML. You can use span-level -HTML tags anywhere in a Markdown document, and you can use block level -HTML tags (like <div> and <table> as well). - -For more information about Markdown's syntax, see: - - http://daringfireball.net/projects/markdown/ - - -=head1 OPTIONS - -Use "--" to end switch parsing. 
For example, to open a file named "-z", use: - - Markdown.pl -- -z - -=over 4 - - -=item B<--html4tags> - -Use HTML 4 style for empty element tags, e.g.: - - <br> - -instead of Markdown's default XHTML style tags, e.g.: - - <br /> - - -=item B<-v>, B<--version> - -Display Markdown's version number and copyright information. - - -=item B<-s>, B<--shortversion> - -Display the short-form version number. - - -=back - - - -=head1 BUGS - -To file bug reports or feature requests (other than topics listed in the -Caveats section above) please send email to: - - support@daringfireball.net - -Please include with your report: (1) the example input; (2) the output -you expected; (3) the output Markdown actually produced. - - -=head1 VERSION HISTORY - -See the readme file for detailed release notes for this version. - -1.0.1 - 14 Dec 2004 - -1.0 - 28 Aug 2004 - - -=head1 AUTHOR - - John Gruber - http://daringfireball.net - - PHP port and other contributions by Michel Fortin - http://michelf.com - - -=head1 COPYRIGHT AND LICENSE - -Copyright (c) 2003-2004 John Gruber -<http://daringfireball.net/> -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - -* Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - -* Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - -* Neither the name "Markdown" nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. 
- -This software is provided by the copyright holders and contributors "as -is" and any express or implied warranties, including, but not limited -to, the implied warranties of merchantability and fitness for a -particular purpose are disclaimed. In no event shall the copyright owner -or contributors be liable for any direct, indirect, incidental, special, -exemplary, or consequential damages (including, but not limited to, -procurement of substitute goods or services; loss of use, data, or -profits; or business interruption) however caused and on any theory of -liability, whether in contract, strict liability, or tort (including -negligence or otherwise) arising in any way out of the use of this -software, even if advised of the possibility of such damage. - -=cut |