var Node = require('./node'); var fromCodePoint = require('./from-code-point.js'); var entityToChar = require('./html5-entities.js').entityToChar; // Constants for character codes: var C_NEWLINE = 10; var C_SPACE = 32; var C_ASTERISK = 42; var C_UNDERSCORE = 95; var C_BACKTICK = 96; var C_OPEN_BRACKET = 91; var C_CLOSE_BRACKET = 93; var C_LESSTHAN = 60; var C_BANG = 33; var C_BACKSLASH = 92; var C_AMPERSAND = 38; var C_OPEN_PAREN = 40; var C_COLON = 58; // Some regexps used in inline parser: var ESCAPABLE = '[!"#$%&\'()*+,./:;<=>?@[\\\\\\]^_`{|}~-]'; var ESCAPED_CHAR = '\\\\' + ESCAPABLE; var REG_CHAR = '[^\\\\()\\x00-\\x20]'; var IN_PARENS_NOSP = '\\((' + REG_CHAR + '|' + ESCAPED_CHAR + ')*\\)'; var TAGNAME = '[A-Za-z][A-Za-z0-9]*'; var ATTRIBUTENAME = '[a-zA-Z_:][a-zA-Z0-9:._-]*'; var UNQUOTEDVALUE = "[^\"'=<>`\\x00-\\x20]+"; var SINGLEQUOTEDVALUE = "'[^']*'"; var DOUBLEQUOTEDVALUE = '"[^"]*"'; var ATTRIBUTEVALUE = "(?:" + UNQUOTEDVALUE + "|" + SINGLEQUOTEDVALUE + "|" + DOUBLEQUOTEDVALUE + ")"; var ATTRIBUTEVALUESPEC = "(?:" + "\\s*=" + "\\s*" + ATTRIBUTEVALUE + ")"; var ATTRIBUTE = "(?:" + "\\s+" + ATTRIBUTENAME + ATTRIBUTEVALUESPEC + "?)"; var OPENTAG = "<" + TAGNAME + ATTRIBUTE + "*" + "\\s*/?>"; var CLOSETAG = "]"; var HTMLCOMMENT = "|"; var PROCESSINGINSTRUCTION = "[<][?].*?[?][>]"; var DECLARATION = "]*>"; var CDATA = ""; var HTMLTAG = "(?:" + OPENTAG + "|" + CLOSETAG + "|" + HTMLCOMMENT + "|" + PROCESSINGINSTRUCTION + "|" + DECLARATION + "|" + CDATA + ")"; var ENTITY = "&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});"; var rePunctuation = new RegExp(/^[\u2000-\u206F\u2E00-\u2E7F\\'!"#\$%&\(\)\*\+,\-\.\/:;<=>\?@\[\]\^_`\{\|\}~]/); var reHtmlTag = new RegExp('^' + HTMLTAG, 'i'); var reLinkTitle = new RegExp( '^(?:"(' + ESCAPED_CHAR + '|[^"\\x00])*"' + '|' + '\'(' + ESCAPED_CHAR + '|[^\'\\x00])*\'' + '|' + '\\((' + ESCAPED_CHAR + '|[^)\\x00])*\\))'); var reLinkDestinationBraces = new RegExp( '^(?:[<](?:[^<>\\n\\\\\\x00]' + '|' + ESCAPED_CHAR + '|' + 
'\\\\)*[>])'); var reLinkDestination = new RegExp( '^(?:' + REG_CHAR + '+|' + ESCAPED_CHAR + '|' + IN_PARENS_NOSP + ')*'); var reEscapable = new RegExp(ESCAPABLE); var reAllEscapedChar = new RegExp('\\\\(' + ESCAPABLE + ')', 'g'); var reEntityHere = new RegExp('^' + ENTITY, 'i'); var reEntity = new RegExp(ENTITY, 'gi'); // Matches a string of non-special characters. var reMain = /^[^\n`\[\]\\!<&*_]+/m; // Replace entities and backslash escapes with literal characters. var unescapeString = function(s) { "use strict"; return s.replace(reAllEscapedChar, '$1') .replace(reEntity, entityToChar); }; // Normalize reference label: collapse internal whitespace // to single space, remove leading/trailing whitespace, case fold. var normalizeReference = function(s) { "use strict"; return s.trim() .replace(/\s+/, ' ') .toUpperCase(); }; var text = function(s) { "use strict"; var node = new Node('Text'); node.literal = s; return node; }; // INLINE PARSER // These are methods of an InlineParser object, defined below. // An InlineParser keeps track of a subject (a string to be // parsed) and a position in that subject. // If re matches at current position in the subject, advance // position in subject and return the match; otherwise return null. var match = function(re) { "use strict"; var m = re.exec(this.subject.slice(this.pos)); if (m) { this.pos += m.index + m[0].length; return m[0]; } else { return null; } }; // Returns the code for the character at the current subject position, or -1 // there are no more characters. var peek = function() { "use strict"; if (this.pos < this.subject.length) { return this.subject.charCodeAt(this.pos); } else { return -1; } }; // Parse zero or more space characters, including at most one newline var spnl = function() { "use strict"; this.match(/^ *(?:\n *)?/); return 1; }; // All of the parsers below try to match something at the current position // in the subject. 
// If they succeed in matching anything, they
// return the inline matched, advancing the subject.

// Attempt to parse backticks, adding either a backtick code span or a
// literal sequence of backticks.
// Returns 0 when the subject does not start with a backtick here, and true
// once a node has been appended to block.
var parseBackticks = function(block) {
    "use strict";
    var opening = this.match(/^`+/);
    if (!opening) {
        return 0;
    }

    var contentStart = this.pos;
    var run = this.match(/`+/m);
    while (run) {
        if (run === opening) {
            // A closing run of the same length: everything in between is
            // the code, with runs of spaces/newlines collapsed.
            var raw = this.subject.slice(contentStart, this.pos - opening.length);
            var code = new Node('Code');
            code.literal = raw.replace(/[ \n]+/g, ' ').trim();
            block.appendChild(code);
            return true;
        }
        run = this.match(/`+/m);
    }

    // No closing backtick sequence was found: rewind to just after the
    // opening run and emit the backticks literally.
    this.pos = contentStart;
    block.appendChild(text(opening));
    return true;
};

// Parse a backslash-escaped special character, adding either the escaped
// character, a hard line break (if the backslash is followed by a newline),
// or a literal backslash to the block's children.
// Returns false when the current character is not a backslash.
var parseBackslash = function(block) {
    "use strict";
    var start = this.pos;
    if (this.subject.charCodeAt(start) !== C_BACKSLASH) {
        return false;
    }

    var following = this.subject.charAt(start + 1);
    if (following === '\n') {
        this.pos = start + 2;
        block.appendChild(new Node('Hardbreak'));
    } else if (reEscapable.test(following)) {
        this.pos = start + 2;
        block.appendChild(text(following));
    } else {
        this.pos = start + 1;
        block.appendChild(text('\\'));
    }
    return true;
};

// Attempt to parse an autolink (URL or email in pointy brackets).
// Matches an email address in pointy brackets at the current position.
var reEmailAutolink = /^<([a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)>/;

// Matches "<scheme:...>" for the recognized URI schemes (case-insensitive).
// Dots inside scheme names are escaped so that they only match a literal
// "."; an unescaped dot would accept any character in that position
// (e.g. "irisXbeep").
var reAutolink = /^<(?:coap|doi|javascript|aaa|aaas|about|acap|cap|cid|crid|data|dav|dict|dns|file|ftp|geo|go|gopher|h323|http|https|iax|icap|im|imap|info|ipp|iris|iris\.beep|iris\.xpc|iris\.xpcs|iris\.lwz|ldap|mailto|mid|msrp|msrps|mtqp|mupdate|news|nfs|ni|nih|nntp|opaquelocktoken|pop|pres|rtsp|service|session|shttp|sieve|sip|sips|sms|snmp|soap\.beep|soap\.beeps|tag|tel|telnet|tftp|thismessage|tn3270|tip|tv|urn|vemmi|ws|wss|xcon|xcon-userid|xmlrpc\.beep|xmlrpc\.beeps|xmpp|z39\.50r|z39\.50s|adiumxtra|afp|afs|aim|apt|attachment|aw|beshare|bitcoin|bolo|callto|chrome|chrome-extension|com-eventbrite-attendee|content|cvs|dlna-playsingle|dlna-playcontainer|dtn|dvb|ed2k|facetime|feed|finger|fish|gg|git|gizmoproject|gtalk|hcp|icon|ipn|irc|irc6|ircs|itms|jar|jms|keyparc|lastfm|ldaps|magnet|maps|market|message|mms|ms-help|msnim|mumble|mvn|notes|oid|palm|paparazzi|platform|proxy|psyc|query|res|resource|rmi|rsync|rtmp|secondlife|sftp|sgn|skype|smb|soldat|spotify|ssh|steam|svn|teamspeak|things|udp|unreal|ut2004|ventrilo|view-source|webcal|wtai|wyciwyg|xfire|xri|ymsgr):[^<>\x00-\x20]*>/i;

// On success a Link node is appended to block and true is returned.
// The link's single Text child holds the text between the pointy brackets;
// email destinations are prefixed with "mailto:". Returns false, without
// advancing, when neither form matches.
var parseAutolink = function(block) {
    "use strict";
    var m;
    var dest;
    var node;
    if ((m = this.match(reEmailAutolink))) {  // email autolink
        dest = m.slice(1, -1);
        node = new Node('Link');
        node.destination = 'mailto:' + encodeURI(unescape(dest));
        node.appendChild(text(dest));
        block.appendChild(node);
        return true;
    } else if ((m = this.match(reAutolink))) {  // URI autolink
        dest = m.slice(1, -1);
        node = new Node('Link');
        node.destination = encodeURI(unescape(dest));
        node.appendChild(text(dest));
        block.appendChild(node);
        return true;
    } else {
        return false;
    }
};

// Attempt to parse a raw HTML tag.
// On a match at the current position, an Html node whose literal is the
// matched tag text is appended to block.
var parseHtmlTag = function(block) {
    "use strict";
    var tag = this.match(reHtmlTag);
    if (!tag) {
        return false;
    }
    var html = new Node('Html');
    html.literal = tag;
    block.appendChild(html);
    return true;
};

// Scan a sequence of characters with code cc, and return information about
// the number of delimiters and whether they are positioned such that
// they can open and/or close emphasis or strong emphasis.  A utility
// function for strong/emph parsing.
// The subject position is left where it was on entry.
var scanDelims = function(cc) {
    "use strict";
    var origin = this.pos;

    // The start and the end of the subject behave like a newline.
    var before = origin === 0 ? '\n' : this.subject.charAt(origin - 1);

    var count = 0;
    while (this.peek() === cc) {
        count++;
        this.pos++;
    }

    var nextCode = this.peek();
    var after = nextCode === -1 ? '\n' : fromCodePoint(nextCode);
    this.pos = origin;

    var spaceBefore = /\s/.test(before);
    var spaceAfter = /\s/.test(after);
    var punctBefore = rePunctuation.test(before);
    var punctAfter = rePunctuation.test(after);

    var hasDelims = count > 0;
    var canOpen = hasDelims && !spaceAfter &&
            (!punctAfter || spaceBefore || punctBefore);
    var canClose = hasDelims && !spaceBefore &&
            (!punctBefore || spaceAfter || punctAfter);

    if (cc === C_UNDERSCORE) {
        // Underscores additionally may not open after, or close before,
        // an ASCII letter or digit.
        var alnum = /[a-z0-9]/i;
        canOpen = canOpen && !alnum.test(before);
        canClose = canClose && !alnum.test(after);
    }

    return {
        numdelims: count,
        can_open: canOpen,
        can_close: canClose
    };
};

// Attempt to parse emphasis or strong emphasis.
// The delimiter run is always emitted as a Text node first and recorded on
// the delimiter stack; processEmphasis later turns matching runs into
// Emph/Strong nodes. Returns false when there is no run of cc here.
var parseEmphasis = function(cc, block) {
    "use strict";
    var scan = this.scanDelims(cc);
    var runLength = scan.numdelims;
    var runStart = this.pos;

    if (runLength === 0) {
        return false;
    }

    this.pos = runStart + runLength;
    var runNode = text(this.subject.slice(runStart, this.pos));
    block.appendChild(runNode);

    // Push an entry for this run on top of the delimiter stack.
    var below = this.delimiters;
    var entry = {
        cc: cc,
        numdelims: runLength,
        node: runNode,
        previous: below,
        next: null,
        can_open: scan.can_open,
        can_close: scan.can_close,
        active: true
    };
    if (below !== null) {
        below.next = entry;
    }
    this.delimiters = entry;

    return true;
};

// Unlink delim from the doubly linked delimiter stack, moving the stack
// top (this.delimiters) down when delim was the top entry.
var removeDelimiter = function(delim) {
    "use strict";
    var below = delim.previous;
    var above = delim.next;
    if (below !== null) {
        below.next = above;
    }
    if (above === null) {
        // top of stack
        this.delimiters = below;
    } else {
        above.previous = below;
    }
};

// Resolve emphasis for every delimiter above stack_bottom: each closer is
// paired with the nearest earlier opener of the same character, the nodes
// between them are wrapped in an Emph or Strong node, and finally all
// delimiters above stack_bottom are removed from the stack.
var processEmphasis = function(block, stack_bottom) {
    "use strict";
    var opener;
    var used;
    var openerText, closerText;
    var wrapper;
    var child, following;
    var between, lower, above;

    // find first closer above stack_bottom:
    var closer = this.delimiters;
    while (closer !== null && closer.previous !== stack_bottom) {
        closer = closer.previous;
    }

    // move forward, looking for closers, and handling each
    while (closer !== null) {
        if (!(closer.can_close &&
              (closer.cc === C_UNDERSCORE || closer.cc === C_ASTERISK))) {
            closer = closer.next;
            continue;
        }

        // found emphasis closer. now look back for first matching opener:
        opener = closer.previous;
        while (opener !== null && opener !== stack_bottom &&
               !(opener.cc === closer.cc && opener.can_open)) {
            opener = opener.previous;
        }
        if (opener === null || opener === stack_bottom) {
            closer = closer.next;
            continue;
        }

        // calculate actual number of delimiters used from this closer
        if (closer.numdelims < 3 || opener.numdelims < 3) {
            used = Math.min(closer.numdelims, opener.numdelims);
        } else {
            used = closer.numdelims % 2 === 0 ? 2 : 1;
        }

        openerText = opener.node;
        closerText = closer.node;

        // remove used delimiters from stack elts and inlines
        opener.numdelims -= used;
        closer.numdelims -= used;
        openerText.literal =
            openerText.literal.slice(0, openerText.literal.length - used);
        closerText.literal =
            closerText.literal.slice(0, closerText.literal.length - used);

        // build contents for new emph element
        wrapper = new Node(used === 1 ? 'Emph' : 'Strong');
        child = openerText.next;
        while (child && child !== closerText) {
            following = child.next;
            child.unlink();
            wrapper.appendChild(child);
            child = following;
        }
        openerText.insertAfter(wrapper);

        // remove elts btw opener and closer in delimiters stack
        between = closer.previous;
        while (between !== null && between !== opener) {
            lower = between.previous;
            this.removeDelimiter(between);
            between = lower;
        }

        // if opener has 0 delims, remove it and the inline
        if (opener.numdelims === 0) {
            openerText.unlink();
            this.removeDelimiter(opener);
        }

        // A closer with delimiters left over is tried again against an
        // earlier opener; an exhausted one is dropped and we move on.
        if (closer.numdelims === 0) {
            closerText.unlink();
            above = closer.next;
            this.removeDelimiter(closer);
            closer = above;
        }
    }

    // remove all delimiters
    while (this.delimiters !== stack_bottom) {
        this.removeDelimiter(this.delimiters);
    }
};

// Attempt to parse link title (sans quotes), returning the string
// or null if no match.
var parseLinkTitle = function() {
    "use strict";
    var quoted = this.match(reLinkTitle);
    if (!quoted) {
        return null;
    }
    // chop off quotes from title and unescape:
    return unescapeString(quoted.slice(1, quoted.length - 1));
};

// Attempt to parse link destination, returning the string or
// null if no match.
// A destination wrapped in <...> is tried first, then a bare destination.
// The matched text is unescaped and URI-encoded before being returned.
var parseLinkDestination = function() {
    "use strict";
    var raw;
    var bracketed = this.match(reLinkDestinationBraces);
    if (bracketed) {
        // chop off surrounding <..>:
        raw = bracketed.slice(1, bracketed.length - 1);
    } else {
        raw = this.match(reLinkDestination);
        if (raw === null) {
            return null;
        }
    }
    return encodeURI(unescape(unescapeString(raw)));
};

// Attempt to parse a link label, returning number of characters parsed.
// The count includes the surrounding brackets; 0 means no label here.
var parseLinkLabel = function() {
    "use strict";
    var label = this.match(/^\[(?:[^\\\[\]]|\\[\[\]]){0,1000}\]/);
    if (label === null) {
        return 0;
    }
    return label.length;
};

// Add open bracket to delimiter stack and add a text node to block's children.
var parseOpenBracket = function(block) {
    "use strict";
    var bracketPos = this.pos;
    this.pos = bracketPos + 1;

    var bracketNode = text('[');
    block.appendChild(bracketNode);

    // Push an entry for this opener on the delimiter stack; index records
    // where the '[' sits in the subject.
    var below = this.delimiters;
    var entry = {
        cc: C_OPEN_BRACKET,
        numdelims: 1,
        node: bracketNode,
        previous: below,
        next: null,
        can_open: true,
        can_close: false,
        index: bracketPos,
        active: true
    };
    if (below !== null) {
        below.next = entry;
    }
    this.delimiters = entry;

    return true;
};

// If the next character is [, add a ! delimiter to the delimiter stack and
// add a text node to block's children.  Otherwise just add a text node.
var parseBang = function(block) {
    "use strict";
    var bangPos = this.pos;
    this.pos = bangPos + 1;

    if (this.peek() !== C_OPEN_BRACKET) {
        block.appendChild(text('!'));
        return true;
    }

    this.pos += 1;
    var imageOpener = text('![');
    block.appendChild(imageOpener);

    // Push an entry for this opener on the delimiter stack; index points
    // at the '[' (one past the '!').
    var below = this.delimiters;
    var entry = {
        cc: C_BANG,
        numdelims: 1,
        node: imageOpener,
        previous: below,
        next: null,
        can_open: true,
        can_close: false,
        index: bangPos + 1,
        active: true
    };
    if (below !== null) {
        below.next = entry;
    }
    this.delimiters = entry;

    return true;
};

// Try to match close bracket against an opening in the delimiter
// stack.  Add either a link or image, or a plain [ character,
// to block's children.
// If there is a matching delimiter,
// remove it from the delimiter stack.
var parseCloseBracket = function(block) {
    "use strict";
    var dest;
    var title;
    var matched = false;

    this.pos += 1;
    var afterBracket = this.pos;

    // look through stack of delimiters for a [ or ![
    var opener = this.delimiters;
    while (opener !== null &&
           opener.cc !== C_OPEN_BRACKET && opener.cc !== C_BANG) {
        opener = opener.previous;
    }

    if (opener === null) {
        // no matched opener, just return a literal
        block.appendChild(text(']'));
        return true;
    }

    if (!opener.active) {
        // no matched opener, just return a literal
        block.appendChild(text(']'));
        // take opener off emphasis stack
        this.removeDelimiter(opener);
        return true;
    }

    // If we got here, opener is a potential opener
    var is_image = opener.cc === C_BANG;

    // Check to see if we have a link/image

    if (this.peek() === C_OPEN_PAREN) {
        // Inline link?
        this.pos++;
        if (this.spnl()) {
            dest = this.parseLinkDestination();
            if (dest !== null && this.spnl()) {
                // make sure there's a space before the title:
                if (/^\s/.test(this.subject.charAt(this.pos - 1))) {
                    title = this.parseLinkTitle() || '';
                }
                if (this.spnl() && this.match(/^\)/)) {
                    matched = true;
                }
            }
        }
    } else {
        // Next, see if there's a link label
        var beforeSpaces = this.pos;
        this.spnl();
        var labelStart = this.pos;
        var labelLength = this.parseLinkLabel();
        var reflabel;
        if (labelLength === 0 || labelLength === 2) {
            // empty or missing second label: the bracketed text itself
            // serves as the label
            reflabel = this.subject.slice(opener.index, afterBracket);
        } else {
            reflabel = this.subject.slice(labelStart, labelStart + labelLength);
        }
        if (labelLength === 0) {
            // If shortcut reference link, rewind before spaces we skipped.
            this.pos = beforeSpaces;
        }

        // lookup rawlabel in refmap
        var link = this.refmap[normalizeReference(reflabel)];
        if (link) {
            dest = link.destination;
            title = link.title;
            matched = true;
        }
    }

    if (!matched) {
        // no match
        this.removeDelimiter(opener);  // remove this opener from stack
        this.pos = afterBracket;
        block.appendChild(text(']'));
        return true;
    }

    var linkNode = new Node(is_image ? 'Image' : 'Link');
    linkNode.destination = dest;
    linkNode.title = title;

    // Everything after the opener's text node becomes the link's content.
    var child = opener.node.next;
    var following;
    while (child) {
        following = child.next;
        child.unlink();
        linkNode.appendChild(child);
        child = following;
    }
    block.appendChild(linkNode);
    this.processEmphasis(linkNode, opener.previous);

    opener.node.unlink();

    // processEmphasis will remove this and later delimiters.
    // Now, for a link, we also deactivate earlier link openers.
    // (no links in links)
    if (!is_image) {
        var earlier = this.delimiters;
        while (earlier !== null) {
            if (earlier.cc === C_OPEN_BRACKET) {
                earlier.active = false;  // deactivate this opener
            }
            earlier = earlier.previous;
        }
    }

    return true;
};

// Attempt to parse an entity; on success a Text node holding the result of
// entityToChar for the matched text is appended to block.
var parseEntity = function(block) {
    "use strict";
    var entity = this.match(reEntityHere);
    if (!entity) {
        return false;
    }
    block.appendChild(text(entityToChar(entity)));
    return true;
};

// Parse a run of ordinary characters, or a single character with
// a special meaning in markdown, as a plain string.
var parseString = function(block) {
    "use strict";
    var plain = this.match(reMain);
    if (!plain) {
        return false;
    }
    block.appendChild(text(plain));
    return true;
};

// Parse a newline.  If it was preceded by two spaces, return a hard
// line break; otherwise a soft line break.
var parseNewline = function(block) {
    "use strict";
    this.pos += 1;  // assume we're at a \n

    // check previous node for trailing spaces
    var breakType = 'Softbreak';
    var previous = block.lastChild;
    if (previous && previous.t === 'Text') {
        var trailing = / *$/.exec(previous.literal)[0].length;
        if (trailing > 0) {
            previous.literal = previous.literal.replace(/ *$/, '');
        }
        if (trailing >= 2) {
            breakType = 'Hardbreak';
        }
    }
    block.appendChild(new Node(breakType));

    this.match(/^ */);  // gobble leading spaces in next line
    return true;
};

// Attempt to parse a link reference, modifying refmap.
// s becomes the parser's subject. Returns the number of characters
// consumed by the reference definition, or 0 (with the position reset)
// when s does not start with one. An existing refmap entry for the same
// normalized label is never overwritten.
var parseReference = function(s, refmap) {
    "use strict";
    this.subject = s;
    this.pos = 0;
    var origin = this.pos;

    // label:
    var labelLength = this.parseLinkLabel();
    if (labelLength === 0) {
        return 0;
    }
    var rawlabel = this.subject.substr(0, labelLength);

    // colon:
    if (this.peek() !== C_COLON) {
        this.pos = origin;
        return 0;
    }
    this.pos++;

    //  link url
    this.spnl();

    var dest = this.parseLinkDestination();
    if (dest === null || dest.length === 0) {
        this.pos = origin;
        return 0;
    }

    var afterDest = this.pos;
    this.spnl();
    var title = this.parseLinkTitle();
    if (title === null) {
        title = '';
        // rewind before spaces
        this.pos = afterDest;
    }

    // make sure we're at line end:
    if (this.match(/^ *(?:\n|$)/) === null) {
        this.pos = origin;
        return 0;
    }

    var key = normalizeReference(rawlabel);
    if (!refmap[key]) {
        refmap[key] = { destination: dest, title: title };
    }

    return this.pos - origin;
};

// Parse the next inline element in subject, advancing subject position.
// On success, add the result to block's children and return true.
// On failure, return false.
// The character at the current position selects the specialised parser.
// When that parser declines, the character itself is consumed and added
// as a Text node, so false is only returned at the end of the subject.
var parseInline = function(block) {
    "use strict";
    var c = this.peek();
    if (c === -1) {
        return false;
    }

    var handled;
    if (c === C_NEWLINE) {
        handled = this.parseNewline(block);
    } else if (c === C_BACKSLASH) {
        handled = this.parseBackslash(block);
    } else if (c === C_BACKTICK) {
        handled = this.parseBackticks(block);
    } else if (c === C_ASTERISK || c === C_UNDERSCORE) {
        handled = this.parseEmphasis(c, block);
    } else if (c === C_OPEN_BRACKET) {
        handled = this.parseOpenBracket(block);
    } else if (c === C_BANG) {
        handled = this.parseBang(block);
    } else if (c === C_CLOSE_BRACKET) {
        handled = this.parseCloseBracket(block);
    } else if (c === C_LESSTHAN) {
        handled = this.parseAutolink(block) || this.parseHtmlTag(block);
    } else if (c === C_AMPERSAND) {
        handled = this.parseEntity(block);
    } else {
        handled = this.parseString(block);
    }

    if (!handled) {
        // Nothing matched: emit the character literally.
        this.pos += 1;
        var literalNode = new Node('Text');
        literalNode.literal = fromCodePoint(c);
        block.appendChild(literalNode);
    }

    return true;
};

// Parse string_content in block into inline children,
// using refmap to resolve references.
var parseInlines = function(block, refmap) {
    "use strict";
    this.subject = block.string_content.trim();
    this.pos = 0;
    this.refmap = refmap || {};
    this.delimiters = null;

    var more = this.parseInline(block);
    while (more) {
        more = this.parseInline(block);
    }

    // Resolve any emphasis delimiters still on the stack.
    this.processEmphasis(block, null);
};

// The InlineParser object.
function InlineParser(){ "use strict"; return { subject: '', delimiters: null, // used by parseEmphasis method pos: 0, refmap: {}, match: match, peek: peek, spnl: spnl, unescapeString: unescapeString, parseBackticks: parseBackticks, parseBackslash: parseBackslash, parseAutolink: parseAutolink, parseHtmlTag: parseHtmlTag, scanDelims: scanDelims, parseEmphasis: parseEmphasis, parseLinkTitle: parseLinkTitle, parseLinkDestination: parseLinkDestination, parseLinkLabel: parseLinkLabel, parseOpenBracket: parseOpenBracket, parseCloseBracket: parseCloseBracket, parseBang: parseBang, parseEntity: parseEntity, parseString: parseString, parseNewline: parseNewline, parseReference: parseReference, parseInline: parseInline, processEmphasis: processEmphasis, removeDelimiter: removeDelimiter, parse: parseInlines }; } module.exports = InlineParser;