diff options
| author | John MacFarlane <jgm@berkeley.edu> | 2014-07-21 22:29:16 -0700 | 
|---|---|---|
| committer | John MacFarlane <jgm@berkeley.edu> | 2014-08-13 22:56:32 -0700 | 
| commit | 870e63be7360b5a0097a27656048e853bc720464 (patch) | |
| tree | e8f19ee2d62e529115cb71dcda5f3298cca7d389 /src/blocks.c | |
| parent | 650ad87f35f4405a2ca8270d2b2835daa442e5f1 (diff) | |
Initial commit
Diffstat (limited to 'src/blocks.c')
| -rw-r--r-- | src/blocks.c | 747 | 
1 files changed, 747 insertions, 0 deletions
| diff --git a/src/blocks.c b/src/blocks.c new file mode 100644 index 0000000..2776231 --- /dev/null +++ b/src/blocks.c @@ -0,0 +1,747 @@ +#include <stdlib.h> +#include <stdio.h> +#include <stdbool.h> +#include <ctype.h> +#include "bstrlib.h" +#include "stmd.h" +#include "uthash.h" +#include "debug.h" +#include "scanners.h" + +static block* make_block(int tag, int start_line, int start_column) +{ +  block* e; +  e = (block*) malloc(sizeof(block)); +  e->tag = tag; +  e->open = true; +  e->last_line_blank = false; +  e->start_line = start_line; +  e->start_column = start_column; +  e->end_line = start_line; +  e->children = NULL; +  e->last_child = NULL; +  e->parent = NULL; +  e->top = NULL; +  e->attributes.refmap = NULL; +  e->string_content = bfromcstr(""); +  e->inline_content = NULL; +  e->next = NULL; +  e->prev = NULL; +  return e; +} + +// Create a root document block. +extern block* make_document() +{ +  block * e = make_block(document, 1, 1); +  reference * map = NULL; +  reference ** refmap; +  refmap = (reference**) malloc(sizeof(reference*)); +  *refmap = map; +  e->attributes.refmap = refmap; +  e->top = e; +  return e; +} + +// Returns true if line has only space characters, else false. 
+bool is_blank(bstring s, int offset) +{ +  char c; +  while ((c = bchar(s, offset))) { +    if (c == '\n') { +      return true; +    } else if (c == ' ') { +      offset++; +    } else { +      return false; +    } +  } +  return true; +} + +static inline bool can_contain(int parent_type, int child_type) +{ +  return ( parent_type == document || +           parent_type == block_quote || +           parent_type == list_item || +           (parent_type == list && child_type == list_item) ); +} + +static inline bool accepts_lines(int block_type) +{ +  return (block_type == paragraph || +          block_type == atx_header || +          block_type == indented_code || +          block_type == fenced_code); +} + +static int add_line(block* block, bstring ln, int offset) +{ +  bstring s = bmidstr(ln, offset, blength(ln) - offset); +  check(block->open, "attempted to add line (%s) to closed container (%d)", +        ln->data, block->tag); +  check(bformata(block->string_content, "%s", s->data) == 0, +        "could not append line to string_content"); +  bdestroy(s); +  return 0; + error: +  return -1; +} + +static int remove_trailing_blank_lines(bstring ln) +{ +  bstring tofind = bfromcstr(" \t\r\n"); +  int pos; +  // find last nonspace: +  pos = bninchrr(ln, blength(ln) - 1, tofind); +  if (pos == BSTR_ERR) { // all spaces +    bassigncstr(ln, ""); +  } else { +    // find next newline after it +    pos = bstrchrp(ln, '\n', pos); +    if (pos != BSTR_ERR) { +      check(bdelete(ln, pos, blength(ln) - pos) != BSTR_ERR, +        "failed to delete trailing blank lines"); +    } +  } +  bdestroy(tofind); +  return 0; + error: +  return -1; +} + +// Check to see if a block ends with a blank line, descending +// if needed into lists and sublists. 
+static bool ends_with_blank_line(block* block) +{ +  if (block->last_line_blank) { +    return true; +  } +  if ((block->tag == list || block->tag == list_item) && block->last_child) { +    return ends_with_blank_line(block->last_child); +  } else { +    return false; +  } +} + +// Break out of all containing lists +static int break_out_of_lists(block ** bptr, int line_number) +{ +  block * container = *bptr; +  block * b = container->top; +  // find first containing list: +  while (b && b->tag != list) { +    b = b->last_child; +  } +  if (b) { +    while (container && container != b) { +      finalize(container, line_number); +      container = container->parent; +    } +    finalize(b, line_number); +    *bptr = b->parent; +  } +  return 0; +} + + +extern int finalize(block* b, int line_number) +{ +  int firstlinelen; +  int pos; +  block* item; +  block* subitem; + +  check(b != NULL, "finalize called on null block"); +  if (!b->open) { +    return 0; // don't do anything if the block is already closed +  } +  b->open = false; +  if (line_number > b->start_line) { +    b->end_line = line_number - 1; +  } else { +    b->end_line = line_number; +  } + +  switch (b->tag) { + +  case paragraph: +    pos = 0; +    while (bchar(b->string_content, 0) == '[' && +           (pos = parse_reference(b->string_content, +                                  b->top->attributes.refmap))) { +      bdelete(b->string_content, 0, pos); +    } +    if (is_blank(b->string_content, 0)) { +      b->tag = reference_def; +    } +    break; + +  case indented_code: +    remove_trailing_blank_lines(b->string_content); +    bformata(b->string_content, "\n"); +    break; + +  case fenced_code: +    // first line of contents becomes info +    firstlinelen = bstrchr(b->string_content, '\n'); +    b->attributes.fenced_code_data.info = +      bmidstr(b->string_content, 0, firstlinelen); +    bdelete(b->string_content, 0, firstlinelen + 1); // +1 for \n +    
btrimws(b->attributes.fenced_code_data.info); +    unescape(b->attributes.fenced_code_data.info); +    break; + +  case list: // determine tight/loose status +    b->attributes.list_data.tight = true; // tight by default +    item = b->children; + +    while (item) { +      // check for non-final non-empty list item ending with blank line: +      if (item->last_line_blank && item->next) { +        b->attributes.list_data.tight = false; +        break; +      } +      // recurse into children of list item, to see if there are +      // spaces between them: +      subitem = item->children; +      while (subitem) { +        if (ends_with_blank_line(subitem) && +            (item->next || subitem->next)) { +          b->attributes.list_data.tight = false; +          break; +        } +        subitem = subitem->next; +      } +      if (!(b->attributes.list_data.tight)) { +        break; +      } +      item = item->next; +    } + +    break; + +  default: +    break; +  } + +  return 0; + error: +  return -1; +} + +// Add a block as child of another.  Return pointer to child. +extern block* add_child(block* parent, +                        int block_type, int start_line, int start_column) +{ +  // if 'parent' isn't the kind of block that can accept this child, +  // then back up til we hit a block that can. +  while (!can_contain(parent->tag, block_type)) { +    finalize(parent, start_line); +    parent = parent->parent; +  } + +  check(parent != NULL, "parent container cannot accept children"); + +  block* child = make_block(block_type, start_line, start_column); +  child->parent = parent; +  child->top = parent->top; + +  if (parent->last_child) { +    parent->last_child->next = child; +    child->prev = parent->last_child; +  } else { +    parent->children = child; +    child->prev = NULL; +  } +  parent->last_child = child; +  return child; + error: +  return NULL; +} + +// Free a block list and any children. 
+extern void free_blocks(block* e) +{ +  block * next; +  while (e != NULL) { +    next = e->next; +    free_inlines(e->inline_content); +    bdestroy(e->string_content); +    if (e->tag == fenced_code) { +      bdestroy(e->attributes.fenced_code_data.info); +    } else if (e->tag == document) { +      free_reference_map(e->attributes.refmap); +    } +    free_blocks(e->children); +    free(e); +    e = next; +  } +} + +// Walk through block and all children, recursively, parsing +// string content into inline content where appropriate. +int process_inlines(block* cur, reference** refmap) +{ +  switch (cur->tag) { + +  case paragraph: +  case atx_header: +  case setext_header: +    check(cur->string_content != NULL, "string_content is NULL"); +    cur->inline_content = parse_inlines(cur->string_content, refmap); +    bdestroy(cur->string_content); +    cur->string_content = NULL; +    break; + +  default: +    break; +  } + +  block * child = cur->children; +  while (child != NULL) { +    process_inlines(child, refmap); +    child = child->next; +  } + +  return 0; + error: +  return -1; +} + +// Attempts to parse a list item marker (bullet or enumerated). +// On success, returns length of the marker, and populates +// data with the details.  On failure, returns 0. 
+static int parse_list_marker(bstring ln, int pos, +                             struct ListData ** dataptr) +{ +  char c; +  int startpos; +  int start = 1; +  struct ListData * data; + +  startpos = pos; +  c = bchar(ln, pos); + +  if ((c == '*' || c == '-' || c == '+') && !scan_hrule(ln, pos)) { +    pos++; +    if (!isspace(bchar(ln, pos))) { +      return 0; +    } +    data = malloc(sizeof(struct ListData)); +    data->marker_offset = 0; // will be adjusted later +    data->list_type = bullet; +    data->bullet_char = c; +    data->start = 1; +    data->delimiter = period; +    data->tight = false; + +  } else if (isdigit(c)) { + +    pos++; +    while (isdigit(bchar(ln, pos))) { +      pos++; +    } + +    if (!sscanf((char *) ln->data + startpos, "%d", &start)) { +      log_err("sscanf failed"); +      return 0; +    } + +    c = bchar(ln, pos); +    if (c == '.' || c == ')') { +      pos++; +      if (!isspace(bchar(ln, pos))) { +        return 0; +      } +      data = malloc(sizeof(struct ListData)); +      data->marker_offset = 0; // will be adjusted later +      data->list_type = ordered; +      data->bullet_char = 0; +      data->start = start; +      data->delimiter = (c == '.' ? period : parens); +      data->tight = false; +    } else { +      return 0; +    } + +  } else { +    return 0; +  } + +  *dataptr = data; +  return (pos - startpos); +} + +// Return 1 if list item belongs in list, else 0. +static int lists_match(struct ListData list_data, +                       struct ListData item_data) +{ +  return (list_data.list_type == item_data.list_type && +          list_data.delimiter == item_data.delimiter && +          // list_data.marker_offset == item_data.marker_offset && +          list_data.bullet_char == item_data.bullet_char); +} + +// Process one line at a time, modifying a block. +// Returns 0 if successful.  curptr is changed to point to +// the currently open block. 
extern int incorporate_line(bstring ln, int line_number, block** curptr)
{
  // Overview of the phases below:
  //   1. walk down the chain of open last children from the document root,
  //      checking that 'ln' continues each one;
  //   2. from the last matched container, try to open new containers
  //      (indented code, block quote, headers, fences, html, hrule, lists);
  //   3. add the remaining text to the right block, either as a lazy
  //      paragraph continuation or after finalizing unmatched blocks.
  // 'offset' is the position in 'ln' consumed so far.
  block* last_matched_container;
  int offset = 0;
  int matched = 0;
  int lev = 0;
  int i;
  struct ListData * data = NULL;
  bool all_matched = true;
  block* container;
  block* cur = *curptr;
  bool blank = false;
  int first_nonspace;
  int indent;

  // detab input line
  check(bdetab(ln, 1) != BSTR_ERR,
        "invalid UTF-8 sequence in line %d\n", line_number);

  // container starts at the document root.
  container = cur->top;

  // for each containing block, try to parse the associated line start.
  // bail out on failure:  container will point to the last matching block.

  while (container->last_child && container->last_child->open) {
    container = container->last_child;

    // locate the first non-space character from the current offset
    first_nonspace = offset;
    while (bchar(ln, first_nonspace) == ' ') {
      first_nonspace++;
    }

    indent = first_nonspace - offset;
    blank = bchar(ln, first_nonspace) == '\n';

    if (container->tag == block_quote) {

      // continues with '>' indented at most 3 spaces; one following
      // space is consumed as part of the marker
      matched = indent <= 3 && bchar(ln, first_nonspace) == '>';
      if (matched) {
        offset = first_nonspace + 1;
        if (bchar(ln, offset) == ' ') {
          offset++;
        }
      } else {
        all_matched = false;
      }

    } else if (container->tag == list_item) {

      // continues if indented at least marker_offset + padding,
      // or if the line is blank
      if (indent >= container->attributes.list_data.marker_offset +
          container->attributes.list_data.padding) {
        offset += container->attributes.list_data.marker_offset +
          container->attributes.list_data.padding;
      } else if (blank) {
        offset = first_nonspace;
      } else {
        all_matched = false;
      }

    } else if (container->tag == indented_code) {

      // continues if indented at least CODE_INDENT, or if blank
      if (indent >= CODE_INDENT) {
        offset += CODE_INDENT;
      } else if (blank) {
        offset = first_nonspace;
      } else {
        all_matched = false;
      }

    } else if (container->tag == atx_header ||
               container->tag == setext_header) {

      // a header can never contain more than one line
      all_matched = false;

    } else if (container->tag == fenced_code) {

      // skip optional spaces of fence offset
      i = container->attributes.fenced_code_data.fence_offset;
      while (i > 0 && bchar(ln, offset) == ' ') {
        offset++;
        i--;
      }

    } else if (container->tag == html_block) {

      // a blank line ends an html block
      if (blank) {
        all_matched = false;
      }

    } else if (container->tag == paragraph) {

      // a blank line ends a paragraph and is recorded on it
      if (blank) {
        container->last_line_blank =true;
        all_matched = false;
      }

    }

    if (!all_matched) {
      container = container->parent;  // back up to last matching block
      break;
    }
  }

  last_matched_container = container;

  // check to see if we've hit 2nd blank line, break out of list:
  if (blank && container->last_line_blank) {
    break_out_of_lists(&container, line_number);
  }

  // unless last matched container is code block, try new container starts:
  while (container->tag != fenced_code && container->tag != indented_code &&
         container->tag != html_block) {

    first_nonspace = offset;
    while (bchar(ln, first_nonspace) == ' ') {
      first_nonspace++;
    }

    indent = first_nonspace - offset;
    blank = bchar(ln, first_nonspace) == '\n';

    if (indent >= CODE_INDENT) {

      // indented code cannot start while the current block is a
      // paragraph, nor on a blank line
      if (cur->tag != paragraph && !blank) {
        offset += CODE_INDENT;
        container = add_child(container, indented_code, line_number, offset + 1);
      } else { // indent > 4 in lazy line
        break;
      }

    } else if (bchar(ln, first_nonspace) == '>') {

      offset = first_nonspace + 1;
      // optional following character
      if (bchar(ln, offset) == ' ') {
        offset++;
      }
      container = add_child(container, block_quote, line_number, offset + 1);

    } else if ((matched = scan_atx_header_start(ln, first_nonspace))) {

      offset = first_nonspace + matched;
      container = add_child(container, atx_header, line_number, offset + 1);
      // header level = number of consecutive '#' at the start
      int hashpos = bstrchrp(ln, '#', first_nonspace);
      check(hashpos != BSTR_ERR, "no # found in atx header start");
      int level = 0;
      while (bchar(ln, hashpos) == '#') {
        level++;
        hashpos++;
      }
      container->attributes.header_level = level;

    } else if ((matched = scan_open_code_fence(ln, first_nonspace))) {

      // record fence character, length and indentation for later
      // matching of the closing fence and stripping of content indent
      container = add_child(container, fenced_code, line_number,
          first_nonspace + 1);
      container->attributes.fenced_code_data.fence_char = bchar(ln,
          first_nonspace);
      container->attributes.fenced_code_data.fence_length = matched;
      container->attributes.fenced_code_data.fence_offset =
        first_nonspace - offset;
      offset = first_nonspace + matched;

    } else if ((matched = scan_html_block_tag(ln, first_nonspace))) {

      container = add_child(container, html_block, line_number,
                            first_nonspace + 1);
      // note, we don't adjust offset because the tag is part of the text

    } else if (container->tag == paragraph &&
              (lev = scan_setext_header_line(ln, first_nonspace)) &&
               // check that there is only one line in the paragraph:
               bstrrchrp(container->string_content, '\n',
                         blength(container->string_content) - 2) == BSTR_ERR) {

        // the paragraph is retagged as a setext header; moving offset to
        // the last character leaves nothing of the underline as text
        container->tag = setext_header;
        container->attributes.header_level = lev;
        offset = blength(ln) - 1;

    } else if (!(container->tag == paragraph && !all_matched) &&
               (matched = scan_hrule(ln, first_nonspace))) {

      // it's only now that we know the line is not part of a setext header:
      container = add_child(container, hrule, line_number, first_nonspace + 1);
      finalize(container, line_number);
      container = container->parent;
      offset = blength(ln) - 1;

    } else if ((matched = parse_list_marker(ln, first_nonspace, &data))) {

        // compute padding:
        offset = first_nonspace + matched;
        i = 0;
        while (i <= 5 && bchar(ln, offset + i) == ' ') {
          i++;
        }
        // i = number of spaces after marker, up to 5
        // with 5 or more spaces, none, or an empty item, only one
        // column after the marker counts as padding
        if (i >= 5 || i < 1 || bchar(ln, offset) == '\n') {
          data->padding = matched + 1;
          if (i > 0) {
            offset += 1;
          }
        } else {
          data->padding = matched + i;
          offset += i;
        }

        // check container; if it's a list, see if this list item
        // can continue the list; otherwise, create a list container.

        data->marker_offset = indent;

        if (container->tag != list ||
            !lists_match(container->attributes.list_data, *data)) {
          container = add_child(container, list, line_number,
              first_nonspace + 1);
          container->attributes.list_data = *data;
        }

        // add the list item
        container = add_child(container, list_item, line_number,
            first_nonspace + 1);
        container->attributes.list_data = *data;
        // the marker data was copied by value above, so release it
        free(data);

    } else {
      break;
    }

    if (accepts_lines(container->tag)) {
      // if it's a line container, it can't contain other containers
      break;
    }
  }

  // what remains at offset is a text line.  add the text to the
  // appropriate container.

  first_nonspace = offset;
  while (bchar(ln, first_nonspace) == ' ') {
    first_nonspace++;
  }

  indent = first_nonspace - offset;
  blank = bchar(ln, first_nonspace) == '\n';

  // block quote lines are never blank as they start with >
  // and we don't count blanks in fenced code for purposes of tight/loose
  // lists or breaking out of lists.  we also don't set last_line_blank
  // on an empty list item.
  container->last_line_blank = (blank &&
                                container->tag != block_quote &&
                                container->tag != fenced_code &&
                                !(container->tag == list_item &&
                                  container->children == NULL &&
                                  container->start_line == line_number));

    // clear last_line_blank on every ancestor of the container
    block *cont = container;
    while (cont->parent) {
      cont->parent->last_line_blank = false;
      cont = cont->parent;
    }

  // lazy continuation: no new container was opened, the line is not
  // blank, and the current block is a non-empty paragraph that was not
  // matched -- the text is appended to that paragraph anyway
  if (cur != last_matched_container &&
      container == last_matched_container &&
      !blank &&
      cur->tag == paragraph &&
      blength(cur->string_content) > 0) {

    check(add_line(cur, ln, offset) == 0, "could not add line");

  } else { // not a lazy continuation

    // finalize any blocks that were not matched and set cur to container:
    while (cur != last_matched_container) {

      finalize(cur, line_number);
      cur = cur->parent;
      check(cur != NULL, "cur is NULL, last_matched_container->tag = %d",
            last_matched_container->tag);

    }

    if (container->tag == indented_code) {

      check(add_line(container, ln, offset) == 0, "could not add line");

    } else if (container->tag == fenced_code) {

      // a closing fence is indented at most 3 spaces, uses the same
      // fence character, and is accepted by scan_close_code_fence for
      // the recorded fence length
      matched = (indent <= 3
        && bchar(ln, first_nonspace) == container->attributes.fenced_code_data.fence_char)
        && scan_close_code_fence(ln, first_nonspace,
                                 container->attributes.fenced_code_data.fence_length);
      if (matched) {
        // if closing fence, don't add line to container; instead, close it:
        finalize(container, line_number);
        container = container->parent; // back up to parent
      } else {
        check(add_line(container, ln, offset) == 0, "could not add line");
      }

    } else if (container->tag == html_block) {

      check(add_line(container, ln, offset) == 0, "could not add line");

    } else if (blank) {

      // ??? do nothing

    } else if (container->tag == atx_header) {

      // chop off trailing ###s...use a scanner?
      brtrimws(ln);
      int p = blength(ln) - 1;
      int numhashes = 0;
      // if string ends in #s, remove these:
      while (bchar(ln, p) == '#') {
        p--;
        numhashes++;
      }
      if (bchar(ln, p) == '\\') {
        // the last # was escaped, so we include it.
        p++;
        numhashes--;
      }
      check(bdelete(ln, p + 1, numhashes) != BSTR_ERR,
            "could not delete final hashes");
      check(add_line(container, ln, first_nonspace) == 0, "could not add line");
      // an ATX header is complete after its single line
      finalize(container, line_number);
      container = container->parent;

    } else if (accepts_lines(container->tag)) {

      check(add_line(container, ln, first_nonspace) == 0, "could not add line");

    } else if (container->tag != hrule && container->tag != setext_header) {

      // create paragraph container for line
      container = add_child(container, paragraph, line_number, first_nonspace + 1);
      check(add_line(container, ln, first_nonspace) == 0, "could not add line");

    } else {

      log_warn("Line %d with container type %d did not match any condition:\n\"%s\"",
               line_number, container->tag, ln->data);

    }
    // only the non-lazy path moves the caller's current-block pointer
    *curptr = container;
  }

  return 0;
 error:
  return -1;
}
