Refactor: Isolate dictionary of reference defintions.

This commit is contained in:
Martin Mitas 2017-07-17 08:50:04 +02:00
parent 07c5dba9ba
commit ff524e9104

View File

@ -75,7 +75,7 @@
typedef struct MD_MARK_tag MD_MARK;
typedef struct MD_BLOCK_tag MD_BLOCK;
typedef struct MD_CONTAINER_tag MD_CONTAINER;
typedef struct MD_LINK_REF_DEF_tag MD_LINK_REF_DEF;
typedef struct MD_REF_DEF_tag MD_REF_DEF;
/* During analyzes of inline marks, we need to manage some "mark chains",
@ -101,10 +101,10 @@ struct MD_CTX_tag {
CHAR* buffer;
unsigned alloc_buffer;
/* Link reference definitions. */
MD_LINK_REF_DEF* link_ref_defs;
int n_link_ref_defs;
int alloc_link_ref_defs;
/* Reference definitions. */
MD_REF_DEF* ref_defs;
int n_ref_defs;
int alloc_ref_defs;
/* Stack of inline/span markers.
* This is only used for parsing a single block contents but by storing it
@ -910,6 +910,22 @@ md_merge_lines_alloc(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_
return 0;
}
static OFF
md_skip_unicode_whitespace(const CHAR* label, OFF off, SZ size)
{
SZ char_size;
int codepoint;
while(off < size) {
codepoint = md_decode_unicode(label, off, size, &char_size);
if(!ISUNICODEWHITESPACE_(codepoint) && !ISNEWLINE_(label[off]))
break;
off += char_size;
}
return off;
}
/******************************
*** Recognizing raw HTML ***
@ -1459,15 +1475,11 @@ md_free_attribute(MD_CTX* ctx, MD_ATTRIBUTE* attr)
}
/***************************
*** Recognizing Links ***
***************************/
/*********************************************
*** Dictionary of Reference Definitions ***
*********************************************/
/* Note this code is partially shared between processing inlines and blocks
* as link reference definitions and links share some helper parser functions.
*/
struct MD_LINK_REF_DEF_tag {
struct MD_REF_DEF_tag {
CHAR* label;
CHAR* title;
SZ label_size : 24;
@ -1478,6 +1490,113 @@ struct MD_LINK_REF_DEF_tag {
OFF dest_end;
};
static int
md_link_label_cmp(const void* a, const void* b)
{
const CHAR* a_label = ((const MD_REF_DEF*)a)->label;
const CHAR* b_label = ((const MD_REF_DEF*)b)->label;
SZ a_size = ((const MD_REF_DEF*)a)->label_size;
SZ b_size = ((const MD_REF_DEF*)b)->label_size;
OFF a_off;
OFF b_off;
/* The slow path, with Unicode case folding and Unicode whitespace collapsing. */
a_off = md_skip_unicode_whitespace(a_label, 0, a_size);
b_off = md_skip_unicode_whitespace(b_label, 0, b_size);
while(a_off < a_size || b_off < b_size) {
int a_codepoint, b_codepoint;
SZ a_char_size, b_char_size;
int a_is_whitespace, b_is_whitespace;
if(a_off < a_size) {
a_codepoint = md_decode_unicode(a_label, a_off, a_size, &a_char_size);
a_is_whitespace = ISUNICODEWHITESPACE_(a_codepoint) || ISNEWLINE_(a_label[a_off]);
} else {
/* Treat end of label as a whitespace. */
a_codepoint = -1;
a_is_whitespace = TRUE;
}
if(b_off < b_size) {
b_codepoint = md_decode_unicode(b_label, b_off, b_size, &b_char_size);
b_is_whitespace = ISUNICODEWHITESPACE_(b_codepoint) || ISNEWLINE_(b_label[b_off]);
} else {
/* Treat end of label as a whitespace. */
b_codepoint = -1;
b_is_whitespace = TRUE;
}
if(a_is_whitespace || b_is_whitespace) {
if(!a_is_whitespace || !b_is_whitespace)
return (a_is_whitespace ? -1 : +1);
a_off = md_skip_unicode_whitespace(a_label, a_off, a_size);
b_off = md_skip_unicode_whitespace(b_label, b_off, b_size);
} else {
MD_UNICODE_FOLD_INFO a_fold_info, b_fold_info;
int cmp;
md_get_unicode_fold_info(a_codepoint, &a_fold_info);
md_get_unicode_fold_info(b_codepoint, &b_fold_info);
if(a_fold_info.n_codepoints != b_fold_info.n_codepoints)
return (a_fold_info.n_codepoints - b_fold_info.n_codepoints);
cmp = memcmp(a_fold_info.codepoints, b_fold_info.codepoints, a_fold_info.n_codepoints * sizeof(int));
if(cmp != 0)
return cmp;
a_off += a_char_size;
b_off += b_char_size;
}
}
return 0;
}
static int
md_link_label_cmp_for_qsort(const void* a, const void* b)
{
int cmp;
cmp = md_link_label_cmp(a, b);
if(cmp != 0)
return cmp;
/* The specification requests that only first reference definition
* with the same label is valid. So make sure we make a stable sort
* from the qsort(). */
return ((MD_REF_DEF*) a - (MD_REF_DEF*) b);
}
static inline const MD_REF_DEF*
md_lookup_ref_def(MD_CTX* ctx, const CHAR* label, SZ label_size)
{
MD_REF_DEF key;
key.label = (CHAR*) label;
key.label_size = label_size;
return bsearch(&key, ctx->ref_defs, ctx->n_ref_defs,
sizeof(MD_REF_DEF), md_link_label_cmp);
}
static void
md_build_ref_def_dict(MD_CTX* ctx)
{
/* Sort reference definitions so we can use bsearch() for their lookup. */
qsort(ctx->ref_defs, ctx->n_ref_defs, sizeof(MD_REF_DEF),
md_link_label_cmp_for_qsort);
}
/***************************
*** Recognizing Links ***
***************************/
/* Note this code is partially shared between processing inlines and blocks
* as reference definitions and links share some helper parser functions.
*/
typedef struct MD_LINK_ATTR_tag MD_LINK_ATTR;
struct MD_LINK_ATTR_tag {
OFF dest_beg;
@ -1695,9 +1814,9 @@ md_is_link_title(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
return FALSE;
}
/* Returns 0 if it is not a link reference definition.
/* Returns 0 if it is not a reference definition.
*
* Returns N > 0 if it is not a link reference definition (then N corresponds
* Returns N > 0 if it is not a reference definition (then N corresponds
* to the number of lines forming it). In this case the definition is stored
* for resolving any links referring to it.
*
@ -1722,7 +1841,7 @@ md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
OFF off;
int line_index = 0;
int tmp_line_index;
MD_LINK_REF_DEF* def;
MD_REF_DEF* def;
int ret;
/* Link label. */
@ -1786,23 +1905,23 @@ md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
label_needs_free = TRUE;
}
/* Store the link reference definition. */
if(ctx->n_link_ref_defs >= ctx->alloc_link_ref_defs) {
MD_LINK_REF_DEF* new_defs;
/* Store the reference definition. */
if(ctx->n_ref_defs >= ctx->alloc_ref_defs) {
MD_REF_DEF* new_defs;
ctx->alloc_link_ref_defs = (ctx->alloc_link_ref_defs > 0 ? ctx->alloc_link_ref_defs * 2 : 16);
new_defs = (MD_LINK_REF_DEF*) realloc(ctx->link_ref_defs, ctx->alloc_link_ref_defs * sizeof(MD_LINK_REF_DEF));
ctx->alloc_ref_defs = (ctx->alloc_ref_defs > 0 ? ctx->alloc_ref_defs * 2 : 16);
new_defs = (MD_REF_DEF*) realloc(ctx->ref_defs, ctx->alloc_ref_defs * sizeof(MD_REF_DEF));
if(new_defs == NULL) {
MD_LOG("realloc() failed.");
ret = -1;
goto abort;
}
ctx->link_ref_defs = new_defs;
ctx->ref_defs = new_defs;
}
def = &ctx->link_ref_defs[ctx->n_link_ref_defs];
memset(def, 0, sizeof(MD_LINK_REF_DEF));
def = &ctx->ref_defs[ctx->n_ref_defs];
memset(def, 0, sizeof(MD_REF_DEF));
def->label = label;
def->label_size = label_size;
@ -1825,7 +1944,7 @@ md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
}
/* Success. */
ctx->n_link_ref_defs++;
ctx->n_ref_defs++;
return line_index + 1;
abort:
@ -1835,118 +1954,11 @@ abort:
return -1;
}
static OFF
md_skip_unicode_whitespace(const CHAR* label, OFF off, SZ size)
{
SZ char_size;
int codepoint;
while(off < size) {
codepoint = md_decode_unicode(label, off, size, &char_size);
if(!ISUNICODEWHITESPACE_(codepoint) && !ISNEWLINE_(label[off]))
break;
off += char_size;
}
return off;
}
static int
md_link_label_cmp(const void* a, const void* b)
{
const CHAR* a_label = ((const MD_LINK_REF_DEF*)a)->label;
const CHAR* b_label = ((const MD_LINK_REF_DEF*)b)->label;
SZ a_size = ((const MD_LINK_REF_DEF*)a)->label_size;
SZ b_size = ((const MD_LINK_REF_DEF*)b)->label_size;
OFF a_off;
OFF b_off;
/* The slow path, with Unicode case folding and Unicode whitespace collapsing. */
a_off = md_skip_unicode_whitespace(a_label, 0, a_size);
b_off = md_skip_unicode_whitespace(b_label, 0, b_size);
while(a_off < a_size || b_off < b_size) {
int a_codepoint, b_codepoint;
SZ a_char_size, b_char_size;
int a_is_whitespace, b_is_whitespace;
if(a_off < a_size) {
a_codepoint = md_decode_unicode(a_label, a_off, a_size, &a_char_size);
a_is_whitespace = ISUNICODEWHITESPACE_(a_codepoint) || ISNEWLINE_(a_label[a_off]);
} else {
/* Treat end of label as a whitespace. */
a_codepoint = -1;
a_is_whitespace = TRUE;
}
if(b_off < b_size) {
b_codepoint = md_decode_unicode(b_label, b_off, b_size, &b_char_size);
b_is_whitespace = ISUNICODEWHITESPACE_(b_codepoint) || ISNEWLINE_(b_label[b_off]);
} else {
/* Treat end of label as a whitespace. */
b_codepoint = -1;
b_is_whitespace = TRUE;
}
if(a_is_whitespace || b_is_whitespace) {
if(!a_is_whitespace || !b_is_whitespace)
return (a_is_whitespace ? -1 : +1);
a_off = md_skip_unicode_whitespace(a_label, a_off, a_size);
b_off = md_skip_unicode_whitespace(b_label, b_off, b_size);
} else {
MD_UNICODE_FOLD_INFO a_fold_info, b_fold_info;
int cmp;
md_get_unicode_fold_info(a_codepoint, &a_fold_info);
md_get_unicode_fold_info(b_codepoint, &b_fold_info);
if(a_fold_info.n_codepoints != b_fold_info.n_codepoints)
return (a_fold_info.n_codepoints - b_fold_info.n_codepoints);
cmp = memcmp(a_fold_info.codepoints, b_fold_info.codepoints, a_fold_info.n_codepoints * sizeof(int));
if(cmp != 0)
return cmp;
a_off += a_char_size;
b_off += b_char_size;
}
}
return 0;
}
static int
md_link_label_cmp_for_qsort(const void* a, const void* b)
{
int cmp;
cmp = md_link_label_cmp(a, b);
if(cmp != 0)
return cmp;
/* The specification requests that only first link reference definition
* with the same label is valid. So make sure we make a stable sort
* from the qsort(). */
return ((MD_LINK_REF_DEF*) a - (MD_LINK_REF_DEF*) b);
}
static inline const MD_LINK_REF_DEF*
md_lookup_link_ref_def(MD_CTX* ctx, const CHAR* label, SZ label_size)
{
MD_LINK_REF_DEF key;
key.label = (CHAR*) label;
key.label_size = label_size;
return bsearch(&key, ctx->link_ref_defs, ctx->n_link_ref_defs,
sizeof(MD_LINK_REF_DEF), md_link_label_cmp);
}
static int
md_is_link_reference(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
OFF beg, OFF end, MD_LINK_ATTR* attr)
{
const MD_LINK_REF_DEF* def;
const MD_REF_DEF* def;
const MD_LINE* beg_line;
const MD_LINE* end_line;
CHAR* label;
@ -1978,7 +1990,7 @@ md_is_link_reference(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
label_size = end - beg;
}
def = md_lookup_link_ref_def(ctx, label, label_size);
def = md_lookup_ref_def(ctx, label, label_size);
if(def != NULL) {
attr->dest_beg = def->dest_beg;
attr->dest_end = def->dest_end;
@ -2084,12 +2096,12 @@ abort:
}
static void
md_free_link_ref_defs(MD_CTX* ctx)
md_free_ref_defs(MD_CTX* ctx)
{
int i;
for(i = 0; i < ctx->n_link_ref_defs; i++) {
MD_LINK_REF_DEF* def = &ctx->link_ref_defs[i];
for(i = 0; i < ctx->n_ref_defs; i++) {
MD_REF_DEF* def = &ctx->ref_defs[i];
if(def->label_needs_free)
free(def->label);
@ -2097,7 +2109,7 @@ md_free_link_ref_defs(MD_CTX* ctx)
free(def->title);
}
free(ctx->link_ref_defs);
free(ctx->ref_defs);
}
@ -4335,10 +4347,10 @@ md_start_new_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* line)
return 0;
}
/* Eat from start of current (textual) block any link reference definitions
* and remember them so we can resolve any links referring to them.
/* Eat from start of current (textual) block any reference definitions and
* remember them so we can resolve any links referring to them.
*
* (Link reference definitions can only be at start of it as they cannot break
* (Reference definitions can only be at start of it as they cannot break
* a paragraph.)
*/
static int
@ -4349,17 +4361,17 @@ md_consume_link_reference_definitions(MD_CTX* ctx)
int n = 0;
/* Compute how many lines at the start of the block form one or more
* link reference definitions. */
* reference definitions. */
while(n < n_lines) {
int n_link_ref_lines;
n_link_ref_lines = md_is_link_reference_definition(ctx,
lines + n, n_lines - n);
/* Not a link reference definition? */
/* Not a reference definition? */
if(n_link_ref_lines == 0)
break;
/* We fail if it is the link ref. def. but it could not be stored due
/* We fail if it is the ref. def. but it could not be stored due
* a memory allocation error. */
if(n_link_ref_lines < 0)
return -1;
@ -4367,7 +4379,7 @@ md_consume_link_reference_definitions(MD_CTX* ctx)
n += n_link_ref_lines;
}
/* If there was at least one link reference definition, we need to remove
/* If there was at least one reference definition, we need to remove
* its lines from the block, or perhaps even the whole block. */
if(n > 0) {
if(n == n_lines) {
@ -4393,9 +4405,9 @@ md_end_current_block(MD_CTX* ctx)
if(ctx->current_block == NULL)
return ret;
/* Check whether there is a link reference definition. (We do this here
* instead of in md_analyze_line() because link reference definition can
* take multiple lines.) */
/* Check whether there is a reference definition. (We do this here instead
* of in md_analyze_line() because reference definition can take multiple
* lines.) */
if(ctx->current_block->type == MD_BLOCK_P) {
MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1);
if(CH(lines[0].beg) == _T('['))
@ -5556,9 +5568,7 @@ md_process_doc(MD_CTX *ctx)
md_end_current_block(ctx);
/* Sort link reference definitions so we can use bsearch() for their lookup. */
qsort(ctx->link_ref_defs, ctx->n_link_ref_defs, sizeof(MD_LINK_REF_DEF),
md_link_label_cmp_for_qsort);
md_build_ref_def_dict(ctx);
/* Process all blocks. */
MD_CHECK(md_leave_child_containers(ctx, 0));
@ -5626,7 +5636,7 @@ md_parse(const MD_CHAR* text, MD_SIZE size, const MD_RENDERER* renderer, void* u
ret = md_process_doc(&ctx);
/* Clean-up. */
md_free_link_ref_defs(&ctx);
md_free_ref_defs(&ctx);
free(ctx.buffer);
free(ctx.marks);
free(ctx.block_bytes);