Fix O(n^2) in handling the "rule of three".
We had to break the list of potential '*' openers into multiple ones so we do not have to walk it when looking for matching length due to the "rule of three" for intraword delimiter runs. Fixes #63.
This commit is contained in:
parent
b21086522e
commit
2dd96ab4ac
15
CHANGELOG.md
15
CHANGELOG.md
@ -6,16 +6,17 @@
|
||||
|
||||
Fixes:
|
||||
|
||||
* Fixed some quadratic behaviors:
|
||||
[#58](https://github.com/mity/md4c/issues/58),
|
||||
* [#58](https://github.com/mity/md4c/issues/58),
|
||||
[#59](https://github.com/mity/md4c/issues/59),
|
||||
[#60](https://github.com/mity/md4c/issues/60),
|
||||
[#66](https://github.com/mity/md4c/issues/66)
|
||||
[#63](https://github.com/mity/md4c/issues/63),
|
||||
[#66](https://github.com/mity/md4c/issues/66):
|
||||
Some inputs could lead to quadratic parsing times. Thanks to Anders Kaseorg
|
||||
for finding all those issues.
|
||||
|
||||
Thanks to Anders Kaseorg for finding all those issues.
|
||||
|
||||
* [#61](https://github.com/mity/md4c/issues/59): Flag `MD_FLAG_NOHTMLSPANS`
|
||||
erroneously affected also recognition of CommonMark autolinks.
|
||||
* [#61](https://github.com/mity/md4c/issues/59):
|
||||
Flag `MD_FLAG_NOHTMLSPANS` erroneously affected also recognition of
|
||||
CommonMark autolinks.
|
||||
|
||||
|
||||
## Version 0.3.0
|
||||
|
201
md4c/md4c.c
201
md4c/md4c.c
@ -123,15 +123,20 @@ struct MD_CTX_tag {
|
||||
#endif
|
||||
|
||||
/* For resolving of inline spans. */
|
||||
MD_MARKCHAIN mark_chains[6];
|
||||
#define PTR_CHAIN ctx->mark_chains[0]
|
||||
#define TABLECELLBOUNDARIES ctx->mark_chains[1]
|
||||
#define ASTERISK_OPENERS ctx->mark_chains[2]
|
||||
#define UNDERSCORE_OPENERS ctx->mark_chains[3]
|
||||
#define TILDE_OPENERS ctx->mark_chains[4]
|
||||
#define BRACKET_OPENERS ctx->mark_chains[5]
|
||||
#define OPENERS_CHAIN_FIRST 2
|
||||
#define OPENERS_CHAIN_LAST 5
|
||||
MD_MARKCHAIN mark_chains[11];
|
||||
#define PTR_CHAIN ctx->mark_chains[0]
|
||||
#define TABLECELLBOUNDARIES ctx->mark_chains[1]
|
||||
#define ASTERISK_OPENERS_extraword_mod3_0 ctx->mark_chains[2]
|
||||
#define ASTERISK_OPENERS_extraword_mod3_1 ctx->mark_chains[3]
|
||||
#define ASTERISK_OPENERS_extraword_mod3_2 ctx->mark_chains[4]
|
||||
#define ASTERISK_OPENERS_intraword_mod3_0 ctx->mark_chains[5]
|
||||
#define ASTERISK_OPENERS_intraword_mod3_1 ctx->mark_chains[6]
|
||||
#define ASTERISK_OPENERS_intraword_mod3_2 ctx->mark_chains[7]
|
||||
#define UNDERSCORE_OPENERS ctx->mark_chains[8]
|
||||
#define TILDE_OPENERS ctx->mark_chains[9]
|
||||
#define BRACKET_OPENERS ctx->mark_chains[10]
|
||||
#define OPENERS_CHAIN_FIRST 2
|
||||
#define OPENERS_CHAIN_LAST 10
|
||||
|
||||
int n_table_cell_boundaries;
|
||||
|
||||
@ -2493,13 +2498,41 @@ struct MD_MARK_tag {
|
||||
|
||||
/* Mark flags specific for various mark types (so they can share bits). */
|
||||
#define MD_MARK_EMPH_INTRAWORD 0x20 /* Helper for the "rule of 3". */
|
||||
#define MD_MARK_EMPH_MODULO3_0 0x40
|
||||
#define MD_MARK_EMPH_MODULO3_1 0x80
|
||||
#define MD_MARK_EMPH_MODULO3_2 (0x40 | 0x80)
|
||||
#define MD_MARK_EMPH_MODULO3_MASK (0x40 | 0x80)
|
||||
#define MD_MARK_EMPH_MOD3_0 0x40
|
||||
#define MD_MARK_EMPH_MOD3_1 0x80
|
||||
#define MD_MARK_EMPH_MOD3_2 (0x40 | 0x80)
|
||||
#define MD_MARK_EMPH_MOD3_MASK (0x40 | 0x80)
|
||||
#define MD_MARK_AUTOLINK 0x20 /* Distinguisher for '<', '>'. */
|
||||
#define MD_MARK_VALIDPERMISSIVEAUTOLINK 0x20 /* For permissive autolinks. */
|
||||
|
||||
static MD_MARKCHAIN*
|
||||
md_asterisk_chain(MD_CTX* ctx, unsigned flags)
|
||||
{
|
||||
switch(flags & (MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_MASK)) {
|
||||
case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_0: return &ASTERISK_OPENERS_intraword_mod3_0;
|
||||
case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_1: return &ASTERISK_OPENERS_intraword_mod3_1;
|
||||
case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_2: return &ASTERISK_OPENERS_intraword_mod3_2;
|
||||
case MD_MARK_EMPH_MOD3_0: return &ASTERISK_OPENERS_extraword_mod3_0;
|
||||
case MD_MARK_EMPH_MOD3_1: return &ASTERISK_OPENERS_extraword_mod3_1;
|
||||
case MD_MARK_EMPH_MOD3_2: return &ASTERISK_OPENERS_extraword_mod3_2;
|
||||
default: MD_UNREACHABLE();
|
||||
}
|
||||
}
|
||||
|
||||
static MD_MARKCHAIN*
|
||||
md_mark_chain(MD_CTX* ctx, int mark_index)
|
||||
{
|
||||
MD_MARK* mark = &ctx->marks[mark_index];
|
||||
|
||||
switch(mark->ch) {
|
||||
case _T('*'): return md_asterisk_chain(ctx, mark->flags);
|
||||
case _T('_'): return &UNDERSCORE_OPENERS;
|
||||
case _T('~'): return &TILDE_OPENERS;
|
||||
case _T('['): return &BRACKET_OPENERS;
|
||||
case _T('|'): return &TABLECELLBOUNDARIES;
|
||||
default: return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static MD_MARK*
|
||||
md_push_mark(MD_CTX* ctx)
|
||||
@ -2658,16 +2691,11 @@ md_rollback(MD_CTX* ctx, int opener_index, int closer_index, int how)
|
||||
MD_MARKCHAIN* chain;
|
||||
|
||||
mark_opener->flags &= ~(MD_MARK_OPENER | MD_MARK_CLOSER | MD_MARK_RESOLVED);
|
||||
|
||||
switch(mark_opener->ch) {
|
||||
case '*': chain = &ASTERISK_OPENERS; break;
|
||||
case '_': chain = &UNDERSCORE_OPENERS; break;
|
||||
case '~': chain = &TILDE_OPENERS; break;
|
||||
default: MD_UNREACHABLE(); break;
|
||||
chain = md_mark_chain(ctx, opener_index);
|
||||
if(chain != NULL) {
|
||||
md_mark_chain_append(ctx, chain, mark_opener_index);
|
||||
discard_flag = 1;
|
||||
}
|
||||
md_mark_chain_append(ctx, chain, mark_opener_index);
|
||||
|
||||
discard_flag = 1;
|
||||
}
|
||||
}
|
||||
|
||||
@ -3009,9 +3037,9 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
|
||||
* split the mark when being later resolved partially by some
|
||||
* shorter closer. */
|
||||
switch((tmp - off) % 3) {
|
||||
case 0: flags |= MD_MARK_EMPH_MODULO3_0; break;
|
||||
case 1: flags |= MD_MARK_EMPH_MODULO3_1; break;
|
||||
case 2: flags |= MD_MARK_EMPH_MODULO3_2; break;
|
||||
case 0: flags |= MD_MARK_EMPH_MOD3_0; break;
|
||||
case 1: flags |= MD_MARK_EMPH_MOD3_1; break;
|
||||
case 2: flags |= MD_MARK_EMPH_MOD3_2; break;
|
||||
}
|
||||
|
||||
PUSH_MARK(ch, off, tmp, flags);
|
||||
@ -3504,7 +3532,7 @@ md_analyze_table_cell_boundary(MD_CTX* ctx, int mark_index)
|
||||
* follows.
|
||||
*/
|
||||
static int
|
||||
md_split_simple_pairing_mark(MD_CTX* ctx, int mark_index, SZ n)
|
||||
md_split_emph_mark(MD_CTX* ctx, int mark_index, SZ n)
|
||||
{
|
||||
MD_MARK* mark = &ctx->marks[mark_index];
|
||||
int new_mark_index = mark_index + (mark->end - mark->beg - n);
|
||||
@ -3521,8 +3549,82 @@ md_split_simple_pairing_mark(MD_CTX* ctx, int mark_index, SZ n)
|
||||
}
|
||||
|
||||
static void
|
||||
md_analyze_simple_pairing_mark(MD_CTX* ctx, MD_MARKCHAIN* chain, int mark_index,
|
||||
int apply_rule_of_three)
|
||||
md_analyze_emph(MD_CTX* ctx, int mark_index)
|
||||
{
|
||||
MD_MARK* mark = &ctx->marks[mark_index];
|
||||
MD_MARKCHAIN* chain = md_mark_chain(ctx, mark_index);
|
||||
|
||||
/* If we can be a closer, try to resolve with the preceding opener. */
|
||||
if(mark->flags & MD_MARK_POTENTIAL_CLOSER) {
|
||||
MD_MARK* opener = NULL;
|
||||
int opener_index;
|
||||
|
||||
if(mark->ch == _T('*')) {
|
||||
MD_MARKCHAIN* opener_chains[6];
|
||||
int i, n_opener_chains;
|
||||
unsigned flags = mark->flags;
|
||||
|
||||
/* Apply "rule of three". (This is why we break asterisk opener
|
||||
* marks into multiple chains.) */
|
||||
n_opener_chains = 0;
|
||||
if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_0)
|
||||
opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_0;
|
||||
if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
|
||||
opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_1;
|
||||
if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
|
||||
opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_2;
|
||||
if(!(flags & MD_MARK_EMPH_INTRAWORD) || (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_0)
|
||||
opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_0;
|
||||
if(!(flags & MD_MARK_EMPH_INTRAWORD) || (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
|
||||
opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_1;
|
||||
if(!(flags & MD_MARK_EMPH_INTRAWORD) || (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
|
||||
opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_2;
|
||||
|
||||
/* Opener is the most recent mark from the allowed chains. */
|
||||
for(i = 0; i < n_opener_chains; i++) {
|
||||
if(opener_chains[i]->tail >= 0) {
|
||||
int tmp_index = opener_chains[i]->tail;
|
||||
MD_MARK* tmp_mark = &ctx->marks[tmp_index];
|
||||
if(opener == NULL || tmp_mark->end > opener->end) {
|
||||
opener_index = tmp_index;
|
||||
opener = tmp_mark;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* Simple emph. mark */
|
||||
if(chain->tail >= 0) {
|
||||
opener_index = chain->tail;
|
||||
opener = &ctx->marks[opener_index];
|
||||
}
|
||||
}
|
||||
|
||||
/* Resolve, if we have found matching opener. */
|
||||
if(opener != NULL) {
|
||||
SZ opener_size = opener->end - opener->beg;
|
||||
SZ closer_size = mark->end - mark->beg;
|
||||
|
||||
if(opener_size > closer_size) {
|
||||
opener_index = md_split_emph_mark(ctx, opener_index, closer_size);
|
||||
md_mark_chain_append(ctx, md_mark_chain(ctx, opener_index), opener_index);
|
||||
} else if(opener_size < closer_size) {
|
||||
md_split_emph_mark(ctx, mark_index, closer_size - opener_size);
|
||||
}
|
||||
|
||||
md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_CROSSING);
|
||||
md_resolve_range(ctx, chain, opener_index, mark_index);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* If we could not resolve as closer, we may be yet be an opener. */
|
||||
if(mark->flags & MD_MARK_POTENTIAL_OPENER)
|
||||
md_mark_chain_append(ctx, chain, mark_index);
|
||||
}
|
||||
|
||||
#if 0
|
||||
static void
|
||||
md_analyze_emph(MD_CTX* ctx, MD_MARKCHAIN* chains, int n_chains, int mark_index)
|
||||
{
|
||||
MD_MARK* mark = &ctx->marks[mark_index];
|
||||
|
||||
@ -3538,10 +3640,10 @@ md_analyze_simple_pairing_mark(MD_CTX* ctx, MD_MARKCHAIN* chain, int mark_index,
|
||||
while((mark->flags & MD_MARK_EMPH_INTRAWORD) || (opener->flags & MD_MARK_EMPH_INTRAWORD)) {
|
||||
SZ opener_orig_size_modulo3;
|
||||
|
||||
switch(opener->flags & MD_MARK_EMPH_MODULO3_MASK) {
|
||||
case MD_MARK_EMPH_MODULO3_0: opener_orig_size_modulo3 = 0; break;
|
||||
case MD_MARK_EMPH_MODULO3_1: opener_orig_size_modulo3 = 1; break;
|
||||
case MD_MARK_EMPH_MODULO3_2: opener_orig_size_modulo3 = 2; break;
|
||||
switch(opener->flags & MD_MARK_EMPH_MOD3_MASK) {
|
||||
case MD_MARK_EMPH_MOD3_0: opener_orig_size_modulo3 = 0; break;
|
||||
case MD_MARK_EMPH_MOD3_1: opener_orig_size_modulo3 = 1; break;
|
||||
case MD_MARK_EMPH_MOD3_2: opener_orig_size_modulo3 = 2; break;
|
||||
default: MD_UNREACHABLE(); break;
|
||||
}
|
||||
|
||||
@ -3564,10 +3666,10 @@ md_analyze_simple_pairing_mark(MD_CTX* ctx, MD_MARKCHAIN* chain, int mark_index,
|
||||
}
|
||||
|
||||
if(opener_size > closer_size) {
|
||||
opener_index = md_split_simple_pairing_mark(ctx, opener_index, closer_size);
|
||||
opener_index = md_split_emph_mark(ctx, opener_index, closer_size);
|
||||
md_mark_chain_append(ctx, chain, opener_index);
|
||||
} else if(opener_size < closer_size) {
|
||||
md_split_simple_pairing_mark(ctx, mark_index, closer_size - opener_size);
|
||||
md_split_emph_mark(ctx, mark_index, closer_size - opener_size);
|
||||
}
|
||||
|
||||
md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_CROSSING);
|
||||
@ -3581,18 +3683,7 @@ cannot_resolve:
|
||||
if(mark->flags & MD_MARK_POTENTIAL_OPENER)
|
||||
md_mark_chain_append(ctx, chain, mark_index);
|
||||
}
|
||||
|
||||
static inline void
|
||||
md_analyze_asterisk(MD_CTX* ctx, int mark_index)
|
||||
{
|
||||
md_analyze_simple_pairing_mark(ctx, &ASTERISK_OPENERS, mark_index, 1);
|
||||
}
|
||||
|
||||
static inline void
|
||||
md_analyze_underscore(MD_CTX* ctx, int mark_index)
|
||||
{
|
||||
md_analyze_simple_pairing_mark(ctx, &UNDERSCORE_OPENERS, mark_index, 1);
|
||||
}
|
||||
#endif
|
||||
|
||||
static void
|
||||
md_analyze_tilde(MD_CTX* ctx, int mark_index)
|
||||
@ -3763,8 +3854,8 @@ md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
|
||||
case ']': md_analyze_bracket(ctx, i); break;
|
||||
case '&': md_analyze_entity(ctx, i); break;
|
||||
case '|': md_analyze_table_cell_boundary(ctx, i); break;
|
||||
case '*': md_analyze_asterisk(ctx, i); break;
|
||||
case '_': md_analyze_underscore(ctx, i); break;
|
||||
case '_': /* Pass through. */
|
||||
case '*': md_analyze_emph(ctx, i); break;
|
||||
case '~': md_analyze_tilde(ctx, i); break;
|
||||
case '.': /* Pass through. */
|
||||
case ':': md_analyze_permissive_url_autolink(ctx, i); break;
|
||||
@ -3823,8 +3914,18 @@ md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
|
||||
int mark_beg, int mark_end)
|
||||
{
|
||||
md_analyze_marks(ctx, lines, n_lines, mark_beg, mark_end, _T("*_~@:."));
|
||||
ASTERISK_OPENERS.head = -1;
|
||||
ASTERISK_OPENERS.tail = -1;
|
||||
ASTERISK_OPENERS_extraword_mod3_0.head = -1;
|
||||
ASTERISK_OPENERS_extraword_mod3_0.tail = -1;
|
||||
ASTERISK_OPENERS_extraword_mod3_1.head = -1;
|
||||
ASTERISK_OPENERS_extraword_mod3_1.tail = -1;
|
||||
ASTERISK_OPENERS_extraword_mod3_2.head = -1;
|
||||
ASTERISK_OPENERS_extraword_mod3_2.tail = -1;
|
||||
ASTERISK_OPENERS_intraword_mod3_0.head = -1;
|
||||
ASTERISK_OPENERS_intraword_mod3_0.tail = -1;
|
||||
ASTERISK_OPENERS_intraword_mod3_1.head = -1;
|
||||
ASTERISK_OPENERS_intraword_mod3_1.tail = -1;
|
||||
ASTERISK_OPENERS_intraword_mod3_2.head = -1;
|
||||
ASTERISK_OPENERS_intraword_mod3_2.tail = -1;
|
||||
UNDERSCORE_OPENERS.head = -1;
|
||||
UNDERSCORE_OPENERS.tail = -1;
|
||||
TILDE_OPENERS.head = -1;
|
||||
|
@ -30,6 +30,9 @@ pathological = {
|
||||
"many emph openers with no closers":
|
||||
(("_a " * 65000),
|
||||
re.compile("(_a ){64999}_a")),
|
||||
"many 3-emph openers with no closers":
|
||||
(("a***" * 65000),
|
||||
re.compile("(a[*][*][*]){65000}")),
|
||||
"many link closers with no openers":
|
||||
(("a]" * 65000),
|
||||
re.compile("(a\]){65000}")),
|
||||
|
Loading…
Reference in New Issue
Block a user