diff --git a/NOTES b/NOTES index edee9d0594059f74dc64e92b52c6f680971c26f1..f1ea86b242fe76435192f0cdf24f922364970746 100644 --- a/NOTES +++ b/NOTES @@ -15,6 +15,20 @@ consistency checks. TODO: Add consistency check to the bitreader -We should support the use of parse-table-based parse methods; add a -parse_compile method that must be called before the newly-created -parser is used. +TODO: We should support the use of parse-table-based parse methods; add a + parse_compile method that must be called before the newly-created + parser is used. + + +Regarding butnot and difference: + +There's a "do what I say, not what I do" variation in how we +implemented these (versus how jsparse did it). His `butnot` succeeds +if p1 and p2 both match and p1's result is longer than p2's, though +the comments say it should succeed if p2's result is longer than +p1's. Also, his `difference` succeeds if p1 and p2 both match, full +stop, returning the result of p2 if p2's result is shorter than p1's +or the result of p1 otherwise, though the comments say it should +succeed if p2's result is shorter than p1's. Whatever; we're doing +what the comments say. 
+ diff --git a/src/hammer.c b/src/hammer.c index 95d8a426a5fb4ef409b9f9104f327a2a8cbed190..7d1dabbcffd7ca77e1c8017e64cd502e05093100 100644 --- a/src/hammer.c +++ b/src/hammer.c @@ -294,7 +294,7 @@ static parse_result_t* parse_butnot(void *env, parse_state_t *state) { // cache the initial state of the input stream input_stream_t start_state = state->input_stream; parse_result_t *r1 = do_parse(parsers->p1, state); - // if r1 is null, bail out early + // if p1 failed, bail out early if (NULL == r1) { return NULL; } @@ -304,7 +304,7 @@ static parse_result_t* parse_butnot(void *env, parse_state_t *state) { parse_result_t *r2 = do_parse(parsers->p2, state); // TODO(mlp): I'm pretty sure the input stream state should be the post-p1 state in all cases state->input_stream = after_p1_state; - // if r2 is null, restore post-p1 state and bail out early + // if p2 failed, restore post-p1 state and bail out early if (NULL == r2) { return r1; } @@ -326,8 +326,76 @@ const parser_t* butnot(const parser_t* p1, const parser_t* p2) { return ret; } -const parser_t* difference(const parser_t* p1, const parser_t* p2) { return NULL; } -const parser_t* xor(const parser_t* p1, const parser_t* p2) { return NULL; } +static parse_result_t* parse_difference(void *env, parse_state_t *state) { + two_parsers_t *parsers = (two_parsers_t*)env; + // cache the initial state of the input stream + input_stream_t start_state = state->input_stream; + parse_result_t *r1 = do_parse(parsers->p1, state); + // if p1 failed, bail out early + if (NULL == r1) { + return NULL; + } + // cache the state after parse #1, since we might have to back up to it + input_stream_t after_p1_state = state->input_stream; + state->input_stream = start_state; + parse_result_t *r2 = do_parse(parsers->p2, state); + // TODO(mlp): I'm pretty sure the input stream state should be the post-p1 state in all cases + state->input_stream = after_p1_state; + // if p2 failed, restore post-p1 state and bail out early + if (NULL == r2) { + 
return r1; + } + size_t r1len = token_length(r1); + size_t r2len = token_length(r2); + // if both match but p1's text is shorter than p2's, fail + if (r1len < r2len) { + return NULL; + } else { + return r1; + } +} + +const parser_t* difference(const parser_t* p1, const parser_t* p2) { + two_parsers_t *env = g_new(two_parsers_t, 1); + env->p1 = p1; env->p2 = p2; + parser_t *ret = g_new(parser_t, 1); + ret->fn = parse_difference; ret->env = (void*)env; + return ret; +} + +static parse_result_t* parse_xor(void *env, parse_state_t *state) { + two_parsers_t *parsers = (two_parsers_t*)env; + // cache the initial state of the input stream + input_stream_t start_state = state->input_stream; + parse_result_t *r1 = do_parse(parsers->p1, state); + input_stream_t after_p1_state = state->input_stream; + // reset input stream, parse again + state->input_stream = start_state; + parse_result_t *r2 = do_parse(parsers->p2, state); + if (NULL == r1) { + if (NULL != r2) { + return r2; + } else { + return NULL; + } + } else { + if (NULL == r2) { + state->input_stream = after_p1_state; + return r1; + } else { + return NULL; + } + } +} + +const parser_t* xor(const parser_t* p1, const parser_t* p2) { + two_parsers_t *env = g_new(two_parsers_t, 1); + env->p1 = p1; env->p2 = p2; + parser_t *ret = g_new(parser_t, 1); + ret->fn = parse_xor; ret->env = (void*)env; + return ret; +} + const parser_t* repeat0(const parser_t* p) { return NULL; } const parser_t* repeat1(const parser_t* p) { return NULL; } const parser_t* repeat_n(const parser_t* p, const size_t n) { return NULL; } diff --git a/src/hammer.h b/src/hammer.h index 87252889d53212d357efcd07b10c5812258b7087..b0a9c1ab0b02cbbf5d75ccddf7af724afa0730fd 100644 --- a/src/hammer.h +++ b/src/hammer.h @@ -26,15 +26,7 @@ * input - the entire string being parsed * index - current position in input * length - size of input - * THE FOLLOWING DESCRIBES HOW JSPARSE DOES IT. OUR MILEAGE MAY VARY. 
- * cache - a hash table describing the state of the parse, including partial parse_results. - * It's actually a hash table of [parser_id, hash_table[index, parse_result]], - * where the parser id is incremented as the parse goes along (parsers that have - * already been applied once don't get a new parser_id ... but the global variable - * still increments? not sure why that is, need to debug some), and the locations - * at which it's been applied are memoized. - * - * In our case, it's a hash table from parser_cache_key_t to parse_state_t. + * cache - a hash table describing the state of the parse, including partial parse_results. It's a hash table from parser_cache_key_t to parse_state_t. * */ #define BYTE_BIG_ENDIAN 0x1 @@ -124,9 +116,22 @@ const parser_t* sequence(const parser_t* p_array[]); /* Given an array of parsers, p_array, apply each parser in order. The first parser to succeed is the result; if no parsers succeed, the parse fails. */ const parser_t* choice(const parser_t* p_array[]); +/* Given two parsers, p1 and p2, this parser succeeds in the following cases: + * - if p1 succeeds and p2 fails + * - if both succeed but p1's result is shorter than p2's + */ const parser_t* butnot(const parser_t* p1, const parser_t* p2); + +/* Given two parsers, p1 and p2, this parser succeeds in the following cases: + * - if p1 succeeds and p2 fails + * - if both succeed but p2's result is shorter than p1's + */ const parser_t* difference(const parser_t* p1, const parser_t* p2); + +/* Given two parsers, p1 and p2, this parser succeeds if *either* p1 or p2 succeeds, but not if they both do. + */ const parser_t* xor(const parser_t* p1, const parser_t* p2); + const parser_t* repeat0(const parser_t* p); const parser_t* repeat1(const parser_t* p); const parser_t* repeat_n(const parser_t* p, const size_t n);