From 318b594bede8c3a785f035a4bbb1dd989962a82d Mon Sep 17 00:00:00 2001
From: "Meredith L. Patterson" <clonearmy@gmail.com>
Date: Tue, 1 May 2012 00:33:47 +0100
Subject: [PATCH] Parser combinators are well underway. Ones that are now
 finished are: * token: matches a sequence of bytes (with length) * ch:
 matches a single byte * range: matches any byte within the range [lower,
 upper] (inclusive) * join_action: joins the results of another parser with a
 separator * negate: matches the opposite of any single-character parser *
 end_p: succeeds if there's no input left to parse * nothing_p: always fails

One other big change: the AST is now a GSequence of parsed_token_t's.
WARNING: This is not actually enforced, because C.

Also tweaked the makefile a little (which will get clobbered in TQ's next
commit) and added some documentary comments to hammer.h.
---
Review fixups folded into this revision of the patch (this note is not part
of the commit message; the blob hashes on the "index" lines predate it):
 * make_result is now static inline, so C99 inline semantics cannot leave an
   undefined symbol at link time.
 * token_t.len is a size_t, matching token(); it used to truncate to 8 bits.
 * ch/range/negate results now point at a real one-byte buffer instead of
   storing the byte value inside the token pointer, which parse_join would
   have dereferenced.
 * join_collect accumulates through a size_t*; the by-value gpointer never
   reported a length, so the joined buffer was undersized.
 * parse_join checks for a failed/empty inner result, uses a typed cursor
   instead of void* arithmetic, and returns the joined token rather than the
   last input token.
 * parse_negate rewinds the input before reading the byte it returns.
 * nothing_p returns a parser that always fails, not a NULL parser.

 common.mk    |   2 +-
 src/Makefile |   1 +
 src/hammer.c | 232 +++++++++++++++++++++++++++++++++++++++++++++++++--
 src/hammer.h |  35 +++++++--
 4 files changed, 254 insertions(+), 16 deletions(-)

diff --git a/common.mk b/common.mk
index ca794304..70f453b9 100644
--- a/common.mk
+++ b/common.mk
@@ -1,4 +1,4 @@
-CFLAGS := $(shell pkg-config --cflags glib-2.0) -std=c99
+CFLAGS := $(shell pkg-config --cflags glib-2.0) -std=gnu99
 LDFLAGS := $(shell pkg-config --libs glib-2.0)
 CC := gcc
 
diff --git a/src/Makefile b/src/Makefile
index 69a3e2ef..e9f76f2b 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,6 +1,7 @@
 -include ../common.mk
 
 OUTPUTS := bitreader.o \
+           hammer.o \
            libhammer.a \
            test_suite
 
diff --git a/src/hammer.c b/src/hammer.c
index 260bd397..b0671867 100644
--- a/src/hammer.c
+++ b/src/hammer.c
@@ -16,6 +16,8 @@
  */
 
 #include "hammer.h"
+#include "internal.h"
+#include <assert.h>
 #include <string.h>
 /* TODO(thequux): rewrite to follow new parse_state_t layout
 parse_state_t* from(parse_state_t *ps, const size_t index) {
@@ -84,16 +86,230 @@ int put_cached(parse_state_t *ps, const parser_t *p, parse_result_t *cached) {
   }
 }
 
-const parser_t* token(const uint8_t *s) { return NULL; }
-const parser_t* ch(const uint8_t c) { return NULL; }
-const parser_t* range(const uint8_t lower, const uint8_t upper) { return NULL; }
+parse_result_t* do_parse(const parser_t* parser, parse_state_t *state);
+
+/* Helper function, since these lines appear in every parser: wraps an AST
+ * (which may be NULL) in a freshly allocated parse_result_t. Declared static
+ * because a plain C99 `inline` definition emits no external symbol, which
+ * breaks the link whenever the compiler chooses not to inline a call. */
+static inline parse_result_t* make_result(GSequence *ast) {
+  parse_result_t *ret = g_new(parse_result_t, 1);
+  ret->ast = ast;
+  return ret;
+}
+
+/* Helper for the single-byte parsers (ch, range, negate): returns a result
+ * whose AST holds one token pointing at a heap copy of the matched byte, so
+ * that parsed_token_t.token is always a readable buffer of len bytes (which
+ * parse_join relies on when it memcpy()s tokens together). */
+static parse_result_t* make_byte_result(const uint8_t byte) {
+  uint8_t *buf = g_new(uint8_t, 1);
+  *buf = byte;
+  parsed_token_t *tok = g_new(parsed_token_t, 1);
+  tok->token = buf; tok->len = 1;
+  GSequence *ast = g_sequence_new(NULL);
+  g_sequence_append(ast, tok);
+  return make_result(ast);
+}
+
+typedef struct {
+  uint8_t *str;
+  size_t len; /* size_t to match token(); uint8_t truncated lengths > 255 */
+} token_t;
+
+/* env is a token_t. Succeeds, with a one-token AST, if the next len bytes of
+ * input equal str; fails (NULL) at the first mismatching byte. */
+static parse_result_t* parse_token(void *env, parse_state_t *state) {
+  token_t *t = (token_t*)env;
+  for (size_t i=0; i<t->len; ++i) {
+    uint8_t chr = (uint8_t)read_bits(&state->input_stream, 8, false);
+    if (t->str[i] != chr) {
+      return NULL;
+    }
+  }
+  parsed_token_t *tok = g_new(parsed_token_t, 1);
+  tok->token = t->str; tok->len = t->len;
+  GSequence *ast = g_sequence_new(NULL);
+  g_sequence_append(ast, tok);
+  return make_result(ast);
+}
+
+const parser_t* token(const uint8_t *str, const size_t len) {
+  token_t *t = g_new(token_t, 1);
+  t->str = (uint8_t*)str; t->len = len;
+  parser_t *ret = g_new(parser_t, 1);
+  ret->fn = parse_token; ret->env = t;
+  return (const parser_t*)ret;
+}
+
+/* env is the expected byte itself, packed into the pointer by ch(). */
+static parse_result_t* parse_ch(void* env, parse_state_t *state) {
+  uint8_t c = (uint8_t)GPOINTER_TO_UINT(env);
+  uint8_t r = (uint8_t)read_bits(&state->input_stream, 8, false);
+  if (c == r) {
+    return make_byte_result(r);
+  } else {
+    return NULL;
+  }
+}
+
+const parser_t* ch(const uint8_t c) {
+  parser_t *ret = g_new(parser_t, 1);
+  ret->fn = parse_ch; ret->env = GUINT_TO_POINTER(c);
+  return (const parser_t*)ret;
+}
+
+typedef struct {
+  uint8_t lower;
+  uint8_t upper;
+} range_t;
+
+/* env is a range_t; matches one byte r with lower <= r <= upper. */
+static parse_result_t* parse_range(void* env, parse_state_t *state) {
+  range_t *range = (range_t*)env;
+  uint8_t r = (uint8_t)read_bits(&state->input_stream, 8, false);
+  if (range->lower <= r && range->upper >= r) {
+    return make_byte_result(r);
+  } else {
+    return NULL;
+  }
+}
+
+const parser_t* range(const uint8_t lower, const uint8_t upper) {
+  range_t *r = g_new(range_t, 1);
+  r->lower = lower; r->upper = upper;
+  parser_t *ret = g_new(parser_t, 1);
+  ret->fn = parse_range; ret->env = (void*)r;
+  return (const parser_t*)ret;
+}
 const parser_t* whitespace(const parser_t* p) { return NULL; }
 //const parser_t* action(const parser_t* p, /* fptr to action on AST */) { return NULL; }
-const parser_t* join_action(const parser_t* p, const uint8_t *sep) { return NULL; }
-const parser_t* left_faction_action(const parser_t* p) { return NULL; }
-const parser_t* negate(const parser_t* p) { return NULL; }
-const parser_t* end_p() { return NULL; }
-const parser_t* nothing_p() { return NULL; }
+
+typedef struct {
+  parser_t *parser;
+  uint8_t *sep;
+  size_t len;
+} join_t;
+
+/* GFunc for g_sequence_foreach(): adds one token's length to the size_t
+ * that ret points at. ret has to be the accumulator's address -- a gpointer
+ * passed by value cannot carry the running total back to the caller. */
+static void join_collect(gpointer tok, gpointer ret) {
+  size_t *total = (size_t*)ret;
+  *total += ((parsed_token_t*)tok)->len;
+}
+
+/* env is a join_t. Runs the wrapped parser and concatenates the tokens of
+ * its AST, separated by sep, into a single token. Fails (NULL) if the wrapped
+ * parser fails or produces no tokens. */
+static parse_result_t* parse_join(void *env, parse_state_t *state) {
+  join_t *j = (join_t*)env;
+  parse_result_t *result = do_parse(j->parser, state);
+  // the wrapped parser can fail outright, or succeed without an AST (end_p)
+  if (NULL == result || NULL == result->ast) {
+    return NULL;
+  }
+  GSequence *tokens = (GSequence*)result->ast;
+  size_t num_tokens = g_sequence_get_length(tokens);
+  if (0 < num_tokens) {
+    size_t tok_len = 0;
+    // aggregate length of tokens in AST
+    g_sequence_foreach(tokens, join_collect, &tok_len);
+    // plus aggregate length of all separators
+    size_t ret_len = tok_len + (num_tokens - 1) * j->len;
+    // one spare byte, so the buffer is never NULL and is NUL-terminated
+    uint8_t *ret_str = g_malloc(ret_len + 1);
+    uint8_t *cursor = ret_str;
+    // first the first token ...
+    GSequenceIter *it = g_sequence_get_begin_iter(tokens);
+    parsed_token_t *tok = g_sequence_get(it);
+    memcpy(cursor, tok->token, tok->len);
+    cursor += tok->len;
+    // if there was only one token, don't enter the while loop
+    it = g_sequence_iter_next(it);
+    while (!g_sequence_iter_is_end(it)) {
+      // add a separator
+      memcpy(cursor, j->sep, j->len);
+      cursor += j->len;
+      // then the next token
+      tok = g_sequence_get(it);
+      memcpy(cursor, tok->token, tok->len);
+      // finally, advance the pointer and the iterator
+      cursor += tok->len;
+      it = g_sequence_iter_next(it);
+    }
+    *cursor = '\0';
+    // wrap the joined buffer (not the last input token) in the result
+    parsed_token_t *ret_tok = g_new(parsed_token_t, 1);
+    ret_tok->token = ret_str; ret_tok->len = ret_len;
+    GSequence *ast = g_sequence_new(NULL);
+    g_sequence_append(ast, ret_tok);
+    return make_result(ast);
+  } else {
+    return NULL;
+  }
+}
+
+const parser_t* join_action(const parser_t* p, const uint8_t *sep, const size_t len) {
+  join_t *j = g_new(join_t, 1);
+  j->parser = (parser_t*)p; j->sep = (uint8_t*)sep; j->len = len;
+  parser_t *ret = g_new(parser_t, 1);
+  ret->fn = parse_join; ret->env = (void*)j;
+  return (const parser_t*)ret;
+}
+
+const parser_t* left_factor_action(const parser_t* p) { return NULL; }
+
+/* env is the wrapped single-byte parser. Succeeds with the next input byte
+ * exactly when the wrapped parser rejects that byte. */
+static parse_result_t* parse_negate(void *env, parse_state_t *state) {
+  parser_t *p = (parser_t*)env;
+  // do_parse may leave the stream past the byte p tested; remember the start
+  input_stream_t start = state->input_stream;
+  parse_result_t *result = do_parse(p, state);
+  if (NULL == result) {
+    // rewind: return the byte p rejected, not the one following it
+    state->input_stream = start;
+    uint8_t r = (uint8_t)read_bits(&state->input_stream, 8, false);
+    return make_byte_result(r);
+  } else {
+    return NULL;
+  }
+}
+
+const parser_t* negate(const parser_t* p) {
+  assert(parse_ch == p->fn || parse_range == p->fn);
+  parser_t *ret = g_new(parser_t, 1);
+  ret->fn = parse_negate; ret->env = (void*)p;
+  return (const parser_t*)ret;
+}
+
+static parse_result_t* parse_end(void *env, parse_state_t *state) {
+  if (state->input_stream.index == state->input_stream.length) {
+    return make_result(NULL);
+  } else {
+    return NULL;
+  }
+}
+
+const parser_t* end_p() {
+  parser_t *ret = g_new(parser_t, 1);
+  ret->fn = parse_end; ret->env = NULL;
+  return (const parser_t*)ret;
+}
+
+static parse_result_t* parse_nothing(void *env, parse_state_t *state) {
+  // not a mistake, this parser always fails
+  return NULL;
+}
+
+/* Returns a real parser object (rather than NULL) so that combinators which
+ * dereference their argument can safely be handed nothing_p(). */
+const parser_t* nothing_p() {
+  parser_t *ret = g_new(parser_t, 1);
+  ret->fn = parse_nothing; ret->env = NULL;
+  return (const parser_t*)ret;
+}
 const parser_t* sequence(const parser_t* p_array[]) { return NULL; }
 const parser_t* choice(const parser_t* p_array[]) { return NULL; }
 const parser_t* butnot(const parser_t* p1, const parser_t* p2) { return NULL; }
diff --git a/src/hammer.h b/src/hammer.h
index 1a06ffd9..4f652a43 100644
--- a/src/hammer.h
+++ b/src/hammer.h
@@ -54,28 +54,49 @@ typedef struct parse_state {
   input_stream_t input_stream;
 } parse_state_t;
 
+typedef struct parsed_token {
+  const uint8_t *token;
+  size_t len;
+} parsed_token_t;
+
 typedef struct parse_result {
-  const uint8_t *remaining;
-  const uint8_t *matched;
   const GSequence *ast;
 } parse_result_t;
 
 typedef struct parser {
-  parse_result_t* (*fn)(void* env, parse_state_t *state);
-  void* env;
+  parse_result_t* (*fn)(void *env, parse_state_t *state);
+  void *env;
 } parser_t;
 
 parse_result_t* parse(const parser_t* parser, const uint8_t* input);
 
-const parser_t* token(const uint8_t *s);
+/* Given a string, returns a parser that parses that string value. */
+const parser_t* token(const uint8_t *str, const size_t len);
+
+/* Given a single character, returns a parser that parses that character. */
 const parser_t* ch(const uint8_t c);
+
+/* Given two single-character bounds, lower and upper, returns a parser that parses a single character within the range [lower, upper] (inclusive). */
 const parser_t* range(const uint8_t lower, const uint8_t upper);
+
+/* Given another parser, p, returns a parser that skips any whitespace and then applies p. */
 const parser_t* whitespace(const parser_t* p);
+
+/* Given another parser, p, and a function f, returns a parser that applies p, then applies f to everything in the AST of p's result. */
 //const parser_t* action(const parser_t* p, /* fptr to action on AST */);
-const parser_t* join_action(const parser_t* p, const uint8_t *sep);
-const parser_t* left_faction_action(const parser_t* p);
+
+/* Given another parser, p, and a separator, sep, returns a parser that applies p, then joins everything in the AST of p's result with sep. For example, if the AST of p's result is {"dog", "cat", "hedgehog"} and sep is "|", the AST of this parser's result will be {"dog|cat|hedgehog"}. */
+const parser_t* join_action(const parser_t* p, const uint8_t *sep, const size_t len);
+
+const parser_t* left_factor_action(const parser_t* p);
+
+/* Given a single-character parser, p, returns a single-character parser that will parse any character *other* than the character p would parse. */
 const parser_t* negate(const parser_t* p);
+
+/* A no-argument parser that succeeds if there is no more input to parse. */
 const parser_t* end_p();
+
+/* This parser always fails. */
 const parser_t* nothing_p();
 const parser_t* sequence(const parser_t* p_array[]);
 const parser_t* choice(const parser_t* p_array[]);
-- 
GitLab