Skip to content
Snippets Groups Projects
Commit 318b594b authored by Meredith L. Patterson
Browse files

Parser combinators are well underway. Ones that are now finished are:

 * token: matches a sequence of bytes (with length)
 * ch: matches a single byte
 * range: matches any byte within the range [lower, upper] (inclusive)
 * join_action: joins the results of another parser with a separator
 * negate: matches the opposite of any single-character parser
 * end_p: succeeds if there's no input left to parse
 * nothing_p: always fails

One other big change: the AST is now a GSequence of parsed_token_t's. WARNING: This is not actually enforced, because C.

Also tweaked the makefile a little (which will get clobbered in TQ's next commit) and added some documentary comments to hammer.h.
parent dfd8cf2a
No related branches found
No related tags found
No related merge requests found
CFLAGS := $(shell pkg-config --cflags glib-2.0) -std=c99
CFLAGS := $(shell pkg-config --cflags glib-2.0) -std=gnu99
LDFLAGS := $(shell pkg-config --libs glib-2.0)
CC := gcc
......
-include ../common.mk
OUTPUTS := bitreader.o \
hammer.o \
libhammer.a \
test_suite
......
......@@ -16,6 +16,8 @@
*/
#include "hammer.h"
#include "internal.h"
#include <assert.h>
#include <string.h>
/* TODO(thequux): rewrite to follow new parse_state_t layout
parse_state_t* from(parse_state_t *ps, const size_t index) {
......@@ -84,16 +86,196 @@ int put_cached(parse_state_t *ps, const parser_t *p, parse_result_t *cached) {
}
}
const parser_t* token(const uint8_t *s) { return NULL; }
const parser_t* ch(const uint8_t c) { return NULL; }
const parser_t* range(const uint8_t lower, const uint8_t upper) { return NULL; }
parse_result_t* do_parse(const parser_t* parser, parse_state_t *state);
/* Helper shared by the parse functions: wraps an AST in a freshly allocated
 * parse_result_t.
 *   ast - the sequence to store in the result (may be NULL)
 * Returns the new result; only its ast field is assigned here.
 *
 * Declared static inline: a plain `inline` definition provides no external
 * definition under C99/gnu99 rules, so any call the compiler chooses not to
 * inline would fail at link time. */
static inline parse_result_t* make_result(GSequence *ast) {
  parse_result_t *ret = g_new(parse_result_t, 1);
  ret->ast = ast;
  return ret;
}
/* Environment of a token parser: the byte string to match and its length.
 * len is a size_t, the same type token() receives, so lengths above 255 are
 * stored without truncation. */
typedef struct {
  uint8_t *str;
  size_t len;
} token_t;
/* Parse function behind token().
 *   env   - the token_t describing the expected bytes
 *   state - parse state whose input_stream is read 8 bits at a time
 * Returns NULL at the first byte that differs from the expected string;
 * otherwise a result whose AST holds one parsed_token_t that points at the
 * expected string (the bytes are not copied). */
static parse_result_t* parse_token(void *env, parse_state_t *state) {
  token_t *expected = (token_t*)env;
  int pos = 0;
  while (pos < expected->len) {
    uint8_t got = (uint8_t)read_bits(&state->input_stream, 8, false);
    if (got != expected->str[pos]) {
      return NULL;
    }
    ++pos;
  }

  parsed_token_t *matched = g_new(parsed_token_t, 1);
  matched->len = expected->len;
  matched->token = expected->str;

  GSequence *seq = g_sequence_new(NULL);
  g_sequence_append(seq, matched);
  return make_result(seq);
}
/* Builds a parser that matches the len bytes starting at str.
 * The pointer is stored as-is in the parser's environment; the bytes are not
 * copied here. */
const parser_t* token(const uint8_t *str, const size_t len) {
  token_t *expected = g_new(token_t, 1);
  expected->str = (uint8_t*)str;
  expected->len = len;

  parser_t *p = g_new(parser_t, 1);
  p->env = expected;
  p->fn = parse_token;
  return p;
}
/* Parse function behind ch().
 *   env   - the expected byte, packed into the pointer value by ch()
 *   state - parse state whose input_stream supplies the next 8 bits
 * Returns NULL when the byte read differs from the expected one; otherwise a
 * result whose AST holds a single one-byte parsed_token_t.
 *
 * NOTE(review): the token's `token` field carries the byte value itself
 * (via GUINT_TO_POINTER), not the address of a buffer, so it must not be
 * dereferenced. */
static parse_result_t* parse_ch(void* env, parse_state_t *state) {
  /* Unpack with the GLib macro instead of casting a pointer straight to an
   * 8-bit integer, which is a size-mismatched pointer-to-integer cast. */
  uint8_t c = (uint8_t)GPOINTER_TO_UINT(env);
  uint8_t r = (uint8_t)read_bits(&state->input_stream, 8, false);
  if (c != r) {
    return NULL;
  }
  parsed_token_t *tok = g_new(parsed_token_t, 1);
  tok->token = GUINT_TO_POINTER(c);
  tok->len = 1;
  GSequence *ast = g_sequence_new(NULL);
  g_sequence_append(ast, tok);
  return make_result(ast);
}
/* Builds a parser that matches the single byte c.
 * The byte is packed into the parser's env pointer with GUINT_TO_POINTER
 * rather than a direct integer-to-pointer cast of mismatched size; parse_ch
 * reads it back out of env. */
const parser_t* ch(const uint8_t c) {
  parser_t *ret = g_new(parser_t, 1);
  ret->fn = parse_ch;
  ret->env = GUINT_TO_POINTER(c);
  return (const parser_t*)ret;
}
/* Environment of a range parser: the two bounds that parse_range compares
 * each input byte against (both comparisons are inclusive). */
typedef struct {
/* smallest byte value accepted */
uint8_t lower;
/* largest byte value accepted */
uint8_t upper;
} range_t;
/* Parse function behind range().
 *   env   - the range_t holding the inclusive bounds
 *   state - parse state whose input_stream supplies the next 8 bits
 * Returns NULL when the byte read lies outside [lower, upper]; otherwise a
 * result whose AST holds a single one-byte parsed_token_t carrying the byte
 * value packed into its `token` pointer. */
static parse_result_t* parse_range(void* env, parse_state_t *state) {
  range_t *bounds = (range_t*)env;
  uint8_t byte = (uint8_t)read_bits(&state->input_stream, 8, false);

  /* guard clause: reject anything outside the inclusive bounds */
  if (byte < bounds->lower || byte > bounds->upper) {
    return NULL;
  }

  parsed_token_t *matched = g_new(parsed_token_t, 1);
  matched->len = 1;
  matched->token = GUINT_TO_POINTER(byte);

  GSequence *seq = g_sequence_new(NULL);
  g_sequence_append(seq, matched);
  return make_result(seq);
}
/* Builds a parser that matches one byte within [lower, upper] (inclusive).
 * The bounds are stored in a newly allocated range_t that becomes the
 * parser's environment. */
const parser_t* range(const uint8_t lower, const uint8_t upper) {
  range_t *bounds = g_new(range_t, 1);
  bounds->upper = upper;
  bounds->lower = lower;

  parser_t *p = g_new(parser_t, 1);
  p->env = (void*)bounds;
  p->fn = parse_range;
  return p;
}
/* Placeholder: not implemented yet, so no parser is built and NULL is
 * returned regardless of p. */
const parser_t* whitespace(const parser_t* p) {
  return NULL;
}
//const parser_t* action(const parser_t* p, /* fptr to action on AST */) { return NULL; }
const parser_t* join_action(const parser_t* p, const uint8_t *sep) { return NULL; }
const parser_t* left_faction_action(const parser_t* p) { return NULL; }
const parser_t* negate(const parser_t* p) { return NULL; }
const parser_t* end_p() { return NULL; }
const parser_t* nothing_p() { return NULL; }
/* Environment of a join parser, filled in by join_action(). */
typedef struct {
/* parser whose result tokens get joined */
parser_t *parser;
/* separator bytes copied between consecutive tokens */
uint8_t *sep;
/* number of bytes in sep */
size_t len;
} join_t;
/* g_sequence_foreach() callback: adds the length of one parsed_token_t to a
 * running total.
 *   tok - the parsed_token_t stored in the sequence
 *   ret - address of the size_t accumulator
 * The accumulator is passed by address; a size packed into the pointer value
 * itself would only change this function's local copy and be lost. */
void join_collect(gpointer tok, gpointer ret) {
  size_t *total = ret;
  *total += ((parsed_token_t*)tok)->len;
}

/* Copies len bytes from src to dst and returns the position just past them.
 * Zero-length pieces are skipped so memcpy is never handed an empty buffer. */
static uint8_t* join_append(uint8_t *dst, const uint8_t *src, size_t len) {
  if (0 == len) {
    return dst;
  }
  memcpy(dst, src, len);
  return dst + len;
}

/* Parse function behind join_action().
 *   env   - the join_t holding the inner parser and the separator
 *   state - parse state handed on to the inner parser
 * Runs the inner parser and concatenates the tokens of its AST, with the
 * separator between consecutive tokens, into one newly allocated buffer.
 * Returns NULL if the inner parser fails or yields no tokens; otherwise a
 * result whose AST holds exactly one parsed_token_t: the joined bytes.
 *
 * NOTE(review): every token's `token` field is read as an address here. The
 * single-byte parsers in this file pack the byte value into that pointer
 * instead, so joining their output needs confirming before use.
 * NOTE(review): the inner result is not released here; its ownership is not
 * visible from this function. */
static parse_result_t* parse_join(void *env, parse_state_t *state) {
  join_t *j = (join_t*)env;
  parse_result_t *result = do_parse(j->parser, state);
  if (NULL == result || NULL == result->ast) {
    return NULL;
  }

  GSequence *tokens = (GSequence*)result->ast;
  size_t num_tokens = (size_t)g_sequence_get_length(tokens);
  if (0 == num_tokens) {
    return NULL;
  }

  // aggregate length of tokens in AST ...
  size_t ret_len = 0;
  g_sequence_foreach(tokens, join_collect, &ret_len);
  // ... plus aggregate length of all separators
  ret_len += (num_tokens - 1) * j->len;

  // keep the start of the buffer; only the cursor moves while copying
  uint8_t *ret_str = g_malloc(ret_len);
  uint8_t *cursor = ret_str;

  // first the first token ...
  GSequenceIter *it = g_sequence_get_begin_iter(tokens);
  parsed_token_t *tok = g_sequence_get(it);
  cursor = join_append(cursor, tok->token, tok->len);

  // ... then a separator followed by a token for each remaining one
  for (it = g_sequence_iter_next(it);
       !g_sequence_iter_is_end(it);
       it = g_sequence_iter_next(it)) {
    cursor = join_append(cursor, j->sep, j->len);
    tok = g_sequence_get(it);
    cursor = join_append(cursor, tok->token, tok->len);
  }

  // the result carries the joined token, not the last input token
  parsed_token_t *ret_tok = g_new(parsed_token_t, 1);
  ret_tok->token = ret_str;
  ret_tok->len = ret_len;
  GSequence *ast = g_sequence_new(NULL);
  g_sequence_append(ast, ret_tok);
  return make_result(ast);
}
const parser_t* join_action(const parser_t* p, const uint8_t *sep, const size_t len) {
join_t *j = g_new(join_t, 1);
j->parser = (parser_t*)p; j->sep = (uint8_t*)sep; j->len = len;
parser_t *ret = g_new(parser_t, 1);
ret->fn = parse_join; ret->env = (void*)j;
return (const parser_t*)ret;
}
/* Placeholder: not implemented yet, so no parser is built and NULL is
 * returned regardless of p. */
const parser_t* left_factor_action(const parser_t* p) {
  return NULL;
}
/* Parse function behind negate().
 *   env   - the single-character parser being negated
 *   state - parse state; its input_stream is snapshotted before the attempt
 * Runs the inner parser. If it succeeds, this parser fails (returns NULL).
 * If it fails, the input stream is put back to where it was before the
 * attempt and the byte at that position is consumed and returned as a
 * one-byte token. Without the rewind, the byte returned would be whatever
 * follows the input the inner parser consumed while failing.
 *
 * NOTE(review): the inner result is not released on the failure path; its
 * ownership is not visible from this function. */
static parse_result_t* parse_negate(void *env, parse_state_t *state) {
  parser_t *p = (parser_t*)env;
  input_stream_t start = state->input_stream;
  parse_result_t *result = do_parse(p, state);
  if (NULL != result) {
    return NULL;
  }
  // undo whatever the failed attempt consumed before reading our byte
  state->input_stream = start;
  uint8_t r = (uint8_t)read_bits(&state->input_stream, 8, false);
  parsed_token_t *tok = g_new(parsed_token_t, 1);
  tok->token = GUINT_TO_POINTER(r);
  tok->len = 1;
  GSequence *ast = g_sequence_new(NULL);
  g_sequence_append(ast, tok);
  return make_result(ast);
}
/* Builds a parser that matches any single byte p would reject.
 * p must have been created by ch() or range(); this is enforced with an
 * assertion on p's parse function. p itself becomes the new parser's
 * environment. */
const parser_t* negate(const parser_t* p) {
  /* only the single-character parsers can be negated */
  assert(p->fn == parse_ch || p->fn == parse_range);

  parser_t *negated = g_new(parser_t, 1);
  negated->env = (void*)p;
  negated->fn = parse_negate;
  return negated;
}
/* Parse function behind end_p().
 *   env   - unused
 *   state - parse state whose input_stream index and length are compared
 * Returns NULL unless index equals length; in that case returns a new
 * result whose ast is NULL. */
static parse_result_t* parse_end(void *env, parse_state_t *state) {
  if (state->input_stream.index != state->input_stream.length) {
    return NULL;
  }
  parse_result_t *done = g_new(parse_result_t, 1);
  done->ast = NULL;
  return done;
}
/* Builds a parser that succeeds only when parse_end finds the input stream's
 * index equal to its length. The parser has no environment. */
const parser_t* end_p() {
  parser_t *p = g_new(parser_t, 1);
  p->env = NULL;
  p->fn = parse_end;
  return p;
}
const parser_t* nothing_p() {
// not a mistake, this parser always fails
return NULL;
}
/* Placeholder: not implemented yet, so no parser is built and NULL is
 * returned regardless of p_array. */
const parser_t* sequence(const parser_t* p_array[]) {
  return NULL;
}
/* Placeholder: not implemented yet, so no parser is built and NULL is
 * returned regardless of p_array. */
const parser_t* choice(const parser_t* p_array[]) {
  return NULL;
}
/* Placeholder: not implemented yet, so no parser is built and NULL is
 * returned regardless of p1 and p2. */
const parser_t* butnot(const parser_t* p1, const parser_t* p2) {
  return NULL;
}
......
......@@ -54,28 +54,49 @@ typedef struct parse_state {
input_stream_t input_stream;
} parse_state_t;
/* One element of a parse result's AST: a run of bytes and its length. */
typedef struct parsed_token {
/* The matched bytes. NOTE(review): the single-byte parsers appear to pack
 * the byte value into this pointer (GUINT_TO_POINTER) instead of pointing at
 * storage — confirm before dereferencing. */
const uint8_t *token;
/* Number of bytes the token covers. */
size_t len;
} parsed_token_t;
typedef struct parse_result {
const uint8_t *remaining;
const uint8_t *matched;
const GSequence *ast;
} parse_result_t;
typedef struct parser {
parse_result_t* (*fn)(void* env, parse_state_t *state);
void* env;
parse_result_t* (*fn)(void *env, parse_state_t *state);
void *env;
} parser_t;
parse_result_t* parse(const parser_t* parser, const uint8_t* input);
const parser_t* token(const uint8_t *s);
/* Given a string, returns a parser that parses that string value. */
const parser_t* token(const uint8_t *str, const size_t len);
/* Given a single character, returns a parser that parses that character. */
const parser_t* ch(const uint8_t c);
/* Given two single-character bounds, lower and upper, returns a parser that parses a single character within the range [lower, upper] (inclusive). */
const parser_t* range(const uint8_t lower, const uint8_t upper);
/* Given another parser, p, returns a parser that skips any whitespace and then applies p. */
const parser_t* whitespace(const parser_t* p);
/* Given another parser, p, and a function f, returns a parser that applies p, then applies f to everything in the AST of p's result. */
//const parser_t* action(const parser_t* p, /* fptr to action on AST */);
const parser_t* join_action(const parser_t* p, const uint8_t *sep);
const parser_t* left_faction_action(const parser_t* p);
/* Given another parser, p, and a separator, sep, returns a parser that applies p, then joins everything in the AST of p's result with sep. For example, if the AST of p's result is {"dog", "cat", "hedgehog"} and sep is "|", the AST of this parser's result will be {"dog|cat|hedgehog"}. */
const parser_t* join_action(const parser_t* p, const uint8_t *sep, const size_t len);
const parser_t* left_factor_action(const parser_t* p);
/* Given a single-character parser, p, returns a single-character parser that will parse any character *other* than the character p would parse. */
const parser_t* negate(const parser_t* p);
/* A no-argument parser that succeeds if there is no more input to parse. */
const parser_t* end_p();
/* This parser always fails. */
const parser_t* nothing_p();
const parser_t* sequence(const parser_t* p_array[]);
const parser_t* choice(const parser_t* p_array[]);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment