diff --git a/examples/dns.c b/examples/dns.c new file mode 100644 index 0000000000000000000000000000000000000000..027d675c5c5e80f2437882fa2b9adefbce70648d --- /dev/null +++ b/examples/dns.c @@ -0,0 +1,85 @@ +#include "../hammer.h" + +bool is_zero(parse_result_t *p) { /* predicate for attr_bool(): accept only a parsed unsigned value of 0 */ + return (0 == p->ast->uint); +} + +bool validate_dns(parse_result_t *p) { /* message-level predicate for attr_bool(); still a stub */ + return true; /* no semantic checks yet: accept every message rather than fall off the end of a non-void function */ +} + +int main(int argc, char **argv) { + + const parser_t *dns_header = sequence(bits(16, false), // ID + bits(1, false), // QR + bits(4, false), // opcode + bits(1, false), // AA + bits(1, false), // TC + bits(1, false), // RD + bits(1, false), // RA + ignore(attr_bool(bits(3, false), is_zero)), // Z + bits(4, false), // RCODE + uint16(), // QDCOUNT + uint16(), // ANCOUNT + uint16(), // NSCOUNT + uint16(), // ARCOUNT + NULL); + + const parser_t *dns_question = sequence(length_value(uint8(), uint8()), // QNAME + uint16(), // QTYPE + uint16(), // QCLASS + NULL); + + const parser_t *letter = choice(range('a', 'z'), + range('A', 'Z'), + NULL); + + const parser_t *let_dig = choice(letter, + range('0', '9'), + NULL); + + const parser_t *ldh_str = many1(choice(let_dig, + ch('-'), + NULL)); + + const parser_t *label = sequence(letter, + optional(sequence(optional(ldh_str), + let_dig, + NULL)), + NULL); + + /** + * You could write it like this ... + * parser_t *indirect_subdomain = indirect(); + * const parser_t *subdomain = choice(label, + * sequence(indirect_subdomain, + * ch('.'), + * label, + * NULL), + * NULL); + * bind_indirect(indirect_subdomain, subdomain); + * + * ... 
but this is easier and equivalent + */ + + const parser_t *subdomain = sepBy1(label, ch('.')); + + const parser_t *domain = choice(subdomain, + ch(' '), + NULL); + + const parser_t *dns_rr = sequence(domain, // NAME + uint16(), // TYPE + uint16(), // CLASS + uint32(), // TTL + length_value(uint16(), uint8()), // RDLENGTH+RDATA + NULL); + + + const parser_t *dns_message = attr_bool(sequence(dns_header, + dns_question, + many(dns_rr), + end_p(), + NULL), + validate_dns); +} diff --git a/src/hammer.c b/src/hammer.c index eb418fe0fc408ac6ec77d672d0dba7295c7e0fa0..5bd1ae1008a7dfd75db81ff81e219a37c461b1f3 100644 --- a/src/hammer.c +++ b/src/hammer.c @@ -336,7 +336,31 @@ const parser_t* whitespace(const parser_t* p) { return ret; } -const parser_t* action(const parser_t* p, const action_t a) { return &unimplemented; } +typedef struct { + const parser_t *p; + action_t action; +} parse_action_t; + +static parse_result_t* parse_action(void *env, parse_state_t *state) { + parse_action_t *a = (parse_action_t*)env; + if (a->p && a->action) { + parse_result_t *tmp = do_parse(a->p, state); + if (!tmp) return NULL; /* inner parse failed: propagate the failure, never hand a NULL result to the action */ + const parsed_token_t *tok = a->action(tmp); + return make_result(state, (parsed_token_t*)tok); + } else // either the parser's missing or the action's missing + return NULL; +} + +const parser_t* action(const parser_t* p, const action_t a) { + parser_t *res = g_new(parser_t, 1); + res->fn = parse_action; + parse_action_t *env = g_new(parse_action_t, 1); + env->p = p; + env->action = a; + res->env = (void*)env; + return res; +} static parse_result_t* parse_charset(void *env, parse_state_t *state) { uint8_t in = read_bits(&state->input_stream, 8, false); @@ -783,7 +807,6 @@ const parser_t* epsilon_p() { return res; } - static parse_result_t* parse_indirect(void* env, parse_state_t* state) { return do_parse(env, state); } @@ -798,7 +821,68 @@ parser_t* indirect() { return res; } -const parser_t* attr_bool(const parser_t* p, attr_bool_t a) { 
return &unimplemented; } +typedef struct { + const parser_t *p; + predicate_t pred; +} attr_bool_t; + +static parse_result_t* parse_attr_bool(void *env, parse_state_t *state) { /* run p, then keep its result only if pred accepts it */ + attr_bool_t *a = (attr_bool_t*)env; + parse_result_t *res = do_parse(a->p, state); + if (res) { + if (a->pred(res)) + return res; + else + return NULL; + } else + return NULL; +} + +const parser_t* attr_bool(const parser_t* p, predicate_t pred) { + parser_t *res = g_new(parser_t, 1); + res->fn = parse_attr_bool; + attr_bool_t *env = g_new(attr_bool_t, 1); + env->p = p; + env->pred = pred; + res->env = (void*)env; + return res; +} + +typedef struct { + const parser_t *length; + const parser_t *value; +} lv_t; + +static parse_result_t* parse_length_value(void *env, parse_state_t *state) { /* parse a count with `length`, then apply `value` that many times */ + lv_t *lv = (lv_t*)env; + parse_result_t *len = do_parse(lv->length, state); + if (!len) + return NULL; + if (!len->ast || len->ast->token_type != TT_UINT) /* a successful parse may carry a NULL ast (e.g. end_p, ignore); treat that as a bad length parser too */ + errx(1, "Length parser must return an unsigned integer"); + parser_t epsilon_local = { + .fn = parse_epsilon, + .env = NULL + }; + repeat_t repeat = { + .p = lv->value, + .sep = &epsilon_local, + .count = len->ast->uint, + .min_p = false + }; + return parse_many(&repeat, state); +} + +const parser_t* length_value(const parser_t* length, const parser_t* value) { + parser_t *res = g_new(parser_t, 1); + res->fn = parse_length_value; + lv_t *env = g_new(lv_t, 1); + env->length = length; + env->value = value; + res->env = (void*)env; + return res; +} + const parser_t* and(const parser_t* p) { return &unimplemented; } static parse_result_t* parse_not(void* env, parse_state_t* state) { @@ -881,7 +965,7 @@ static void test_range(void) { static void test_int64(void) { const parser_t *int64_ = int64(); - g_check_parse_ok(int64_, "\xff\xff\xff\xfe\x00\x00\x00\x00", 8, "s-0x200000000"); + g_check_parse_ok(int64_, "\xff\xff\xff\xfe\x00\x00\x00\x00", 8, "s0x200000000"); g_check_parse_failed(int64_, "\xff\xff\xff\xfe\x00\x00\x00", 7); } @@ -962,15 +1046,52 @@ static void 
test_whitespace(void) { g_check_parse_failed(whitespace_, "_a", 2); } -parse_result_t* upcase(parse_result_t *p) { - return NULL; // shut compiler up +#include <ctype.h> + +const parsed_token_t* upcase(parse_result_t *p) { /* test action: returns a token in which TT_UINT values have been passed through toupper() */ + switch(p->ast->token_type) { + case TT_SEQUENCE: /* build a new sequence: TT_UINT elements become upper-cased copies, all other elements are appended as-is */ + { + parsed_token_t *ret = a_new_(p->arena, parsed_token_t, 1); + counted_array_t *seq = carray_new_sized(p->arena, p->ast->seq->used); + ret->token_type = TT_SEQUENCE; + for (size_t i=0; i<p->ast->seq->used; ++i) { + if (TT_UINT == ((parsed_token_t*)p->ast->seq->elements[i])->token_type) { + parsed_token_t *tmp = a_new_(p->arena, parsed_token_t, 1); + tmp->token_type = TT_UINT; + tmp->uint = toupper(((parsed_token_t*)p->ast->seq->elements[i])->uint); + carray_append(seq, tmp); + } else { + carray_append(seq, p->ast->seq->elements[i]); + } + } + ret->seq = seq; + return (const parsed_token_t*)ret; + } + case TT_UINT: /* single value: return a fresh TT_UINT token holding the upper-cased value */ + { + parsed_token_t *ret = a_new_(p->arena, parsed_token_t, 1); + ret->token_type = TT_UINT; + ret->uint = toupper(p->ast->uint); + return (const parsed_token_t*)ret; + } + default: /* any other token type: hand back the original ast unchanged */ + return p->ast; + } } static void test_action(void) { - const parser_t *action_ = action(sequence(choice(ch('a'), ch('A'), NULL), choice(ch('b'), ch('B'), NULL), NULL), upcase); - - g_check_parse_ok(action_, "ab", 2, "(u0x41, u0x42)"); - g_check_parse_ok(action_, "AB", 2, "(u0x41, u0x42)"); + const parser_t *action_ = action(sequence(choice(ch('a'), + ch('A'), + NULL), + choice(ch('b'), + ch('B'), + NULL), + NULL), + upcase); + + g_check_parse_ok(action_, "ab", 2, "(u0x41 u0x42)"); + g_check_parse_ok(action_, "AB", 2, "(u0x41 u0x42)"); } static void test_not_in(void) { diff --git a/src/hammer.h b/src/hammer.h index 18388b61ab7942df18211d9cda3199c129c14771..a0b93e569bb85a394e248fba9bcf3a939afe9607 100644 --- a/src/hammer.h +++ b/src/hammer.h @@ -36,6 +36,7 @@ typedef enum token_type { TT_SINT, TT_UINT, TT_SEQUENCE, + TT_USER = 64, TT_ERR, TT_MAX } token_type_t; @@ -59,173 +60,362 @@ typedef struct 
parsed_token { double dbl; float flt; counted_array_t *seq; // a sequence of parsed_token_t's + void *user; }; size_t index; char bit_offset; } parsed_token_t; - - -/* If a parse fails, the parse result will be NULL. - * If a parse is successful but there's nothing there (i.e., if end_p succeeds) then there's a parse result but its ast is NULL. +/** + * The result of a successful parse. + * If a parse fails, the parse result will be NULL. + * If a parse is successful but there's nothing there (i.e., if end_p + * succeeds) then there's a parse result but its ast is NULL. */ typedef struct parse_result { const parsed_token_t *ast; arena_t arena; } parse_result_t; -/* Type of an action to apply to an AST, used in the action() parser. */ -typedef parse_result_t* (*action_t)(parse_result_t *p); +/** + * Type of an action to apply to an AST, used in the action() parser. + * It can be any (user-defined) function that takes a parse_result_t* + * and returns a parsed_token_t*. (This is so that the user doesn't + * have to worry about memory allocation; action() does that for you.) + * Note that the tagged union in parsed_token_t* supports user-defined + * types, so you can create your own token types (corresponding to, + * say, structs) and stuff values for them into the void* in the + * tagged union in parsed_token_t. + */ +typedef const parsed_token_t* (*action_t)(parse_result_t *p); -/* Type of a boolean attribute-checking function, used in the attr_bool() parser. */ -typedef int (*attr_bool_t)(void *env); +/** + * Type of a boolean attribute-checking function, used in the + * attr_bool() parser. It can be any (user-defined) function that takes + * a parse_result_t* and returns true or false. + */ +typedef bool (*predicate_t)(parse_result_t *p); typedef struct parser { parse_result_t* (*fn)(void *env, parse_state_t *state); void *env; } parser_t; +/** + * Top-level function to call a parser that has been built over some + * piece of input (of known size). 
+ */ parse_result_t* parse(const parser_t* parser, const uint8_t* input, size_t length); -/* Given a string, returns a parser that parses that string value. */ +/** + * Given a string, returns a parser that parses that string value. + * + * Result token type: TT_BYTES + */ const parser_t* token(const uint8_t *str, const size_t len); -/* Given a single character, returns a parser that parses that character. */ +/** + * Given a single character, returns a parser that parses that + * character. + * + * Result token type: TT_UINT + */ const parser_t* ch(const uint8_t c); -/* Given two single-character bounds, lower and upper, returns a parser that parses a single character within the range [lower, upper] (inclusive). */ +/** + * Given two single-character bounds, lower and upper, returns a parser + * that parses a single character within the range [lower, upper] + * (inclusive). + * + * Result token type: TT_UINT + */ const parser_t* range(const uint8_t lower, const uint8_t upper); -/* Returns a parser that parses the specified number of bits. sign == true if signed, false if unsigned. */ +/** + * Returns a parser that parses the specified number of bits. sign == + * true if signed, false if unsigned. + * + * Result token type: TT_SINT if sign == true, TT_UINT if sign == false + */ const parser_t* bits(size_t len, bool sign); -/* Returns a parser that parses a signed 8-byte integer value. */ +/** + * Returns a parser that parses a signed 8-byte integer value. + * + * Result token type: TT_SINT + */ const parser_t* int64(); -/* Returns a parser that parses a signed 4-byte integer value. */ +/** + * Returns a parser that parses a signed 4-byte integer value. + * + * Result token type: TT_SINT + */ const parser_t* int32(); -/* Returns a parser that parses a signed 2-byte integer value. */ +/** + * Returns a parser that parses a signed 2-byte integer value. 
+ * + * Result token type: TT_SINT + */ const parser_t* int16(); -/* Returns a parser that parses a signed 1-byte integer value. */ +/** + * Returns a parser that parses a signed 1-byte integer value. + * + * Result token type: TT_SINT + */ const parser_t* int8(); -/* Returns a parser that parses an unsigned 8-byte integer value. */ +/** + * Returns a parser that parses an unsigned 8-byte integer value. + * + * Result token type: TT_UINT + */ const parser_t* uint64(); -/* Returns a parser that parses an unsigned 4-byte integer value. */ +/** + * Returns a parser that parses an unsigned 4-byte integer value. + * + * Result token type: TT_UINT + */ const parser_t* uint32(); -/* Returns a parser that parses an unsigned 2-byte integer value. */ +/** + * Returns a parser that parses an unsigned 2-byte integer value. + * + * Result token type: TT_UINT + */ const parser_t* uint16(); -/* Returns a parser that parses an unsigned 1-byte integer value. */ +/** + * Returns a parser that parses an unsigned 1-byte integer value. + * + * Result token type: TT_UINT + */ const parser_t* uint8(); -/* Given another parser, p, returns a parser that skips any whitespace and then applies p. */ +/** + * Given another parser, p, returns a parser that skips any whitespace + * and then applies p. + * + * Result token type: p's result type + */ const parser_t* whitespace(const parser_t* p); -/* Given another parser, p, and a function f, returns a parser that applies p, then applies f to everything in the AST of p's result. */ +/** + * Given another parser, p, and a function f, returns a parser that + * applies p, then applies f to everything in the AST of p's result. + * + * Result token type: any + */ const parser_t* action(const parser_t* p, const action_t a); -/* Parse a single character *NOT* in charset */ +/** + * Parse a single character *NOT* in the given charset. 
+ * + * Result token type: TT_UINT + */ const parser_t* not_in(const uint8_t *charset, int length); -/* A no-argument parser that succeeds if there is no more input to parse. */ +/** + * A no-argument parser that succeeds if there is no more input to + * parse. + * + * Result token type: None. The parse_result_t exists but its AST is NULL. + */ const parser_t* end_p(); -/* This parser always fails. */ +/** + * This parser always fails. + * + * Result token type: NULL. Always. + */ const parser_t* nothing_p(); -/* Given an null-terminated list of parsers, apply each parser in order. The parse succeeds only if all parsers succeed. */ +/** + * Given a null-terminated list of parsers, apply each parser in order. + * The parse succeeds only if all parsers succeed. + * + * Result token type: TT_SEQUENCE + */ const parser_t* sequence(const parser_t* p, ...) __attribute__((sentinel)); -/* Given an array of parsers, p_array, apply each parser in order. The first parser to succeed is the result; if no parsers succeed, the parse fails. */ +/** + * Given a null-terminated list of parsers, try each parser in order. The + * first parser to succeed is the result; if no parsers succeed, the + * parse fails. + * + * Result token type: The type of the first successful parser's result. + */ const parser_t* choice(const parser_t* p, ...) __attribute__((sentinel)); -/* Given two parsers, p1 and p2, this parser succeeds in the following cases: +/** + * Given two parsers, p1 and p2, this parser succeeds in the following + * cases: * - if p1 succeeds and p2 fails * - if both succeed but p1's result is as long as or shorter than p2's + * + * Result token type: p1's result type. 
*/ const parser_t* butnot(const parser_t* p1, const parser_t* p2); -/* Given two parsers, p1 and p2, this parser succeeds in the following cases: +/** + * Given two parsers, p1 and p2, this parser succeeds in the following + * cases: * - if p1 succeeds and p2 fails * - if both succeed but p2's result is shorter than p1's + * + * Result token type: p1's result type. */ const parser_t* difference(const parser_t* p1, const parser_t* p2); -/* Given two parsers, p1 and p2, this parser succeeds if *either* p1 or p2 succeed, but not if they both do. +/** + * Given two parsers, p1 and p2, this parser succeeds if *either* p1 or + * p2 succeed, but not if they both do. + * + * Result token type: The type of the result of whichever parser succeeded. */ const parser_t* xor(const parser_t* p1, const parser_t* p2); -/* Given a parser, p, this parser succeeds for zero or more repetitions of p. */ +/** + * Given a parser, p, this parser succeeds for zero or more repetitions + * of p. + * + * Result token type: TT_SEQUENCE + */ const parser_t* many(const parser_t* p); -/* Given a parser, p, this parser succeeds for one or more repetitions of p. */ +/** + * Given a parser, p, this parser succeeds for one or more repetitions + * of p. + * + * Result token type: TT_SEQUENCE + */ const parser_t* many1(const parser_t* p); -/* Given a parser, p, this parser succeeds for exactly N repetitions of p. */ +/** + * Given a parser, p, this parser succeeds for exactly N repetitions + * of p. + * + * Result token type: TT_SEQUENCE + */ const parser_t* repeat_n(const parser_t* p, const size_t n); -/* Given a parser, p, this parser succeeds with the value p parsed or with an empty result. */ +/** + * Given a parser, p, this parser succeeds with the value p parsed or + * with an empty result. + * + * Result token type: If p succeeded, the type of its result; if not, TT_NONE. 
+ */ const parser_t* optional(const parser_t* p); -/* Given a parser, p, this parser succeeds if p succeeds, but doesn't include p's result in the result. */ +/** + * Given a parser, p, this parser succeeds if p succeeds, but doesn't + * include p's result in the result. + * + * Result token type: None. The parse_result_t exists but its AST is NULL. + */ const parser_t* ignore(const parser_t* p); -/* Given a parser, p, and a parser for a separator, sep, this parser matches a (possibly empty) list of things that p can parse, separated by sep. - * For example, if p is repeat1(range('0','9')) and sep is ch(','), sepBy(p, sep) will match a comma-separated list of integers. +/** + * Given a parser, p, and a parser for a separator, sep, this parser + * matches a (possibly empty) list of things that p can parse, + * separated by sep. + * For example, if p is repeat1(range('0','9')) and sep is ch(','), + * sepBy(p, sep) will match a comma-separated list of integers. + * + * Result token type: TT_SEQUENCE */ const parser_t* sepBy(const parser_t* p, const parser_t* sep); -/* Given a parser, p, and a parser for a separator, sep, this parser matches a list of things that p can parse, separated by sep. Unlike sepBy, this ensures that the result has at least one element. +/** + * Given a parser, p, and a parser for a separator, sep, this parser matches a list of things that p can parse, separated by sep. Unlike sepBy, this ensures that the result has at least one element. * For example, if p is repeat1(range('0','9')) and sep is ch(','), sepBy1(p, sep) will match a comma-separated list of integers. + * + * Result token type: TT_SEQUENCE */ const parser_t* sepBy1(const parser_t* p, const parser_t* sep); -/* This parser always returns a zero length match, i.e., empty string. */ +/** + * This parser always returns a zero length match, i.e., empty string. + * + * Result token type: None. The parse_result_t exists but its AST is NULL. 
+ */ const parser_t* epsilon_p(); -/* This parser attaches an attribute function, which returns true or false, to a parser. The function is evaluated over the parser's result AST. +/** + * This parser applies its first argument to read an unsigned integer + * value, then applies its second argument that many times. length + * should parse an unsigned integer value; this is checked at runtime. + * Specifically, the token_type of the returned token must be TT_UINT. + * In future we might relax this to include TT_USER but don't count on it. + * + * Result token type: TT_SEQUENCE + */ +const parser_t* length_value(const parser_t* length, const parser_t* value); + +/** + * This parser attaches a predicate function, which returns true or + * false, to a parser. The function is evaluated over the parser's + * result. * The parse only succeeds if the attribute function returns true. + * + * Result token type: p's result type if pred succeeded, NULL otherwise. */ -const parser_t* attr_bool(const parser_t* p, const attr_bool_t a); +const parser_t* attr_bool(const parser_t* p, predicate_t pred); -/* The 'and' parser is a predicate. It asserts that a conditional syntax is satisfied, but consumes no input. +/** + * The 'and' parser asserts that a conditional syntax is satisfied, + * but doesn't consume that conditional syntax. * This is useful for lookahead. As an example: * - * Suppose you already have a parser, hex_p, that parses numbers in hexadecimal format (including the leading '0x'). Then + * Suppose you already have a parser, hex_p, that parses numbers in + * hexadecimal format (including the leading '0x'). Then * sequence(and(token((const uint8_t*)"0x", 2)), hex_p) - * checks to see whether there is a leading "0x", *does not* consume the "0x", and then applies hex_p to parse the hex-formatted number. + * checks to see whether there is a leading "0x", *does not* consume + * the "0x", and then applies hex_p to parse the hex-formatted number. 
+ * + * 'and' succeeds if p succeeds, and fails if p fails. * - * 'and' succeeds if p succeeds, and fails if p fails. Like 'ignore', 'and' does not attach a result to the AST. + * Result token type: None. The parse_result_t exists but its AST is NULL. */ const parser_t* and(const parser_t* p); -/* The 'not' parser is a predicate. It asserts that a conditional syntax is *not* satisfied, and consumes no input. +/** + * The 'not' parser asserts that a conditional syntax is *not* + * satisfied, but doesn't consume that conditional syntax. * As a somewhat contrived example: * * Since 'choice' applies its arguments in order, the following parser: * sequence(ch('a'), choice(ch('+'), token((const uint8_t*)"++"), NULL), ch('b'), NULL) - * will not parse "a++b", because once choice() has succeeded, it will not backtrack and try other alternatives if a later parser in the sequence - * fails. - * Instead, you can force the use of the second alternative by turning the ch('+') alternative into a sequence with not: + * will not parse "a++b", because once choice() has succeeded, it will + * not backtrack and try other alternatives if a later parser in the + * sequence fails. + * Instead, you can force the use of the second alternative by turning + * the ch('+') alternative into a sequence with not: * sequence(ch('a'), choice(sequence(ch('+'), not(ch('+')), NULL), token((const uint8_t*)"++")), ch('b'), NULL) - * If the input string is "a+b", the first alternative is applied; if the input string is "a++b", the second alternative is applied. + * If the input string is "a+b", the first alternative is applied; if + * the input string is "a++b", the second alternative is applied. + * + * Result token type: None. The parse_result_t exists but its AST is NULL. */ const parser_t* not(const parser_t* p); /** - * Create a parser that just calls out to another, as yet unknown, parser. + * Create a parser that just calls out to another, as yet unknown, + * parser. 
* Note that the inner parser gets bound later, with bind_indirect. * This can be used to create recursive parsers. + * + * Result token type: the type of whatever parser is bound to it with + * bind_indirect(). */ parser_t *indirect(); /** - * Set the inner parser of an indirect. See comments on indirect for details. + * Set the inner parser of an indirect. See comments on indirect for + * details. */ void bind_indirect(parser_t* indirect, parser_t* inner); diff --git a/src/internal.h b/src/internal.h index 4cd5bf6dcf9c323daddefd6d2778805243962096..09970e01a01f45665924b412e3ae8aab379f4009 100644 --- a/src/internal.h +++ b/src/internal.h @@ -26,7 +26,7 @@ #else #define assert_message(check, message) do { \ if (!(check)) \ - errx(1, "Assertation failed (programmer error): %s", message); \ + errx(1, "Assertion failed (programmer error): %s", message); \ } while(0) #endif #define false 0 diff --git a/src/pprint.c b/src/pprint.c index 250ecfbef104842374aaf98308984183d50240ef..ac0d02db611f7fbe1c91d5ca4d3397fc3d738558 100644 --- a/src/pprint.c +++ b/src/pprint.c @@ -94,6 +94,10 @@ static inline void append_buf_c(struct result_buf *buf, char v) { static void unamb_sub(const parsed_token_t* tok, struct result_buf *buf) { char* tmpbuf; int len; + if (!tok) { + append_buf(buf, "NULL", 4); + return; + } switch (tok->token_type) { case TT_NONE: append_buf(buf, "null", 4);