diff --git a/examples/SConscript b/examples/SConscript
index b34b85a1cd469386b752bc3721a8b54954315e2a..8504b4bb718f735f51ed22ed4c5f5500633e299c 100644
--- a/examples/SConscript
+++ b/examples/SConscript
@@ -6,8 +6,9 @@ example = env.Clone()
 example.Append(LIBS="hammer", LIBPATH="../src")
 
 dns = example.Program('dns', ['dns.c', 'rr.c', 'dns_common.c'])
+ttuser = example.Program('ttuser', 'ttuser.c')
 base64 = example.Program('base64', 'base64.c')
 base64_sem1 = example.Program('base64_sem1', 'base64_sem1.c')
 base64_sem2 = example.Program('base64_sem2', 'base64_sem2.c')
 ties = example.Program('ties', ['ties.c', 'grammar.c'])
-env.Alias("examples", [dns, base64, base64_sem1, base64_sem2, ties])
\ No newline at end of file
+env.Alias("examples", [dns, ttuser, base64, base64_sem1, base64_sem2, ties])
diff --git a/examples/ttuser.c b/examples/ttuser.c
new file mode 100644
index 0000000000000000000000000000000000000000..4e83356cd8d785c391efeea5baea65fd68e86ea0
--- /dev/null
+++ b/examples/ttuser.c
@@ -0,0 +1,140 @@
+/*
+ * Example parser that demonstrates the use of user-defined token types.
+ *
+ * Note the custom printer function that hooks into h_pprint().
+ */
+
+#include "../src/hammer.h"
+#include "../src/glue.h"
+
+
+/*
+ * custom tokens
+ */
+
+HTokenType TT_SUBJ, TT_PRED, TT_OBJ, TT_ADJ, TT_ADVC;
+
+void
+pprint(FILE *stream, const HParsedToken *tok, int indent, int delta)
+{
+    /*
+     * Pretty-printer rules:
+     *
+     *  - Output 'indent' spaces after every newline you produce.
+     *  - Do not add indent on the first line of output.
+     *  - Do not add a trailing newline.
+     *  - Indent sub-objects by adding 'delta' to 'indent'.
+     */
+
+    if (((HParsedToken *)tok->user)->token_type == TT_SEQUENCE)
+        fprintf(stream, "\n%*s", indent, "");
+    h_pprint(stream, tok->user, indent, delta);
+}
+
+/* XXX define unamb_sub as well */
+
+void
+init(void)
+{
+    TT_SUBJ = h_allocate_token_new("subject", NULL, pprint);
+    TT_PRED = h_allocate_token_new("predicate", NULL, pprint);
+    TT_OBJ = h_allocate_token_new("object", NULL, pprint);
+    TT_ADJ = h_allocate_token_new("adjective", NULL, pprint);
+    TT_ADVC = h_allocate_token_new("adverbial clause", NULL, pprint);
+}
+
+
+/*
+ * semantic actions
+ *
+ * Normally these would be more interesting, but for this example, we just wrap
+ * our tokens in their intended types.
+ */
+HParsedToken *act_subj(const HParseResult *p, void *u) {
+    return H_MAKE(SUBJ, (void *)p->ast);
+}
+HParsedToken *act_pred(const HParseResult *p, void *u) {
+    return H_MAKE(PRED, (void *)p->ast);
+}
+HParsedToken *act_obj(const HParseResult *p, void *u) {
+    return H_MAKE(OBJ, (void *)p->ast);
+}
+HParsedToken *act_adj(const HParseResult *p, void *u) {
+    return H_MAKE(ADJ, (void *)p->ast);
+}
+HParsedToken *act_advc(const HParseResult *p, void *u) {
+    return H_MAKE(ADVC, (void *)p->ast);
+}
+
+
+/*
+ * grammar
+ */
+
+HParser *
+build_parser(void)
+{
+    /* words */
+    #define W(X) h_whitespace(h_literal(#X))
+    H_RULE(art, h_choice(W(a), W(the), NULL));
+    H_RULE(noun, h_choice(W(cat), W(dog), W(fox), W(tiger), W(lion),
+                 W(bear), W(fence), W(tree), W(car), W(cow), NULL));
+    H_RULE(verb, h_choice(W(eats), W(jumps), W(falls), NULL));
+    H_ARULE(adj, h_choice(W(quick), W(slow), W(happy), W(lazy), W(cyan),
+                 W(magenta), W(yellow), W(black), W(brown), NULL));
+    H_RULE(adverb, h_choice(W(with), W(over), W(after), NULL));
+    #undef W
+
+    /* phrases */
+    H_RULE(nphrase, h_sequence(art, h_many(adj), noun, NULL));
+
+    /* sentence structure */
+    H_ARULE(subj, nphrase);
+    H_ARULE(pred, verb);
+    H_ARULE(obj, nphrase);
+    H_ARULE(advc, h_sequence(adverb, nphrase, NULL));
+    H_RULE(sentnc, h_sequence(subj, pred,
+                   h_optional(obj), h_optional(advc), NULL));
+
+    return sentnc;
+}
+
+
+/*
+ * main routine: read, parse, print
+ *
+ * input e.g.:
+ * "the quick brown fox jumps the fence with a cyan lion"
+ */
+
+#include <stdio.h>
+#include <inttypes.h>
+
+int
+main(int argc, char **argv)
+{
+    uint8_t input[1024];
+    size_t sz;
+    const HParser *parser;
+    const HParseResult *result;
+
+    init();
+    parser = build_parser();
+
+    sz = fread(input, 1, sizeof(input), stdin);
+    if (!feof(stdin)) {
+        fprintf(stderr, "too much input\n");
+        return 1;
+    }
+
+    result = h_parse(parser, input, sz);
+    if (!result) {
+        fprintf(stderr, "no parse\n");
+        return 1;
+    }
+
+    h_pprintln(stdout, result->ast);
+    fprintf(stderr, "consumed %" PRId64 "/%zu bytes.\n",
+            result->bit_length / 8, sz);
+    return 0;
+}
diff --git a/src/hammer.h b/src/hammer.h
index 32ec2e05c18dbb322b3769107b9d96999341b064..ae2103ef7efcd8b321b39f2aa9778f39455f9ef9 100644
--- a/src/hammer.h
+++ b/src/hammer.h
@@ -754,10 +754,22 @@ HAMMER_FN_DECL(void, h_parse_result_free, HParseResult *result);
  */
 char* h_write_result_unamb(const HParsedToken* tok);
 /**
- * Format token to the given output stream. Indent starting at
- * [indent] spaces, with [delta] spaces between levels.
+ * Format token to the given output stream. Indent starting at [indent] spaces,
+ * with [delta] spaces between levels.
+ *
+ * Note: This function does not print a trailing newline. It also does not
+ * print any spaces to indent the initial line of output. This makes it
+ * suitable for recursive use in the condensed output of larger structures.
  */
 void h_pprint(FILE* stream, const HParsedToken* tok, int indent, int delta);
+/**
+ * Format token to the given output. Print a trailing newline.
+ *
+ * This function assumes an initial indentation of 0 and uses 2 spaces between
+ * indentation levels. It is equivalent to 'h_pprint(stream, tok, 0, 2)'
+ * followed by 'fputc('\n', stream)' and is provided for convenience.
+ */
+void h_pprintln(FILE* stream, const HParsedToken* tok);
 
 /**
  * Build parse tables for the given parser backend. See the
@@ -821,7 +833,8 @@ HTokenType h_allocate_token_type(const char* name);
 /// Allocate a new token type with an unambiguous print function.
 HTokenType h_allocate_token_new(
     const char* name,
-    void (*unamb_sub)(const HParsedToken *tok, struct result_buf *buf));
+    void (*unamb_sub)(const HParsedToken *tok, struct result_buf *buf),
+    void (*pprint)(FILE* stream, const HParsedToken* tok, int indent, int delta));
 
 /// Get the token type associated with name. Returns -1 if name is unkown
 HTokenType h_get_token_type_number(const char* name);
diff --git a/src/internal.h b/src/internal.h
index 347646a1835065b24004f4d2064495e348a4658c..324fcbafc5ef7601fac70ceaea04894b8d46010d 100644
--- a/src/internal.h
+++ b/src/internal.h
@@ -435,6 +435,7 @@ typedef struct HTTEntry_ {
   const char* name;
   HTokenType value;
   void (*unamb_sub)(const HParsedToken *tok, struct result_buf *buf);
+  void (*pprint)(FILE* stream, const HParsedToken* tok, int indent, int delta);
 } HTTEntry;
 
 const HTTEntry* h_get_token_type_entry(HTokenType token_type);
diff --git a/src/pprint.c b/src/pprint.c
index 52f42eb6060230a8bb608b8e5ab1eafb6ef1467c..7d944b857af011b02ce9a698c81981217c57dcf8 100644
--- a/src/pprint.c
+++ b/src/pprint.c
@@ -23,6 +23,7 @@
 #include "internal.h"
 #include <stdlib.h>
 #include <inttypes.h>
+#include <ctype.h>
 
 typedef struct pp_state {
   int delta;
@@ -30,55 +31,84 @@ typedef struct pp_state {
   int at_bol;
 } pp_state_t;
 
+/* Write [len] bytes as a double-quoted string. Printable ASCII is copied
+ * through, except that '"' and '\' get a backslash so they cannot end the
+ * string early or start a bogus escape; any other byte becomes \u00XX.
+ */
+static void pprint_bytes(FILE *stream, const uint8_t *bs, size_t len)
+{
+  fprintf(stream, "\"");
+  for (size_t i = 0; i < len; i++) {
+    uint8_t c = bs[i];
+    if (c == '"' || c == '\\')
+      fprintf(stream, "\\%c", c);
+    else if (c >= 0x20 && c <= 0x7e)
+      fputc(c, stream);
+    else
+      fprintf(stream, "\\u00%02hhx", c);
+  }
+  fprintf(stream, "\"");
+}
+
 void h_pprint(FILE* stream, const HParsedToken* tok, int indent, int delta) {
+  if (tok == NULL) {
+    fprintf(stream, "(null)");
+    return;
+  }
   switch (tok->token_type) {
   case TT_NONE:
-    fprintf(stream, "%*snull\n", indent, "");
+    fprintf(stream, "null");
     break;
   case TT_BYTES:
-    if (tok->bytes.len == 0)
-      fprintf(stream, "%*s<>\n", indent, "");
-    else {
-      fprintf(stream, "%*s", indent, "");
-      for (size_t i = 0; i < tok->bytes.len; i++) {
-        fprintf(stream,
-                "%c%02hhx",
-                (i == 0) ? '<' : '.',
-                tok->bytes.token[i]);
-      }
-      fprintf(stream, ">\n");
-    }
+    pprint_bytes(stream, tok->bytes.token, tok->bytes.len);
     break;
   case TT_SINT:
-    if (tok->sint < 0)
-      fprintf(stream, "%*ss -%#" PRIx64 "\n", indent, "", -tok->sint);
-    else
-      fprintf(stream, "%*ss %#" PRIx64 "\n", indent, "", tok->sint);
-
+    fprintf(stream, "%" PRId64, tok->sint);
     break;
   case TT_UINT:
-    fprintf(stream, "%*su %#" PRIx64 "\n", indent, "", tok->uint);
+    fprintf(stream, "%" PRIu64, tok->uint);
     break;
-  case TT_SEQUENCE: {
-    fprintf(stream, "%*s[\n", indent, "");
-    for (size_t i = 0; i < tok->seq->used; i++) {
-      h_pprint(stream, tok->seq->elements[i], indent + delta, delta);
+  case TT_SEQUENCE:
+    if (tok->seq->used == 0)
+      fprintf(stream, "[ ]");
+    else {
+      fprintf(stream, "[%*s", delta - 1, "");
+      for (size_t i = 0; i < tok->seq->used; i++) {
+        if (i > 0) fprintf(stream, "\n%*s,%*s", indent, "", delta - 1, "");
+        h_pprint(stream, tok->seq->elements[i], indent + delta, delta);
+      }
+      if (tok->seq->used > 2)
+        fprintf(stream, "\n%*s]", indent, "");
+      else
+        fprintf(stream, " ]");
     }
-    fprintf(stream, "%*s]\n", indent, "");
-  }
-  break;
-  case TT_USER:
-    fprintf(stream, "%*sUSER:%s\n", indent, "", h_get_token_type_name(tok->token_type));
     break;
   default:
-    if(tok->token_type > TT_USER) {
-      fprintf(stream, "%*sUSER:%s %d\n", indent, "", h_get_token_type_name(tok->token_type), tok->token_type-TT_USER);
-    } else {
-      assert_message(0, "Should not reach here.");
+    assert_message(tok->token_type >= TT_USER, "h_pprint: unhandled token type");
+    {
+      const HTTEntry *e = h_get_token_type_entry(tok->token_type);
+      if (e == NULL) {
+        /* No registry entry (assumed to be signalled by NULL): print the
+         * numeric type alone instead of dereferencing a null pointer. */
+        fprintf(stream, "{ \"TT\":%d }", (int)tok->token_type);
+        break;
+      }
+      fprintf(stream, "{ \"TT\":%d, \"N\":", (int)e->value);
+      pprint_bytes(stream, (const uint8_t *)e->name, strlen(e->name));
+      if (e->pprint != NULL) {
+        fprintf(stream, ", \"V\":");
+        e->pprint(stream, tok, indent + delta, delta);
+      }
+      fprintf(stream, " }");
     }
   }
 }
 
+void h_pprintln(FILE* stream, const HParsedToken* tok) {
+  h_pprint(stream, tok, 0, 2);
+  fputc('\n', stream);
+}
+
 
 struct result_buf {
   char* output;
@@ -202,6 +232,3 @@ char* h_write_result_unamb(const HParsedToken* tok) {
   h_append_buf_c(&buf, 0);
   return buf.output;
 }
-
-
-
diff --git a/src/registry.c b/src/registry.c
index 00486db46ca6c1fdece03a051242f4f05ad23514..5486fd7bdb8022c65a296205b0dfd562a20a0572 100644
--- a/src/registry.c
+++ b/src/registry.c
@@ -54,12 +54,14 @@ static void default_unamb_sub(const HParsedToken* tok,
 
 HTokenType h_allocate_token_new(
     const char* name,
-    void (*unamb_sub)(const HParsedToken *tok, struct result_buf *buf)) {
+    void (*unamb_sub)(const HParsedToken *tok, struct result_buf *buf),
+    void (*pprint)(FILE* stream, const HParsedToken* tok, int indent, int delta)) {
   HTTEntry* new_entry = h_alloc(&system_allocator, sizeof(*new_entry));
   assert(new_entry != NULL);
   new_entry->name = name;
   new_entry->value = 0;
-  new_entry->unamb_sub = unamb_sub;
+  new_entry->unamb_sub = unamb_sub ? unamb_sub : default_unamb_sub;
+  new_entry->pprint = pprint;
   HTTEntry* probe = *(HTTEntry**)tsearch(new_entry, &tt_registry, compare_entries);
   if (probe->value != 0) {
     // Token type already exists...
@@ -86,7 +88,7 @@ HTokenType h_allocate_token_new(
   }
 }
 HTokenType h_allocate_token_type(const char* name) {
-  return h_allocate_token_new(name, default_unamb_sub);
+  return h_allocate_token_new(name, NULL, NULL);
 }
 HTokenType h_get_token_type_number(const char* name) {
   HTTEntry e;