diff --git a/HACKING b/HACKING index 44f59912c50edd2bf76f01dbe19652926abc41e6..56a818ad43040b0f5b26eea535b3f6955f96166d 100644 --- a/HACKING +++ b/HACKING @@ -6,12 +6,13 @@ internal anaphoric macros use. Chances are that if you use these names for other things, you're gonna have a bad time. In particular, these names, and the macros that use them, are: -- state: - Used by a_new and company. Should be an HParseState* -- mm__: - Used by h_new and h_free. Should be an HAllocator* -- stk__: - Used in desugaring. Should be an HCFStack* + +- `state`: + Used by `a_new` and company. Should be an `HParseState*`. +- `mm__`: + Used by `h_new` and `h_free`. Should be an `HAllocator*`. +- `stk__`: + Used in desugaring. Should be an `HCFStack*`. Function suffixes ================= @@ -21,9 +22,9 @@ parameters or parameters in multiple different forms. For example, often, you have a global memory manager that is used for an entire program. In this case, you can leave off the memory manager arguments off, letting them be implicit instead. Further, it is often convenient -to pass an array or va_list to a function instead of listing the -arguments inline (eg, for wrapping a function, generating the -arguments programattically, or writing bindings for another language. +to pass an array or `va_list` to a function instead of listing the +arguments inline (e.g., for wrapping a function, generating the +arguments programatically, or writing bindings for another language.) Because we have found that most variants fall into a fairly small set of forms, and to minimize the amount of API calls that users need to @@ -32,21 +33,22 @@ variants: the function name is followed by two underscores and a set of single-character "flags" indicating what optional features that particular variant has (in alphabetical order, of course): - __a: takes variadic arguments as a void*[] (not implemented yet, but will be soon. 
- __m: takes a memory manager as the first argument, to override the system memory manager. - __v: Takes the variadic argument list as a va_list - +- `__a`: takes variadic arguments as a `void*[]` (not implemented yet, + but will be soon.) +- `__m`: takes a memory manager as the first argument, to override the + system memory manager. +- `__v`: Takes the variadic argument list as a `va_list`. Memory managers =============== -If the __m function variants are used or system_allocator is +If the `__m` function variants are used or `system_allocator` is overridden, there come some difficult questions to answer, particularly regarding the behavior when multiple memory managers are combined. As a general rule of thumb (exceptions will be explicitly documented), assume that - If you have a function f, which is passed a memory manager m and +> If you have a function f, which is passed a memory manager m and returns a value r, any function that uses r as a parameter must also be told to use m as a memory manager. @@ -57,7 +59,7 @@ Language-independent test suite There is a language-independent representation of the Hammer test suite in `lib/test-suite`. This is intended to be used with the -tsparser.pl prolog library, along with a language-specific frontend. +tsparser.pl Prolog library, along with a language-specific frontend. 
Only the C# frontend exists so far; to regenerate the test suites using it, run diff --git a/SConstruct b/SConstruct index ecc86df312fb74c34348a4a06c83f21509eef53b..e03d2728fcdedc93cc11828d1d8ba1832a723dc1 100644 --- a/SConstruct +++ b/SConstruct @@ -73,6 +73,12 @@ AddOption('--coverage', action='store_true', help='Build with coverage instrumentation') +AddOption('--gprof', + dest='gprof', + default=False, + action="store_true", + help='Build with profiling instrumentation for gprof') + AddOption('--in-place', dest='in_place', default=False, @@ -128,6 +134,19 @@ if GetOption('coverage'): else: env.ParseConfig('llvm-config --ldflags') +if GetOption('gprof'): + if env['CC'] == 'gcc' and env['CXX'] == 'g++': + env.Append(CFLAGS=['-pg', '-fprofile-arcs'], + CXXFLAGS=['-pg', '-fprofile-arcs'], + LDFLAGS=['-pg', '-fprofile-arcs'], + LINKFLAGS=['-pg', '-fprofile-arcs']) + env.Append(LIBS=['gcov']) + env['GPROF'] = 1 + else: + print("Can only use gprof with gcc") + Exit(1) + + dbg = env.Clone(VARIANT='debug') if env['CC'] == 'cl': dbg.Append(CCFLAGS=['/Z7']) diff --git a/docs/hammerman.3 b/docs/hammerman.3 index cf1654b9943dd6e5160b3e5b519faa7c76210554..f3cf7e12b12d439a5026d3b23d5ff3bfea499e35 100644 --- a/docs/hammerman.3 +++ b/docs/hammerman.3 @@ -77,11 +77,13 @@ Benchmarking for parsing backends -- determine empirically which backend will be 11 12 HParseResult *result = h_parse(hello_parser, input, inputsize); 13 if(result) { -14 printf("yay!\n"); +14 printf("yay!\\n"); 15 } else { -16 printf("boo!\n"); +16 printf("boo!\\n"); 17 } -18 } +18 h_parse_result_free(result); +19 return 0 == result; +20 } .fi .SH "AUTHOR" .sp diff --git a/examples/SConscript b/examples/SConscript index b34b85a1cd469386b752bc3721a8b54954315e2a..28c5734d830cad028ee10c3df8ee7a344bb01088 100644 --- a/examples/SConscript +++ b/examples/SConscript @@ -3,11 +3,18 @@ from __future__ import absolute_import, division, print_function Import('env') example = env.Clone() -example.Append(LIBS="hammer", 
LIBPATH="../src") + +if 'GPROF' in env and env['GPROF'] == 1: + hammer_lib_name="hammer_pg" +else: + hammer_lib_name="hammer" + +example.Append(LIBS=hammer_lib_name, LIBPATH="../src") dns = example.Program('dns', ['dns.c', 'rr.c', 'dns_common.c']) +ttuser = example.Program('ttuser', 'ttuser.c') base64 = example.Program('base64', 'base64.c') base64_sem1 = example.Program('base64_sem1', 'base64_sem1.c') base64_sem2 = example.Program('base64_sem2', 'base64_sem2.c') ties = example.Program('ties', ['ties.c', 'grammar.c']) -env.Alias("examples", [dns, base64, base64_sem1, base64_sem2, ties]) \ No newline at end of file +env.Alias("examples", [dns, ttuser, base64, base64_sem1, base64_sem2, ties]) diff --git a/examples/base64.c b/examples/base64.c index 17264da9441d8fb7008496cc901c6a5f471a2db5..ddc162c0e4164e23ebef79ea4e3411f5ecf84cab 100644 --- a/examples/base64.c +++ b/examples/base64.c @@ -11,8 +11,6 @@ #include <inttypes.h> #include "../src/hammer.h" -#define DEBUG - const HParser* document = NULL; void init_parser(void) @@ -27,60 +25,17 @@ void init_parser(void) HParser *equals = h_ch('='); HParser *bsfdig = h_choice(alpha, digit, plus, slash, NULL); - HParser *bsfdig_4bit = h_choice( - h_ch('A'), h_ch('E'), h_ch('I'), h_ch('M'), h_ch('Q'), h_ch('U'), - h_ch('Y'), h_ch('c'), h_ch('g'), h_ch('k'), h_ch('o'), h_ch('s'), - h_ch('w'), h_ch('0'), h_ch('4'), h_ch('8'), NULL); - HParser *bsfdig_2bit = h_choice(h_ch('A'), h_ch('Q'), h_ch('g'), h_ch('w'), NULL); - - HParser *base64_quad = h_sequence(bsfdig, bsfdig, bsfdig, bsfdig, NULL); - HParser *base64_quads = h_many(base64_quad); - - HParser *base64_2 = h_sequence(bsfdig, bsfdig, bsfdig_4bit, equals, h_end_p(), NULL); - HParser *base64_1 = h_sequence(bsfdig, bsfdig_2bit, equals, equals, h_end_p(), NULL); - HParser *base64_ending = h_choice(h_end_p(), base64_2, base64_1, NULL); - HParser *base64 = h_sequence(base64_quads, base64_ending, NULL); - // why does this parse "A=="?! - // why does this parse "aaA=" but not "aA=="?! 
- - document = base64; -} - - -#include <string.h> -#include <assert.h> -#define TRUE (1) -#define FALSE (0) - -void assert_parse(int expected, char *data) { - HParseResult *result; - - size_t datasize = strlen(data); - result = h_parse(document, (void*)data, datasize); - if((result != NULL) != expected) { - fprintf(stderr, "Test failed: %s\n", data); - } -#ifdef DEBUG - else { - fprintf(stderr, "Test succeeded: %s\n", data); - fprintf(stderr, "parsed=%" PRId64 " bytes\n", result->bit_length/8); - h_pprint(stdout, result->ast, 0, 0); - } -#endif -} - -void test() { - assert_parse(TRUE, ""); - assert_parse(TRUE, "YQ=="); - assert_parse(TRUE, "YXU="); - assert_parse(TRUE, "YXVy"); - assert_parse(TRUE, "QVVSIFNBUkFG"); - assert_parse(TRUE, "QVVSIEhFUlUgU0FSQUY="); - assert_parse(FALSE, "A"); - assert_parse(FALSE, "A="); - assert_parse(FALSE, "A=="); - assert_parse(FALSE, "AAA=="); - assert_parse(FALSE, "aa=="); + HParser *bsfdig_4bit = h_in((uint8_t *)"AEIMQUYcgkosw048", 16); + HParser *bsfdig_2bit = h_in((uint8_t *)"AQgw", 4); + HParser *base64_3 = h_repeat_n(bsfdig, 4); + HParser *base64_2 = h_sequence(bsfdig, bsfdig, bsfdig_4bit, equals, NULL); + HParser *base64_1 = h_sequence(bsfdig, bsfdig_2bit, equals, equals, NULL); + HParser *base64 = h_sequence(h_many(base64_3), + h_optional(h_choice(base64_2, + base64_1, NULL)), + NULL); + + document = h_sequence(h_whitespace(base64), h_whitespace(h_end_p()), NULL); } @@ -94,8 +49,6 @@ int main(int argc, char **argv) init_parser(); - test(); - inputsize = fread(input, 1, sizeof(input), stdin); fprintf(stderr, "inputsize=%zu\ninput=", inputsize); fwrite(input, 1, inputsize, stderr); @@ -104,6 +57,7 @@ int main(int argc, char **argv) if(result) { fprintf(stderr, "parsed=%" PRId64 " bytes\n", result->bit_length/8); h_pprint(stdout, result->ast, 0, 0); + h_parse_result_free(result); return 0; } else { return 1; diff --git a/examples/base64_sem1.c b/examples/base64_sem1.c index 
afbbef841cc0ef0593e68a1ca7101eacc976f474..7127d1eb4738c450fba5d3a9b8ab1fa3ac32496a 100644 --- a/examples/base64_sem1.c +++ b/examples/base64_sem1.c @@ -149,12 +149,13 @@ HParser *init_parser(void) #include <stdio.h> +const HParser *parser; // Allocated statically to suppress leak warnings + int main(int argc, char **argv) { uint8_t input[102400]; size_t inputsize; - const HParser *parser; - const HParseResult *result; + HParseResult *result; parser = init_parser(); @@ -166,6 +167,7 @@ int main(int argc, char **argv) if(result) { fprintf(stderr, "parsed=%" PRId64 " bytes\n", result->bit_length/8); h_pprint(stdout, result->ast, 0, 0); + h_parse_result_free(result); return 0; } else { return 1; diff --git a/examples/base64_sem2.c b/examples/base64_sem2.c index b8f7b4a20312dcf39695ba52cdcf9573376d6c69..dac7e7ab0021198b76849da2bfe86af8864a9e9d 100644 --- a/examples/base64_sem2.c +++ b/examples/base64_sem2.c @@ -153,12 +153,13 @@ const HParser *init_parser(void) #include <stdio.h> +const HParser *parser; // Allocated statically to suppress leak warnings + int main(int argc, char **argv) { uint8_t input[102400]; size_t inputsize; - const HParser *parser; - const HParseResult *result; + HParseResult *result; parser = init_parser(); @@ -170,6 +171,7 @@ int main(int argc, char **argv) if(result) { fprintf(stderr, "parsed=%" PRId64 " bytes\n", result->bit_length/8); h_pprint(stdout, result->ast, 0, 0); + h_parse_result_free(result); return 0; } else { return 1; diff --git a/examples/ttuser.c b/examples/ttuser.c new file mode 100644 index 0000000000000000000000000000000000000000..c16e4625bb72d64f7803eec5f360b2cb98d10892 --- /dev/null +++ b/examples/ttuser.c @@ -0,0 +1,140 @@ +/* + * Example parser that demonstrates the use of user-defined token types. + * + * Note the custom printer function that hooks into h_pprint(). 
+ */ + +#include "../src/hammer.h" +#include "../src/glue.h" + + +/* + * custom tokens + */ + +HTokenType TT_SUBJ, TT_PRED, TT_OBJ, TT_ADJ, TT_ADVC; + +void +pprint(FILE *stream, const HParsedToken *tok, int indent, int delta) +{ + /* + * Pretty-printer rules: + * + * - Output 'indent' spaces after every newline you produce. + * - Do not add indent on the first line of output. + * - Do not add a trailing newline. + * - Indent sub-objects by adding 'delta' to 'indent'. + */ + + if (((HParsedToken *)tok->user)->token_type == TT_SEQUENCE) + fprintf(stream, "\n%*s", indent, ""); + h_pprint(stream, tok->user, indent, delta); +} + +/* XXX define umamb_sub as well */ + +void +init(void) +{ + TT_SUBJ = h_allocate_token_new("subject", NULL, pprint); + TT_PRED = h_allocate_token_new("predicate", NULL, pprint); + TT_OBJ = h_allocate_token_new("object", NULL, pprint); + TT_ADJ = h_allocate_token_new("adjective", NULL, pprint); + TT_ADVC = h_allocate_token_new("adverbial clause", NULL, pprint); +} + + +/* + * semantic actions + * + * Normally these would be more interesting, but for this example, we just wrap + * our tokens in their intended types. 
+ */ +HParsedToken *act_subj(const HParseResult *p, void *u) { + return H_MAKE(SUBJ, (void *)p->ast); +} +HParsedToken *act_pred(const HParseResult *p, void *u) { + return H_MAKE(PRED, (void *)p->ast); +} +HParsedToken *act_obj(const HParseResult *p, void *u) { + return H_MAKE(OBJ, (void *)p->ast); +} +HParsedToken *act_adj(const HParseResult *p, void *u) { + return H_MAKE(ADJ, (void *)p->ast); +} +HParsedToken *act_advc(const HParseResult *p, void *u) { + return H_MAKE(ADVC, (void *)p->ast); +} + + +/* + * grammar + */ + +HParser * +build_parser(void) +{ + /* words */ + #define W(X) h_whitespace(h_literal((const uint8_t *)(#X))) + H_RULE(art, h_choice(W(a), W(the), NULL)); + H_RULE(noun, h_choice(W(cat), W(dog), W(fox), W(tiger), W(lion), + W(bear), W(fence), W(tree), W(car), W(cow), NULL)); + H_RULE(verb, h_choice(W(eats), W(jumps), W(falls), NULL)); + H_ARULE(adj, h_choice(W(quick), W(slow), W(happy), W(lazy), W(cyan), + W(magenta), W(yellow), W(black), W(brown), NULL)); + H_RULE(adverb, h_choice(W(with), W(over), W(after), NULL)); + #undef W + + /* phrases */ + H_RULE(nphrase, h_sequence(art, h_many(adj), noun, NULL)); + + /* sentence structure */ + H_ARULE(subj, nphrase); + H_ARULE(pred, verb); + H_ARULE(obj, nphrase); + H_ARULE(advc, h_sequence(adverb, nphrase, NULL)); + H_RULE(sentnc, h_sequence(subj, pred, + h_optional(obj), h_optional(advc), NULL)); + + return sentnc; +} + + +/* + * main routine: read, parse, print + * + * input e.g.: + * "the quick brown fox jumps the fence with a cyan lion" + */ + +#include <stdio.h> +#include <inttypes.h> + +int +main(int argc, char **argv) +{ + uint8_t input[1024]; + size_t sz; + const HParser *parser; + const HParseResult *result; + + init(); + parser = build_parser(); + + sz = fread(input, 1, sizeof(input), stdin); + if (!feof(stdin)) { + fprintf(stderr, "too much input\n"); + return 1; + } + + result = h_parse(parser, input, sz); + if (!result) { + fprintf(stderr, "no parse\n"); + return 1; + } + + 
h_pprintln(stdout, result->ast); + fprintf(stderr, "consumed %" PRId64 "/%zu bytes.\n", + result->bit_length / 8, sz); + return 0; +} diff --git a/src/SConscript b/src/SConscript index 0c4f81ed3bcce5408e681ec92a7f0a9e677141f3..ccb6f5d8cb758219aa1ad83bc44124c0368e32ba 100644 --- a/src/SConscript +++ b/src/SConscript @@ -52,7 +52,8 @@ parsers = ['parsers/%s.c'%s for s in 'unimplemented', 'whitespace', 'xor', - 'value']] + 'value', + 'seek']] backends = ['backends/%s.c' % s for s in ['packrat', 'llk', 'regex', 'glr', 'lalr', 'lr', 'lr0']] @@ -98,9 +99,20 @@ if env['PLATFORM'] == 'win32': # prevent collision between .lib from dll and .lib for static lib static_library_name = 'hammer_s' -libhammer_shared = env.SharedLibrary('hammer', parsers + backends + misc_hammer_parts) -libhammer_static = env.StaticLibrary(static_library_name, parsers + backends + misc_hammer_parts) +if 'GPROF' in env and env['GPROF'] == 1: + # Disable the shared library (it won't work with gprof) and rename the static one + build_shared_library=False + static_library_name = 'hammer_pg' + +# Markers for later +libhammer_static = None +libhammer_shared = None + if build_shared_library: + libhammer_shared = env.SharedLibrary('hammer', parsers + backends + misc_hammer_parts) +libhammer_static = env.StaticLibrary(static_library_name, parsers + backends + misc_hammer_parts) + +if libhammer_shared is not None: Default(libhammer_shared, libhammer_static) env.Install('$libpath', [libhammer_static, libhammer_shared]) else: @@ -115,14 +127,20 @@ env.Install('$pkgconfigpath', '../../../libhammer.pc') if GetOption('with_tests'): testenv = env.Clone() testenv.ParseConfig('pkg-config --cflags --libs glib-2.0') - testenv.Append(LIBS=['hammer']) + if libhammer_shared is not None: + testenv.Append(LIBS=['hammer']) + else: + testenv.Append(LIBS=[static_library_name]) testenv.Prepend(LIBPATH=['.']) ctestexec = testenv.Program('test_suite', ctests + ['test_suite.c'], LINKFLAGS='--coverage' if 
testenv.GetOption('coverage') else None) ctest = Alias('testc', [ctestexec], ''.join(['env LD_LIBRARY_PATH=', os.path.dirname(ctestexec[0].path), ' ', ctestexec[0].path])) AlwaysBuild(ctest) testruns.append(ctest) -Export('libhammer_static libhammer_shared') +if libhammer_shared is not None: + Export('libhammer_static libhammer_shared') +else: + Export('libhammer_static') for b in env['bindings']: env.SConscript(['bindings/%s/SConscript' % b]) diff --git a/src/backends/lr.c b/src/backends/lr.c index f2ac4956d80358e51d35c0e70484013bbfde212a..6919bf6d0a8d284c95167adb07023914a49f89b4 100644 --- a/src/backends/lr.c +++ b/src/backends/lr.c @@ -420,6 +420,12 @@ void h_lr_parse_start(HSuspendedParser *s) s->backend_state = engine; } +// cf. comment before run_trace in regex.c +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunknown-pragmas" +#pragma GCC diagnostic ignored "-Wclobbered" +#endif bool h_lr_parse_chunk(HSuspendedParser* s, HInputStream *stream) { HLREngine *engine = s->backend_state; @@ -457,6 +463,10 @@ bool h_lr_parse_chunk(HSuspendedParser* s, HInputStream *stream) *stream = engine->input; return !run; // done if engine no longer running } +// Reenable -Wclobber +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic pop +#endif HParseResult *h_lr_parse_finish(HSuspendedParser *s) { diff --git a/src/backends/regex.c b/src/backends/regex.c index 9646ddd59343cacbd1cc53645161c88d70c15f78..c10c25890fd5bfdf5e3e9b37a64e988fd3010749 100644 --- a/src/backends/regex.c +++ b/src/backends/regex.c @@ -223,7 +223,7 @@ bool svm_stack_ensure_cap(HAllocator *mm__, HSVMContext *ctx, size_t addl) { * the second return; here, the only variables that could matter for * are arena and ctx (because they're referenced in "goto fail"). 
*/ -#ifdef __GNUC__ +#if defined(__GNUC__) && !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunknown-pragmas" #pragma GCC diagnostic ignored "-Wclobbered" @@ -311,7 +311,7 @@ HParseResult *run_trace(HAllocator *mm__, HRVMProg *orig_prog, HRVMTrace *trace, return NULL; } // Reenable -Wclobber -#ifdef __GNUC__ +#if defined(__GNUC__) && !defined(__clang__) #pragma GCC diagnostic pop #endif diff --git a/src/bindings/lua/hammer.lua b/src/bindings/lua/hammer.lua new file mode 100644 index 0000000000000000000000000000000000000000..2ee1656a098633801610a4ee181366d13dd69d10 --- /dev/null +++ b/src/bindings/lua/hammer.lua @@ -0,0 +1,371 @@ +local ffi = require("ffi") +ffi.cdef[[ +typedef enum HParserBackend_ { + PB_MIN = 0, + PB_PACKRAT = PB_MIN, // PB_MIN is always the default. + PB_REGULAR, + PB_LLk, + PB_LALR, + PB_GLR, + PB_MAX = PB_GLR +} HParserBackend; + +typedef enum HTokenType_ { + TT_NONE = 1, + TT_BYTES = 2, + TT_SINT = 4, + TT_UINT = 8, + TT_SEQUENCE = 16, + TT_RESERVED_1, // reserved for backend-specific internal use + TT_ERR = 32, + TT_USER = 64, + TT_MAX +} HTokenType; + +typedef struct HBytes_ { + const uint8_t *token; + size_t len; +} HBytes; + +typedef struct HArena_ HArena ; // hidden implementation + +typedef struct HCountedArray_ { + size_t capacity; + size_t used; + HArena * arena; + struct HParsedToken_ **elements; +} HCountedArray; + +typedef struct HParsedToken_ { + HTokenType token_type; + union { + HBytes bytes; + int64_t sint; + uint64_t uint; + double dbl; + float flt; + HCountedArray *seq; // a sequence of HParsedToken's + void *user; + }; + size_t index; + size_t bit_length; + char bit_offset; +} HParsedToken; + +typedef struct HParseResult_ { + const HParsedToken *ast; + int64_t bit_length; + HArena * arena; +} HParseResult; + +typedef struct HParserVtable_ HParserVtable; +typedef struct HCFChoice_ HCFChoice; + +typedef struct HParser_ { + const HParserVtable *vtable; + HParserBackend backend; + void* 
backend_data; + void *env; + HCFChoice *desugared; +} HParser; + +typedef struct HAllocator_ HAllocator; + +typedef HParsedToken* (*HAction)(const HParseResult *p, void* user_data); +typedef bool (*HPredicate)(HParseResult *p, void* user_data); +typedef HParser* (*HContinuation)(HAllocator *mm__, const HParsedToken *x, void *env); + +HParseResult* h_parse(const HParser* parser, const uint8_t* input, size_t length); +HParser* h_token(const uint8_t *str, const size_t len); +HParser* h_ch(const uint8_t c); +HParser* h_ch_range(const uint8_t lower, const uint8_t upper); +HParser* h_int_range(const HParser *p, const int64_t lower, const int64_t upper); +HParser* h_bits(size_t len, bool sign); +HParser* h_int64(); +HParser* h_int32(); +HParser* h_int16(); +HParser* h_int8(); +HParser* h_uint64(); +HParser* h_uint32(); +HParser* h_uint16(); +HParser* h_uint8(); +HParser* h_whitespace(const HParser* p); +HParser* h_left(const HParser* p, const HParser* q); +HParser* h_right(const HParser* p, const HParser* q); +HParser* h_middle(const HParser* p, const HParser* x, const HParser* q); +HParser* h_action(const HParser* p, const HAction a, void* user_data); +HParser* h_in(const uint8_t *charset, size_t length); +HParser* h_not_in(const uint8_t *charset, size_t length); +HParser* h_end_p(); +HParser* h_nothing_p(); +HParser* h_sequence(HParser* p, ...); +HParser* h_choice(HParser* p, ...); +HParser* h_permutation(HParser* p, ...); +HParser* h_butnot(const HParser* p1, const HParser* p2); +HParser* h_difference(const HParser* p1, const HParser* p2); +HParser* h_xor(const HParser* p1, const HParser* p2); +HParser* h_many(const HParser* p); +HParser* h_many1(const HParser* p); +HParser* h_repeat_n(const HParser* p, const size_t n); +HParser* h_optional(const HParser* p); +HParser* h_ignore(const HParser* p); +HParser* h_sepBy(const HParser* p); +HParser* h_sepBy1(const HParser* p); +HParser* h_epsilon_p(); +HParser* h_length_value(const HParser* length, const HParser* value); 
+HParser* h_attr_bool(const HParser* p, HPredicate pred, void* user_data); +HParser* h_and(const HParser* p); +HParser* h_not(const HParser* p); +HParser* h_indirect(const HParser* p); +void h_bind_indirect(HParser* indirect, const HParser* inner); +HParser* h_with_endianness(char endianness, const HParser* p); +HParser* h_put_value(const HParser* p, const char* name); +HParser* h_get_value(const char* name); +HParser* h_bind(const HParser *p, HContinuation k, void *env); + +int h_compile(HParser* parser, HParserBackend backend, const void* params); + +static const uint8_t BYTE_BIG_ENDIAN = 0x1; +static const uint8_t BIT_BIG_ENDIAN = 0x2; +static const uint8_t BYTE_LITTLE_ENDIAN = 0x0; +static const uint8_t BIT_LITTLE_ENDIAN = 0x0; +]] +local h = ffi.load("hammer") + +local function helper(a, n, b, ...) + if n == 0 then return a + else return b, helper(a, n-1, ...) end +end +local function append(a, ...) + return helper(a, select('#', ...), ...) +end + +local mt = { + __index = { + parse = function(p, str) return h.h_parse(p, str, #str) end, + }, +} +local hammer = {} +hammer.parser = ffi.metatype("HParser", mt) + +local counted_array +local arr_mt = { + __index = function(table, key) + return table.elements[key] + end, + __len = function(table) return table.used end, + __ipairs = function(table) + local i, n = 0, #table + return function() + i = i + 1 + if i <= n then + return i, table.elements[i] + end + end + end, + __call = function(self) + ret = {} + for i, v in ipairs(self) + do ret[#ret+1] = v + end + return ret + end +} +counted_array = ffi.metatype("HCountedArray", arr_mt) + +local bytes_mt = { + __call = function(self) + local ret = "" + for i = 0, tonumber(ffi.cast("uintptr_t", ffi.cast("void *", self.len)))-1 + do ret = ret .. 
string.char(self.token[i]) + end + return ret + end +} +local byte_string = ffi.metatype("HBytes", bytes_mt) + +local token_types = ffi.new("HTokenType") + +local parsed_token +local tok_mt = { + __call = function(self) + if self.token_type == ffi.C.TT_BYTES then + return self.bytes() + elseif self.token_type == ffi.C.TT_SINT then + return tonumber(ffi.cast("intptr_t", ffi.cast("void *", self.sint))) + elseif self.token_type == ffi.C.TT_UINT then + return tonumber(ffi.cast("uintptr_t", ffi.cast("void *", self.uint))) + elseif self.token_type == ffi.C.TT_SEQUENCE then + return self.seq() + end + end +} +parsed_token = ffi.metatype("HParsedToken", tok_mt) + +function hammer.token(str) + return h.h_token(str, #str) +end +function hammer.ch(c) + if type(c) == "number" then + return h.h_ch(c) + else + return h.h_ch(c:byte()) + end +end +function hammer.ch_range(lower, upper) + if type(lower) == "number" and type(upper) == "number" then + return h.h_ch_range(lower, upper) + -- FIXME this is really not thorough type checking + else + return h.h_ch_range(lower:byte(), upper:byte()) + end +end +function hammer.int_range(parser, lower, upper) + return h.h_int_range(parser, lower, upper) +end +function hammer.bits(len, sign) + return h.h_bits(len, sign) +end +function hammer.int64() + return h.h_int64() +end +function hammer.int32() + return h.h_int32() +end +function hammer.int16() + return h.h_int16() +end +function hammer.int8() + return h.h_int8() +end +function hammer.uint64() + return h.h_uint64() +end +function hammer.uint32() + return h.h_uint32() +end +function hammer.uint16() + return h.h_uint16() +end +function hammer.uint8() + return h.h_uint8() +end +function hammer.whitespace(parser) + return h.h_whitespace(parser) +end +function hammer.left(parser1, parser2) + return h.h_left(parser1, parser2) +end +function hammer.right(parser1, parser2) + return h.h_right(parser1, parser2) +end +function hammer.middle(parser1, parser2, parser3) + return h.h_middle(parser1, 
parser2, parser3) +end +-- There could also be an overload of this that doesn't +-- bother with the env pointer, and passes it as NIL by +-- default, but I'm not going to deal with overloads now. +function hammer.action(parser, action, user_data) + local cb = ffi.cast("HAction", action) + return h.h_action(parser, cb, user_data) +end +function hammer.in_(charset) + local cs = ffi.new("const unsigned char[" .. #charset .. "]", charset) + return h.h_in(cs, #charset) +end +function hammer.not_in(charset) + return h.h_not_in(charset, #charset) +end +function hammer.end_p() + return h.h_end_p() +end +function hammer.nothing_p() + return h.h_nothing_p() +end +function hammer.sequence(parser, ...) + local parsers = append(nil, ...) + return h.h_sequence(parser, parsers) +end +function hammer.choice(parser, ...) + local parsers = append(nil, ...) + return h.h_choice(parser, parsers) +end +function hammer.permutation(parser, ...) + local parsers = append(nil, ...) + return h.h_permutation(parser, parsers) +end +function hammer.butnot(parser1, parser2) + return h.h_butnot(parser1, parser2) +end +function hammer.difference(parser1, parser2) + return h.h_difference(parser1, parser2) +end +function hammer.xor(parser1, parser2) + return h.h_xor(parser1, parser2) +end +function hammer.many(parser) + return h.h_many(parser) +end +function hammer.many1(parser) + return h.h_many1(parser) +end +function hammer.repeat_n(parser, n) + return h.h_repeat_n(parser, n) +end +function hammer.optional(parser) + return h.h_optional(parser) +end +function hammer.ignore(parser) + return h.h_ignore(parser) +end +function hammer.sepBy(parser) + return h.h_sepBy(parser) +end +function hammer.sepBy1(parser) + return h.h_sepBy1(parser) +end +function hammer.epsilon_p() + return h.h_epsilon_p() +end +function hammer.length_value(length, value) + return h.h_length_value(length, value) +end +function hammer.attr_bool(parser, predicate, user_data) + local cb = ffi.cast("HPredicate", predicate) + return 
h.h_attr_bool(parser, cb, user_data) +end +function hammer.and_(parser) + return h.h_and(parser) +end +function hammer.not_(parser) + return h.h_not(parser) +end +function hammer.indirect(parser) + return h.h_indirect(parser) +end +function hammer.bind_indirect(indirect, inner) + return h.h_bind_indirect(indirect, inner) +end +function hammer.with_endianness(endianness, parser) + return h.h_with_endianness(endianness, parser) +end +function hammer.put_value(parser, name) + return h.h_put_value(parser, name) +end +function hammer.get_value(name) + return h.h_get_value(name) +end +function hammer.bind(parser, continuation, env) + local cb = ffi.cast("HContinuation", continuation) + return h.h_bind(parser, cb, env) +end + +function hammer.compile(parser, backend, params) + return h.h_compile(parser, backend, params) +end + +hammer.BYTE_BIG_ENDIAN = 0x1; +hammer.BIT_BIG_ENDIAN = 0x2; +hammer.BYTE_LITTLE_ENDIAN = 0x0; +hammer.BIT_LITTLE_ENDIAN = 0x0; +return hammer \ No newline at end of file diff --git a/src/bindings/lua/test.lua b/src/bindings/lua/test.lua new file mode 100644 index 0000000000000000000000000000000000000000..cc32ce28ddfa003ea9c6c1eebbe7036e087c3685 --- /dev/null +++ b/src/bindings/lua/test.lua @@ -0,0 +1,844 @@ +describe("Combinator tests", function() + local hammer + + setup(function() + hammer = require("hammer") + ffi = require("ffi") + end) + + teardown(function() + hammer = nil + end) + + describe("Token tests", function() + local parser = hammer.token("95" .. string.char(0xa2)) + it("parses a token", function() + local ret = parser:parse("95" .. string.char(0xa2)) + assert.are.same("95" .. 
string.char(0xa2), ret.ast.bytes()) + end) + it("does not parse an incomplete token", function() + local ret = parser:parse("95") + assert.is_falsy(ret) + end) + end) + + describe("Char tests", function() + local parser = hammer.ch(0xa2) + it("parses a matching char", function() + local ret = parser:parse(string.char(0xa2)) + assert.are.same(string.char(0xa2), string.char(ret.ast())) + end) + it("rejects a non-matching char", function() + local ret = parser:parse(string.char(0xa3)) + assert.is_falsy(ret) + end) + end) + + describe("Char range tests", function() + local parser = hammer.ch_range("a", "c") + it("parses a char in the range", function() + local ret = parser:parse("b") + assert.are.same("b", string.char(ret.ast())) + end) + it("rejects a char outside the range", function() + local ret = parser:parse("d") + assert.is_falsy(ret) + end) + end) + + describe("Signed 64-bit int tests", function() + local parser = hammer.int64() + it("parses a valid 64-bit int", function() + local ret = parser:parse(string.char(0xff, 0xff, 0xff, 0xfe, 0x00, 0x00, 0x00, 0x00)) + assert.are.same(-0x200000000, ret.ast.sint) + end) + it("does not parse an invalid 64-bit int", function() + local ret = parser:parse(string.char(0xff, 0xff, 0xff, 0xfe, 0x00, 0x00, 0x00)) + assert.is_falsy(ret) + end) + end) + + describe("Signed 32-bit int tests", function() + local parser = hammer.int32() + it("parses a valid 32-bit int", function() + local ret = parser:parse(string.char(0xff, 0xfe, 0x00, 0x00)) + assert.are.same(-0x20000, ret.ast.sint) + end) + it("does not parse an invalid 32-bit int", function() + local ret = parser:parse(string.char(0xff, 0xfe, 0x00)) + assert.is_falsy(ret) + end) + end) + + describe("Signed 16-bit int tests", function() + local parser = hammer.int16() + it("parses a valid 16-bit int", function() + local ret = parser:parse(string.char(0xfe, 0x00)) + assert.are.same(-0x200, ret.ast.sint) + end) + it("does not parse an invalid 16-bit int", function() + local ret = 
parser:parse(string.char(0xfe)) + assert.is_falsy(ret) + end) + end) + + describe("Signed 8-bit int tests", function() + local parser = hammer.int8() + it("parses a valid 8-bit int", function() + local ret = parser:parse(string.char(0x88)) + assert.are.same(-0x78, ret.ast.sint) + end) + it("does not parse an invalid 8-bit int", function() + local ret = parser:parse("") + assert.is_falsy(ret) + end) + end) + + describe("Unsigned 64-bit int tests", function() + local parser = hammer.uint64() + it("parses a valid 64-bit unsigned int", function() + local ret = parser:parse(string.char(0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00)) + assert.are.same(0x200000000, ret.ast()) + end) + it("does not parse an invalid 64-bit unsigned int", function() + local ret = parser:parse(string.char(0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00)) + assert.is_falsy(ret) + end) + end) + + describe("Unsigned 32-bit int tests", function() + local parser = hammer.uint32() + it("parses a valid 32-bit unsigned int", function() + local ret = parser:parse(string.char(0x00, 0x02, 0x00, 0x00)) + assert.are.same(0x20000, ret.ast()) + end) + it("does not parse an invalid 32-bit unsigned int", function() + local ret = parser:parse(string.char(0x00, 0x02, 0x00)) + assert.is_falsy(ret) + end) + end) + + describe("Unsigned 16-bit int tests", function() + local parser = hammer.uint16() + it("parses a valid 16-bit unsigned int", function() + local ret = parser:parse(string.char(0x02, 0x00)) + assert.are.same(0x200, ret.ast()) + end) + it("does not parse an invalid 16-bit unsigned int", function() + local ret = parser:parse(string.char(0x02)) + assert.is_falsy(ret) + end) + end) + + describe("Unsigned 8-bit int tests", function() + local parser = hammer.uint8() + it("parses a valid 8-bit unsigned int", function() + local ret = parser:parse(string.char(0x78)) + assert.are.same(0x78, ret.ast()) + end) + it("does not parse an invalid 8=bit unsigned int", function() + local ret = parser:parse("") + 
assert.is_falsy(ret) + end) + end) + + describe("Integer range tests", function() + local parser = hammer.int_range(hammer.uint8(), 3, 10) + it("parses a value in the range", function() + local ret = parser:parse(string.char(0x05)) + assert.are.same(5, ret.ast()) + end) + it("does not parse a value outside the range", function() + local ret = parser:parse(string.char(0xb)) + assert.is_falsy(ret) + end) + end) + + describe("Whitespace tests", function() + local parser = hammer.whitespace(hammer.ch("a")) + local parser2 = hammer.whitespace(hammer.end_p()) + it("parses a string with no whitespace", function() + local ret = parser:parse("a") + assert.are.same("a", string.char(ret.ast())) + end) + it("parses a string with a leading space", function() + local ret = parser:parse(" a") + assert.are.same("a", string.char(ret.ast())) + end) + it("parses a string with leading spaces", function() + local ret = parser:parse(" a") + assert.are.same("a", string.char(ret.ast())) + end) + it("parses a string with a leading tab", function() + local ret = parser:parse("\ta") + assert.are.same("a", string.char(ret.ast())) + end) + it("does not parse a string with a leading underscore", function() + local ret = parser:parse("_a") + assert.is_falsy(ret) + end) + it("parses an empty string", function() + local ret = parser2:parse("") + assert.are.same(nil, ret.ast) + end) + it("parses a whitespace-only string", function() + local ret = parser2:parse(" ") + assert.are.same(nil, ret.ast) + end) + it("does not parse a string with leading whitespace and a trailing character", function() + local ret = parser2:parse(" x") + assert.is_falsy(ret) + end) + end) + + describe("Leftmost-parser tests", function() + local parser = hammer.left(hammer.ch("a"), hammer.ch(" ")) + it("parses the leftmost character", function() + local ret = parser:parse("a ") + assert.are.same("a", string.char(ret.ast())) + end) + it("does not parse a string that is too short", function() + local ret = parser:parse("a") + 
assert.is_falsy(ret) + end) + it("does not parse a string that starts with the wrong character", function() + local ret = parser:parse(" ") + assert.is_falsy(ret) + end) + it("does not parse a string with the wrong character in the second place", function() + local ret = parser:parse("ab") + assert.is_falsy(ret) + end) + end) + + describe("Rightmost-parser tests", function() + local parser = hammer.right(hammer.ch(" "), hammer.ch("a")) + it("parses the rightmost character", function() + local ret = parser:parse(" a") + assert.are.same("a", string.char(ret.ast())) + end) + it("does not parse a string that starts with the wrong character", function() + local ret = parser:parse("a") + assert.is_falsy(ret) + end) + it("does not parse a string that is too short", function() + local ret = parser:parse(" ") + assert.is_falsy(ret) + end) + it("does not parse a string with the characters in the wrong order", function() + local ret = parser:parse("ba") + assert.is_falsy(ret) + end) + end) + + describe("Middle-parser tests", function() + local parser = hammer.middle(hammer.ch(" "), hammer.ch("a"), hammer.ch(" ")) + it("parses the middle character", function() + local ret = parser:parse(" a ") + assert.are.same("a", string.char(ret.ast())) + end) + it("does not parse a string that is too short", function() + local ret = parser:parse("a") + assert.is_falsy(ret) + ret = parser:parse(" ") + assert.is_falsy(ret) + ret = parser:parse(" a") + assert.is_falsy(ret) + ret = parser:parse("a ") + assert.is_falsy(ret) + end) + it("does not parse a string with the wrong character in the middle", function() + ret = parser:parse(" b ") + assert.is_falsy(ret) + end) + it("does not parse a string that starts with the wrong character", function() + ret = parser:parse("ba ") + assert.is_falsy(ret) + end) + it("does not parse a string that ends with the wrong character", function() + ret = parser:parse(" ab") + assert.is_falsy(ret) + end) + end) + + describe("Semantic action tests", function() + 
local function upcase(result, user_data) + local chars = result.ast() + local ret = "" + for i, v in ipairs(chars) + do ret = ret .. string.char(v()):upper() + end + return ffi.new("HParsedToken", {hammer.TT_BYTES, ret}) + end + local parser = hammer.action(hammer.sequence(hammer.choice(hammer.ch("a"), hammer.ch("A")), hammer.choice(hammer.ch("b"), hammer.ch("B"))), upcase, nil) + it("converts a lowercase 'ab' to uppercase", function() + local ret = parser:parse("ab") + assert.are.same("AB", ret.ast()) + end) + it("accepts an uppercase 'AB' unchanged", function() + local ret = parser:parse("AB") + assert.are.same("AB", ret.ast()) + end) + it("rejects strings that don't match the underlying parser", function() + local ret = parser:parse("XX") + assert.is_falsy(ret) + end) + end) + + describe("Character set membership tests", function() + local parser = hammer.in_({"a", "b", "c"}) + it("parses a character that is in the included set", function() + local ret = parser:parse("b") + assert.are.same("b", string.char(ret.ast())) + end) + it("does not parse a character that is not in the included set", function() + local ret = parser:parse("d") + assert.is_falsy(ret) + end) + end) + + describe("Character set non-membership tests", function() + local parser = hammer.not_in({"a", "b", "c"}) + it("parses a character that is not in the excluded set", function() + local ret = parser:parse("d") + assert.are.same("d", string.char(ret.ast())) + end) + it("does not parse a character that is in the excluded set", function() + local ret = parser:parse("a") + assert.is_falsy(ret) + end) + end) + + describe("End-of-input tests", function() + local parser = hammer.sequence(hammer.ch("a"), hammer.end_p()) + it("parses a string that ends where it is expected to", function() + local ret = parser:parse("a") + assert.are.same({"a"}, ret.ast()) + end) + it("does not parse a string that is too long", function() + local ret = parser:parse("aa") + assert.is_falsy(ret) + end) + end) + + 
describe("Bottom parser tests", function() + local parser = hammer.nothing_p() + it("always fails", function() + local ret = parser:parse("a") + assert.is_falsy(ret) + end) + end) + + describe("Parser sequence tests", function() + local parser = hammer.sequence(hammer.ch("a"), hammer.ch("b")) + local parser2 = hammer.sequence(hammer.ch("a"), hammer.whitespace(hammer.ch("b"))) + it("parses a string matching the sequence", function() + local ret = parser:parse("ab") + assert.are.same({"a", "b"}, ret.ast()) + end) + it("does not parse a string that is too short", function() + local ret = parser:parse("a") + assert.is_falsy(ret) + end) + it("does not parse a string with the sequence out of order", function() + local ret = parser:parse("ba") + assert.is_falsy(ret) + end) + it("parses a whitespace-optional string with no whitespace", function() + local ret = parser2:parse("ab") + assert.are.same({"a", "b"}, ret.ast()) + end) + -- it("parses a whitespace-optional string containing whitespace", function() + -- local ret = parser:parse("a b") + -- assert.are.same({"a", "b"}, ret.ast()) -- this is the line that segfaults + -- print("in sequence") + -- ret = parser:parse("a b") + -- assert.are.same({"a", "b"}, ret.ast()) + -- end) + end) + + describe("Choice-of-parsers tests", function() + local parser = hammer.choice(hammer.ch("a"), hammer.ch("b")) + it("parses a character in the choice set", function() + local ret = parser:parse("a") + assert.are.same("a", string.char(ret.ast())) + ret = parser:parse("b") + assert.are.same("b", string.char(ret.ast())) + end) + it("does not parse a character not in the choice set", function() + local ret = parser:parse("c") + assert.is_falsy(ret) + end) + end) + + describe("X-but-not-Y tests", function() + local parser = hammer.butnot(hammer.ch("a"), hammer.token("ab")) + local parser2 = hammer.butnot(hammer.ch_range("0", "9"), hammer.ch("6")) + it("succeeds when 'a' matches but 'ab' doesn't", function() + local ret = parser:parse("a") + 
assert.are.same("a", string.char(ret.ast())) + ret = parser:parse("aa") + assert.are.same("a", string.char(ret.ast())) + end) + it("fails when p2's result is longer than p1's", function() + local ret = parser:parse("ab") + assert.is_falsy(ret) + end) + it("fails when p2's result is the same length as p1's", function() + local ret = parser2:parse("6") + assert.is_falsy(ret) + end) + end) + + describe("Difference-of-parsers tests", function() + local parser = hammer.difference(hammer.token("ab"), hammer.ch("a")) + it("succeeds when 'ab' matches and its result is longer than the result for 'a'", function() + local ret = parser:parse("ab") + assert.are.same("ab", ret.ast()) + end) + it("fails if 'ab' doesn't match", function() + local ret = parser:parse("a") + assert.is_falsy(ret) + end) + end) + + describe("XOR-of-parsers tests", function() + local parser = hammer.xor(hammer.ch_range("0", "6"), hammer.ch_range("5", "9")) + it("parses a value only in the first range", function() + local ret = parser:parse("0") + assert.are.same("0", string.char(ret.ast())) + end) + it("parses a value only in the second range", function() + local ret = parser:parse("9") + assert.are.same("9", string.char(ret.ast())) + end) + it("does not parse a value inside both ranges", function() + local ret = parser:parse("5") + assert.is_falsy(ret) + end) + it("does not parse a value outside the range", function() + local ret = parser:parse("a") + assert.is_falsy(ret) + end) + end) + + describe("Kleene * tests", function() + local parser = hammer.many(hammer.choice(hammer.ch("a"), hammer.ch("b"))) + it("parses an empty string", function() + local ret = parser:parse("") + assert.are.same({}, ret.ast()) + end) + it("parses a single repetition of the pattern", function() + local ret = parser:parse("a") + assert.are.same({"a"}, ret.ast()) + ret = parser:parse("b") + assert.are.same({"b"}, ret.ast()) + end) + it("parses multiple repetitions of the pattern", function() + local ret = 
parser:parse("aabbaba") + assert.are.same({"a", "a", "b", "b", "a", "b", "a"}, ret.ast()) + end) + end) + + describe("Kleene + tests", function() + local parser = hammer.many1(hammer.choice(hammer.ch("a"), hammer.ch("b"))) + it("does not parse an empty string", function() + local ret = parser:parse("") + assert.is_falsy(ret) + end) + it("parses a single repetition of the pattern", function() + local ret = parser:parse("a") + assert.are.same({"a"}, ret.ast()) + ret = parser:parse("b") + assert.are.same({"b"}, ret.ast()) + end) + it("parses multiple repetitions of the pattern", function() + local ret = parser:parse("aabbaba") + assert.are.same({"a", "a", "b", "b", "a", "b", "a"}, ret.ast()) + end) + it("does not parse a string that does not start with one of the patterns to repeat", function() + local ret = parser:parse("daabbabadef") + assert.is_falsy(ret) + end) + end) + + describe("Fixed-number-of-repetitions tests", function() + local parser = hammer.repeat_n(hammer.choice(hammer.ch("a"), hammer.ch("b")), 2) + it("does not parse a string without enough repetitions", function() + local ret = parser:parse("adef") + assert.is_falsy(ret) + end) + it("parses a string containing the correct number of repetitions", function() + local ret = parser:parse("abdef") + assert.are.same({"a", "b"}, ret.ast()) + end) + it("does not parse a string that does not start with a character in the repetition set", function() + local ret = parser:parse("dabdef") + assert.is_falsy(ret) + end) + end) + + describe("Kleene ? 
tests", function() + local parser = hammer.sequence(hammer.ch("a"), hammer.optional(hammer.choice(hammer.ch("b"), hammer.ch("c"))), hammer.ch("d")) + it("parses a string containing either optional character", function() + local ret = parser:parse("abd") + assert.are.same({"a", "b", "d"}, ret.ast()) + ret = parser:parse("acd") + assert.are.same({"a", "c", "d"}, ret.ast()) + end) + it("parses a string missing one of the optional characters", function() + local ret = parser:parse("ad") + assert.are.same({"a", {}, "d"}, ret.ast()) + end) + it("does not parse a string containing a character not among the optional ones", function() + local ret = parser:parse("aed") + assert.is_falsy(ret.ast) + end) + end) + + describe("'ignore' decorator tests", function() + local parser = hammer.sequence(hammer.ch("a"), hammer.ignore(hammer.ch("b")), hammer.ch("c")) + it("parses a string containing the pattern to ignore, and leaves that pattern out of the result", function() + local ret = parser:parse("abc") + assert.are.same({"a", "c"}, ret.ast()) + end) + it("does not parse a string not containing the pattern to ignore", function() + local ret = parser:parse("ac") + assert.is_falsy(ret) + end) + end) + + describe("Possibly-empty separated lists", function() + local parser = hammer.sepBy(hammer.choice(hammer.ch("1"), hammer.ch("2"), hammer.ch("3")), hammer.ch(",")) + it("parses an ordered list", function() + local ret = parser:parse("1,2,3") + assert.are.same({"1", "2", "3"}, ret.ast()) + end) + it("parses an unordered list", function() + local ret = parser:parse("1,3,2") + assert.are.same({"1", "3", "2"}, ret.ast()) + end) + it("parses a list not containing all options", function() + local ret = parser:parse("1,3") + assert.are.same({"1", "3"}, ret.ast()) + end) + it("parses a unary list", function() + local ret = parser:parse("3") + assert.are.same({"3"}, ret.ast()) + end) + it("parses an empty list", function() + local ret = parser:parse("") + assert.are.same({}, ret.ast()) + end) + 
end) + + describe("Non-empty separated lists", function() + local parser = hammer.sepBy1(hammer.choice(hammer.ch("1"), hammer.ch("2"), hammer.ch("3")), hammer.ch(",")) + it("parses an ordered list", function() + local ret = parser:parse("1,2,3") + assert.are.same({"1", "2", "3"}, ret.ast()) + end) + it("parses an unordered list", function() + local ret = parser:parse("1,3,2") + assert.are.same({"1", "3", "2"}, ret.ast()) + end) + it("parses a list not containing all options", function() + local ret = parser:parse("1,3") + assert.are.same({"1", "3"}, ret.ast()) + end) + -- it("parses a unary list", function() + -- local ret = parser:parse("3") + -- print("in sepBy1") + -- assert.are.same({"3"}, ret.ast()) -- this line also segfaults + -- end) + it("does not parse an empty list", function() + local ret = parser:parse("") + assert.is_falsy(ret) + end) + end) + + describe("Empty string tests", function() + local parser = hammer.sequence(hammer.ch("a"), hammer.epsilon_p(), hammer.ch("b")) + local parser2 = hammer.sequence(hammer.epsilon_p(), hammer.ch("a")) + local parser3 = hammer.sequence(hammer.ch("a"), hammer.epsilon_p()) + it("parses an empty string between two characters", function() + local ret = parser:parse("ab") + assert.are.same({"a", "b"}, ret.ast()) + end) + it("parses an empty string before a character", function() + local ret = parser2:parse("a") + assert.are.same({"a"}, ret.ast()) + end) + it("parses an empty string after a character", function() + local ret = parser3:parse("a") + assert.are.same({"a"}, ret.ast()) + end) + end) + + describe("Attribute validation tests", function() + local function equals(result, user_data) + return result.ast.seq.elements[0].uint == result.ast.seq.elements[1].uint + end + local parser = hammer.attr_bool(hammer.many1(hammer.choice(hammer.ch("a"), hammer.ch("b"))), equals) + it("parses successfully when both characters are the same (i.e., the validation function succeeds)", function() + local ret = parser:parse("aa") + 
assert.are.same({"a", "a"}, ret.ast()) + ret = parser:parse("bb") + assert.are.same({"b", "b"}, ret.ast()) + end) + it("does not parse successfully when the characters are different (i.e., the validation function fails)", function() + local ret = parser:parse("ab") + assert.is_falsy(ret) + end) + end) + + describe("Matching lookahead tests", function() + local parser = hammer.sequence(hammer.and_(hammer.ch("0")), hammer.ch("0")) + local parser2 = hammer.sequence(hammer.and_(hammer.ch("0")), hammer.ch("1")) + local parser3 = hammer.sequence(hammer.ch("1"), hammer.and_(hammer.ch("2"))) + it("parses successfully when the lookahead matches the next character to parse", function() + local ret = parser:parse("0") + assert.are.same({"0"}, ret.ast()) + end) + it("does not parse successfully when the lookahead does not match the next character to parse", function() + local ret = parser2:parse("0") + assert.is_falsy(ret) + end) + it("parses successfully when the lookahead is there", function() + local ret = parser3:parse("12") + assert.are.same({"1"}, ret.ast()) + end) + end) + + describe("Non-matching lookahead tests", function() + local parser = hammer.sequence(hammer.ch("a"), hammer.choice(hammer.ch("+"), hammer.token("++")), hammer.ch("b")) + local parser2 = hammer.sequence(hammer.ch("a"), hammer.choice(hammer.sequence(hammer.ch("+"), hammer.not_(hammer.ch("+"))), hammer.token("++")), hammer.ch("b")) + it("parses a single plus correctly in the 'choice' example", function() + local ret = parser:parse("a+b") + assert.are.same({"a", "+", "b"}, ret.ast()) + end) + it("does not parse a double plus correctly in the 'choice' example", function() + local ret = parser:parse("a++b") + assert.is_falsy(ret) + end) + it("parses a single plus correctly in the 'not' example", function() + local ret = parser2:parse("a+b") + assert.are.same({"a", {"+"}, "b"}, ret.ast()) + end) + it("parses a double plus correctly in the 'not' example", function() + local ret = parser2:parse("a++b") + 
assert.are.same({"a", "++", "b"}, ret.ast()) + end) + end) + + describe("Left recursion tests", function() + local parser = hammer.indirect() + hammer.bind_indirect(parser, hammer.choice(hammer.sequence(parser, hammer.ch("a")), hammer.ch("a"))) + -- it("parses the base case", function() + -- print("in leftrec") + -- local ret = parser:parse("a") -- this line segfaults + -- assert.are.same({"a"}, ret.ast()) + -- end) + it("parses one level of recursion", function() + local ret = parser:parse("aa") + assert.are.same({"a", "a"}, ret.ast()) + end) + it("parses two levels of recursion", function() + local ret = parser:parse("aaa") + assert.are.same({{"a", "a"}, "a"}, ret.ast()) + end) + end) + + describe("Right recursion tests", function() + local parser = hammer.indirect() + hammer.bind_indirect(parser, hammer.choice(hammer.sequence(hammer.ch("a"), parser), hammer.epsilon_p())) + it("parses the base case", function() + local ret = parser:parse("a") + assert.are.same({"a"}, ret.ast()) + end) + it("parses one level of recursion", function() + local ret = parser:parse("aa") + assert.are.same({"a", {"a"}}, ret.ast()) + end) + it("parses two levels of recursion", function() + local ret = parser:parse("aaa") + assert.are.same({"a", {"a", {"a"}}}, ret.ast()) + end) + end) + + describe("Endianness tests", function() + local bit = require("bit") + local u32 = hammer.uint32() + local u5 = hammer.bits(5, false) + local bb = bit.bor(hammer.BYTE_BIG_ENDIAN, hammer.BIT_BIG_ENDIAN) + local bl = bit.bor(hammer.BYTE_BIG_ENDIAN, hammer.BIT_LITTLE_ENDIAN) + local lb = bit.bor(hammer.BYTE_LITTLE_ENDIAN, hammer.BIT_BIG_ENDIAN) + local ll = bit.bor(hammer.BYTE_LITTLE_ENDIAN, hammer.BIT_LITTLE_ENDIAN) + local parser1 = hammer.with_endianness(bb, u32) + local parser2 = hammer.with_endianness(bb, u5) + local parser3 = hammer.with_endianness(ll, u32) + local parser4 = hammer.with_endianness(ll, u5) + local parser5 = hammer.with_endianness(bl, u32) + local parser6 = hammer.with_endianness(bl, 
u5) + local parser7 = hammer.with_endianness(lb, u32) + local parser8 = hammer.with_endianness(lb, u5) + it("parses big-endian cases", function() + local ret = parser1:parse("abcd") + assert.are.same(0x61626364, ret.ast()) + ret = parser2:parse("abcd") + assert.are.same(0xc, ret.ast()) + end) + it("parses little-endian cases", function() + local ret = parser3:parse("abcd") + assert.are.same(0x61626364, ret.ast()) + ret = parser4:parse("abcd") + assert.are.same(0xc, ret.ast()) + end) + it("parses mixed-endian cases", function() + local ret = parser5:parse("abcd") + assert.are.same(0x61626364, ret.ast()) + ret = parser6:parse("abcd") + assert.are.same(0x1, ret.ast()) + ret = parser7:parse("abcd") + assert.are.same(0x64636261, ret.ast()) + ret = parser8:parse("abcd") + assert.are.same(0xc, ret.ast()) + end) + end) + + describe("Symbol table tests", function() + local parser = hammer.sequence(hammer.put_value(hammer.uint8(), "size"), hammer.token("foo"), hammer.length_value(hammer.get_value("size"), hammer.uint8())) + it("parses a string that has enough bytes for the specified length", function() + local ret = parser:parse(string.char(0x06) .. "fooabcdef") + assert.are.same("foo", ret.ast()[2]) + assert.are.same({0x61, 0x62, 0x63, 0x64, 0x65, 0x66}, ret.ast()[3]) + end) + it("does not parse a string that does not have enough bytes for the specified length", function() + local ret = parser:parse(string.char(0x06) .. 
"fooabcde") + assert.is_falsy(ret) + end) + end) + + describe("Permutation tests", function() + local parser = hammer.permutation(hammer.ch("a"), hammer.ch("b"), hammer.ch("c")) + it("parses a permutation of 'abc'", function() + local ret = parser:parse("abc") + assert.are.same({"a", "b", "c"}, ret.ast()) + ret = parser:parse("acb") + assert.are.same({"a", "c", "b"}, ret.ast()) + ret = parser:parse("bac") + assert.are.same({"b", "a", "c"}, ret.ast()) + ret = parser:parse("bca") + assert.are.same({"b", "c", "a"}, ret.ast()) + ret = parser:parse("cab") + assert.are.same({"c", "a", "b"}, ret.ast()) + ret = parser:parse("cba") + assert.are.same({"c", "b", "a"}, ret.ast()) + end) + it("does not parse a string that is not a permutation of 'abc'", function() + local ret = parser:parse("a") + assert.is_falsy(ret) + ret = parser:parse("ab") + assert.is_falsy(ret) + ret = parser:parse("abb") + assert.is_falsy(ret) + end) + parser = hammer.permutation(hammer.ch("a"), hammer.ch("b"), hammer.optional(hammer.ch("c"))) + it("parses a string that is a permutation of 'ab[c]'", function() + local ret = parser:parse("abc") + assert.are.same({"a", "b", "c"}, ret.ast()) + ret = parser:parse("acb") + assert.are.same({"a", "c", "b"}, ret.ast()) + ret = parser:parse("bac") + assert.are.same({"b", "a", "c"}, ret.ast()) + ret = parser:parse("bca") + assert.are.same({"b", "c", "a"}, ret.ast()) + ret = parser:parse("cab") + assert.are.same({"c", "a", "b"}, ret.ast()) + ret = parser:parse("cba") + assert.are.same({"c", "b", "a"}, ret.ast()) + ret = parser:parse("ab") + assert.are.same({"a", "b"}, ret.ast()) + ret = parser:parse("ba") + assert.are.same({"b", "a"}, ret.ast()) + end) + it("does not parse a string that is not a permutation of 'ab[c]'", function() + local ret = parser:parse("a") + assert.is_falsy(ret) + ret = parser:parse("b") + assert.is_falsy(ret) + ret = parser:parse("c") + assert.is_falsy(ret) + ret = parser:parse("ca") + assert.is_falsy(ret) + ret = parser:parse("cb") + 
assert.is_falsy(ret) + ret = parser:parse("cc") + assert.is_falsy(ret) + ret = parser:parse("ccab") + assert.is_falsy(ret) + ret = parser:parse("ccc") + assert.is_falsy(ret) + end) + parser = hammer.permutation(hammer.optional(hammer.ch("c")), hammer.ch("a"), hammer.ch("b")) + it("parses a string that is a permutation of '[c]ab'", function() + local ret = parser:parse("abc") + assert.are.same({"a", "b", "c"}, ret.ast()) + ret = parser:parse("acb") + assert.are.same({"a", "c", "b"}, ret.ast()) + ret = parser:parse("bac") + assert.are.same({"b", "a", "c"}, ret.ast()) + ret = parser:parse("bca") + assert.are.same({"b", "c", "a"}, ret.ast()) + ret = parser:parse("cab") + assert.are.same({"c", "a", "b"}, ret.ast()) + ret = parser:parse("cba") + assert.are.same({"c", "b", "a"}, ret.ast()) + ret = parser:parse("ab") + assert.are.same({"a", "b"}, ret.ast()) + ret = parser:parse("ba") + assert.are.same({"b", "a"}, ret.ast()) + end) + it("does not parse a string that is not a permutation of '[c]ab'", function() + local ret = parser:parse("a") + assert.is_falsy(ret) + ret = parser:parse("b") + assert.is_falsy(ret) + ret = parser:parse("c") + assert.is_falsy(ret) + ret = parser:parse("ca") + assert.is_falsy(ret) + ret = parser:parse("cb") + assert.is_falsy(ret) + ret = parser:parse("cc") + assert.is_falsy(ret) + ret = parser:parse("ccab") + assert.is_falsy(ret) + ret = parser:parse("ccc") + assert.is_falsy(ret) + end) + end) + + -- describe("Monadic binding tests", function() + -- local function continuation(allocator, result, env) + -- local val = 0 + -- for k, v in result.seq + -- do val = val*10 + v->uint - 48 + -- end + -- if val > 26 then + -- return nil + -- else + -- return hammer.ch + -- end + -- end + -- local parser = hammer.bind(hammer.many1(hammer.ch_range("0", "9")), continuation, "a") + -- it("parses a ", function() + -- local ret = parser:parse() + -- assert.are.same(ret.ast., ) + -- end) + -- it("does not parse a ", function() + -- local ret = parser:parse() + 
-- assert.is_falsy(ret) + -- end) + -- end) +end) diff --git a/src/bitreader.c b/src/bitreader.c index fe21e439ec778aa39b3cbeb18c0b3ba4fbe337fd..0f0825b87c60697f4bd8aff727a3ffe4ecc19532 100644 --- a/src/bitreader.c +++ b/src/bitreader.c @@ -108,3 +108,77 @@ int64_t h_read_bits(HInputStream* state, int count, char signed_p) { out <<= final_shift; return (out ^ msb) - msb; // perform sign extension } + +void h_skip_bits(HInputStream* stream, size_t count) { + size_t left; + + if (count == 0) + return; + + if (stream->overrun) + return; + + if (stream->index == stream->length) { + stream->overrun = true; + return; + } + + // consume from a partial byte? + left = 8 - stream->bit_offset - stream->margin; + if (count < left) { + stream->bit_offset += count; + return; + } + if (left < 8) { + stream->index += 1; + stream->bit_offset = 0; + stream->margin = 0; + count -= left; + } + assert(stream->bit_offset == 0); + assert(stream->margin == 0); + + // consume full bytes + left = stream->length - stream->index; + if (count / 8 <= left) { + stream->index += count / 8; + count = count % 8; + } else { + stream->index = stream->length; + stream->overrun = true; + return; + } + assert(count < 8); + + // final partial byte + if (count > 0 && stream->index == stream->length) + stream->overrun = true; + else + stream->bit_offset = count; +} + +void h_seek_bits(HInputStream* stream, size_t pos) { + size_t pos_index = pos / 8; + size_t pos_offset = pos % 8; + + /* seek within the current byte? */ + if (pos_index == stream->index) { + stream->bit_offset = pos_offset; + return; + } + + stream->margin = 0; + + /* seek past the end? 
*/ + if ((pos_index > stream->length) || + (pos_index == stream->length && pos_offset > 0)) { + stream->index = stream->length; + stream->bit_offset = 0; + stream->overrun = true; + return; + } + + stream->index = pos_index; + stream->bit_offset = pos_offset; + stream->margin = 0; +} diff --git a/src/glue.c b/src/glue.c index 58fe4175d4fd326b62c76449449a74768605ca9e..37962e849283951972ed60094345bec62b57434f 100644 --- a/src/glue.c +++ b/src/glue.c @@ -106,7 +106,7 @@ HParsedToken *h_make_seqn(HArena *arena, size_t n) return ret; } -HParsedToken *h_make_bytes(HArena *arena, uint8_t *array, size_t len) +HParsedToken *h_make_bytes(HArena *arena, const uint8_t *array, size_t len) { HParsedToken *ret = h_make_(arena, TT_BYTES); ret->bytes.len = len; diff --git a/src/glue.h b/src/glue.h index 0bbfe9cfa26ec1bb6376ff23aa3b2d6cc3b4e873..31597cd21c829d362e0a66c52a39dfc95b2a3a96 100644 --- a/src/glue.h +++ b/src/glue.h @@ -195,7 +195,7 @@ HParsedToken *h_act_ignore(const HParseResult *p, void* user_data); HParsedToken *h_make(HArena *arena, HTokenType type, void *value); HParsedToken *h_make_seq(HArena *arena); // Makes empty sequence. HParsedToken *h_make_seqn(HArena *arena, size_t n); // Makes empty sequence of expected size n. -HParsedToken *h_make_bytes(HArena *arena, uint8_t *array, size_t len); +HParsedToken *h_make_bytes(HArena *arena, const uint8_t *array, size_t len); HParsedToken *h_make_sint(HArena *arena, int64_t val); HParsedToken *h_make_uint(HArena *arena, uint64_t val); diff --git a/src/hammer.h b/src/hammer.h index ad44fee910fcf42445e57e47ec8c1fe2d18d3724..25141e081f8502ead70c6ff797157cf3cf1cafa6 100644 --- a/src/hammer.h +++ b/src/hammer.h @@ -462,6 +462,15 @@ HAMMER_FN_DECL_NOARG(HParser*, h_nothing_p); */ HAMMER_FN_DECL_VARARGS_ATTR(H_GCC_ATTRIBUTE((sentinel)), HParser*, h_sequence, HParser* p); +/** + * Given an `h_sequence` and a list of indices, returns a parser that parses the sequence + * but returns it without the results at the dropped indices. 
If a negative integer appears + * in the middle of the list, this combinator will silently ignore the rest of the list. + * + * Result token type: TT_SEQUENCE + */ +#define h_drop_from(p, ...) h_drop_from_(p, __VA_ARGS__, -1) +HAMMER_FN_DECL_VARARGS(HParser*, h_drop_from_, HParser* p); /** * Given an array of parsers, p_array, apply each parser in order. The * first parser to succeed is the result; if no parsers succeed, the @@ -716,6 +725,32 @@ HAMMER_FN_DECL(HParser*, h_get_value, const char* name); */ HAMMER_FN_DECL(HParser*, h_bind, const HParser *p, HContinuation k, void *env); +/** + * This parser skips 'n' bits of input. + * + * Result: None. The HParseResult exists but its AST is NULL. + */ +HAMMER_FN_DECL(HParser*, h_skip, size_t n); + +/** + * The HParser equivalent of fseek(), 'h_seek' modifies the parser's input + * position. Note that contrary to 'fseek', offsets are in bits, not bytes. + * The 'whence' argument uses the same values and semantics: SEEK_SET, + * SEEK_CUR, SEEK_END. + * + * Fails if the new input position would be negative or past the end of input. + * + * Result: TT_UINT. The new input position. + */ +HAMMER_FN_DECL(HParser*, h_seek, ssize_t offset, int whence); + +/** + * Report the current position in bits. Consumes no input. + * + * Result: TT_UINT. The current input position. + */ +HAMMER_FN_DECL_NOARG(HParser*, h_tell); + /** * Free the memory allocated to an HParseResult when it is no longer needed. */ @@ -728,10 +763,22 @@ HAMMER_FN_DECL(void, h_parse_result_free, HParseResult *result); */ char* h_write_result_unamb(const HParsedToken* tok); /** - * Format token to the given output stream. Indent starting at - * [indent] spaces, with [delta] spaces between levels. + * Format token to the given output stream. Indent starting at [indent] spaces, + * with [delta] spaces between levels. + * + * Note: This function does not print a trailing newline. It also does not + * print any spaces to indent the initial line of output. 
This makes it + * suitable for recursive use in the condensed output of larger structures. */ void h_pprint(FILE* stream, const HParsedToken* tok, int indent, int delta); +/** + * Format token to the given output. Print a trailing newline. + * + * This function assumes an initial indentation of 0 and uses 2 spaces between + * indentation levels. It is equivalent to 'h_pprint(stream, tok, 0, 2)' + * followed by 'fputc('\n', stream)' and is provided for convenience. + */ +void h_pprintln(FILE* stream, const HParsedToken* tok); /** * Build parse tables for the given parser backend. See the @@ -795,7 +842,8 @@ HTokenType h_allocate_token_type(const char* name); /// Allocate a new token type with an unambiguous print function. HTokenType h_allocate_token_new( const char* name, - void (*unamb_sub)(const HParsedToken *tok, struct result_buf *buf)); + void (*unamb_sub)(const HParsedToken *tok, struct result_buf *buf), + void (*pprint)(FILE* stream, const HParsedToken* tok, int indent, int delta)); /// Get the token type associated with name. Returns -1 if name is unkown HTokenType h_get_token_type_number(const char* name); diff --git a/src/internal.h b/src/internal.h index 0e92e99e6facf5d04c6b13ca8de51272ba630a1d..324fcbafc5ef7601fac70ceaea04894b8d46010d 100644 --- a/src/internal.h +++ b/src/internal.h @@ -327,9 +327,16 @@ extern HParserBackendVTable h__glr_backend_vtable; // TODO(thequux): Set symbol visibility for these functions so that they aren't exported. 
int64_t h_read_bits(HInputStream* state, int count, char signed_p); +void h_skip_bits(HInputStream* state, size_t count); +void h_seek_bits(HInputStream* state, size_t pos); static inline size_t h_input_stream_pos(HInputStream* state) { + assert(state->index < SIZE_MAX / 8); return state->index * 8 + state->bit_offset + state->margin; } +static inline size_t h_input_stream_length(HInputStream *state) { + assert(state->length <= SIZE_MAX / 8); + return state->length * 8; +} // need to decide if we want to make this public. HParseResult* h_do_parse(const HParser* parser, HParseState *state); void put_cached(HParseState *ps, const HParser *p, HParseResult *cached); @@ -428,6 +435,7 @@ typedef struct HTTEntry_ { const char* name; HTokenType value; void (*unamb_sub)(const HParsedToken *tok, struct result_buf *buf); + void (*pprint)(FILE* stream, const HParsedToken* tok, int indent, int delta); } HTTEntry; const HTTEntry* h_get_token_type_entry(HTokenType token_type); diff --git a/src/parsers/bits.c b/src/parsers/bits.c index be8f13f10a65f67e50d134c5f3557a1a7a209d62..288e3e95a45a7709e942d9987cda37c7e55819b5 100644 --- a/src/parsers/bits.c +++ b/src/parsers/bits.c @@ -29,7 +29,7 @@ static HParsedToken *reshape_bits(const HParseResult *p, void* signedp_p) { HParsedToken *ret = h_arena_malloc(p->arena, sizeof(HParsedToken)); ret->token_type = TT_UINT; - if(signedp && (seq->elements[0]->uint & 128)) + if(signedp && seq->used > 0 && (seq->elements[0]->uint & 128)) ret->uint = -1; // all ones for(size_t i=0; i<seq->used; i++) { diff --git a/src/parsers/choice.c b/src/parsers/choice.c index 90c3662b515babe4a69b0e24dc146ebe1d0a647d..69e4aee778977243594f0ffc124cb3931f4a8d03 100644 --- a/src/parsers/choice.c +++ b/src/parsers/choice.c @@ -164,5 +164,6 @@ HParser* h_choice__ma(HAllocator* mm__, void *args[]) { ret->vtable = &choice_vt; ret->env = (void*)s; ret->backend = PB_MIN; + ret->desugared = NULL; return ret; } diff --git a/src/parsers/many.c b/src/parsers/many.c index 
071e3fcd2d30ed35f4622962751ebc63bea3d37c..77b9dd8be220d92eac36b18ddbcd2fe263945448 100644 --- a/src/parsers/many.c +++ b/src/parsers/many.c @@ -92,22 +92,17 @@ static void desugar_many(HAllocator *mm__, HCFStack *stk__, void *env) { // TODO: refactor this. HRepeat *repeat = (HRepeat*)env; if (!repeat->min_p) { - assert(!"Unreachable"); + // count is an exact count. + assert(repeat->sep == NULL); HCFS_BEGIN_CHOICE() { HCFS_BEGIN_SEQ() { - for (size_t i = 0; i < repeat->count; i++) { - if (i != 0 && repeat->sep != NULL) - HCFS_DESUGAR(repeat->sep); // Should be ignored. + for (size_t i = 0; i < repeat->count; i++) HCFS_DESUGAR(repeat->p); - } } HCFS_END_SEQ(); } HCFS_END_CHOICE(); return; } - if(repeat->count > 1) { - assert_message(0, "'h_repeat_n' is not context-free, can't be desugared"); - return; - } + assert(repeat->count <= 1); /* many(A) => Ma -> A Mar diff --git a/src/parsers/permutation.c b/src/parsers/permutation.c index b16758413eeafe2ce2ae91db2ebbe7593681d3cd..ec256c4af1f76292847102d0a07eca5cb19e5bae 100644 --- a/src/parsers/permutation.c +++ b/src/parsers/permutation.c @@ -176,5 +176,6 @@ HParser* h_permutation__ma(HAllocator* mm__, void *args[]) { ret->vtable = &permutation_vt; ret->env = (void*)s; ret->backend = PB_MIN; + ret->desugared = NULL; return ret; } diff --git a/src/parsers/seek.c b/src/parsers/seek.c new file mode 100644 index 0000000000000000000000000000000000000000..027098b59424a2f78c9b54a0683e66111c02863f --- /dev/null +++ b/src/parsers/seek.c @@ -0,0 +1,124 @@ +#include "parser_internal.h" + +typedef struct { + ssize_t offset; + int whence; +} HSeek; + +static HParseResult *parse_skip(void *env, HParseState *state) +{ + size_t n = (uintptr_t)env; + + h_skip_bits(&state->input_stream, n); + return make_result(state->arena, NULL); +} + +/* + * Parse action for h_seek. Moves the input position to 'offset' bits from + * the base selected by 'whence' (start, current position or end of input). + * Returns a TT_UINT token holding the new position, or NULL if 'whence' is + * invalid, the target position would wrap around, or the seek overruns. + */ +static HParseResult *parse_seek(void *env, HParseState *state) +{ + HSeek *s = (HSeek *)env; + HInputStream *stream = &state->input_stream; + size_t pos; + + /* determine base position */ + switch 
(s->whence) { + case SEEK_SET: + pos = 0; + break; + case SEEK_END: + pos = h_input_stream_length(stream); + break; + case SEEK_CUR: + pos = h_input_stream_pos(stream); + break; + default: + return NULL; /* invalid argument */ + } + + /* calculate target position and do basic overflow checks; offset + 1 is negated rather than offset so that the most negative ssize_t cannot overflow */ + if (s->offset < 0 && (size_t)(-(s->offset + 1)) >= pos) + return NULL; /* underflow */ + if (s->offset > 0 && SIZE_MAX - s->offset < pos) + return NULL; /* overflow */ + pos += s->offset; + + /* perform the seek and check for overrun */ + h_seek_bits(stream, pos); + if (stream->overrun) + return NULL; + + HParsedToken *tok = a_new(HParsedToken, 1); + tok->token_type = TT_UINT; + tok->uint = pos; + return make_result(state->arena, tok); +} + +static HParseResult *parse_tell(void *env, HParseState *state) +{ + HParsedToken *tok = a_new(HParsedToken, 1); + tok->token_type = TT_UINT; + tok->uint = h_input_stream_pos(&state->input_stream); + return make_result(state->arena, tok); +} + +static const HParserVtable skip_vt = { + .parse = parse_skip, + .isValidRegular = h_false, + .isValidCF = h_false, + .compile_to_rvm = h_not_regular, + .higher = false, +}; + +static const HParserVtable seek_vt = { + .parse = parse_seek, + .isValidRegular = h_false, + .isValidCF = h_false, + .compile_to_rvm = h_not_regular, + .higher = false, +}; + +static const HParserVtable tell_vt = { + .parse = parse_tell, + .isValidRegular = h_false, + .isValidCF = h_false, + .compile_to_rvm = h_not_regular, + .higher = false, +}; + +HParser* h_skip(size_t n) +{ + return h_skip__m(&system_allocator, n); +} + +HParser *h_skip__m(HAllocator* mm__, size_t n) +{ + return h_new_parser(mm__, &skip_vt, (void *)n); +} + +HParser* h_seek(ssize_t offset, int whence) +{ + return h_seek__m(&system_allocator, offset, whence); +} + +HParser *h_seek__m(HAllocator* mm__, ssize_t offset, int whence) +{ + HSeek *env = h_new(HSeek, 1); + env->offset = offset; + env->whence = whence; + return h_new_parser(mm__, &seek_vt, env); +} + 
+HParser *h_tell() +{ + return h_tell__m(&system_allocator); +} + +HParser *h_tell__m(HAllocator* mm__) +{ + return h_new_parser(mm__, &tell_vt, NULL); +} diff --git a/src/parsers/sequence.c b/src/parsers/sequence.c index 55c0c8885573ef7779714efd49eaf64cc59ac878..786ba62e43683f32ca0cc244bc0695cdb04a76fd 100644 --- a/src/parsers/sequence.c +++ b/src/parsers/sequence.c @@ -171,5 +171,85 @@ HParser* h_sequence__ma(HAllocator* mm__, void *args[]) { ret->vtable = &sequence_vt; ret->env = (void*)s; ret->backend = PB_MIN; + ret->desugared = NULL; return ret; } + +HParser* h_drop_from_(HParser* p, ...) { + assert_message(p->vtable == &sequence_vt, "drop_from requires a sequence parser"); + va_list ap; + va_start(ap, p); + HParser* ret = h_drop_from___mv(&system_allocator, p, ap); + va_end(ap); + return ret; +} + +HParser* h_drop_from___m(HAllocator* mm__, HParser* p, ...) { + assert_message(p->vtable == &sequence_vt, "drop_from requires a sequence parser"); + va_list ap; + va_start(ap, p); + HParser* ret = h_drop_from___mv(mm__, p, ap); + va_end(ap); + return ret; +} + +HParser* h_drop_from___v(HParser* p, va_list ap) { + assert_message(p->vtable == &sequence_vt, "drop_from requires a sequence parser"); + return h_drop_from___mv(&system_allocator, p, ap); +} + +HParser* h_drop_from___mv(HAllocator* mm__, HParser *p, va_list ap) { + /* Ok, here's where things get funny. + * + * Saying `h_drop_from(h_sequence(a, b, c, d, e, NULL), 0, 4, -1)` is functionally + * equivalent to `h_sequence(h_ignore(a), b, c, d, h_ignore(e), NULL)`. Thus, this + * term rewrites itself, becoming an h_sequence where some parsers are ignored. 
+ * + * The index list is read from ap up to the first negative value. Indices + * may come in any order; out-of-range and repeated indices are skipped. + * ap belongs to the caller, which is responsible for calling va_end(). + */ + assert_message(p->vtable == &sequence_vt, "drop_from requires a sequence parser"); + HSequence *s = (HSequence*)(p->env); + + /* start from a plain copy of the original sequence */ + HSequence *rewrite = h_new(HSequence, 1); + rewrite->p_array = h_new(HParser *, s->len); + rewrite->len = s->len; + for (size_t i=0; i<s->len; ++i) + rewrite->p_array[i] = s->p_array[i]; + + /* then wrap each listed position in h_ignore, at most once */ + for (int arg = va_arg(ap, int); arg >= 0; arg = va_arg(ap, int)) { + size_t i = (size_t)arg; + if (i < s->len && rewrite->p_array[i] == s->p_array[i]) + rewrite->p_array[i] = h_ignore(s->p_array[i]); + } + + return h_new_parser(mm__, &sequence_vt, rewrite); +} + +HParser* h_drop_from___a(void *args[]) { + return h_drop_from___ma(&system_allocator, args); +} + +HParser* h_drop_from___ma(HAllocator* mm__, void *args[]) { + /* args[0] is the sequence parser, args[1] an int array of indices to + * drop, terminated by a negative value. */ + HParser *p = (HParser*)(args[0]); + assert_message(p->vtable == &sequence_vt, "drop_from requires a sequence parser"); + HSequence *s = (HSequence*)(p->env); + HSequence *rewrite = h_new(HSequence, 1); + rewrite->p_array = h_new(HParser *, s->len); + rewrite->len = s->len; + for (size_t i=0; i<s->len; ++i) + rewrite->p_array[i] = s->p_array[i]; + + for (int *argp = (int*)(args[1]); *argp >= 0; ++argp) { + size_t i = (size_t)*argp; + if (i < s->len && rewrite->p_array[i] == s->p_array[i]) + rewrite->p_array[i] = h_ignore(s->p_array[i]); + } + + return h_new_parser(mm__, &sequence_vt, rewrite); +} diff --git a/src/platform_bsdlike.c b/src/platform_bsdlike.c index 2ccf874264a740e0784e8fba14e2ae78a337fa08..ffe1e64db4d1c0e2589160a40468c408f12a3fa6 100644 --- a/src/platform_bsdlike.c +++ b/src/platform_bsdlike.c @@ -1,4 +1,8 @@ +#ifdef __OpenBSD__ +#define _BSD_SOURCE // to obtain asprintf/vasprintf +#else #define _GNU_SOURCE // to obtain asprintf/vasprintf +#endif #include "platform.h" #include <stdio.h> diff --git a/src/pprint.c b/src/pprint.c index 52f42eb6060230a8bb608b8e5ab1eafb6ef1467c..145bf5237ae98e7db240aa1540bf8b242801edd1 100644 --- a/src/pprint.c +++ b/src/pprint.c @@ -30,55 +30,74 @@ typedef struct pp_state { int at_bol; } pp_state_t; +static void pprint_bytes(FILE *stream, const uint8_t *bs, 
size_t len) +{ + fprintf(stream, "\""); + for (size_t i = 0; i < len; i++) { + uint8_t c = bs[i]; + if (c == '"' || c == '\\') + fprintf(stream, "\\%c", c); + else if (c >= 0x20 && c <= 0x7e) + fputc(c, stream); + else + fprintf(stream, "\\u00%02hhx", c); + } + fprintf(stream, "\""); +} + void h_pprint(FILE* stream, const HParsedToken* tok, int indent, int delta) { + if (tok == NULL) { + fprintf(stream, "(null)"); + return; + } switch (tok->token_type) { case TT_NONE: - fprintf(stream, "%*snull\n", indent, ""); + fprintf(stream, "null"); break; case TT_BYTES: - if (tok->bytes.len == 0) - fprintf(stream, "%*s<>\n", indent, ""); - else { - fprintf(stream, "%*s", indent, ""); - for (size_t i = 0; i < tok->bytes.len; i++) { - fprintf(stream, - "%c%02hhx", - (i == 0) ? '<' : '.', - tok->bytes.token[i]); - } - fprintf(stream, ">\n"); - } + pprint_bytes(stream, tok->bytes.token, tok->bytes.len); break; case TT_SINT: - if (tok->sint < 0) - fprintf(stream, "%*ss -%#" PRIx64 "\n", indent, "", -tok->sint); - else - fprintf(stream, "%*ss %#" PRIx64 "\n", indent, "", tok->sint); - + fprintf(stream, "%" PRId64, tok->sint); break; case TT_UINT: - fprintf(stream, "%*su %#" PRIx64 "\n", indent, "", tok->uint); + fprintf(stream, "%" PRIu64, tok->uint); break; - case TT_SEQUENCE: { - fprintf(stream, "%*s[\n", indent, ""); - for (size_t i = 0; i < tok->seq->used; i++) { - h_pprint(stream, tok->seq->elements[i], indent + delta, delta); + case TT_SEQUENCE: + if (tok->seq->used == 0) + fprintf(stream, "[ ]"); + else { + fprintf(stream, "[%*s", delta - 1, ""); + for (size_t i = 0; i < tok->seq->used; i++) { + if (i > 0) fprintf(stream, "\n%*s,%*s", indent, "", delta - 1, ""); + h_pprint(stream, tok->seq->elements[i], indent + delta, delta); + } + if (tok->seq->used > 2) + fprintf(stream, "\n%*s]", indent, ""); + else + fprintf(stream, " ]"); } - fprintf(stream, "%*s]\n", indent, ""); - } - break; - case TT_USER: - fprintf(stream, "%*sUSER:%s\n", indent, "", 
h_get_token_type_name(tok->token_type)); break; default: - if(tok->token_type > TT_USER) { - fprintf(stream, "%*sUSER:%s %d\n", indent, "", h_get_token_type_name(tok->token_type), tok->token_type-TT_USER); - } else { - assert_message(0, "Should not reach here."); + assert_message(tok->token_type >= TT_USER, "h_pprint: unhandled token type"); + { + const HTTEntry *e = h_get_token_type_entry(tok->token_type); + fprintf(stream, "{ \"TT\":%d, \"N\":", (int)e->value); + pprint_bytes(stream, (uint8_t *)e->name, strlen(e->name)); + if (e->pprint != NULL) { + fprintf(stream, ", \"V\":"); + e->pprint(stream, tok, indent + delta, delta); + } + fprintf(stream, " }"); } } } +void h_pprintln(FILE* stream, const HParsedToken* tok) { + h_pprint(stream, tok, 0, 2); + fputc('\n', stream); +} + struct result_buf { char* output; @@ -202,6 +221,3 @@ char* h_write_result_unamb(const HParsedToken* tok) { h_append_buf_c(&buf, 0); return buf.output; } - - - diff --git a/src/registry.c b/src/registry.c index 00486db46ca6c1fdece03a051242f4f05ad23514..5486fd7bdb8022c65a296205b0dfd562a20a0572 100644 --- a/src/registry.c +++ b/src/registry.c @@ -54,12 +54,14 @@ static void default_unamb_sub(const HParsedToken* tok, HTokenType h_allocate_token_new( const char* name, - void (*unamb_sub)(const HParsedToken *tok, struct result_buf *buf)) { + void (*unamb_sub)(const HParsedToken *tok, struct result_buf *buf), + void (*pprint)(FILE* stream, const HParsedToken* tok, int indent, int delta)) { HTTEntry* new_entry = h_alloc(&system_allocator, sizeof(*new_entry)); assert(new_entry != NULL); new_entry->name = name; new_entry->value = 0; - new_entry->unamb_sub = unamb_sub; + new_entry->unamb_sub = unamb_sub ? unamb_sub : default_unamb_sub; + new_entry->pprint = pprint; HTTEntry* probe = *(HTTEntry**)tsearch(new_entry, &tt_registry, compare_entries); if (probe->value != 0) { // Token type already exists... 
@@ -86,7 +88,7 @@ HTokenType h_allocate_token_new( } } HTokenType h_allocate_token_type(const char* name) { - return h_allocate_token_new(name, default_unamb_sub); + return h_allocate_token_new(name, NULL, NULL); } HTokenType h_get_token_type_number(const char* name) { HTTEntry e; diff --git a/src/system_allocator.c b/src/system_allocator.c index 39a1a7e77040c865f2d4f99977eb264391286bb4..f6e9cdcbbe74fedea568a73ada868e93d83c0660 100644 --- a/src/system_allocator.c +++ b/src/system_allocator.c @@ -59,6 +59,8 @@ static void* system_realloc(HAllocator *allocator, void* uptr, size_t size) { if (!uptr) { return system_alloc(allocator, size); } + // XXX this is incorrect if size == 0 and BLOCK_HEADER_SIZE != 0; it fails + // to behave like free(3) void* block = realloc(block_for_user_ptr(uptr), block_size(size)); if (!block) { return NULL; @@ -66,6 +68,7 @@ static void* system_realloc(HAllocator *allocator, void* uptr, size_t size) { uptr = user_ptr(block); #ifdef DEBUG__MEMFILL + // XXX this is the wrong block; this is reading uninitialized memory size_t old_size = ((HDebugBlockHeader*)block)->size; if (size > old_size) memset((char*)uptr+old_size, DEBUG__MEMFILL, size - old_size); diff --git a/src/t_bitwriter.c b/src/t_bitwriter.c index 6b9b7051fa480b47e9cf173e29d865bdbc4a8943..0d2a8c0fde798d90ab60a9663c34c6744be11697 100644 --- a/src/t_bitwriter.c +++ b/src/t_bitwriter.c @@ -32,6 +32,8 @@ void run_bitwriter_test(bitwriter_test_elem data[], char flags) { for (i = 0; data[i].nbits; i++) { g_check_cmp_uint64((uint64_t)h_read_bits(&input, data[i].nbits, FALSE), ==, data[i].data); } + + h_bit_writer_free(w); } static void test_bitwriter_ints(void) { diff --git a/src/t_grammar.c b/src/t_grammar.c index 0287b2fe6eda00a1d6575e619161d18ca9f20639..65812ea7dbd346f0833183a6d51b13c923126ad5 100644 --- a/src/t_grammar.c +++ b/src/t_grammar.c @@ -12,6 +12,8 @@ static void test_end(void) { g_check_hashtable_size(g->geneps, 0); g_check_derives_epsilon_not(g, p); + + 
h_cfgrammar_free(g); } static void test_example_1(void) { @@ -35,6 +37,8 @@ static void test_example_1(void) { g_check_followset_absent(1, g, c, "$"); g_check_followset_absent(1, g, c, "x"); g_check_followset_present(1, g, c, "y"); + + h_cfgrammar_free(g); } void register_grammar_tests(void) { diff --git a/src/t_parser.c b/src/t_parser.c index 331d2629018b40717bf49309ba0b561ce7a618a3..cb67901ed9227787d5580079112c410df000dd94 100644 --- a/src/t_parser.c +++ b/src/t_parser.c @@ -27,6 +27,44 @@ static void test_ch_range(gconstpointer backend) { g_check_parse_failed(range_, (HParserBackend)GPOINTER_TO_INT(backend), "d", 1); } +static void test_bits0(gconstpointer backend) { + const HParser *bits0_; + + bits0_ = h_bits(0, false); + g_check_parse_match(bits0_, (HParserBackend)GPOINTER_TO_INT(backend), "", 0, "u0"); + bits0_ = h_bits(0, true); + g_check_parse_match(bits0_, (HParserBackend)GPOINTER_TO_INT(backend), "", 0, "s0"); + + bits0_ = h_sequence(h_bits(0, false), h_ch('a'), NULL); + g_check_parse_match(bits0_, (HParserBackend)GPOINTER_TO_INT(backend), "a", 1, "(u0 u0x61)"); + bits0_ = h_sequence(h_bits(0, true), h_ch('a'), NULL); + g_check_parse_match(bits0_, (HParserBackend)GPOINTER_TO_INT(backend), "a", 1, "(s0 u0x61)"); +} + +static void test_bits(gconstpointer backend) { + const HParser *bits_; + + bits_ = h_bits(3, false); + g_check_parse_match(bits_, (HParserBackend)GPOINTER_TO_INT(backend), "\0", 1, "u0"); + g_check_parse_match(bits_, (HParserBackend)GPOINTER_TO_INT(backend), "\xff", 1, "u0x7"); + g_check_parse_failed(bits_, (HParserBackend)GPOINTER_TO_INT(backend), "", 0); + + bits_ = h_bits(3, true); + g_check_parse_match(bits_, (HParserBackend)GPOINTER_TO_INT(backend), "\0", 1, "s0"); + g_check_parse_match(bits_, (HParserBackend)GPOINTER_TO_INT(backend), "\xff", 1, "s-0x1"); + g_check_parse_failed(bits_, (HParserBackend)GPOINTER_TO_INT(backend), "", 0); + + bits_ = h_bits(9, false); + g_check_parse_match(bits_, (HParserBackend)GPOINTER_TO_INT(backend), 
"\0\0", 2, "u0"); + g_check_parse_match(bits_, (HParserBackend)GPOINTER_TO_INT(backend), "\xff\xff", 2, "u0x1ff"); + g_check_parse_failed(bits_, (HParserBackend)GPOINTER_TO_INT(backend), "a", 1); + + bits_ = h_bits(9, true); + g_check_parse_match(bits_, (HParserBackend)GPOINTER_TO_INT(backend), "\0\0", 2, "s0"); + g_check_parse_match(bits_, (HParserBackend)GPOINTER_TO_INT(backend), "\xff\xff", 2, "s-0x1"); + g_check_parse_failed(bits_, (HParserBackend)GPOINTER_TO_INT(backend), "a", 1); +} + //@MARK_START static void test_int64(gconstpointer backend) { const HParser *int64_ = h_int64(); @@ -743,10 +781,89 @@ static void test_bind(gconstpointer backend) { g_check_parse_failed(p, be, "272{", 4); } +static void test_skip(gconstpointer backend) { + HParserBackend be = (HParserBackend)GPOINTER_TO_INT(backend); + const HParser *p, *p_le, *p_be; + + p = h_sequence(h_ch('a'), h_skip(32), h_ch('f'), NULL); + g_check_parse_match(p, be, "abcdef", 6, "(u0x61 u0x66)"); + g_check_parse_failed(p, be, "abcdex", 6); + g_check_parse_failed(p, be, "abc", 3); + + p = h_sequence(h_ch('a'), h_skip(32), h_end_p(), NULL); + g_check_parse_match(p, be, "abcde", 5, "(u0x61)"); + + p = h_sequence(h_ch('a'), h_skip(3), h_ch('\0'), h_skip(5), h_ch('b'), NULL); + g_check_parse_match(p, be, "a\xe0\x1f\x62", 4, "(u0x61 u0 u0x62)"); // big-endian + p_le = h_with_endianness(BYTE_LITTLE_ENDIAN|BIT_LITTLE_ENDIAN, p); + p_be = h_with_endianness(BYTE_LITTLE_ENDIAN|BIT_BIG_ENDIAN, p); + g_check_parse_match(p_be, be, "a\xe0\x1f\x62", 4, "(u0x61 u0 u0x62)"); + g_check_parse_match(p_le, be, "a\x07\xf8\x62", 4, "(u0x61 u0 u0x62)"); + + p = h_sequence(h_ch('a'), h_skip(3), h_ch('\0'), h_skip(5), h_end_p(), NULL); + g_check_parse_match(p, be, "a\xe0\x1f", 3, "(u0x61 u0)"); // big-endian +} + +static void test_tell(gconstpointer backend) { + HParserBackend be = (HParserBackend)GPOINTER_TO_INT(backend); + const HParser *p; + + p = h_sequence(h_ch('a'), h_ch('b'), h_tell(), h_end_p(), NULL); + 
g_check_parse_match(p, be, "ab", 2, "(u0x61 u0x62 u0x10)"); + g_check_parse_failed(p, be, "abc", 1); + g_check_parse_failed(p, be, "a", 1); +} + +static void test_seek(gconstpointer backend) { + HParserBackend be = (HParserBackend)GPOINTER_TO_INT(backend); + const HParser *p; + + p = h_sequence(h_ch('a'), h_seek(40, SEEK_SET), h_ch('f'), NULL); + g_check_parse_match(p, be, "abcdef", 6, "(u0x61 u0x28 u0x66)"); + g_check_parse_failed(p, be, "abcdex", 6); + g_check_parse_failed(p, be, "abc", 3); + + p = h_sequence(h_ch('a'), h_seek(40, SEEK_SET), h_end_p(), NULL); + g_check_parse_match(p, be, "abcde", 5, "(u0x61 u0x28)"); + g_check_parse_failed(p, be, "abcdex", 6); + g_check_parse_failed(p, be, "abc", 3); + + p = h_sequence(h_ch('a'), h_seek(0, SEEK_END), h_end_p(), NULL); + g_check_parse_match(p, be, "abcde", 5, "(u0x61 u0x28)"); + g_check_parse_match(p, be, "abc", 3, "(u0x61 u0x18)"); + + p = h_sequence(h_ch('a'), h_seek(-16, SEEK_END), h_ch('x'), NULL); + g_check_parse_match(p, be, "abcdxy", 6, "(u0x61 u0x20 u0x78)"); + g_check_parse_match(p, be, "abxy", 4, "(u0x61 u0x10 u0x78)"); + g_check_parse_failed(p, be, "abc", 3); + g_check_parse_failed(p, be, "x", 1); + + p = h_sequence(h_ch('a'), h_seek(32, SEEK_CUR), h_ch('f'), NULL); + g_check_parse_match(p, be, "abcdef", 6, "(u0x61 u0x28 u0x66)"); + g_check_parse_failed(p, be, "xbcdef", 6); + g_check_parse_failed(p, be, "abcdex", 6); + g_check_parse_failed(p, be, "abc", 3); +} + +static void test_drop_from(gconstpointer backend) { + HParserBackend be = (HParserBackend)GPOINTER_TO_INT(backend); + HParser *p, *q, *r, *seq; + + seq = h_sequence(h_ch('a'), h_ch('b'), h_ch('c'), h_ch('d'), h_ch('e'), NULL); + p = h_drop_from(seq, 0, 4); + g_check_parse_match(p, be, "abcde", 5, "(u0x62 u0x63 u0x64)"); + //q = h_drop_from(seq, 1, 2, -1); + //g_check_parse_match(q, be, "abcde", 5, "(u0x61 u0x64 u0x65)"); + //r = h_drop_from(seq, 0, 1, 3, 4, -1); + //g_check_parse_match(r, be, "abcde", 5, "(u0x63)"); +} + void 
register_parser_tests(void) { g_test_add_data_func("/core/parser/packrat/token", GINT_TO_POINTER(PB_PACKRAT), test_token); g_test_add_data_func("/core/parser/packrat/ch", GINT_TO_POINTER(PB_PACKRAT), test_ch); g_test_add_data_func("/core/parser/packrat/ch_range", GINT_TO_POINTER(PB_PACKRAT), test_ch_range); + g_test_add_data_func("/core/parser/packrat/bits0", GINT_TO_POINTER(PB_PACKRAT), test_bits0); + g_test_add_data_func("/core/parser/packrat/bits", GINT_TO_POINTER(PB_PACKRAT), test_bits); g_test_add_data_func("/core/parser/packrat/int64", GINT_TO_POINTER(PB_PACKRAT), test_int64); g_test_add_data_func("/core/parser/packrat/int32", GINT_TO_POINTER(PB_PACKRAT), test_int32); g_test_add_data_func("/core/parser/packrat/int16", GINT_TO_POINTER(PB_PACKRAT), test_int16); @@ -795,10 +912,16 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/packrat/bind", GINT_TO_POINTER(PB_PACKRAT), test_bind); g_test_add_data_func("/core/parser/packrat/result_length", GINT_TO_POINTER(PB_PACKRAT), test_result_length); //g_test_add_data_func("/core/parser/packrat/token_position", GINT_TO_POINTER(PB_PACKRAT), test_token_position); + g_test_add_data_func("/core/parser/packrat/skip", GINT_TO_POINTER(PB_PACKRAT), test_skip); + g_test_add_data_func("/core/parser/packrat/seek", GINT_TO_POINTER(PB_PACKRAT), test_seek); + g_test_add_data_func("/core/parser/packrat/tell", GINT_TO_POINTER(PB_PACKRAT), test_tell); + g_test_add_data_func("/core/parser/packrat/drop_from", GINT_TO_POINTER(PB_PACKRAT), test_drop_from); g_test_add_data_func("/core/parser/llk/token", GINT_TO_POINTER(PB_LLk), test_token); g_test_add_data_func("/core/parser/llk/ch", GINT_TO_POINTER(PB_LLk), test_ch); g_test_add_data_func("/core/parser/llk/ch_range", GINT_TO_POINTER(PB_LLk), test_ch_range); + g_test_add_data_func("/core/parser/llk/bits0", GINT_TO_POINTER(PB_LLk), test_bits0); + //g_test_add_data_func("/core/parser/llk/bits", GINT_TO_POINTER(PB_LLk), test_bits); 
g_test_add_data_func("/core/parser/llk/int64", GINT_TO_POINTER(PB_LLk), test_int64); g_test_add_data_func("/core/parser/llk/int32", GINT_TO_POINTER(PB_LLk), test_int32); g_test_add_data_func("/core/parser/llk/int16", GINT_TO_POINTER(PB_LLk), test_int16); @@ -825,6 +948,7 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/llk/choice", GINT_TO_POINTER(PB_LLk), test_choice); g_test_add_data_func("/core/parser/llk/many", GINT_TO_POINTER(PB_LLk), test_many); g_test_add_data_func("/core/parser/llk/many1", GINT_TO_POINTER(PB_LLk), test_many1); + g_test_add_data_func("/core/parser/llk/repeat_n", GINT_TO_POINTER(PB_LLk), test_repeat_n); g_test_add_data_func("/core/parser/llk/optional", GINT_TO_POINTER(PB_LLk), test_optional); g_test_add_data_func("/core/parser/llk/sepBy", GINT_TO_POINTER(PB_LLk), test_sepBy); g_test_add_data_func("/core/parser/llk/sepBy1", GINT_TO_POINTER(PB_LLk), test_sepBy1); @@ -838,10 +962,13 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/llk/iterative", GINT_TO_POINTER(PB_LLk), test_iterative); g_test_add_data_func("/core/parser/llk/iterative/lookahead", GINT_TO_POINTER(PB_LLk), test_iterative_lookahead); g_test_add_data_func("/core/parser/llk/iterative/result_length", GINT_TO_POINTER(PB_LLk), test_iterative_result_length); + g_test_add_data_func("/core/parser/llk/drop_from", GINT_TO_POINTER(PB_LLk), test_drop_from); g_test_add_data_func("/core/parser/regex/token", GINT_TO_POINTER(PB_REGULAR), test_token); g_test_add_data_func("/core/parser/regex/ch", GINT_TO_POINTER(PB_REGULAR), test_ch); g_test_add_data_func("/core/parser/regex/ch_range", GINT_TO_POINTER(PB_REGULAR), test_ch_range); + g_test_add_data_func("/core/parser/regex/bits0", GINT_TO_POINTER(PB_REGULAR), test_bits0); + //g_test_add_data_func("/core/parser/regex/bits", GINT_TO_POINTER(PB_REGULAR), test_bits); g_test_add_data_func("/core/parser/regex/int64", GINT_TO_POINTER(PB_REGULAR), test_int64); g_test_add_data_func("/core/parser/regex/int32", 
GINT_TO_POINTER(PB_REGULAR), test_int32); g_test_add_data_func("/core/parser/regex/int16", GINT_TO_POINTER(PB_REGULAR), test_int16); @@ -877,10 +1004,13 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/regex/ignore", GINT_TO_POINTER(PB_REGULAR), test_ignore); g_test_add_data_func("/core/parser/regex/result_length", GINT_TO_POINTER(PB_REGULAR), test_result_length); g_test_add_data_func("/core/parser/regex/token_position", GINT_TO_POINTER(PB_REGULAR), test_token_position); + g_test_add_data_func("/core/parser/regex/drop_from", GINT_TO_POINTER(PB_REGULAR), test_drop_from); g_test_add_data_func("/core/parser/lalr/token", GINT_TO_POINTER(PB_LALR), test_token); g_test_add_data_func("/core/parser/lalr/ch", GINT_TO_POINTER(PB_LALR), test_ch); g_test_add_data_func("/core/parser/lalr/ch_range", GINT_TO_POINTER(PB_LALR), test_ch_range); + g_test_add_data_func("/core/parser/lalr/bits0", GINT_TO_POINTER(PB_LALR), test_bits0); + //g_test_add_data_func("/core/parser/lalr/bits", GINT_TO_POINTER(PB_LALR), test_bits); g_test_add_data_func("/core/parser/lalr/int64", GINT_TO_POINTER(PB_LALR), test_int64); g_test_add_data_func("/core/parser/lalr/int32", GINT_TO_POINTER(PB_LALR), test_int32); g_test_add_data_func("/core/parser/lalr/int16", GINT_TO_POINTER(PB_LALR), test_int16); @@ -907,6 +1037,7 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/lalr/choice", GINT_TO_POINTER(PB_LALR), test_choice); g_test_add_data_func("/core/parser/lalr/many", GINT_TO_POINTER(PB_LALR), test_many); g_test_add_data_func("/core/parser/lalr/many1", GINT_TO_POINTER(PB_LALR), test_many1); + g_test_add_data_func("/core/parser/lalr/repeat_n", GINT_TO_POINTER(PB_LALR), test_repeat_n); g_test_add_data_func("/core/parser/lalr/optional", GINT_TO_POINTER(PB_LALR), test_optional); g_test_add_data_func("/core/parser/lalr/sepBy", GINT_TO_POINTER(PB_LALR), test_sepBy); g_test_add_data_func("/core/parser/lalr/sepBy1", GINT_TO_POINTER(PB_LALR), test_sepBy1); @@ -921,10 
+1052,13 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/lalr/iterative", GINT_TO_POINTER(PB_LALR), test_iterative); g_test_add_data_func("/core/parser/lalr/iterative/lookahead", GINT_TO_POINTER(PB_LALR), test_iterative_lookahead); g_test_add_data_func("/core/parser/lalr/iterative/result_length", GINT_TO_POINTER(PB_LALR), test_iterative_result_length); + g_test_add_data_func("/core/parser/lalr/drop_from", GINT_TO_POINTER(PB_LALR), test_drop_from); g_test_add_data_func("/core/parser/glr/token", GINT_TO_POINTER(PB_GLR), test_token); g_test_add_data_func("/core/parser/glr/ch", GINT_TO_POINTER(PB_GLR), test_ch); g_test_add_data_func("/core/parser/glr/ch_range", GINT_TO_POINTER(PB_GLR), test_ch_range); + g_test_add_data_func("/core/parser/glr/bits0", GINT_TO_POINTER(PB_GLR), test_bits0); + //g_test_add_data_func("/core/parser/glr/bits", GINT_TO_POINTER(PB_GLR), test_bits); g_test_add_data_func("/core/parser/glr/int64", GINT_TO_POINTER(PB_GLR), test_int64); g_test_add_data_func("/core/parser/glr/int32", GINT_TO_POINTER(PB_GLR), test_int32); g_test_add_data_func("/core/parser/glr/int16", GINT_TO_POINTER(PB_GLR), test_int16); @@ -951,6 +1085,7 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/glr/choice", GINT_TO_POINTER(PB_GLR), test_choice); g_test_add_data_func("/core/parser/glr/many", GINT_TO_POINTER(PB_GLR), test_many); g_test_add_data_func("/core/parser/glr/many1", GINT_TO_POINTER(PB_GLR), test_many1); + g_test_add_data_func("/core/parser/glr/repeat_n", GINT_TO_POINTER(PB_GLR), test_repeat_n); g_test_add_data_func("/core/parser/glr/optional", GINT_TO_POINTER(PB_GLR), test_optional); g_test_add_data_func("/core/parser/glr/sepBy", GINT_TO_POINTER(PB_GLR), test_sepBy); g_test_add_data_func("/core/parser/glr/sepBy1", GINT_TO_POINTER(PB_GLR), test_sepBy1); @@ -963,4 +1098,5 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/glr/ambiguous", GINT_TO_POINTER(PB_GLR), test_ambiguous); 
g_test_add_data_func("/core/parser/glr/result_length", GINT_TO_POINTER(PB_GLR), test_result_length); g_test_add_data_func("/core/parser/glr/token_position", GINT_TO_POINTER(PB_GLR), test_token_position); + g_test_add_data_func("/core/parser/glr/drop_from", GINT_TO_POINTER(PB_GLR), test_drop_from); } diff --git a/src/t_regression.c b/src/t_regression.c index 3dfe5dce12d0d9c9306ede01d5daf3a45ebab488..4e8ad0cdfc9df5c8ba2df32e4faaead50218ee7a 100644 --- a/src/t_regression.c +++ b/src/t_regression.c @@ -193,6 +193,83 @@ static void test_charset_bits(void) { g_check_cmp_uint32(test_charset_bits__buf[32], ==, 0xAB); } + +// Allocator for reproducing error 19. + +// The bug is a result of uninitialized data being used, initially +// assumed to be zero. Unfortunately, this assumption is often true, +// so reproducing the bug reliably and in a minimal fashion requires +// making it false. Fortunately, glibc malloc has an M_PERTURB option +// for making that assumption false. Unfortunately, we want the test +// to reproduce the bug on systems that don't use glibc. Fortunately, +// the standard Hammer system allocator has a DEBUG__MEMFILL option to +// fill uninitialized memory with a fill byte. Unfortunately, you +// have to recompile Hammer with that symbol #defined in order to +// enable it. Fortunately, hammer allows you to supply your own +// allocator. So this is a simple non-#define-dependent allocator +// that writes 0xbabababa† over all the memory it allocates. (But not +// the memory it reallocs, because, as it happens, the uninitialized +// memory in this case didn't come from a realloc.) +// +// Honestly I think we ought to remove the #ifdefs from +// system_allocator and always compile both the DEBUG__MEMFILL version +// and the non-DEBUG__MEMFILL version, merely changing which one is +// system_allocator, which is after all a struct of three pointers +// that can even be modified at run-time. +// +// † Can you hear it, Mr. Toot? 
+ +static void* deadbeefing_malloc(HAllocator *allocator, size_t size) { + char *block = malloc(size); + if (block) memset(block, 0xba, size); + return block; +} + +// Don't deadbeef on realloc because it isn't necessary to reproduce this bug. +static void* deadbeefing_realloc(HAllocator *allocator, void *uptr, size_t size) { + return realloc(uptr, size); +} + +static void deadbeefing_free(HAllocator *allocator, void *uptr) { + free(uptr); +} + +static HAllocator deadbeefing_allocator = { + .alloc = deadbeefing_malloc, + .realloc = deadbeefing_realloc, + .free = deadbeefing_free, +}; + +static void test_bug_19() { + void *args[] = { + h_ch_range__m(&deadbeefing_allocator, '0', '9'), + h_ch_range__m(&deadbeefing_allocator, 'A', 'Z'), + h_ch_range__m(&deadbeefing_allocator, 'a', 'z'), + NULL, + }; + + HParser *parser = h_choice__ma(&deadbeefing_allocator, args); + + // In bug 19 ("GLR backend reaches unreachable code"), this call + // would fail because h_choice__ma allocated an HParser with h_new + // and didn't initialize its ->desugared field; consequently in + // the call chain h_compile ... h_lalr_compile ... h_desugar, + // h_desugar would find that ->desugared was already non-NULL (set + // to 0xbabababa in the above deadbeefing_malloc), and just return + // it, leading to a crash immediately afterwards in collect_nts. + // We don't actually care if the compile succeeds or fails, just + // that it doesn't crash. + h_compile(parser, PB_GLR, NULL); + + // The same bug happened in h_sequence__ma. + h_compile(h_sequence__ma(&deadbeefing_allocator, args), PB_GLR, NULL); + + // It also exists in h_permutation__ma, but it doesn't happen to + // manifest in the same way. I don't know how to write a test for + // the h_permutation__ma case. 
+ g_assert_true(1); +} + void register_regression_tests(void) { g_test_add_func("/core/regression/bug118", test_bug118); g_test_add_func("/core/regression/seq_index_path", test_seq_index_path); @@ -202,4 +279,5 @@ void register_regression_tests(void) { g_test_add_func("/core/regression/lalr_charset_lhs", test_lalr_charset_lhs); g_test_add_func("/core/regression/cfg_many_seq", test_cfg_many_seq); g_test_add_func("/core/regression/charset_bits", test_charset_bits); + g_test_add_func("/core/regression/bug19", test_bug_19); } diff --git a/testing/leak-check.sh b/testing/leak-check.sh new file mode 100755 index 0000000000000000000000000000000000000000..b3f2d250be8618afb6e84b029f785c716ef79653 --- /dev/null +++ b/testing/leak-check.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# +# Script to run valgrind against the test suite for hunting memory leaks +# +# This assumes you run it in the Hammer base directory and have a debug build + +HAMMER_ROOT=. +VARIANT=debug +BUILD_PATH=$HAMMER_ROOT/build/$VARIANT +LD_LIBRARY_PATH=$BUILD_PATH/src:$LD_LIBRARY_PATH +VALGRIND=valgrind +VALGRIND_OPTS="-v --leak-check=full --leak-resolution=high --num-callers=40 --partial-loads-ok=no --show-leak-kinds=all --track-origins=yes --undef-value-errors=yes" +VALGRIND_SUPPRESSIONS="valgrind-glib.supp" + +for s in $VALGRIND_SUPPRESSIONS +do + VALGRIND_OPTS="$VALGRIND_OPTS --suppressions=$HAMMER_ROOT/testing/valgrind/$s" +done + +export LD_LIBRARY_PATH + +$VALGRIND $VALGRIND_OPTS $BUILD_PATH/src/test_suite $@ diff --git a/testing/valgrind/valgrind-glib.supp b/testing/valgrind/valgrind-glib.supp new file mode 100644 index 0000000000000000000000000000000000000000..9b35108f8f606aa7c3c9d8f85ffc6c51bff8f4d0 --- /dev/null +++ b/testing/valgrind/valgrind-glib.supp @@ -0,0 +1,40 @@ +{ + <g_test_add_vtable_supp> + Memcheck:Leak + match-leak-kinds: reachable + ... + fun:g_malloc + ... + fun:g_test_add_vtable + ... +} +{ + <g_test_init_malloc_supp> + Memcheck:Leak + match-leak-kinds: reachable + fun:malloc + ... 
+ fun:g_test_init + ... +} +{ + <g_test_init_calloc_supp> + Memcheck:Leak + match-leak-kinds: reachable + fun:calloc + ... + fun:g_test_init + ... +} +{ + <g_rand_new_with_seed_array_supp> + Memcheck:Leak + match-leak-kinds: reachable + fun:calloc + fun:g_malloc0 + fun:g_rand_new_with_seed_array + ... + fun:g_test_run_suite + fun:g_test_run + ... +}