diff --git a/examples/base64.c b/examples/base64.c index 7fe3cffb7cc125542a949f01b0d63b5616497de0..ddc162c0e4164e23ebef79ea4e3411f5ecf84cab 100644 --- a/examples/base64.c +++ b/examples/base64.c @@ -11,8 +11,6 @@ #include <inttypes.h> #include "../src/hammer.h" -#define DEBUG - const HParser* document = NULL; void init_parser(void) @@ -27,65 +25,17 @@ void init_parser(void) HParser *equals = h_ch('='); HParser *bsfdig = h_choice(alpha, digit, plus, slash, NULL); - HParser *bsfdig_4bit = h_choice( - h_ch('A'), h_ch('E'), h_ch('I'), h_ch('M'), h_ch('Q'), h_ch('U'), - h_ch('Y'), h_ch('c'), h_ch('g'), h_ch('k'), h_ch('o'), h_ch('s'), - h_ch('w'), h_ch('0'), h_ch('4'), h_ch('8'), NULL); - HParser *bsfdig_2bit = h_choice(h_ch('A'), h_ch('Q'), h_ch('g'), h_ch('w'), NULL); - - HParser *base64_quad = h_sequence(bsfdig, bsfdig, bsfdig, bsfdig, NULL); - HParser *base64_quads = h_many(base64_quad); - - HParser *base64_2 = h_sequence(bsfdig, bsfdig, bsfdig_4bit, equals, h_end_p(), NULL); - HParser *base64_1 = h_sequence(bsfdig, bsfdig_2bit, equals, equals, h_end_p(), NULL); - HParser *base64_ending = h_choice(h_end_p(), base64_2, base64_1, NULL); - HParser *base64 = h_sequence(base64_quads, base64_ending, NULL); - // why does this parse "A=="?! - // why does this parse "aaA=" but not "aA=="?! - - document = base64; -} - - -#include <string.h> -#include <assert.h> -#define TRUE (1) -#define FALSE (0) - -void assert_parse(int expected, char *data) { - HParseResult *result; - - size_t datasize = strlen(data); - result = h_parse(document, (void*)data, datasize); - if((result != NULL) != expected) { - fprintf(stderr, "Test failed: %s\n", data); - } -#ifdef DEBUG - else { - fprintf(stderr, "Test succeeded: %s\n", data); - } - - if (result != NULL) { - fprintf(stderr, "parsed=%" PRId64 " bytes\n", result->bit_length / 8); - h_pprint(stdout, result->ast, 0, 0); - } -#endif - - h_parse_result_free(result); -} - -void test() { - assert_parse(TRUE, ""); - assert_parse(TRUE, "YQ=="); - assert_parse(TRUE, "YXU="); - assert_parse(TRUE, "YXVy"); - assert_parse(TRUE, "QVVSIFNBUkFG"); - assert_parse(TRUE, "QVVSIEhFUlUgU0FSQUY="); - assert_parse(FALSE, "A"); - assert_parse(FALSE, "A="); - assert_parse(FALSE, "A=="); - assert_parse(FALSE, "AAA=="); - assert_parse(FALSE, "aa=="); + HParser *bsfdig_4bit = h_in((uint8_t *)"AEIMQUYcgkosw048", 16); + HParser *bsfdig_2bit = h_in((uint8_t *)"AQgw", 4); + HParser *base64_3 = h_repeat_n(bsfdig, 4); + HParser *base64_2 = h_sequence(bsfdig, bsfdig, bsfdig_4bit, equals, NULL); + HParser *base64_1 = h_sequence(bsfdig, bsfdig_2bit, equals, equals, NULL); + HParser *base64 = h_sequence(h_many(base64_3), + h_optional(h_choice(base64_2, + base64_1, NULL)), + NULL); + + document = h_sequence(h_whitespace(base64), h_whitespace(h_end_p()), NULL); } @@ -99,8 +49,6 @@ int main(int argc, char **argv) init_parser(); - test(); - inputsize = fread(input, 1, sizeof(input), stdin); fprintf(stderr, "inputsize=%zu\ninput=", inputsize); fwrite(input, 1, inputsize, stderr); diff --git a/src/SConscript b/src/SConscript index 0c4f81ed3bcce5408e681ec92a7f0a9e677141f3..1a920a72500081b4dc7b3f04c4274169e1826c64 100644 --- a/src/SConscript +++ b/src/SConscript @@ -52,7 +52,8 @@ parsers = ['parsers/%s.c'%s for s in 'unimplemented', 'whitespace', 'xor', - 'value']] + 'value', + 'seek']] backends = ['backends/%s.c' % s for s in ['packrat', 'llk', 'regex', 'glr', 'lalr', 'lr', 'lr0']] diff --git a/src/backends/lr.c b/src/backends/lr.c index f2ac4956d80358e51d35c0e70484013bbfde212a..6919bf6d0a8d284c95167adb07023914a49f89b4 100644 --- a/src/backends/lr.c +++ b/src/backends/lr.c @@ -420,6 +420,12 @@ void h_lr_parse_start(HSuspendedParser *s) s->backend_state = engine; } +// cf. comment before run_trace in regex.c +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunknown-pragmas" +#pragma GCC diagnostic ignored "-Wclobbered" +#endif bool h_lr_parse_chunk(HSuspendedParser* s, HInputStream *stream) { HLREngine *engine = s->backend_state; @@ -457,6 +463,10 @@ bool h_lr_parse_chunk(HSuspendedParser* s, HInputStream *stream) *stream = engine->input; return !run; // done if engine no longer running } +// Reenable -Wclobber +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic pop +#endif HParseResult *h_lr_parse_finish(HSuspendedParser *s) { diff --git a/src/backends/regex.c b/src/backends/regex.c index 9646ddd59343cacbd1cc53645161c88d70c15f78..c10c25890fd5bfdf5e3e9b37a64e988fd3010749 100644 --- a/src/backends/regex.c +++ b/src/backends/regex.c @@ -223,7 +223,7 @@ bool svm_stack_ensure_cap(HAllocator *mm__, HSVMContext *ctx, size_t addl) { * the second return; here, the only variables that could matter for * are arena and ctx (because they're referenced in "goto fail"). */ -#ifdef __GNUC__ +#if defined(__GNUC__) && !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunknown-pragmas" #pragma GCC diagnostic ignored "-Wclobbered" @@ -311,7 +311,7 @@ HParseResult *run_trace(HAllocator *mm__, HRVMProg *orig_prog, HRVMTrace *trace, return NULL; } // Reenable -Wclobber -#ifdef __GNUC__ +#if defined(__GNUC__) && !defined(__clang__) #pragma GCC diagnostic pop #endif diff --git a/src/bitreader.c b/src/bitreader.c index fe21e439ec778aa39b3cbeb18c0b3ba4fbe337fd..0f0825b87c60697f4bd8aff727a3ffe4ecc19532 100644 --- a/src/bitreader.c +++ b/src/bitreader.c @@ -108,3 +108,77 @@ int64_t h_read_bits(HInputStream* state, int count, char signed_p) { out <<= final_shift; return (out ^ msb) - msb; // perform sign extension } + +void h_skip_bits(HInputStream* stream, size_t count) { + size_t left; + + if (count == 0) + return; + + if (stream->overrun) + return; + + if (stream->index == stream->length) { + stream->overrun = true; + return; + } + + // consume from a partial byte? + left = 8 - stream->bit_offset - stream->margin; + if (count < left) { + stream->bit_offset += count; + return; + } + if (left < 8) { + stream->index += 1; + stream->bit_offset = 0; + stream->margin = 0; + count -= left; + } + assert(stream->bit_offset == 0); + assert(stream->margin == 0); + + // consume full bytes + left = stream->length - stream->index; + if (count / 8 <= left) { + stream->index += count / 8; + count = count % 8; + } else { + stream->index = stream->length; + stream->overrun = true; + return; + } + assert(count < 8); + + // final partial byte + if (count > 0 && stream->index == stream->length) + stream->overrun = true; + else + stream->bit_offset = count; +} + +void h_seek_bits(HInputStream* stream, size_t pos) { + size_t pos_index = pos / 8; + size_t pos_offset = pos % 8; + + /* seek within the current byte? */ + if (pos_index == stream->index) { + stream->bit_offset = pos_offset; + return; + } + + stream->margin = 0; + + /* seek past the end? */ + if ((pos_index > stream->length) || + (pos_index == stream->length && pos_offset > 0)) { + stream->index = stream->length; + stream->bit_offset = 0; + stream->overrun = true; + return; + } + + stream->index = pos_index; + stream->bit_offset = pos_offset; + stream->margin = 0; +} diff --git a/src/glue.c b/src/glue.c index 58fe4175d4fd326b62c76449449a74768605ca9e..37962e849283951972ed60094345bec62b57434f 100644 --- a/src/glue.c +++ b/src/glue.c @@ -106,7 +106,7 @@ HParsedToken *h_make_seqn(HArena *arena, size_t n) return ret; } -HParsedToken *h_make_bytes(HArena *arena, uint8_t *array, size_t len) +HParsedToken *h_make_bytes(HArena *arena, const uint8_t *array, size_t len) { HParsedToken *ret = h_make_(arena, TT_BYTES); ret->bytes.len = len; diff --git a/src/glue.h b/src/glue.h index 0bbfe9cfa26ec1bb6376ff23aa3b2d6cc3b4e873..31597cd21c829d362e0a66c52a39dfc95b2a3a96 100644 --- a/src/glue.h +++ b/src/glue.h @@ -195,7 +195,7 @@ HParsedToken *h_act_ignore(const HParseResult *p, void* user_data); HParsedToken *h_make(HArena *arena, HTokenType type, void *value); HParsedToken *h_make_seq(HArena *arena); // Makes empty sequence. HParsedToken *h_make_seqn(HArena *arena, size_t n); // Makes empty sequence of expected size n. -HParsedToken *h_make_bytes(HArena *arena, uint8_t *array, size_t len); +HParsedToken *h_make_bytes(HArena *arena, const uint8_t *array, size_t len); HParsedToken *h_make_sint(HArena *arena, int64_t val); HParsedToken *h_make_uint(HArena *arena, uint64_t val); diff --git a/src/hammer.h b/src/hammer.h index d983f2c5bc231eeda7928ece341f31720668876e..ae2103ef7efcd8b321b39f2aa9778f39455f9ef9 100644 --- a/src/hammer.h +++ b/src/hammer.h @@ -716,6 +716,32 @@ HAMMER_FN_DECL(HParser*, h_get_value, const char* name); */ HAMMER_FN_DECL(HParser*, h_bind, const HParser *p, HContinuation k, void *env); +/** + * This parser skips 'n' bits of input. + * + * Result: None. The HParseResult exists but its AST is NULL. + */ +HAMMER_FN_DECL(HParser*, h_skip, size_t n); + +/** + * The HParser equivalent of fseek(), 'h_seek' modifies the parser's input + * position. Note that contrary to 'fseek', offsets are in bits, not bytes. + * The 'whence' argument uses the same values and semantics: SEEK_SET, + * SEEK_CUR, SEEK_END. + * + * Fails if the new input position would be negative or past the end of input. + * + * Result: TT_UINT. The new input position. + */ +HAMMER_FN_DECL(HParser*, h_seek, ssize_t offset, int whence); + +/** + * Report the current position in bits. Consumes no input. + * + * Result: TT_UINT. The current input position. + */ +HAMMER_FN_DECL_NOARG(HParser*, h_tell); + /** * Free the memory allocated to an HParseResult when it is no longer needed. */ diff --git a/src/internal.h b/src/internal.h index 79d6c9787dac2a444f18c5a456fc1aacb04b8112..324fcbafc5ef7601fac70ceaea04894b8d46010d 100644 --- a/src/internal.h +++ b/src/internal.h @@ -327,9 +327,16 @@ extern HParserBackendVTable h__glr_backend_vtable; // TODO(thequux): Set symbol visibility for these functions so that they aren't exported. int64_t h_read_bits(HInputStream* state, int count, char signed_p); +void h_skip_bits(HInputStream* state, size_t count); +void h_seek_bits(HInputStream* state, size_t pos); static inline size_t h_input_stream_pos(HInputStream* state) { + assert(state->index < SIZE_MAX / 8); return state->index * 8 + state->bit_offset + state->margin; } +static inline size_t h_input_stream_length(HInputStream *state) { + assert(state->length <= SIZE_MAX / 8); + return state->length * 8; +} // need to decide if we want to make this public. HParseResult* h_do_parse(const HParser* parser, HParseState *state); void put_cached(HParseState *ps, const HParser *p, HParseResult *cached); diff --git a/src/parsers/seek.c b/src/parsers/seek.c new file mode 100644 index 0000000000000000000000000000000000000000..027098b59424a2f78c9b54a0683e66111c02863f --- /dev/null +++ b/src/parsers/seek.c @@ -0,0 +1,118 @@ +#include "parser_internal.h" + +typedef struct { + ssize_t offset; + int whence; +} HSeek; + +static HParseResult *parse_skip(void *env, HParseState *state) +{ + size_t n = (uintptr_t)env; + + h_skip_bits(&state->input_stream, n); + return make_result(state->arena, NULL); +} + +static HParseResult *parse_seek(void *env, HParseState *state) +{ + HSeek *s = (HSeek *)env; + HInputStream *stream = &state->input_stream; + size_t pos; + + /* determine base position */ + switch (s->whence) { + case SEEK_SET: + pos = 0; + break; + case SEEK_END: + pos = h_input_stream_length(stream); + break; + case SEEK_CUR: + pos = h_input_stream_pos(stream); + break; + default: + return NULL; /* invalid argument */ + } + + /* calculate target position and do basic overflow checks */ + if (s->offset < 0 && (size_t)(- s->offset) > pos) + return NULL; /* underflow */ + if (s->offset > 0 && SIZE_MAX - s->offset < pos) + return NULL; /* overflow */ + pos += s->offset; + + /* perform the seek and check for overrun */ + h_seek_bits(stream, pos); + if (stream->overrun) + return NULL; + + HParsedToken *tok = a_new(HParsedToken, 1); + tok->token_type = TT_UINT; + tok->uint = pos; + return make_result(state->arena, tok); +} + +static HParseResult *parse_tell(void *env, HParseState *state) +{ + HParsedToken *tok = a_new(HParsedToken, 1); + tok->token_type = TT_UINT; + tok->uint = h_input_stream_pos(&state->input_stream); + return make_result(state->arena, tok); +} + +static const HParserVtable skip_vt = { + .parse = parse_skip, + .isValidRegular = h_false, + .isValidCF = h_false, + .compile_to_rvm = h_not_regular, + .higher = false, +}; + +static const HParserVtable seek_vt = { + .parse = parse_seek, + .isValidRegular = h_false, + .isValidCF = h_false, + .compile_to_rvm = h_not_regular, + .higher = false, +}; + +static const HParserVtable tell_vt = { + .parse = parse_tell, + .isValidRegular = h_false, + .isValidCF = h_false, + .compile_to_rvm = h_not_regular, + .higher = false, +}; + +HParser* h_skip(size_t n) +{ + return h_skip__m(&system_allocator, n); +} + +HParser *h_skip__m(HAllocator* mm__, size_t n) +{ + return h_new_parser(mm__, &skip_vt, (void *)n); +} + +HParser* h_seek(ssize_t offset, int whence) +{ + return h_seek__m(&system_allocator, offset, whence); +} + +HParser *h_seek__m(HAllocator* mm__, ssize_t offset, int whence) +{ + HSeek *env = h_new(HSeek, 1); + env->offset = offset; + env->whence = whence; + return h_new_parser(mm__, &seek_vt, env); +} + +HParser *h_tell() +{ + return h_tell__m(&system_allocator); +} + +HParser *h_tell__m(HAllocator* mm__) +{ + return h_new_parser(mm__, &tell_vt, NULL); +} diff --git a/src/platform_bsdlike.c b/src/platform_bsdlike.c index 2ccf874264a740e0784e8fba14e2ae78a337fa08..ffe1e64db4d1c0e2589160a40468c408f12a3fa6 100644 --- a/src/platform_bsdlike.c +++ b/src/platform_bsdlike.c @@ -1,4 +1,8 @@ +#ifdef __OpenBSD__ +#define _BSD_SOURCE // to obtain asprintf/vasprintf +#else #define _GNU_SOURCE // to obtain asprintf/vasprintf +#endif #include "platform.h" #include <stdio.h> diff --git a/src/t_parser.c b/src/t_parser.c index 331d2629018b40717bf49309ba0b561ce7a618a3..90f62bc5497e2f75b1ed315274a19c6081b327a9 100644 --- a/src/t_parser.c +++ b/src/t_parser.c @@ -743,6 +743,70 @@ static void test_bind(gconstpointer backend) { g_check_parse_failed(p, be, "272{", 4); } +static void test_skip(gconstpointer backend) { + HParserBackend be = (HParserBackend)GPOINTER_TO_INT(backend); + const HParser *p, *p_le, *p_be; + + p = h_sequence(h_ch('a'), h_skip(32), h_ch('f'), NULL); + g_check_parse_match(p, be, "abcdef", 6, "(u0x61 u0x66)"); + g_check_parse_failed(p, be, "abcdex", 6); + g_check_parse_failed(p, be, "abc", 3); + + p = h_sequence(h_ch('a'), h_skip(32), h_end_p(), NULL); + g_check_parse_match(p, be, "abcde", 5, "(u0x61)"); + + p = h_sequence(h_ch('a'), h_skip(3), h_ch('\0'), h_skip(5), h_ch('b'), NULL); + g_check_parse_match(p, be, "a\xe0\x1f\x62", 4, "(u0x61 u0 u0x62)"); // big-endian + p_le = h_with_endianness(BYTE_LITTLE_ENDIAN|BIT_LITTLE_ENDIAN, p); + p_be = h_with_endianness(BYTE_LITTLE_ENDIAN|BIT_BIG_ENDIAN, p); + g_check_parse_match(p_be, be, "a\xe0\x1f\x62", 4, "(u0x61 u0 u0x62)"); + g_check_parse_match(p_le, be, "a\x07\xf8\x62", 4, "(u0x61 u0 u0x62)"); + + p = h_sequence(h_ch('a'), h_skip(3), h_ch('\0'), h_skip(5), h_end_p(), NULL); + g_check_parse_match(p, be, "a\xe0\x1f", 3, "(u0x61 u0)"); // big-endian +} + +static void test_tell(gconstpointer backend) { + HParserBackend be = (HParserBackend)GPOINTER_TO_INT(backend); + const HParser *p; + + p = h_sequence(h_ch('a'), h_ch('b'), h_tell(), h_end_p(), NULL); + g_check_parse_match(p, be, "ab", 2, "(u0x61 u0x62 u0x10)"); + g_check_parse_failed(p, be, "abc", 1); + g_check_parse_failed(p, be, "a", 1); +} + +static void test_seek(gconstpointer backend) { + HParserBackend be = (HParserBackend)GPOINTER_TO_INT(backend); + const HParser *p; + + p = h_sequence(h_ch('a'), h_seek(40, SEEK_SET), h_ch('f'), NULL); + g_check_parse_match(p, be, "abcdef", 6, "(u0x61 u0x28 u0x66)"); + g_check_parse_failed(p, be, "abcdex", 6); + g_check_parse_failed(p, be, "abc", 3); + + p = h_sequence(h_ch('a'), h_seek(40, SEEK_SET), h_end_p(), NULL); + g_check_parse_match(p, be, "abcde", 5, "(u0x61 u0x28)"); + g_check_parse_failed(p, be, "abcdex", 6); + g_check_parse_failed(p, be, "abc", 3); + + p = h_sequence(h_ch('a'), h_seek(0, SEEK_END), h_end_p(), NULL); + g_check_parse_match(p, be, "abcde", 5, "(u0x61 u0x28)"); + g_check_parse_match(p, be, "abc", 3, "(u0x61 u0x18)"); + + p = h_sequence(h_ch('a'), h_seek(-16, SEEK_END), h_ch('x'), NULL); + g_check_parse_match(p, be, "abcdxy", 6, "(u0x61 u0x20 u0x78)"); + g_check_parse_match(p, be, "abxy", 4, "(u0x61 u0x10 u0x78)"); + g_check_parse_failed(p, be, "abc", 3); + g_check_parse_failed(p, be, "x", 1); + + p = h_sequence(h_ch('a'), h_seek(32, SEEK_CUR), h_ch('f'), NULL); + g_check_parse_match(p, be, "abcdef", 6, "(u0x61 u0x28 u0x66)"); + g_check_parse_failed(p, be, "xbcdef", 6); + g_check_parse_failed(p, be, "abcdex", 6); + g_check_parse_failed(p, be, "abc", 3); +} + void register_parser_tests(void) { g_test_add_data_func("/core/parser/packrat/token", GINT_TO_POINTER(PB_PACKRAT), test_token); g_test_add_data_func("/core/parser/packrat/ch", GINT_TO_POINTER(PB_PACKRAT), test_ch); @@ -795,6 +859,9 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/packrat/bind", GINT_TO_POINTER(PB_PACKRAT), test_bind); g_test_add_data_func("/core/parser/packrat/result_length", GINT_TO_POINTER(PB_PACKRAT), test_result_length); //g_test_add_data_func("/core/parser/packrat/token_position", GINT_TO_POINTER(PB_PACKRAT), test_token_position); + g_test_add_data_func("/core/parser/packrat/skip", GINT_TO_POINTER(PB_PACKRAT), test_skip); + g_test_add_data_func("/core/parser/packrat/seek", GINT_TO_POINTER(PB_PACKRAT), test_seek); + g_test_add_data_func("/core/parser/packrat/tell", GINT_TO_POINTER(PB_PACKRAT), test_tell); g_test_add_data_func("/core/parser/llk/token", GINT_TO_POINTER(PB_LLk), test_token); g_test_add_data_func("/core/parser/llk/ch", GINT_TO_POINTER(PB_LLk), test_ch);