diff --git a/src/SConscript b/src/SConscript
index 0c4f81ed3bcce5408e681ec92a7f0a9e677141f3..1a920a72500081b4dc7b3f04c4274169e1826c64 100644
--- a/src/SConscript
+++ b/src/SConscript
@@ -52,7 +52,8 @@ parsers = ['parsers/%s.c'%s for s in
             'unimplemented',
             'whitespace',
             'xor',
-            'value']]
+            'value',
+            'seek']]

 backends = ['backends/%s.c' % s for s in
             ['packrat', 'llk', 'regex', 'glr', 'lalr', 'lr', 'lr0']]
diff --git a/src/bitreader.c b/src/bitreader.c
index fe21e439ec778aa39b3cbeb18c0b3ba4fbe337fd..0f0825b87c60697f4bd8aff727a3ffe4ecc19532 100644
--- a/src/bitreader.c
+++ b/src/bitreader.c
@@ -108,3 +108,90 @@ int64_t h_read_bits(HInputStream* state, int count, char signed_p) {
   out <<= final_shift;
   return (out ^ msb) - msb; // perform sign extension
 }
+
+/**
+ * Advance the input position by 'count' bits without reading them.
+ *
+ * A no-op when 'count' is zero or the stream has already overrun. If fewer
+ * than 'count' bits remain, the 'overrun' flag is set.
+ */
+void h_skip_bits(HInputStream* stream, size_t count) {
+  size_t left;
+
+  if (count == 0)
+    return;
+
+  if (stream->overrun)
+    return;
+
+  if (stream->index == stream->length) {
+    stream->overrun = true;
+    return;
+  }
+
+  // consume from a partial byte?
+  left = 8 - stream->bit_offset - stream->margin;
+  if (count < left) {
+    stream->bit_offset += count;
+    return;
+  }
+  if (left < 8) {
+    stream->index += 1;
+    stream->bit_offset = 0;
+    stream->margin = 0;
+    count -= left;
+  }
+  assert(stream->bit_offset == 0);
+  assert(stream->margin == 0);
+
+  // consume full bytes
+  left = stream->length - stream->index;
+  if (count / 8 <= left) {
+    stream->index += count / 8;
+    count = count % 8;
+  } else {
+    stream->index = stream->length;
+    stream->overrun = true;
+    return;
+  }
+  assert(count < 8);
+
+  // final partial byte
+  if (count > 0 && stream->index == stream->length)
+    stream->overrun = true;
+  else
+    stream->bit_offset = count;
+}
+
+/**
+ * Move the input position to the absolute bit offset 'pos'.
+ *
+ * Seeking exactly to the end of input is allowed. A position beyond that
+ * leaves the stream at the end of input with the 'overrun' flag set.
+ */
+void h_seek_bits(HInputStream* stream, size_t pos) {
+  size_t pos_index = pos / 8;
+  size_t pos_offset = pos % 8;
+
+  /* seek past the end? Checked before the same-byte shortcut below so that
+   * a nonzero bit offset is rejected even when the stream already sits at
+   * the end of input. */
+  if ((pos_index > stream->length) ||
+      (pos_index == stream->length && pos_offset > 0)) {
+    stream->index = stream->length;
+    stream->bit_offset = 0;
+    stream->margin = 0;
+    stream->overrun = true;
+    return;
+  }
+
+  /* seek within the current byte? */
+  if (pos_index == stream->index) {
+    stream->bit_offset = pos_offset;
+    return;
+  }
+
+  stream->index = pos_index;
+  stream->bit_offset = pos_offset;
+  stream->margin = 0;
+}
diff --git a/src/hammer.h b/src/hammer.h
index ad44fee910fcf42445e57e47ec8c1fe2d18d3724..32ec2e05c18dbb322b3769107b9d96999341b064 100644
--- a/src/hammer.h
+++ b/src/hammer.h
@@ -716,6 +716,32 @@ HAMMER_FN_DECL(HParser*, h_get_value, const char* name);
  */
 HAMMER_FN_DECL(HParser*, h_bind, const HParser *p, HContinuation k, void *env);

+/**
+ * This parser skips 'n' bits of input.
+ *
+ * Result: None. The HParseResult exists but its AST is NULL.
+ */
+HAMMER_FN_DECL(HParser*, h_skip, size_t n);
+
+/**
+ * The HParser equivalent of fseek(), 'h_seek' modifies the parser's input
+ * position. Note that contrary to 'fseek', offsets are in bits, not bytes.
+ * The 'whence' argument uses the same values and semantics: SEEK_SET,
+ * SEEK_CUR, SEEK_END.
+ *
+ * Fails if the new input position would be negative or past the end of input.
+ *
+ * Result: TT_UINT. The new input position.
+ */
+HAMMER_FN_DECL(HParser*, h_seek, ssize_t offset, int whence);
+
+/**
+ * Report the current position in bits. Consumes no input.
+ *
+ * Result: TT_UINT. The current input position.
+ */
+HAMMER_FN_DECL_NOARG(HParser*, h_tell);
+
 /**
  * Free the memory allocated to an HParseResult when it is no longer needed.
  */
diff --git a/src/internal.h b/src/internal.h
index 0e92e99e6facf5d04c6b13ca8de51272ba630a1d..347646a1835065b24004f4d2064495e348a4658c 100644
--- a/src/internal.h
+++ b/src/internal.h
@@ -327,9 +327,16 @@ extern HParserBackendVTable h__glr_backend_vtable;
 // TODO(thequux): Set symbol visibility for these functions so that they aren't exported.

 int64_t h_read_bits(HInputStream* state, int count, char signed_p);
+void h_skip_bits(HInputStream* state, size_t count);
+void h_seek_bits(HInputStream* state, size_t pos);
 static inline size_t h_input_stream_pos(HInputStream* state) {
+  assert(state->index < SIZE_MAX / 8);
   return state->index * 8 + state->bit_offset + state->margin;
 }
+static inline size_t h_input_stream_length(HInputStream *state) {
+  assert(state->length <= SIZE_MAX / 8);
+  return state->length * 8;
+}
 // need to decide if we want to make this public.
 HParseResult* h_do_parse(const HParser* parser, HParseState *state);
 void put_cached(HParseState *ps, const HParser *p, HParseResult *cached);
diff --git a/src/parsers/seek.c b/src/parsers/seek.c
new file mode 100644
index 0000000000000000000000000000000000000000..027098b59424a2f78c9b54a0683e66111c02863f
--- /dev/null
+++ b/src/parsers/seek.c
@@ -0,0 +1,128 @@
+#include "parser_internal.h"
+
+/* Environment of an h_seek parser: the arguments given to h_seek(). */
+typedef struct {
+    ssize_t offset;
+    int whence;
+} HSeek;
+
+/* Parse function of h_skip; 'env' carries the bit count cast to a pointer. */
+static HParseResult *parse_skip(void *env, HParseState *state)
+{
+    size_t n = (uintptr_t)env;
+
+    h_skip_bits(&state->input_stream, n);
+    return make_result(state->arena, NULL);
+}
+
+/*
+ * Parse function of h_seek. Returns NULL for an unknown 'whence', for a
+ * target position that is negative or not representable, and when the
+ * stream's overrun flag is set after the seek; otherwise returns the new
+ * bit position as a TT_UINT.
+ */
+static HParseResult *parse_seek(void *env, HParseState *state)
+{
+    HSeek *s = (HSeek *)env;
+    HInputStream *stream = &state->input_stream;
+    size_t pos;
+
+    /* determine base position */
+    switch (s->whence) {
+    case SEEK_SET:
+        pos = 0;
+        break;
+    case SEEK_END:
+        pos = h_input_stream_length(stream);
+        break;
+    case SEEK_CUR:
+        pos = h_input_stream_pos(stream);
+        break;
+    default:
+        return NULL; /* invalid argument */
+    }
+
+    /* calculate target position and do basic overflow checks; the magnitude
+     * of a negative offset is taken in unsigned arithmetic because negating
+     * the most negative ssize_t value would overflow */
+    if (s->offset < 0 && (size_t)0 - (size_t)s->offset > pos)
+        return NULL; /* underflow */
+    if (s->offset > 0 && SIZE_MAX - s->offset < pos)
+        return NULL; /* overflow */
+    pos += s->offset;
+
+    /* perform the seek and check for overrun */
+    h_seek_bits(stream, pos);
+    if (stream->overrun)
+        return NULL;
+
+    HParsedToken *tok = a_new(HParsedToken, 1);
+    tok->token_type = TT_UINT;
+    tok->uint = pos;
+    return make_result(state->arena, tok);
+}
+
+/* Parse function of h_tell: yields the current bit position as a TT_UINT. */
+static HParseResult *parse_tell(void *env, HParseState *state)
+{
+    HParsedToken *tok = a_new(HParsedToken, 1);
+    tok->token_type = TT_UINT;
+    tok->uint = h_input_stream_pos(&state->input_stream);
+    return make_result(state->arena, tok);
+}
+
+static const HParserVtable skip_vt = {
+    .parse = parse_skip,
+    .isValidRegular = h_false,
+    .isValidCF = h_false,
+    .compile_to_rvm = h_not_regular,
+    .higher = false,
+};
+
+static const HParserVtable seek_vt = {
+    .parse = parse_seek,
+    .isValidRegular = h_false,
+    .isValidCF = h_false,
+    .compile_to_rvm = h_not_regular,
+    .higher = false,
+};
+
+static const HParserVtable tell_vt = {
+    .parse = parse_tell,
+    .isValidRegular = h_false,
+    .isValidCF = h_false,
+    .compile_to_rvm = h_not_regular,
+    .higher = false,
+};
+
+HParser* h_skip(size_t n)
+{
+    return h_skip__m(&system_allocator, n);
+}
+
+HParser *h_skip__m(HAllocator* mm__, size_t n)
+{
+    return h_new_parser(mm__, &skip_vt, (void *)n);
+}
+
+HParser* h_seek(ssize_t offset, int whence)
+{
+    return h_seek__m(&system_allocator, offset, whence);
+}
+
+HParser *h_seek__m(HAllocator* mm__, ssize_t offset, int whence)
+{
+    HSeek *env = h_new(HSeek, 1);
+    env->offset = offset;
+    env->whence = whence;
+    return h_new_parser(mm__, &seek_vt, env);
+}
+
+HParser *h_tell(void)
+{
+    return h_tell__m(&system_allocator);
+}
+
+HParser *h_tell__m(HAllocator* mm__)
+{
+    return h_new_parser(mm__, &tell_vt, NULL);
+}
diff --git a/src/t_parser.c b/src/t_parser.c
index 331d2629018b40717bf49309ba0b561ce7a618a3..90f62bc5497e2f75b1ed315274a19c6081b327a9 100644
--- a/src/t_parser.c
+++ b/src/t_parser.c
@@ -743,6 +743,75 @@ static void test_bind(gconstpointer backend) {
   g_check_parse_failed(p, be, "272{", 4);
 }

+static void test_skip(gconstpointer backend) {
+  HParserBackend be = (HParserBackend)GPOINTER_TO_INT(backend);
+  const HParser *p, *p_le, *p_be;
+
+  p = h_sequence(h_ch('a'), h_skip(32), h_ch('f'), NULL);
+  g_check_parse_match(p, be, "abcdef", 6, "(u0x61 u0x66)");
+  g_check_parse_failed(p, be, "abcdex", 6);
+  g_check_parse_failed(p, be, "abc", 3);
+
+  p = h_sequence(h_ch('a'), h_skip(32), h_end_p(), NULL);
+  g_check_parse_match(p, be, "abcde", 5, "(u0x61)");
+
+  p = h_sequence(h_ch('a'), h_skip(3), h_ch('\0'), h_skip(5), h_ch('b'), NULL);
+  g_check_parse_match(p, be, "a\xe0\x1f\x62", 4, "(u0x61 u0 u0x62)"); // big-endian
+  p_le = h_with_endianness(BYTE_LITTLE_ENDIAN|BIT_LITTLE_ENDIAN, p);
+  p_be = h_with_endianness(BYTE_LITTLE_ENDIAN|BIT_BIG_ENDIAN, p);
+  g_check_parse_match(p_be, be, "a\xe0\x1f\x62", 4, "(u0x61 u0 u0x62)");
+  g_check_parse_match(p_le, be, "a\x07\xf8\x62", 4, "(u0x61 u0 u0x62)");
+
+  p = h_sequence(h_ch('a'), h_skip(3), h_ch('\0'), h_skip(5), h_end_p(), NULL);
+  g_check_parse_match(p, be, "a\xe0\x1f", 3, "(u0x61 u0)"); // big-endian
+}
+
+static void test_tell(gconstpointer backend) {
+  HParserBackend be = (HParserBackend)GPOINTER_TO_INT(backend);
+  const HParser *p;
+
+  p = h_sequence(h_ch('a'), h_ch('b'), h_tell(), h_end_p(), NULL);
+  g_check_parse_match(p, be, "ab", 2, "(u0x61 u0x62 u0x10)");
+  g_check_parse_failed(p, be, "abc", 3);
+  g_check_parse_failed(p, be, "a", 1);
+}
+
+static void test_seek(gconstpointer backend) {
+  HParserBackend be = (HParserBackend)GPOINTER_TO_INT(backend);
+  const HParser *p;
+
+  p = h_sequence(h_ch('a'), h_seek(40, SEEK_SET), h_ch('f'), NULL);
+  g_check_parse_match(p, be, "abcdef", 6, "(u0x61 u0x28 u0x66)");
+  g_check_parse_failed(p, be, "abcdex", 6);
+  g_check_parse_failed(p, be, "abc", 3);
+
+  p = h_sequence(h_ch('a'), h_seek(40, SEEK_SET), h_end_p(), NULL);
+  g_check_parse_match(p, be, "abcde", 5, "(u0x61 u0x28)");
+  g_check_parse_failed(p, be, "abcdex", 6);
+  g_check_parse_failed(p, be, "abc", 3);
+
+  p = h_sequence(h_ch('a'), h_seek(0, SEEK_END), h_end_p(), NULL);
+  g_check_parse_match(p, be, "abcde", 5, "(u0x61 u0x28)");
+  g_check_parse_match(p, be, "abc", 3, "(u0x61 u0x18)");
+
+  p = h_sequence(h_ch('a'), h_seek(-16, SEEK_END), h_ch('x'), NULL);
+  g_check_parse_match(p, be, "abcdxy", 6, "(u0x61 u0x20 u0x78)");
+  g_check_parse_match(p, be, "abxy", 4, "(u0x61 u0x10 u0x78)");
+  g_check_parse_failed(p, be, "abc", 3);
+  g_check_parse_failed(p, be, "x", 1);
+
+  p = h_sequence(h_ch('a'), h_seek(32, SEEK_CUR), h_ch('f'), NULL);
+  g_check_parse_match(p, be, "abcdef", 6, "(u0x61 u0x28 u0x66)");
+  g_check_parse_failed(p, be, "xbcdef", 6);
+  g_check_parse_failed(p, be, "abcdex", 6);
+  g_check_parse_failed(p, be, "abc", 3);
+
+  // a seek must not land on a partial byte past the end of input
+  p = h_sequence(h_ch('a'), h_seek(4, SEEK_CUR), NULL);
+  g_check_parse_match(p, be, "ab", 2, "(u0x61 u0xc)");
+  g_check_parse_failed(p, be, "a", 1);
+}
+
 void register_parser_tests(void) {
   g_test_add_data_func("/core/parser/packrat/token", GINT_TO_POINTER(PB_PACKRAT), test_token);
   g_test_add_data_func("/core/parser/packrat/ch", GINT_TO_POINTER(PB_PACKRAT), test_ch);
@@ -795,6 +864,9 @@ void register_parser_tests(void) {
   g_test_add_data_func("/core/parser/packrat/bind", GINT_TO_POINTER(PB_PACKRAT), test_bind);
   g_test_add_data_func("/core/parser/packrat/result_length", GINT_TO_POINTER(PB_PACKRAT), test_result_length);
   //g_test_add_data_func("/core/parser/packrat/token_position", GINT_TO_POINTER(PB_PACKRAT), test_token_position);
+  g_test_add_data_func("/core/parser/packrat/skip", GINT_TO_POINTER(PB_PACKRAT), test_skip);
+  g_test_add_data_func("/core/parser/packrat/seek", GINT_TO_POINTER(PB_PACKRAT), test_seek);
+  g_test_add_data_func("/core/parser/packrat/tell", GINT_TO_POINTER(PB_PACKRAT), test_tell);

   g_test_add_data_func("/core/parser/llk/token", GINT_TO_POINTER(PB_LLk), test_token);
   g_test_add_data_func("/core/parser/llk/ch", GINT_TO_POINTER(PB_LLk), test_ch);