diff --git a/src/backends/llvm.c b/src/backends/llvm.c index 1c42259119e345b1a4932c3d0752d10be9321bc8..79f91eafab27dac54b544601363e39e2c4567276 100644 --- a/src/backends/llvm.c +++ b/src/backends/llvm.c @@ -145,6 +145,130 @@ void h_llvm_free(HParser *parser) { llvm_parser->mod = NULL; } +/* + * Construct LLVM IR to decide if a runtime value is a member of a compile-time + * character set, and branch depending on the result. + * + * Parameters: + * - mod [in]: an LLVMModuleRef (not referenced by the current implementation) + * - func [in]: an LLVMValueRef to the function to add the new basic blocks to + * - builder [in]: an LLVMBuilderRef, positioned appropriately; on return the block it is positioned in ends with an unconditional branch to no + * - r [in]: an LLVMValueRef to the value to test; it is compared against i8 constants + * - cs [in]: the HCharset to test membership in + * - yes [in]: the basic block to branch to if r is in cs + * - no [in]: the basic block to branch to if r is not in cs + */ + +void h_llvm_make_charset_membership_test(LLVMModuleRef mod, LLVMValueRef func, LLVMBuilderRef builder, + LLVMValueRef r, HCharset cs, + LLVMBasicBlockRef yes, LLVMBasicBlockRef no) { + /* + * A charset is a 256-element bit array, 32 bytes long in total. Ours is + * static at compile time, so we can try to construct minimal LLVM IR for + * this particular charset. In particular, we should handle cases like + * only one or two bits being set, or a long consecutive range, efficiently. + * + * In LLVM IR, we can test propositions like r == x, r <= x, r >= x and their + * negations efficiently, so the challenge here is to turn a character map + * into a minimal set of such propositions. + * + * TODO: actually do this; right now for the sake of a first pass we're just + * testing r == x for every x in cs, emitting one icmp/conditional-branch pair and one new basic block per member. 
+ */ + + for (int i = 0; i < 256; ++i) { + if (charset_isset(cs, i)) { + char bbname[16]; + uint8_t c = (uint8_t)i; + snprintf(bbname, 16, "cs_memb_%02x", c); + LLVMValueRef icmp = LLVMBuildICmp(builder, LLVMIntEQ, + LLVMConstInt(LLVMInt8Type(), c, 0), r, "c == r"); + LLVMBasicBlockRef bb = LLVMAppendBasicBlock(func, bbname); + LLVMBuildCondBr(builder, icmp, yes, bb); + LLVMPositionBuilderAtEnd(builder, bb); + } + } + + LLVMBuildBr(builder, no); +} + +/* + * Construct LLVM IR to allocate a token of type TT_SINT or TT_UINT and pass it to make_result() (only TT_UINT is emitted for now) + * + * Parameters: + * - mod [in]: an LLVMModuleRef; h_arena_malloc and make_result are looked up in it by name + * - builder [in]: an LLVMBuilderRef, positioned appropriately + * - stream [in]: a value ref to an llvm_inputstreamptr, for the input stream + * - arena [in]: a value ref to an llvm_arenaptr to be used for the malloc + * - r [in]: a value ref to the value to be used for this token; it is zero-extended to i64 before being stored + * - mr_out [out]: receives the value ref of the call to make_result() + * + * TODO actually support TT_SINT, inputs other than 8 bit + */ + +void h_llvm_make_tt_suint(LLVMModuleRef mod, LLVMBuilderRef builder, + LLVMValueRef stream, LLVMValueRef arena, + LLVMValueRef r, LLVMValueRef *mr_out) { + /* Set up call to h_arena_malloc() for a new HParsedToken */ + LLVMValueRef tok_size = LLVMConstInt(LLVMInt32Type(), sizeof(HParsedToken), 0); + LLVMValueRef amalloc_args[] = { arena, tok_size }; + /* %h_arena_malloc = call void* @h_arena_malloc(%struct.HArena_.1* %1, i32 48) */ + LLVMValueRef amalloc = LLVMBuildCall(builder, LLVMGetNamedFunction(mod, "h_arena_malloc"), + amalloc_args, 2, "h_arena_malloc"); + /* %tok = bitcast void* %h_arena_malloc to %struct.HParsedToken_.2* */ + LLVMValueRef tok = LLVMBuildBitCast(builder, amalloc, llvm_parsedtokenptr, "tok"); + + /* + * tok->token_type = TT_UINT; + * + * %token_type = getelementptr inbounds %struct.HParsedToken_.2, %struct.HParsedToken_.2* %tok, i32 0, i32 0 + * + * TODO if we handle TT_SINT too, adjust here and the zero-ext below + */ + LLVMValueRef toktype = LLVMBuildStructGEP(builder, 
tok, 0, "token_type"); + /* store i32 8, i32* %token_type (NOTE(review): 8 is presumably the numeric value of TT_UINT -- confirm, and prefer the enum constant) */ + LLVMBuildStore(builder, LLVMConstInt(LLVMInt32Type(), 8, 0), toktype); + + /* + * tok->uint = r; + * + * %token_data = getelementptr inbounds %struct.HParsedToken_.2, %struct.HParsedToken_.2* %tok, i32 0, i32 1 + */ + LLVMValueRef tokdata = LLVMBuildStructGEP(builder, tok, 1, "token_data"); + /* + * TODO + * + * This is where we'll need to adjust to handle other types (sign vs. zero extend, omit extend if + * r is 64-bit already) + */ + LLVMBuildStore(builder, LLVMBuildZExt(builder, r, LLVMInt64Type(), "r"), tokdata); + /* + * Store the index from the stream into the token + */ + /* %t_index = getelementptr inbounds %struct.HParsedToken_.2, %struct.HParsedToken_.2* %tok, i32 0, i32 2 */ + LLVMValueRef tokindex = LLVMBuildStructGEP(builder, tok, 2, "t_index"); + /* %s_index = getelementptr inbounds %struct.HInputStream_.0, %struct.HInputStream_.0* %0, i32 0, i32 2 */ + LLVMValueRef streamindex = LLVMBuildStructGEP(builder, stream, 2, "s_index"); + /* %4 = load i64, i64* %s_index */ + /* store i64 %4, i64* %t_index */ + LLVMBuildStore(builder, LLVMBuildLoad(builder, streamindex, ""), tokindex); + /* Store the bit length into the token */ + LLVMValueRef tokbitlen = LLVMBuildStructGEP(builder, tok, 3, "bit_length"); + /* TODO handle multiple bit lengths; for now the length is hard-coded to 8 bits */ + LLVMBuildStore(builder, LLVMConstInt(LLVMInt64Type(), 8, 0), tokbitlen); + + /* + * Now call make_result() + * + * %make_result = call %struct.HParseResult_.3* @make_result(%struct.HArena_.1* %1, %struct.HParsedToken_.2* %tok) + */ + LLVMValueRef result_args[] = { arena, tok }; + LLVMValueRef mr = LLVMBuildCall(builder, LLVMGetNamedFunction(mod, "make_result"), + result_args, 2, "make_result"); + + *mr_out = mr; +} + HParseResult *h_llvm_parse(HAllocator* mm__, const HParser* parser, HInputStream *input_stream) { const HLLVMParser *llvm_parser = parser->backend_data; HArena *arena = h_new_arena(mm__, 0); diff --git a/src/llvm.h b/src/llvm.h index 
3b6c7ed541b8c644da0ad74ff34004207c82056b..369f5729d54c0c0f3e2babec784a887cb0bc824e 100644 --- a/src/llvm.h +++ b/src/llvm.h @@ -9,4 +9,11 @@ LLVMTypeRef llvm_inputstream, llvm_inputstreamptr, llvm_arena, llvm_arenaptr; LLVMTypeRef llvm_parsedtoken, llvm_parsedtokenptr, llvm_parseresult, llvm_parseresultptr; +void h_llvm_make_charset_membership_test(LLVMModuleRef mod, LLVMValueRef func, LLVMBuilderRef builder, + LLVMValueRef r, HCharset cs, + LLVMBasicBlockRef yes, LLVMBasicBlockRef no); +void h_llvm_make_tt_suint(LLVMModuleRef mod, LLVMBuilderRef builder, + LLVMValueRef stream, LLVMValueRef arena, + LLVMValueRef r, LLVMValueRef *mr_out); + #endif // #ifndef HAMMER_LLVM__H diff --git a/src/parsers/ch.c b/src/parsers/ch.c index a7ac9becdb7b5eab202818cc2d41b9ac74b26c25..1c396a2f3c8c2e2e8a7433964c397f8776688462 100644 --- a/src/parsers/ch.c +++ b/src/parsers/ch.c @@ -85,42 +85,9 @@ static bool ch_llvm(LLVMBuilderRef builder, LLVMValueRef func, LLVMModuleRef mod // Basic block: success LLVMPositionBuilderAtEnd(builder, success); - // Set up call to h_arena_malloc() for a new HParsedToken - LLVMValueRef tok_size = LLVMConstInt(LLVMInt32Type(), sizeof(HParsedToken), 0); - LLVMValueRef amalloc_args[] = { arena, tok_size }; - // %h_arena_malloc = call void* @h_arena_malloc(%struct.HArena_.1* %1, i32 48) - LLVMValueRef amalloc = LLVMBuildCall(builder, LLVMGetNamedFunction(mod, "h_arena_malloc"), amalloc_args, 2, "h_arena_malloc"); - // %3 = bitcast void* %h_arena_malloc to %struct.HParsedToken_.2* - LLVMValueRef tok = LLVMBuildBitCast(builder, amalloc, llvm_parsedtokenptr, ""); - - // tok->token_type = TT_UINT; - // - // %token_type = getelementptr inbounds %struct.HParsedToken_.2, %struct.HParsedToken_.2* %3, i32 0, i32 0 - LLVMValueRef toktype = LLVMBuildStructGEP(builder, tok, 0, "token_type"); - // store i32 8, i32* %token_type - LLVMBuildStore(builder, LLVMConstInt(LLVMInt32Type(), 8, 0), toktype); - - // tok->uint = r; - // - // %token_data = getelementptr inbounds 
%struct.HParsedToken_.2, %struct.HParsedToken_.2* %3, i32 0, i32 1 - LLVMValueRef tokdata = LLVMBuildStructGEP(builder, tok, 1, "token_data"); - // %r = zext i8 %2 to i64 - // store i64 %r, i64* %token_data - LLVMBuildStore(builder, LLVMBuildZExt(builder, r, LLVMInt64Type(), "r"), tokdata); - // %t_index = getelementptr inbounds %struct.HParsedToken_.2, %struct.HParsedToken_.2* %3, i32 0, i32 2 - LLVMValueRef tokindex = LLVMBuildStructGEP(builder, tok, 2, "t_index"); - // %s_index = getelementptr inbounds %struct.HInputStream_.0, %struct.HInputStream_.0* %0, i32 0, i32 2 - LLVMValueRef streamindex = LLVMBuildStructGEP(builder, stream, 2, "s_index"); - // %4 = load i64, i64* %s_index - // store i64 %4, i64* %t_index - LLVMBuildStore(builder, LLVMBuildLoad(builder, streamindex, ""), tokindex); - LLVMValueRef tokbitlen = LLVMBuildStructGEP(builder, tok, 3, "bit_length"); - LLVMBuildStore(builder, LLVMConstInt(LLVMInt64Type(), 8, 0), tokbitlen); - - // Now call make_result() - // %make_result = call %struct.HParseResult_.3* @make_result(%struct.HArena_.1* %1, %struct.HParsedToken_.2* %3) - LLVMValueRef result_args[] = { arena, tok }; - LLVMValueRef mr = LLVMBuildCall(builder, LLVMGetNamedFunction(mod, "make_result"), result_args, 2, "make_result"); + /* Emit the IR that builds a token from r; mr receives the value of the make_result() call */ + LLVMValueRef mr; + h_llvm_make_tt_suint(mod, builder, stream, arena, r, &mr); // br label %ch_end LLVMBuildBr(builder, end); diff --git a/src/parsers/charset.c b/src/parsers/charset.c index a4b8c89c7daca326cf77ee9bf5c8ae4660884c56..2f73da722408c1575eab883afd0242eef63eacd2 100644 --- a/src/parsers/charset.c +++ b/src/parsers/charset.c @@ -1,7 +1,12 @@ #include <assert.h> #include <string.h> #include "../internal.h" +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +#include <llvm-c/Core.h> +#pragma GCC diagnostic pop #include "parser_internal.h" +#include "../llvm.h" static HParseResult* parse_charset(void *env, HParseState *state) { uint8_t in = h_read_bits(&state->input_stream, 
8, false); @@ -70,12 +75,82 @@ static bool cs_ctrvm(HRVMProg *prog, void *env) { return true; } +static bool cs_llvm(LLVMBuilderRef builder, LLVMValueRef func, + LLVMModuleRef mod, void* env) { + /* + * LLVM to build a function to parse a charset; the args are a stream and an + * arena. env is cast to the HCharset to match against, and this always returns true. NOTE(review): the bitcast of the stream parameter below is emitted before the builder is positioned at cs_entry -- confirm that the caller positions the builder first. + */ + + LLVMValueRef stream = LLVMGetFirstParam(func); + stream = LLVMBuildBitCast(builder, stream, llvm_inputstreamptr, "stream"); + LLVMValueRef arena = LLVMGetLastParam(func); + + /* Set up our basic blocks */ + LLVMBasicBlockRef entry = LLVMAppendBasicBlock(func, "cs_entry"); + LLVMBasicBlockRef success = LLVMAppendBasicBlock(func, "cs_success"); + LLVMBasicBlockRef fail = LLVMAppendBasicBlock(func, "cs_fail"); + LLVMBasicBlockRef end = LLVMAppendBasicBlock(func, "cs_end"); + + /* Basic block: entry */ + LLVMPositionBuilderAtEnd(builder, entry); + /* First we read the char: a call to h_read_bits with the stream, a count of 8 and a third argument of 0 */ + LLVMValueRef bits_args[3]; + bits_args[0] = stream; + bits_args[1] = LLVMConstInt(LLVMInt32Type(), 8, 0); + bits_args[2] = LLVMConstInt(LLVMInt8Type(), 0, 0); + LLVMValueRef bits = LLVMBuildCall(builder, LLVMGetNamedFunction(mod, "h_read_bits"), bits_args, 3, "read_bits"); + LLVMValueRef r = LLVMBuildTrunc(builder, bits, LLVMInt8Type(), ""); // TODO Necessary? 
(same question in ch_llvm()) + + /* We have a char, need to check if it's in the charset */ + HCharset cs = (HCharset)env; + /* Branch to either success or fail, conditional on whether r is in cs */ + h_llvm_make_charset_membership_test(mod, func, builder, r, cs, success, fail); + + /* Basic block: success */ + LLVMPositionBuilderAtEnd(builder, success); + + LLVMValueRef mr; + h_llvm_make_tt_suint(mod, builder, stream, arena, r, &mr); + + /* br label %cs_end */ + LLVMBuildBr(builder, end); + + /* Basic block: fail */ + LLVMPositionBuilderAtEnd(builder, fail); + /* + * We just branch straight to end; this exists so that the phi node in + * end knows where all the incoming edges are from, rather than needing + * some basic block constructed in h_llvm_make_charset_membership_test() + */ + LLVMBuildBr(builder, end); + + /* Basic block: end */ + LLVMPositionBuilderAtEnd(builder, end); + // %rv = phi %struct.HParseResult_.3* [ %make_result, %cs_success ], [ null, %cs_fail ] + LLVMValueRef rv = LLVMBuildPhi(builder, llvm_parseresultptr, "rv"); + LLVMBasicBlockRef rv_phi_incoming_blocks[] = { + success, + fail + }; + LLVMValueRef rv_phi_incoming_values[] = { + mr, + LLVMConstNull(llvm_parseresultptr) + }; + LLVMAddIncoming(rv, rv_phi_incoming_values, rv_phi_incoming_blocks, 2); + // ret %struct.HParseResult_.3* %rv + LLVMBuildRet(builder, rv); + + return true; +} + static const HParserVtable charset_vt = { .parse = parse_charset, .isValidRegular = h_true, .isValidCF = h_true, .desugar = desugar_charset, .compile_to_rvm = cs_ctrvm, + .llvm = cs_llvm, .higher = false, }; diff --git a/src/t_parser.c b/src/t_parser.c index 17dc9a91cc5671567a3db18bb5cb2f5148f5e90a..f7c4baf7c0b59342949b3e5b0a5ce1d1b913ac2b 100644 --- a/src/t_parser.c +++ b/src/t_parser.c @@ -963,4 +963,5 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/glr/token_position", GINT_TO_POINTER(PB_GLR), test_token_position); g_test_add_data_func("/core/parser/llvm/ch", GINT_TO_POINTER(PB_LLVM), 
test_ch); + g_test_add_data_func("/core/parser/llvm/ch_range", GINT_TO_POINTER(PB_LLVM), test_ch_range); }