diff --git a/src/backends/llvm.c b/src/backends/llvm.c index edbe3ebbecbfcd70df93cfc946e41596887a2965..79f91eafab27dac54b544601363e39e2c4567276 100644 --- a/src/backends/llvm.c +++ b/src/backends/llvm.c @@ -145,6 +145,53 @@ void h_llvm_free(HParser *parser) { llvm_parser->mod = NULL; } +/* + * Construct LLVM IR to decide if a runtime value is a member of a compile-time + * character set, and branch depending on the result. + * + * Parameters: + * - mod [in]: an LLVMModuleRef (not referenced by the current implementation) + * - func [in]: an LLVMValueRef to the function to add the new basic blocks + * - builder [in]: an LLVMBuilderRef, positioned where the test should start; on return the block it ends in is terminated by a branch to no + * - r [in]: an LLVMValueRef to the value to test; it is compared against i8 constants + * - cs [in]: the HCharset to test membership in + * - yes [in]: the basic block to branch to if r is in cs + * - no [in]: the basic block to branch to if r is not in cs + */ + +void h_llvm_make_charset_membership_test(LLVMModuleRef mod, LLVMValueRef func, LLVMBuilderRef builder, + LLVMValueRef r, HCharset cs, + LLVMBasicBlockRef yes, LLVMBasicBlockRef no) { + /* + * A charset is a 256-element bit array, 32 bytes long in total. Ours is + * static at compile time, so we can try to construct minimal LLVM IR for + * this particular charset. In particular, we should handle cases like + * only one or two bits being set, or a long consecutive range, efficiently. + * + * In LLVM IR, we can test propositions like r == x, r <= x, r >= x and their + * negations efficiently, so the challenge here is to turn a character map + * into a minimal set of such propositions. + * + * TODO: actually do this; right now for the sake of a first pass we're just + * emitting a chain of r == x tests, one for every x in cs. 
+ */ + + for (int i = 0; i < 256; ++i) { + if (charset_isset(cs, i)) { + char bbname[16]; /* "cs_memb_" plus two hex digits plus NUL needs 11 bytes */ + uint8_t c = (uint8_t)i; + snprintf(bbname, 16, "cs_memb_%02x", c); + LLVMValueRef icmp = LLVMBuildICmp(builder, LLVMIntEQ, + LLVMConstInt(LLVMInt8Type(), c, 0), r, "c == r"); + LLVMBasicBlockRef bb = LLVMAppendBasicBlock(func, bbname); + LLVMBuildCondBr(builder, icmp, yes, bb); /* equal: jump to yes; otherwise continue in the new block bb */ + LLVMPositionBuilderAtEnd(builder, bb); /* the next test, or the final branch to no, is emitted in bb */ + } + } + + LLVMBuildBr(builder, no); /* none of the emitted tests matched r */ +} + /* * Construct LLVM IR to allocate a token of type TT_SINT or TT_UINT * diff --git a/src/llvm.h b/src/llvm.h index 927241f7cd4fa6f897e726d72a954de9f4343c96..369f5729d54c0c0f3e2babec784a887cb0bc824e 100644 --- a/src/llvm.h +++ b/src/llvm.h @@ -9,6 +9,9 @@ LLVMTypeRef llvm_inputstream, llvm_inputstreamptr, llvm_arena, llvm_arenaptr; LLVMTypeRef llvm_parsedtoken, llvm_parsedtokenptr, llvm_parseresult, llvm_parseresultptr; +void h_llvm_make_charset_membership_test(LLVMModuleRef mod, LLVMValueRef func, LLVMBuilderRef builder, + LLVMValueRef r, HCharset cs, + LLVMBasicBlockRef yes, LLVMBasicBlockRef no); void h_llvm_make_tt_suint(LLVMModuleRef mod, LLVMBuilderRef builder, LLVMValueRef stream, LLVMValueRef arena, LLVMValueRef r, LLVMValueRef *mr_out); diff --git a/src/parsers/charset.c b/src/parsers/charset.c index a4b8c89c7daca326cf77ee9bf5c8ae4660884c56..2f73da722408c1575eab883afd0242eef63eacd2 100644 --- a/src/parsers/charset.c +++ b/src/parsers/charset.c @@ -1,7 +1,12 @@ #include <assert.h> #include <string.h> #include "../internal.h" +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +#include <llvm-c/Core.h> +#pragma GCC diagnostic pop #include "parser_internal.h" +#include "../llvm.h" static HParseResult* parse_charset(void *env, HParseState *state) { uint8_t in = h_read_bits(&state->input_stream, 8, false); @@ -70,12 +75,82 @@ static bool cs_ctrvm(HRVMProg *prog, void *env) { return true; } +static bool cs_llvm(LLVMBuilderRef builder, LLVMValueRef func, + LLVMModuleRef mod, void* env) { + /* + * 
Emit LLVM IR into func to parse a charset; func's args are a stream and an + * arena. env is cast to the HCharset to match against. Always returns true. + */ + + LLVMValueRef stream = LLVMGetFirstParam(func); + stream = LLVMBuildBitCast(builder, stream, llvm_inputstreamptr, "stream"); /* NOTE(review): emitted before the builder is positioned at cs_entry below - confirm the caller's builder position makes this valid */ + LLVMValueRef arena = LLVMGetLastParam(func); + + /* Set up our basic blocks */ + LLVMBasicBlockRef entry = LLVMAppendBasicBlock(func, "cs_entry"); + LLVMBasicBlockRef success = LLVMAppendBasicBlock(func, "cs_success"); + LLVMBasicBlockRef fail = LLVMAppendBasicBlock(func, "cs_fail"); + LLVMBasicBlockRef end = LLVMAppendBasicBlock(func, "cs_end"); + + /* Basic block: entry */ + LLVMPositionBuilderAtEnd(builder, entry); + /* First we read the char: call h_read_bits(stream, 8, 0) and truncate the result to i8 */ + LLVMValueRef bits_args[3]; + bits_args[0] = stream; + bits_args[1] = LLVMConstInt(LLVMInt32Type(), 8, 0); + bits_args[2] = LLVMConstInt(LLVMInt8Type(), 0, 0); + LLVMValueRef bits = LLVMBuildCall(builder, LLVMGetNamedFunction(mod, "h_read_bits"), bits_args, 3, "read_bits"); + LLVMValueRef r = LLVMBuildTrunc(builder, bits, LLVMInt8Type(), ""); // TODO Necessary? 
(same question in ch_llvm()) + + /* We have a char, need to check if it's in the charset */ + HCharset cs = (HCharset)env; + /* Branch to either success or fail, conditional on whether r is in cs */ + h_llvm_make_charset_membership_test(mod, func, builder, r, cs, success, fail); + + /* Basic block: success */ + LLVMPositionBuilderAtEnd(builder, success); + + LLVMValueRef mr; /* filled in through the mr_out parameter of h_llvm_make_tt_suint(); used as the success value of the phi in end */ + h_llvm_make_tt_suint(mod, builder, stream, arena, r, &mr); + + /* br label %cs_end */ + LLVMBuildBr(builder, end); + + /* Basic block: fail */ + LLVMPositionBuilderAtEnd(builder, fail); + /* + * We just branch straight to end; this exists so that the phi node in + * end knows where all the incoming edges are from, rather than needing + * some basic block constructed in h_llvm_make_charset_membership_test() + */ + LLVMBuildBr(builder, end); + + /* Basic block: end */ + LLVMPositionBuilderAtEnd(builder, end); + // %rv = phi %struct.HParseResult_.3* [ %make_result, %cs_success ], [ null, %cs_fail ] + LLVMValueRef rv = LLVMBuildPhi(builder, llvm_parseresultptr, "rv"); + LLVMBasicBlockRef rv_phi_incoming_blocks[] = { + success, + fail + }; + LLVMValueRef rv_phi_incoming_values[] = { + mr, + LLVMConstNull(llvm_parseresultptr) + }; + LLVMAddIncoming(rv, rv_phi_incoming_values, rv_phi_incoming_blocks, 2); + // ret %struct.HParseResult_.3* %rv + LLVMBuildRet(builder, rv); + + return true; +} + static const HParserVtable charset_vt = { .parse = parse_charset, .isValidRegular = h_true, .isValidCF = h_true, .desugar = desugar_charset, .compile_to_rvm = cs_ctrvm, + .llvm = cs_llvm, .higher = false, }; diff --git a/src/t_parser.c b/src/t_parser.c index 17dc9a91cc5671567a3db18bb5cb2f5148f5e90a..f7c4baf7c0b59342949b3e5b0a5ce1d1b913ac2b 100644 --- a/src/t_parser.c +++ b/src/t_parser.c @@ -963,4 +963,5 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/glr/token_position", GINT_TO_POINTER(PB_GLR), test_token_position); g_test_add_data_func("/core/parser/llvm/ch", GINT_TO_POINTER(PB_LLVM), 
test_ch); + g_test_add_data_func("/core/parser/llvm/ch_range", GINT_TO_POINTER(PB_LLVM), test_ch_range); }