diff --git a/src/SConscript b/src/SConscript index 9b89730d9b7015dad78e40f790a414cdab00fb44..6ec943c32352bf3eb654599af8ae6503eb50190f 100644 --- a/src/SConscript +++ b/src/SConscript @@ -70,7 +70,7 @@ backends = ['backends/%s.c' % s for s in # Add LLVM backend if enabled if GetOption("use_llvm"): - llvm_backend_files = ['llvm.c', 'llvm_charset.c', 'llvm_suint.c'] + llvm_backend_files = ['llvm.c', 'llvm_bytes.c', 'llvm_charset.c', 'llvm_suint.c'] backends = backends + ['backends/llvm/%s' % s for s in llvm_backend_files] misc_hammer_parts = [ diff --git a/src/backends/llvm/llvm.c b/src/backends/llvm/llvm.c index 185367a2e166920ada40a3e629ab72a38b747261..1e673b67f3954360ede0f8aae1fbd205fd97f1f1 100644 --- a/src/backends/llvm/llvm.c +++ b/src/backends/llvm/llvm.c @@ -26,6 +26,20 @@ HParseResult* make_result(HArena *arena, HParsedToken *tok) { } void h_llvm_declare_common(HLLVMParserCompileContext *ctxt) { +#if SIZE_MAX == UINT64_MAX + ctxt->llvm_size_t = LLVMInt64Type(); +#elif SIZE_MAX == UINT32_MAX + ctxt->llvm_size_t = LLVMInt32Type(); +#else +#error "SIZE_MAX is not consistent with either 64 or 32-bit platform, couldn't guess LLVM type for size_t" +#endif +#if UINTPTR_MAX == UINT64_MAX + ctxt->llvm_intptr_t = LLVMInt64Type(); +#elif UINTPTR_MAX == UINT32_MAX + ctxt->llvm_intptr_t = LLVMInt32Type(); +#else +#error "UINTPTR_MAX is not consistent with either 64 or 32-bit platform, couldn't guess LLVM type for intptr" +#endif ctxt->llvm_inputstream = LLVMStructCreateNamed(LLVMGetGlobalContext(), "struct.HInputStream_"); LLVMTypeRef llvm_inputstream_struct_types[] = { LLVMPointerType(LLVMInt8Type(), 0), @@ -46,12 +60,20 @@ void h_llvm_declare_common(HLLVMParserCompileContext *ctxt) { LLVMTypeRef llvm_parsedtoken_struct_types[] = { LLVMInt32Type(), // actually an enum value LLVMInt64Type(), // actually this is a union; the largest thing in it is 64 bits - LLVMInt64Type(), // FIXME sizeof(size_t) will be 32 bits on 32-bit platforms - LLVMInt64Type(), // FIXME ditto + 
ctxt->llvm_size_t, + ctxt->llvm_size_t, LLVMInt8Type() }; LLVMStructSetBody(ctxt->llvm_parsedtoken, llvm_parsedtoken_struct_types, 5, 0); ctxt->llvm_parsedtokenptr = LLVMPointerType(ctxt->llvm_parsedtoken, 0); + /* The HBytes struct is one of the cases for the union in HParsedToken */ + ctxt->llvm_hbytes = LLVMStructCreateNamed(LLVMGetGlobalContext(), "struct.HBytes_"); + LLVMTypeRef llvm_hbytes_struct_types[] = { + LLVMPointerType(LLVMInt8Type(), 0), /* HBytes.token */ + ctxt->llvm_size_t /* HBytes.len */ + }; + LLVMStructSetBody(ctxt->llvm_hbytes, llvm_hbytes_struct_types, 2, 0); + ctxt->llvm_hbytesptr = LLVMPointerType(ctxt->llvm_hbytes, 0); ctxt->llvm_parseresult = LLVMStructCreateNamed(LLVMGetGlobalContext(), "struct.HParseResult_"); LLVMTypeRef llvm_parseresult_struct_types[] = { ctxt->llvm_parsedtokenptr, diff --git a/src/backends/llvm/llvm.h b/src/backends/llvm/llvm.h index 0721c3733b818877090af2e420d611902b375e5b..aab534a56573ee7501f1b7f00090d0d7b7852c99 100644 --- a/src/backends/llvm/llvm.h +++ b/src/backends/llvm/llvm.h @@ -20,6 +20,10 @@ struct HLLVMParserCompileContext_ { LLVMValueRef func; LLVMBuilderRef builder; /* Typerefs */ + /* We determine typerefs for some standard C types we'll need later up front */ + LLVMTypeRef llvm_size_t; + LLVMTypeRef llvm_intptr_t; + /* LLVM types for Hammer structs and pointers */ LLVMTypeRef llvm_inputstream; LLVMTypeRef llvm_inputstreamptr; LLVMTypeRef llvm_arena; @@ -28,6 +32,8 @@ struct HLLVMParserCompileContext_ { LLVMTypeRef llvm_parsedtokenptr; LLVMTypeRef llvm_parseresult; LLVMTypeRef llvm_parseresultptr; + LLVMTypeRef llvm_hbytes; + LLVMTypeRef llvm_hbytesptr; /* Set up in function preamble */ LLVMValueRef stream; LLVMValueRef arena; @@ -36,6 +42,9 @@ struct HLLVMParserCompileContext_ { bool h_llvm_make_charset_membership_test(HLLVMParserCompileContext *ctxt, LLVMValueRef r, HCharset cs, LLVMBasicBlockRef yes, LLVMBasicBlockRef no); +void h_llvm_make_tt_bytes_fixed(HLLVMParserCompileContext *ctxt, + const 
uint8_t *bytes, size_t len, + LLVMValueRef *mr_out); void h_llvm_make_tt_suint(HLLVMParserCompileContext *ctxt, uint8_t length, uint8_t signedp, LLVMValueRef r, LLVMValueRef *mr_out); diff --git a/src/backends/llvm/llvm_bytes.c b/src/backends/llvm/llvm_bytes.c new file mode 100644 index 0000000000000000000000000000000000000000..9a63f6f935963f6eefe48dd744ed8f7e677623a4 --- /dev/null +++ b/src/backends/llvm/llvm_bytes.c @@ -0,0 +1,92 @@ +#ifdef HAMMER_LLVM_BACKEND + +#include <llvm-c/Analysis.h> +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +#include <llvm-c/Core.h> +#pragma GCC diagnostic pop +#include <llvm-c/ExecutionEngine.h> +#include "../../internal.h" +#include "llvm.h" + +/* + * Construct LLVM IR to allocate a token of type TT_BYTES with a compile-time + * constant value + * + * Parameters: + * - ctxt [in]: an HLLVMParserCompileContext + * - bytes [in]: an array of bytes; the pointer itself (not a copy of the data) is stored in the token as an IR constant, so the array must outlive the generated code + * - len [in]: number of bytes in the array + * - mr_out [out]: receives the value of the emitted make_result() call + */ + +void h_llvm_make_tt_bytes_fixed(HLLVMParserCompileContext *ctxt, + const uint8_t *bytes, size_t len, + LLVMValueRef *mr_out) { + /* Set up call to h_arena_malloc() for a new HParsedToken */ + LLVMValueRef tok_size = LLVMConstInt(LLVMInt32Type(), sizeof(HParsedToken), 0); + LLVMValueRef amalloc_args[] = { ctxt->arena, tok_size }; + /* Emits e.g. (the i32 operand is sizeof(HParsedToken)): %h_arena_malloc = call void* @h_arena_malloc(%struct.HArena_.1* %1, i32 48) */ + LLVMValueRef amalloc = LLVMBuildCall(ctxt->builder, + LLVMGetNamedFunction(ctxt->mod, "h_arena_malloc"), + amalloc_args, 2, "h_arena_malloc"); + /* %tok = bitcast void* %h_arena_malloc to %struct.HParsedToken_.2* */ + LLVMValueRef tok = LLVMBuildBitCast(ctxt->builder, amalloc, ctxt->llvm_parsedtokenptr, "tok"); + + /* + * tok->token_type = TT_BYTES; + */ + LLVMValueRef toktype = LLVMBuildStructGEP(ctxt->builder, tok, 0, "token_type"); + LLVMBuildStore(ctxt->builder, LLVMConstInt(LLVMInt32Type(), TT_BYTES, 0), toktype); + + /* + * XXX the way LLVM handles unions is 
batshit insane and forces IR writers + * to figure out which element of the union is largest just to declare the + * type, and then get all the alignments right - in effect, manually crufting + * up something compatible with their C compiler's ABI. This is not so much + * a portability bug as a portability bug queen with a bone-penetrating + * ovipositor for laying her eggs in one's brain. + * + * The sole saving grace here is that the limited number of platforms LLVM + * can JIT on make it conceivable I may get this right for the cases that come + * up in practice if not for the general case. If it breaks horribly, the + * slightly slower but safe option is to implement a function to set the + * relevant union fields from its arguments in C and build a call to it. + * + * The equivalent C that prompted this rant is quite depressingly simple: + * + * tok->bytes.token = bytes; + * tok->bytes.len = len; + */ + + LLVMValueRef hbytes_gep_tmp = + LLVMBuildStructGEP(ctxt->builder, tok, 1, "tok_union"); + LLVMValueRef hbytes_gep = LLVMBuildBitCast(ctxt->builder, hbytes_gep_tmp, + ctxt->llvm_hbytesptr, "hbytes"); + LLVMValueRef hbytes_token_gep = + LLVMBuildStructGEP(ctxt->builder, hbytes_gep, 0, "hbytes_token"); + /* + * We have to do this silly (uintptr_t) / LLVMConstIntToPtr() dance because + * LLVM doesn't seem to offer any way to construct a compile-time pointer + * constant other than NULL directly. 
+ */ + LLVMBuildStore(ctxt->builder, + LLVMConstIntToPtr(LLVMConstInt(ctxt->llvm_intptr_t, (uintptr_t)bytes, 0), + LLVMPointerType(LLVMInt8Type(), 0)), + hbytes_token_gep); + LLVMValueRef hbytes_len_gep = + LLVMBuildStructGEP(ctxt->builder, hbytes_gep, 1, "hbytes_len"); + LLVMBuildStore(ctxt->builder, LLVMConstInt(ctxt->llvm_size_t, len, 0), hbytes_len_gep); + + /* + * Now call make_result() + */ + LLVMValueRef result_args[] = { ctxt->arena, tok }; + LLVMValueRef mr = LLVMBuildCall(ctxt->builder, + LLVMGetNamedFunction(ctxt->mod, "make_result"), + result_args, 2, "make_result"); + + *mr_out = mr; +} + +#endif /* defined(HAMMER_LLVM_BACKEND) */ diff --git a/src/backends/llvm/llvm_charset.c b/src/backends/llvm/llvm_charset.c index 56e3e80c1d421f37d6bf32bd5a1ba20e393d6676..ae53f54f492170e513cc2e91079e80aaee8b2c52 100644 --- a/src/backends/llvm/llvm_charset.c +++ b/src/backends/llvm/llvm_charset.c @@ -857,6 +857,8 @@ static bool h_llvm_build_ir_for_bitmap(HLLVMParserCompileContext *ctxt, LLVMValueRef bitmap_initializer = LLVMConstArray(LLVMInt32Type(), bitmap_entries, 8); /* ...and we need a global variable to stick it in to GEP it */ LLVMValueRef bitmap = LLVMAddGlobal(ctxt->mod, LLVMTypeOf(bitmap_initializer), "bitmap"); + LLVMSetLinkage(bitmap, LLVMInternalLinkage); + LLVMSetGlobalConstant(bitmap, 1); LLVMSetInitializer(bitmap, bitmap_initializer); /* Compute the index into the bitmap */ diff --git a/src/backends/llvm/llvm_suint.c b/src/backends/llvm/llvm_suint.c index 571d6b00d39cbfc30f7b66d65c44e6554372ec0b..5f2ed487c8fdcdd80f4abe0ab46eb66c82604967 100644 --- a/src/backends/llvm/llvm_suint.c +++ b/src/backends/llvm/llvm_suint.c @@ -13,14 +13,11 @@ * Construct LLVM IR to allocate a token of type TT_SINT or TT_UINT * * Parameters: - * - mod [in]: an LLVMModuleRef - * - builder [in]: an LLVMBuilderRef, positioned appropriately - * - stream [in]: a value ref to an llvm_inputstreamptr, for the input stream - * - arena [in]: a value ref to an llvm_arenaptr to be used 
for the malloc + * - ctxt [in]: an HLLVMParserCompileContext + * - length [in]: length in bits + * - signedp [in]: TT_SINT if non-zero, TT_UINT otherwise * - r [in]: a value ref to the value to be used to this token * - mr_out [out]: the return value from make_result() - * - * TODO actually support TT_SINT, inputs other than 8 bit */ void h_llvm_make_tt_suint(HLLVMParserCompileContext *ctxt, diff --git a/src/parsers/end.c b/src/parsers/end.c index 85499d9348cd1df6503428a55d7a2ab878d1ef63..3643f6c7f109c93dd60d143c2c94c06adc06846f 100644 --- a/src/parsers/end.c +++ b/src/parsers/end.c @@ -1,3 +1,10 @@ +#ifdef HAMMER_LLVM_BACKEND +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +#include <llvm-c/Core.h> +#pragma GCC diagnostic pop +#include "../backends/llvm/llvm.h" +#endif #include "parser_internal.h" static HParseResult* parse_end(void *env, HParseState *state) { @@ -19,12 +26,92 @@ static bool end_ctrvm(HRVMProg *prog, void *env) { return true; } +#ifdef HAMMER_LLVM_BACKEND + +static bool end_llvm(HLLVMParserCompileContext *ctxt, void* env) { + if (!ctxt) return false; + + /* Set up some basic blocks */ + LLVMBasicBlockRef entry = LLVMAppendBasicBlock(ctxt->func, "end_entry"); + LLVMBasicBlockRef success = LLVMAppendBasicBlock(ctxt->func, "end_success"); + LLVMBasicBlockRef end = LLVMAppendBasicBlock(ctxt->func, "end_end"); + + /* Basic block: entry */ + LLVMBuildBr(ctxt->builder, entry); + LLVMPositionBuilderAtEnd(ctxt->builder, entry); + + /* + * This needs to test if we're at the end of the input stream by + * comparing the index and length fields; build a struct GEP to + * get at their values. 
+ */ + LLVMValueRef gep_indices[2]; + /* The struct itself */ + gep_indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0); + /* The index field (see HInputStream in internal.h) */ + gep_indices[1] = LLVMConstInt(LLVMInt32Type(), 2, 0); + /* GEP */ + LLVMValueRef index_ptr = LLVMBuildGEP(ctxt->builder, + ctxt->stream, gep_indices, 2, "index_ptr"); + /* The length field (see HInputStream in internal.h) */ + gep_indices[1] = LLVMConstInt(LLVMInt32Type(), 3, 0); + /* GEP */ + LLVMValueRef length_ptr = LLVMBuildGEP(ctxt->builder, + ctxt->stream, gep_indices, 2, "length_ptr"); + /* Now load them */ + LLVMValueRef index = LLVMBuildLoad(ctxt->builder, + index_ptr, "index"); + LLVMValueRef length = LLVMBuildLoad(ctxt->builder, + length_ptr, "length"); + /* Compare */ + LLVMValueRef icmp = LLVMBuildICmp(ctxt->builder, LLVMIntEQ, index, length, "index == length"); + /* Branch on comparison: success if equal, otherwise straight to end */ + LLVMBuildCondBr(ctxt->builder, icmp, success, end); + + /* Basic block: success */ + LLVMPositionBuilderAtEnd(ctxt->builder, success); + /* Set up a call to make_result() with a null token to get an HParseResult */ + LLVMValueRef make_result_args[] = { + ctxt->arena, + LLVMConstNull(ctxt->llvm_parsedtokenptr) + }; + LLVMValueRef result_ptr = LLVMBuildCall(ctxt->builder, + LLVMGetNamedFunction(ctxt->mod, "make_result"), + make_result_args, 2, "result_ptr"); + + /* Branch to end */ + LLVMBuildBr(ctxt->builder, end); + + /* Basic block: end */ + LLVMPositionBuilderAtEnd(ctxt->builder, end); + /* Set up a phi: the result when coming from success, null when coming from entry */ + LLVMValueRef rv = LLVMBuildPhi(ctxt->builder, ctxt->llvm_parseresultptr, "rv"); + LLVMBasicBlockRef rv_phi_incoming_blocks[] = { + success, + entry + }; + LLVMValueRef rv_phi_incoming_values[] = { + result_ptr, + LLVMConstNull(ctxt->llvm_parseresultptr) + }; + LLVMAddIncoming(rv, rv_phi_incoming_values, rv_phi_incoming_blocks, 2); + /* Return it */ + LLVMBuildRet(ctxt->builder, rv); + + return true; +} + +#endif /* defined(HAMMER_LLVM_BACKEND) */ + static 
const HParserVtable end_vt = { .parse = parse_end, .isValidRegular = h_true, .isValidCF = h_true, .desugar = desugar_end, .compile_to_rvm = end_ctrvm, +#ifdef HAMMER_LLVM_BACKEND + .llvm = end_llvm, +#endif .higher = false, }; diff --git a/src/parsers/epsilon.c b/src/parsers/epsilon.c index bb6e8beb31cca3ff09a565171b4e554e07f2ffad..4c5abc406a390b3fce592a6b041bca5518ebb0ae 100644 --- a/src/parsers/epsilon.c +++ b/src/parsers/epsilon.c @@ -1,3 +1,10 @@ +#ifdef HAMMER_LLVM_BACKEND +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +#include <llvm-c/Core.h> +#pragma GCC diagnostic pop +#include "../backends/llvm/llvm.h" +#endif #include "parser_internal.h" static HParseResult* parse_epsilon(void* env, HParseState* state) { @@ -12,12 +19,45 @@ static bool epsilon_ctrvm(HRVMProg *prog, void* env) { return true; } +#ifdef HAMMER_LLVM_BACKEND + +static bool epsilon_llvm(HLLVMParserCompileContext *ctxt, void* env) { + if (!ctxt) return false; + + LLVMBasicBlockRef epsilon_bb = LLVMAppendBasicBlock(ctxt->func, "epsilon"); + + /* Basic block: epsilon */ + LLVMBuildBr(ctxt->builder, epsilon_bb); + LLVMPositionBuilderAtEnd(ctxt->builder, epsilon_bb); + + /* + * For epsilon we make a null-token parse result like with end, but we + * do it unconditionally. 
+ */ + LLVMValueRef make_result_args[] = { + ctxt->arena, + LLVMConstNull(ctxt->llvm_parsedtokenptr) + }; + LLVMValueRef result_ptr = LLVMBuildCall(ctxt->builder, + LLVMGetNamedFunction(ctxt->mod, "make_result"), + make_result_args, 2, "result_ptr"); + /* Return it */ + LLVMBuildRet(ctxt->builder, result_ptr); + + return true; +} + +#endif + static const HParserVtable epsilon_vt = { .parse = parse_epsilon, .isValidRegular = h_true, .isValidCF = h_true, .desugar = desugar_epsilon, .compile_to_rvm = epsilon_ctrvm, +#ifdef HAMMER_LLVM_BACKEND + .llvm = epsilon_llvm, +#endif .higher = false, }; diff --git a/src/parsers/nothing.c b/src/parsers/nothing.c index 0a60108bcc2c0fe69a656fb1cfb4f067ff290922..d95f3f2423d040673fedc2481bffbc010ddfc350 100644 --- a/src/parsers/nothing.c +++ b/src/parsers/nothing.c @@ -1,3 +1,10 @@ +#ifdef HAMMER_LLVM_BACKEND +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +#include <llvm-c/Core.h> +#pragma GCC diagnostic pop +#include "../backends/llvm/llvm.h" +#endif #include "parser_internal.h" static HParseResult* parse_nothing() { @@ -16,12 +23,32 @@ static bool nothing_ctrvm(HRVMProg *prog, void* env) { return true; } +#ifdef HAMMER_LLVM_BACKEND + +static bool nothing_llvm(HLLVMParserCompileContext *ctxt, void* env) { + if (!ctxt) return false; + + /* This one just always returns NULL */ + LLVMBasicBlockRef entry = LLVMAppendBasicBlock(ctxt->func, "nothing_entry"); + LLVMBuildBr(ctxt->builder, entry); + LLVMPositionBuilderAtEnd(ctxt->builder, entry); + + LLVMBuildRet(ctxt->builder, LLVMConstNull(ctxt->llvm_parseresultptr)); + + return true; +} + +#endif /* defined(HAMMER_LLVM_BACKEND) */ + static const HParserVtable nothing_vt = { .parse = parse_nothing, .isValidRegular = h_true, .isValidCF = h_true, .desugar = desugar_nothing, .compile_to_rvm = nothing_ctrvm, +#ifdef HAMMER_LLVM_BACKEND + .llvm = nothing_llvm, +#endif .higher = false, }; diff --git a/src/parsers/token.c b/src/parsers/token.c index 
19029726ad11a52fa0eadf62b67a7b15cd2e4744..3899abe8d5abf0e45ba9ac72b544095f1019bfa2 100644 --- a/src/parsers/token.c +++ b/src/parsers/token.c @@ -1,4 +1,11 @@ #include <assert.h> +#ifdef HAMMER_LLVM_BACKEND +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +#include <llvm-c/Core.h> +#pragma GCC diagnostic pop +#include "../backends/llvm/llvm.h" +#endif #include "parser_internal.h" typedef struct { @@ -19,7 +26,6 @@ static HParseResult* parse_token(void *env, HParseState *state) { return make_result(state->arena, tok); } - static HParsedToken *reshape_token(const HParseResult *p, void* user_data) { // fetch sequence of uints from p assert(p->ast); @@ -67,12 +73,247 @@ static bool token_ctrvm(HRVMProg *prog, void *env) { return true; } +#ifdef HAMMER_LLVM_BACKEND + +/* + * Emit LLVM IR to recognize a token by comparing it to a string stored in + * the LLVM module globals. We use this for longer tokens. + */ + +static bool token_llvm_with_global(HLLVMParserCompileContext *ctxt, HToken *t) { + LLVMValueRef bits_args[3], bits, i, i_init, i_incr, str, str_const, len; + LLVMValueRef r, c, mr, icmp_i_len, icmp_c_r, rv; + LLVMValueRef c_gep_indices[2], c_gep; + LLVMBasicBlockRef entry, loop_start, loop_middle, loop_incr, success, end; + + /* Set up basic blocks: entry, success and exit branches */ + entry = LLVMAppendBasicBlock(ctxt->func, "tok_seq_entry"); + loop_start = LLVMAppendBasicBlock(ctxt->func, "tok_seq_loop_start"); + loop_middle = LLVMAppendBasicBlock(ctxt->func, "tok_seq_loop_middle"); + loop_incr = LLVMAppendBasicBlock(ctxt->func, "tok_seq_loop_incr"); + success = LLVMAppendBasicBlock(ctxt->func, "tok_seq_success"); + end = LLVMAppendBasicBlock(ctxt->func, "tok_seq_end"); + + /* Branch to entry block */ + LLVMBuildBr(ctxt->builder, entry); + LLVMPositionBuilderAtEnd(ctxt->builder, entry); + + /* + * Get our string into the globals as a constant; skip the null termination + * and save a byte since we can compare to length in the loop. 
+ */ + str_const = LLVMConstString((const char *)(t->str), t->len, 1); + str = LLVMAddGlobal(ctxt->mod, LLVMArrayType(LLVMInt8Type(), t->len), "tok_str"); + LLVMSetLinkage(str, LLVMInternalLinkage); + LLVMSetGlobalConstant(str, 1); + LLVMSetInitializer(str, str_const); + + /* Have the token length available */ + len = LLVMConstInt(ctxt->llvm_size_t, t->len, 0); + + /* Arguments for the h_read_bits() call emitted in the loop body */ + bits_args[0] = ctxt->stream; + bits_args[1] = LLVMConstInt(LLVMInt32Type(), 8, 0); + bits_args[2] = LLVMConstInt(LLVMInt8Type(), 0, 0); + + /* Start the loop */ + LLVMBuildBr(ctxt->builder, loop_start); + LLVMPositionBuilderAtEnd(ctxt->builder, loop_start); + + /* Keep an index counter */ + i = LLVMBuildPhi(ctxt->builder, ctxt->llvm_size_t, "i"); + i_init = LLVMConstInt(ctxt->llvm_size_t, 0, 0); + /* + * This is the incoming value from entry; a second incoming value is added + * once the incremented i exists at the end of the loop + */ + LLVMAddIncoming(i, &i_init, &entry, 1); + + /* + * Compare i to token string length (i.e., have we hit the end of the + * token?); if i < len, continue into loop_middle, otherwise branch to success. + */ + icmp_i_len = LLVMBuildICmp(ctxt->builder, LLVMIntULT, i, len, "i < len"); + LLVMBuildCondBr(ctxt->builder, icmp_i_len, loop_middle, success); + + /* Basic block loop_middle */ + LLVMPositionBuilderAtEnd(ctxt->builder, loop_middle); + + /* Get a char from the token string */ + c_gep_indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0); + c_gep_indices[1] = i; + c_gep = LLVMBuildInBoundsGEP(ctxt->builder, str, c_gep_indices, 2, "c_p"); + c = LLVMBuildLoad(ctxt->builder, c_gep, "c"); + + /* Read one char from input */ + bits = LLVMBuildCall(ctxt->builder, + LLVMGetNamedFunction(ctxt->mod, "h_read_bits"), bits_args, 3, "read_bits"); + /* Truncate to i8 */ + r = LLVMBuildTrunc(ctxt->builder, bits, LLVMInt8Type(), ""); + + /* + * Compare c and r; if !=, token mismatches, break out of loop and + * fail. If ==, increment counter and go to next iteration. 
+ */ + icmp_c_r = LLVMBuildICmp(ctxt->builder, LLVMIntEQ, c, r, "c == r"); + LLVMBuildCondBr(ctxt->builder, icmp_c_r, loop_incr, end); + + /* Basic block loop_incr */ + LLVMPositionBuilderAtEnd(ctxt->builder, loop_incr); + /* End of loop, 2nd LLVMAddIncoming() for i */ + i_incr = LLVMBuildAdd(ctxt->builder, i, + LLVMConstInt(ctxt->llvm_size_t, 1, 0), "i_incr"); + LLVMAddIncoming(i, &i_incr, &loop_incr, 1); + + /* Next iteration */ + LLVMBuildBr(ctxt->builder, loop_start); + + /* Basic block: success */ + LLVMPositionBuilderAtEnd(ctxt->builder, success); + h_llvm_make_tt_bytes_fixed(ctxt, t->str, t->len, &mr); + LLVMBuildBr(ctxt->builder, end); + + /* Basic block: end */ + LLVMPositionBuilderAtEnd(ctxt->builder, end); + /* phi the token or a null depending on where we came from */ + rv = LLVMBuildPhi(ctxt->builder, ctxt->llvm_parseresultptr, "rv"); + LLVMBasicBlockRef rv_phi_incoming_blocks[] = { + success, + loop_middle + }; + LLVMValueRef rv_phi_incoming_values[] = { + mr, + LLVMConstNull(ctxt->llvm_parseresultptr) + }; + LLVMAddIncoming(rv, rv_phi_incoming_values, rv_phi_incoming_blocks, 2); + /* Return it */ + LLVMBuildRet(ctxt->builder, rv); + + return true; +} + +/* + * Emit LLVM IR to recognize a token by sequentially checking each character; + * suitable for short tokens. This also handles the zero-length token case. 
+ */ + +static bool token_llvm_with_sequential_comparisons(HLLVMParserCompileContext *ctxt, HToken *t) { + HAllocator *mm__; + LLVMValueRef bits, r, c, icmp, mr, rv; + LLVMValueRef bits_args[3]; + LLVMBasicBlockRef entry, success, end, next_char; + char name[64]; + int i; + + /* Get allocator ready */ + mm__ = ctxt->mm__; + + /* Set up basic blocks: entry, success and exit branches */ + entry = LLVMAppendBasicBlock(ctxt->func, "tok_seq_entry"); + success = LLVMAppendBasicBlock(ctxt->func, "tok_seq_success"); + end = LLVMAppendBasicBlock(ctxt->func, "tok_seq_end"); + + /* Branch to entry block */ + LLVMBuildBr(ctxt->builder, entry); + LLVMPositionBuilderAtEnd(ctxt->builder, entry); + + /* Basic block refs for the phi later */ + LLVMBasicBlockRef *bbs_into_phi = h_new(LLVMBasicBlockRef, 1 + t->len); + LLVMValueRef *values_into_phi = h_new(LLVMValueRef, 1 + t->len); + + /* For each char of token... */ + bits_args[0] = ctxt->stream; + bits_args[1] = LLVMConstInt(LLVMInt32Type(), 8, 0); + bits_args[2] = LLVMConstInt(LLVMInt8Type(), 0, 0); + /* Track the current basic block */ + LLVMBasicBlockRef curr_char = entry; + for (i = 0; i < t->len; ++i) { + /* Read a char */ + bits = LLVMBuildCall(ctxt->builder, + LLVMGetNamedFunction(ctxt->mod, "h_read_bits"), bits_args, 3, "read_bits"); + /* Clamp to i8 */ + r = LLVMBuildTrunc(ctxt->builder, bits, LLVMInt8Type(), ""); + /* Comparison */ + c = LLVMConstInt(LLVMInt8Type(), t->str[i], 0); + snprintf(name, 64, "t->str[%d] == r", i); + icmp = LLVMBuildICmp(ctxt->builder, LLVMIntEQ, c, r, name); + /* Next basic block */ + snprintf(name, 64, "tok_matched_%d", i); + next_char = LLVMAppendBasicBlock(ctxt->func, name); + /* Conditional branch */ + LLVMBuildCondBr(ctxt->builder, icmp, next_char, end); + /* Fill in our row in the phi tables */ + bbs_into_phi[1 + i] = curr_char; + values_into_phi[1 + i] = LLVMConstNull(ctxt->llvm_parseresultptr); + /* Start from next_char */ + LLVMPositionBuilderAtEnd(ctxt->builder, next_char); + /* Update 
the current basic block */ + curr_char = next_char; + } + + /* If we got here, accept the token */ + LLVMBuildBr(ctxt->builder, success); + + /* Success block: make a token */ + LLVMPositionBuilderAtEnd(ctxt->builder, success); + h_llvm_make_tt_bytes_fixed(ctxt, t->str, t->len, &mr); + /* Fill in our row in the phi tables */ + bbs_into_phi[0] = success; + values_into_phi[0] = mr; + /* Branch to end so we can return the token */ + LLVMBuildBr(ctxt->builder, end); + + /* End block: return a token if we made one */ + LLVMPositionBuilderAtEnd(ctxt->builder, end); + /* phi the token or a null depending on where we came from */ + rv = LLVMBuildPhi(ctxt->builder, ctxt->llvm_parseresultptr, "rv"); + LLVMAddIncoming(rv, values_into_phi, bbs_into_phi, 1 + t->len); + /* Free the stuff we allocated to build the phi */ + h_free(bbs_into_phi); + h_free(values_into_phi); + /* Return it */ + LLVMBuildRet(ctxt->builder, rv); + + return true; +} + +#define TOKEN_LENGTH_USE_GLOBAL_CUTOFF 4 + +static bool token_llvm(HLLVMParserCompileContext *ctxt, void* env) { + HToken *t; + if (!ctxt) return false; + + /* Get the token */ + t = (HToken *)env; + /* + * Check its length; we have two possible code-generation strategies + * here: treat it like chars sequentially and emit a series of read/ + * tests, or put the string in the LLVM module globals and compare + * in a loop. Use the former for very short strings and the latter + * for longer ones. + * + * XXX Like with charsets, we should also think about memoizing these + * for recurring strings. 
+ */ + if (t->len > TOKEN_LENGTH_USE_GLOBAL_CUTOFF && + t->len > 0) { + return token_llvm_with_global(ctxt, t); + } else { + return token_llvm_with_sequential_comparisons(ctxt, t); + } +} + +#endif /* defined(HAMMER_LLVM_BACKEND) */ + const HParserVtable token_vt = { .parse = parse_token, .isValidRegular = h_true, .isValidCF = h_true, .desugar = desugar_token, .compile_to_rvm = token_ctrvm, +#ifdef HAMMER_LLVM_BACKEND + .llvm = token_llvm, +#endif .higher = false, }; diff --git a/src/t_parser.c b/src/t_parser.c index 304362e49c61775cdef9edb8c459365695e5af5e..e4c0c48c636108ab8038a78bde8a4e375982031f 100644 --- a/src/t_parser.c +++ b/src/t_parser.c @@ -8,9 +8,14 @@ static void test_token(gconstpointer backend) { const HParser *token_ = h_token((const uint8_t*)"95\xa2", 3); + /* This one is above the loop-unrolling cutoff for the LLVM backend */ + const HParser *token_long = h_token((const uint8_t *)"xyzzy", 5); g_check_parse_match(token_, (HParserBackend)GPOINTER_TO_INT(backend), "95\xa2", 3, "<39.35.a2>"); g_check_parse_failed(token_, (HParserBackend)GPOINTER_TO_INT(backend), "95", 2); + g_check_parse_match(token_long, (HParserBackend)GPOINTER_TO_INT(backend), "xyzzy", 5, "<78.79.7a.7a.79>"); + g_check_parse_failed(token_long, (HParserBackend)GPOINTER_TO_INT(backend), "xyz", 3); + g_check_parse_failed(token_long, (HParserBackend)GPOINTER_TO_INT(backend), "xyzzx", 5); } static void test_ch(gconstpointer backend) { @@ -25,6 +30,9 @@ static void test_ch_range(gconstpointer backend) { const HParser *range_2 = h_ch_range('a', 'z'); const HParser *range_3 = h_ch_range('A', 'z'); const HParser *range_all = h_ch_range(0, 255); + const HParser *range_left = h_ch_range(0, 64); + const HParser *range_right = h_ch_range(224, 255); + unsigned char tmp[2]; g_check_parse_match(range_1, (HParserBackend)GPOINTER_TO_INT(backend), "b", 1, "u0x62"); g_check_parse_failed(range_1, (HParserBackend)GPOINTER_TO_INT(backend), "d", 1); @@ -34,6 +42,15 @@ static void 
test_ch_range(gconstpointer backend) { g_check_parse_failed(range_3, (HParserBackend)GPOINTER_TO_INT(backend), "2", 1); /* range_all never fails anything */ g_check_parse_match(range_all, (HParserBackend)GPOINTER_TO_INT(backend), "B", 1, "u0x42"); + tmp[1] = '\0'; + tmp[0] = 32; + g_check_parse_match(range_left, (HParserBackend)GPOINTER_TO_INT(backend), tmp, 1, "u0x20"); + tmp[0] = 128; + g_check_parse_failed(range_left, (HParserBackend)GPOINTER_TO_INT(backend), tmp, 1); + tmp[0] = 240; + g_check_parse_match(range_right, (HParserBackend)GPOINTER_TO_INT(backend), tmp, 1, "u0xf0"); + tmp[0] = 128; + g_check_parse_failed(range_right, (HParserBackend)GPOINTER_TO_INT(backend), tmp, 1); } //@MARK_START @@ -1025,6 +1042,7 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/glr/token_position", GINT_TO_POINTER(PB_GLR), test_token_position); #ifdef HAMMER_LLVM_BACKEND + g_test_add_data_func("/core/parser/llvm/token", GINT_TO_POINTER(PB_LLVM), test_token); g_test_add_data_func("/core/parser/llvm/ch", GINT_TO_POINTER(PB_LLVM), test_ch); g_test_add_data_func("/core/parser/llvm/ch_range", GINT_TO_POINTER(PB_LLVM), test_ch_range); g_test_add_data_func("/core/parser/llvm/int64", GINT_TO_POINTER(PB_LLVM), test_int64); @@ -1037,5 +1055,6 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/llvm/uint8", GINT_TO_POINTER(PB_LLVM), test_uint8); g_test_add_data_func("/core/parser/llvm/in", GINT_TO_POINTER(PB_LLVM), test_in); g_test_add_data_func("/core/parser/llvm/not_in", GINT_TO_POINTER(PB_LLVM), test_not_in); + g_test_add_data_func("/core/parser/llvm/nothing_p", GINT_TO_POINTER(PB_LLVM), test_nothing_p); #endif /* defined(HAMMER_LLVM_BACKEND) */ }