diff --git a/src/SConscript b/src/SConscript index 9b89730d9b7015dad78e40f790a414cdab00fb44..6ec943c32352bf3eb654599af8ae6503eb50190f 100644 --- a/src/SConscript +++ b/src/SConscript @@ -70,7 +70,7 @@ backends = ['backends/%s.c' % s for s in # Add LLVM backend if enabled if GetOption("use_llvm"): - llvm_backend_files = ['llvm.c', 'llvm_charset.c', 'llvm_suint.c'] + llvm_backend_files = ['llvm.c', 'llvm_bytes.c', 'llvm_charset.c', 'llvm_suint.c'] backends = backends + ['backends/llvm/%s' % s for s in llvm_backend_files] misc_hammer_parts = [ diff --git a/src/backends/llvm/llvm.c b/src/backends/llvm/llvm.c index 2c6602b00557145f393900ad856462e408f816af..1e673b67f3954360ede0f8aae1fbd205fd97f1f1 100644 --- a/src/backends/llvm/llvm.c +++ b/src/backends/llvm/llvm.c @@ -26,12 +26,19 @@ HParseResult* make_result(HArena *arena, HParsedToken *tok) { } void h_llvm_declare_common(HLLVMParserCompileContext *ctxt) { -#if SIZE_MAX == 0xffffffffffffffff +#if SIZE_MAX == UINT64_MAX ctxt->llvm_size_t = LLVMInt64Type(); -#elif SIZE_MAX == 0xffffffff +#elif SIZE_MAX == UINT32_MAX ctxt->llvm_size_t = LLVMInt32Type(); #else #error "SIZE_MAX is not consistent with either 64 or 32-bit platform, couldn't guess LLVM type for size_t" +#endif +#if UINTPTR_MAX == UINT64_MAX + ctxt->llvm_intptr_t = LLVMInt64Type(); +#elif UINTPTR_MAX == UINT32_MAX + ctxt->llvm_intptr_t = LLVMInt32Type(); +#else +#error "UINTPTR_MAX is not consistent with either 64 or 32-bit platform, couldn't guess LLVM type for intptr" #endif ctxt->llvm_inputstream = LLVMStructCreateNamed(LLVMGetGlobalContext(), "struct.HInputStream_"); LLVMTypeRef llvm_inputstream_struct_types[] = { diff --git a/src/backends/llvm/llvm.h b/src/backends/llvm/llvm.h index 49d681c449367da7a8b7c467e93a2cc182088723..aab534a56573ee7501f1b7f00090d0d7b7852c99 100644 --- a/src/backends/llvm/llvm.h +++ b/src/backends/llvm/llvm.h @@ -20,7 +20,10 @@ struct HLLVMParserCompileContext_ { LLVMValueRef func; LLVMBuilderRef builder; /* Typerefs */ + /* We 
determine typerefs for some standard C types we'll need later up front */ LLVMTypeRef llvm_size_t; + LLVMTypeRef llvm_intptr_t; + /* LLVM types for Hammer structs and pointers */ LLVMTypeRef llvm_inputstream; LLVMTypeRef llvm_inputstreamptr; LLVMTypeRef llvm_arena; @@ -39,6 +42,9 @@ struct HLLVMParserCompileContext_ { bool h_llvm_make_charset_membership_test(HLLVMParserCompileContext *ctxt, LLVMValueRef r, HCharset cs, LLVMBasicBlockRef yes, LLVMBasicBlockRef no); +void h_llvm_make_tt_bytes_fixed(HLLVMParserCompileContext *ctxt, + const uint8_t *bytes, size_t len, + LLVMValueRef *mr_out); void h_llvm_make_tt_suint(HLLVMParserCompileContext *ctxt, uint8_t length, uint8_t signedp, LLVMValueRef r, LLVMValueRef *mr_out); diff --git a/src/backends/llvm/llvm_bytes.c b/src/backends/llvm/llvm_bytes.c new file mode 100644 index 0000000000000000000000000000000000000000..9a63f6f935963f6eefe48dd744ed8f7e677623a4 --- /dev/null +++ b/src/backends/llvm/llvm_bytes.c @@ -0,0 +1,92 @@ +#ifdef HAMMER_LLVM_BACKEND + +#include <llvm-c/Analysis.h> +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +#include <llvm-c/Core.h> +#pragma GCC diagnostic pop +#include <llvm-c/ExecutionEngine.h> +#include "../../internal.h" +#include "llvm.h" + +/* + * Construct LLVM IR to allocate a token of type TT_BYTES with a compile-time + * constant value + * + * Parameters: + * - ctxt [in]: an HLLVMParserCompileContext; all IR is emitted through ctxt->builder + * - bytes [in]: an array of bytes; its address (not a copy) is embedded in the IR as a constant, so it presumably must stay valid while the generated code can run + * - len [in]: number of bytes in the array, stored in the token as a constant + * - mr_out [out]: receives the LLVM value of the emitted make_result() call + */ + +void h_llvm_make_tt_bytes_fixed(HLLVMParserCompileContext *ctxt, + const uint8_t *bytes, size_t len, + LLVMValueRef *mr_out) { + /* Set up call to h_arena_malloc() for a new HParsedToken */ + LLVMValueRef tok_size = LLVMConstInt(LLVMInt32Type(), sizeof(HParsedToken), 0); + LLVMValueRef amalloc_args[] = { ctxt->arena, tok_size }; + /* %h_arena_malloc = call void* @h_arena_malloc(%struct.HArena_.1* %1, i32 48) */ + LLVMValueRef amalloc = 
LLVMBuildCall(ctxt->builder, + LLVMGetNamedFunction(ctxt->mod, "h_arena_malloc"), + amalloc_args, 2, "h_arena_malloc"); + /* %tok = bitcast void* %h_arena_malloc to %struct.HParsedToken_.2* */ + LLVMValueRef tok = LLVMBuildBitCast(ctxt->builder, amalloc, ctxt->llvm_parsedtokenptr, "tok"); + + /* + * tok->token_type = TT_BYTES; (struct field 0 of the token) + */ + LLVMValueRef toktype = LLVMBuildStructGEP(ctxt->builder, tok, 0, "token_type"); + LLVMBuildStore(ctxt->builder, LLVMConstInt(LLVMInt32Type(), TT_BYTES, 0), toktype); + + /* + * XXX the way LLVM handles unions is batshit insane and forces IR writers + * to figure out which element of the union is largest just to declare the + * type, and then get all the alignments right - in effect, manually crufting + * up something compatible with their C compiler's ABI. This is not so much + * a portability bug as a portability bug queen with a bone-penetrating + * ovipositor for laying her eggs in one's brain. + * + * The sole saving grace here is that the limited number of platforms LLVM + * can JIT on makes it conceivable I may get this right for the cases that come + * up in practice if not for the general case. If it breaks horribly, the + * slightly slower but safe option is to implement a function to set the + * relevant union fields from its arguments in C and build a call to it. + * + * The equivalent C that prompted this rant is quite depressingly simple: + * + * tok->bytes.token = bytes; + * tok->bytes.len = len; + */ + + LLVMValueRef hbytes_gep_tmp = + LLVMBuildStructGEP(ctxt->builder, tok, 1, "tok_union"); + LLVMValueRef hbytes_gep = LLVMBuildBitCast(ctxt->builder, hbytes_gep_tmp, + ctxt->llvm_hbytesptr, "hbytes"); + LLVMValueRef hbytes_token_gep = + LLVMBuildStructGEP(ctxt->builder, hbytes_gep, 0, "hbytes_token"); + /* + * We have to do this silly (uintptr_t) / LLVMConstIntToPtr() dance because + * LLVM doesn't seem to offer any way to construct a compile-time pointer + * constant other than NULL directly. 
+ */ + LLVMBuildStore(ctxt->builder, + LLVMConstIntToPtr(LLVMConstInt(ctxt->llvm_intptr_t, (uintptr_t)bytes, 0), + LLVMPointerType(LLVMInt8Type(), 0)), + hbytes_token_gep); + LLVMValueRef hbytes_len_gep = + LLVMBuildStructGEP(ctxt->builder, hbytes_gep, 1, "hbytes_len"); + LLVMBuildStore(ctxt->builder, LLVMConstInt(ctxt->llvm_size_t, len, 0), hbytes_len_gep); + + /* + * Now call make_result() on the arena and the freshly filled-in token + */ + LLVMValueRef result_args[] = { ctxt->arena, tok }; + LLVMValueRef mr = LLVMBuildCall(ctxt->builder, + LLVMGetNamedFunction(ctxt->mod, "make_result"), + result_args, 2, "make_result"); + + *mr_out = mr; +} + +#endif /* defined(HAMMER_LLVM_BACKEND) */ diff --git a/src/parsers/token.c b/src/parsers/token.c index 19029726ad11a52fa0eadf62b67a7b15cd2e4744..6f016064d448227829caa8b8dedd8c5a66037873 100644 --- a/src/parsers/token.c +++ b/src/parsers/token.c @@ -1,4 +1,11 @@ #include <assert.h> +#ifdef HAMMER_LLVM_BACKEND +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +#include <llvm-c/Core.h> +#pragma GCC diagnostic pop +#include "../backends/llvm/llvm.h" +#endif #include "parser_internal.h" typedef struct { @@ -19,7 +26,6 @@ static HParseResult* parse_token(void *env, HParseState *state) { return make_result(state->arena, tok); } - static HParsedToken *reshape_token(const HParseResult *p, void* user_data) { // fetch sequence of uints from p assert(p->ast); @@ -67,12 +73,140 @@ static bool token_ctrvm(HRVMProg *prog, void *env) { return true; } +#ifdef HAMMER_LLVM_BACKEND + +/* + * Emit LLVM IR to recognize a token by comparing it to a string stored in + * the LLVM module globals. We use this for longer tokens. NOTE(review): not implemented yet - see the TODO in the body. + */ + +static bool token_llvm_with_global(HLLVMParserCompileContext *ctxt, HToken *t) { + /* TODO: emit the comparison IR; for now nothing is emitted and success is still reported */ + return true; +} + +/* + * Emit LLVM IR to recognize a token by sequentially checking each character; + * suitable for short tokens. This also handles the zero-length token case: the loop body never runs and control branches straight to the success block. 
+ */ + +static bool token_llvm_with_sequential_comparisons(HLLVMParserCompileContext *ctxt, HToken *t) { + HAllocator *mm__; + LLVMValueRef bits, r, c, icmp, mr, rv; + LLVMValueRef bits_args[3]; + LLVMBasicBlockRef entry, success, end, next_char; + char name[64]; + int i; + + /* Get allocator ready; h_new()/h_free() below presumably pick up mm__ by name */ + mm__ = ctxt->mm__; + + /* Set up basic blocks: entry, success and exit branches */ + entry = LLVMAppendBasicBlock(ctxt->func, "tok_seq_entry"); + success = LLVMAppendBasicBlock(ctxt->func, "tok_seq_success"); + end = LLVMAppendBasicBlock(ctxt->func, "tok_seq_end"); + + /* Branch to entry block */ + LLVMBuildBr(ctxt->builder, entry); + LLVMPositionBuilderAtEnd(ctxt->builder, entry); + + /* Tables for the phi built later: slot 0 is the success edge, slots 1..len are the per-char mismatch edges */ + LLVMBasicBlockRef *bbs_into_phi = h_new(LLVMBasicBlockRef, 1 + t->len); + LLVMValueRef *values_into_phi = h_new(LLVMValueRef, 1 + t->len); + + /* For each char of token: every h_read_bits() call shares these args (stream, a count of 8, a constant 0) */ + bits_args[0] = ctxt->stream; + bits_args[1] = LLVMConstInt(LLVMInt32Type(), 8, 0); + bits_args[2] = LLVMConstInt(LLVMInt8Type(), 0, 0); + /* Track the current basic block */ + LLVMBasicBlockRef curr_char = entry; + for (i = 0; i < t->len; ++i) { + /* Read a char */ + bits = LLVMBuildCall(ctxt->builder, + LLVMGetNamedFunction(ctxt->mod, "h_read_bits"), bits_args, 3, "read_bits"); + /* Truncate the value read to i8 */ + r = LLVMBuildTrunc(ctxt->builder, bits, LLVMInt8Type(), ""); + /* Comparison against the expected byte t->str[i] */ + c = LLVMConstInt(LLVMInt8Type(), t->str[i], 0); + snprintf(name, 64, "t->str[%d] == r", i); + icmp = LLVMBuildICmp(ctxt->builder, LLVMIntEQ, c, r, name); + /* Next basic block */ + snprintf(name, 64, "tok_matched_%d", i); + next_char = LLVMAppendBasicBlock(ctxt->func, name); + /* Conditional branch: on a match continue in next_char, otherwise jump to end */ + LLVMBuildCondBr(ctxt->builder, icmp, next_char, end); + /* Fill in our row in the phi tables: a mismatch leaving curr_char yields a null result */ + bbs_into_phi[1 + i] = curr_char; + values_into_phi[1 + i] = LLVMConstNull(ctxt->llvm_parseresultptr); + /* Start from next_char */ + LLVMPositionBuilderAtEnd(ctxt->builder, next_char); + /* Update 
the current basic block */ + curr_char = next_char; + } + + /* Every char matched (or the token is empty): branch to the success block */ + LLVMBuildBr(ctxt->builder, success); + + /* Success block: make a token */ + LLVMPositionBuilderAtEnd(ctxt->builder, success); + h_llvm_make_tt_bytes_fixed(ctxt, t->str, t->len, &mr); + /* Fill in slot 0 of the phi tables with the success edge and its result */ + bbs_into_phi[0] = success; + values_into_phi[0] = mr; + /* Branch to end so we can return the token */ + LLVMBuildBr(ctxt->builder, end); + + /* End block: return a token if we made one */ + LLVMPositionBuilderAtEnd(ctxt->builder, end); + /* phi the token or a null depending on where we came from */ + rv = LLVMBuildPhi(ctxt->builder, ctxt->llvm_parseresultptr, "rv"); + LLVMAddIncoming(rv, values_into_phi, bbs_into_phi, 1 + t->len); + /* Free the stuff we allocated to build the phi */ + h_free(bbs_into_phi); + h_free(values_into_phi); + /* Emit a ret of the value selected by the phi */ + LLVMBuildRet(ctxt->builder, rv); + + return true; +} + +#define TOKEN_LENGTH_USE_GLOBAL_CUTOFF 4 + +static bool token_llvm(HLLVMParserCompileContext *ctxt, void* env) { + HToken *t; + if (!ctxt) return false; + + /* Get the token */ + t = (HToken *)env; + /* + * Check its length; we have two possible code-generation strategies + * here: treat it like chars sequentially and emit a series of read/ + * tests, or put the string in the LLVM module globals and compare + * in a loop. Use the former for very short strings and the latter + * for longer ones (length above TOKEN_LENGTH_USE_GLOBAL_CUTOFF). + * + * XXX Like with charsets, we should also think about memoizing these + * for recurring strings. NOTE(review): the long-token path is still a stub that emits no IR, and the t->len > 0 test below is redundant with a cutoff of 4. 
+ */ + if (t->len > TOKEN_LENGTH_USE_GLOBAL_CUTOFF && + t->len > 0) { + return token_llvm_with_global(ctxt, t); + } else { + return token_llvm_with_sequential_comparisons(ctxt, t); + } +} + +#endif /* defined(HAMMER_LLVM_BACKEND) */ + const HParserVtable token_vt = { .parse = parse_token, .isValidRegular = h_true, .isValidCF = h_true, .desugar = desugar_token, .compile_to_rvm = token_ctrvm, +#ifdef HAMMER_LLVM_BACKEND + .llvm = token_llvm, +#endif .higher = false, }; diff --git a/src/t_parser.c b/src/t_parser.c index 30e9fc779f3091a5a621c2c0151d33fe1b860501..69e9dd6aba29c0cb449eefc087b91138dbfa1299 100644 --- a/src/t_parser.c +++ b/src/t_parser.c @@ -1037,6 +1037,7 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/glr/token_position", GINT_TO_POINTER(PB_GLR), test_token_position); #ifdef HAMMER_LLVM_BACKEND + g_test_add_data_func("/core/parser/llvm/token", GINT_TO_POINTER(PB_LLVM), test_token); g_test_add_data_func("/core/parser/llvm/ch", GINT_TO_POINTER(PB_LLVM), test_ch); g_test_add_data_func("/core/parser/llvm/ch_range", GINT_TO_POINTER(PB_LLVM), test_ch_range); g_test_add_data_func("/core/parser/llvm/int64", GINT_TO_POINTER(PB_LLVM), test_int64);