From c6280a98bc90e3fe175e4941489e56836d404c6c Mon Sep 17 00:00:00 2001 From: Alex Willmer <alex@moreati.org.uk> Date: Fri, 4 Dec 2015 10:20:25 +0000 Subject: [PATCH] Added Python versions of base64 examples These are transliterations of the existing C files. They're not particularly Pythonic or performant, but they're a start. Example of usage ``` $ echo ' YW55IGNhcm5hbCBwbGVhcw==' | PYTHONPATH=../build/opt/src/bindings/python/ python base64.py inputsize=27 input= YW55IGNhcm5hbCBwbGVhcw== ((((89L, 87L, 53L, 53L), (73L, 71L, 78L, 104L), (99L, 109L, 53L, 104L), (98L, 67L, 66L, 119L), (98L, 71L, 86L, 104L)), (99L, 'w', '=', '=')),) $ echo ' YW55IGNhcm5hbCBwbGVhcw==' | PYTHONPATH=../build/opt/src/bindings/python/ python base64_sem1.py inputsize=27 input= YW55IGNhcm5hbCBwbGVhcw== (97L, 110L, 121L, 32L, 99L, 97L, 114L, 110L, 97L, 108L, 32L, 112L, 108L, 101L, 97L, 115L) $ echo ' YW55IGNhcm5hbCBwbGVhcw==' | PYTHONPATH=../build/opt/src/bindings/python/ python base64_sem2.py inputsize=27 input= YW55IGNhcm5hbCBwbGVhcw== (97L, 110L, 121L, 32L, 99L, 97L, 114L, 110L, 97L, 108L, 32L, 112L, 108L, 101L, 97L, 115L) ``` --- examples/base64.py | 60 ++++++++++++++ examples/base64_sem1.py | 169 ++++++++++++++++++++++++++++++++++++++++ examples/base64_sem2.py | 159 +++++++++++++++++++++++++++++++++++++ 3 files changed, 388 insertions(+) create mode 100644 examples/base64.py create mode 100644 examples/base64_sem1.py create mode 100644 examples/base64_sem2.py diff --git a/examples/base64.py b/examples/base64.py new file mode 100644 index 00000000..3ffe304c --- /dev/null +++ b/examples/base64.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python2 + +# Example parser: Base64, syntax only. +# +# Demonstrates how to construct a Hammer parser that recognizes valid Base64 +# sequences. +# +# Note that no semantic evaluation of the sequence is performed, i.e. the +# byte sequence being represented is not returned, or determined. 
+# See base64_sem1.py and base64_sem2.py for examples of how to attach
+# appropriate semantic actions to the grammar.
+
+from __future__ import print_function
+
+import sys
+
+import hammer as h
+
+
+def init_parser():
+    """Return a parser that recognizes a Base64 sequence up to end of input."""
+    # CORE
+    digit = h.ch_range(0x30, 0x39)
+    alpha = h.choice(h.ch_range(0x41, 0x5a), h.ch_range(0x61, 0x7a))
+
+    # AUX.
+    plus = h.ch('+')
+    slash = h.ch('/')
+    equals = h.ch('=')
+
+    bsfdig = h.choice(alpha, digit, plus, slash)
+    bsfdig_4bit = h.in_('AEIMQUYcgkosw048')
+    bsfdig_2bit = h.in_('AQgw')
+    base64_3 = h.repeat_n(bsfdig, 4)
+    base64_2 = h.sequence(bsfdig, bsfdig, bsfdig_4bit, equals)
+    base64_1 = h.sequence(bsfdig, bsfdig_2bit, equals, equals)
+    base64 = h.sequence(h.many(base64_3),
+                        h.optional(h.choice(base64_2, base64_1)))
+
+    return h.sequence(h.whitespace(base64), h.whitespace(h.end_p()))
+
+
+def main():
+    """Parse stdin; log the input to stderr and print any parse result."""
+    document = init_parser()
+
+    s = sys.stdin.read()
+    inputsize = len(s)
+    print('inputsize=%i' % inputsize, file=sys.stderr)
+    print('input=%s' % s, file=sys.stderr, end='')
+
+    result = document.parse(s)
+
+    if result:
+        #print('parsed=%i bytes', result.bit_length/8, file=sys.stderr)
+        print(result)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/base64_sem1.py b/examples/base64_sem1.py
new file mode 100644
index 00000000..f0676ebb
--- /dev/null
+++ b/examples/base64_sem1.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python2
+
+# Example parser: Base64, with fine-grained semantic actions
+#
+# Demonstrates how to attach semantic actions to grammar rules and piece by
+# piece transform the parse tree into the desired semantic representation,
+# in this case a sequence of 8-bit values.
+#
+# Those rules using h.action get an attached action, which must be declared
+# (as a function).
+#
+# This variant of the example uses fine-grained semantic actions that
+# transform the parse tree in small steps in a bottom-up fashion. Compare
+# base64_sem2.py for an alternative approach using a single top-level action.
+
+from __future__ import print_function
+
+import functools
+import sys
+
+import hammer as h
+
+
+# Semantic actions for the grammar below, each corresponds to an "ARULE".
+# They must be named act_<rulename>.
+
+def act_bsfdig(p, user_data=None):
+    """Map a parsed base64 digit (char or integer code) to its 6-bit value."""
+    # FIXME See the note in init_parser()
+    c = p if isinstance(p, (int, long)) else ord(p)
+    if 0x41 <= c <= 0x5A:  # A-Z
+        return c - 0x41
+    elif 0x61 <= c <= 0x7A:  # a-z
+        return c - 0x61 + 26
+    elif 0x30 <= c <= 0x39:  # 0-9
+        return c - 0x30 + 52
+    elif c == 0x2B:  # '+' (c is an int here, so compare code points)
+        return 62
+    elif c == 0x2F:  # '/'
+        return 63
+    else:
+        raise ValueError
+
+# Hammer's Python bindings don't currently expose h_act_index or h_act_ignore
+
+def act_index0(p, user_data=None):
+    return p[0]
+
+def act_ignore(p, user_data=None):
+    return None
+
+act_bsfdig_4bit = act_bsfdig
+act_bsfdig_2bit = act_bsfdig
+
+act_equals = act_ignore
+act_ws = act_ignore
+
+act_document = act_index0
+
+
+def act_base64_n(n, p, user_data=None):
+    """General-form action: pack the n+1 six-bit values p[0..n] into n bytes.
+    Entries that are None count as 0. Returns the bytes as a tuple."""
+    res = [0]*n
+
+    x = 0
+    bits = 0
+    for i in xrange(0, n+1):
+        x <<= 6
+        x |= p[i] or 0
+        bits += 6
+
+    x >>= bits % 8  # align, i.e. cut off extra bits
+
+    for i in xrange(n):
+        item = x & 0xFF
+
+        res[n-1-i] = item   # output the last byte and
+        x >>= 8             # discard it
+
+    return tuple(res)
+
+
+act_base64_3 = functools.partial(act_base64_n, 3)
+act_base64_2 = functools.partial(act_base64_n, 2)
+act_base64_1 = functools.partial(act_base64_n, 1)
+
+
+def act_base64(p, user_data=None):
+    """Flatten p = (3-byte blocks, optional trailing block) into one tuple."""
+    assert isinstance(p, tuple)
+    assert len(p) == 2
+    assert isinstance(p[0], tuple)
+    res = []
+
+    # concatenate base64_3 blocks
+    for elem in p[0]:
+        res.extend(elem)
+
+    # append one trailing base64_2 or _1 block
+    tok = p[1]
+    if isinstance(tok, tuple):
+        res.extend(tok)
+
+    return tuple(res)
+
+
+def init_parser():
+    """Return a parser with the grammar to be recognized.
+    """
+    # CORE
+
+    # This is a direct translation of the C example. In C the literal 0x30
+    # is interchangeable with the char literal '0' (note the single quotes).
+    # This is not the case in Python.
+
+    # TODO In the interests of being more Pythonic settle on either string
+    # literals, or integers
+    digit = h.ch_range(0x30, 0x39)
+    alpha = h.choice(h.ch_range(0x41, 0x5a), h.ch_range(0x61, 0x7a))
+    space = h.in_(" \t\n\r\f\v")
+
+    # AUX.
+    plus = h.ch('+')
+    slash = h.ch('/')
+    equals = h.action(h.ch('='), act_equals)
+
+    bsfdig = h.action(h.choice(alpha, digit, plus, slash), act_bsfdig)
+    bsfdig_4bit = h.action(h.in_("AEIMQUYcgkosw048"), act_bsfdig_4bit)
+    bsfdig_2bit = h.action(h.in_("AQgw"), act_bsfdig_2bit)
+    base64_3 = h.action(h.repeat_n(bsfdig, 4), act_base64_3)
+    base64_2 = h.action(h.sequence(bsfdig, bsfdig, bsfdig_4bit, equals),
+                        act_base64_2)
+    base64_1 = h.action(h.sequence(bsfdig, bsfdig_2bit, equals, equals),
+                        act_base64_1)
+    base64 = h.action(h.sequence(h.many(base64_3),
+                                 h.optional(h.choice(base64_2,
+                                                     base64_1))),
+                      act_base64)
+
+    # TODO This is not quite the same as the C example, which uses act_ignore.
+    # But I can't get hammer to filter any value returned by act_ignore.
+    ws = h.ignore(h.many(space))
+    document = h.action(h.sequence(ws, base64, ws, h.end_p()),
+                        act_document)
+
+    # BUG sometimes inputs that should parse just don't.
+    # It *seemed* to happen mostly with things like "bbbbaaaaBA==".
+    # Using fewer actions seemed to make it less likely.
+
+    return document
+
+def main():
+    """Parse stdin; log the input to stderr and print any parse result."""
+    parser = init_parser()
+    s = sys.stdin.read()
+    inputsize = len(s)
+    print('inputsize=%i' % inputsize, file=sys.stderr)
+    print('input=%s' % s, file=sys.stderr, end='')
+
+    result = parser.parse(s)
+
+    if result:
+        #print('parsed=%i bytes', result.bit_length/8, file=sys.stderr)
+        print(result)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/base64_sem2.py b/examples/base64_sem2.py
new file mode 100644
index 00000000..6b5f8db1
--- /dev/null
+++ b/examples/base64_sem2.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python2
+
+# Example parser: Base64, with coarse-grained semantic actions
+#
+# Demonstrates how to attach semantic actions to a grammar and transform the
+# parse tree into the desired semantic representation, in this case a sequence
+# of 8-bit values.
+#
+# Those rules using h.action get an attached action, which must be declared
+# (as a function).
+#
+# This variant of the example uses coarse-grained semantic actions,
+# transforming the entire parse tree in one big step. Compare base64_sem1.py
+# for an alternative approach using a fine-grained piece-by-piece
+# transformation.
+
+from __future__ import print_function
+
+import functools
+import sys
+
+import hammer as h
+
+
+# Semantic actions for the grammar below, each corresponds to an "ARULE".
+# They must be named act_<rulename>.
+
+def bsfdig_value(p):
+    """Return the 6-bit value of a parsed base64 digit, or 0 if p is not one.
+    """
+    c = p if isinstance(p, (int, long)) else ord(p)
+    if c:
+        if 0x41 <= c <= 0x5A:  # A-Z
+            return c - 0x41
+        elif 0x61 <= c <= 0x7A:  # a-z
+            return c - 0x61 + 26
+        elif 0x30 <= c <= 0x39:  # 0-9
+            return c - 0x30 + 52
+        elif c == 0x2B:  # '+' (c is an int here, so compare code points)
+            return 62
+        elif c == 0x2F:  # '/'
+            return 63
+    return 0
+
+def act_base64(p, user_data=None):
+    """Decode parse tree p = (4-digit blocks, end block) into a byte tuple."""
+    assert isinstance(p, tuple)
+    assert len(p) == 2
+    assert isinstance(p[0], tuple)
+
+    # grab b64_3 block sequence
+    # grab and analyze b64 end block (_2 or _1)
+    b64_3 = p[0]
+    b64_2 = p[1]
+    b64_1 = p[1]
+
+    if not isinstance(b64_2, tuple):
+        b64_1 = b64_2 = None
+    elif b64_2[2] == '=':
+        b64_2 = None
+    else:
+        b64_1 = None
+
+    res = []
+
+    # concatenate base64_3 blocks
+    for digits in b64_3:
+        assert isinstance(digits, tuple)
+
+        x = bsfdig_value(digits[0])
+        x <<= 6; x |= bsfdig_value(digits[1])
+        x <<= 6; x |= bsfdig_value(digits[2])
+        x <<= 6; x |= bsfdig_value(digits[3])
+        res.append((x >> 16) & 0xFF)
+        res.append((x >> 8) & 0xFF)
+        res.append(x & 0xFF)
+
+    # append one trailing base64_2 or _1 block
+    if b64_2:
+        digits = b64_2
+        x = bsfdig_value(digits[0])
+        x <<= 6; x |= bsfdig_value(digits[1])
+        x <<= 6; x |= bsfdig_value(digits[2])
+        res.append((x >> 10) & 0xFF)
+        res.append((x >> 2) & 0xFF)
+    elif b64_1:
+        digits = b64_1
+        x = bsfdig_value(digits[0])
+        x <<= 6; x |= bsfdig_value(digits[1])
+        res.append((x >> 4) & 0xFF)
+
+    return tuple(res)
+
+# Hammer's Python bindings don't currently expose h_act_index or h_act_ignore
+
+def act_index0(p, user_data=None):
+    return p[0]
+
+def act_ignore(p, user_data=None):
+    return None
+
+act_ws = act_ignore
+act_document = act_index0
+
+
+def init_parser():
+    """Set up the parser with the grammar to be recognized.
+    """
+    # CORE
+    digit = h.ch_range(0x30, 0x39)
+    alpha = h.choice(h.ch_range(0x41, 0x5a), h.ch_range(0x61, 0x7a))
+    space = h.in_(" \t\n\r\f\v")
+
+    # AUX.
+    plus = h.ch('+')
+    slash = h.ch('/')
+    equals = h.ch('=')
+
+    bsfdig = h.choice(alpha, digit, plus, slash)
+    bsfdig_4bit = h.in_("AEIMQUYcgkosw048")
+    bsfdig_2bit = h.in_("AQgw")
+    base64_3 = h.repeat_n(bsfdig, 4)
+    base64_2 = h.sequence(bsfdig, bsfdig, bsfdig_4bit, equals)
+    base64_1 = h.sequence(bsfdig, bsfdig_2bit, equals, equals)
+    base64 = h.action(h.sequence(h.many(base64_3),
+                                 h.optional(h.choice(base64_2,
+                                                     base64_1))),
+                      act_base64)
+
+    # TODO This is not quite the same as the C example, which uses act_ignore.
+    # But I can't get hammer to filter any value returned by act_ignore.
+    ws = h.ignore(h.many(space))
+    document = h.action(h.sequence(ws, base64, ws, h.end_p()),
+                        act_document)
+
+    # BUG sometimes inputs that should parse just don't.
+    # It *seemed* to happen mostly with things like "bbbbaaaaBA==".
+    # Using fewer actions seemed to make it less likely.
+
+    return document
+
+
+def main():
+    """Parse stdin; log the input to stderr and print any parse result."""
+    parser = init_parser()
+    s = sys.stdin.read()
+    inputsize = len(s)
+    print('inputsize=%i' % inputsize, file=sys.stderr)
+    print('input=%s' % s, file=sys.stderr, end='')
+
+    result = parser.parse(s)
+
+    if result:
+        #print('parsed=%i bytes', result.bit_length/8, file=sys.stderr)
+        print(result)
+
+
+if __name__ == '__main__':
+    main()
-- 
GitLab