From c6280a98bc90e3fe175e4941489e56836d404c6c Mon Sep 17 00:00:00 2001
From: Alex Willmer <alex@moreati.org.uk>
Date: Fri, 4 Dec 2015 10:20:25 +0000
Subject: [PATCH] Added Python versions of base64 examples

These are transliterations of the existing C files.
They're not particularly Pythonic or performant, but they're a start.

Example of usage

```
$ echo '  YW55IGNhcm5hbCBwbGVhcw==' | PYTHONPATH=../build/opt/src/bindings/python/ python base64.py
inputsize=27
input=  YW55IGNhcm5hbCBwbGVhcw==
((((89L, 87L, 53L, 53L), (73L, 71L, 78L, 104L), (99L, 109L, 53L, 104L), (98L, 67L, 66L, 119L), (98L, 71L, 86L, 104L)), (99L, 'w', '=', '=')),)
$ echo '  YW55IGNhcm5hbCBwbGVhcw==' | PYTHONPATH=../build/opt/src/bindings/python/ python base64_sem1.py
inputsize=27
input=  YW55IGNhcm5hbCBwbGVhcw==
(97L, 110L, 121L, 32L, 99L, 97L, 114L, 110L, 97L, 108L, 32L, 112L, 108L, 101L, 97L, 115L)
$ echo '  YW55IGNhcm5hbCBwbGVhcw==' | PYTHONPATH=../build/opt/src/bindings/python/ python base64_sem2.py
inputsize=27
input=  YW55IGNhcm5hbCBwbGVhcw==
(97L, 110L, 121L, 32L, 99L, 97L, 114L, 110L, 97L, 108L, 32L, 112L, 108L, 101L, 97L, 115L)
```
---
 examples/base64.py      |  60 ++++++++++++++
 examples/base64_sem1.py | 169 ++++++++++++++++++++++++++++++++++++++++
 examples/base64_sem2.py | 159 +++++++++++++++++++++++++++++++++++++
 3 files changed, 388 insertions(+)
 create mode 100644 examples/base64.py
 create mode 100644 examples/base64_sem1.py
 create mode 100644 examples/base64_sem2.py

diff --git a/examples/base64.py b/examples/base64.py
new file mode 100644
index 00000000..3ffe304c
--- /dev/null
+++ b/examples/base64.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python2
+
+# Example parser: Base64, syntax only.
+#
+# Demonstrates how to construct a Hammer parser that recognizes valid Base64
+# sequences.
+#
+# Note that no semantic evaluation of the sequence is performed, i.e. the
+# byte sequence being represented is not returned, or determined. See
+# base64_sem1.py and base64_sem2.py for examples how to attach appropriate
+# semantic actions to the grammar.
+
+from __future__ import print_function
+
+import sys
+
+import hammer as h
+
+
+def init_parser():
+    """Return a parser that recognizes Base64 text without decoding it."""
+    # CORE: ASCII digits '0'-'9' and letters 'A'-'Z' / 'a'-'z'
+    digit = h.ch_range(0x30, 0x39)
+    alpha = h.choice(h.ch_range(0x41, 0x5a), h.ch_range(0x61, 0x7a))
+    # AUX.
+    plus = h.ch('+')
+    slash = h.ch('/')
+    equals = h.ch('=')
+
+    bsfdig = h.choice(alpha, digit, plus, slash)
+    bsfdig_4bit = h.in_('AEIMQUYcgkosw048')  # digits whose low 2 bits are 0
+    bsfdig_2bit = h.in_('AQgw')  # digits whose low 4 bits are 0
+    base64_3 = h.repeat_n(bsfdig, 4)  # full block: 4 digits
+    base64_2 = h.sequence(bsfdig, bsfdig, bsfdig_4bit, equals)  # 3 digits, '='
+    base64_1 = h.sequence(bsfdig, bsfdig_2bit, equals, equals)  # 2 digits, '=='
+    base64 = h.sequence(h.many(base64_3),
+                        h.optional(h.choice(base64_2, base64_1)))
+
+    return h.sequence(h.whitespace(base64), h.whitespace(h.end_p()))
+
+
+def main():
+    """Parse stdin as Base64 and print the parse result, if any, to stdout."""
+    document = init_parser()
+    s = sys.stdin.read()
+    inputsize = len(s)
+    print('inputsize=%i' % inputsize, file=sys.stderr)
+    print('input=%s' % s, file=sys.stderr, end='')  # echoed without extra newline
+
+    result = document.parse(s)
+
+    if result:  # a falsy result prints nothing
+        # TODO also report the parsed size (result.bit_length/8) on stderr
+        print(result)
+
+
+if __name__ == '__main__':
+    # Script entry point. sys is already imported at module level, so no
+    # local import is needed here.
+    main()
diff --git a/examples/base64_sem1.py b/examples/base64_sem1.py
new file mode 100644
index 00000000..f0676ebb
--- /dev/null
+++ b/examples/base64_sem1.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python2
+
+# Example parser: Base64, with fine-grained semantic actions
+#
+# Demonstrates how to attach semantic actions to grammar rules and piece by
+# piece transform the parse tree into the desired semantic representation,
+# in this case a sequence of 8-bit values.
+#
+# Those rules using h.action get an attached action, which must be declared
+# (as a function).
+#
+# This variant of the example uses fine-grained semantic actions that
+# transform the parse tree in small steps in a bottom-up fashion. Compare
+# base64_sem2.py for an alternative approach using a single top-level action.
+
+from __future__ import print_function
+
+import functools
+import sys
+
+import hammer as h
+
+
+# Semantic actions for the grammar below, each corresponds to an "ARULE".
+# They must be named act_<rulename>.
+
+def act_bsfdig(p, user_data=None):
+    """Map a base64 digit token (int code or 1-char str) to its 6-bit value.
+    Raises ValueError for anything outside the base64 alphabet."""
+    # FIXME See the note in init_parser()
+    c = p if isinstance(p, (int, long)) else ord(p)
+    if 0x41 <= c <= 0x5A: # A-Z
+        return c - 0x41
+    if 0x61 <= c <= 0x7A: # a-z
+        return c - 0x61 + 26
+    if 0x30 <= c <= 0x39: # 0-9
+        return c - 0x30 + 52
+    if c == 0x2B: # '+' (c is an integer here, so compare code points)
+        return 62
+    if c == 0x2F: # '/'
+        return 63
+    raise ValueError('invalid base64 digit: %r' % (p,))
+
+# Hammer's Python bindings don't currently expose h_act_index or h_act_ignore
+
+def act_index0(p, user_data=None):
+    return p[0]  # keep only the first element of the parsed sequence
+
+def act_ignore(p, user_data=None):
+    return None  # discard the parsed value, whatever it is
+
+act_bsfdig_4bit = act_bsfdig  # restricted digit sets decode like any digit
+act_bsfdig_2bit = act_bsfdig
+
+act_equals      = act_ignore  # '=' padding carries no data
+act_ws          = act_ignore  # NOTE: unused below; init_parser() uses h.ignore
+
+act_document    = act_index0  # the document's value is its base64 element
+
+
+def act_base64_n(n, p, user_data=None):
+    """Decode the digit values p[0]..p[n] into a tuple of n byte values.
+
+    Each entry of p contributes 6 bits (falsy entries count as 0); the
+    surplus low-order bits that do not fill a whole byte are dropped.
+    """
+    ndigits = n + 1
+
+    # Pack all 6-bit groups into one integer, first digit most significant.
+    packed = 0
+    for pos in xrange(ndigits):
+        packed = (packed << 6) | (p[pos] or 0)
+
+    # Drop the padding bits so the value is a whole number of bytes.
+    packed >>= (6 * ndigits) % 8
+
+    # Peel off bytes from the most significant end.
+    octets = []
+    for shift in xrange(8 * (n - 1), -1, -8):
+        octets.append((packed >> shift) & 0xFF)
+    return tuple(octets)
+
+
+act_base64_3 = functools.partial(act_base64_n, 3)  # 4 digits -> 3 bytes
+act_base64_2 = functools.partial(act_base64_n, 2)  # 3 digits -> 2 bytes
+act_base64_1 = functools.partial(act_base64_n, 1)  # 2 digits -> 1 byte
+
+
+def act_base64(p, user_data=None):
+    """Flatten p = (base64_3 blocks, tail) into a single tuple.
+
+    The tail (a trailing base64_2 or base64_1 block) is appended only
+    when it is a tuple.
+    """
+    assert isinstance(p, tuple)
+    assert len(p) == 2
+    assert isinstance(p[0], tuple)
+    blocks, tail = p[0], p[1]
+
+    # concatenate base64_3 blocks, then the trailing block if there is one
+    decoded = [octet for block in blocks for octet in block]
+    if isinstance(tail, tuple):
+        decoded += tail
+
+    return tuple(decoded)
+
+
+def init_parser():
+    """Return the document parser: the Base64 grammar with the act_* semantic
+    actions attached to its rules via h.action."""
+    # CORE
+
+    # This is a direct translation of the C example. In C the literal 0x30
+    # is interchangeable with the char literal '0' (note the single quotes).
+    # This is not the case in Python.
+
+    # TODO In the interests of being more Pythonic settle on either string
+    #      literals, or integers
+    digit   = h.ch_range(0x30, 0x39)
+    alpha   = h.choice(h.ch_range(0x41, 0x5a), h.ch_range(0x61, 0x7a))
+    space   = h.in_(" \t\n\r\f\v")
+
+    # AUX.
+    plus    = h.ch('+')
+    slash   = h.ch('/')
+    equals  = h.action(h.ch('='), act_equals)
+
+    bsfdig      = h.action(h.choice(alpha, digit, plus, slash), act_bsfdig)
+    bsfdig_4bit = h.action(h.in_("AEIMQUYcgkosw048"), act_bsfdig_4bit)
+    bsfdig_2bit = h.action(h.in_("AQgw"), act_bsfdig_2bit)
+    base64_3    = h.action(h.repeat_n(bsfdig, 4), act_base64_3)
+    base64_2    = h.action(h.sequence(bsfdig, bsfdig, bsfdig_4bit, equals),
+                           act_base64_2)
+    base64_1    = h.action(h.sequence(bsfdig, bsfdig_2bit, equals, equals),
+                           act_base64_1)
+    base64      = h.action(h.sequence(h.many(base64_3),
+                                      h.optional(h.choice(base64_2,
+                                                          base64_1))),
+                           act_base64)
+
+    # TODO This is not quite the same as the C example, which uses act_ignore.
+    #      But I can't get hammer to filter any value returned by act_ignore.
+    ws          = h.ignore(h.many(space))
+    document    = h.action(h.sequence(ws, base64, ws, h.end_p()),
+                           act_document)
+
+    # BUG sometimes inputs that should parse just don't.
+    # It *seemed* to happen mostly with things like "bbbbaaaaBA==".
+    # Using fewer actions seemed to make it less likely.
+
+    return document
+
+def main():
+    """Decode Base64 read from stdin and print the resulting byte values."""
+    parser = init_parser()
+    s = sys.stdin.read()
+    inputsize = len(s)
+    print('inputsize=%i' % inputsize, file=sys.stderr)
+    print('input=%s' % s, file=sys.stderr, end='')
+
+    result = parser.parse(s)
+    # NOTE(review): an empty decoded tuple is falsy too and prints nothing.
+    if result:
+        # TODO also report the parsed size (result.bit_length/8) on stderr
+        print(result)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/base64_sem2.py b/examples/base64_sem2.py
new file mode 100644
index 00000000..6b5f8db1
--- /dev/null
+++ b/examples/base64_sem2.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python2
+
+# Example parser: Base64, with coarse-grained semantic actions
+#
+# Demonstrates how to attach semantic actions to a grammar and transform the
+# parse tree into the desired semantic representation, in this case a sequence
+# of 8-bit values.
+#
+# Those rules using h.action get an attached action, which must be declared
+# (as a function).
+#
+# This variant of the example uses coarse-grained semantic actions,
+# transforming the entire parse tree in one big step. Compare base64_sem1.py
+# for an alternative approach using a fine-grained piece-by-piece
+# transformation.
+
+from __future__ import print_function
+
+import functools
+import sys
+
+import hammer as h
+
+
+# Semantic actions for the grammar below, each corresponds to an "ARULE".
+# They must be named act_<rulename>.
+
+def bsfdig_value(p):
+    """Return the 6-bit value of a parsed base64 digit.
+    p is an integer code or a 1-char string; non-digits (e.g. '=') give 0.
+    """
+    c = p if isinstance(p, (int, long)) else ord(p)
+    if 0x41 <= c <= 0x5A: # A-Z
+        return c - 0x41
+    if 0x61 <= c <= 0x7A: # a-z
+        return c - 0x61 + 26
+    if 0x30 <= c <= 0x39: # 0-9
+        return c - 0x30 + 52
+    if c == 0x2B: # '+' (c is an integer here, so compare code points)
+        return 62
+    if c == 0x2F: # '/'
+        return 63
+    return 0
+
+def act_base64(p, user_data=None):
+    """Decode the parse tree p = (base64_3 blocks, tail) into a tuple of bytes."""
+    assert isinstance(p, tuple)
+    assert len(p) == 2
+    assert isinstance(p[0], tuple)
+    # grab b64_3 block sequence
+    # grab and analyze b64 end block (_2 or _1)
+    b64_3 = p[0]
+    b64_2 = p[1]
+    b64_1 = p[1]
+
+    if not isinstance(b64_2, tuple):  # no trailing block
+        b64_1 = b64_2 = None
+    elif b64_2[2] == '=':  # third token is padding: a base64_1 block
+        b64_2 = None       # (assumes '=' arrives as a str -- TODO confirm)
+    else:
+        b64_1 = None
+
+    # allocate result sequence
+    res = []
+
+    # concatenate base64_3 blocks
+    for digits in b64_3:
+        assert isinstance(digits, tuple)
+
+        x = bsfdig_value(digits[0])
+        x <<= 6; x |= bsfdig_value(digits[1])
+        x <<= 6; x |= bsfdig_value(digits[2])
+        x <<= 6; x |= bsfdig_value(digits[3])
+        res.append((x >> 16) & 0xFF)  # 4 digits = 24 bits = 3 bytes
+        res.append((x >> 8) & 0xFF)
+        res.append(x & 0xFF)
+
+    # append one trailing base64_2 or _1 block
+    if b64_2:
+        digits = b64_2
+        x = bsfdig_value(digits[0])
+        x <<= 6; x |= bsfdig_value(digits[1])
+        x <<= 6; x |= bsfdig_value(digits[2])
+        res.append((x >> 10) & 0xFF)  # 18 bits: the low 2 bits are dropped
+        res.append((x >> 2) & 0xFF)
+    elif b64_1:
+        digits = b64_1
+        x = bsfdig_value(digits[0])
+        x <<= 6; x |= bsfdig_value(digits[1])
+        res.append((x >> 4) & 0xFF)  # 12 bits: the low 4 bits are dropped
+
+    return tuple(res)
+
+# Hammer's Python bindings don't currently expose h_act_index or h_act_ignore
+
+def act_index0(p, user_data=None):
+    return p[0]  # keep only the first element of the parsed sequence
+
+def act_ignore(p, user_data=None):
+    return None  # discard the parsed value, whatever it is
+
+act_ws          = act_ignore  # NOTE: unused below; init_parser() uses h.ignore
+act_document    = act_index0  # the document's value is its base64 element
+
+
+def init_parser():
+    """Return the document parser: the Base64 grammar with a single decoding
+    action (act_base64) attached to the top-level base64 rule."""
+    # CORE
+    digit   = h.ch_range(0x30, 0x39)
+    alpha   = h.choice(h.ch_range(0x41, 0x5a), h.ch_range(0x61, 0x7a))
+    space   = h.in_(" \t\n\r\f\v")
+
+    # AUX.
+    plus    = h.ch('+')
+    slash   = h.ch('/')
+    equals  = h.ch('=')
+
+    bsfdig      = h.choice(alpha, digit, plus, slash)
+    bsfdig_4bit = h.in_("AEIMQUYcgkosw048")  # digits whose low 2 bits are 0
+    bsfdig_2bit = h.in_("AQgw")  # digits whose low 4 bits are 0
+    base64_3    = h.repeat_n(bsfdig, 4)
+    base64_2    = h.sequence(bsfdig, bsfdig, bsfdig_4bit, equals)
+    base64_1    = h.sequence(bsfdig, bsfdig_2bit, equals, equals)
+    base64      = h.action(h.sequence(h.many(base64_3),
+                                      h.optional(h.choice(base64_2,
+                                                          base64_1))),
+                           act_base64)
+
+    # TODO This is not quite the same as the C example, which uses act_ignore.
+    #      But I can't get hammer to filter any value returned by act_ignore.
+    ws          = h.ignore(h.many(space))
+    document    = h.action(h.sequence(ws, base64, ws, h.end_p()),
+                           act_document)
+
+    # BUG sometimes inputs that should parse just don't.
+    # It *seemed* to happen mostly with things like "bbbbaaaaBA==".
+    # Using fewer actions seemed to make it less likely.
+
+    return document
+
+
+def main():
+    """Decode Base64 read from stdin and print the resulting byte values."""
+    parser = init_parser()
+    s = sys.stdin.read()
+    inputsize = len(s)
+    print('inputsize=%i' % inputsize, file=sys.stderr)
+    print('input=%s' % s, file=sys.stderr, end='')
+
+    result = parser.parse(s)
+    # NOTE(review): an empty decoded tuple is falsy too and prints nothing.
+    if result:
+        # TODO also report the parsed size (result.bit_length/8) on stderr
+        print(result)
+
+
+if __name__ == '__main__':
+    main()
-- 
GitLab