diff --git a/izodparse/ok.py b/izodparse/ok.py new file mode 100644 index 0000000000000000000000000000000000000000..ee916463a6a7ed964a267dccccafeb57fd792811 --- /dev/null +++ b/izodparse/ok.py @@ -0,0 +1,8 @@ +"Testing utilities." + +def ok(a, b): assert a == b, (a, b) + +def please_be(a, b): + if a != b: + raise ValueError(a, '!=', b) + diff --git a/izodparse/pdftour.py b/izodparse/pdftour.py index 3a48cbc93d0de76b9a3d5c8f43a1c587fde18976..fdb9c9cb320e58a817ad3829b7d57bb83edd3552 100755 --- a/izodparse/pdftour.py +++ b/izodparse/pdftour.py @@ -55,7 +55,8 @@ We can attempt parsing at that point with an ad-hoc custom grammar: We can get a debug dump of a grammar: - >>> print(pdftour.show_grammar(pdftour.xref_header)) + >>> from izodparse.peg import show_grammar + >>> print(show_grammar(pdftour.xref_header)) S: (((('xref' eol) ex0) ' ') ex0) eol eol: ('\r\n' | '\r') | '\n' ex0: '+' | '-' | '' digit digit* @@ -97,7 +98,7 @@ page contents: b'\x00)', b'\x00\x0f', b'\x00\x0f', b'\x00,', b'\x00\x10', b'\x00\x10', b'\x00-', b'\x00\x11', b'\x00\x11', b'\x00.', b'\x00\x12', b'\x00\x12', b'\x00/', b'\x00\x13... - >>> print(pdftour.show_grammar(pdftour.csranges_to_grammar(_[0]))) + >>> print(show_grammar(pdftour.csranges_to_grammar(_[0]))) S: ([\x00-\xff] [\x00-\xff])* Also, we can pull out page contents: @@ -114,15 +115,6 @@ Also, we can pull out page contents: 0 G 0 188.5 m -Too slow to use in practice (30 kilobytes/sec), but hopefully -validates our understanding of the problem and communicates it more -clearly than a lower-level language would. - -A pitfall I've run into a lot with the PEG parser in here is that if -you try to parse a Unicode string with a byte-string grammar or vice -versa, you just get silent failures to parse. This is a specific case -of the more general problem with silent failures. - Still unimplemented: - CMap bfranges that *use* arrays; @@ -150,458 +142,12 @@ You should have received a copy of the GNU General Public License along with izodparse. If not, see <http://www.gnu.org/licenses/>. """ -import sys, types, functools, operator, zlib - - -class memoprop: - """"Simplified, non-multithreaded version of functools.cached_property. - - For Pythons earlier than 3.8. Doesn't support __slots__, custom - __dict__, etc. - - """ - def __init__(self, func): - self.func = func - - def __get__(self, instance, cls): - cache = instance.__dict__ - if self not in cache: - cache[self] = self.func(instance) - return cache[self] - -def please_be(a, b): - if a != b: - raise ValueError(a, '!=', b) - - -### Packrat parsing engine with predictive lookahead parsing ### - -# First, some debugging utilities: -def represent_cset(cs): - "Debugging helper for understanding first sets; takes a set() of ints or chars." - if any(isinstance(c, str) for c in cs): - return represent_uset(cs) - - ranges = [] - for bt in sorted(cs): - if bt == b'-'[0] and not (bt - 1 in cs and bt + 1 in cs): - ranges[0:0] = [(bt, bt)] - continue - - if ranges and bt == ranges[-1][-1] + 1: - ranges[-1] = ranges[-1][0], bt - else: - ranges.append((bt, bt)) - - rangereps = [bytes((b,)) if a == b else bytes((a, ord('-'), b)) - for a, b in ranges] - return '[%s]' % repr(b''.join(rangereps))[2:-1] - -def represent_uset(cs): - "Unicode version of represent_cset." - ranges = [] - for c in sorted(cs): - if c == '-' and not (',' in cs and '.' in cs): # previous and following characters - ranges[0:0] = [(c, c)] - continue - - c = ord(c) - if ranges and c == ranges[-1][-1] + 1: - ranges[-1] = ranges[-1][0], c - else: - ranges.append((c, c)) - - rangereps = [chr(b) if a == b else chr(a) + '-' + chr(b) - for a, b in ranges] - return '[%s]' % ''.join(rangereps) - - -class GrammarVisitor: - """Compute global properties of a grammar, like what expressions are referenced twice. - - This is useful for printing out a readable representation of a - grammar. (Also, though we don't use this fact, singly-referenced - parsing expressions can never gain any benefit from the memo - table.) - - This version of Visitor deviates a bit from the standard pattern, - because our grammars are cyclic, so simply having the client - peremptorily invoke .accept on each of its children would loop - infinitely. Instead, we have the client invoke .inform_arc, which - allows the Visitor to decide whether to proceed to invoking - .accept. And then the .visit or .visit_type method doesn't need - to exist at all; .accept merely invokes .inform_arc for each - child. (This has the bug that if the root node has no children, - we'll never know about it.) - - """ - # A few nits that could be cleaned up: - # - negative charsets would make things like `regchar: - # [\x01-\x08\x0b\x0e-\x1f!-$&-'*-.0-;=?-Z\\^-z|~-\xff]` a lot more readable. - # This would involve shunting ^ to be not first if it's first I guess. - # - `[[]]` is not okay. `-` gets pulled to the front; `]` should - # be too. Not sure how to handle set(['-', ']']) but it hasn't arisen yet. - # - It would be beneficial to take advantage of the associativity - # of | and + to reduce parens. That might be better than the - # current precedence scheme. - - def __init__(self): - self.nodes = set() - self.parents_of = {} - self.roots = set() - - def inform_arc(self, parent, child): - # In the normal case, we got here because we invoked .accept - # on parent, so it'll already be in .nodes; but not for the - # initial entry point. - if parent not in self.nodes: - self.roots.add(parent) - self.nodes.add(parent) - - if child not in self.parents_of: - self.parents_of[child] = [] - self.parents_of[child].append(parent) - - if child not in self.nodes: - self.nodes.add(child) - child.accept(self) - - def show(self, rootname='S'): - "Dump out a human-readable grammar." - toplevel = (list(self.roots) + - [n for n in self.nodes - self.roots - if n.name is not None - or len(self.parents_of[n]) != 1]) - - # Assign names to all top-level nodes so they can be referenced. - names = {} - i = 0 - - for n in toplevel: - if n.name is None and n == toplevel[0]: - names[n] = rootname - elif n.name is None: - names[n] = 'ex%d' % i - i += 1 - else: - names[n] = n.name - - # Traverse the grammar a second time and output the - # description of each top-level rule. - output = [] - for n in toplevel: - output.append('%s: ' % names[n]) - output.append(self.pprint(n, names, 0, 0, top=True)) - output.append('\n') - - return ''.join(output) - - def pprint(self, prod, names, precedence_left, precedence_right, top=False): - "Produce a representation of a parsing expression. Violations of encapsulation willy-nilly." - if prod in names and not top: - return names[prod] - - if isinstance(prod, Any): - return '%s*' % self.pprint(prod.body, names, 20, 20) - elif isinstance(prod, Cat): - v = '%s %s' % (self.pprint(prod.a, names, precedence_left, 5), - self.pprint(prod.b, names, 5, precedence_right)) - return v if precedence_left <= 5 > precedence_right else '(%s)' % v - elif isinstance(prod, Charset): - return represent_cset(prod.first) - elif isinstance(prod, Alt): - v = '%s | %s' % (self.pprint(prod.a, names, precedence_left, 10), - self.pprint(prod.b, names, 10, precedence_right)) - return v if precedence_left <= 10 > precedence_right else '(%s)' % v - elif isinstance(prod, Lit): - return repr(prod.s) if isinstance(prod.s, str) else repr(prod.s.decode('utf-8')) - elif isinstance(prod, Thunk): - return self.pprint(prod.forced, names, precedence_left, precedence_right) - elif isinstance(prod, Tell): - return '@' - - else: - return str(prod) - - -def show_grammar(prod, rootname='S'): - v = GrammarVisitor() - prod.accept(v) - return v.show(rootname) - - -class Parse: - "A Packrat parse of a string or byte string." - def __init__(self, s, trace=lambda *args: None): - "s is the string to parse. Pass trace=print to enable verbose logging." - self.s = s - self.memos = {} - self.trace = trace - - def do(self, pos, ex): - "Attempt to parse using production/parsing expression `ex` starting at `pos`." - k = pos, ex - self.trace("parsing %s at %d" % (ex, pos)) - if k in self.memos: - self.trace("memo hit") - return self.memos[k] - result = ex.parse(self, pos) - if result and ex.xform: - result = result[0], ex.xform(result[1]) - - # if len(self.memos) > 16384: - # self.memos.clear() - self.memos[k] = result - self.trace("%s returns %s" % (ex, result)) - return result - - -class Prod: - "Base class for grammar productions, i.e., parsing expression types." - xform = None # Hook for post-processing. This turns out to be the wrong thing - name = None # Human-readable tag for debugging output - - def __str__(self): - return self.debugstr - - @memoprop - def debugstr(self): - "Used for tracing." - first = represent_cset(self.first) - if self.name: - return self.name + first - else: - return '<%s%s>' % (self.__class__.__name__, first) - - def __add__(self, other): - "Concatenation of parsing expressions." - return Cat(self, as_prod(other)) - def __radd__(self, other): - return as_prod(other) + self - - def __or__(self, other): - "Alternation (ordered choice) of parsing expressions." - return Alt(self, as_prod(other)) - def __ror__(self, other): - return as_prod(other) | self - - def some(self): - """One or more repetitions. - - The possibility of overwriting its .xform shows why .xform is bad. - """ - result = self + Any(self) - result.xform = lambda d: [d[0]] + d[1] - return result - - -def as_prod(datum): - "Coerce an arbitrary thing into a grammar production." - if isinstance(datum, Prod): - return datum - - if isinstance(datum, bytes) or isinstance(datum, str): - return Lit(datum) - - if isinstance(datum, list): - if len(datum) == 1: - return as_prod(datum[0]) - return as_prod(datum[0]) | datum[1:] - - if isinstance(datum, types.FunctionType): - return Thunk(datum) - - raise ValueError(datum) - - -class Lit(Prod): - "A parsing expression that matches a literal string or byte string." - def __init__(self, s): - self.s = s - self.first = {s[0]} if s else set() - self.nullable = not s - - def parse(self, parse, pos): - npos = pos + len(self.s) - if parse.s[pos:npos] == self.s: - return npos, self.s - - def accept(self, visitor): - pass - -def ok(a, b): assert a == b, (a, b) -ok(Parse("hello").do(0, Lit("hel")), (3, "hel")) -ok(Parse(b"hello").do(0, Lit(b"hel")), (3, b"hel")) -ok(Parse("hello").do(0, Lit("hec")), None) - - -class Cat(Prod): - "A parsing expression that matches the concatenation of two productions." - def __init__(self, a, b): - self.a, self.b = a, b - - def parse(self, parse, pos): - a = parse.do(pos, self.a) - if a is None: - return None - b = parse.do(a[0], self.b) - if b is None: - return None - return b[0], (a[1], b[1]) - - # These properties, used for predictive parsing, are lazily - # computed so that you can finish constructing a cyclic graph - # using Thunks before evaluating them. - @memoprop - def nullable(self): - return self.a.nullable and self.b.nullable - - @memoprop - def first(self): - return self.a.first | self.b.first if self.a.nullable else self.a.first - - def accept(self, visitor): - visitor.inform_arc(self, self.a) - visitor.inform_arc(self, self.b) - -assert Parse("hello").do(0, Lit('he') + 'll') == (4, ('he', 'll')) -assert Parse("hello").do(0, Lit('he') + 'lc') is None - - -class Alt(Prod): - "Matches the ordered-choice alternation of two productions." - def __init__(self, a, b): - self.a, self.b = a, b - - def parse(self, parse, pos): - c = parse.s[pos] if pos < len(parse.s) else None - # This form of lookahead increases speed by only about 30% - # to about 32 microseconds per byte. - if self.a.nullable or c in self.a.first: - a = parse.do(pos, self.a) - if a: - return a - - if self.b.nullable or c in self.b.first: - return parse.do(pos, self.b) - - return None - - @memoprop - def nullable(self): - return self.a.nullable or self.b.nullable - - @memoprop - def first(self): - return self.a.first | self.b.first - - def accept(self, visitor): - visitor.inform_arc(self, self.a) - visitor.inform_arc(self, self.b) - -assert Parse("hello").do(0, Lit('h') | 'x') == (1, 'h') -assert Parse("hello").do(0, Lit('x') | 'h') == (1, 'h') -assert Parse("hello").do(0, Lit('x') | 'y') == None -assert Parse("hello").do(0, Lit('h') | 'he') == (1, 'h') -assert Parse("hello").do(0, Lit('he') | 'h') == (2, 'he') - - -# To avoid Python's recursion depth when parsing nontrivial inputs, -# this is not defined in terms of Alt and Thunk, though in theory it -# could be. It turns out to be simpler this way because you don't -# have to post-process a cons list into a Python list. -class Any(Prod): - "Kleene-closure parsing expression: matches zero or more repetitions of a production." - def __init__(self, body): - self.body = body - self.nullable = True - - @memoprop - def first(self): - return self.body.first - - def parse(self, parse, pos): - results = [] - while True: - # Note, not taking advantage of predictive parsing in this - # case. Maybe I should. - kid = parse.do(pos, self.body) - if not kid: - return pos, results - results.append(kid[1]) - pos = kid[0] - - def accept(self, visitor): - visitor.inform_arc(self, self.body) - - -assert Parse("lalala!").do(0, Any(Lit('la'))) == (6, ['la', 'la', 'la']) -assert Parse("lalala!").do(0, Any(Lit('al'))) == (0, []) - - -class Thunk(Prod): - """A production whose definition is deferred until later. - - This allows the construction of cyclic grammars. In theory it - also allows the grammar to be constructed lazily and thus grow - during the parse. - """ - def __init__(self, body): - self.body = body - - @memoprop - def forced(self): - """Memoized property that ensures we only evaluate our body once. - - This doesn't affect performance, apparently, but semantically - it ensures that the meaning of the production doesn't change - over time. It still doesn't prevent you from creating lazily - computed infinite grammars, though... - - """ - return self.body() - - def parse(self, parse, pos): - return self.forced.parse(parse, pos) - - @property - def nullable(self): - return self.forced.nullable - - @property - def first(self): - return self.forced.first - - def accept(self, visitor): - visitor.inform_arc(self, self.forced) - - -class Charset(Prod): - "A parsing expression that matches any byte or character from a set()." - def __init__(self, cset): - self.first = cset - self.nullable = False - - def parse(self, parse, pos): - if pos >= len(parse.s): - return None - c = parse.s[pos] - if c in self.first: - return (pos + 1, c) - - def accept(self, visitor): - pass - -class Tell(Prod): - "Consumes no characters but 'parses' the current parse position." - first = set() - nullable = True - - def parse(self, parse, pos): - return pos, pos +import functools +import operator +import zlib - def accept(self, visitor): - pass +from .peg import Any, Charset, Lit, Thunk, Tell, Parse, as_prod +from .ok import ok, please_be ### PDF and PostScript and CMap parsing ### diff --git a/izodparse/peg.py b/izodparse/peg.py new file mode 100644 index 0000000000000000000000000000000000000000..42cfaa3f47ce01f52286a08fe773fdf767d1409e --- /dev/null +++ b/izodparse/peg.py @@ -0,0 +1,477 @@ +#!/usr/bin/python3 +r"""A small PEG parser generator. + +Too slow to use in practice (30 kilobytes/sec), but hopefully +validates our understanding of the problem and communicates it more +clearly than a lower-level language would. + +A pitfall I've run into a lot with the PEG parser in here is that if +you try to parse a Unicode string with a byte-string grammar or vice +versa, you just get silent failures to parse. This is a specific case +of the more general problem with silent failures. + +This file is part of izodparse. + +izodparse is free software: you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation, either version 3 of the License, or (at your +option) any later version. + +izodparse is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with izodparse. If not, see <http://www.gnu.org/licenses/>. + +""" +import types + +from .ok import ok, please_be + + +class memoprop: + """"Simplified, non-multithreaded version of functools.cached_property. + + For Pythons earlier than 3.8. Doesn't support __slots__, custom + __dict__, etc. + + """ + def __init__(self, func): + self.func = func + + def __get__(self, instance, cls): + cache = instance.__dict__ + if self not in cache: + cache[self] = self.func(instance) + return cache[self] + + +### Packrat parsing engine with predictive lookahead parsing ### + +# First, some debugging utilities: +def represent_cset(cs): + "Debugging helper for understanding first sets; takes a set() of ints or chars." + if any(isinstance(c, str) for c in cs): + return represent_uset(cs) + + ranges = [] + for bt in sorted(cs): + if bt == b'-'[0] and not (bt - 1 in cs and bt + 1 in cs): + ranges[0:0] = [(bt, bt)] + continue + + if ranges and bt == ranges[-1][-1] + 1: + ranges[-1] = ranges[-1][0], bt + else: + ranges.append((bt, bt)) + + rangereps = [bytes((b,)) if a == b else bytes((a, ord('-'), b)) + for a, b in ranges] + return '[%s]' % repr(b''.join(rangereps))[2:-1] + +def represent_uset(cs): + "Unicode version of represent_cset." + ranges = [] + for c in sorted(cs): + if c == '-' and not (',' in cs and '.' in cs): # previous and following characters + ranges[0:0] = [(c, c)] + continue + + c = ord(c) + if ranges and c == ranges[-1][-1] + 1: + ranges[-1] = ranges[-1][0], c + else: + ranges.append((c, c)) + + rangereps = [chr(b) if a == b else chr(a) + '-' + chr(b) + for a, b in ranges] + return '[%s]' % ''.join(rangereps) + + +class GrammarVisitor: + """Compute global properties of a grammar, like what expressions are referenced twice. + + This is useful for printing out a readable representation of a + grammar. (Also, though we don't use this fact, singly-referenced + parsing expressions can never gain any benefit from the memo + table.) + + This version of Visitor deviates a bit from the standard pattern, + because our grammars are cyclic, so simply having the client + peremptorily invoke .accept on each of its children would loop + infinitely. Instead, we have the client invoke .inform_arc, which + allows the Visitor to decide whether to proceed to invoking + .accept. And then the .visit or .visit_type method doesn't need + to exist at all; .accept merely invokes .inform_arc for each + child. (This has the bug that if the root node has no children, + we'll never know about it.) + + """ + # A few nits that could be cleaned up: + # - negative charsets would make things like `regchar: + # [\x01-\x08\x0b\x0e-\x1f!-$&-'*-.0-;=?-Z\\^-z|~-\xff]` a lot more readable. + # This would involve shunting ^ to be not first if it's first I guess. + # - `[[]]` is not okay. `-` gets pulled to the front; `]` should + # be too. Not sure how to handle set(['-', ']']) but it hasn't arisen yet. + # - It would be beneficial to take advantage of the associativity + # of | and + to reduce parens. That might be better than the + # current precedence scheme. + + def __init__(self): + self.nodes = set() + self.parents_of = {} + self.roots = set() + + def inform_arc(self, parent, child): + # In the normal case, we got here because we invoked .accept + # on parent, so it'll already be in .nodes; but not for the + # initial entry point. + if parent not in self.nodes: + self.roots.add(parent) + self.nodes.add(parent) + + if child not in self.parents_of: + self.parents_of[child] = [] + self.parents_of[child].append(parent) + + if child not in self.nodes: + self.nodes.add(child) + child.accept(self) + + def show(self, rootname='S'): + "Dump out a human-readable grammar." + toplevel = (list(self.roots) + + [n for n in self.nodes - self.roots + if n.name is not None + or len(self.parents_of[n]) != 1]) + + # Assign names to all top-level nodes so they can be referenced. + names = {} + i = 0 + + for n in toplevel: + if n.name is None and n == toplevel[0]: + names[n] = rootname + elif n.name is None: + names[n] = 'ex%d' % i + i += 1 + else: + names[n] = n.name + + # Traverse the grammar a second time and output the + # description of each top-level rule. + output = [] + for n in toplevel: + output.append('%s: ' % names[n]) + output.append(self.pprint(n, names, 0, 0, top=True)) + output.append('\n') + + return ''.join(output) + + def pprint(self, prod, names, precedence_left, precedence_right, top=False): + "Produce a representation of a parsing expression. Violations of encapsulation willy-nilly." + if prod in names and not top: + return names[prod] + + if isinstance(prod, Any): + return '%s*' % self.pprint(prod.body, names, 20, 20) + elif isinstance(prod, Cat): + v = '%s %s' % (self.pprint(prod.a, names, precedence_left, 5), + self.pprint(prod.b, names, 5, precedence_right)) + return v if precedence_left <= 5 > precedence_right else '(%s)' % v + elif isinstance(prod, Charset): + return represent_cset(prod.first) + elif isinstance(prod, Alt): + v = '%s | %s' % (self.pprint(prod.a, names, precedence_left, 10), + self.pprint(prod.b, names, 10, precedence_right)) + return v if precedence_left <= 10 > precedence_right else '(%s)' % v + elif isinstance(prod, Lit): + return repr(prod.s) if isinstance(prod.s, str) else repr(prod.s.decode('utf-8')) + elif isinstance(prod, Thunk): + return self.pprint(prod.forced, names, precedence_left, precedence_right) + elif isinstance(prod, Tell): + return '@' + + else: + return str(prod) + + +def show_grammar(prod, rootname='S'): + v = GrammarVisitor() + prod.accept(v) + return v.show(rootname) + + +class Parse: + "A Packrat parse of a string or byte string." + def __init__(self, s, trace=lambda *args: None): + "s is the string to parse. Pass trace=print to enable verbose logging." + self.s = s + self.memos = {} + self.trace = trace + + def do(self, pos, ex): + "Attempt to parse using production/parsing expression `ex` starting at `pos`." + k = pos, ex + self.trace("parsing %s at %d" % (ex, pos)) + if k in self.memos: + self.trace("memo hit") + return self.memos[k] + result = ex.parse(self, pos) + if result and ex.xform: + result = result[0], ex.xform(result[1]) + + # if len(self.memos) > 16384: + # self.memos.clear() + self.memos[k] = result + self.trace("%s returns %s" % (ex, result)) + return result + + +class Prod: + "Base class for grammar productions, i.e., parsing expression types." + xform = None # Hook for post-processing. This turns out to be the wrong thing + name = None # Human-readable tag for debugging output + + def __str__(self): + return self.debugstr + + @memoprop + def debugstr(self): + "Used for tracing." + first = represent_cset(self.first) + if self.name: + return self.name + first + else: + return '<%s%s>' % (self.__class__.__name__, first) + + def __add__(self, other): + "Concatenation of parsing expressions." + return Cat(self, as_prod(other)) + def __radd__(self, other): + return as_prod(other) + self + + def __or__(self, other): + "Alternation (ordered choice) of parsing expressions." + return Alt(self, as_prod(other)) + def __ror__(self, other): + return as_prod(other) | self + + def some(self): + """One or more repetitions. + + The possibility of overwriting its .xform shows why .xform is bad. + """ + result = self + Any(self) + result.xform = lambda d: [d[0]] + d[1] + return result + + +def as_prod(datum): + "Coerce an arbitrary thing into a grammar production." + if isinstance(datum, Prod): + return datum + + if isinstance(datum, bytes) or isinstance(datum, str): + return Lit(datum) + + if isinstance(datum, list): + if len(datum) == 1: + return as_prod(datum[0]) + return as_prod(datum[0]) | datum[1:] + + if isinstance(datum, types.FunctionType): + return Thunk(datum) + + raise ValueError(datum) + + +class Lit(Prod): + "A parsing expression that matches a literal string or byte string." + def __init__(self, s): + self.s = s + self.first = {s[0]} if s else set() + self.nullable = not s + + def parse(self, parse, pos): + npos = pos + len(self.s) + if parse.s[pos:npos] == self.s: + return npos, self.s + + def accept(self, visitor): + pass + +ok(Parse("hello").do(0, Lit("hel")), (3, "hel")) +ok(Parse(b"hello").do(0, Lit(b"hel")), (3, b"hel")) +ok(Parse("hello").do(0, Lit("hec")), None) + + +class Cat(Prod): + "A parsing expression that matches the concatenation of two productions." + def __init__(self, a, b): + self.a, self.b = a, b + + def parse(self, parse, pos): + a = parse.do(pos, self.a) + if a is None: + return None + b = parse.do(a[0], self.b) + if b is None: + return None + return b[0], (a[1], b[1]) + + # These properties, used for predictive parsing, are lazily + # computed so that you can finish constructing a cyclic graph + # using Thunks before evaluating them. + @memoprop + def nullable(self): + return self.a.nullable and self.b.nullable + + @memoprop + def first(self): + return self.a.first | self.b.first if self.a.nullable else self.a.first + + def accept(self, visitor): + visitor.inform_arc(self, self.a) + visitor.inform_arc(self, self.b) + +assert Parse("hello").do(0, Lit('he') + 'll') == (4, ('he', 'll')) +assert Parse("hello").do(0, Lit('he') + 'lc') is None + + +class Alt(Prod): + "Matches the ordered-choice alternation of two productions." + def __init__(self, a, b): + self.a, self.b = a, b + + def parse(self, parse, pos): + c = parse.s[pos] if pos < len(parse.s) else None + # This form of lookahead increases speed by only about 30% + # to about 32 microseconds per byte. + if self.a.nullable or c in self.a.first: + a = parse.do(pos, self.a) + if a: + return a + + if self.b.nullable or c in self.b.first: + return parse.do(pos, self.b) + + return None + + @memoprop + def nullable(self): + return self.a.nullable or self.b.nullable + + @memoprop + def first(self): + return self.a.first | self.b.first + + def accept(self, visitor): + visitor.inform_arc(self, self.a) + visitor.inform_arc(self, self.b) + +assert Parse("hello").do(0, Lit('h') | 'x') == (1, 'h') +assert Parse("hello").do(0, Lit('x') | 'h') == (1, 'h') +assert Parse("hello").do(0, Lit('x') | 'y') == None +assert Parse("hello").do(0, Lit('h') | 'he') == (1, 'h') +assert Parse("hello").do(0, Lit('he') | 'h') == (2, 'he') + + +# To avoid Python's recursion depth when parsing nontrivial inputs, +# this is not defined in terms of Alt and Thunk, though in theory it +# could be. It turns out to be simpler this way because you don't +# have to post-process a cons list into a Python list. +class Any(Prod): + "Kleene-closure parsing expression: matches zero or more repetitions of a production." + def __init__(self, body): + self.body = body + self.nullable = True + + @memoprop + def first(self): + return self.body.first + + def parse(self, parse, pos): + results = [] + while True: + # Note, not taking advantage of predictive parsing in this + # case. Maybe I should. + kid = parse.do(pos, self.body) + if not kid: + return pos, results + results.append(kid[1]) + pos = kid[0] + + def accept(self, visitor): + visitor.inform_arc(self, self.body) + + +assert Parse("lalala!").do(0, Any(Lit('la'))) == (6, ['la', 'la', 'la']) +assert Parse("lalala!").do(0, Any(Lit('al'))) == (0, []) + + +class Thunk(Prod): + """A production whose definition is deferred until later. + + This allows the construction of cyclic grammars. In theory it + also allows the grammar to be constructed lazily and thus grow + during the parse. + """ + def __init__(self, body): + self.body = body + + @memoprop + def forced(self): + """Memoized property that ensures we only evaluate our body once. + + This doesn't affect performance, apparently, but semantically + it ensures that the meaning of the production doesn't change + over time. It still doesn't prevent you from creating lazily + computed infinite grammars, though... + + """ + return self.body() + + def parse(self, parse, pos): + return self.forced.parse(parse, pos) + + @property + def nullable(self): + return self.forced.nullable + + @property + def first(self): + return self.forced.first + + def accept(self, visitor): + visitor.inform_arc(self, self.forced) + + +class Charset(Prod): + "A parsing expression that matches any byte or character from a set()." + def __init__(self, cset): + self.first = cset + self.nullable = False + + def parse(self, parse, pos): + if pos >= len(parse.s): + return None + c = parse.s[pos] + if c in self.first: + return (pos + 1, c) + + def accept(self, visitor): + pass + +class Tell(Prod): + "Consumes no characters but 'parses' the current parse position." + first = set() + nullable = True + + def parse(self, parse, pos): + return pos, pos + + def accept(self, visitor): + pass diff --git a/plans.org b/plans.org index 996d0b4e8129a3ef865dba3b7f908319a872e771..9eedac711decc78e95358947cdb5b7b0a481cb17 100644 --- a/plans.org +++ b/plans.org @@ -1,80 +1,87 @@ -* DONE split out pdftour into separate repo. -charpy? (too popular) rebound? (too popular) schmidtconcrete? (ok) -sclerometer? (wrong test) izod? (seems okay) 1zod or iz0d? better. -punch? (a tiny hammer, too popular). Bec? (de corbin, but too -popular) martel? (could be, but common surname) maillet? (could be, -but common surname) otsuchi? (would be fine) totokia? (would be -fine) mere? (too common) patu? (too common) +* things I am thinking of doing [3/14] +** DONE split out pdftour into separate repo. + charpy? (too popular) rebound? (too popular) schmidtconcrete? (ok) + sclerometer? (wrong test) izod? (seems okay) 1zod or iz0d? better. + punch? (a tiny hammer, too popular). Bec? (de corbin, but too + popular) martel? (could be, but common surname) maillet? (could be, + but common surname) otsuchi? (would be fine) totokia? (would be + fine) mere? (too common) patu? (too common) -For now it's 1zodparse. No, izodparse, so it's a valid Python module -name. + For now it's 1zodparse. No, izodparse, so it's a valid Python module + name. -How do I git filter-branch? I want pdftour.py and parsecmaps.py. or -maybe git-filter-repo? no, don't have it. --prune-empty? --all? -Can I use git ls-files -z | egrep -zv 'parsecmap|pdftour' | xargs -0 -git rm? + How do I git filter-branch? I want pdftour.py and parsecmaps.py. or + maybe git-filter-repo? no, don't have it. --prune-empty? --all? + Can I use git ls-files -z | egrep -zv 'parsecmap|pdftour' | xargs -0 + git rm? -maybe + maybe - git filter-branch --index-filter 'git ls-files -z | egrep -zv "parsemap|pdftour" | xargs -0 git rm' --all + git filter-branch --index-filter 'git ls-files -z | egrep -zv "parsemap|pdftour" | xargs -0 git rm' --all -no, that gives me a usage warning... + no, that gives me a usage warning... - git filter-branch --prune-empty --index-filter 'git ls-files -z | egrep -zv "parsemap|pdftour" | xargs -r0 git rm' -- --all + git filter-branch --prune-empty --index-filter 'git ls-files -z | egrep -zv "parsemap|pdftour" | xargs -r0 git rm' -- --all -almost! + almost! - FILTER_BRANCH_SQUELCH_WARNING=1 git filter-branch --prune-empty --index-filter 'git ls-files -z | egrep -zv "parsecmap|pdftour" | xargs -r0 git rm' -- --all + FILTER_BRANCH_SQUELCH_WARNING=1 git filter-branch --prune-empty --index-filter 'git ls-files -z | egrep -zv "parsecmap|pdftour" | xargs -r0 git rm' -- --all -Almost but now I have this merge commit: + Almost but now I have this merge commit: - user (1): - Merge /media/usb0/stuff/text_parse + user (1): + Merge /media/usb0/stuff/text_parse -In -https://stackoverflow.com/questions/9803294/prune-empty-merge-commits-from-history-in-git-repository -we find this suggestion: + In + https://stackoverflow.com/questions/9803294/prune-empty-merge-commits-from-history-in-git-repository + we find this suggestion: - compu@compu-HP-Pavilion-14-Notebook-PC:~/1zodparse$ git rebase --root HEAD - Successfully rebased and updated detached HEAD. - compu@compu-HP-Pavilion-14-Notebook-PC:~/1zodparse$ git shortlog - Kragen Javier Sitaker (10): - Add initial spike of parsing CMaps - Remove Python3.8 dependency from parsecmaps.py - Add readable debug display of grammars - Add PDF file traversal skeleton to parsecmaps - Enable parsecmaps to navigate PDF file graph structure - Actually get a CMap out of a PDF with pdftour - Implement stream object decompression in pdftour - Update pdftour comments a bit - Update pdftour comments and make it handle PDF comments - Clean up pdftour code slightly + compu@compu-HP-Pavilion-14-Notebook-PC:~/1zodparse$ git rebase --root HEAD + Successfully rebased and updated detached HEAD. + compu@compu-HP-Pavilion-14-Notebook-PC:~/1zodparse$ git shortlog + Kragen Javier Sitaker (10): + Add initial spike of parsing CMaps + Remove Python3.8 dependency from parsecmaps.py + Add readable debug display of grammars + Add PDF file traversal skeleton to parsecmaps + Enable parsecmaps to navigate PDF file graph structure + Actually get a CMap out of a PDF with pdftour + Implement stream object decompression in pdftour + Update pdftour comments a bit + Update pdftour comments and make it handle PDF comments + Clean up pdftour code slightly - compu@compu-HP-Pavilion-14-Notebook-PC:~/1zodparse$ git rebase --root master - Successfully rebased and updated refs/heads/master. - compu@compu-HP-Pavilion-14-Notebook-PC:~/1zodparse$ git shortlog - Kragen Javier Sitaker (10): - Add initial spike of parsing CMaps - Remove Python3.8 dependency from parsecmaps.py - Add readable debug display of grammars - Add PDF file traversal skeleton to parsecmaps - Enable parsecmaps to navigate PDF file graph structure - Actually get a CMap out of a PDF with pdftour - Implement stream object decompression in pdftour - Update pdftour comments a bit - Update pdftour comments and make it handle PDF comments - Clean up pdftour code slightly + compu@compu-HP-Pavilion-14-Notebook-PC:~/1zodparse$ git rebase --root master + Successfully rebased and updated refs/heads/master. + compu@compu-HP-Pavilion-14-Notebook-PC:~/1zodparse$ git shortlog + Kragen Javier Sitaker (10): + Add initial spike of parsing CMaps + Remove Python3.8 dependency from parsecmaps.py + Add readable debug display of grammars + Add PDF file traversal skeleton to parsecmaps + Enable parsecmaps to navigate PDF file graph structure + Actually get a CMap out of a PDF with pdftour + Implement stream object decompression in pdftour + Update pdftour comments a bit + Update pdftour comments and make it handle PDF comments + Clean up pdftour code slightly -This apparently changes GIT_COMMITTER_DATE but I don't care. - -* DONE make izodparse an installable Python package + This apparently changes GIT_COMMITTER_DATE but I don't care. +** TODO sanitize the history by removing this file? +This file has a bunch of stuff in it about my local machine config. +** TODO upload to gitlab +** DONE make izodparse an installable Python package Man, I forgot all about the distutils/setuptools mess. -* TODO examine example PDF file with compressed object streams and no fonts in page resource dictionaries -* TODO fix nested parentheses parsing -* TODO make xrefs, etc., lazy properties -* TODO use nested dicts for labeled properties of concatenations instead of tuples -* TODO make parse failures report where the problem is and what was expected -* TODO add pretty-printing of PDF dictionaries -* TODO examine text encodings in PDF files -* TODO add graphviz display, maybe inline in Jupyter -* TODO find XObjects with fonts +** DONE split out parsing-engine stuff from PDF/PS stuff +** TODO make xrefs, etc., lazy properties +** TODO examine example PDF file with compressed object streams and no fonts in page resource dictionaries [0/3] +*** TODO find it +*** TODO copy it +*** TODO see what its xrefs look like +** TODO fix nested parentheses parsing +** TODO use nested dicts for labeled properties of concatenations instead of tuples +** TODO make parse failures report where the problem is and what was expected +** TODO add pretty-printing of PDF dictionaries +** TODO examine text encodings in PDF files +** TODO add graphviz display, maybe inline in Jupyter +** TODO find XObjects with fonts