diff --git a/pdf.c b/pdf.c
index 3c3a30c92198cfb34695c84edb65d1da7da676ea..602b2f427287a0ccd9eab1b256f9364d2650d23d 100644
--- a/pdf.c
+++ b/pdf.c
@@ -222,6 +222,217 @@ act_hupper(const HParseResult *p, void *u)
 	return H_MAKE_UINT(H_CAST_UINT(p->ast) - 'A');
 }
 
+/*
+ * Semantic action for a truncated ASCIIHexDecode tail: a single hex digit
+ * followed by the '>' EOD marker.  The digit becomes the high nibble of the
+ * resulting byte; the missing low nibble is zero.
+ */
+HParsedToken *
+act_ahextruncated(const HParseResult *p, void *u)
+{
+	uint8_t b = 0;
+	HCountedArray *seq = H_CAST_SEQ(p->ast);
+
+	/* Assumption: At this point seq->elements[0] is a hex digit
+	 * and seq->elements[1] holds '>' (EOD)
+	 */
+	// XXX figure out how to compare to '>'
+	assert(seq->used == 2);
+	b = H_CAST_UINT(seq->elements[0]) << 4;
+	return H_MAKE_UINT(b);
+}
+
+/* Semantic action for 'z', the shorthand for a group of four zero bytes. */
+HParsedToken *
+act_a85zero(const HParseResult *p, void *u)
+{
+	uint32_t b = 0;
+	return H_MAKE_UINT(b);
+}
+
+/* 2^32-1, the max value the group can hold as per spec */
+#define A85GRPMAX	4294967295u
+
+/*
+ * Partial-group tokens pack two fields into one uint: the group value in the
+ * low A85_NBYTES_SHIFT bits and the number of bytes to emit above them.  A
+ * group value is always below 85^5 < 2^33, so the fields cannot collide.
+ */
+#define A85_NBYTES_SHIFT	40
+#define A85_VALUE_MASK		(((uint64_t) 1 << A85_NBYTES_SHIFT) - 1)
+
+/*
+ * Place values of the five base-85 digits, most significant first.  Exact
+ * integers; a truncating cast of a floating-point pow() result is not
+ * guaranteed to be.
+ */
+static const uint64_t a85_weight[5] = {
+	52200625,	/* 85^4 */
+	614125,		/* 85^3 */
+	7225,		/* 85^2 */
+	85,		/* 85^1 */
+	1		/* 85^0 */
+};
+
+/*
+ * Value of the group whose first 'ndigits' digits are given by 'digits' (each
+ * a uint token in 0..84, as produced by act_a85digit).  The digits a partial
+ * group lacks are taken to be 'pad'.
+ */
+static uint64_t
+a85_groupvalue(HParsedToken **digits, size_t ndigits, uint8_t pad)
+{
+	uint64_t value = 0;
+
+	for (size_t i = 0; i < 5; i++)
+	{
+		uint64_t d = (i < ndigits) ? H_CAST_UINT(digits[i]) : pad;
+		value += d * a85_weight[i];
+	}
+	return value;
+}
+
+/* Semantic action mapping an ASCII85 character to its digit value. */
+HParsedToken *
+act_a85digit(const HParseResult *p, void *u)
+{
+	uint8_t b = H_CAST_UINT(p->ast);
+	b -= '!';
+
+	/* At this point we have the base85 value of one byte out of 5 */
+	return H_MAKE_UINT(b);
+}
+
+/*
+ * Semantic action for a complete group of five digits; yields its value.
+ * The value is deliberately not asserted to be in range: the digits come
+ * straight from the input, and rejecting a too-large group is the job of
+ * validate_a85fivedigits, which fails the parse instead of aborting.
+ */
+HParsedToken *
+act_a85fivedigits(const HParseResult *p, void *u)
+{
+	HCountedArray *seq = H_CAST_SEQ(p->ast);
+	HParsedToken **digits = h_seq_elements(p->ast);
+
+	/* Only for groups that do not need to be padded to 5 */
+	assert(seq->used == 5);
+	return H_MAKE_UINT(a85_groupvalue(digits, 5, 0));
+}
+
+/* Checking the following condition in the spec:
+ * The value represented by a group of 5 characters is greater than 2^32 - 1.
+ */
+bool
+validate_a85fivedigits(HParseResult *p, void *u)
+{
+	/* "s8W-!" should be the highest accepted value */
+	return H_CAST_UINT(p->ast) <= A85GRPMAX;
+}
+
+/* Semantic action splitting a group value into its four bytes, MSB first. */
+HParsedToken *
+act_a85group(const HParseResult *p, void *u)
+{
+	uint8_t *bytes = h_arena_malloc(p->arena, 4);
+	uint32_t fourbytes = H_CAST_UINT(p->ast);
+
+	bytes[0] = (fourbytes & 0xFF000000) >> 24;
+	bytes[1] = (fourbytes & 0x00FF0000) >> 16;
+	bytes[2] = (fourbytes & 0x0000FF00) >> 8;
+	bytes[3] = (fourbytes & 0x000000FF);
+
+	HParsedToken *b = H_MAKE_BYTES(bytes, 4);
+	return b;
+}
+
+/*
+ * Shared body of the act_a85partialNgroup actions.  A partial group of n
+ * digits stands for n-1 bytes.  The missing digits are padded with 84 ('u'),
+ * the largest digit: the encoder dropped the trailing digits, i.e. rounded
+ * the value down, so padding with 0 can leave the last kept byte one short
+ * (e.g. "/c" encodes the byte 0x2E, but zero padding decodes it as 0x2D).
+ */
+static HParsedToken *
+a85_partialvalue(const HParseResult *p, size_t ndigits)
+{
+	HCountedArray *seq = H_CAST_SEQ(p->ast);
+	HParsedToken **digits = h_seq_elements(p->ast);
+	uint64_t nbytes = ndigits - 1;
+
+	assert(seq->used == ndigits);
+	return H_MAKE_UINT((nbytes << A85_NBYTES_SHIFT) |
+	    a85_groupvalue(digits, ndigits, 84));
+}
+
+/* Shared body of the validate_a85partialNgroup validations (range check). */
+static bool
+a85_partialinrange(const HParseResult *p)
+{
+	return (H_CAST_UINT(p->ast) & A85_VALUE_MASK) <= A85GRPMAX;
+}
+
+HParsedToken *
+act_a85partial2group(const HParseResult *p, void *u)
+{
+	return a85_partialvalue(p, 2);
+}
+
+bool
+validate_a85partial2group(HParseResult *p, void *u)
+{
+	return a85_partialinrange(p);
+}
+
+HParsedToken *
+act_a85partial3group(const HParseResult *p, void *u)
+{
+	return a85_partialvalue(p, 3);
+}
+
+bool
+validate_a85partial3group(HParseResult *p, void *u)
+{
+	return a85_partialinrange(p);
+}
+
+HParsedToken *
+act_a85partial4group(const HParseResult *p, void *u)
+{
+	return a85_partialvalue(p, 4);
+}
+
+bool
+validate_a85partial4group(HParseResult *p, void *u)
+{
+	return a85_partialinrange(p);
+}
+
+/*
+ * Semantic action for the final, partial group: unpacks the token built by
+ * one of the act_a85partialNgroup actions and emits the leading bytes of
+ * the group value, MSB first.
+ */
+HParsedToken *
+act_a85partialgroup(const HParseResult *p, void *u)
+{
+	uint64_t packed = H_CAST_UINT(p->ast);
+	size_t bytes_used = packed >> A85_NBYTES_SHIFT;
+	uint32_t fourbytes = packed & A85_VALUE_MASK;
+	uint8_t *bytes;
+
+	assert(bytes_used >= 1 && bytes_used <= 3);
+	bytes = h_arena_malloc(p->arena, bytes_used);
+	for (size_t i = 0; i < bytes_used; i++)
+	{
+		/* Shift by 24, 16 and 8 to get the correct byte */
+		bytes[i] = (fourbytes >> ((3-i) * 8)) & 0xFF;
+	}
+	return H_MAKE_BYTES(bytes, bytes_used);
+}
+
+// TODO: flatten sequence in a85string semantic action
+
 HParsedToken *
 act_nat(const HParseResult *p, void *u)
 {
@@ -468,6 +679,8 @@ HParser *p_pdfdbg;
 HParser *p_startxref;
 HParser *p_xref;
 HParser *p_objdef;
+HParser *p_a85string;
+HParser *p_ahexstream;
 
 /* continuations for h_bind() */
 HParser *kstream(HAllocator *, const HParsedToken *, void *);
@@ -718,6 +931,40 @@ init_parser(struct Env *aux)
 	/* debug parser to consume as much as possible */
 	H_RULE(pdfdbg, SEQ(header, h_many(tail), body, OPT(xr_td), OPT(startxr)));
 
+	/* Parser for Ascii85Decode */
+	H_RULE(a85eod, SEQ(h_ch('~'), OPT(h_many(lwchar)), h_ch('>')));
+	H_ARULE(a85zero, h_ch('z'));
+	H_ARULE(a85digit, h_ch_range('!', 'u'));
+
+	/* Line whitespace can occur between any digit and has to be ignored, */
+	/* Comments are not allowed inside streams, and % character should cause
+	 * a parse error. */
+	/* NOTE(review): h_many(CHX(lws, X)) accepts any run of lws and X, so one
+	 * MANY_LWS(a85digit) may take several digits; confirm h_repeat_n(..., 5)
+	 * really yields the five digit tokens the act_a85* asserts expect. */
+	#define MANY_LWS(X) h_many(CHX(lws, X))
+
+	/* This encoding of zero is not allowed */
+	H_RULE(a85fiveexcl, h_repeat_n(MANY_LWS(h_ch('!')), 5));
+	H_VARULE(a85fivedigits, SEQ(h_and(h_not(a85fiveexcl)), h_repeat_n(MANY_LWS(a85digit), 5)));
+	H_ARULE(a85group, CHX(a85zero, a85fivedigits));
+
+	H_VARULE(a85partial2group, h_repeat_n(MANY_LWS(a85digit), 2));
+	H_VARULE(a85partial3group, h_repeat_n(MANY_LWS(a85digit), 3));
+	H_VARULE(a85partial4group, h_repeat_n(MANY_LWS(a85digit), 4));
+	/* longest alternative first */
+	H_ARULE(a85partialgroup, CHX(a85partial4group, a85partial3group, a85partial2group));
+
+	H_RULE(a85string, SEQ(h_many(a85group), OPT(a85partialgroup), IGN(a85eod)));
+
+	/* AsciiHexDecode parser */
+	H_RULE(ahexeod, h_ch('>'));
+	H_RULE(hdigitpair, SEQ(IGN(OPT(h_many(lwchar))), hdigit, IGN(OPT(h_many(lwchar))), hdigit));
+	H_ARULE(ahextruncated, SEQ(IGN(OPT(h_many(lwchar))), hdigit, IGN(OPT(h_many(lwchar))), ahexeod));
+
+	H_RULE(hs_end, CHX(hdigitpair, ahextruncated));
+	H_RULE(hexstream, SEQ(h_many(hdigitpair), hs_end));
+
 	/* global parser variables */
 	p_pdf = pdf;
 
@@ -725,6 +972,8 @@ init_parser(struct Env *aux)
 	p_startxref = startxr;
 	p_xref = CHX(xr_td, xrstm);
 	p_objdef = objdef;
+	p_a85string = a85string;
+	p_ahexstream = hexstream;
 	p_fail = h_nothing_p();
 	p_epsilon = epsilon;
 
@@ -1096,6 +1345,54 @@ FlateDecode(const Dict *parms, HBytes b, HParser *p)
 	return res;
 }
 
+/*
+ * Decodes ASCII hexadecimal data into binary data.
+ * parms should be empty, because the filter has no parameters
+ *
+ * Returns the result of parsing 'b' with p_ahexstream, or NULL (after
+ * printing a message to stderr) on parse error.
+ * NOTE(review): 'p' is never applied to the decoded data - confirm callers
+ * do not expect that.
+ */
+HParseResult *
+ASCIIHexDecode(const Dict *parms, HBytes b, HParser *p)
+{
+	HParseResult *res;
+
+	res = h_parse(p_ahexstream, b.token, b.len);
+	if(!res)
+	{
+		fprintf(stderr, "parse error in ASCIIHexDecode filter\n");
+		return NULL;
+	}
+
+	return res;
+}
+
+/*
+ * Decodes ASCII base-85 encoded data and produces binary data.
+ * parms should be empty, because the filter has no parameters
+ *
+ * Returns the result of parsing 'b' with p_a85string, or NULL (after
+ * printing a message to stderr) on parse error.
+ * NOTE(review): 'p' is never applied to the decoded data - confirm callers
+ * do not expect that.
+ */
+HParseResult *
+ASCII85Decode(const Dict *parms, HBytes b, HParser *p)
+{
+	HParseResult *res;
+
+	res = h_parse(p_a85string, b.token, b.len);
+	if(!res)
+	{
+		fprintf(stderr, "parse error in ASCII85Decode filter\n");
+		return NULL;
+	}
+
+	return res;
+}
+
 /*
  * decode the bytes in 'b' according to metadata in the stream dictionary 'd'
  * and parse the result with 'p'.
@@ -1119,11 +1416,15 @@ decode_stream(const Dict *d, HBytes b, HParser *p)
 		return NULL; // XXX filter chains not supported, yet
 	assert(v->token_type == TT_BYTES);
 	if (bytes_eq(v->bytes, "FlateDecode"))
-		filter = FlateDecode;
+		filter = FlateDecode; // XXX add ASCIIHexDecode and ASCII85Decode here
+	else if (bytes_eq(v->bytes, "ASCIIHexDecode"))
+		filter = ASCIIHexDecode;
+	else if (bytes_eq(v->bytes, "ASCII85Decode"))
+		filter = ASCII85Decode;
 	else
 		return NULL; /* filter not supported */
 
-	v = dictentry(d, "DecodeParms");
+	v = dictentry(d, "DecodeParms"); // XXX ASCII filters don't use DecodeParms
 	if (v && v->token_type == TT_Dict)
 		parms = v->user;
 