diff --git a/pdf.c b/pdf.c index 44dcb83138f2080480193d43eac6de5ed6e70516..1e55c3fcdc5d5fd8440bfd9084ab6353777e970f 100644 --- a/pdf.c +++ b/pdf.c @@ -436,7 +436,7 @@ HParsedToken * act_a85partial2group(const HParseResult *p, void *u) { // only a single character is necessary - // XXX: use H_MAKE? + // XXX: use H_ALLOC? uint8_t *bytes = h_arena_malloc(p->arena, sizeof(uint8_t)); uint32_t fourbytes = 0; @@ -450,12 +450,13 @@ act_a85partial2group(const HParseResult *p, void *u) fprintf(stdout, "act_a85partial2group: i = %d, digit = %2lx, fourbytes = %4x (%d)\n", i, H_CAST_UINT(digits[i]->seq->elements[0]), fourbytes, fourbytes); // XXX DEBUG } + // fill the other bytes with u (dec 117-33) + // XXX I'm confused. I think to restore the padding, '!' needs to be added instead of 'u' for (int i=2; i<5; i++) { fourbytes = fourbytes * 85 + 84; } - // truncate and return only the high order byte bytes[0] = (fourbytes & 0xFF000000) >> 24; fprintf(stdout, "act_a85partial2group: %4x (%d) ==> %2x \n", fourbytes, fourbytes, bytes[0]); // XXX DEBUG @@ -482,7 +483,7 @@ act_a85partial3group(const HParseResult *p, void *u) uint8_t *bytes = h_arena_malloc(p->arena, sizeof(uint8_t) * 2); for (int i=0; i<4; i++) { - assert(digits[i]->token_type == TT_SEQUENCE); + assert(digits[i]->token_type == TT_SEQUENCE); // XXX assert fails on test input A85(Hello, World!a) -> 87cURD_*#4DfTZ)+^k~> fourbytes = fourbytes * 85 + H_CAST_UINT(digits[i]->seq->elements[0]); } @@ -492,12 +493,13 @@ act_a85partial3group(const HParseResult *p, void *u) } // truncate and return only the high order bytes + // I think groups of 3 A85 characters decode to 2 bytes bytes[0] = (fourbytes & 0xFF000000) >> 24; bytes[1] = (fourbytes & 0x00FF0000) >> 16; bytes[2] = (fourbytes & 0x0000FF00) >> 8; - fprintf(stdout, "act_a85partial2group: %4x (%d) ==> %2x %2x %2x \n", fourbytes, fourbytes, bytes[0], - bytes[1], bytes[2]); + fprintf(stdout, "act_a85partial3group: %4x (%d) ==> %2x %2x %2x \n", fourbytes, fourbytes, bytes[0], + bytes[1], bytes[2]); // XXX DEBUG return H_MAKE_BYTES(bytes, 3); // XXX test: should this be 2? } @@ -527,8 +529,9 @@ act_a85partial4group(const HParseResult *p, void *u) // truncate and return only the high order bytes bytes[0] = (fourbytes & 0xFF000000) >> 24; bytes[1] = (fourbytes & 0x00FF0000) >> 16; + bytes[2] = (fourbytes & 0x0000FF00) >> 8; - fprintf(stdout, "act_a85partial2group: %4x (%d) ==> %2x %2x \n", fourbytes, fourbytes, bytes[0], bytes[1]); + fprintf(stdout, "act_a85partial4group: %4x (%d) ==> %2x %2x \n", fourbytes, fourbytes, bytes[0], bytes[1], bytes[2]); // XXX debug return H_MAKE_BYTES(bytes, 3); } @@ -1253,8 +1256,6 @@ init_parser(struct Env *aux) */ /* Whitespace can occur between any digit and has to be ignored, */ - /* Comments are not allowed inside streams, and % character should cause - * a parse error. */ H_RULE(aws, IGN(h_many(wchar))); // all white space, include CR & LF, but not comments #define MANY_AWS(X) h_many(CHX(aws, X)) @@ -1265,8 +1266,6 @@ init_parser(struct Env *aux) H_ARULE(a85digit, h_ch_range('!', 'u')); /* Line whitespace can occur between any digit and has to be ignored, */ - /* Comments are not allowed inside streams, and % character should cause - * a parse error. */ #define MANY_LWS(X) h_many(CHX(lws, X)) /* This encoding of zero is not allowed */ // H_RULE(a85fiveexcl, h_repeat_n(SEQ(h_ch('!'), aws), 5)); // seeing this is a violation @@ -1291,7 +1290,16 @@ init_parser(struct Env *aux) * Not sure whether comments can be embedded within content streams * If not, use the rule aws rather than ws */ - // XXX TODO: probably not, but check the standard again + /* + * It seems somewhat unclear. ASCII85Decode definitely can't have + * comments, because % can be part of a valid ASCII85Encoded character. + * However, it seems that comments are generally allowed: + * "Any occurrence of the PERCENT SIGN (25h) outside a string or + * inside a content stream (see 7.8.2, "Content streams") introduces + * a comment." ISO32000:2-2017 7.2.4 + */ + // XXX Ask Peter Wyatt + /* AsciiHexDecode */ H_RULE(ahexeod, h_ch('>')); @@ -2099,7 +2107,7 @@ p_take__m(HAllocator *mm__, size_t n, struct Env *aux) HParser *p_xrefdata__m(HAllocator *, const Dict *); HParser *p_objstm__m(HAllocator *, const Dict *); - +HParser *p_raw_test__m(HAllocator *, const Dict *); /* * Look into the dictionary associated with the stream to see if there is data @@ -2112,7 +2120,8 @@ p_stream_data__m(HAllocator *mm__, const Dict *dict) v = dictentry(dict, "Type"); if (v == NULL || v->token_type != TT_BYTES) // XXX -> custom type - return NULL; /* no /Type field */ + return p_raw_test__m(mm__, dict); /* no /Type field */ + // XXX restore return NULL above /* interpret known stream types */ if (bytes_eq(v->bytes, "XRef")) @@ -2401,6 +2410,25 @@ p_objstm__m(HAllocator *mm__, const Dict *dict) // XXX consistency-check against /First, idx, /N } +HParser* + +/* Debug parser to test encoded text streams */ +// XXX remove + +p_raw_test__m(HAllocator *mm__, const Dict *dict) +{ + /*const HParsedToken *v; + size_t N; + + v = dictentry(dict, "Length"); + if (v == NULL || v->token_type != TT_SINT || v->sint < 0 || + (uint64_t)v->sint > SIZE_MAX) { + fprintf(stderr, "missing /Length on object stream\n"); + return p_fail; + }*/ + return h_sequence__m(mm__, h_many__m(mm__, h_uint8__m(mm__)), NULL); +} + /* * This continuation is very similar to kstream, except that it does not * rely on /Length to consume the right amount of input. If /Length is