diff --git a/src/core/parser/PDFObjectParser.ts b/src/core/parser/PDFObjectParser.ts index c51b59fd4..6f84927cb 100644 --- a/src/core/parser/PDFObjectParser.ts +++ b/src/core/parser/PDFObjectParser.ts @@ -243,14 +243,25 @@ class PDFObjectParser extends BaseParser { protected findEndOfStreamFallback(startPos: Position) { // Move to end of stream, while handling nested streams + let acceptUnprefixedStream = true; let nestingLvl = 1; let end = this.bytes.offset(); while (!this.bytes.done()) { end = this.bytes.offset(); - if (this.matchKeyword(Keywords.stream)) { + if ( + this.matchKeyword(Keywords.embeddedStream1) || + this.matchKeyword(Keywords.embeddedStream2) || + this.matchKeyword(Keywords.embeddedStream3) || + this.matchKeyword(Keywords.embeddedStream4) || + this.matchKeyword(Keywords.embeddedStream5) || + this.matchKeyword(Keywords.embeddedStream6) || + this.matchKeyword(Keywords.embeddedStream7) || + (acceptUnprefixedStream && this.matchKeyword(Keywords.stream)) + ) { nestingLvl += 1; + acceptUnprefixedStream = true; } else if ( this.matchKeyword(Keywords.EOF1endstream) || this.matchKeyword(Keywords.EOF2endstream) || @@ -258,8 +269,10 @@ class PDFObjectParser extends BaseParser { this.matchKeyword(Keywords.endstream) ) { nestingLvl -= 1; + acceptUnprefixedStream = true; } else { this.bytes.next(); + acceptUnprefixedStream = false; } if (nestingLvl === 0) break; diff --git a/src/core/syntax/Keywords.ts b/src/core/syntax/Keywords.ts index 12e2f957c..5989af850 100644 --- a/src/core/syntax/Keywords.ts +++ b/src/core/syntax/Keywords.ts @@ -1,6 +1,14 @@ import CharCodes from 'src/core/syntax/CharCodes'; -const { Space, CarriageReturn, Newline } = CharCodes; +const { + Space, + CarriageReturn, + Newline, + Tab, + LessThan, + GreaterThan, + BackSlash, +} = CharCodes; const stream = [ CharCodes.s, @@ -76,6 +84,13 @@ export const Keywords = { streamEOF2: [...stream, CarriageReturn, Newline], streamEOF3: [...stream, CarriageReturn], streamEOF4: [...stream, Newline], + embeddedStream1: [Tab, ...stream], + embeddedStream2: [Space, ...stream], + embeddedStream3: [CarriageReturn, ...stream], + embeddedStream4: [Newline, ...stream], + embeddedStream5: [LessThan, ...stream], + embeddedStream6: [GreaterThan, ...stream], + embeddedStream7: [BackSlash, ...stream], endstream, EOF1endstream: [CarriageReturn, Newline, ...endstream], EOF2endstream: [CarriageReturn, ...endstream], diff --git a/tests/core/parser/PDFObjectParser.spec.ts b/tests/core/parser/PDFObjectParser.spec.ts index 576dfe14a..5c171ab09 100644 --- a/tests/core/parser/PDFObjectParser.spec.ts +++ b/tests/core/parser/PDFObjectParser.spec.ts @@ -576,6 +576,10 @@ describe(`PDFObjectParser`, () => { '<<>>\n\rstream\n\rthingz\n\rendstream', '<<\n/Length 8\n>>\nstream\n\rthingz\n\nendstream', ], + [ + '<<>>\n\rstream\n\rthingz bitstream\n\rendstream', + '<<\n/Length 18\n>>\nstream\n\rthingz bitstream\n\nendstream', + ], ].forEach(([input, output]) => { it(`can parse ${JSON.stringify(input)}`, () => { const object = parse(typedArrayFor(input));