From 4efdd298fdedb4e724dbeff6e73498141af2a8a4 Mon Sep 17 00:00:00 2001 From: Ben Callaghan Date: Wed, 24 Jul 2019 16:45:18 -0600 Subject: [PATCH 1/3] Verify stream length when reading --- src/PdfSharp/Pdf.IO/Lexer.cs | 70 +++++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/src/PdfSharp/Pdf.IO/Lexer.cs b/src/PdfSharp/Pdf.IO/Lexer.cs index 5bff4193..b7abe0e5 100644 --- a/src/PdfSharp/Pdf.IO/Lexer.cs +++ b/src/PdfSharp/Pdf.IO/Lexer.cs @@ -190,7 +190,57 @@ public byte[] ReadStream(int length) else pos = _idxChar + 1; - _pdfSteam.Position = pos; + // Producer: + // Problem: Incorrect stream length + // Fix: Find the endstream keyword and measure the length + // https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf 7.3.8 + + // Producer: + // Problem: Not all pdf producers add a eol marker before endstream + // Fix: double check for endstream without the eol marker + // https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf 7.3.8 + + // Producer: + // Problem: Some pdf producers replace the eol marker with a carriage return + // Fix: double check for endstream without the eol marker + // https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf 7.3.8 + + // Verify stream length and resolve if bad + string nendstream = "\nendstream"; + string rendstream = "\rendstream"; + string rnendstream = "\r\nendstream"; + string endstream = "endstream"; + + string postStream = ReadRawString(pos + length, rnendstream.Length); + + bool bValid = postStream.StartsWith(nendstream) || + postStream.StartsWith(rendstream) || + postStream.StartsWith(rnendstream) || + postStream.StartsWith(endstream); // Not all pdf producers add a eol marker before endstream + + if (!bValid) + { + // find the first endstream occurrence + // first check to see if it is within the specified stream length. 
+ int idxOffset = IndexOfEndStream(postStream); + if (idxOffset != -1) + { + length = length + idxOffset; + } + + if (idxOffset == -1) + { + // TODO:: read in chunks + postStream = ReadRawString(pos, _pdfLength - pos); + idxOffset = IndexOfEndStream(postStream); + if (idxOffset != -1) + { + length = idxOffset; + } + } + } + + _pdfSteam.Position = pos; byte[] bytes = new byte[length]; int read = _pdfSteam.Read(bytes, 0, length); Debug.Assert(read == length); @@ -205,6 +255,24 @@ public byte[] ReadStream(int length) return bytes; } + private static readonly string[] endstreamValues = { "\r\nendstream", "\nendstream", "\rendstream", "endstream" }; + private int IndexOfEndStream(string val) + { + // Returns the smallest index in val at which any endstream marker starts, or -1 if none occurs; a miss (IndexOf == -1) must never overwrite an earlier hit. + int offset = -1; + + foreach (var es in endstreamValues) + { + int o = val.IndexOf(es, StringComparison.Ordinal); + if (o != -1 && (offset == -1 || o < offset)) + { + offset = o; + } + } + + return offset; + } + /// /// Reads a string in raw encoding. /// From b6f478acb0ebf154be463bd63459c8d8bb79a91a Mon Sep 17 00:00:00 2001 From: Ben Callaghan Date: Wed, 24 Jul 2019 16:48:46 -0600 Subject: [PATCH 2/3] Simplified ReadStream --- src/PdfSharp/Pdf.IO/Parser.cs | 35 +---------------------------------- 1 file changed, 1 insertion(+), 34 deletions(-) diff --git a/src/PdfSharp/Pdf.IO/Parser.cs b/src/PdfSharp/Pdf.IO/Parser.cs index 07b353a9..ef63cfa5 100644 --- a/src/PdfSharp/Pdf.IO/Parser.cs +++ b/src/PdfSharp/Pdf.IO/Parser.cs @@ -266,41 +266,8 @@ public PdfObject ReadObject(PdfObject pdfObject, PdfObjectID objectID, bool incl { PdfDictionary dict = (PdfDictionary)pdfObject; Debug.Assert(checkForStream, "Unexpected stream..."); -#if true_ ReadStream(dict); -#else - int length = GetStreamLength(dict); - byte[] bytes = _lexer.ReadStream(length); -#if true_ - if (dict.Elements.GetString("/Filter") == "/FlateDecode") - { - if (dict.Elements["/Subtype"] == null) - { - try - { - byte[] decoded = Filtering.FlateDecode.Decode(bytes); - if (decoded.Length == 0) - goto End; - string pageContent 
= Filtering.FlateDecode.DecodeToString(bytes); - if (pageContent.Length > 100) - pageContent = pageContent.Substring(pageContent.Length - 100); - pageContent.GetType(); - bytes = decoded; - dict.Elements.Remove("/Filter"); - dict.Elements.SetInteger("/Length", bytes.Length); - } - catch - { - } - } - End: ; - } -#endif - PdfDictionary.PdfStream stream = new PdfDictionary.PdfStream(bytes, dict); - dict.Stream = stream; - ReadSymbol(Symbol.EndStream); - symbol = ScanNextToken(); -#endif + symbol = _lexer.Symbol; } if (!fromObjecStream && symbol != Symbol.EndObj) ParserDiagnostics.ThrowParserException(PSSR.UnexpectedToken(_lexer.Token)); From 7c662b28dcd8cd789b90d6a7e1ccd5f13b6c9c89 Mon Sep 17 00:00:00 2001 From: Ben Callaghan Date: Wed, 24 Jul 2019 17:16:32 -0600 Subject: [PATCH 3/3] Automatically fix incorrect stream length --- src/PdfSharp/Pdf.IO/Parser.cs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/PdfSharp/Pdf.IO/Parser.cs b/src/PdfSharp/Pdf.IO/Parser.cs index ef63cfa5..cb3af9a4 100644 --- a/src/PdfSharp/Pdf.IO/Parser.cs +++ b/src/PdfSharp/Pdf.IO/Parser.cs @@ -283,8 +283,16 @@ private void ReadStream(PdfDictionary dict) { Symbol symbol = _lexer.Symbol; Debug.Assert(symbol == Symbol.BeginStream); + int length = GetStreamLength(dict); byte[] bytes = _lexer.ReadStream(length); + + if (bytes.Length != length) + { + // The file is corrupted, but still readable. + dict.Elements["/Length"] = new PdfInteger(bytes.Length); + } + PdfDictionary.PdfStream stream = new PdfDictionary.PdfStream(bytes, dict); Debug.Assert(dict.Stream == null, "Dictionary already has a stream."); dict.Stream = stream;