empira · bcallaghan-et · Jul 24, 2019 · Jul 24, 2019 · Jul 24, 2019
diff --git a/src/PdfSharp/Pdf.IO/Lexer.cs b/src/PdfSharp/Pdf.IO/Lexer.cs
@@ -190,7 +190,57 @@ public byte[] ReadStream(int length)
             else
                 pos = _idxChar + 1;
 
-            _pdfSteam.Position = pos;
+            // Producer: 
+            // Problem: Incorrect stream length
+            // Fix: Find the endstream keyword and measure the length
+            // https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf 7.3.8
+
+            // Producer: 
+            // Problem: Not all pdf producers add a eol marker before endstream
+            // Fix: double check for endstream without the eol marker
+            // https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf 7.3.8
+
+            // Producer: 
+            // Problem: Some PDF producers end the stream data with a lone carriage return (CR) instead of LF or CRLF
+            // Fix: also accept endstream when it is preceded by a single carriage return
+            // https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf 7.3.8
+
+            // Verify stream length and resolve if bad
+            string nendstream = "\nendstream";
+            string rendstream = "\rendstream";
+            string rnendstream = "\r\nendstream";
+            string endstream = "endstream";
+
+			      string postStream = ReadRawString(pos + length, rnendstream.Length);
+
+            bool bValid = postStream.StartsWith(nendstream) ||
+                          postStream.StartsWith(rendstream) ||
+                          postStream.StartsWith(rnendstream) ||
+                          postStream.StartsWith(endstream); // Not all pdf producers add a eol marker before endstream
+
+            if (!bValid)
+            {
+				        // find the first endstream occurrence
+				        // first check to see if it is within the specified stream length.
+                int idxOffset = IndexOfEndStream(postStream);
+                if (idxOffset != -1)
+                {
+                    length = length + idxOffset;
+                }
+
+                if (idxOffset == -1)
+				        {
+                    // TODO:: read in chunks
+					          postStream = ReadRawString(pos, _pdfLength - pos);
+                    idxOffset = IndexOfEndStream(postStream);
+                    if (idxOffset != -1)
+                    {
+                        length = idxOffset;
+                    }
+                }
+			      }
+
+			      _pdfSteam.Position = pos;
             byte[] bytes = new byte[length];
             int read = _pdfSteam.Read(bytes, 0, length);
             Debug.Assert(read == length);
@@ -205,6 +255,24 @@ public byte[] ReadStream(int length)
             return bytes;
         }
 
+        private static readonly string[] endstreamValues = { "\nendstream", "\rendstream", "endstream" };
+        /// <summary>Returns the smallest index in val at which an endstream marker starts, or -1 if there is none.</summary>
+        private int IndexOfEndStream(string val)
+        {
+            int offset = -1;
+            foreach (var es in endstreamValues)
+            {
+                int o = val.IndexOf(es, StringComparison.Ordinal);
+                // Ignore markers that do not occur: IndexOf yields -1 for them, and -1 must
+                // never replace a real match just because it compares less than a valid index.
+                if (o != -1 && (offset == -1 || o < offset))
+                {
+                    offset = o;
+                }
+            }
+            return offset;
+        }
+
         /// <summary>
         /// Reads a string in raw encoding.
         /// </summary>

diff --git a/src/PdfSharp/Pdf.IO/Parser.cs b/src/PdfSharp/Pdf.IO/Parser.cs
@@ -266,41 +266,8 @@ public PdfObject ReadObject(PdfObject pdfObject, PdfObjectID objectID, bool incl
             {
                 PdfDictionary dict = (PdfDictionary)pdfObject;
                 Debug.Assert(checkForStream, "Unexpected stream...");
-#if true_
                 ReadStream(dict);
-#else
-                int length = GetStreamLength(dict);
-                byte[] bytes = _lexer.ReadStream(length);
-#if true_
-                if (dict.Elements.GetString("/Filter") == "/FlateDecode")
-                {
-                    if (dict.Elements["/Subtype"] == null)
-                    {
-                        try
-                        {
-                            byte[] decoded = Filtering.FlateDecode.Decode(bytes);
-                            if (decoded.Length == 0)
-                                goto End;
-                            string pageContent = Filtering.FlateDecode.DecodeToString(bytes);
-                            if (pageContent.Length > 100)
-                                pageContent = pageContent.Substring(pageContent.Length - 100);
-                            pageContent.GetType();
-                            bytes = decoded;
-                            dict.Elements.Remove("/Filter");
-                            dict.Elements.SetInteger("/Length", bytes.Length);
-                        }
-                        catch
-                        {
-                        }
-                    }
-                End: ;
-                }
-#endif
-                PdfDictionary.PdfStream stream = new PdfDictionary.PdfStream(bytes, dict);
-                dict.Stream = stream;
-                ReadSymbol(Symbol.EndStream);
-                symbol = ScanNextToken();
-#endif
+                symbol = _lexer.Symbol;
             }
             if (!fromObjecStream && symbol != Symbol.EndObj)
                 ParserDiagnostics.ThrowParserException(PSSR.UnexpectedToken(_lexer.Token));
@@ -316,8 +283,16 @@ private void ReadStream(PdfDictionary dict)
         {
             Symbol symbol = _lexer.Symbol;
             Debug.Assert(symbol == Symbol.BeginStream);
+
             int length = GetStreamLength(dict);
             byte[] bytes = _lexer.ReadStream(length);
+
+            if (bytes.Length != length)
+            {
+                // The file is corrupted, but still readable.
+                dict.Elements["/Length"] = new PdfInteger(bytes.Length);
+            }
+
             PdfDictionary.PdfStream stream = new PdfDictionary.PdfStream(bytes, dict);
             Debug.Assert(dict.Stream == null, "Dictionary already has a stream.");
             dict.Stream = stream;