From fb03e32475eb0bfd4ea8440a3fc7c81332bc721b Mon Sep 17 00:00:00 2001
From: Thomas Terhaar <uncletom@gmx.de>
Date: Fri, 16 Aug 2024 22:06:54 +0200
Subject: [PATCH 1/4] Improve resilience against malformed or corrupt documents

---
 .../src/PdfSharp/Pdf.Advanced/PdfTrailer.cs   | 140 ++++++++++++++++++
 .../src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs |  32 ++--
 .../PDFsharp/src/PdfSharp/Pdf.IO/Parser.cs    |   8 +-
 .../PDFsharp/src/PdfSharp/Pdf.IO/PdfReader.cs |   9 +-
 .../PDFsharp/src/PdfSharp/Pdf/PdfDocument.cs  |   2 +-
 .../PDFsharp/src/PdfSharp/Pdf/PdfString.cs    |   4 +-
 6 files changed, 173 insertions(+), 22 deletions(-)
diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs
index 0cf6157a..9658c23c 100644
--- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs
+++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs
@@ -4,6 +4,10 @@
 using PdfSharp.Pdf.IO;
 using PdfSharp.Pdf.Security;
 using PdfSharp.Pdf.Internal;
+using System.Text.RegularExpressions;
+using System.Text;
+using PdfSharp.Logging;
+using Microsoft.Extensions.Logging;
 
 namespace PdfSharp.Pdf.Advanced
 {
@@ -215,6 +219,168 @@ internal void Finish()
             _document.IrefTable.IsUnderConstruction = false;
         }
 
+        /// <summary>
+        /// Attempts to rebuild the trailer and the iref-table by scanning the whole file for object
+        /// headers. Used as a fallback when the original trailer or cross-reference data is corrupt.
+        /// </summary>
+        /// <param name="document">The document being read; its lexer must already be set.</param>
+        /// <param name="stream">The seekable stream that contains the PDF file.</param>
+        /// <param name="parser">The parser used to read the trailer dictionary and the found objects.</param>
+        /// <returns>The rebuilt trailer with its /Root and /Size entries set.</returns>
+        /// <exception cref="PdfReaderException">The trailer could not be rebuilt.</exception>
+        internal static PdfTrailer Rebuild(PdfDocument document, Stream stream, Parser parser)
+        {
+            if (document == null)
+                throw new ArgumentNullException(nameof(document));
+            if (stream == null)
+                throw new ArgumentNullException(nameof(stream));
+            if (parser == null)
+                throw new ArgumentNullException(nameof(parser));
+            if (document._lexer == null)
+                throw new InvalidOperationException("Document must have a lexer set");
+
+            PdfSharpLogHost.PdfReadingLogger.LogInformation("Attempt to rebuild trailer...");
+
+            // This is a one-time fallback, so the expressions are created inline for clarity instead of
+            // being cached in static fields (or generated by source generators on newer .NET versions).
+
+            // Start of an object, e.g. "1 0 obj".
+            var rxObjectStart = new Regex("\\b(?<num>\\d+)\\s+(?<gen>\\d+)\\s+obj\\b");
+            // Start of a trailer, e.g. "trailer <<".
+            var rxTrailerStart = new Regex("\\btrailer\\s*<<");
+
+            // Bytes re-read at a chunk boundary when a chunk had no match, so that a header split by the
+            // boundary is seen as a whole. "2147483647 65535 obj" (20 bytes) is the longest regular header.
+            const int chunkOverlap = 32;
+
+            var irefTable = new PdfCrossReferenceTable(document);
+            var trailerStart = 0L;
+            try
+            {
+                // Scan the whole file and collect the positions of all object headers.
+                stream.Position = 0;
+                var buffer = new byte[4096];
+                while (stream.Position < stream.Length)
+                {
+                    var bufStart = stream.Position;
+                    var readLength = stream.Read(buffer, 0, buffer.Length);
+                    if (readLength <= 0)
+                        break;  // The stream delivers no more data; never spin here.
+
+                    // ASCII decoding yields one char per byte, so match indices are byte offsets.
+                    var readString = Encoding.ASCII.GetString(buffer, 0, readLength);
+                    var bufEnd = bufStart + readLength;
+                    var resumePos = -1L;    // Position directly behind the last match in this chunk.
+
+                    foreach (Match match in rxObjectStart.Matches(readString))
+                    {
+                        // Digit runs in binary data can exceed the int range; such a match is no object header.
+                        if (!int.TryParse(match.Groups["num"].Value, out var objNumber)
+                            || !int.TryParse(match.Groups["gen"].Value, out var generationNumber))
+                            continue;
+
+                        var objPosition = bufStart + match.Index;
+                        var objId = new PdfObjectID(objNumber, generationNumber);
+                        var existingObj = irefTable[objId];
+                        if (existingObj != null)
+                            // Always use the object found later in the file.
+                            // This handles newer objects written by incremental updates.
+                            existingObj.Position = objPosition;
+                        else
+                            // Keep the full position; narrowing it to int would corrupt offsets beyond 2 GiB.
+                            irefTable.Add(new PdfReference(objId, objPosition));
+                        resumePos = objPosition + match.Length;
+                    }
+
+                    foreach (Match match in rxTrailerStart.Matches(readString))
+                    {
+                        // If the trailer is found multiple times, the last one wins (conforms to spec).
+                        trailerStart = bufStart + match.Index;
+                        resumePos = Math.Max(resumePos, trailerStart + match.Length);
+                    }
+
+                    if (bufEnd >= stream.Length)
+                        break;
+                    // Continue behind the last match, or re-read a small overlap if nothing was found.
+                    // Math.Max guarantees progress even after a very short read.
+                    stream.Position = resumePos >= 0
+                        ? resumePos
+                        : Math.Max(bufStart + 1, bufEnd - chunkOverlap);
+                }
+                document.IrefTable = irefTable;
+                irefTable.IsUnderConstruction = true;
+
+                var allRefs = irefTable.AllReferences;
+                if (!allRefs.Any())
+                    throw new PdfReaderException(
+                        "Unable to rebuild trailer and iref-table, no objects found. The pdf is corrupt");
+
+                var trailer = new PdfTrailer(document);
+                if (trailerStart > 0L)
+                {
+                    try
+                    {
+                        // Read the entries of the trailer dictionary.
+                        stream.Position = trailerStart;
+                        document._lexer.Position = trailerStart;
+                        parser.ReadSymbol(Symbol.Trailer);
+                        parser.ReadSymbol(Symbol.BeginDictionary);
+                        parser.ReadDictionary(trailer, false);
+                        // TODO: what about /Prev entry ? these may also be corrupt (need a file to verify)
+                        // in theory, this can be ignored, because we already have read ALL objects
+                    }
+                    catch (Exception ex)
+                    {
+                        // A damaged trailer dictionary is not fatal: start over with an empty trailer and
+                        // let the catalog search below supply the /Root entry.
+                        PdfSharpLogHost.PdfReadingLogger.LogWarning(ex, "Trailer dictionary is unreadable, searching for the catalog instead.");
+                        trailer = new PdfTrailer(document);
+                    }
+                }
+                if (!trailer.Elements.ContainsKey(Keys.Root))
+                {
+                    // cases:
+                    // 1. no trailer found (maybe cut off at end of file)
+                    // 2. trailer is corrupt (found one with just a single /Size entry, /Catalog was missing)
+                    // read all found objects searching for the catalog (/Root entry)
+                    foreach (var objRef in allRefs)
+                    {
+                        try
+                        {
+                            parser.MoveToObject(objRef.ObjectID);
+                            var obj = parser.ReadIndirectObject(objRef);
+                            // Ensure we use a valid catalog (we may find multiple).
+                            if (obj is PdfDictionary dict
+                                && dict.Elements.GetName(PdfCatalog.Keys.Type) == "/Catalog"
+                                && dict.Elements.ContainsKey(PdfCatalog.Keys.Pages))
+                            {
+                                trailer.Elements[Keys.Root] = dict.Reference;
+                            }
+                        }
+                        catch (Exception ex)
+                        {
+                            // The scan may have picked up bogus headers inside binary data; one unreadable
+                            // object must not abort the search for the catalog.
+                            PdfSharpLogHost.PdfReadingLogger.LogDebug(ex, "Skipping unreadable object {id} while searching for the catalog.", objRef.ObjectID);
+                        }
+                    }
+                }
+                // still no catalog ? then throw
+                if (!trailer.Elements.ContainsKey(Keys.Root))
+                    throw new PdfReaderException(
+                        "Unable to rebuild trailer and iref-table, catalog dictionary not found. The pdf is corrupt");
+
+                var largestObjectNumber = allRefs.Max(x => x.ObjectID.ObjectNumber);
+                trailer.Elements.SetInteger(Keys.Size, largestObjectNumber + 1);
+                PdfSharpLogHost.PdfReadingLogger.LogInformation("Trailer was rebuild with {count} found objects", irefTable.AllObjectIDs.Length);
+                return trailer;
+            }
+            catch (Exception ex) when (ex is not PdfReaderException)
+            {
+                // Specific PdfReaderExceptions thrown above pass through unchanged; anything else is wrapped.
+                throw new PdfReaderException("Unable to rebuild trailer and iref-table, pdf is corrupt", ex);
+            }
+        }
+
         /// <summary>
         /// Predefined keys of this dictionary.
         /// </summary>
diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs
index 6935f61d..fe740896 100644
--- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs
+++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs
@@ -147,9 +147,9 @@ public Symbol ScanNextToken(bool testForObjectReference)
                     return Symbol = Symbol.Eof;
 
                 default:
-                    Debug.Assert(!Char.IsLetter(ch), "PDFsharp did something wrong. See code below.");
-                    ParserDiagnostics.HandleUnexpectedCharacter(ch, DumpNeighborhoodOfPosition());
-                    return Symbol = Symbol.None;
+                    // just skip over unexpected character
+                    ScanNextChar(true);
+                    goto TryAgain;
             }
         }
 
@@ -855,20 +855,22 @@ public int DetermineStreamLength(SizeType start, int searchLength, SuppressExcep
             if (start == 144848)
                 _ = sizeof(int);
 #endif
-            var rawString = RandomReadRawString(start, searchLength);
-
-            // When we come here, we have either an invalid or no \Length entry.
-            // Best we can do is to consider all byte before 'endstream' are part of the stream content.
-            // In case the stream is zipped, this is no problem. In case the stream is encrypted
-            // it would be a serious problem. But we wait if this really happens.
-            int idxEndStream = rawString.LastIndexOf("endstream", StringComparison.Ordinal);
-            if (idxEndStream == -1)
+            var firstStart = start;
+            while (start < _pdfLength)
             {
-                SuppressExceptions.HandleError(suppressObjectOrderExceptions, () => throw TH.ObjectNotAvailableException_CannotRetrieveStreamLength());
-                return -1;
+                var rawString = RandomReadRawString(start, (int)Math.Min(searchLength, _pdfLength - start));   // Clamp first, then narrow: the remaining length may exceed int.MaxValue.
+
+                // When we come here, we have either an invalid or no /Length entry.
+                // Best we can do is to consider all bytes before 'endstream' to be part of the stream content.
+                // In case the stream is zipped, this is no problem. In case the stream is encrypted
+                // it would be a serious problem. But we wait if this really happens.
+                int idxEndStream = rawString.LastIndexOf("endstream", StringComparison.Ordinal);
+                if (idxEndStream >= 0)
+                    return (int)(start - firstStart + idxEndStream);
+                start += Math.Max(1, searchLength - "endstream".Length - 1);
             }
-
-            return idxEndStream;
+            SuppressExceptions.HandleError(suppressObjectOrderExceptions, () => throw TH.ObjectNotAvailableException_CannotRetrieveStreamLength());
+            return -1;
         }
 
         /// <summary>
 
         /// <summary>
diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Parser.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Parser.cs
index c2b18952..a5e8fbdb 100644
--- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Parser.cs
+++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Parser.cs
@@ -57,7 +57,7 @@ public Parser(PdfDocument? document, Stream objectStream, Parser documentParser)
         /// </summary>
         /// <param name="objectID">The ID of the object to move.</param>
         /// <param name="suppressObjectOrderExceptions">Suppresses exceptions that may be caused by not yet available objects.</param>
-        public SizeType MoveToObject(PdfObjectID objectID, SuppressExceptions? suppressObjectOrderExceptions)
+        public SizeType MoveToObject(PdfObjectID objectID, SuppressExceptions? suppressObjectOrderExceptions = null)
         {
             SizeType? position = _document.IrefTable[objectID]?.Position;
             if (!position.HasValue)
@@ -829,7 +829,7 @@ PdfItem ReadReference(PdfReference iref, bool includeReferences)
         /// <summary>
         /// Reads the next symbol that must be the specified one.
         /// </summary>
-        Symbol ReadSymbol(Symbol symbol)
+        internal Symbol ReadSymbol(Symbol symbol)
         {
             Symbol current = ScanNextToken(symbol == Symbol.ObjRef);
             if (symbol != current)
@@ -903,7 +903,7 @@ SizeType ReadSize()
         /// <summary>
         /// Reads the PdfObject of the reference, no matter if it’s saved at document level or inside an ObjectStream.
         /// </summary>
-        internal PdfObject ReadIndirectObject(PdfReference pdfReference, SuppressExceptions? suppressObjectOrderExceptions, bool withoutDecrypting = false)
+        internal PdfObject ReadIndirectObject(PdfReference pdfReference, SuppressExceptions? suppressObjectOrderExceptions = null, bool withoutDecrypting = false)
         {
             try
             {
@@ -1406,7 +1406,7 @@ bool CheckXRefTableEntry(SizeType position, int id, int generation, out int idCh
         /// <summary>
         /// Reads cross-reference stream(s).
         /// </summary>
-        PdfTrailer ReadXRefStream(PdfCrossReferenceTable xrefTable)
+        internal PdfTrailer ReadXRefStream(PdfCrossReferenceTable xrefTable)
         {
             // Read cross-reference stream.
             //Debug.Assert(_lexer.Symbol == Symbol.Integer);
diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/PdfReader.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/PdfReader.cs
index 45ab116b..b8aa068c 100644
--- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/PdfReader.cs
+++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/PdfReader.cs
@@ -300,7 +300,18 @@ PdfDocument OpenFromStream(Stream stream, string? password, PdfDocumentOpenMode
                 var parser = new Parser(_document, options ?? new PdfReaderOptions(), _logger);
 
                 // 1. Read all trailers or cross-reference streams, but no objects.
-                _document.Trailer = parser.ReadTrailer();
+                try
+                {
+                    _document.Trailer = parser.ReadTrailer();
+                }
+                catch (Exception ex)
+                {
+                    // The regular trailer or cross-reference data cannot be read. Record the original
+                    // error before falling back to a scan of the whole file, because a failing rebuild
+                    // reports only its own exception.
+                    _logger?.LogWarning(ex, "Reading the trailer failed, attempting to rebuild it.");
+                    _document.Trailer = PdfTrailer.Rebuild(_document, stream, parser);
+                }
                 if (_document.Trailer == null!)
                     ParserDiagnostics.ThrowParserException(
                         "Invalid PDF file: no trailer found."); // TODO L10N using PsMsgs
diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf/PdfDocument.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf/PdfDocument.cs
index 3faccf65..88f3ff43 100644
--- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf/PdfDocument.cs
+++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf/PdfDocument.cs
@@ -899,7 +899,7 @@ public void Flatten()
         /// <summary>
         /// Gets the standard security handler, if existing and encryption is active.
         /// </summary>
-        internal PdfStandardSecurityHandler? EffectiveSecurityHandler => Trailer.EffectiveSecurityHandler;
+        internal PdfStandardSecurityHandler? EffectiveSecurityHandler => Trailer?.EffectiveSecurityHandler;
 
         internal PdfTrailer Trailer { get; set; } = default!;
 
diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf/PdfString.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf/PdfString.cs
index f60b4958..5d5cbbce 100644
--- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf/PdfString.cs
+++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf/PdfString.cs
@@ -277,9 +277,11 @@ static bool TryRereadAsUnicode(ref string? value)
                 return true;
             }
 
-#if true // UTF-16LE is not defined as valid text string encoding in PDF reference.
+#if false // UTF-16LE is not defined as valid text string encoding in PDF reference.
             if (value is ['\xFF', '\xFE', ..])
+            {
                 throw new NotImplementedException("Found UTF-16LE string. Please send us the PDF file and we will fix it (issues (at) pdfsharp.net).");
+            }
 #else
             // Adobe Reader also supports UTF-16LE.
             if (value is ['\xFF', '\xFE', ..])

From 65676202fe8f2ea0f6f669e09e682690d970a8dc Mon Sep 17 00:00:00 2001
From: Thomas Terhaar <uncletom@gmx.de>
Date: Sat, 17 Aug 2024 12:37:47 +0200
Subject: [PATCH 2/4] Fix comment

---
 .../src/PDFsharp/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs
index 9658c23c..f4ceba99 100644
--- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs
+++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs
@@ -321,7 +321,7 @@ internal static PdfTrailer Rebuild(PdfDocument document, Stream stream, Parser p
                 {
                     // cases:
                     // 1. no trailer found (maybe cut off at end of file)
-                    // 2. trailer is corrupt (found one with just a single /Size entry, /Catalog was missing)
+                    // 2. trailer is corrupt (found one with just a single /Size entry, /Root was missing)
                     // read all found objects searching for the catalog (/Root entry)
                     foreach (var objRef in allRefs)
                     {

From 4f1ca61e385527349ebd6ce080bf6444f873bc2a Mon Sep 17 00:00:00 2001
From: Thomas Terhaar <uncletom@gmx.de>
Date: Sat, 17 Aug 2024 14:58:21 +0200
Subject: [PATCH 3/4] Do not include eol-marker in stream-data

---
 .../src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs
index fe740896..d83018f7 100644
--- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs
+++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs
@@ -866,7 +866,24 @@ public int DetermineStreamLength(SizeType start, int searchLength, SuppressExcep
                 // it would be a serious problem. But we wait if this really happens.
                 int idxEndStream = rawString.LastIndexOf("endstream", StringComparison.Ordinal);
                 if (idxEndStream >= 0)
+                {
+                    // The spec says (7.3.8, Stream Objects):
+                    // "There should be an end-of-line marker after the data and before endstream;
+                    // this marker shall not be included in the stream length"
+
+                    // Check the characters directly before the keyword for CRLF, LF, or CR
+                    // (CR alone SHALL NOT be used, but tolerate it anyway).
+                    // Only characters inside rawString are inspected. Nothing located before 'start' (such
+                    // as the EOL that follows the "stream" keyword) can be taken for the trailing marker, so
+                    // the result never becomes negative, and no extra read on _pdfStream is needed.
+                    if (idxEndStream >= 1 && rawString[idxEndStream - 1] is '\n' or '\r')
+                    {
+                        // CRLF is one two-byte marker; a single LF or CR (also the second CR of CRCR) is one byte.
+                        var isCrLf = idxEndStream >= 2 && rawString[idxEndStream - 1] == '\n' && rawString[idxEndStream - 2] == '\r';
+                        idxEndStream -= isCrLf ? 2 : 1;
+                    }
                     return (int)(start - firstStart + idxEndStream);
+                }
                 start += Math.Max(1, searchLength - "endstream".Length - 1);
             }
             SuppressExceptions.HandleError(suppressObjectOrderExceptions, () => throw TH.ObjectNotAvailableException_CannotRetrieveStreamLength());

From 48c4e6d0668812a63e3318f888027e0d1bab2408 Mon Sep 17 00:00:00 2001
From: Thomas Terhaar <uncletom@gmx.de>
Date: Sat, 17 Aug 2024 15:00:20 +0200
Subject: [PATCH 4/4] Handle case where reported stream-length + position
 exceeds pdf-length. Fixes #148

---
 .../src/PDFsharp/src/PdfSharp/Pdf.IO/Parser.cs  | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Parser.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Parser.cs
index a5e8fbdb..12e8822e 100644
--- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Parser.cs
+++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Parser.cs
@@ -369,9 +369,20 @@ void ReadDictionaryStream(PdfDictionary dict, SuppressExceptions? suppressObject
             // Step 3: We try to read the stream content.
             // Maybe we have to re-read it in case 'endstream' was not at the
             // right place after reading with the length value coming from /Length.
-            var bytes = _lexer.ScanStream(startPosition, streamLength);
-            var stream = new PdfDictionary.PdfStream(bytes, dict);
-            dict.Stream = stream;
+            // Stays null when the scan fails; code after the try/catch may read it, so it must be assigned.
+            byte[]? bytes = null;
+            try
+            {
+                // this may throw if startPosition + streamLength > length of stream
+                bytes = _lexer.ScanStream(startPosition, streamLength);
+                dict.Stream = new PdfDictionary.PdfStream(bytes, dict);
+            }
+            catch
+            {
+                // reset stream position
+                _lexer.Position = startPosition;
+                // ignore exception, we'll try again after determining real stream-length
+            }
 #if DEBUG_  // Check it with Notepad++ directly in PDF file.
             // ReSharper disable once ConditionIsAlwaysTrueOrFalseAccordingToNullableAPIContract
             if (bytes is not null && bytes.Length > 0)