From fb03e32475eb0bfd4ea8440a3fc7c81332bc721b Mon Sep 17 00:00:00 2001 From: Thomas Terhaar Date: Fri, 16 Aug 2024 22:06:54 +0200 Subject: [PATCH 1/4] Improve resilience against malformed or corrupt documents --- .../src/PdfSharp/Pdf.Advanced/PdfTrailer.cs | 140 ++++++++++++++++++ .../src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs | 32 ++-- .../PDFsharp/src/PdfSharp/Pdf.IO/Parser.cs | 8 +- .../PDFsharp/src/PdfSharp/Pdf.IO/PdfReader.cs | 9 +- .../PDFsharp/src/PdfSharp/Pdf/PdfDocument.cs | 2 +- .../PDFsharp/src/PdfSharp/Pdf/PdfString.cs | 4 +- 6 files changed, 173 insertions(+), 22 deletions(-) diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs index 0cf6157a..9658c23c 100644 --- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs +++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs @@ -4,6 +4,10 @@ using PdfSharp.Pdf.IO; using PdfSharp.Pdf.Security; using PdfSharp.Pdf.Internal; +using System.Text.RegularExpressions; +using System.Text; +using PdfSharp.Logging; +using Microsoft.Extensions.Logging; namespace PdfSharp.Pdf.Advanced { @@ -215,6 +219,142 @@ internal void Finish() _document.IrefTable.IsUnderConstruction = false; } + /// + /// Attempts to rebuild the trailer and iref-table if original ones seem to be corrupt + /// + /// + internal static PdfTrailer Rebuild(PdfDocument document, Stream stream, Parser parser) + { + PdfSharpLogHost.PdfReadingLogger.LogInformation("Attempt to rebuild trailer..."); +#if NET6_0_OR_GREATER + ArgumentNullException.ThrowIfNull(document, nameof(document)); +#else + if (document == null) + throw new ArgumentNullException(nameof(document)); +#endif + if (document._lexer == null) + throw new InvalidOperationException("Document must have a lexer set"); + + // TODO: for performance reasons, we would normally use static properties for the Regex + // (and Source-Generators for newer .Net Versions !) + // but since this should be a one-time operation, we declare them inline for clarity) + + // start on an object, e.g. "1 0 obj" + var rxObjectStart = new Regex("\\b(?\\d+)\\s+(?\\d+)\\s+obj\\b"); + // start of a trailer, e.g. "trailer <<" + var rxTrailerStart = new Regex("\\btrailer\\s*<<"); + var irefTable = new PdfCrossReferenceTable(document); + var trailerStart = 0L; + try + { + // scan the whole file and collect object-ids + stream.Position = 0; + var buffer = new byte[4096]; + var nextStreamPos = stream.Position + 1; // start of the next chunk + while (stream.Position < stream.Length) + { + var bufStart = stream.Position; + var readLength = stream.Read(buffer, 0, buffer.Length); + var readString = Encoding.ASCII.GetString(buffer, 0, readLength); + // search for objects + var numObjectsFound = 0; + var objectMatches = rxObjectStart.Matches(readString); + foreach (Match match in objectMatches) + { + if (match.Success) + { + var objNumber = int.Parse(match.Groups["num"].Value); + var generationNumber = int.Parse(match.Groups["gen"].Value); + var objId = new PdfObjectID(objNumber, generationNumber); + var existingObj = irefTable[objId]; + if (existingObj != null) + // always use the object found later in the file + // this handles newer objects written by incremental updates + existingObj.Position = bufStart + match.Index; + else + irefTable.Add(new PdfReference(objId, (int)bufStart + match.Index)); + nextStreamPos = bufStart + match.Index + match.Length; + numObjectsFound++; + } + } + // search for the trailer + var trailerMatches = rxTrailerStart.Matches(readString); + foreach (Match match in trailerMatches) + { + if (match.Success) + { + // if trailer is found multiple times, the last one wins (conforms to spec) + trailerStart = bufStart + match.Index; + nextStreamPos = Math.Max(nextStreamPos, trailerStart + match.Length); + } + } + // read with overlap to avoid splitting an object-declaration + if (readLength == buffer.Length) + stream.Position = Math.Max(0, stream.Position - 12); + if (stream.Position < stream.Length) + { + if (trailerMatches.Count > 0 || numObjectsFound > 0) + stream.Position = nextStreamPos; + else + // read with overlap to avoid splitting an object-declaration + stream.Position = Math.Max(0, stream.Position - 12); + } + } + document.IrefTable = irefTable; + irefTable.IsUnderConstruction = true; + + var allRefs = irefTable.AllReferences; + var trailer = new PdfTrailer(document); + + if (trailerStart > 0L) + { + // read the entries of the trailer dictionary + stream.Position = trailerStart; + document._lexer.Position = trailerStart; + parser.ReadSymbol(Symbol.Trailer); + parser.ReadSymbol(Symbol.BeginDictionary); + parser.ReadDictionary(trailer, false); + // TODO: what about /Prev entry ? these may also be corrupt (need a file to verify) + // in theory, this can be ignored, because we already have read ALL objects + } + if (!trailer.Elements.ContainsKey(Keys.Root)) + { + // cases: + // 1. no trailer found (maybe cut off at end of file) + // 2. trailer is corrupt (found one with just a single /Size entry, /Catalog was missing) + // read all found objects searching for the catalog (/Root entry) + foreach (var objRef in allRefs) + { + parser.MoveToObject(objRef.ObjectID); + var obj = parser.ReadIndirectObject(objRef); + if (obj is PdfDictionary dict) + { + var type = dict.Elements.GetName(PdfCatalog.Keys.Type); + // ensure we use a valid catalog (we may find multiple) + if (type == "/Catalog" && dict.Elements.ContainsKey(PdfCatalog.Keys.Pages)) + { + trailer.Elements[Keys.Root] = dict.Reference; + } + } + } + } + // still no catalog ? then throw + if (!trailer.Elements.ContainsKey(Keys.Root)) + throw new PdfReaderException( + "Unable to rebuild trailer and iref-table, catalog dictionary not found. The pdf is corrupt"); + + var largestObjectNumber = allRefs.Max(x => x.ObjectID.ObjectNumber); + trailer.Elements.SetInteger(Keys.Size, largestObjectNumber + 1); + PdfSharpLogHost.PdfReadingLogger.LogInformation("Trailer was rebuild with {count} found objects", irefTable.AllObjectIDs.Length); + return trailer; + } + catch (Exception ex) + { + throw new PdfReaderException("Unable to rebuild trailer and iref-table, pdf is corrupt", ex); + } + } + + /// /// Predefined keys of this dictionary. /// diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs index 6935f61d..fe740896 100644 --- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs +++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs @@ -147,9 +147,9 @@ public Symbol ScanNextToken(bool testForObjectReference) return Symbol = Symbol.Eof; default: - Debug.Assert(!Char.IsLetter(ch), "PDFsharp did something wrong. See code below."); - ParserDiagnostics.HandleUnexpectedCharacter(ch, DumpNeighborhoodOfPosition()); - return Symbol = Symbol.None; + // just skip over unexpected character + ScanNextChar(true); + goto TryAgain; } } @@ -855,20 +855,22 @@ public int DetermineStreamLength(SizeType start, int searchLength, SuppressExcep if (start == 144848) _ = sizeof(int); #endif - var rawString = RandomReadRawString(start, searchLength); - - // When we come here, we have either an invalid or no \Length entry. - // Best we can do is to consider all byte before 'endstream' are part of the stream content. - // In case the stream is zipped, this is no problem. In case the stream is encrypted - // it would be a serious problem. But we wait if this really happens. - int idxEndStream = rawString.LastIndexOf("endstream", StringComparison.Ordinal); - if (idxEndStream == -1) + var firstStart = start; + while (start < _pdfLength) { - SuppressExceptions.HandleError(suppressObjectOrderExceptions, () => throw TH.ObjectNotAvailableException_CannotRetrieveStreamLength()); - return -1; + var rawString = RandomReadRawString(start, Math.Min(searchLength, (int)(_pdfLength - start))); + + // When we come here, we have either an invalid or no \Length entry. + // Best we can do is to consider all byte before 'endstream' are part of the stream content. + // In case the stream is zipped, this is no problem. In case the stream is encrypted + // it would be a serious problem. But we wait if this really happens. + int idxEndStream = rawString.LastIndexOf("endstream", StringComparison.Ordinal); + if (idxEndStream >= 0) + return (int)(start - firstStart + idxEndStream); + start += Math.Max(1, searchLength - "endstream".Length - 1); } - - return idxEndStream; + SuppressExceptions.HandleError(suppressObjectOrderExceptions, () => throw TH.ObjectNotAvailableException_CannotRetrieveStreamLength()); + return -1; } /// diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Parser.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Parser.cs index c2b18952..a5e8fbdb 100644 --- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Parser.cs +++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Parser.cs @@ -57,7 +57,7 @@ public Parser(PdfDocument? document, Stream objectStream, Parser documentParser) /// /// The ID of the object to move. /// Suppresses exceptions that may be caused by not yet available objects. - public SizeType MoveToObject(PdfObjectID objectID, SuppressExceptions? suppressObjectOrderExceptions) + public SizeType MoveToObject(PdfObjectID objectID, SuppressExceptions? suppressObjectOrderExceptions = null) { SizeType? position = _document.IrefTable[objectID]?.Position; if (!position.HasValue) @@ -829,7 +829,7 @@ PdfItem ReadReference(PdfReference iref, bool includeReferences) /// /// Reads the next symbol that must be the specified one. /// - Symbol ReadSymbol(Symbol symbol) + internal Symbol ReadSymbol(Symbol symbol) { Symbol current = ScanNextToken(symbol == Symbol.ObjRef); if (symbol != current) @@ -903,7 +903,7 @@ SizeType ReadSize() /// /// Reads the PdfObject of the reference, no matter if it’s saved at document level or inside an ObjectStream. /// - internal PdfObject ReadIndirectObject(PdfReference pdfReference, SuppressExceptions? suppressObjectOrderExceptions, bool withoutDecrypting = false) + internal PdfObject ReadIndirectObject(PdfReference pdfReference, SuppressExceptions? suppressObjectOrderExceptions = null, bool withoutDecrypting = false) { try { @@ -1406,7 +1406,7 @@ bool CheckXRefTableEntry(SizeType position, int id, int generation, out int idCh /// /// Reads cross-reference stream(s). /// - PdfTrailer ReadXRefStream(PdfCrossReferenceTable xrefTable) + internal PdfTrailer ReadXRefStream(PdfCrossReferenceTable xrefTable) { // Read cross-reference stream. //Debug.Assert(_lexer.Symbol == Symbol.Integer); diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/PdfReader.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/PdfReader.cs index 45ab116b..b8aa068c 100644 --- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/PdfReader.cs +++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/PdfReader.cs @@ -300,7 +300,14 @@ PdfDocument OpenFromStream(Stream stream, string? password, PdfDocumentOpenMode var parser = new Parser(_document, options ?? new PdfReaderOptions(), _logger); // 1. Read all trailers or cross-reference streams, but no objects. - _document.Trailer = parser.ReadTrailer(); + try + { + _document.Trailer = parser.ReadTrailer(); + } + catch + { + _document.Trailer = PdfTrailer.Rebuild(_document, stream, parser); + } if (_document.Trailer == null!) ParserDiagnostics.ThrowParserException( "Invalid PDF file: no trailer found."); // TODO L10N using PsMsgs diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf/PdfDocument.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf/PdfDocument.cs index 3faccf65..88f3ff43 100644 --- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf/PdfDocument.cs +++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf/PdfDocument.cs @@ -899,7 +899,7 @@ public void Flatten() /// /// Gets the standard security handler, if existing and encryption is active. /// - internal PdfStandardSecurityHandler? EffectiveSecurityHandler => Trailer.EffectiveSecurityHandler; + internal PdfStandardSecurityHandler? EffectiveSecurityHandler => Trailer?.EffectiveSecurityHandler; internal PdfTrailer Trailer { get; set; } = default!; diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf/PdfString.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf/PdfString.cs index f60b4958..5d5cbbce 100644 --- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf/PdfString.cs +++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf/PdfString.cs @@ -277,9 +277,11 @@ static bool TryRereadAsUnicode(ref string? value) return true; } -#if true // UTF-16LE is not defined as valid text string encoding in PDF reference. +#if false // UTF-16LE is not defined as valid text string encoding in PDF reference. if (value is ['\xFF', '\xFE', ..]) + { throw new NotImplementedException("Found UTF-16LE string. Please send us the PDF file and we will fix it (issues (at) pdfsharp.net)."); + } #else // Adobe Reader also supports UTF-16LE. if (value is ['\xFF', '\xFE', ..]) From 65676202fe8f2ea0f6f669e09e682690d970a8dc Mon Sep 17 00:00:00 2001 From: Thomas Terhaar Date: Sat, 17 Aug 2024 12:37:47 +0200 Subject: [PATCH 2/4] Fix comment --- .../src/PDFsharp/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs index 9658c23c..f4ceba99 100644 --- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs +++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs @@ -321,7 +321,7 @@ internal static PdfTrailer Rebuild(PdfDocument document, Stream stream, Parser p { // cases: // 1. no trailer found (maybe cut off at end of file) - // 2. trailer is corrupt (found one with just a single /Size entry, /Catalog was missing) + // 2. trailer is corrupt (found one with just a single /Size entry, /Root was missing) // read all found objects searching for the catalog (/Root entry) foreach (var objRef in allRefs) { From 4f1ca61e385527349ebd6ce080bf6444f873bc2a Mon Sep 17 00:00:00 2001 From: Thomas Terhaar Date: Sat, 17 Aug 2024 14:58:21 +0200 Subject: [PATCH 3/4] Do not include eol-marker in stream-data --- .../src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs index fe740896..d83018f7 100644 --- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs +++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs @@ -866,7 +866,28 @@ public int DetermineStreamLength(SizeType start, int searchLength, SuppressExcep // it would be a serious problem. But we wait if this really happens. int idxEndStream = rawString.LastIndexOf("endstream", StringComparison.Ordinal); if (idxEndStream >= 0) + { + // The spec says (7.3.8, Stream Objects): + // "There should be an end-of-line marker after the data and before endstream; + // this marker shall not be included in the stream length" + + // check bytes before the keyword for possible CRLF or LF or CR + // (CR alone SHALL NOT be used but check it anyway) + // sanity check, should always pass since we SHOULD have read the "stream" keyword before we came here + if (start + idxEndStream >= 2) + { + _pdfStream.Position = start + idxEndStream - 2; + var b1 = _pdfStream.ReadByte(); + var b2 = _pdfStream.ReadByte(); + if (b2 == '\n' || b2 == '\r') // possible CRLF or single LF or single CR + { + idxEndStream--; + if (b1 == '\r' && b2 != '\r') // handle CRLF but not CRCR + idxEndStream--; + } + } return (int)(start - firstStart + idxEndStream); + } start += Math.Max(1, searchLength - "endstream".Length - 1); } SuppressExceptions.HandleError(suppressObjectOrderExceptions, () => throw TH.ObjectNotAvailableException_CannotRetrieveStreamLength()); From 48c4e6d0668812a63e3318f888027e0d1bab2408 Mon Sep 17 00:00:00 2001 From: Thomas Terhaar Date: Sat, 17 Aug 2024 15:00:20 +0200 Subject: [PATCH 4/4] Handle case where reported stream-length + position exceeds pdf-length. Fixes #148 --- .../src/PDFsharp/src/PdfSharp/Pdf.IO/Parser.cs | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Parser.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Parser.cs index a5e8fbdb..12e8822e 100644 --- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Parser.cs +++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Parser.cs @@ -369,9 +369,20 @@ void ReadDictionaryStream(PdfDictionary dict, SuppressExceptions? suppressObject // Step 3: We try to read the stream content. // Maybe we have to re-read it in case 'endstream' was not at the // right place after reading with the length value coming from /Length. - var bytes = _lexer.ScanStream(startPosition, streamLength); - var stream = new PdfDictionary.PdfStream(bytes, dict); - dict.Stream = stream; + byte[] bytes; + try + { + // this may throw if startPosition + streamLength > length of stream + bytes = _lexer.ScanStream(startPosition, streamLength); + var stream = new PdfDictionary.PdfStream(bytes, dict); + dict.Stream = stream; + } + catch + { + // reset stream position + _lexer.Position = startPosition; + // ignore exception, we'll try again after determining real stream-length + } #if DEBUG_ // Check it with Notepad++ directly in PDF file. // ReSharper disable once ConditionIsAlwaysTrueOrFalseAccordingToNullableAPIContract if (bytes is not null && bytes.Length > 0)