diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs
index 0cf6157a..f4ceba99 100644
--- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs
+++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs
@@ -4,6 +4,10 @@
using PdfSharp.Pdf.IO;
using PdfSharp.Pdf.Security;
using PdfSharp.Pdf.Internal;
+using System.Text.RegularExpressions;
+using System.Text;
+using PdfSharp.Logging;
+using Microsoft.Extensions.Logging;
namespace PdfSharp.Pdf.Advanced
{
@@ -215,6 +219,142 @@ internal void Finish()
_document.IrefTable.IsUnderConstruction = false;
}
+        /// <summary>
+        /// Attempts to rebuild the trailer and the iref-table if the original ones seem to be corrupt.
+        /// The whole stream is scanned for object headers ("N G obj") and for the last "trailer &lt;&lt;"
+        /// keyword. If no usable trailer is found, the collected objects are searched for a catalog
+        /// dictionary that is then used as the /Root entry of a newly created trailer.
+        /// </summary>
+        /// <param name="document">The document that receives the rebuilt iref-table. Its lexer must be set.</param>
+        /// <param name="stream">The seekable stream containing the PDF file.</param>
+        /// <param name="parser">The parser used to read the trailer dictionary and the indirect objects.</param>
+        /// <returns>The rebuilt trailer. It always contains a /Root and a /Size entry.</returns>
+        /// <exception cref="ArgumentNullException">An argument is null.</exception>
+        /// <exception cref="InvalidOperationException">The document has no lexer.</exception>
+        /// <exception cref="PdfReaderException">The trailer could not be rebuilt.</exception>
+        internal static PdfTrailer Rebuild(PdfDocument document, Stream stream, Parser parser)
+        {
+            if (document is null)
+                throw new ArgumentNullException(nameof(document));
+            if (stream is null)
+                throw new ArgumentNullException(nameof(stream));
+            if (parser is null)
+                throw new ArgumentNullException(nameof(parser));
+            if (document._lexer == null)
+                throw new InvalidOperationException("Document must have a lexer set");
+
+            PdfSharpLogHost.PdfReadingLogger.LogInformation("Attempt to rebuild trailer...");
+
+            // For performance reasons static (or source-generated) Regex instances would normally be used.
+            // Because rebuilding is a one-time operation, they are declared inline for clarity.
+
+            // Start of an object, e.g. "1 0 obj". The named groups are read below.
+            var rxObjectStart = new Regex(@"\b(?<num>\d+)\s+(?<gen>\d+)\s+obj\b");
+            // Start of a trailer, e.g. "trailer <<".
+            var rxTrailerStart = new Regex(@"\btrailer\s*<<");
+
+            const int chunkSize = 4096;
+            // Number of bytes re-read when a chunk contained no match, so that a keyword
+            // split across two chunks is seen completely in the following chunk.
+            const int chunkOverlap = 24;
+
+            var irefTable = new PdfCrossReferenceTable(document);
+            var trailerStart = 0L;
+            try
+            {
+                // Scan the whole file and collect the object IDs.
+                stream.Position = 0;
+                var buffer = new byte[chunkSize];
+                while (stream.Position < stream.Length)
+                {
+                    var bufStart = stream.Position;
+                    var readLength = stream.Read(buffer, 0, buffer.Length);
+                    if (readLength <= 0)
+                        break; // The stream delivers no more data; avoid spinning forever.
+
+                    // ASCII decoding maps every byte to exactly one char, so match indices are byte offsets.
+                    var readString = Encoding.ASCII.GetString(buffer, 0, readLength);
+                    // End of the last match in this chunk, or -1 if nothing was found.
+                    var resumeAt = -1L;
+
+                    // Search for objects.
+                    foreach (Match match in rxObjectStart.Matches(readString))
+                    {
+                        var position = bufStart + match.Index;
+                        resumeAt = Math.Max(resumeAt, position + match.Length);
+
+                        // Digit runs that do not fit into an int cannot be valid object IDs; skip them
+                        // instead of aborting the whole recovery with an OverflowException.
+                        if (!int.TryParse(match.Groups["num"].Value, System.Globalization.NumberStyles.None,
+                                System.Globalization.CultureInfo.InvariantCulture, out var objNumber)
+                            || !int.TryParse(match.Groups["gen"].Value, System.Globalization.NumberStyles.None,
+                                System.Globalization.CultureInfo.InvariantCulture, out var generationNumber))
+                            continue;
+
+                        var objId = new PdfObjectID(objNumber, generationNumber);
+                        var existingRef = irefTable[objId];
+                        if (existingRef != null)
+                        {
+                            // Always use the object found later in the file.
+                            // This handles newer objects written by incremental updates.
+                            existingRef.Position = position;
+                        }
+                        else
+                        {
+                            // Assign the offset through the property to avoid truncating it to 32 bits.
+                            irefTable.Add(new PdfReference(objId, 0) { Position = position });
+                        }
+                    }
+
+                    // Search for the trailer.
+                    foreach (Match match in rxTrailerStart.Matches(readString))
+                    {
+                        // If the trailer is found multiple times, the last one wins (conforms to spec).
+                        trailerStart = bufStart + match.Index;
+                        resumeAt = Math.Max(resumeAt, trailerStart + match.Length);
+                    }
+
+                    if (stream.Position < stream.Length)
+                    {
+                        if (resumeAt >= 0)
+                        {
+                            // Continue directly behind the last match; this re-reads the tail of the chunk,
+                            // which may contain the beginning of a keyword that was cut off.
+                            stream.Position = resumeAt;
+                        }
+                        else if (readLength > chunkOverlap)
+                        {
+                            // Read with overlap to avoid splitting an object declaration.
+                            // The guard ensures the scan always moves forward.
+                            stream.Position -= chunkOverlap;
+                        }
+                    }
+                }
+                document.IrefTable = irefTable;
+                irefTable.IsUnderConstruction = true;
+
+                var allRefs = irefTable.AllReferences;
+                if (!allRefs.Any())
+                    throw new PdfReaderException(
+                        "Unable to rebuild trailer and iref-table, no objects found. The pdf is corrupt");
+
+                var trailer = new PdfTrailer(document);
+
+                if (trailerStart > 0L)
+                {
+                    // Read the entries of the trailer dictionary.
+                    stream.Position = trailerStart;
+                    document._lexer.Position = trailerStart;
+                    parser.ReadSymbol(Symbol.Trailer);
+                    parser.ReadSymbol(Symbol.BeginDictionary);
+                    parser.ReadDictionary(trailer, false);
+                    // TODO: What about the /Prev entry? It may also be corrupt (need a file to verify).
+                    // In theory it can be ignored, because ALL objects have already been collected.
+                }
+                if (!trailer.Elements.ContainsKey(Keys.Root))
+                {
+                    // Cases:
+                    // 1. No trailer found (maybe cut off at the end of the file).
+                    // 2. Trailer is corrupt (e.g. just a single /Size entry, /Root is missing).
+                    // Read all found objects and search for the catalog (/Root entry).
+                    foreach (var objRef in allRefs)
+                    {
+                        PdfObject obj;
+                        try
+                        {
+                            parser.MoveToObject(objRef.ObjectID);
+                            obj = parser.ReadIndirectObject(objRef);
+                        }
+                        catch (Exception ex)
+                        {
+                            // A single unreadable object must not stop the search for the catalog.
+                            PdfSharpLogHost.PdfReadingLogger.LogWarning(ex,
+                                "Skipping unreadable object {objectId} while searching for the catalog.", objRef.ObjectID);
+                            continue;
+                        }
+                        if (obj is PdfDictionary dict)
+                        {
+                            var type = dict.Elements.GetName(PdfCatalog.Keys.Type);
+                            // Ensure a valid catalog is used (multiple ones may be found).
+                            if (type == "/Catalog" && dict.Elements.ContainsKey(PdfCatalog.Keys.Pages))
+                            {
+                                trailer.Elements[Keys.Root] = dict.Reference;
+                            }
+                        }
+                    }
+                }
+                // Still no catalog? Then give up.
+                if (!trailer.Elements.ContainsKey(Keys.Root))
+                    throw new PdfReaderException(
+                        "Unable to rebuild trailer and iref-table, catalog dictionary not found. The pdf is corrupt");
+
+                var largestObjectNumber = allRefs.Max(x => x.ObjectID.ObjectNumber);
+                trailer.Elements.SetInteger(Keys.Size, largestObjectNumber + 1);
+                PdfSharpLogHost.PdfReadingLogger.LogInformation("Trailer was rebuild with {count} found objects", irefTable.AllObjectIDs.Length);
+                return trailer;
+            }
+            catch (Exception ex) when (ex is not PdfReaderException)
+            {
+                // PdfReaderExceptions thrown above already carry a specific message and pass through unchanged.
+                throw new PdfReaderException("Unable to rebuild trailer and iref-table, pdf is corrupt", ex);
+            }
+        }
+
+
///
/// Predefined keys of this dictionary.
///
diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs
index 6935f61d..d83018f7 100644
--- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs
+++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs
@@ -147,9 +147,9 @@ public Symbol ScanNextToken(bool testForObjectReference)
return Symbol = Symbol.Eof;
default:
- Debug.Assert(!Char.IsLetter(ch), "PDFsharp did something wrong. See code below.");
- ParserDiagnostics.HandleUnexpectedCharacter(ch, DumpNeighborhoodOfPosition());
- return Symbol = Symbol.None;
+ // just skip over unexpected character
+ ScanNextChar(true);
+ goto TryAgain;
}
}
@@ -855,20 +855,43 @@ public int DetermineStreamLength(SizeType start, int searchLength, SuppressExcep
if (start == 144848)
_ = sizeof(int);
#endif
- var rawString = RandomReadRawString(start, searchLength);
-
- // When we come here, we have either an invalid or no \Length entry.
- // Best we can do is to consider all byte before 'endstream' are part of the stream content.
- // In case the stream is zipped, this is no problem. In case the stream is encrypted
- // it would be a serious problem. But we wait if this really happens.
- int idxEndStream = rawString.LastIndexOf("endstream", StringComparison.Ordinal);
- if (idxEndStream == -1)
+ var firstStart = start;
+ while (start < _pdfLength)
{
- SuppressExceptions.HandleError(suppressObjectOrderExceptions, () => throw TH.ObjectNotAvailableException_CannotRetrieveStreamLength());
- return -1;
+ var rawString = RandomReadRawString(start, Math.Min(searchLength, (int)(_pdfLength - start)));
+
+ // When we come here, we have either an invalid or no \Length entry.
+ // Best we can do is to consider all byte before 'endstream' are part of the stream content.
+ // In case the stream is zipped, this is no problem. In case the stream is encrypted
+ // it would be a serious problem. But we wait if this really happens.
+ int idxEndStream = rawString.LastIndexOf("endstream", StringComparison.Ordinal);
+ if (idxEndStream >= 0)
+ {
+ // The spec says (7.3.8, Stream Objects):
+ // "There should be an end-of-line marker after the data and before endstream;
+ // this marker shall not be included in the stream length"
+
+ // check bytes before the keyword for possible CRLF or LF or CR
+ // (CR alone SHALL NOT be used but check it anyway)
+ // sanity check, should always pass since we SHOULD have read the "stream" keyword before we came here
+ if (start + idxEndStream >= 2)
+ {
+ _pdfStream.Position = start + idxEndStream - 2;
+ var b1 = _pdfStream.ReadByte();
+ var b2 = _pdfStream.ReadByte();
+ if (b2 == '\n' || b2 == '\r') // possible CRLF or single LF or single CR
+ {
+ idxEndStream--;
+ if (b1 == '\r' && b2 != '\r') // handle CRLF but not CRCR
+ idxEndStream--;
+ }
+ }
+ return (int)(start - firstStart + idxEndStream);
+ }
+ start += Math.Max(1, searchLength - "endstream".Length - 1);
}
-
- return idxEndStream;
+ SuppressExceptions.HandleError(suppressObjectOrderExceptions, () => throw TH.ObjectNotAvailableException_CannotRetrieveStreamLength());
+ return -1;
}
///
diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Parser.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Parser.cs
index c2b18952..12e8822e 100644
--- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Parser.cs
+++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Parser.cs
@@ -57,7 +57,7 @@ public Parser(PdfDocument? document, Stream objectStream, Parser documentParser)
///
/// The ID of the object to move.
/// Suppresses exceptions that may be caused by not yet available objects.
- public SizeType MoveToObject(PdfObjectID objectID, SuppressExceptions? suppressObjectOrderExceptions)
+ public SizeType MoveToObject(PdfObjectID objectID, SuppressExceptions? suppressObjectOrderExceptions = null)
{
SizeType? position = _document.IrefTable[objectID]?.Position;
if (!position.HasValue)
@@ -369,9 +369,20 @@ void ReadDictionaryStream(PdfDictionary dict, SuppressExceptions? suppressObject
// Step 3: We try to read the stream content.
// Maybe we have to re-read it in case 'endstream' was not at the
// right place after reading with the length value coming from /Length.
- var bytes = _lexer.ScanStream(startPosition, streamLength);
- var stream = new PdfDictionary.PdfStream(bytes, dict);
- dict.Stream = stream;
+ byte[] bytes;
+ try
+ {
+ // this may throw if startPosition + streamLength > length of stream
+ bytes = _lexer.ScanStream(startPosition, streamLength);
+ var stream = new PdfDictionary.PdfStream(bytes, dict);
+ dict.Stream = stream;
+ }
+ catch
+ {
+ // reset stream position
+ _lexer.Position = startPosition;
+ // ignore exception, we'll try again after determining real stream-length
+ }
#if DEBUG_ // Check it with Notepad++ directly in PDF file.
// ReSharper disable once ConditionIsAlwaysTrueOrFalseAccordingToNullableAPIContract
if (bytes is not null && bytes.Length > 0)
@@ -829,7 +840,7 @@ PdfItem ReadReference(PdfReference iref, bool includeReferences)
///
/// Reads the next symbol that must be the specified one.
///
- Symbol ReadSymbol(Symbol symbol)
+ internal Symbol ReadSymbol(Symbol symbol)
{
Symbol current = ScanNextToken(symbol == Symbol.ObjRef);
if (symbol != current)
@@ -903,7 +914,7 @@ SizeType ReadSize()
///
/// Reads the PdfObject of the reference, no matter if it’s saved at document level or inside an ObjectStream.
///
- internal PdfObject ReadIndirectObject(PdfReference pdfReference, SuppressExceptions? suppressObjectOrderExceptions, bool withoutDecrypting = false)
+ internal PdfObject ReadIndirectObject(PdfReference pdfReference, SuppressExceptions? suppressObjectOrderExceptions = null, bool withoutDecrypting = false)
{
try
{
@@ -1406,7 +1417,7 @@ bool CheckXRefTableEntry(SizeType position, int id, int generation, out int idCh
///
/// Reads cross-reference stream(s).
///
- PdfTrailer ReadXRefStream(PdfCrossReferenceTable xrefTable)
+ internal PdfTrailer ReadXRefStream(PdfCrossReferenceTable xrefTable)
{
// Read cross-reference stream.
//Debug.Assert(_lexer.Symbol == Symbol.Integer);
diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/PdfReader.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/PdfReader.cs
index 45ab116b..b8aa068c 100644
--- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/PdfReader.cs
+++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/PdfReader.cs
@@ -300,7 +300,14 @@ PdfDocument OpenFromStream(Stream stream, string? password, PdfDocumentOpenMode
var parser = new Parser(_document, options ?? new PdfReaderOptions(), _logger);
// 1. Read all trailers or cross-reference streams, but no objects.
- _document.Trailer = parser.ReadTrailer();
+ try
+ {
+ _document.Trailer = parser.ReadTrailer();
+ }
+ catch
+ {
+ _document.Trailer = PdfTrailer.Rebuild(_document, stream, parser);
+ }
if (_document.Trailer == null!)
ParserDiagnostics.ThrowParserException(
"Invalid PDF file: no trailer found."); // TODO L10N using PsMsgs
diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf/PdfDocument.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf/PdfDocument.cs
index 3faccf65..88f3ff43 100644
--- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf/PdfDocument.cs
+++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf/PdfDocument.cs
@@ -899,7 +899,7 @@ public void Flatten()
///
/// Gets the standard security handler, if existing and encryption is active.
///
- internal PdfStandardSecurityHandler? EffectiveSecurityHandler => Trailer.EffectiveSecurityHandler;
+ internal PdfStandardSecurityHandler? EffectiveSecurityHandler => Trailer?.EffectiveSecurityHandler;
internal PdfTrailer Trailer { get; set; } = default!;
diff --git a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf/PdfString.cs b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf/PdfString.cs
index f60b4958..5d5cbbce 100644
--- a/src/foundation/src/PDFsharp/src/PdfSharp/Pdf/PdfString.cs
+++ b/src/foundation/src/PDFsharp/src/PdfSharp/Pdf/PdfString.cs
@@ -277,9 +277,11 @@ static bool TryRereadAsUnicode(ref string? value)
return true;
}
-#if true // UTF-16LE is not defined as valid text string encoding in PDF reference.
+#if false // UTF-16LE is not defined as valid text string encoding in PDF reference.
if (value is ['\xFF', '\xFE', ..])
+ {
throw new NotImplementedException("Found UTF-16LE string. Please send us the PDF file and we will fix it (issues (at) pdfsharp.net).");
+ }
#else
// Adobe Reader also supports UTF-16LE.
if (value is ['\xFF', '\xFE', ..])