Skip to content

Commit 236c74c

Browse files
committed
Proper fix for invalid startxref.
1 parent afee7e1 commit 236c74c

File tree

4 files changed

+149
-135
lines changed

4 files changed

+149
-135
lines changed

src/PdfSharp/Pdf.Advanced/PdfTrailer.cs

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ public PdfTrailer(PdfCrossReferenceStream trailer)
7575
if (id != null)
7676
Elements.SetValue(Keys.ID, id);
7777
}
78-
78+
7979
public int Size
8080
{
8181
get { return Elements.GetInteger(Keys.Size); }
@@ -218,6 +218,37 @@ internal void Finish()
218218
_document._irefTable.IsUnderConstruction = false;
219219
}
220220

221+
/// <summary>
222+
/// Reconstructs this trailer's Size and Root entries from the document's cross-reference table.
223+
/// </summary>
224+
/// <param name="parser">the parser used to read the file.</param>
225+
internal void ConstructFromDocument(Parser parser)
226+
{
227+
// TODO - May need to also search for encryption related trailer info
228+
PdfCrossReferenceTable xrefTable = _document._irefTable;
229+
Elements.SetInteger(Keys.Size, xrefTable.ObjectTable.Count);
230+
231+
// find the root.
232+
PdfDictionary rootToUse = null;
233+
foreach (var reference in xrefTable.AllReferences)
234+
{
235+
PdfObject obj = parser.ReadObject(null, reference.ObjectID, false, false);
236+
if (obj is PdfDictionary dObj)
237+
{
238+
if (dObj.Elements[PdfCatalog.Keys.Type] as PdfName == "/Catalog")
239+
{
240+
if (rootToUse == null)
241+
rootToUse = dObj;
242+
else if (dObj.ObjectID.GenerationNumber > rootToUse.ObjectID.GenerationNumber)
243+
rootToUse = dObj;
244+
}
245+
}
246+
}
247+
248+
if (rootToUse != null)
249+
Elements.SetReference(Keys.Root, rootToUse);
250+
}
251+
221252
/// <summary>
222253
/// Predefined keys of this dictionary.
223254
/// </summary>

src/PdfSharp/Pdf.IO/Lexer.cs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,10 @@ public bool TryScanNextToken(out Symbol symbol, out int position)
117117
case '%':
118118
// Eat comments, the parser doesn't handle them
119119
//return symbol = ScanComment();
120-
ScanComment();
120+
symbol = _symbol = ScanComment();
121+
// Do not eat EOF
122+
if (symbol == Symbol.Eof)
123+
return true;
121124
goto Again;
122125

123126
case '/':

src/PdfSharp/Pdf.IO/Parser.cs

Lines changed: 107 additions & 127 deletions
Original file line numberDiff line numberDiff line change
@@ -1110,62 +1110,31 @@ internal PdfTrailer ReadTrailer()
11101110
if (idx == -1)
11111111
throw new Exception("The StartXRef table could not be found, the file cannot be opened.");
11121112

1113-
ReadSymbol(Symbol.StartXRef);
1114-
int startxref = _lexer.Position = ReadInteger();
1115-
1116-
// Must be before the first 'goto valid_xref;' statement.
1117-
int xref_offset = 0;
1118-
1119-
// Check for valid startxref
1120-
if (IsValidXref())
1121-
{
1122-
goto valid_xref;
1123-
}
1124-
1125-
// If we reach this point, we have an invalid startxref
1126-
// First look for bytes preceding "%PDF-". Some pdf producers ignore these.
1127-
if (length >= 1024)
1128-
{
1129-
// "%PDF-" should be in this range
1130-
string header = _lexer.ReadRawString(0, 1024);
1131-
idx = header.IndexOf("%PDF-", StringComparison.Ordinal);
1132-
}
1133-
else
1134-
{
1135-
string header = _lexer.ReadRawString(0, length);
1136-
idx = header.IndexOf("%PDF-", StringComparison.Ordinal);
1137-
}
1138-
1139-
if (idx > 0)
1140-
{
1141-
//_lexer.ByteOffset = idx;
1142-
_lexer.Position = startxref + idx;
1143-
if (IsValidXref())
1144-
{
1145-
xref_offset = idx;
1146-
goto valid_xref;
1147-
}
1148-
}
1149-
1150-
_lexer.Position = startxref;
1113+
Symbol s = ReadSymbol(Symbol.StartXRef);
1114+
_lexer.Position = ReadInteger();
1115+
1116+
// Producer: iText1.3.1 by lowagie.com (based on itext-paulo-154)
1117+
// Problem: certificate data added to the start of file. Invalid startxref byte offset
1118+
// Fix: We could search for a valid xref table, but all byte offsets are probably incorrect.
1119+
// Probably best to just recreate the xref table.
1120+
// https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf 7.5.5
1121+
11511122
// Check for valid startxref
11521123
if (!IsValidXref())
11531124
{
1154-
PdfTrailer trailer = TryRecreateXRefTableAndTrailer(_document._irefTable);
1155-
if (trailer == null)
1125+
PdfTrailer trailer;
1126+
bool bSuccess = TryRecreateXRefTableAndTrailer(out trailer, _document);
1127+
if (!bSuccess)
11561128
throw new Exception("Could not recreate the xref table or trailer.");
11571129

11581130
_document._trailer = trailer;
11591131
return _document._trailer;
11601132
}
1161-
1162-
valid_xref:
1163-
_lexer.Position = startxref + xref_offset;
1164-
1133+
11651134
// Read all trailers.
11661135
while (true)
11671136
{
1168-
PdfTrailer trailer = ReadXRefTableAndTrailer(_document._irefTable, xref_offset);
1137+
PdfTrailer trailer = ReadXRefTableAndTrailer(_document._irefTable);
11691138
// 1st trailer seems to be the best.
11701139
if (_document._trailer == null)
11711140
_document._trailer = trailer;
@@ -1186,114 +1155,101 @@ internal PdfTrailer ReadTrailer()
11861155
/// <returns></returns>
11871156
private bool IsValidXref()
11881157
{
1189-
int length = _lexer.PdfLength;
11901158
int position = _lexer.Position;
1191-
// Make sure not inside a stream.
1192-
1193-
string content = "";
1194-
int content_pos = position;
1195-
while (true)
1159+
try
11961160
{
1197-
// look for stream and endstream in 1k chunks.
1198-
int read_length = Math.Min(1024, length - content_pos);
1199-
content += _lexer.ReadRawString(content_pos, read_length);
1200-
1201-
int ss = content.IndexOf("stream", StringComparison.Ordinal);
1202-
int es = content.IndexOf("endstream", StringComparison.Ordinal);
1203-
int eof = content.IndexOf("%%EOF", StringComparison.Ordinal);
1161+
Symbol symbol = ScanNextToken();
1162+
if (symbol == Symbol.XRef) // xref table
1163+
{
1164+
_lexer.Position = position;
1165+
return true;
1166+
}
12041167

1205-
if (ss != es)
1168+
if (symbol == Symbol.Integer) // Linearization parameter dictionary
12061169
{
1207-
if (ss == -1)
1208-
{
1209-
if (eof != -1 && eof < es)
1210-
break;
1211-
else
1212-
return false;
1213-
}
1214-
else if (es == -1)
1215-
break;
1216-
else if (ss < es)
1217-
break;
1218-
else if (ss > es)
1170+
// Just because we have an integer, doesn't mean the startxref is actually valid
1171+
if (ScanNextToken() == Symbol.Integer && ScanNextToken() == Symbol.Obj)
12191172
{
1220-
if (eof != -1 && eof < ss && eof < es)
1221-
break;
1222-
else
1223-
return false;
1173+
_lexer.Position = position;
1174+
return true;
12241175
}
12251176
}
12261177

1227-
if (eof != -1)
1228-
break;
1229-
1230-
content_pos = content_pos + read_length;
1231-
if (content_pos + read_length >= length)
1232-
{
1233-
// reached the end of the document without finding either.
1234-
break;
1235-
}
1178+
_lexer.Position = position;
1179+
return false;
12361180
}
1181+
catch
1182+
{
1183+
_lexer.Position = position;
1184+
return false;
1185+
}
1186+
}
12371187

1238-
_lexer.Position = position;
1188+
private bool TryRecreateXRefTableAndTrailer(out PdfTrailer trailer, PdfDocument document)
1189+
{
1190+
PdfCrossReferenceTable xrefTable = document._irefTable;
1191+
trailer = null;
1192+
int length = _lexer.PdfLength;
12391193

1240-
Symbol symbol = ScanNextToken();
1241-
if (symbol == Symbol.XRef)
1194+
// Because some PDF producers put arbitrary data before the header, we need to find a proper starting position.
1195+
// e.g. Producer: iText1.3.1 by lowagie.com (based on itext-paulo-154)
1196+
int startIdx = -1;
1197+
string contents = "";
1198+
for (int i = 0, pos = 0; startIdx == -1 && pos < length; i++, pos = 1024 * i)
12421199
{
1243-
return true;
1200+
int len = Math.Min(1024, length - pos);
1201+
contents = $"{contents}{_lexer.ReadRawString(pos, len)}";
1202+
startIdx = contents.IndexOf("%PDF-1.", StringComparison.Ordinal);
12441203
}
12451204

1246-
if (symbol == Symbol.Integer)
1205+
if (startIdx == -1)
1206+
return false;
1207+
1208+
// Don't look past the last %%EOF marker
1209+
int endIdx = -1;
1210+
contents = "";
1211+
for (int i = 1; endIdx == -1; i++)
12471212
{
1248-
// Just because we have an integer, doesn't mean the startxref is actually valid
1249-
if (ScanNextToken() == Symbol.Integer && ScanNextToken() == Symbol.Obj)
1213+
int pos = length - (1024 * i);
1214+
int len = 1024;
1215+
1216+
if (pos < 0)
12501217
{
1251-
return true;
1218+
len = len + pos;
1219+
pos = 0;
12521220
}
1253-
}
12541221

1255-
return false;
1256-
}
1257-
1258-
private PdfTrailer TryRecreateXRefTableAndTrailer(PdfCrossReferenceTable xrefTable)
1259-
{
1260-
// Let's first check for a trailer
1261-
int length = _lexer.PdfLength;
1262-
1263-
int trail_idx;
1264-
if (length >= 1024)
1265-
{
1266-
string trail = _lexer.ReadRawString(length - 1024, 1024);
1267-
trail_idx = trail.LastIndexOf("trailer", StringComparison.Ordinal);
1268-
_lexer.Position = length - 1024 + trail_idx;
1269-
}
1270-
else
1271-
{
1272-
string trail = _lexer.ReadRawString(0, length);
1273-
trail_idx = trail.LastIndexOf("trailer", StringComparison.Ordinal);
1274-
_lexer.Position = trail_idx;
1275-
}
1222+
contents = $"{_lexer.ReadRawString(pos, len)}{contents}";
1223+
endIdx = contents.LastIndexOf("%%EOF", StringComparison.Ordinal);
1224+
if (endIdx != -1)
1225+
endIdx = length - contents.Length + endIdx;
12761226

1277-
if (trail_idx == -1)
1278-
return null; //TODO: Look for compressed xref table that should contain the trailer
1227+
if (pos == 0)
1228+
break;
1229+
}
12791230

1280-
ReadSymbol(Symbol.Trailer);
1281-
ReadSymbol(Symbol.BeginDictionary);
1282-
PdfTrailer trailer = new PdfTrailer(_document);
1283-
ReadDictionary(trailer, false);
1231+
if (endIdx == -1)
1232+
return false;
12841233

1234+
endIdx = endIdx + 5; // This should be where Eof char is
1235+
12851236
// Recreate the xref table.
12861237
//
12871238
// When symbol == Symbol.Obj
12881239
// [0] - generation
12891240
// [1] - id
12901241
TokenInfo[] token_stack = new TokenInfo[2];
1291-
_lexer.Position = 0;
1242+
1243+
_lexer.Position = startIdx;
12921244
while (true)
12931245
{
12941246
Symbol symbol = ScanNextToken(out int position);
1295-
if (symbol == Symbol.Eof)
1296-
break;
1247+
if (symbol == Symbol.Eof)
1248+
{
1249+
// Check if it's the last EOF
1250+
if (_lexer.Position >= endIdx)
1251+
break; // This is the end of the file.
1252+
}
12971253

12981254
// we need to skip over streams entirely
12991255
if (symbol == Symbol.BeginStream)
@@ -1327,21 +1283,45 @@ private PdfTrailer TryRecreateXRefTableAndTrailer(PdfCrossReferenceTable xrefTab
13271283
token_stack[0].Symbol == Symbol.Integer &&
13281284
token_stack[1].Symbol == Symbol.Integer)
13291285
{
1286+
// TODO: Do we only need the most recent revision?
13301287
PdfObjectID objectID = new PdfObjectID(token_stack[1].Number, token_stack[0].Number);
13311288
if (!xrefTable.Contains(objectID))
13321289
xrefTable.Add(new PdfReference(objectID, token_stack[1].Position));
13331290
//ReadObject(null, objectID, false, false); // Can't do this because the object value will never be set after
13341291
//SkipCharsUntil(Symbol.EndObj); // Can't do this because streams will cause exceptions
13351292
}
13361293

1337-
token_stack[1] = token_stack[0];
1294+
token_stack[1] = token_stack[0];
13381295
TokenInfo token_info = new TokenInfo { Symbol = symbol, Position = position };
13391296
if (symbol == Symbol.Integer)
13401297
token_info.Number = _lexer.TokenToInteger;
13411298
token_stack[0] = token_info;
13421299
}
13431300

1344-
return trailer;
1301+
// find the root.
1302+
// foreach (var reference in xrefTable.AllReferences)
1303+
// {
1304+
// PdfObject obj = ReadObject(null, reference.ObjectID, false, false);
1305+
// if (obj is PdfDictionary dObj)
1306+
// {
1307+
// if (dObj.Elements[PdfCatalog.Keys.Type] as PdfName == "/Catalog")
1308+
// {
1309+
// PdfCatalog catalog = new PdfCatalog(dObj);
1310+
// }
1311+
// }
1312+
// }
1313+
1314+
1315+
1316+
1317+
1318+
1319+
1320+
1321+
trailer = new PdfTrailer(_document);
1322+
trailer.ConstructFromDocument(this);
1323+
1324+
return true;
13451325
}
13461326

13471327
struct TokenInfo
@@ -1354,7 +1334,7 @@ struct TokenInfo
13541334
/// <summary>
13551335
/// Reads cross reference table(s) and trailer(s).
13561336
/// </summary>
1357-
private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable, int xrefOffset)
1337+
private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable)
13581338
{
13591339
Debug.Assert(xrefTable != null);
13601340

@@ -1372,7 +1352,7 @@ private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable, int
13721352
int length = ReadInteger();
13731353
for (int id = start; id < start + length; id++)
13741354
{
1375-
int position = ReadInteger() + xrefOffset;
1355+
int position = ReadInteger();
13761356
int generation = ReadInteger();
13771357
ReadSymbol(Symbol.Keyword);
13781358
string token = _lexer.Token;

0 commit comments

Comments
 (0)