@@ -1110,62 +1110,31 @@ internal PdfTrailer ReadTrailer()
11101110 if ( idx == - 1 )
11111111 throw new Exception ( "The StartXRef table could not be found, the file cannot be opened." ) ;
11121112
1113- ReadSymbol ( Symbol . StartXRef ) ;
1114- int startxref = _lexer . Position = ReadInteger ( ) ;
1115-
1116- // Must be before the first 'goto valid_xref;' statement.
1117- int xref_offset = 0 ;
1118-
1119- // Check for valid startxref
1120- if ( IsValidXref ( ) )
1121- {
1122- goto valid_xref ;
1123- }
1124-
1125- // If we reach this point, we have an invalid startxref
1126- // First look for bytes preceding "%PDF-". Some pdf producers ignore these.
1127- if ( length >= 1024 )
1128- {
1129- // "%PDF-" should be in this range
1130- string header = _lexer . ReadRawString ( 0 , 1024 ) ;
1131- idx = header . IndexOf ( "%PDF-" , StringComparison . Ordinal ) ;
1132- }
1133- else
1134- {
1135- string header = _lexer . ReadRawString ( 0 , length ) ;
1136- idx = header . IndexOf ( "%PDF-" , StringComparison . Ordinal ) ;
1137- }
1138-
1139- if ( idx > 0 )
1140- {
1141- //_lexer.ByteOffset = idx;
1142- _lexer . Position = startxref + idx ;
1143- if ( IsValidXref ( ) )
1144- {
1145- xref_offset = idx ;
1146- goto valid_xref ;
1147- }
1148- }
1149-
1150- _lexer . Position = startxref ;
1113+ Symbol s = ReadSymbol ( Symbol . StartXRef ) ;
1114+ _lexer . Position = ReadInteger ( ) ;
1115+
1116+ // Producer: iText1.3.1 by lowagie.com (based on itext-paulo-154)
1117+ // Problem: certificate data added to the start of file. Invalid startxref byte offset
1118+ // Fix: We could search for a valid xref table, but all byte offsets are probably incorrect.
1119+ // Probably best to just recreate the xref table.
1120+ // https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf 7.5.5
1121+
11511122 // Check for valid startxref
11521123 if ( ! IsValidXref ( ) )
11531124 {
1154- PdfTrailer trailer = TryRecreateXRefTableAndTrailer ( _document . _irefTable ) ;
1155- if ( trailer == null )
1125+ PdfTrailer trailer ;
1126+ bool bSuccess = TryRecreateXRefTableAndTrailer ( out trailer , _document ) ;
1127+ if ( ! bSuccess )
11561128 throw new Exception ( "Could not recreate the xref table or trailer." ) ;
11571129
11581130 _document . _trailer = trailer ;
11591131 return _document . _trailer ;
11601132 }
1161-
1162- valid_xref :
1163- _lexer . Position = startxref + xref_offset ;
1164-
1133+
11651134 // Read all trailers.
11661135 while ( true )
11671136 {
1168- PdfTrailer trailer = ReadXRefTableAndTrailer ( _document . _irefTable , xref_offset ) ;
1137+ PdfTrailer trailer = ReadXRefTableAndTrailer ( _document . _irefTable ) ;
11691138 // 1st trailer seems to be the best.
11701139 if ( _document . _trailer == null )
11711140 _document . _trailer = trailer ;
@@ -1186,114 +1155,101 @@ internal PdfTrailer ReadTrailer()
11861155 /// <returns></returns>
11871156 private bool IsValidXref ( )
11881157 {
1189- int length = _lexer . PdfLength ;
11901158 int position = _lexer . Position ;
1191- // Make sure not inside a stream.
1192-
1193- string content = "" ;
1194- int content_pos = position ;
1195- while ( true )
1159+ try
11961160 {
1197- // look for stream and endstream in 1k chunks.
1198- int read_length = Math . Min ( 1024 , length - content_pos ) ;
1199- content += _lexer . ReadRawString ( content_pos , read_length ) ;
1200-
1201- int ss = content . IndexOf ( "stream" , StringComparison . Ordinal ) ;
1202- int es = content . IndexOf ( "endstream" , StringComparison . Ordinal ) ;
1203- int eof = content . IndexOf ( "%%EOF" , StringComparison . Ordinal ) ;
1161+ Symbol symbol = ScanNextToken ( ) ;
1162+ if ( symbol == Symbol . XRef ) // xref table
1163+ {
1164+ _lexer . Position = position ;
1165+ return true ;
1166+ }
12041167
1205- if ( ss != es )
1168+ if ( symbol == Symbol . Integer ) // Linearization parameter dictionary
12061169 {
1207- if ( ss == - 1 )
1208- {
1209- if ( eof != - 1 && eof < es )
1210- break ;
1211- else
1212- return false ;
1213- }
1214- else if ( es == - 1 )
1215- break ;
1216- else if ( ss < es )
1217- break ;
1218- else if ( ss > es )
1170+ // Just because we have an integer, doesn't mean the startxref is actually valid
1171+ if ( ScanNextToken ( ) == Symbol . Integer && ScanNextToken ( ) == Symbol . Obj )
12191172 {
1220- if ( eof != - 1 && eof < ss && eof < es )
1221- break ;
1222- else
1223- return false ;
1173+ _lexer . Position = position ;
1174+ return true ;
12241175 }
12251176 }
12261177
1227- if ( eof != - 1 )
1228- break ;
1229-
1230- content_pos = content_pos + read_length ;
1231- if ( content_pos + read_length >= length )
1232- {
1233- // reached the end of the document without finding either.
1234- break ;
1235- }
1178+ _lexer . Position = position ;
1179+ return false ;
12361180 }
1181+ catch
1182+ {
1183+ _lexer . Position = position ;
1184+ return false ;
1185+ }
1186+ }
12371187
1238- _lexer . Position = position ;
1188+ private bool TryRecreateXRefTableAndTrailer ( out PdfTrailer trailer , PdfDocument document )
1189+ {
1190+ PdfCrossReferenceTable xrefTable = document . _irefTable ;
1191+ trailer = null ;
1192+ int length = _lexer . PdfLength ;
12391193
1240- Symbol symbol = ScanNextToken ( ) ;
1241- if ( symbol == Symbol . XRef )
1194+ // Because some PDF producers put arbitrary data before the header, we need to find a proper starting position,
1195+ // e.g. Producer: iText1.3.1 by lowagie.com (based on itext-paulo-154)
1196+ int startIdx = - 1 ;
1197+ string contents = "" ;
1198+ for ( int i = 0 , pos = 0 ; startIdx == - 1 && pos < length ; i ++ , pos = 1024 * i )
12421199 {
1243- return true ;
1200+ int len = Math . Min ( 1024 , length - pos ) ;
1201+ contents = $ "{ contents } { _lexer . ReadRawString ( pos , len ) } ";
1202+ startIdx = contents . IndexOf ( "%PDF-1." , StringComparison . Ordinal ) ;
12441203 }
12451204
1246- if ( symbol == Symbol . Integer )
1205+ if ( startIdx == - 1 )
1206+ return false ;
1207+
1208+ // Don't look past the last %%EOF marker
1209+ int endIdx = - 1 ;
1210+ contents = "" ;
1211+ for ( int i = 1 ; endIdx == - 1 ; i ++ )
12471212 {
1248- // Just because we have an integer, doesn't mean the startxref is actually valid
1249- if ( ScanNextToken ( ) == Symbol . Integer && ScanNextToken ( ) == Symbol . Obj )
1213+ int pos = length - ( 1024 * i ) ;
1214+ int len = 1024 ;
1215+
1216+ if ( pos < 0 )
12501217 {
1251- return true ;
1218+ len = len + pos ;
1219+ pos = 0 ;
12521220 }
1253- }
12541221
1255- return false ;
1256- }
1257-
1258- private PdfTrailer TryRecreateXRefTableAndTrailer ( PdfCrossReferenceTable xrefTable )
1259- {
1260- // Let's first check for a trailer
1261- int length = _lexer . PdfLength ;
1262-
1263- int trail_idx ;
1264- if ( length >= 1024 )
1265- {
1266- string trail = _lexer . ReadRawString ( length - 1024 , 1024 ) ;
1267- trail_idx = trail . LastIndexOf ( "trailer" , StringComparison . Ordinal ) ;
1268- _lexer . Position = length - 1024 + trail_idx ;
1269- }
1270- else
1271- {
1272- string trail = _lexer . ReadRawString ( 0 , length ) ;
1273- trail_idx = trail . LastIndexOf ( "trailer" , StringComparison . Ordinal ) ;
1274- _lexer . Position = trail_idx ;
1275- }
1222+ contents = $ "{ _lexer . ReadRawString ( pos , len ) } { contents } ";
1223+ endIdx = contents . LastIndexOf ( "%%EOF" , StringComparison . Ordinal ) ;
1224+ if ( endIdx != - 1 )
1225+ endIdx = length - contents . Length + endIdx ;
12761226
1277- if ( trail_idx == - 1 )
1278- return null ; //TODO: Look for compressed xref table that should contain the trailer
1227+ if ( pos == 0 )
1228+ break ;
1229+ }
12791230
1280- ReadSymbol ( Symbol . Trailer ) ;
1281- ReadSymbol ( Symbol . BeginDictionary ) ;
1282- PdfTrailer trailer = new PdfTrailer ( _document ) ;
1283- ReadDictionary ( trailer , false ) ;
1231+ if ( endIdx == - 1 )
1232+ return false ;
12841233
1234+ endIdx = endIdx + 5 ; // This should be where Eof char is
1235+
12851236 // Recreate the xref table.
12861237 //
12871238 // When symbol == Symbol.Obj
12881239 // [0] - generation
12891240 // [1] - id
12901241 TokenInfo [ ] token_stack = new TokenInfo [ 2 ] ;
1291- _lexer . Position = 0 ;
1242+
1243+ _lexer . Position = startIdx ;
12921244 while ( true )
12931245 {
12941246 Symbol symbol = ScanNextToken ( out int position ) ;
1295- if ( symbol == Symbol . Eof )
1296- break ;
1247+ if ( symbol == Symbol . Eof )
1248+ {
1249+ // Check if it's the last EOF
1250+ if ( _lexer . Position >= endIdx )
1251+ break ; // This is the end of the file.
1252+ }
12971253
12981254 // we need to skip over streams entirely
12991255 if ( symbol == Symbol . BeginStream )
@@ -1327,21 +1283,45 @@ private PdfTrailer TryRecreateXRefTableAndTrailer(PdfCrossReferenceTable xrefTab
13271283 token_stack [ 0 ] . Symbol == Symbol . Integer &&
13281284 token_stack [ 1 ] . Symbol == Symbol . Integer )
13291285 {
1286+ // TODO: Do we only need the most recent revision?
13301287 PdfObjectID objectID = new PdfObjectID ( token_stack [ 1 ] . Number , token_stack [ 0 ] . Number ) ;
13311288 if ( ! xrefTable . Contains ( objectID ) )
13321289 xrefTable . Add ( new PdfReference ( objectID , token_stack [ 1 ] . Position ) ) ;
13331290 //ReadObject(null, objectID, false, false); // Can't do this because the object value will never be set after
13341291 //SkipCharsUntil(Symbol.EndObj); // Can't do this because streams will cause exceptions
13351292 }
13361293
1337- token_stack [ 1 ] = token_stack [ 0 ] ;
1294+ token_stack [ 1 ] = token_stack [ 0 ] ;
13381295 TokenInfo token_info = new TokenInfo { Symbol = symbol , Position = position } ;
13391296 if ( symbol == Symbol . Integer )
13401297 token_info . Number = _lexer . TokenToInteger ;
13411298 token_stack [ 0 ] = token_info ;
13421299 }
13431300
1344- return trailer ;
1301+ // find the root.
1302+ // foreach (var reference in xrefTable.AllReferences)
1303+ // {
1304+ // PdfObject obj = ReadObject(null, reference.ObjectID, false, false);
1305+ // if (obj is PdfDictionary dObj)
1306+ // {
1307+ // if (dObj.Elements[PdfCatalog.Keys.Type] as PdfName == "/Catalog")
1308+ // {
1309+ // PdfCatalog catalog = new PdfCatalog(dObj);
1310+ // }
1311+ // }
1312+ // }
1313+
1314+
1315+
1316+
1317+
1318+
1319+
1320+
1321+ trailer = new PdfTrailer ( _document ) ;
1322+ trailer . ConstructFromDocument ( this ) ;
1323+
1324+ return true ;
13451325 }
13461326
13471327 struct TokenInfo
@@ -1354,7 +1334,7 @@ struct TokenInfo
13541334 /// <summary>
13551335 /// Reads cross reference table(s) and trailer(s).
13561336 /// </summary>
1357- private PdfTrailer ReadXRefTableAndTrailer ( PdfCrossReferenceTable xrefTable , int xrefOffset )
1337+ private PdfTrailer ReadXRefTableAndTrailer ( PdfCrossReferenceTable xrefTable )
13581338 {
13591339 Debug . Assert ( xrefTable != null ) ;
13601340
@@ -1372,7 +1352,7 @@ private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable, int
13721352 int length = ReadInteger ( ) ;
13731353 for ( int id = start ; id < start + length ; id ++ )
13741354 {
1375- int position = ReadInteger ( ) + xrefOffset ;
1355+ int position = ReadInteger ( ) ;
13761356 int generation = ReadInteger ( ) ;
13771357 ReadSymbol ( Symbol . Keyword ) ;
13781358 string token = _lexer . Token ;
0 commit comments