@@ -60,7 +60,10 @@ export class CrawlIndexer {
6060 const redis = await initRedisWaitForSuccess ( params . redisDedupeUrl ) ;
6161 const dedupeIndex = new RedisDedupeIndex ( redis , "" ) ;
6262
63- for await ( const entry of this . iterWACZ ( params . sourceUrl ) ) {
63+ for await ( const entry of this . iterWACZ ( {
64+ url : params . sourceUrl ,
65+ name : params . sourceCrawlId || params . sourceUrl ,
66+ } ) ) {
6467 await dedupeIndex . queueImportSource ( entry . name , JSON . stringify ( entry ) ) ;
6568 }
6669
@@ -160,8 +163,7 @@ export class CrawlIndexer {
160163 }
161164
162165 if ( url && date && hash ) {
163- await dedupeIndex . addHashDupe ( hash , url , date , crawlId ) ;
164- await dedupeIndex . addImportedForCrawl ( hash , crawlId ) ;
166+ await dedupeIndex . addHashDupe ( hash , url , date , crawlId , true ) ;
165167 } else {
166168 logger . warn ( "Skipping invalid CDXJ, data missing" , {
167169 url,
@@ -177,8 +179,10 @@ export class CrawlIndexer {
177179 logger . debug ( "Processed" , { count } ) ;
178180 }
179181
180- async * iterWACZ ( url : string , name ?: string ) : AsyncIterable < DedupeIndexEntry > {
181- let path : string = url ;
182+ async * iterWACZ ( entry : DedupeIndexEntry ) : AsyncIterable < DedupeIndexEntry > {
183+ const { name } = entry ;
184+ let { url } = entry ;
185+ let path = url ;
182186
183187 try {
184188 path = new URL ( url ) . pathname ;
@@ -187,7 +191,8 @@ export class CrawlIndexer {
187191 }
188192
189193 if ( path . endsWith ( ".wacz" ) ) {
190- yield { name : basename ( name || url ) , url } ;
194+ console . log ( { ...entry , name : basename ( name || url ) } ) ;
195+ yield { ...entry , name : basename ( name || url ) } ;
191196 } else if ( path . endsWith ( ".json" ) ) {
192197 if ( ! url . startsWith ( "http://" ) && ! url . startsWith ( "https://" ) ) {
193198 const blob = await openAsBlob ( url ) ;
@@ -198,13 +203,8 @@ export class CrawlIndexer {
198203 const json = await resp . json ( ) ;
199204
200205 for ( const entry of json . resources ) {
201- const url = entry . path ;
202- if ( url && url . endsWith ( ".wacz" ) ) {
203- const { size , hash , crawlId , name } = entry ;
204- yield { crawlId , name , url , size , hash } ;
205- } else {
206- yield * this . iterWACZ ( entry . path , entry . name ) ;
207- }
206+ entry . url = entry . path ;
207+ yield * this . iterWACZ ( entry ) ;
208208 }
209209 } else {
210210 logger . warn ( "Unknown source" , { url } , "replay" ) ;
0 commit comments