@@ -9,6 +9,7 @@ import { initRedisWaitForSuccess } from "./util/redis.js";
99import { AsyncIterReader } from "warcio" ;
1010import { RedisDedupeIndex } from "./util/state.js" ;
1111import { basename } from "node:path" ;
12+ import { sleep } from "./util/timing.js" ;
1213
1314export type DedupeIndexEntry = {
1415 name : string ;
@@ -42,6 +43,13 @@ export class CrawlIndexer {
4243 type : "string" ,
4344 required : false ,
4445 } ,
46+
47+ removing : {
48+ describe : "If set, also remove unused crawls/hashes from index" ,
49+ type : "boolean" ,
50+ required : false ,
51+ default : false ,
52+ } ,
4553 } )
4654 . parseSync ( ) ;
4755 }
@@ -62,16 +70,24 @@ export class CrawlIndexer {
6270
6371 for await ( const entry of this . iterWACZ ( {
6472 url : params . sourceUrl ,
65- name : params . sourceCrawlId || params . sourceUrl ,
73+ name : basename ( params . sourceUrl ) ,
74+ crawlId : params . sourceCrawlId ,
6675 } ) ) {
6776 await dedupeIndex . queueImportSource ( entry . name , JSON . stringify ( entry ) ) ;
77+ if ( params . removing && entry . crawlId ) {
78+ await dedupeIndex . markNotRemoved ( entry . crawlId ) ;
79+ }
6880 }
6981
7082 let count = 0 ;
83+ let total = 0 ;
7184 let res ;
7285
7386 while ( ( res = await dedupeIndex . nextQueuedImportSource ( ) ) ) {
74- const { name, entry, total } = res ;
87+ const { name, entry, remaining } = res ;
88+ if ( ! total ) {
89+ total = remaining ;
90+ }
7591 const { url, crawlId, size, hash } = JSON . parse (
7692 entry ,
7793 ) as DedupeIndexEntry ;
@@ -107,7 +123,15 @@ export class CrawlIndexer {
107123 await dedupeIndex . markImportSourceDone ( name , crawlIdReal ) ;
108124 }
109125
126+ if ( params . removing ) {
127+ const removeset = await dedupeIndex . getRemoveSet ( ) ;
128+ if ( removeset . size > 0 ) {
129+ await dedupeIndex . removeCrawlIds ( removeset ) ;
130+ }
131+ }
132+
110133 logger . info ( "Done!" ) ;
134+ await sleep ( 30 ) ;
111135 await dedupeIndex . markImportFinishedTS ( ) ;
112136 process . exit ( ExitCodes . Success ) ;
113137 }
@@ -180,7 +204,6 @@ export class CrawlIndexer {
180204 }
181205
182206 async * iterWACZ ( entry : DedupeIndexEntry ) : AsyncIterable < DedupeIndexEntry > {
183- const { name } = entry ;
184207 let { url } = entry ;
185208 let path = url ;
186209
@@ -191,8 +214,7 @@ export class CrawlIndexer {
191214 }
192215
193216 if ( path . endsWith ( ".wacz" ) ) {
194- console . log ( { ...entry , name : basename ( name || url ) } ) ;
195- yield { ...entry , name : basename ( name || url ) } ;
217+ yield entry ;
196218 } else if ( path . endsWith ( ".json" ) ) {
197219 if ( ! url . startsWith ( "http://" ) && ! url . startsWith ( "https://" ) ) {
198220 const blob = await openAsBlob ( url ) ;
0 commit comments