Commit 0d414f7

indexer optimize: commit only if added
1 parent dd8d2e1 commit 0d414f7

File tree: 2 files changed (+24, -19 lines changed)

src/indexer.ts
src/util/state.ts

src/indexer.ts

Lines changed: 13 additions & 13 deletions
@@ -60,7 +60,10 @@ export class CrawlIndexer {
     const redis = await initRedisWaitForSuccess(params.redisDedupeUrl);
     const dedupeIndex = new RedisDedupeIndex(redis, "");

-    for await (const entry of this.iterWACZ(params.sourceUrl)) {
+    for await (const entry of this.iterWACZ({
+      url: params.sourceUrl,
+      name: params.sourceCrawlId || params.sourceUrl,
+    })) {
       await dedupeIndex.queueImportSource(entry.name, JSON.stringify(entry));
     }

@@ -160,8 +163,7 @@
       }

       if (url && date && hash) {
-        await dedupeIndex.addHashDupe(hash, url, date, crawlId);
-        await dedupeIndex.addImportedForCrawl(hash, crawlId);
+        await dedupeIndex.addHashDupe(hash, url, date, crawlId, true);
       } else {
         logger.warn("Skipping invalid CDXJ, data missing", {
           url,
@@ -177,8 +179,10 @@
     logger.debug("Processed", { count });
   }

-  async *iterWACZ(url: string, name?: string): AsyncIterable<DedupeIndexEntry> {
-    let path: string = url;
+  async *iterWACZ(entry: DedupeIndexEntry): AsyncIterable<DedupeIndexEntry> {
+    const { name } = entry;
+    let { url } = entry;
+    let path = url;

     try {
       path = new URL(url).pathname;
@@ -187,7 +191,8 @@
     }

     if (path.endsWith(".wacz")) {
-      yield { name: basename(name || url), url };
+      console.log({ ...entry, name: basename(name || url) });
+      yield { ...entry, name: basename(name || url) };
     } else if (path.endsWith(".json")) {
       if (!url.startsWith("http://") && !url.startsWith("https://")) {
         const blob = await openAsBlob(url);
@@ -198,13 +203,8 @@
       const json = await resp.json();

       for (const entry of json.resources) {
-        const url = entry.path;
-        if (url && url.endsWith(".wacz")) {
-          const { size, hash, crawlId, name } = entry;
-          yield { crawlId, name, url, size, hash };
-        } else {
-          yield* this.iterWACZ(entry.path, entry.name);
-        }
+        entry.url = entry.path;
+        yield* this.iterWACZ(entry);
       }
     } else {
       logger.warn("Unknown source", { url }, "replay");

src/util/state.ts

Lines changed: 11 additions & 6 deletions
@@ -299,11 +299,20 @@ export class RedisDedupeIndex {
     return { origUrl: val[2], origDate: val[1], index: val[0], crawlId };
   }

-  async addHashDupe(hash: string, url: string, date: string, crawlId?: string) {
+  async addHashDupe(
+    hash: string,
+    url: string,
+    date: string,
+    crawlId?: string,
+    commit = false,
+  ) {
     date = date.replace(/[^\d]/g, "");
     hash = hash.split(":").at(-1)!;
     const val = `${this.dedupeKeyIndex} ${date} ${url}`;
-    await this.dedupeRedis.hsetnx(`h:${crawlId || this.crawlId}`, hash, val);
+    crawlId = crawlId || this.crawlId;
+    if ((await this.dedupeRedis.hsetnx(`h:${crawlId}`, hash, val)) && commit) {
+      await this.dedupeRedis.hsetnx(DUPE_ALL_HASH_KEY, hash, crawlId);
+    }
   }

   // IMPORT
@@ -316,10 +325,6 @@
     await this.dedupeRedis.lpush(this.sourceQ, data);
   }

-  async addImportedForCrawl(hash: string, crawlId: string) {
-    await this.dedupeRedis.hset(DUPE_ALL_HASH_KEY, hash, crawlId);
-  }
-
   async addImportedSourceForDedupe(key: string, entry: DedupeSourceEntry) {
     return (
       (await this.dedupeRedis.rpush(`c:${key}:wacz`, JSON.stringify(entry))) - 1
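
The new commit flag is what the commit title refers to: HSETNX resolves to 1 only when the field was newly created, so the write to the global DUPE_ALL_HASH_KEY map (previously an unconditional hset in the separate addImportedForCrawl) now happens only for hashes that were actually added. A rough standalone illustration of that gating, assuming an ioredis client; the key names "h:crawl-a" and "alldupes" and the sample values are illustrative only, not the repository's real constants:

import Redis from "ioredis";

async function demo() {
  const redis = new Redis(process.env.REDIS_URL || "redis://localhost:6379");

  // First time this hash is seen for the crawl: HSETNX returns 1 (on a fresh key),
  // so the global index also gets the hash -> crawlId mapping.
  const first = await redis.hsetnx("h:crawl-a", "abc123", "0 20240101 https://example.com/");
  if (first) {
    await redis.hsetnx("alldupes", "abc123", "crawl-a");
  }

  // Re-importing the same hash: HSETNX returns 0, and the second
  // round trip to the global index is skipped entirely.
  const second = await redis.hsetnx("h:crawl-a", "abc123", "0 20240102 https://example.com/dup");
  console.log({ first, second }); // { first: 1, second: 0 }

  await redis.quit();
}

demo().catch(console.error);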
