Skip to content

Commit 3a99457

Browse files
committed
Fix null pointer read while parsing CDX file
1 parent 8f34226 commit 3a99457

File tree

1 file changed

+26
-6
lines changed

1 file changed

+26
-6
lines changed

src/warc.c

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1427,7 +1427,8 @@ store_warc_record (const char *uri, const char *date, const char *uuid,
14271427
checksum and record ID fields. */
14281428
static bool
14291429
warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
1430-
int *field_num_checksum, int *field_num_record_id)
1430+
int *field_num_date, int *field_num_checksum,
1431+
int *field_num_record_id)
14311432
{
14321433
char *token;
14331434
char *save_ptr;
@@ -1451,6 +1452,9 @@ warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
14511452
case 'a':
14521453
*field_num_original_url = field_num;
14531454
break;
1455+
case 'b':
1456+
*field_num_date = field_num;
1457+
break;
14541458
case 'k':
14551459
*field_num_checksum = field_num;
14561460
break;
@@ -1464,16 +1468,19 @@ warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
14641468
}
14651469

14661470
return *field_num_original_url != -1
1471+
&& *field_num_date != -1
14671472
&& *field_num_checksum != -1
14681473
&& *field_num_record_id != -1;
14691474
}
14701475

14711476
/* Parse the CDX record and add it to the warc_dedup_table hash table. */
14721477
static void
14731478
warc_process_cdx_line (char *lineptr, int field_num_original_url,
1474-
int field_num_checksum, int field_num_record_id)
1479+
int field_num_date, int field_num_checksum,
1480+
int field_num_record_id)
14751481
{
14761482
char *original_url = NULL;
1483+
char *date = NULL;
14771484
char *checksum = NULL;
14781485
char *record_id = NULL;
14791486
char *token;
@@ -1487,6 +1494,8 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url,
14871494
char **val;
14881495
if (field_num == field_num_original_url)
14891496
val = &original_url;
1497+
else if (field_num == field_num_date)
1498+
val = &date;
14901499
else if (field_num == field_num_checksum)
14911500
val = &checksum;
14921501
else if (field_num == field_num_record_id)
@@ -1501,7 +1510,10 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url,
15011510
field_num++;
15021511
}
15031512

1504-
if (original_url != NULL && checksum != NULL && record_id != NULL)
1513+
if (original_url != NULL
1514+
&& date != NULL
1515+
&& checksum != NULL
1516+
&& record_id != NULL)
15051517
{
15061518
/* For some extra efficiency, we decode the base32 encoded
15071519
checksum value. This should produce exactly SHA1_DIGEST_SIZE
@@ -1515,12 +1527,13 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url,
15151527
if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
15161528
{
15171529
/* This is a valid line with a valid checksum. */
1518-
store_warc_record(original_url, NULL, record_id, checksum_v);
1530+
store_warc_record(original_url, date, record_id, checksum_v);
15191531
xfree (checksum_v);
15201532
}
15211533
else
15221534
{
15231535
xfree (original_url);
1536+
xfree (date);
15241537
xfree (checksum_v);
15251538
xfree (record_id);
15261539
}
@@ -1529,6 +1542,7 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url,
15291542
{
15301543
xfree(checksum);
15311544
xfree(original_url);
1545+
xfree(date);
15321546
xfree(record_id);
15331547
}
15341548
}
@@ -1543,6 +1557,7 @@ warc_load_cdx_dedup_file (void)
15431557
size_t n = 0;
15441558
ssize_t line_length;
15451559
int field_num_original_url = -1;
1560+
int field_num_date = -1;
15461561
int field_num_checksum = -1;
15471562
int field_num_record_id = -1;
15481563

@@ -1558,7 +1573,8 @@ warc_load_cdx_dedup_file (void)
15581573
line_length = getline (&lineptr, &n, f);
15591574
if (line_length != -1)
15601575
warc_parse_cdx_header (lineptr, &field_num_original_url,
1561-
&field_num_checksum, &field_num_record_id);
1576+
&field_num_date, &field_num_checksum,
1577+
&field_num_record_id);
15621578

15631579
/* If the file contains all required fields, read the complete file. */
15641580
if (field_num_original_url == -1
@@ -1568,6 +1584,9 @@ warc_load_cdx_dedup_file (void)
15681584
if (field_num_original_url == -1)
15691585
logprintf (LOG_NOTQUIET,
15701586
_("CDX file does not list original urls. (Missing column 'a'.)\n"));
1587+
if (field_num_date == -1)
1588+
logprintf (LOG_NOTQUIET,
1589+
_("CDX file does not list dates. (Missing column 'b'.)\n"));
15711590
if (field_num_checksum == -1)
15721591
logprintf (LOG_NOTQUIET,
15731592
_("CDX file does not list checksums. (Missing column 'k'.)\n"));
@@ -1587,7 +1606,8 @@ _("CDX file does not list record ids. (Missing column 'u'.)\n"));
15871606
if (line_length != -1)
15881607
{
15891608
warc_process_cdx_line (lineptr, field_num_original_url,
1590-
field_num_checksum, field_num_record_id);
1609+
field_num_date, field_num_checksum,
1610+
field_num_record_id);
15911611
}
15921612

15931613
}

0 commit comments

Comments
 (0)