Skip to content

Commit cb35e76

Browse files
committed
Fix null pointer read while parsing CDX file
1 parent 8f34226 commit cb35e76

File tree

1 file changed

+90
-12
lines changed

1 file changed

+90
-12
lines changed

src/warc.c

Lines changed: 90 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -850,6 +850,42 @@ warc_timestamp (char *timestamp, size_t timestamp_size)
850850
return timestamp;
851851
}
852852

853+
/* Returns true when TIMESTAMP consists of exactly fourteen digits
   followed by the terminating NUL, and false otherwise.
   TIMESTAMP must be a non-NULL, NUL-terminated string.  */
static bool
is_valid_cdx_timestamp (char *timestamp)
{
  size_t pos = 0;

  /* Advance over leading digits, but never past the fourteenth
     character.  A shorter string ends the scan at its NUL, so no
     byte beyond the terminator is examined.  */
  while (pos < 14 && c_isdigit (timestamp[pos]))
    pos++;

  /* Index 14 is only inspected once all fourteen digits were seen.  */
  return pos == 14 && timestamp[14] == 0;
}
867+
868+
/* Builds a "YYYY-MM-DDThh:mm:ssZ" string from the first fourteen
   bytes of CDX_TIMESTAMP, which are read as YYYYMMDDhhmmss.  The
   input is copied verbatim and is not validated here.

   Returns a freshly xmalloc'ed buffer of 21 bytes (20 characters
   plus the terminating NUL).  */
static char *
cdx_to_warc_timestamp (char *cdx_timestamp)
{
  /* Separator written in front of MM, DD, hh, mm and ss, in order.  */
  static const char separators[] = "--T::";

  char *result = xmalloc (21);
  char *out = result;
  const char *in = cdx_timestamp;

  /* YYYY */
  memcpy (out, in, 4);
  out += 4;
  in += 4;

  /* Five two-digit fields, each preceded by its separator.  */
  for (size_t field = 0; field < 5; field++)
    {
      *out++ = separators[field];
      *out++ = *in++;
      *out++ = *in++;
    }

  *out++ = 'Z';
  *out = 0;

  return result;
}
888+
853889
/* Fills urn_str with a UUID in the format required
854890
for the WARC-Record-Id header.
855891
The string will be 47 characters long. */
@@ -1427,7 +1463,8 @@ store_warc_record (const char *uri, const char *date, const char *uuid,
14271463
checksum and record ID fields. */
14281464
static bool
14291465
warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
1430-
int *field_num_checksum, int *field_num_record_id)
1466+
int *field_num_date, int *field_num_checksum,
1467+
int *field_num_record_id)
14311468
{
14321469
char *token;
14331470
char *save_ptr;
@@ -1451,6 +1488,9 @@ warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
14511488
case 'a':
14521489
*field_num_original_url = field_num;
14531490
break;
1491+
case 'b':
1492+
*field_num_date = field_num;
1493+
break;
14541494
case 'k':
14551495
*field_num_checksum = field_num;
14561496
break;
@@ -1464,16 +1504,19 @@ warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
14641504
}
14651505

14661506
return *field_num_original_url != -1
1507+
&& *field_num_date != -1
14671508
&& *field_num_checksum != -1
14681509
&& *field_num_record_id != -1;
14691510
}
14701511

14711512
/* Parse the CDX record and add it to the warc_dedup_table hash table. */
1472-
static void
1513+
static bool
14731514
warc_process_cdx_line (char *lineptr, int field_num_original_url,
1474-
int field_num_checksum, int field_num_record_id)
1515+
int field_num_date, int field_num_checksum,
1516+
int field_num_record_id)
14751517
{
14761518
char *original_url = NULL;
1519+
char *date = NULL;
14771520
char *checksum = NULL;
14781521
char *record_id = NULL;
14791522
char *token;
@@ -1487,6 +1530,8 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url,
14871530
char **val;
14881531
if (field_num == field_num_original_url)
14891532
val = &original_url;
1533+
else if (field_num == field_num_date)
1534+
val = &date;
14901535
else if (field_num == field_num_checksum)
14911536
val = &checksum;
14921537
else if (field_num == field_num_record_id)
@@ -1501,26 +1546,44 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url,
15011546
field_num++;
15021547
}
15031548

1504-
if (original_url != NULL && checksum != NULL && record_id != NULL)
1549+
if (original_url != NULL
1550+
&& date != NULL
1551+
&& checksum != NULL
1552+
&& record_id != NULL)
15051553
{
1554+
if (!is_valid_cdx_timestamp (date))
1555+
{
1556+
logprintf (LOG_NOTQUIET,
1557+
_("CDX line contains invalid timestamp (%s).\n"),
1558+
quote (date));
1559+
xfree (original_url);
1560+
xfree (date);
1561+
xfree (checksum);
1562+
xfree (record_id);
1563+
return false;
1564+
}
1565+
15061566
/* For some extra efficiency, we decode the base32 encoded
15071567
checksum value. This should produce exactly SHA1_DIGEST_SIZE
15081568
bytes. */
15091569
idx_t checksum_l;
1510-
char * checksum_v;
1570+
char *checksum_v, *warc_date;
15111571
base32_decode_alloc (checksum, strlen (checksum), &checksum_v,
15121572
&checksum_l);
15131573
xfree (checksum);
15141574

15151575
if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
15161576
{
15171577
/* This is a valid line with a valid checksum. */
1518-
store_warc_record(original_url, NULL, record_id, checksum_v);
1578+
warc_date = cdx_to_warc_timestamp(date);
1579+
store_warc_record(original_url, warc_date, record_id, checksum_v);
1580+
xfree (date);
15191581
xfree (checksum_v);
15201582
}
15211583
else
15221584
{
15231585
xfree (original_url);
1586+
xfree (date);
15241587
xfree (checksum_v);
15251588
xfree (record_id);
15261589
}
@@ -1529,8 +1592,11 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url,
15291592
{
15301593
xfree(checksum);
15311594
xfree(original_url);
1595+
xfree(date);
15321596
xfree(record_id);
15331597
}
1598+
1599+
return true;
15341600
}
15351601

15361602
/* Loads the CDX file from opt.warc_cdx_dedup_filename and fills
@@ -1543,6 +1609,7 @@ warc_load_cdx_dedup_file (void)
15431609
size_t n = 0;
15441610
ssize_t line_length;
15451611
int field_num_original_url = -1;
1612+
int field_num_date = -1;
15461613
int field_num_checksum = -1;
15471614
int field_num_record_id = -1;
15481615

@@ -1553,21 +1620,26 @@ warc_load_cdx_dedup_file (void)
15531620
/* The first line should contain the CDX header.
15541621
Format: " CDX x x x x x"
15551622
where x are field type indicators. For our purposes, we only
1556-
need 'a' (the original url), 'k' (the SHA1 checksum) and
1557-
'u' (the WARC record id). */
1623+
need 'a' (the original url), 'b' (the date),
1624+
'k' (the SHA1 checksum) and 'u' (the WARC record id). */
15581625
line_length = getline (&lineptr, &n, f);
15591626
if (line_length != -1)
15601627
warc_parse_cdx_header (lineptr, &field_num_original_url,
1561-
&field_num_checksum, &field_num_record_id);
1628+
&field_num_date, &field_num_checksum,
1629+
&field_num_record_id);
15621630

1563-
/* If the file contains all three fields, read the complete file. */
1631+
/* If the file contains all four fields, read the complete file. */
15641632
if (field_num_original_url == -1
1633+
|| field_num_date == -1
15651634
|| field_num_checksum == -1
15661635
|| field_num_record_id == -1)
15671636
{
15681637
if (field_num_original_url == -1)
15691638
logprintf (LOG_NOTQUIET,
15701639
_("CDX file does not list original urls. (Missing column 'a'.)\n"));
1640+
if (field_num_date == -1)
1641+
logprintf (LOG_NOTQUIET,
1642+
_("CDX file does not list dates. (Missing column 'b'.)\n"));
15711643
if (field_num_checksum == -1)
15721644
logprintf (LOG_NOTQUIET,
15731645
_("CDX file does not list checksums. (Missing column 'k'.)\n"));
@@ -1586,8 +1658,14 @@ _("CDX file does not list record ids. (Missing column 'u'.)\n"));
15861658
line_length = getline (&lineptr, &n, f);
15871659
if (line_length != -1)
15881660
{
1589-
warc_process_cdx_line (lineptr, field_num_original_url,
1590-
field_num_checksum, field_num_record_id);
1661+
if (!warc_process_cdx_line (lineptr, field_num_original_url,
1662+
field_num_date, field_num_checksum,
1663+
field_num_record_id))
1664+
{
1665+
xfree (lineptr);
1666+
fclose (f);
1667+
return false;
1668+
}
15911669
}
15921670

15931671
}

0 commit comments

Comments
 (0)