@@ -850,6 +850,42 @@ warc_timestamp (char *timestamp, size_t timestamp_size)
850850 return timestamp ;
851851}
852852
/* Checks if the timestamp passed is a valid CDX-style timestamp.

   TIMESTAMP is accepted when it consists of exactly 14 digits
   (as classified by c_isdigit) followed by the terminating NUL.
   Only the shape is checked; the individual date and time fields
   are not range-checked.  A NULL pointer is rejected.

   Returns true if TIMESTAMP has that shape, false otherwise.  */
static bool
is_valid_cdx_timestamp (const char *timestamp)
{
  /* Number of digits in a CDX timestamp.  */
  enum { CDX_TIMESTAMP_LENGTH = 14 };

  if (timestamp == NULL)
    return false;

  /* The loop stops at the first non-digit.  For a string shorter than
     CDX_TIMESTAMP_LENGTH that is its terminating NUL (this relies on
     c_isdigit rejecting NUL), so nothing past the end is read.  */
  for (size_t i = 0; i < CDX_TIMESTAMP_LENGTH; i++)
    {
      if (!c_isdigit (timestamp[i]))
        return false;
    }

  /* Reject anything longer than CDX_TIMESTAMP_LENGTH characters.  */
  return timestamp[CDX_TIMESTAMP_LENGTH] == '\0';
}
867+
/* Converts a CDX-style timestamp (YYYYMMDDhhmmss) into the form
   YYYY-MM-DDThh:mm:ssZ.

   The first 14 characters of CDX_TIMESTAMP are copied verbatim into
   their positions; they are not validated here, so the caller must
   pass a string of at least that length (see is_valid_cdx_timestamp).

   Returns a NUL-terminated string of 20 characters, freshly allocated
   with xmalloc.  */
static char *
cdx_to_warc_timestamp (const char *cdx_timestamp)
{
  /* The punctuation in this template is kept as-is; the letter
     placeholders are overwritten with the digits from CDX_TIMESTAMP.
     sizeof includes the terminating NUL, so the buffer is sized and
     terminated by the template itself.  */
  static const char layout[] = "YYYY-MM-DDThh:mm:ssZ";

  /* Not named warc_timestamp: that would shadow the function of the
     same name in this file.  */
  char *result = xmalloc (sizeof layout);

  memcpy (result, layout, sizeof layout);
  memcpy (result, cdx_timestamp, 4);            /* YYYY */
  memcpy (result + 5, cdx_timestamp + 4, 2);    /* MM */
  memcpy (result + 8, cdx_timestamp + 6, 2);    /* DD */
  memcpy (result + 11, cdx_timestamp + 8, 2);   /* hh */
  memcpy (result + 14, cdx_timestamp + 10, 2);  /* mm */
  memcpy (result + 17, cdx_timestamp + 12, 2);  /* ss */

  return result;
}
888+
853889/* Fills urn_str with a UUID in the format required
854890 for the WARC-Record-Id header.
855891 The string will be 47 characters long. */
@@ -1427,7 +1463,8 @@ store_warc_record (const char *uri, const char *date, const char *uuid,
14271463 checksum and record ID fields. */
14281464static bool
14291465warc_parse_cdx_header (char * lineptr , int * field_num_original_url ,
1430- int * field_num_checksum , int * field_num_record_id )
1466+ int * field_num_date , int * field_num_checksum ,
1467+ int * field_num_record_id )
14311468{
14321469 char * token ;
14331470 char * save_ptr ;
@@ -1451,6 +1488,9 @@ warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
14511488 case 'a' :
14521489 * field_num_original_url = field_num ;
14531490 break ;
1491+ case 'b' :
1492+ * field_num_date = field_num ;
1493+ break ;
14541494 case 'k' :
14551495 * field_num_checksum = field_num ;
14561496 break ;
@@ -1464,16 +1504,19 @@ warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
14641504 }
14651505
14661506 return * field_num_original_url != -1
1507+ && * field_num_date != -1
14671508 && * field_num_checksum != -1
14681509 && * field_num_record_id != -1 ;
14691510}
14701511
14711512/* Parse the CDX record and add it to the warc_dedup_table hash table. */
1472- static void
1513+ static bool
14731514warc_process_cdx_line (char * lineptr , int field_num_original_url ,
1474- int field_num_checksum , int field_num_record_id )
1515+ int field_num_date , int field_num_checksum ,
1516+ int field_num_record_id )
14751517{
14761518 char * original_url = NULL ;
1519+ char * date = NULL ;
14771520 char * checksum = NULL ;
14781521 char * record_id = NULL ;
14791522 char * token ;
@@ -1487,6 +1530,8 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url,
14871530 char * * val ;
14881531 if (field_num == field_num_original_url )
14891532 val = & original_url ;
1533+ else if (field_num == field_num_date )
1534+ val = & date ;
14901535 else if (field_num == field_num_checksum )
14911536 val = & checksum ;
14921537 else if (field_num == field_num_record_id )
@@ -1501,26 +1546,44 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url,
15011546 field_num ++ ;
15021547 }
15031548
1504- if (original_url != NULL && checksum != NULL && record_id != NULL )
1549+ if (original_url != NULL
1550+ && date != NULL
1551+ && checksum != NULL
1552+ && record_id != NULL )
15051553 {
1554+ if (!is_valid_cdx_timestamp (date ))
1555+ {
1556+ logprintf (LOG_NOTQUIET ,
1557+ _ ("CDX line contains invalid timestamp (%s).\n" ),
1558+ quote (date ));
1559+ xfree (original_url );
1560+ xfree (date );
1561+ xfree (checksum );
1562+ xfree (record_id );
1563+ return false;
1564+ }
1565+
15061566 /* For some extra efficiency, we decode the base32 encoded
15071567 checksum value. This should produce exactly SHA1_DIGEST_SIZE
15081568 bytes. */
15091569 idx_t checksum_l ;
1510- char * checksum_v ;
1570+ char * checksum_v , * warc_date ;
15111571 base32_decode_alloc (checksum , strlen (checksum ), & checksum_v ,
15121572 & checksum_l );
15131573 xfree (checksum );
15141574
15151575 if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE )
15161576 {
15171577 /* This is a valid line with a valid checksum. */
1518- store_warc_record (original_url , NULL , record_id , checksum_v );
1578+ warc_date = cdx_to_warc_timestamp (date );
1579+ store_warc_record (original_url , warc_date , record_id , checksum_v );
1580+ xfree (date );
15191581 xfree (checksum_v );
15201582 }
15211583 else
15221584 {
15231585 xfree (original_url );
1586+ xfree (date );
15241587 xfree (checksum_v );
15251588 xfree (record_id );
15261589 }
@@ -1529,8 +1592,11 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url,
15291592 {
15301593 xfree (checksum );
15311594 xfree (original_url );
1595+ xfree (date );
15321596 xfree (record_id );
15331597 }
1598+
1599+ return true;
15341600}
15351601
15361602/* Loads the CDX file from opt.warc_cdx_dedup_filename and fills
@@ -1543,6 +1609,7 @@ warc_load_cdx_dedup_file (void)
15431609 size_t n = 0 ;
15441610 ssize_t line_length ;
15451611 int field_num_original_url = -1 ;
1612+ int field_num_date = -1 ;
15461613 int field_num_checksum = -1 ;
15471614 int field_num_record_id = -1 ;
15481615
@@ -1553,21 +1620,26 @@ warc_load_cdx_dedup_file (void)
15531620 /* The first line should contain the CDX header.
15541621 Format: " CDX x x x x x"
15551622 where x are field type indicators. For our purposes, we only
1556- need 'a' (the original url), 'k ' (the SHA1 checksum) and
1557- 'u' (the WARC record id). */
1623+ need 'a' (the original url), 'b' (the date),
1624+ 'k' (the SHA1 checksum) and 'u' (the WARC record id). */
15581625 line_length = getline (& lineptr , & n , f );
15591626 if (line_length != -1 )
15601627 warc_parse_cdx_header (lineptr , & field_num_original_url ,
1561- & field_num_checksum , & field_num_record_id );
1628+ & field_num_date , & field_num_checksum ,
1629+ & field_num_record_id );
15621630
1563- /* If the file contains all three fields, read the complete file. */
1631+ /* If the file contains all four fields, read the complete file. */
15641632 if (field_num_original_url == -1
1633+ || field_num_date == -1
15651634 || field_num_checksum == -1
15661635 || field_num_record_id == -1 )
15671636 {
15681637 if (field_num_original_url == -1 )
15691638 logprintf (LOG_NOTQUIET ,
15701639_ ("CDX file does not list original urls. (Missing column 'a'.)\n" ));
1640+ if (field_num_date == -1 )
1641+ logprintf (LOG_NOTQUIET ,
1642+ _ ("CDX file does not list dates. (Missing column 'b'.)\n" ));
15711643 if (field_num_checksum == -1 )
15721644 logprintf (LOG_NOTQUIET ,
15731645_ ("CDX file does not list checksums. (Missing column 'k'.)\n" ));
@@ -1586,8 +1658,14 @@ _("CDX file does not list record ids. (Missing column 'u'.)\n"));
15861658 line_length = getline (& lineptr , & n , f );
15871659 if (line_length != -1 )
15881660 {
1589- warc_process_cdx_line (lineptr , field_num_original_url ,
1590- field_num_checksum , field_num_record_id );
1661+ if (!warc_process_cdx_line (lineptr , field_num_original_url ,
1662+ field_num_date , field_num_checksum ,
1663+ field_num_record_id ))
1664+ {
1665+ xfree (lineptr );
1666+ fclose (f );
1667+ return false;
1668+ }
15911669 }
15921670
15931671 }
0 commit comments