@@ -1427,7 +1427,8 @@ store_warc_record (const char *uri, const char *date, const char *uuid,
14271427 checksum and record ID fields. */
14281428static bool
14291429warc_parse_cdx_header (char * lineptr , int * field_num_original_url ,
1430- int * field_num_checksum , int * field_num_record_id )
1430+ int * field_num_date , int * field_num_checksum ,
1431+ int * field_num_record_id )
14311432{
14321433 char * token ;
14331434 char * save_ptr ;
@@ -1451,6 +1452,9 @@ warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
14511452 case 'a' :
14521453 * field_num_original_url = field_num ;
14531454 break ;
1455+ case 'b' :
1456+ * field_num_date = field_num ;
1457+ break ;
14541458 case 'k' :
14551459 * field_num_checksum = field_num ;
14561460 break ;
@@ -1464,16 +1468,19 @@ warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
14641468 }
14651469
14661470 return * field_num_original_url != -1
1471+ && * field_num_date != -1
14671472 && * field_num_checksum != -1
14681473 && * field_num_record_id != -1 ;
14691474}
14701475
14711476/* Parse the CDX record and add it to the warc_dedup_table hash table. */
14721477static void
14731478warc_process_cdx_line (char * lineptr , int field_num_original_url ,
1474- int field_num_checksum , int field_num_record_id )
1479+ int field_num_date , int field_num_checksum ,
1480+ int field_num_record_id )
14751481{
14761482 char * original_url = NULL ;
1483+ char * date = NULL ;
14771484 char * checksum = NULL ;
14781485 char * record_id = NULL ;
14791486 char * token ;
@@ -1487,6 +1494,8 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url,
14871494 char * * val ;
14881495 if (field_num == field_num_original_url )
14891496 val = & original_url ;
1497+ else if (field_num == field_num_date )
1498+ val = & date ;
14901499 else if (field_num == field_num_checksum )
14911500 val = & checksum ;
14921501 else if (field_num == field_num_record_id )
@@ -1501,7 +1510,10 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url,
15011510 field_num ++ ;
15021511 }
15031512
1504- if (original_url != NULL && checksum != NULL && record_id != NULL )
1513+ if (original_url != NULL
1514+ && date != NULL
1515+ && checksum != NULL
1516+ && record_id != NULL )
15051517 {
15061518 /* For some extra efficiency, we decode the base32 encoded
15071519 checksum value. This should produce exactly SHA1_DIGEST_SIZE
@@ -1515,12 +1527,13 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url,
15151527 if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE )
15161528 {
15171529 /* This is a valid line with a valid checksum. */
1518- store_warc_record (original_url , NULL , record_id , checksum_v );
1530+ store_warc_record (original_url , date , record_id , checksum_v );
15191531 xfree (checksum_v );
15201532 }
15211533 else
15221534 {
15231535 xfree (original_url );
1536+ xfree (date );
15241537 xfree (checksum_v );
15251538 xfree (record_id );
15261539 }
@@ -1529,6 +1542,7 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url,
15291542 {
15301543 xfree (checksum );
15311544 xfree (original_url );
1545+ xfree (date );
15321546 xfree (record_id );
15331547 }
15341548}
@@ -1543,6 +1557,7 @@ warc_load_cdx_dedup_file (void)
15431557 size_t n = 0 ;
15441558 ssize_t line_length ;
15451559 int field_num_original_url = -1 ;
1560+ int field_num_date = -1 ;
15461561 int field_num_checksum = -1 ;
15471562 int field_num_record_id = -1 ;
15481563
@@ -1558,7 +1573,8 @@ warc_load_cdx_dedup_file (void)
15581573 line_length = getline (& lineptr , & n , f );
15591574 if (line_length != -1 )
15601575 warc_parse_cdx_header (lineptr , & field_num_original_url ,
1561- & field_num_checksum , & field_num_record_id );
1576+ & field_num_date , & field_num_checksum ,
1577+ & field_num_record_id );
15621578
15631579 /* If the file contains all four required fields, read the complete file. */
15641580 if (field_num_original_url == -1
@@ -1568,6 +1584,9 @@ warc_load_cdx_dedup_file (void)
15681584 if (field_num_original_url == -1 )
15691585 logprintf (LOG_NOTQUIET ,
15701586_ ("CDX file does not list original urls. (Missing column 'a'.)\n" ));
1587+ if (field_num_date == -1 )
1588+ logprintf (LOG_NOTQUIET ,
1589+ _ ("CDX file does not list dates. (Missing column 'b'.)\n" ));
15711590 if (field_num_checksum == -1 )
15721591 logprintf (LOG_NOTQUIET ,
15731592_ ("CDX file does not list checksums. (Missing column 'k'.)\n" ));
@@ -1587,7 +1606,8 @@ _("CDX file does not list record ids. (Missing column 'u'.)\n"));
15871606 if (line_length != -1 )
15881607 {
15891608 warc_process_cdx_line (lineptr , field_num_original_url ,
1590- field_num_checksum , field_num_record_id );
1609+ field_num_date , field_num_checksum ,
1610+ field_num_record_id );
15911611 }
15921612
15931613 }
0 commit comments