@@ -17,7 +17,7 @@ def do_harvest(self, start_date, end_date):
1717 start_date = start_date .date ()
1818
1919 # There is no apparent way to filter by date range, just sort by date.
20- url = furl (self .config .base_url )
20+ url = furl (self .config .base_url + '/catalog' )
2121 url .args ['per_page' ] = 10 # If it gets more active, consider upping to 50 or 100
2222 url .args ['sort' ] = 'system_modified_dtsi+desc'
2323
@@ -30,7 +30,7 @@ def fetch_records(self, url, start_date, end_date):
3030 resp = self .requests .get (furl (url ).set (query_params = {'page' : page }))
3131 soup = BeautifulSoup (resp .content , 'lxml' )
3232 try :
33- total = int (soup .select ('#sortAndPerPage .page_entries strong' )[- 1 ].text )
33+ total = int (soup .select ('#sortAndPerPage .page_entries strong' )[- 1 ].text . replace ( ',' , '' ) )
3434 except IndexError :
3535 total = 0
3636
@@ -43,9 +43,8 @@ def fetch_records(self, url, start_date, end_date):
4343 break
4444
4545 logger .info ('On document %d of %d (%d%%)' , count , total , (count / total ) * 100 )
46-
4746 for link in links :
48- item_response = self .requests .get (self .config .home_page + link )
47+ item_response = self .requests .get (self .config .base_url + link )
4948 if item_response .status_code // 100 != 2 :
5049 logger .warning ('Got non-200 status %s from %s' , item_response , link )
5150 continue
0 commit comments