 import list_urls
 import extract_item_info
 import extract_mp3
+import download_mp3s


 DATA_DIR = pathlib.Path(__file__).parent / "data"


 # target_url = "https://www.loc.gov/collections/national-jukebox/?sb=date_desc&c=100"
-target_url = "https://www.loc.gov/collections/national-jukebox/?c=100&sb=date_desc&sp=2"
-item_urls = list_urls.get_national_jukebox_song_detail_urls(target_url)
+target_url_template = "https://www.loc.gov/collections/national-jukebox/?c=100&sb=date_desc&sp={}"
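+# The {} placeholder is filled with the results-page number (the "sp" query parameter) by download_page() below.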


 def download_and_extract_item(base_url):
     print(f"Fetching content from: {base_url}")
     # https://guides.loc.gov/digital-scholarship/faq
     # Stay within 20 requests per minute rate limit.
     time.sleep(3)
-    response = requests.get(base_url)

     try:
+        response = requests.get(base_url, timeout=10)
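+        # A timeout keeps one hung request from stalling the whole crawl.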
         response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx)
     except requests.exceptions.RequestException as e:
         print(f"Error fetching URL: {e}")
@@ -52,23 +52,44 @@ def download_and_extract_item(base_url):
     return item


-visited_urls = {}
-jukebox_path = DATA_DIR / "jukebox.jsonl"

-if jukebox_path.exists():
-    jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")
-    visited_urls = frozenset(jukebox["URL"].to_list()) if "URL" in jukebox.columns else {}
+def download_page(page_number):
+    target_url = target_url_template.format(page_number)
+    item_urls = list_urls.get_national_jukebox_song_detail_urls(target_url)

+    visited_urls = set()
+    jukebox_path = DATA_DIR / "jukebox.jsonl"

-with open(DATA_DIR / "jukebox.jsonl", "a") as data_file:
-    for item_url in item_urls:
-        if item_url in visited_urls:
-            continue
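+    # URLs already recorded in jukebox.jsonl count as visited, so reruns only fetch new items.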
+    if jukebox_path.exists():
+        jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")
+        visited_urls = frozenset(jukebox["URL"].to_list()) if "URL" in jukebox.columns else {}

-        item = download_and_extract_item(item_url)
-        if item is None:
-            continue
+    with open(DATA_DIR / "jukebox.jsonl", "a") as data_file:
+        while item_urls:
+            item_url = item_urls.pop(0)
+            if item_url in visited_urls:
+                continue
+
+            item = download_and_extract_item(item_url)
+            if item is None:
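+                # Extraction failed; push the URL to the back of the queue and retry it later.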
+                item_urls.append(item_url)
+                continue
+
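+            # Write each item as one JSON line and flush immediately so finished work survives an interruption.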
+            json.dump(item, data_file, indent=None)
+            data_file.write("\n")
+            data_file.flush()
+
+
+if __name__ == "__main__":
+    page_number = 4
+    while True:
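+        # Walk the result pages in order; a 404 from the collection signals there are no more pages.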
+        print(f"Page {page_number}")
+        try:
+            download_page(page_number)
+            download_mp3s.download_all()
+        except requests.exceptions.HTTPError as exc:
+            if exc.response.status_code == 404:
+                print("Reached last page?")
+                break
+        page_number += 1

-        json.dump(item, data_file, indent=None)
-        data_file.write("\n")
-        data_file.flush()