2727
2828DATA_DIR = pathlib .Path (__file__ ).parent / "data"
2929
30+ MAX_PAGES = 169
31+
32+ # https://guides.loc.gov/digital-scholarship/faq
33+ # Stay within the rate limit of 20 requests per minute.
34+ SLEEP_SECONDS = 60.0 / 20.0
3035
3136# target_url = "https://www.loc.gov/collections/national-jukebox/?sb=date_desc&c=100"
32- target_url_template = "https://www.loc.gov/collections/national-jukebox/?c=100&sb=date_desc&sp={}"
37+ # target_url_template = "https://www.loc.gov/collections/national-jukebox/?c=100&sb=date_desc&sp={}"
38+ target_url_template = "https://www.loc.gov/collections/national-jukebox/?c=100&sb=date&sp={}"
39+ # target_url_template = "https://www.loc.gov/collections/national-jukebox/?c=100&fa=original-format:sound+recording&sb=date&sp={}"
3340
3441
3542def download_and_extract_item (base_url ):
3643 print (f"Fetching content from: { base_url } " )
37- # https://guides.loc.gov/digital-scholarship/faq
38- # Stay within 20 requests per minute rate limit.
39- time .sleep (3 )
44+ time .sleep (SLEEP_SECONDS )
4045
4146 try :
4247 response = requests .get (base_url , timeout = 10 )
@@ -55,7 +60,7 @@ def download_and_extract_item(base_url):
5560
5661def download_page (page_number ):
5762 target_url = target_url_template .format (page_number )
58- item_urls = list_urls .get_national_jukebox_song_detail_urls (target_url )
63+ item_urls = list_urls .get_national_jukebox_song_detail_urls (target_url , sleep_seconds = SLEEP_SECONDS )
5964
6065 visited_urls = set ()
6166 jukebox_path = DATA_DIR / "jukebox.jsonl"
@@ -81,15 +86,19 @@ def download_page(page_number):
8186
8287
8388if __name__ == "__main__" :
84- page_number = 30 # 4
89+ page_number = 1
8590 while True :
8691 print (f"Page { page_number } " )
8792 try :
8893 download_page (page_number )
89- download_mp3s .download_all ()
94+ # The audio server is currently down, so the MP3 download step is disabled for now.
95+ # download_mp3s.download_all(sleep_seconds=SLEEP_SECONDS)
9096 except requests .exceptions .HTTPError as exc :
9197 if exc .response .status_code == 404 :
9298 print ("Reached last page?" )
9399 break
94100 page_number += 1
95101
102+ if page_number > MAX_PAGES :
103+ break
104+
0 commit comments