1+ # Copyright 2025 Google LLC
2+ #
3+ # Licensed under the Apache License, Version 2.0 (the "License");
4+ # you may not use this file except in compliance with the License.
5+ # You may obtain a copy of the License at
6+ #
7+ # https://www.apache.org/licenses/LICENSE-2.0
8+ #
9+ # Unless required by applicable law or agreed to in writing, software
10+ # distributed under the License is distributed on an "AS IS" BASIS,
11+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+ # See the License for the specific language governing permissions and
13+ # limitations under the License.
14+
115import json
216import pathlib
3- import requests
417import time
518
19+ import pandas
20+ import requests
21+
622import list_urls
723import extract_item_info
824import extract_mp3
@@ -21,10 +37,6 @@ def download_and_extract_item(base_url):
2137 # Stay within 20 requests per minute rate limit.
2238 time .sleep (3 )
2339 response = requests .get (base_url )
24- while response .status_code == 429 :
25- print ("Too many requests, sleeping" )
26- time .sleep (10 )
27- response = requests .get (base_url )
2840
2941 try :
3042 response .raise_for_status () # Raise an exception for HTTP errors (4xx or 5xx)
@@ -35,12 +47,27 @@ def download_and_extract_item(base_url):
3547 item = extract_item_info .extract_subheadings_to_dict (response .text )
3648 mp3_url = extract_mp3 .extract_mp3_url (response .text )
3749 item ["MP3 URL" ] = mp3_url
50+ item ["URL" ] = base_url
3851 return item
3952
# Resume support: collect the URLs already written to jukebox.jsonl so a
# re-run skips items that were downloaded in a previous (possibly
# interrupted) session.
visited_urls = set()
jukebox_path = DATA_DIR / "jukebox.jsonl"

if jukebox_path.exists():
    # Each line of the file is one JSON record (jsonl), hence lines=True.
    jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")
    # Older files may predate the "URL" column; treat them as nothing visited.
    if "URL" in jukebox.columns:
        visited_urls = frozenset(jukebox["URL"].to_list())


# Append mode so previously collected records are preserved across runs.
with open(jukebox_path, "a", encoding="utf-8") as data_file:
    for item_url in item_urls:
        if item_url in visited_urls:
            continue

        item = download_and_extract_item(item_url)
        # download_and_extract_item returns None on HTTP errors; skip those.
        if item is None:
            continue

        # indent=None keeps each record on a single line (jsonl format).
        json.dump(item, data_file, indent=None)
        data_file.write("\n")
        # Flush per record so an interrupted run loses at most the current item.
        data_file.flush()
0 commit comments