Skip to content

Commit 89f0296

Browse files
committed
download mp3s
1 parent 46d4c8d commit 89f0296

File tree

5 files changed

+132
-7
lines changed

5 files changed

+132
-7
lines changed

2025/national-jukebox/download_first_page.py

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,24 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
115
import json
216
import pathlib
3-
import requests
417
import time
518

19+
import pandas
20+
import requests
21+
622
import list_urls
723
import extract_item_info
824
import extract_mp3
@@ -21,10 +37,6 @@ def download_and_extract_item(base_url):
2137
# Stay within 20 requests per minute rate limit.
2238
time.sleep(3)
2339
response = requests.get(base_url)
24-
while response.status_code == 429:
25-
print("Too many requests, sleeping")
26-
time.sleep(10)
27-
response = requests.get(base_url)
2840

2941
try:
3042
response.raise_for_status() # Raise an exception for HTTP errors (4xx or 5xx)
@@ -35,12 +47,27 @@ def download_and_extract_item(base_url):
3547
item = extract_item_info.extract_subheadings_to_dict(response.text)
3648
mp3_url = extract_mp3.extract_mp3_url(response.text)
3749
item["MP3 URL"] = mp3_url
50+
item["URL"] = base_url
3851
return item
3952

4053

41-
with open(DATA_DIR / "jukebox.jsonl", "w") as data_file:
54+
# Resume support: item URLs already recorded in the output file are skipped so
# an interrupted run can be restarted without fetching the same pages again.
jukebox_path = DATA_DIR / "jukebox.jsonl"
visited_urls = {}

if jukebox_path.exists():
    jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")
    if "URL" in jukebox.columns:
        visited_urls = frozenset(jukebox["URL"].to_list())
    else:
        # Output written before the "URL" key was recorded: nothing to skip.
        visited_urls = {}


# Append mode keeps the records written by earlier runs.
with open(DATA_DIR / "jukebox.jsonl", "a") as data_file:
    for item_url in item_urls:
        if item_url not in visited_urls:
            item = download_and_extract_item(item_url)

            # None means the fetch failed; write nothing for this URL.
            if item is not None:
                data_file.write(json.dumps(item, indent=None))
                data_file.write("\n")
                # Flush per record so progress survives an interruption.
                data_file.flush()
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pathlib
16+
import time
17+
18+
import pandas
19+
import requests
20+
21+
22+
DATA_DIR = pathlib.Path(__file__).parent / "data"
23+
24+
25+
26+
def download_mp3(base_url):
    """Download the MP3 at ``base_url`` and return its raw bytes.

    Sleeps for three seconds before issuing the request to stay within the
    rate limit referenced in the comment below.

    Args:
        base_url: URL of the MP3 file to fetch.

    Returns:
        The response body as ``bytes``, or ``None`` when the request fails,
        either because it could not be made or because the server answered
        with an HTTP error status.
    """
    print(f"Fetching content from: {base_url}")
    # https://guides.loc.gov/digital-scholarship/faq
    # Stay within 20 requests per minute rate limit.
    time.sleep(3)

    try:
        # The request itself belongs inside the try block: connection
        # failures, timeouts and malformed URLs are raised by requests.get()
        # as RequestException subclasses, not only by raise_for_status().
        response = requests.get(base_url)
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None

    return response.content
40+
41+
42+
# Download the MP3 for every item listed in jukebox.jsonl, skipping items whose
# MP3 file already exists in DATA_DIR so the script can be re-run to resume.
jukebox_path = DATA_DIR / "jukebox.jsonl"
jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")

for _, row in jukebox.iterrows():
    # The output file is named after the second-to-last "/"-separated segment
    # of the item URL.
    jukebox_id = row["URL"].split("/")[-2]
    mp3_path = (DATA_DIR / jukebox_id).with_suffix(".mp3")
    if mp3_path.exists():
        continue

    mp3_bytes = download_mp3(row["MP3 URL"])
    if mp3_bytes is None:
        # The download failed (download_mp3 returns None in that case). Skip
        # before opening the file: opening it in "wb" mode would leave an
        # empty .mp3 behind, and the exists() check above would then treat
        # the item as already downloaded on every later run.
        continue

    with open(mp3_path, "wb") as mp3_file:
        mp3_file.write(mp3_bytes)
    print(f"Wrote {mp3_path}")

2025/national-jukebox/extract_item_info.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,17 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
115
from bs4 import BeautifulSoup
216
import requests
317
import json
@@ -81,7 +95,9 @@ def download_and_extract(base_url):
8195
print(f"Error fetching URL: {e}")
8296
return None
8397

84-
return extract_subheadings_to_dict(response.text)
98+
item = extract_subheadings_to_dict(response.text)
99+
item["URL"] = base_url
100+
return item
85101

86102
# Provided HTML content
87103
if __name__ == "__main__":

2025/national-jukebox/extract_mp3.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,17 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
115
from bs4 import BeautifulSoup
216
import requests
317
import json

2025/national-jukebox/list_urls.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,17 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
115
import requests
216
from bs4 import BeautifulSoup
317
from urllib.parse import urljoin

0 commit comments

Comments
 (0)