1+ # Copyright 2025 Google LLC
2+ #
3+ # Licensed under the Apache License, Version 2.0 (the "License");
4+ # you may not use this file except in compliance with the License.
5+ # You may obtain a copy of the License at
6+ #
7+ # https://www.apache.org/licenses/LICENSE-2.0
8+ #
9+ # Unless required by applicable law or agreed to in writing, software
10+ # distributed under the License is distributed on an "AS IS" BASIS,
11+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+ # See the License for the specific language governing permissions and
13+ # limitations under the License.
14+
115import json
216import pathlib
3- import requests
417import time
518
19+ import pandas
20+ import requests
21+
622import list_urls
723import extract_item_info
824import extract_mp3
@@ -21,10 +37,6 @@ def download_and_extract_item(base_url):
2137 # Stay within 20 requests per minute rate limit.
2238 time .sleep (3 )
2339 response = requests .get (base_url )
24- while response .status_code == 429 :
25- print ("Too many requests, sleeping" )
26- time .sleep (10 )
27- response = requests .get (base_url )
2840
2941 try :
3042 response .raise_for_status () # Raise an exception for HTTP errors (4xx or 5xx)
@@ -35,12 +47,27 @@ def download_and_extract_item(base_url):
3547 item = extract_item_info .extract_subheadings_to_dict (response .text )
3648 mp3_url = extract_mp3 .extract_mp3_url (response .text )
3749 item ["MP3 URL" ] = mp3_url
50+ item ["URL" ] = base_url
3851 return item
3952
# Resume support: collect the URLs already written to jukebox.jsonl so a
# re-run skips items that were downloaded in a previous (possibly
# interrupted) session.
visited_urls = set()
jukebox_path = DATA_DIR / "jukebox.jsonl"

if jukebox_path.exists():
    # Each line of the file is one JSON record (jsonl), hence lines=True.
    jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")
    # Older files may predate the "URL" column; treat them as nothing visited.
    if "URL" in jukebox.columns:
        visited_urls = frozenset(jukebox["URL"].to_list())


# Append mode so previously collected records are preserved across runs.
with open(jukebox_path, "a", encoding="utf-8") as data_file:
    for item_url in item_urls:
        if item_url in visited_urls:
            continue

        item = download_and_extract_item(item_url)
        # download_and_extract_item returns None on HTTP errors; skip those.
        if item is None:
            continue

        # indent=None keeps each record on a single line (jsonl format).
        json.dump(item, data_file, indent=None)
        data_file.write("\n")
        # Flush per record so an interrupted run loses at most the current item.
        data_file.flush()
0 commit comments