
Commit e095dc1

Merge remote-tracking branch 'origin/main' into airflow-demo
2 parents: 83a834c + f81a082

File tree: 7 files changed, +2956 −0 lines

2025/national-jukebox/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
data/*
Lines changed: 95 additions & 0 deletions (file path not shown in this extract; this is the pipeline's main download script)
@@ -0,0 +1,95 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import pathlib
import time

import pandas
import requests

import list_urls
import extract_item_info
import extract_mp3
import download_mp3s


DATA_DIR = pathlib.Path(__file__).parent / "data"


# target_url = "https://www.loc.gov/collections/national-jukebox/?sb=date_desc&c=100"
target_url_template = "https://www.loc.gov/collections/national-jukebox/?c=100&sb=date_desc&sp={}"


def download_and_extract_item(base_url):
    print(f"Fetching content from: {base_url}")
    # https://guides.loc.gov/digital-scholarship/faq
    # Stay within the 20 requests per minute rate limit.
    time.sleep(3)

    try:
        response = requests.get(base_url, timeout=10)
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx).
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None

    item = extract_item_info.extract_subheadings_to_dict(response.text)
    mp3_url = extract_mp3.extract_mp3_url(response.text)
    item["MP3 URL"] = mp3_url
    item["URL"] = base_url
    return item


def download_page(page_number):
    target_url = target_url_template.format(page_number)
    item_urls = list_urls.get_national_jukebox_song_detail_urls(target_url)

    visited_urls = frozenset()
    jukebox_path = DATA_DIR / "jukebox.jsonl"

    if jukebox_path.exists():
        jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")
        if "URL" in jukebox.columns:
            visited_urls = frozenset(jukebox["URL"].to_list())

    with open(jukebox_path, "a") as data_file:
        while item_urls:
            item_url = item_urls.pop(0)
            if item_url in visited_urls:
                continue

            item = download_and_extract_item(item_url)
            if item is None:
                # Re-queue the URL so a transient fetch failure is retried later.
                item_urls.append(item_url)
                continue

            json.dump(item, data_file, indent=None)
            data_file.write("\n")
            data_file.flush()


if __name__ == "__main__":
    page_number = 4
    while True:
        print(f"Page {page_number}")
        try:
            download_page(page_number)
            download_mp3s.download_all()
        except requests.exceptions.HTTPError as exc:
            if exc.response.status_code == 404:
                print("Reached last page?")
                break
            raise  # Surface unexpected HTTP errors instead of silently skipping the page.
        page_number += 1
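
The list_urls module imported above is part of this commit, but its diff is not visible in this extract. A minimal sketch of what get_national_jukebox_song_detail_urls could look like, assuming it collects /item/jukebox-*/ links from a collection results page; the function name comes from the call above, but the body here is a guess, not the committed code:

import requests
from bs4 import BeautifulSoup


def get_national_jukebox_song_detail_urls(collection_url):
    """Return the song detail-page URLs linked from one results page (sketch)."""
    response = requests.get(collection_url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    urls = []
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"]
        # Detail pages look like https://www.loc.gov/item/jukebox-679643/
        if "/item/jukebox-" in href and href not in urls:
            urls.append(href)
    return urls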
Lines changed: 63 additions & 0 deletions (file path not shown; presumably download_mp3s.py, inferred from the main script's "import download_mp3s")
@@ -0,0 +1,63 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pathlib
import time

import pandas
import requests


DATA_DIR = pathlib.Path(__file__).parent / "data"


def download_mp3(base_url):
    print(f"Fetching content from: {base_url}")
    # https://guides.loc.gov/digital-scholarship/faq
    # Stay within the 20 requests per minute rate limit.
    time.sleep(3)

    try:
        response = requests.get(base_url, timeout=10)
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx).
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None

    return response.content


def download_all():
    jukebox_path = DATA_DIR / "jukebox.jsonl"
    jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")

    # for _, row in jukebox.iterrows():
    for _, row in jukebox.iloc[100:].iterrows():
        jukebox_id = row["URL"].split("/")[-2]
        mp3_path = (DATA_DIR / jukebox_id).with_suffix(".mp3")
        if mp3_path.exists():
            continue

        mp3_url = row["MP3 URL"]
        if not isinstance(mp3_url, str):
            # Skip records where no MP3 URL could be extracted.
            continue

        mp3_bytes = download_mp3(mp3_url)
        if mp3_bytes is None:
            continue

        with open(mp3_path, "wb") as mp3_file:
            mp3_file.write(mp3_bytes)
        print(f"Wrote {mp3_path}")


if __name__ == "__main__":
    download_all()
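
download_all keys everything off the URL column of jukebox.jsonl. A quick illustration of the ID derivation (the URL is one that appears elsewhere in this commit):

# The trailing slash on loc.gov item URLs means split("/") ends with an
# empty string, so index -2 is the item ID rather than "".
url = "https://www.loc.gov/item/jukebox-679643/"
jukebox_id = url.split("/")[-2]
print(jukebox_id)  # jukebox-679643
# The MP3 is then written alongside the metadata, e.g. data/jukebox-679643.mp3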
Lines changed: 110 additions & 0 deletions (file path not shown; presumably extract_item_info.py, inferred from the main script's "import extract_item_info")
@@ -0,0 +1,110 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json

from bs4 import BeautifulSoup
import requests


def extract_subheadings_to_dict(html_content):
    """
    Extracts subheadings from the "About this item" section of HTML
    and returns them as a dictionary.

    Args:
        html_content (str): The HTML content as a string.

    Returns:
        dict: A dictionary where each subheading is a key, and its
        corresponding value is a list of items under that subheading.
        Returns an empty dict if the section is not found.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    about_this_item_section = soup.find('div', id='about-this-item')

    if not about_this_item_section:
        return {}

    subheadings_data = {}

    # Find the div that contains the actual cataloged data.
    item_cataloged_data = about_this_item_section.find('div', class_='item-cataloged-data')

    if item_cataloged_data:
        # Iterate through each subheading (h3) within this div.
        for h3_tag in item_cataloged_data.find_all('h3'):
            subheading_text = h3_tag.get_text(strip=True)
            items = []
            # The items for each subheading are in the immediately following <ul>.
            ul_tag = h3_tag.find_next_sibling('ul')
            if ul_tag:
                for li_tag in ul_tag.find_all('li'):
                    # Get text from list items, handling potential nested structures or links.
                    item_text = li_tag.get_text(strip=True)
                    items.append(item_text)
            subheadings_data[subheading_text] = items

    # Extract the "Part of" section; it sits outside item-cataloged-data but is still a subheading.
    part_of_section = about_this_item_section.find('div', id='part-of')
    if part_of_section:
        h3_tag = part_of_section.find('h3')
        if h3_tag:
            subheading_text = h3_tag.get_text(strip=True)
            items = []
            ul_tag = h3_tag.find_next_sibling('ul')
            if ul_tag:
                for li_tag in ul_tag.find_all('li'):
                    item_text = li_tag.get_text(strip=True)
                    # Remove the count in parentheses if present, e.g., "(10,009)".
                    if '(' in item_text and item_text.endswith(')'):
                        item_text = item_text.rsplit('(', 1)[0].strip()
                    items.append(item_text)
            subheadings_data[subheading_text] = items

    # Extract the IIIF Presentation Manifest.
    iiif_manifest_section = about_this_item_section.find('h3', id='item-iiif-presentation-manifest')
    if iiif_manifest_section:
        subheading_text = iiif_manifest_section.get_text(strip=True)
        items = []
        ul_tag = iiif_manifest_section.find_next_sibling('ul')
        if ul_tag:
            for li_tag in ul_tag.find_all('li'):
                item_text = li_tag.get_text(strip=True)
                items.append(item_text)
        subheadings_data[subheading_text] = items

    return subheadings_data


def download_and_extract(base_url):
    print(f"Fetching content from: {base_url}")
    try:
        response = requests.get(base_url, timeout=10)
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx).
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None

    item = extract_subheadings_to_dict(response.text)
    item["URL"] = base_url
    return item


if __name__ == "__main__":
    target_url = "https://www.loc.gov/item/jukebox-679643/"
    item = download_and_extract(target_url)
    if item:
        print("\nExtracted item info:")
        print(json.dumps(item, indent=4))
    else:
        print("No item info extracted or an error occurred.")
Lines changed: 60 additions & 0 deletions (file path not shown; presumably extract_mp3.py, inferred from the main script's "import extract_mp3")
@@ -0,0 +1,60 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from bs4 import BeautifulSoup
import requests


def extract_mp3_url(html_content):
    """
    Extracts the MP3 download URL from the given HTML content.

    Args:
        html_content (str): The HTML content of the webpage.

    Returns:
        str or None: The MP3 download URL if found, otherwise None.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the select element that contains download options.
    # Based on the HTML, it has an ID of 'select-resource0'.
    download_select = soup.find('select', id='select-resource0')

    if download_select:
        # Find the option tag specifically for the AUDIO (MP3) download;
        # it has a data-file-download attribute set to "AUDIO".
        mp3_option = download_select.find('option', attrs={'data-file-download': 'AUDIO'})
        if mp3_option:
            return mp3_option['value']  # The value attribute holds the URL.
    return None  # The select or option was not found.


# Example usage: fetch a detail page and extract its MP3 URL.
if __name__ == "__main__":
    url = "https://www.loc.gov/item/jukebox-679643/"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an exception for HTTP errors.
        html_doc = response.text

        mp3_url = extract_mp3_url(html_doc)

        if mp3_url:
            print(f"Extracted MP3 URL: {mp3_url}")
        else:
            print("MP3 URL not found in the HTML.")

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
