
Commit f3a5590

Merge pull request #5 from tswast/national-jukebox
National jukebox demo for SciPy 2025
2 parents: bf46bc3 + a331464

File tree

7 files changed: +1325 -0 lines changed

2025/national-jukebox/.gitignore

Lines changed: 1 addition & 0 deletions
data/*
Lines changed: 73 additions & 0 deletions
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import pathlib
import time

import pandas
import requests

import list_urls
import extract_item_info
import extract_mp3


DATA_DIR = pathlib.Path(__file__).parent / "data"


target_url = "https://www.loc.gov/collections/national-jukebox/?sb=date_desc&c=100"
item_urls = list_urls.get_national_jukebox_song_detail_urls(target_url)


def download_and_extract_item(base_url):
    print(f"Fetching content from: {base_url}")
    # https://guides.loc.gov/digital-scholarship/faq
    # Stay within the 20 requests per minute rate limit.
    time.sleep(3)
    response = requests.get(base_url)

    try:
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx).
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None

    item = extract_item_info.extract_subheadings_to_dict(response.text)
    mp3_url = extract_mp3.extract_mp3_url(response.text)
    item["MP3 URL"] = mp3_url
    item["URL"] = base_url
    return item


# Resume support: skip any item URLs already recorded in jukebox.jsonl.
visited_urls = set()
jukebox_path = DATA_DIR / "jukebox.jsonl"

if jukebox_path.exists():
    jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")
    if "URL" in jukebox.columns:
        visited_urls = set(jukebox["URL"].to_list())


with open(jukebox_path, "a") as data_file:
    for item_url in item_urls:
        if item_url in visited_urls:
            continue

        item = download_and_extract_item(item_url)
        if item is None:
            continue

        # One JSON object per line (JSON Lines); flush after each record so
        # progress survives an interrupted run.
        json.dump(item, data_file, indent=None)
        data_file.write("\n")
        data_file.flush()
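
Each record lands in data/jukebox.jsonl as one JSON object per line, which is why the resume check above can reload it with pandas.read_json(..., lines=True). A minimal sketch of inspecting the output, assuming it is run from the 2025/national-jukebox directory; aside from "URL" and "MP3 URL", which the scraper sets itself, the columns depend on which "About this item" subheadings each page happened to include:

import pandas

# Each line of the file is one scraped item.
jukebox = pandas.read_json("data/jukebox.jsonl", lines=True, orient="records")

# "URL" and "MP3 URL" are added by the scraper; other columns vary by item.
print(len(jukebox), "items scraped so far")
print(jukebox.columns.tolist())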
Lines changed: 54 additions & 0 deletions
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pathlib
import time

import pandas
import requests


DATA_DIR = pathlib.Path(__file__).parent / "data"


def download_mp3(base_url):
    print(f"Fetching content from: {base_url}")
    # https://guides.loc.gov/digital-scholarship/faq
    # Stay within the 20 requests per minute rate limit.
    time.sleep(3)
    response = requests.get(base_url)

    try:
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx).
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None

    return response.content


jukebox_path = DATA_DIR / "jukebox.jsonl"
jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")

for _, row in jukebox.iterrows():
    # Item URLs end in .../item/jukebox-NNNNNN/, so the second-to-last path
    # segment is the jukebox ID.
    jukebox_id = row["URL"].split("/")[-2]
    mp3_path = (DATA_DIR / jukebox_id).with_suffix(".mp3")
    if mp3_path.exists():
        continue

    mp3_bytes = download_mp3(row["MP3 URL"])
    if mp3_bytes is None:
        # The download failed; skip this item rather than writing an empty file.
        continue

    with open(mp3_path, "wb") as mp3_file:
        mp3_file.write(mp3_bytes)
    print(f"Wrote {mp3_path}")
2025/national-jukebox/extract_item_info.py

Lines changed: 110 additions & 0 deletions
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from bs4 import BeautifulSoup
import requests
import json


def extract_subheadings_to_dict(html_content):
    """
    Extracts subheadings from the "About this item" section of HTML
    and returns them as a dictionary.

    Args:
        html_content (str): The HTML content as a string.

    Returns:
        dict: A dictionary where each subheading is a key, and its
        corresponding value is a list of items under that subheading.
        Returns an empty dictionary if the section is not found.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    about_this_item_section = soup.find('div', id='about-this-item')

    if not about_this_item_section:
        return {}

    subheadings_data = {}

    # Find the div that contains the actual cataloged data.
    item_cataloged_data = about_this_item_section.find('div', class_='item-cataloged-data')

    if item_cataloged_data:
        # Iterate through each subheading (h3) within this div.
        for h3_tag in item_cataloged_data.find_all('h3'):
            subheading_text = h3_tag.get_text(strip=True)
            items = []
            # The items for each subheading are in the immediately following <ul>.
            ul_tag = h3_tag.find_next_sibling('ul')
            if ul_tag:
                for li_tag in ul_tag.find_all('li'):
                    # Get text from list items, handling potential nested structures or links.
                    item_text = li_tag.get_text(strip=True)
                    items.append(item_text)
            subheadings_data[subheading_text] = items

    # Extract the "Part of" section; it sits outside item-cataloged-data but is still a subheading.
    part_of_section = about_this_item_section.find('div', id='part-of')
    if part_of_section:
        h3_tag = part_of_section.find('h3')
        if h3_tag:
            subheading_text = h3_tag.get_text(strip=True)
            items = []
            ul_tag = h3_tag.find_next_sibling('ul')
            if ul_tag:
                for li_tag in ul_tag.find_all('li'):
                    item_text = li_tag.get_text(strip=True)
                    # Remove the count in parentheses if present, e.g., "(10,009)".
                    if '(' in item_text and item_text.endswith(')'):
                        item_text = item_text.rsplit('(', 1)[0].strip()
                    items.append(item_text)
            subheadings_data[subheading_text] = items

    # Extract the IIIF Presentation Manifest.
    iiif_manifest_section = about_this_item_section.find('h3', id='item-iiif-presentation-manifest')
    if iiif_manifest_section:
        subheading_text = iiif_manifest_section.get_text(strip=True)
        items = []
        ul_tag = iiif_manifest_section.find_next_sibling('ul')
        if ul_tag:
            for li_tag in ul_tag.find_all('li'):
                item_text = li_tag.get_text(strip=True)
                items.append(item_text)
        subheadings_data[subheading_text] = items

    return subheadings_data


def download_and_extract(base_url):
    print(f"Fetching content from: {base_url}")
    try:
        response = requests.get(base_url)
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx).
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None

    item = extract_subheadings_to_dict(response.text)
    item["URL"] = base_url
    return item


if __name__ == "__main__":
    target_url = "https://www.loc.gov/item/jukebox-679643/"
    item = download_and_extract(target_url)
    if item:
        print("\nExtracted item metadata:")
        print(json.dumps(item, indent=4))
    else:
        print("No item metadata found or an error occurred.")
2025/national-jukebox/extract_mp3.py

Lines changed: 60 additions & 0 deletions
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from bs4 import BeautifulSoup
import requests


def extract_mp3_url(html_content):
    """
    Extracts the MP3 download URL from the given HTML content.

    Args:
        html_content (str): The HTML content of the webpage.

    Returns:
        str or None: The MP3 download URL if found, otherwise None.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the select element that contains download options.
    # Based on the HTML, it has an ID of 'select-resource0'.
    download_select = soup.find('select', id='select-resource0')

    if download_select:
        # Find the option tag specifically for the AUDIO (MP3) download.
        # It has a data-file-download attribute set to "AUDIO".
        mp3_option = download_select.find('option', attrs={'data-file-download': 'AUDIO'})
        if mp3_option:
            return mp3_option['value']  # The value attribute is the URL.
    return None  # The select or option was not found.


# Example usage: fetch the HTML with requests and extract the MP3 URL.
if __name__ == "__main__":
    url = "https://www.loc.gov/item/jukebox-679643/"
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for HTTP errors.
        html_doc = response.text

        mp3_url = extract_mp3_url(html_doc)

        if mp3_url:
            print(f"Extracted MP3 URL: {mp3_url}")
        else:
            print("MP3 URL not found in the HTML.")

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
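
Similarly, a made-up fragment showing the select markup that extract_mp3_url looks for; the URL here is a placeholder, and the import assumes the module is saved as extract_mp3.py:

from extract_mp3 import extract_mp3_url

# Hypothetical download menu mirroring the item page's select-resource0 element.
snippet = """
<select id="select-resource0">
  <option value="https://example.com/media/song.mp3" data-file-download="AUDIO">AUDIO</option>
</select>
"""

print(extract_mp3_url(snippet))  # https://example.com/media/song.mp3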

2025/national-jukebox/list_urls.py

Lines changed: 77 additions & 0 deletions
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin


def get_national_jukebox_song_detail_urls(base_url: str) -> list[str]:
    """
    Scrapes the National Jukebox collection page to extract URLs for
    individual song detail pages.

    Args:
        base_url: The URL of the main collection page (e.g.,
            "https://www.loc.gov/collections/national-jukebox/?sb=date_desc").

    Returns:
        A list of URLs for the song detail pages.
    """
    print(f"Fetching content from: {base_url}")
    try:
        response = requests.get(base_url)
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx).
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    detail_urls = []

    # Library of Congress detail pages follow the pattern /item/{id}/, and
    # National Jukebox items use /item/jukebox-{id}/ specifically. Rather than
    # depending on the markup of the search-results container, scan every
    # anchor tag and keep the ones whose href matches that pattern.
    for link in soup.find_all('a', href=True):
        href = link['href']
        if '/item/jukebox' in href and not href.startswith('#'):
            # Construct an absolute URL; hrefs on the page may be relative.
            full_url = urljoin(base_url, href)
            # Avoid duplicates when the same item link appears multiple times.
            if full_url not in detail_urls:
                detail_urls.append(full_url)

    return detail_urls


if __name__ == "__main__":
    target_url = "https://www.loc.gov/collections/national-jukebox/?sb=date_desc&c=100"
    song_urls = get_national_jukebox_song_detail_urls(target_url)

    if song_urls:
        print("\nFound song detail page URLs:")
        for url in song_urls:
            print(url)
        print(f"\nTotal URLs found: {len(song_urls)}")
    else:
        print("No song detail URLs found or an error occurred.")
