Commit 46d4c8d

demo to transcribe the first 100 songs in the national jukebox
1 parent bf46bc3 commit 46d4c8d

File tree

5 files changed: +250 -0 lines changed

2025/national-jukebox/.gitignore

Lines changed: 1 addition & 0 deletions

data/*
2025/national-jukebox/ (main script; filename not shown in this view)

Lines changed: 46 additions & 0 deletions
import json
import pathlib
import requests
import time

import list_urls
import extract_item_info
import extract_mp3


DATA_DIR = pathlib.Path(__file__).parent / "data"


target_url = "https://www.loc.gov/collections/national-jukebox/?sb=date_desc&c=100"
item_urls = list_urls.get_national_jukebox_song_detail_urls(target_url)


def download_and_extract_item(base_url):
    print(f"Fetching content from: {base_url}")
    # https://guides.loc.gov/digital-scholarship/faq
    # Stay within the 20 requests per minute rate limit.
    time.sleep(3)
    response = requests.get(base_url)
    while response.status_code == 429:
        print("Too many requests, sleeping")
        time.sleep(10)
        response = requests.get(base_url)

    try:
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx).
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None

    item = extract_item_info.extract_subheadings_to_dict(response.text)
    mp3_url = extract_mp3.extract_mp3_url(response.text)
    item["MP3 URL"] = mp3_url
    return item


DATA_DIR.mkdir(exist_ok=True)  # Make sure the (gitignored) data directory exists.
with open(DATA_DIR / "jukebox.jsonl", "w") as data_file:
    for item_url in item_urls:
        item = download_and_extract_item(item_url)
        if item is None:
            continue  # Skip pages that failed to download or parse.
        json.dump(item, data_file, indent=None)
        data_file.write("\n")
        data_file.flush()
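Each line of data/jukebox.jsonl holds one item as a JSON object, so a follow-up step can stream the file back and fetch the audio. A minimal sketch of that next step (not part of this commit), assuming the "MP3 URL" key written above and the same rate limit:

import json
import pathlib
import time

import requests

DATA_DIR = pathlib.Path(__file__).parent / "data"

with open(DATA_DIR / "jukebox.jsonl") as data_file:
    for line in data_file:
        item = json.loads(line)
        mp3_url = item.get("MP3 URL")
        if not mp3_url:
            continue
        # Name the local file after the last path component of the URL.
        out_path = DATA_DIR / mp3_url.rstrip("/").rsplit("/", 1)[-1]
        if out_path.exists():
            continue  # Already downloaded; makes reruns cheap.
        time.sleep(3)  # Stay within the LOC 20-requests-per-minute limit.
        response = requests.get(mp3_url)
        response.raise_for_status()
        out_path.write_bytes(response.content)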
2025/national-jukebox/extract_item_info.py

Lines changed: 94 additions & 0 deletions
from bs4 import BeautifulSoup
import requests
import json

def extract_subheadings_to_dict(html_content):
    """
    Extracts subheadings from the "About this item" section of HTML.

    Args:
        html_content (str): The HTML content as a string.

    Returns:
        dict: Each subheading is a key, and its corresponding value is a
        list of items under that subheading. Returns an empty dict if the
        section is not found.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    about_this_item_section = soup.find('div', id='about-this-item')

    if not about_this_item_section:
        return {}

    subheadings_data = {}

    # Find the div that contains the actual cataloged data.
    item_cataloged_data = about_this_item_section.find('div', class_='item-cataloged-data')

    if item_cataloged_data:
        # Iterate through each subheading (h3) within this div.
        for h3_tag in item_cataloged_data.find_all('h3'):
            subheading_text = h3_tag.get_text(strip=True)
            items = []
            # The items for each subheading are in the immediately following <ul>.
            ul_tag = h3_tag.find_next_sibling('ul')
            if ul_tag:
                for li_tag in ul_tag.find_all('li'):
                    # Get text from list items, handling potential nested structures or links.
                    item_text = li_tag.get_text(strip=True)
                    items.append(item_text)
            subheadings_data[subheading_text] = items

    # Extract the "Part of" section; it sits outside item-cataloged-data but is still a subheading.
    part_of_section = about_this_item_section.find('div', id='part-of')
    if part_of_section:
        h3_tag = part_of_section.find('h3')
        if h3_tag:
            subheading_text = h3_tag.get_text(strip=True)
            items = []
            ul_tag = h3_tag.find_next_sibling('ul')
            if ul_tag:
                for li_tag in ul_tag.find_all('li'):
                    item_text = li_tag.get_text(strip=True)
                    # Remove the count in parentheses if present, e.g., "(10,009)".
                    if '(' in item_text and item_text.endswith(')'):
                        item_text = item_text.rsplit('(', 1)[0].strip()
                    items.append(item_text)
            subheadings_data[subheading_text] = items

    # Extract the IIIF Presentation Manifest.
    iiif_manifest_section = about_this_item_section.find('h3', id='item-iiif-presentation-manifest')
    if iiif_manifest_section:
        subheading_text = iiif_manifest_section.get_text(strip=True)
        items = []
        ul_tag = iiif_manifest_section.find_next_sibling('ul')
        if ul_tag:
            for li_tag in ul_tag.find_all('li'):
                item_text = li_tag.get_text(strip=True)
                items.append(item_text)
        subheadings_data[subheading_text] = items

    return subheadings_data


def download_and_extract(base_url):
    print(f"Fetching content from: {base_url}")
    try:
        response = requests.get(base_url)
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx).
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None

    return extract_subheadings_to_dict(response.text)


# Example usage.
if __name__ == "__main__":
    target_url = "https://www.loc.gov/item/jukebox-679643/"
    item = download_and_extract(target_url)
    if item:
        print("\nExtracted item info:")
        print(json.dumps(item, indent=4))
    else:
        print("No item info found or an error occurred.")
2025/national-jukebox/extract_mp3.py

Lines changed: 46 additions & 0 deletions
from bs4 import BeautifulSoup
import requests

def extract_mp3_url(html_content):
    """
    Extracts the MP3 download URL from the given HTML content.

    Args:
        html_content (str): The HTML content of the webpage.

    Returns:
        str or None: The MP3 download URL if found, otherwise None.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the select element that contains download options.
    # Based on the HTML, it has an ID of 'select-resource0'.
    download_select = soup.find('select', id='select-resource0')

    if download_select:
        # Find the option tag specifically for the AUDIO (MP3) download;
        # it has a data-file-download attribute set to "AUDIO".
        mp3_option = download_select.find('option', attrs={'data-file-download': 'AUDIO'})
        if mp3_option:
            return mp3_option['value']  # The value attribute holds the URL.
    return None  # The select or option was not found.


# Example usage (fetches the HTML with requests).
if __name__ == "__main__":
    url = "https://www.loc.gov/item/jukebox-679643/"
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for HTTP errors.
        html_doc = response.text

        mp3_url = extract_mp3_url(html_doc)

        if mp3_url:
            print(f"Extracted MP3 URL: {mp3_url}")
        else:
            print("MP3 URL not found in the HTML.")

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")

2025/national-jukebox/list_urls.py

Lines changed: 63 additions & 0 deletions
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_national_jukebox_song_detail_urls(base_url: str) -> list[str]:
    """
    Scrapes the National Jukebox collection page to extract URLs for individual song detail pages.

    Args:
        base_url: The URL of the main collection page
            (e.g., "https://www.loc.gov/collections/national-jukebox/?sb=date_desc").

    Returns:
        A list of URLs for the song detail pages.
    """
    print(f"Fetching content from: {base_url}")
    try:
        response = requests.get(base_url)
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx).
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    detail_urls = []

    # Library of Congress detail pages follow the pattern /item/{id}/, and
    # National Jukebox items specifically use /item/jukebox-{id}/. Rather than
    # depending on the markup of the search-results container, scan every link
    # on the page and keep the ones matching that pattern.
    for link in soup.find_all('a', href=True):
        href = link['href']
        if '/item/jukebox' in href and not href.startswith('#'):
            # Construct an absolute URL from a possibly relative href.
            full_url = urljoin(base_url, href)
            # Avoid duplicates: the same item link can appear multiple times per result.
            if full_url not in detail_urls:
                detail_urls.append(full_url)

    return detail_urls


if __name__ == "__main__":
    target_url = "https://www.loc.gov/collections/national-jukebox/?sb=date_desc&c=100"
    song_urls = get_national_jukebox_song_detail_urls(target_url)

    if song_urls:
        print("\nFound song detail page URLs:")
        for url in song_urls:
            print(url)
        print(f"\nTotal URLs found: {len(song_urls)}")
    else:
        print("No song detail URLs found or an error occurred.")
