
Commit f3a5590

Merge pull request #5 from tswast/national-jukebox
National jukebox demo for SciPy 2025
2 parents: bf46bc3 + a331464

File tree

7 files changed: +1325 -0 lines changed

2025/national-jukebox/.gitignore

Lines changed: 1 addition & 0 deletions
data/*
Lines changed: 73 additions & 0 deletions
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import pathlib
import time

import pandas
import requests

import list_urls
import extract_item_info
import extract_mp3


DATA_DIR = pathlib.Path(__file__).parent / "data"


target_url = "https://www.loc.gov/collections/national-jukebox/?sb=date_desc&c=100"
item_urls = list_urls.get_national_jukebox_song_detail_urls(target_url)


def download_and_extract_item(base_url):
    print(f"Fetching content from: {base_url}")
    # https://guides.loc.gov/digital-scholarship/faq
    # Stay within the 20 requests per minute rate limit.
    time.sleep(3)
    response = requests.get(base_url)

    try:
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx).
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None

    item = extract_item_info.extract_subheadings_to_dict(response.text)
    mp3_url = extract_mp3.extract_mp3_url(response.text)
    item["MP3 URL"] = mp3_url
    item["URL"] = base_url
    return item


# Resume support: skip any item URLs already recorded in jukebox.jsonl.
visited_urls = set()
jukebox_path = DATA_DIR / "jukebox.jsonl"

if jukebox_path.exists():
    jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")
    if "URL" in jukebox.columns:
        visited_urls = set(jukebox["URL"].to_list())


with open(jukebox_path, "a") as data_file:
    for item_url in item_urls:
        if item_url in visited_urls:
            continue

        item = download_and_extract_item(item_url)
        if item is None:
            continue

        # One JSON object per line (JSON Lines); flush after each record so
        # progress survives an interrupted run.
        json.dump(item, data_file, indent=None)
        data_file.write("\n")
        data_file.flush()
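
Each record lands in data/jukebox.jsonl as one JSON object per line, which is why the resume check above can reload it with pandas.read_json(..., lines=True). A minimal sketch of inspecting the output, assuming it is run from the 2025/national-jukebox directory; aside from "URL" and "MP3 URL", which the scraper sets itself, the columns depend on which "About this item" subheadings each page happened to include:

import pandas

# Each line of the file is one scraped item.
jukebox = pandas.read_json("data/jukebox.jsonl", lines=True, orient="records")

# "URL" and "MP3 URL" are added by the scraper; other columns vary by item.
print(len(jukebox), "items scraped so far")
print(jukebox.columns.tolist())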
Lines changed: 54 additions & 0 deletions
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pathlib
import time

import pandas
import requests


DATA_DIR = pathlib.Path(__file__).parent / "data"


def download_mp3(base_url):
    print(f"Fetching content from: {base_url}")
    # https://guides.loc.gov/digital-scholarship/faq
    # Stay within the 20 requests per minute rate limit.
    time.sleep(3)
    response = requests.get(base_url)

    try:
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx).
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None

    return response.content


jukebox_path = DATA_DIR / "jukebox.jsonl"
jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")

for _, row in jukebox.iterrows():
    # Item URLs end in .../item/jukebox-NNNNNN/, so the second-to-last path
    # segment is the jukebox ID.
    jukebox_id = row["URL"].split("/")[-2]
    mp3_path = (DATA_DIR / jukebox_id).with_suffix(".mp3")
    if mp3_path.exists():
        continue

    mp3_bytes = download_mp3(row["MP3 URL"])
    if mp3_bytes is None:
        # The download failed; skip this item rather than writing an empty file.
        continue

    with open(mp3_path, "wb") as mp3_file:
        mp3_file.write(mp3_bytes)
    print(f"Wrote {mp3_path}")
2025/national-jukebox/extract_item_info.py

Lines changed: 110 additions & 0 deletions
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from bs4 import BeautifulSoup
import requests
import json


def extract_subheadings_to_dict(html_content):
    """
    Extracts subheadings from the "About this item" section of HTML
    and returns them as a dictionary.

    Args:
        html_content (str): The HTML content as a string.

    Returns:
        dict: A dictionary where each subheading is a key, and its
        corresponding value is a list of items under that subheading.
        Returns an empty dictionary if the section is not found.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    about_this_item_section = soup.find('div', id='about-this-item')

    if not about_this_item_section:
        return {}

    subheadings_data = {}

    # Find the div that contains the actual cataloged data.
    item_cataloged_data = about_this_item_section.find('div', class_='item-cataloged-data')

    if item_cataloged_data:
        # Iterate through each subheading (h3) within this div.
        for h3_tag in item_cataloged_data.find_all('h3'):
            subheading_text = h3_tag.get_text(strip=True)
            items = []
            # The items for each subheading are in the immediately following <ul>.
            ul_tag = h3_tag.find_next_sibling('ul')
            if ul_tag:
                for li_tag in ul_tag.find_all('li'):
                    # Get text from list items, handling potential nested structures or links.
                    item_text = li_tag.get_text(strip=True)
                    items.append(item_text)
            subheadings_data[subheading_text] = items

    # Extract the "Part of" section; it sits outside item-cataloged-data but is still a subheading.
    part_of_section = about_this_item_section.find('div', id='part-of')
    if part_of_section:
        h3_tag = part_of_section.find('h3')
        if h3_tag:
            subheading_text = h3_tag.get_text(strip=True)
            items = []
            ul_tag = h3_tag.find_next_sibling('ul')
            if ul_tag:
                for li_tag in ul_tag.find_all('li'):
                    item_text = li_tag.get_text(strip=True)
                    # Remove the count in parentheses if present, e.g., "(10,009)".
                    if '(' in item_text and item_text.endswith(')'):
                        item_text = item_text.rsplit('(', 1)[0].strip()
                    items.append(item_text)
            subheadings_data[subheading_text] = items

    # Extract the IIIF Presentation Manifest.
    iiif_manifest_section = about_this_item_section.find('h3', id='item-iiif-presentation-manifest')
    if iiif_manifest_section:
        subheading_text = iiif_manifest_section.get_text(strip=True)
        items = []
        ul_tag = iiif_manifest_section.find_next_sibling('ul')
        if ul_tag:
            for li_tag in ul_tag.find_all('li'):
                item_text = li_tag.get_text(strip=True)
                items.append(item_text)
        subheadings_data[subheading_text] = items

    return subheadings_data


def download_and_extract(base_url):
    print(f"Fetching content from: {base_url}")
    try:
        response = requests.get(base_url)
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx).
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None

    item = extract_subheadings_to_dict(response.text)
    item["URL"] = base_url
    return item


if __name__ == "__main__":
    target_url = "https://www.loc.gov/item/jukebox-679643/"
    item = download_and_extract(target_url)
    if item:
        print("\nExtracted item metadata:")
        print(json.dumps(item, indent=4))
    else:
        print("No item metadata found or an error occurred.")
2025/national-jukebox/extract_mp3.py

Lines changed: 60 additions & 0 deletions
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from bs4 import BeautifulSoup
import requests


def extract_mp3_url(html_content):
    """
    Extracts the MP3 download URL from the given HTML content.

    Args:
        html_content (str): The HTML content of the webpage.

    Returns:
        str or None: The MP3 download URL if found, otherwise None.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the select element that contains download options.
    # Based on the HTML, it has an ID of 'select-resource0'.
    download_select = soup.find('select', id='select-resource0')

    if download_select:
        # Find the option tag specifically for the AUDIO (MP3) download.
        # It has a data-file-download attribute set to "AUDIO".
        mp3_option = download_select.find('option', attrs={'data-file-download': 'AUDIO'})
        if mp3_option:
            return mp3_option['value']  # The value attribute is the URL.
    return None  # The select or option was not found.


# Example usage: fetch the HTML with requests and extract the MP3 URL.
if __name__ == "__main__":
    url = "https://www.loc.gov/item/jukebox-679643/"
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for HTTP errors.
        html_doc = response.text

        mp3_url = extract_mp3_url(html_doc)

        if mp3_url:
            print(f"Extracted MP3 URL: {mp3_url}")
        else:
            print("MP3 URL not found in the HTML.")

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
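
Similarly, a made-up fragment showing the select markup that extract_mp3_url looks for; the URL here is a placeholder, and the import assumes the module is saved as extract_mp3.py:

from extract_mp3 import extract_mp3_url

# Hypothetical download menu mirroring the item page's select-resource0 element.
snippet = """
<select id="select-resource0">
  <option value="https://example.com/media/song.mp3" data-file-download="AUDIO">AUDIO</option>
</select>
"""

print(extract_mp3_url(snippet))  # https://example.com/media/song.mp3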

2025/national-jukebox/list_urls.py

Lines changed: 77 additions & 0 deletions
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin


def get_national_jukebox_song_detail_urls(base_url: str) -> list[str]:
    """
    Scrapes the National Jukebox collection page to extract URLs for
    individual song detail pages.

    Args:
        base_url: The URL of the main collection page (e.g.,
            "https://www.loc.gov/collections/national-jukebox/?sb=date_desc").

    Returns:
        A list of URLs for the song detail pages.
    """
    print(f"Fetching content from: {base_url}")
    try:
        response = requests.get(base_url)
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx).
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    detail_urls = []

    # Library of Congress detail pages follow the pattern /item/{id}/, and
    # National Jukebox items use /item/jukebox-{id}/ specifically. Rather than
    # depending on the markup of the search-results container, scan every
    # anchor tag and keep the ones whose href matches that pattern.
    for link in soup.find_all('a', href=True):
        href = link['href']
        if '/item/jukebox' in href and not href.startswith('#'):
            # Construct an absolute URL; hrefs on the page may be relative.
            full_url = urljoin(base_url, href)
            # Avoid duplicates when the same item link appears multiple times.
            if full_url not in detail_urls:
                detail_urls.append(full_url)

    return detail_urls


if __name__ == "__main__":
    target_url = "https://www.loc.gov/collections/national-jukebox/?sb=date_desc&c=100"
    song_urls = get_national_jukebox_song_detail_urls(target_url)

    if song_urls:
        print("\nFound song detail page URLs:")
        for url in song_urls:
            print(url)
        print(f"\nTotal URLs found: {len(song_urls)}")
    else:
        print("No song detail URLs found or an error occurred.")
