
Commit e095dc1

Merge remote-tracking branch 'origin/main' into airflow-demo
2 parents: 83a834c + f81a082

File tree: 7 files changed, +2956 −0 lines

2025/national-jukebox/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
data/*
Lines changed: 95 additions & 0 deletions (file path not shown in this extract; this is the pipeline's main download script)
@@ -0,0 +1,95 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import pathlib
import time

import pandas
import requests

import list_urls
import extract_item_info
import extract_mp3
import download_mp3s


DATA_DIR = pathlib.Path(__file__).parent / "data"


# target_url = "https://www.loc.gov/collections/national-jukebox/?sb=date_desc&c=100"
target_url_template = "https://www.loc.gov/collections/national-jukebox/?c=100&sb=date_desc&sp={}"


def download_and_extract_item(base_url):
    print(f"Fetching content from: {base_url}")
    # https://guides.loc.gov/digital-scholarship/faq
    # Stay within the 20 requests per minute rate limit.
    time.sleep(3)

    try:
        response = requests.get(base_url, timeout=10)
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx).
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None

    item = extract_item_info.extract_subheadings_to_dict(response.text)
    mp3_url = extract_mp3.extract_mp3_url(response.text)
    item["MP3 URL"] = mp3_url
    item["URL"] = base_url
    return item


def download_page(page_number):
    target_url = target_url_template.format(page_number)
    item_urls = list_urls.get_national_jukebox_song_detail_urls(target_url)

    visited_urls = frozenset()
    jukebox_path = DATA_DIR / "jukebox.jsonl"

    if jukebox_path.exists():
        jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")
        if "URL" in jukebox.columns:
            visited_urls = frozenset(jukebox["URL"].to_list())

    with open(jukebox_path, "a") as data_file:
        while item_urls:
            item_url = item_urls.pop(0)
            if item_url in visited_urls:
                continue

            item = download_and_extract_item(item_url)
            if item is None:
                # Re-queue the URL so a transient fetch failure is retried later.
                item_urls.append(item_url)
                continue

            json.dump(item, data_file, indent=None)
            data_file.write("\n")
            data_file.flush()


if __name__ == "__main__":
    page_number = 4
    while True:
        print(f"Page {page_number}")
        try:
            download_page(page_number)
            download_mp3s.download_all()
        except requests.exceptions.HTTPError as exc:
            if exc.response.status_code == 404:
                print("Reached last page?")
                break
            raise  # Surface unexpected HTTP errors instead of silently skipping the page.
        page_number += 1
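
The list_urls module imported above is part of this commit, but its diff is not visible in this extract. A minimal sketch of what get_national_jukebox_song_detail_urls could look like, assuming it collects /item/jukebox-*/ links from a collection results page; the function name comes from the call above, but the body here is a guess, not the committed code:

import requests
from bs4 import BeautifulSoup


def get_national_jukebox_song_detail_urls(collection_url):
    """Return the song detail-page URLs linked from one results page (sketch)."""
    response = requests.get(collection_url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    urls = []
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"]
        # Detail pages look like https://www.loc.gov/item/jukebox-679643/
        if "/item/jukebox-" in href and href not in urls:
            urls.append(href)
    return urls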
Lines changed: 63 additions & 0 deletions (file path not shown; presumably download_mp3s.py, inferred from the main script's "import download_mp3s")
@@ -0,0 +1,63 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pathlib
import time

import pandas
import requests


DATA_DIR = pathlib.Path(__file__).parent / "data"


def download_mp3(base_url):
    print(f"Fetching content from: {base_url}")
    # https://guides.loc.gov/digital-scholarship/faq
    # Stay within the 20 requests per minute rate limit.
    time.sleep(3)

    try:
        response = requests.get(base_url, timeout=10)
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx).
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None

    return response.content


def download_all():
    jukebox_path = DATA_DIR / "jukebox.jsonl"
    jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")

    # for _, row in jukebox.iterrows():
    for _, row in jukebox.iloc[100:].iterrows():
        jukebox_id = row["URL"].split("/")[-2]
        mp3_path = (DATA_DIR / jukebox_id).with_suffix(".mp3")
        if mp3_path.exists():
            continue

        mp3_url = row["MP3 URL"]
        if not isinstance(mp3_url, str):
            # Skip records where no MP3 URL could be extracted.
            continue

        mp3_bytes = download_mp3(mp3_url)
        if mp3_bytes is None:
            continue

        with open(mp3_path, "wb") as mp3_file:
            mp3_file.write(mp3_bytes)
        print(f"Wrote {mp3_path}")


if __name__ == "__main__":
    download_all()
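
download_all keys everything off the URL column of jukebox.jsonl. A quick illustration of the ID derivation (the URL is one that appears elsewhere in this commit):

# The trailing slash on loc.gov item URLs means split("/") ends with an
# empty string, so index -2 is the item ID rather than "".
url = "https://www.loc.gov/item/jukebox-679643/"
jukebox_id = url.split("/")[-2]
print(jukebox_id)  # jukebox-679643
# The MP3 is then written alongside the metadata, e.g. data/jukebox-679643.mp3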
Lines changed: 110 additions & 0 deletions (file path not shown; presumably extract_item_info.py, inferred from the main script's "import extract_item_info")
@@ -0,0 +1,110 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json

from bs4 import BeautifulSoup
import requests


def extract_subheadings_to_dict(html_content):
    """
    Extracts subheadings from the "About this item" section of HTML
    and returns them as a dictionary.

    Args:
        html_content (str): The HTML content as a string.

    Returns:
        dict: A dictionary where each subheading is a key, and its
        corresponding value is a list of items under that subheading.
        Returns an empty dict if the section is not found.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    about_this_item_section = soup.find('div', id='about-this-item')

    if not about_this_item_section:
        return {}

    subheadings_data = {}

    # Find the div that contains the actual cataloged data.
    item_cataloged_data = about_this_item_section.find('div', class_='item-cataloged-data')

    if item_cataloged_data:
        # Iterate through each subheading (h3) within this div.
        for h3_tag in item_cataloged_data.find_all('h3'):
            subheading_text = h3_tag.get_text(strip=True)
            items = []
            # The items for each subheading are in the immediately following <ul>.
            ul_tag = h3_tag.find_next_sibling('ul')
            if ul_tag:
                for li_tag in ul_tag.find_all('li'):
                    # Get text from list items, handling potential nested structures or links.
                    item_text = li_tag.get_text(strip=True)
                    items.append(item_text)
            subheadings_data[subheading_text] = items

    # Extract the "Part of" section; it sits outside item-cataloged-data but is still a subheading.
    part_of_section = about_this_item_section.find('div', id='part-of')
    if part_of_section:
        h3_tag = part_of_section.find('h3')
        if h3_tag:
            subheading_text = h3_tag.get_text(strip=True)
            items = []
            ul_tag = h3_tag.find_next_sibling('ul')
            if ul_tag:
                for li_tag in ul_tag.find_all('li'):
                    item_text = li_tag.get_text(strip=True)
                    # Remove the count in parentheses if present, e.g., "(10,009)".
                    if '(' in item_text and item_text.endswith(')'):
                        item_text = item_text.rsplit('(', 1)[0].strip()
                    items.append(item_text)
            subheadings_data[subheading_text] = items

    # Extract the IIIF Presentation Manifest.
    iiif_manifest_section = about_this_item_section.find('h3', id='item-iiif-presentation-manifest')
    if iiif_manifest_section:
        subheading_text = iiif_manifest_section.get_text(strip=True)
        items = []
        ul_tag = iiif_manifest_section.find_next_sibling('ul')
        if ul_tag:
            for li_tag in ul_tag.find_all('li'):
                item_text = li_tag.get_text(strip=True)
                items.append(item_text)
        subheadings_data[subheading_text] = items

    return subheadings_data


def download_and_extract(base_url):
    print(f"Fetching content from: {base_url}")
    try:
        response = requests.get(base_url, timeout=10)
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx).
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None

    item = extract_subheadings_to_dict(response.text)
    item["URL"] = base_url
    return item


if __name__ == "__main__":
    target_url = "https://www.loc.gov/item/jukebox-679643/"
    item = download_and_extract(target_url)
    if item:
        print("\nExtracted item info:")
        print(json.dumps(item, indent=4))
    else:
        print("No item info extracted or an error occurred.")
Lines changed: 60 additions & 0 deletions (file path not shown; presumably extract_mp3.py, inferred from the main script's "import extract_mp3")
@@ -0,0 +1,60 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from bs4 import BeautifulSoup
import requests


def extract_mp3_url(html_content):
    """
    Extracts the MP3 download URL from the given HTML content.

    Args:
        html_content (str): The HTML content of the webpage.

    Returns:
        str or None: The MP3 download URL if found, otherwise None.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the select element that contains download options.
    # Based on the HTML, it has an ID of 'select-resource0'.
    download_select = soup.find('select', id='select-resource0')

    if download_select:
        # Find the option tag specifically for the AUDIO (MP3) download;
        # it has a data-file-download attribute set to "AUDIO".
        mp3_option = download_select.find('option', attrs={'data-file-download': 'AUDIO'})
        if mp3_option:
            return mp3_option['value']  # The value attribute holds the URL.
    return None  # The select or option was not found.


# Example usage: fetch a detail page and extract its MP3 URL.
if __name__ == "__main__":
    url = "https://www.loc.gov/item/jukebox-679643/"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an exception for HTTP errors.
        html_doc = response.text

        mp3_url = extract_mp3_url(html_doc)

        if mp3_url:
            print(f"Extracted MP3 URL: {mp3_url}")
        else:
            print("MP3 URL not found in the HTML.")

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
