# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

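"""Scrape item metadata from a Library of Congress "About this item" section.

Fetches an item page over HTTP, parses its "About this item" section with
BeautifulSoup, and prints each subheading together with its list of values
as JSON.
"""
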
import json

import requests
from bs4 import BeautifulSoup

def extract_subheadings_to_dict(html_content):
    """
    Extracts subheadings from the "About this item" section of HTML
    and returns them as a dictionary.

    Args:
        html_content (str): The HTML content as a string.

    Returns:
        dict: A dictionary where each subheading is a key, and its
            corresponding value is a list of items under that subheading.
            Returns an empty dictionary if the section is not found.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    about_this_item_section = soup.find('div', id='about-this-item')

    if not about_this_item_section:
        return {}

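    # The loc.gov markup (at the time of writing) pairs each <h3> subheading
    # with a <ul> of values immediately after it; the loops below rely on
    # that sibling structure.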
    subheadings_data = {}

    # Find the div that contains the actual cataloged data
    item_cataloged_data = about_this_item_section.find('div', class_='item-cataloged-data')

    if item_cataloged_data:
        # Iterate through each subheading (h3) within this div
        for h3_tag in item_cataloged_data.find_all('h3'):
            subheading_text = h3_tag.get_text(strip=True)
            items = []
            # The items for each subheading are in the immediately following <ul>
            ul_tag = h3_tag.find_next_sibling('ul')
            if ul_tag:
                for li_tag in ul_tag.find_all('li'):
                    # Get text from list items, handling potential nested structures or links
                    item_text = li_tag.get_text(strip=True)
                    items.append(item_text)
            subheadings_data[subheading_text] = items

    # Extract "Part of" section as it's outside item-cataloged-data
    # but still a subheading
    part_of_section = about_this_item_section.find('div', id='part-of')
    if part_of_section:
        h3_tag = part_of_section.find('h3')
        if h3_tag:
            subheading_text = h3_tag.get_text(strip=True)
            items = []
            ul_tag = h3_tag.find_next_sibling('ul')
            if ul_tag:
                for li_tag in ul_tag.find_all('li'):
                    item_text = li_tag.get_text(strip=True)
                    # Remove the count in parentheses if present, e.g., "(10,009)"
                    if '(' in item_text and item_text.endswith(')'):
                        item_text = item_text.rsplit('(', 1)[0].strip()
                    items.append(item_text)
            subheadings_data[subheading_text] = items

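    # IIIF = International Image Interoperability Framework; the manifest is a
    # JSON document describing the item's digitized content.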
    # Extract IIIF Presentation Manifest
    iiif_manifest_section = about_this_item_section.find('h3', id='item-iiif-presentation-manifest')
    if iiif_manifest_section:
        subheading_text = iiif_manifest_section.get_text(strip=True)
        items = []
        ul_tag = iiif_manifest_section.find_next_sibling('ul')
        if ul_tag:
            for li_tag in ul_tag.find_all('li'):
                item_text = li_tag.get_text(strip=True)
                items.append(item_text)
        subheadings_data[subheading_text] = items

    return subheadings_data


def download_and_extract(base_url):
    """Fetches an item page and returns its extracted metadata as a dict.

    The returned dict includes a "URL" key holding the fetched URL.
    Returns None if the request fails.
    """
    print(f"Fetching content from: {base_url}")
    try:
        # A timeout keeps a stalled connection from hanging the script
        response = requests.get(base_url, timeout=30)
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None

    item = extract_subheadings_to_dict(response.text)
    item["URL"] = base_url
    return item

# Example: extract the metadata for a single National Jukebox item
if __name__ == "__main__":
    target_url = "https://www.loc.gov/item/jukebox-679643/"
    item = download_and_extract(target_url)
    if item:
        print("\nExtracted item metadata:")
        print(json.dumps(item, indent=4))
    else:
        print("No item metadata extracted or an error occurred.")