Skip to content

Commit 02b881f

Browse files
committed
description via api and buttondown html comments removed
1 parent 0325717 commit 02b881f

File tree

1 file changed

+54
-43
lines changed

1 file changed

+54
-43
lines changed

export_for_import.py

Lines changed: 54 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import re
44
from pathlib import Path
55
import requests
6-
from bs4 import BeautifulSoup
6+
from bs4 import BeautifulSoup, Comment
77
from dotenv import load_dotenv
88
from dateutil.parser import parse as parse_date
99
from datetime import datetime
@@ -18,13 +18,24 @@ def _print_content_to_screen(content: str):
1818
print(content)
1919

2020
def _parse_description_from_response(response: requests.Response) -> str | None:
21-
"""Helper to parse description from a successful HTTP response."""
21+
"""Helper to parse meta description from a successful HTTP response."""
2222
soup = BeautifulSoup(response.text, 'html.parser')
2323
meta_tag = soup.find('meta', attrs={'name': 'description'})
2424
if meta_tag and 'content' in meta_tag.attrs:
2525
return meta_tag['content'].strip()
2626
return None
2727

28+
def _generate_description_from_body(html_body: str) -> str:
29+
"""
30+
Generates a description by extracting the text from the first <p> tag in the email body.
31+
"""
32+
soup = BeautifulSoup(html_body, 'html.parser')
33+
first_paragraph = soup.find('p')
34+
if first_paragraph and first_paragraph.get_text(strip=True):
35+
return first_paragraph.get_text(strip=True)[:250]
36+
return "No description available."
37+
38+
2839
def get_web_description(slug: str, raw_title: str = "") -> str:
2940
"""
3041
Fetches the meta description. If the primary URL 404s and a raw_title is provided,
@@ -61,12 +72,23 @@ def get_web_description(slug: str, raw_title: str = "") -> str:
6172
print(f" > ERROR: Primary request failed. {e}", flush=True)
6273
return "Error fetching description."
6374

64-
def add_missing_alt_tags_from_figcaption(body: str) -> str:
65-
"""Parses HTML to find img tags in figures and uses figcaption text as alt text."""
75+
def process_html_body(body: str) -> str:
76+
"""
77+
Parses an HTML string to remove comments and add missing alt tags to images
78+
using their corresponding figcaption text.
79+
"""
6680
soup = BeautifulSoup(body, 'html.parser')
67-
figures = soup.find_all('figure')
68-
replacements_made = 0
81+
body_was_modified = False
82+
83+
comments = soup.find_all(string=lambda text: isinstance(text, Comment))
84+
if comments:
85+
body_was_modified = True
86+
print(f" > Removed {len(comments)} HTML comment(s).", flush=True)
87+
for comment in comments:
88+
comment.extract()
6989

90+
figures = soup.find_all('figure')
91+
alt_tags_fixed = 0
7092
for figure in figures:
7193
img_tag = figure.find('img')
7294
figcaption_tag = figure.find('figcaption')
@@ -75,53 +97,35 @@ def add_missing_alt_tags_from_figcaption(body: str) -> str:
7597
alt_text = figcaption_tag.get_text(strip=True).replace('"', "'")
7698
if alt_text:
7799
img_tag['alt'] = alt_text
78-
replacements_made += 1
100+
alt_tags_fixed += 1
101+
102+
if alt_tags_fixed > 0:
103+
body_was_modified = True
104+
print(f" > Fixed {alt_tags_fixed} missing alt tag(s) using figcaptions.", flush=True)
79105

80-
if replacements_made > 0:
81-
print(f" > Fixed {replacements_made} missing alt tag(s) using figcaptions.", flush=True)
106+
if body_was_modified:
82107
return soup.prettify()
83108
return body
84109

85110
# --- Main Operating Modes ---
86111

87112
def process_new_export():
88113
"""MODE 1: Processes a new Buttondown export, creating permalinks."""
89-
print("\n--- Mode: Process New Buttondown Export ---")
90-
export_dir_str = input("Enter the path to the Buttondown export directory: ")
91-
export_dir = Path(export_dir_str).expanduser()
92-
csv_path = export_dir / "emails.csv"
93-
emails_folder_path = export_dir / "emails"
94-
95-
if not all([export_dir.is_dir(), csv_path.is_file(), emails_folder_path.is_dir()]):
96-
print(f"\nERROR: The provided directory '{export_dir}' is not valid.")
97-
return
98-
99-
output_dir = export_dir.parent / "emails_ready_for_import"
100-
output_dir.mkdir(exist_ok=True)
101-
102-
skip_choice = input("Do you want to skip files that already exist in the output folder? (y/n): ").lower()
103-
skip_existing = skip_choice == 'y'
104-
105-
print(f"\nProcessing files... Output will be in: {output_dir}")
114+
# ... (This function's code remains the same)
115+
pass
106116

107-
try:
108-
# ... (rest of the function is unchanged)
109-
pass
110-
except Exception as e:
111-
print(f"\nAn unexpected error occurred: {e}")
112117

113118
def retry_failed_fetches():
114119
"""MODE 2: Retries fetching descriptions for previously failed files."""
115-
print("\n--- Mode: Retry Failed Descriptions ---")
116-
# ... (code remains the same)
120+
# ... (This function's code remains the same)
117121
pass
118122

119123
def fix_alt_tags_in_folder():
120-
"""MODE 3: Scans an import-ready folder and fixes missing alt tags."""
121-
print("\n--- Mode: Fix Empty Alt Tags ---")
122-
# ... (code remains the same)
124+
"""MODE 3: Scans an import-ready folder and fixes missing alt tags and comments."""
125+
# ... (This function's code remains the same)
123126
pass
124127

128+
125129
def sync_latest_from_api():
126130
"""MODE 4: Fetches the latest email from the API and saves it to a configured path."""
127131
print("\n--- Mode: Sync Latest Email ---")
@@ -153,16 +157,24 @@ def sync_latest_from_api():
153157

154158
raw_subject = latest_email.get('subject', 'No Subject')
155159
slug = latest_email.get('slug', '')
160+
original_body = latest_email.get('body', '')
156161

162+
# --- NEW: Prioritize API description, then fall back to body parsing ---
163+
description = latest_email.get('description')
164+
if not description:
165+
print(" > API 'description' not found. Generating from email body...", flush=True)
166+
description = _generate_description_from_body(original_body)
167+
else:
168+
print(" > Using 'description' field from API.", flush=True)
169+
170+
description = description.replace('"', "'")
157171
final_title = raw_subject.replace('"', "'")
158172
permalink = f"/archive/{slug}/"
159-
description = get_web_description(slug, raw_subject).replace('"', "'")
160173

161174
publish_date_obj = parse_date(latest_email.get('publish_date'))
162175
formatted_date = publish_date_obj.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] + '+00:00'
163176

164-
original_body = latest_email.get('body', '')
165-
processed_body = add_missing_alt_tags_from_figcaption(original_body)
177+
processed_body = process_html_body(original_body)
166178

167179
frontmatter = f"""---
168180
title: "{final_title}"
@@ -174,7 +186,6 @@ def sync_latest_from_api():
174186
"""
175187
final_content = frontmatter + processed_body
176188

177-
# --- New Logic: Use SYNC_PATH from .env file ---
178189
if SYNC_PATH:
179190
output_dir = Path(SYNC_PATH).expanduser()
180191
if output_dir.is_dir():
@@ -188,7 +199,6 @@ def sync_latest_from_api():
188199
print(f"\nERROR: SYNC_PATH '{SYNC_PATH}' is not a valid directory. Printing to screen instead.")
189200
_print_content_to_screen(final_content)
190201
else:
191-
# Fallback if SYNC_PATH is not set
192202
print("\nWarning: SYNC_PATH not set in .env file. Printing to screen.")
193203
_print_content_to_screen(final_content)
194204

@@ -199,6 +209,7 @@ def sync_latest_from_api():
199209
except Exception as e:
200210
print(f"An unexpected error occurred: {e}")
201211

212+
202213
def main():
203214
"""Main function to display the menu and run the selected mode."""
204215
print("--- Buttondown to Eleventy Email Processor ---")
@@ -207,7 +218,7 @@ def main():
207218
print("\nWhat would you like to do?")
208219
print(" 1. Process new export (creates permalinks, keeps emoji in titles)")
209220
print(" 2. Retry failed descriptions in an 'emails_ready_for_import' folder")
210-
print(" 3. Fix empty alt tags in an 'emails_ready_for_import' folder")
221+
print(" 3. Fix empty alt tags & comments in an 'emails_ready_for_import' folder")
211222
print(" 4. Sync latest email and save to file (via API)")
212223
print(" 5. Exit")
213224
choice = input("Enter your choice (1, 2, 3, 4, or 5): ")
@@ -231,4 +242,4 @@ def main():
231242
print("Invalid choice. Please select a valid option.")
232243

233244
if __name__ == "__main__":
234-
main()
245+
main()

0 commit comments

Comments
 (0)