|
3 | 3 | from bs4 import BeautifulSoup, Comment, Tag |
4 | 4 | from django.template.loader import render_to_string |
5 | 5 | from django.templatetags.static import static |
| 6 | +from lxml import html |
6 | 7 |
|
7 | 8 | from core.boostrenderer import get_body_from_html |
8 | 9 | from core.constants import SourceDocType |
@@ -241,26 +242,30 @@ def modernize_legacy_page( |
241 | 242 |
|
def slightly_modernize_legacy_library_doc_page(content):
    """Modernize a legacy Boost library documentation page, but only minimally.

    The page is parsed with lxml, the first match of every ``REMOVE_TAGS``
    entry and every match of every ``REMOVE_ALL`` entry is dropped, and
    absolute ``https://www.boost.org/doc/libs/`` links are rewritten to the
    local ``/doc/libs/`` path.

    Args:
        content: The raw page markup.

    Returns:
        The modernized markup as a string, or ``content`` unchanged when lxml
        raises while parsing it.
    """
    try:
        root = html.fromstring(content)
    except Exception:
        # Deliberate best-effort: anything lxml cannot parse is passed
        # through untouched rather than failing the whole page render.
        return content

    # Remove the first occurrence of legacy header(s) and other stuff.
    for tag_name, attrs in REMOVE_TAGS:
        matches = root.xpath(build_xpath(tag_name, attrs))
        if matches:
            # drop_tree() (unlike parent.remove()) re-attaches the element's
            # tail text to its neighbour, so the text that follows the
            # removed tag is not silently deleted along with it.
            matches[0].drop_tree()

    # Remove every occurrence of these tags.  The match list is computed
    # before any removal, so dropping nodes while looping is safe.
    for tag_name, attrs in REMOVE_ALL:
        for element in root.xpath(build_xpath(tag_name, attrs)):
            element.drop_tree()

    content = html.tostring(root, encoding="unicode", method="html")
    # Replace all links to boost.org with a local link.
    return content.replace("https://www.boost.org/doc/libs/", "/doc/libs/")
| 264 | + |
def _xpath_literal(value):
    """Return ``value`` as a safely quoted XPath 1.0 string literal."""
    text = str(value)
    if "'" not in text:
        return f"'{text}'"
    if '"' not in text:
        return f'"{text}"'
    # XPath 1.0 has no escape sequences, so a value holding both quote kinds
    # must be stitched together with concat() around each single quote.
    pieces = ", \"'\", ".join(f"'{piece}'" for piece in text.split("'"))
    return f"concat({pieces})"


def build_xpath(tag, attrs):
    """Build a relative XPath selecting ``tag`` descendants by attributes.

    Each attribute is compared with exact string equality (``@key='value'``),
    and all conditions are joined with ``and``.  Note that equality means a
    multi-valued attribute such as ``class="a b"`` is not matched by ``"a"``.

    Args:
        tag: Element name to select.
        attrs: Mapping of attribute name to required value; ``None`` or an
            empty mapping selects every ``tag`` descendant.

    Returns:
        An XPath expression string starting with ``.//``.
    """
    conditions = [
        f"@{key}={_xpath_literal(val)}" for key, val in (attrs or {}).items()
    ]
    if not conditions:
        return f".//{tag}"
    return f".//{tag}[{' and '.join(conditions)}]"
264 | 269 |
|
265 | 270 |
|
266 | 271 | def get_library_documentation_urls(content, name="Alphabetically", parent="h2"): |
|
0 commit comments