Improve performance of processing large docs files (boostorg#1799)

gregjkal · web-flow · commit 72112febff0d · 2025-05-21T18:43:03.000-04:00
diff --git a/core/htmlhelper.py b/core/htmlhelper.py
@@ -3,6 +3,7 @@
 from bs4 import BeautifulSoup, Comment, Tag
 from django.template.loader import render_to_string
 from django.templatetags.static import static
+from lxml import html
 
 from core.boostrenderer import get_body_from_html
 from core.constants import SourceDocType
@@ -241,26 +242,63 @@ def modernize_legacy_page(
 
 def slightly_modernize_legacy_library_doc_page(content):
     """Modernize a legacy Boost library documentation page, but only minimally."""
-    result = BeautifulSoup(content, "html.parser")
-    if result.html is None:
-        # Not an HTML file we care about
-        return content
-    # Remove the first occurrence of legacy header(s) and other stuff
-    for tag_name, tag_attrs in REMOVE_TAGS:
-        tag = result.find(tag_name, tag_attrs)
-        if tag:
-            tag.decompose()
+    try:
+        root = html.fromstring(content)
+    except Exception:
+        # lxml raises when it cannot build a tree at all (e.g. empty input);
+        # best effort: hand the page back untouched.
+        return content
+    if root.tag != "html":
+        # Fragments and plain text still parse, just without an <html> root:
+        # not an HTML file we care about.
+        return content
 
-    for tag_name, tag_attrs in REMOVE_ALL:
-        for tag in result.find_all(tag_name, tag_attrs):
-            tag.decompose()
+    # Remove the first occurrence of legacy header(s) and other stuff
+    for tag_name, attrs in REMOVE_TAGS:
+        elements = root.xpath(build_xpath(tag_name, attrs))
+        if elements:
+            # drop_tree() keeps the text following the element (its tail);
+            # getparent().remove() would delete that text as well.
+            elements[0].drop_tree()
 
-    content = str(result)
+    for tag_name, attrs in REMOVE_ALL:
+        for el in root.xpath(build_xpath(tag_name, attrs)):
+            el.drop_tree()
 
-    # Replace all links to boost.org with a local link
-    content = content.replace("https://www.boost.org/doc/libs/", "/doc/libs/")
+    content = html.tostring(root, encoding="unicode", method="html")
+    # Replace all links to boost.org with a local link
+    return content.replace("https://www.boost.org/doc/libs/", "/doc/libs/")
 
-    return content
+
+def _xpath_literal(value):
+    """Quote ``value`` as an XPath 1.0 string literal."""
+    value = str(value)
+    if "'" not in value:
+        return f"'{value}'"
+    if '"' not in value:
+        return f'"{value}"'
+    # XPath 1.0 has no escape syntax, so splice the pieces with concat().
+    pieces = "', \"'\", '".join(value.split("'"))
+    return f"concat('{pieces}')"
+
+
+def build_xpath(tag, attrs):
+    """Return an XPath selecting descendant ``tag`` elements matching ``attrs``.
+
+    ``class`` is matched per token so that, as with BeautifulSoup,
+    ``{"class": "nav"}`` also selects ``class="nav top"``.
+    """
+    parts = []
+    for key, val in attrs.items():
+        literal = _xpath_literal(val)
+        if key == "class":
+            token = _xpath_literal(f" {val} ")
+            padded = "concat(' ', normalize-space(@class), ' ')"
+            parts.append(f"(@class={literal} or contains({padded}, {token}))")
+        else:
+            parts.append(f"@{key}={literal}")
+    condition = " and ".join(parts)
+    return f".//{tag}[{condition}]" if condition else f".//{tag}"
 
 
 def get_library_documentation_urls(content, name="Alphabetically", parent="h2"):
diff --git a/docs/dependencies.md b/docs/dependencies.md
@@ -6,4 +6,4 @@
 1. Add the package to `requirements.in`
 1. Run `just pip-compile`, which will add the dependency to `requirements.txt`
 1. Run `just rebuild` to rebuild your Docker image to include the new dependencies
-2. Run `docker compose up` and continue with development
+2. Run `just up` and continue with development
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -10,7 +10,7 @@ django==4.2.16
     #   django-debug-toolbar
 django-debug-toolbar==4.4.6
     # via -r ./requirements-dev.in
-pydevd-pycharm==243.22562.180
+pydevd-pycharm==243.26053.29
     # via -r ./requirements-dev.in
 sqlparse==0.5.1
     # via
diff --git a/requirements.in b/requirements.in
@@ -33,6 +33,7 @@ boto3
 jsoncomment
 unidecode
 wordcloud
+lxml
 
 # Logging
 django-tracer
diff --git a/requirements.txt b/requirements.txt
@@ -206,6 +206,8 @@ kiwisolver==1.4.7
     # via matplotlib
 kombu==5.4.2
     # via celery
+lxml==5.4.0
+    # via -r ./requirements.in
 marshmallow==3.22.0
     # via environs
 matplotlib==3.9.2