Skip to content

Commit 72112fe

Browse files
authored
Improve performance of processing large docs files (boostorg#1799)
1 parent be14363 commit 72112fe

File tree

5 files changed

+26
-18
lines changed

5 files changed

+26
-18
lines changed

core/htmlhelper.py

Lines changed: 21 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
from bs4 import BeautifulSoup, Comment, Tag
44
from django.template.loader import render_to_string
55
from django.templatetags.static import static
6+
from lxml import html
67

78
from core.boostrenderer import get_body_from_html
89
from core.constants import SourceDocType
@@ -241,26 +242,30 @@ def modernize_legacy_page(
241242

242243
def slightly_modernize_legacy_library_doc_page(content):
243244
"""Modernize a legacy Boost library documentation page, but only minimally."""
244-
result = BeautifulSoup(content, "html.parser")
245-
if result.html is None:
246-
# Not an HTML file we care about
247-
return content
248-
# Remove the first occurrence of legacy header(s) and other stuff
249-
for tag_name, tag_attrs in REMOVE_TAGS:
250-
tag = result.find(tag_name, tag_attrs)
251-
if tag:
252-
tag.decompose()
245+
try:
246+
root = html.fromstring(content)
247+
except Exception:
248+
return content # Not valid HTML
253249

254-
for tag_name, tag_attrs in REMOVE_ALL:
255-
for tag in result.find_all(tag_name, tag_attrs):
256-
tag.decompose()
250+
for tag_name, attrs in REMOVE_TAGS:
251+
xpath = build_xpath(tag_name, attrs)
252+
elements = root.xpath(xpath)
253+
if elements:
254+
elements[0].getparent().remove(elements[0]) # Remove only first
257255

258-
content = str(result)
256+
for tag_name, attrs in REMOVE_ALL:
257+
xpath = build_xpath(tag_name, attrs)
258+
for el in root.xpath(xpath):
259+
el.getparent().remove(el)
259260

260-
# Replace all links to boost.org with a local link
261-
content = content.replace("https://www.boost.org/doc/libs/", "/doc/libs/")
261+
content = html.tostring(root, encoding="unicode", method="html")
262+
return content.replace("https://www.boost.org/doc/libs/", "/doc/libs/")
262263

263-
return content
264+
265+
def build_xpath(tag, attrs):
266+
parts = [f"@{key}='{val}'" for key, val in attrs.items()]
267+
condition = " and ".join(parts)
268+
return f".//{tag}[{condition}]" if condition else f".//{tag}"
264269

265270

266271
def get_library_documentation_urls(content, name="Alphabetically", parent="h2"):

docs/dependencies.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,4 +6,4 @@
66
1. Add the package to `requirements.in`
77
1. Run `just pip-compile`, which will add the dependency to `requirements.txt`
88
1. Run `just rebuild` to rebuild your Docker image to include the new dependencies
9-
2. Run `docker compose up` and continue with development
9+
2. Run `just up` and continue with development

requirements-dev.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@ django==4.2.16
1010
# django-debug-toolbar
1111
django-debug-toolbar==4.4.6
1212
# via -r ./requirements-dev.in
13-
pydevd-pycharm==243.22562.180
13+
pydevd-pycharm==243.26053.29
1414
# via -r ./requirements-dev.in
1515
sqlparse==0.5.1
1616
# via

requirements.in

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@ boto3
3333
jsoncomment
3434
unidecode
3535
wordcloud
36+
lxml
3637

3738
# Logging
3839
django-tracer

requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -206,6 +206,8 @@ kiwisolver==1.4.7
206206
# via matplotlib
207207
kombu==5.4.2
208208
# via celery
209+
lxml==5.4.0
210+
# via -r ./requirements.in
209211
marshmallow==3.22.0
210212
# via environs
211213
matplotlib==3.9.2

0 commit comments

Comments
 (0)