Skip to content

Commit 8088234

Browse files
authored
Refactor docs retrieval, and canonical uri addition (boostorg#1857) (boostorg#1924)
1 parent ba2c99f commit 8088234

File tree

12 files changed

+340
-230
lines changed

12 files changed

+340
-230
lines changed

core/boostrenderer.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import structlog
88
from botocore.exceptions import ClientError
99
from bs4 import BeautifulSoup, Tag
10+
import chardet
1011
from django.conf import settings
1112
from mistletoe import HtmlRenderer
1213
from mistletoe.span_token import SpanToken
@@ -23,6 +24,10 @@ def extract_file_data(response, s3_key):
2324
"""Extracts the file content, content type, and last modified date from an S3
2425
response object."""
2526
file_content = response["Body"].read()
27+
detected_encoding = chardet.detect(file_content)["encoding"] or "utf-8"
28+
# decoding here stops django debug toolbar erroring on non-utf-8, e.g. preprocessor
29+
if detected_encoding != "utf-8":
30+
file_content = file_content.decode(detected_encoding).encode("utf-8")
2631
content_type = get_content_type(s3_key, response["ContentType"])
2732
last_modified = response["LastModified"]
2833
return {

core/constants.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import re
12
from enum import Enum
23

34

@@ -7,3 +8,63 @@ class SourceDocType(Enum):
78

89

910
SLACK_URL = "https://cpplang.slack.com"
11+
# Splits a docs path into (optional "boost_" prefix, version, remainder).
# Version spellings seen in practice: boost_1_53_0_beta1, 1_82_0, 1_55_0b1.
# The version group must contain at least one digit, and runs up to the
# first "/"; everything after that slash is captured as the remainder.
BOOST_LIB_PATH_RE = re.compile(
    r"^(boost_){0,1}([0-9_]*[0-9]+[^/]*)/(.*)"
)
13+
# Path fragments whose documentation is left completely untouched:
# the stored contents are rendered directly, with no processing.
NO_PROCESS_LIBS = [
    "libs/filesystem",
    "libs/gil",
    "libs/hana",
    "libs/locale",
    "libs/iostreams",
    "libs/preprocessor",
    "libs/serialization",
    "libs/wave",
]
24+
# Path fragments whose documentation gets a header added, but no wrapper.
NO_WRAPPER_LIBS = [
    "libs/array",
    "libs/assert",
    "libs/bloom",
    "libs/charconv",
    "libs/cobalt",
    "libs/compat",
    "libs/container_hash",
    "libs/describe",
    "libs/endian",
    "libs/exception",
    "libs/hash2",
    "libs/io",
    "libs/lambda2",
    "libs/leaf",
    "libs/mp11",
    "libs/predef",
    "libs/process",
    "doc/html/process",
    "libs/property_map_parallel",
    "libs/qvm",
    "libs/redis",
    "libs/smart_ptr",
    "libs/system",
    "libs/throw_exception",
    "libs/unordered",
    "libs/uuid",
    "libs/variant2",
]
54+
# Version-qualified path fragments that receive the full modernization.
# FIXME: we should have a way to opt-in via a flag on the library/lib-version.
#  Hard-coding these here as a quick fix for now.
# TODO: create a ticket for this
FULLY_MODERNIZED_LIB_VERSIONS = [
    # Not a library version, but tools are somewhat analogous
    "tools/",
    # Boost.Charconv
    "1_87_0/libs/charconv",
    "1_88_0/libs/charconv",
    "1_89_0/libs/charconv",
    "latest/libs/charconv",
    "develop/libs/charconv",
    "master/libs/charconv",
    # Boost.Redis
    "1_89_0/libs/redis",
    "latest/libs/redis",
    "develop/libs/redis",
    "master/libs/redis",
    # Antora-built URL docs
    "doc/antora/url",
]

core/htmlhelper.py

Lines changed: 87 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,20 @@
11
import re
22

33
from bs4 import BeautifulSoup, Comment, Tag
4+
from django.http import HttpHeaders
45
from django.template.loader import render_to_string
56
from django.templatetags.static import static
6-
from lxml import html
7+
from structlog import get_logger
78

89
from core.boostrenderer import get_body_from_html
9-
from core.constants import SourceDocType
10+
from core.constants import (
11+
SourceDocType,
12+
NO_PROCESS_LIBS,
13+
NO_WRAPPER_LIBS,
14+
FULLY_MODERNIZED_LIB_VERSIONS,
15+
)
16+
17+
logger = get_logger()
1018

1119
# List HTML elements (with relevant attributes) to remove the FIRST occurrence
1220
REMOVE_TAGS = [
@@ -41,6 +49,8 @@
4149
("table", {"cellpadding": "2", "width": "100%"}),
4250
# Remove the first hr from the page
4351
("hr", {}),
52+
# remove canonical tags
53+
("link", {"rel": "canonical"}),
4454
]
4555

4656
# these tags are only removed on the release page, update REMOVE_TAGS for all pages
@@ -154,44 +164,33 @@ def is_end_comment(html_element):
154164

155165

156166
def modernize_legacy_page(
157-
content,
158-
base_html,
159-
head_selector="head",
160-
insert_body=True,
167+
soup: BeautifulSoup,
168+
base_html: str,
169+
head_selector: str | dict[str, str] = "head",
170+
insert_body: bool = True,
161171
original_docs_type: SourceDocType | None = None,
162-
skip_replace_boostlook=False,
163-
show_footer=True,
164-
show_navbar=True,
165-
):
172+
skip_replace_boostlook: bool = False,
173+
show_footer: bool = True,
174+
show_navbar: bool = True,
175+
) -> str:
166176
"""Modernize a legacy Boost documentation page."""
167177
HIDE_TAGS_BASE = []
168178
if not show_navbar:
169179
HIDE_TAGS_BASE.append(("div", {"class": "header-menu-bar topnavbar"})),
170180

171-
result = BeautifulSoup(content, "html.parser")
172-
if result.html is None:
181+
if soup.html is None:
173182
# Not an HTML file we care about
174-
return content
175-
# Remove the first occurrence of legacy header(s) and other stuff
176-
for tag_name, tag_attrs in REMOVE_TAGS:
177-
tag = result.find(tag_name, tag_attrs)
178-
if tag:
179-
tag.decompose()
180-
181-
# Remove all navbar-like divs, if any
182-
for tag_name, tag_attrs in REMOVE_ALL:
183-
for tag in result.find_all(tag_name, tag_attrs):
184-
tag.decompose()
183+
return soup.prettify(formatter="html")
185184

186185
# Remove CSS classes that produce visual harm
187186
for tag_name, tag_attrs in REMOVE_CSS_CLASSES:
188-
for tag in result.find_all(tag_name, tag_attrs):
187+
for tag in soup.find_all(tag_name, tag_attrs):
189188
tag.attrs.pop("class")
190189

191-
result = convert_name_to_id(result)
190+
soup = convert_name_to_id(soup)
192191
if not skip_replace_boostlook:
193-
result = remove_library_boostlook(result)
194-
result = remove_embedded_boostlook(result)
192+
soup = remove_library_boostlook(soup)
193+
soup = remove_embedded_boostlook(soup)
195194

196195
# Use the base HTML to later extract the <head> and (part of) the <body>
197196
placeholder = BeautifulSoup(base_html, "html.parser")
@@ -204,62 +203,56 @@ def modernize_legacy_page(
204203

205204
if target_head:
206205
# Append the <head> taken from the base HTML to the existing (legacy) head
207-
_insert_head(result, target_head)
206+
_insert_head(soup, target_head)
208207

209-
original_body = result.body
208+
original_body = soup.body
210209
if original_body is None:
211210
pass
212211
elif placeholder.body is not None:
213212
if insert_body:
214213
# Beautify the legacy body with structure and classes from the
215214
# modern one, and embed the original body into a:
216215
# <div id="boost-legacy-docs-body"></div> block
217-
_replace_body(result, original_body, base_body=placeholder.body)
216+
_replace_body(soup, original_body, base_body=placeholder.body)
218217
else:
219218
_insert_in_doc(
220-
result.body,
219+
soup.body,
221220
placeholder.find("div", {"id": "boost-legacy-docs-header"}),
222221
append=False,
223222
)
224-
wrap_main_body_elements(result, original_docs_type)
223+
wrap_main_body_elements(soup, original_docs_type)
225224
if show_footer:
226225
rendered_template = render_to_string("includes/_footer.html", {})
227226
rendered_template_as_dom = BeautifulSoup(
228227
rendered_template, "html.parser"
229228
)
230-
result.append(rendered_template_as_dom)
229+
soup.append(rendered_template_as_dom)
231230

232231
# Remove tags from the base template
233-
result = hide_tags(result, HIDE_TAGS_BASE)
232+
soup = hide_tags(soup, HIDE_TAGS_BASE)
233+
234+
return soup.prettify(formatter="html")
234235

235-
content = str(result)
236236

237+
def minimize_uris(content: str) -> str:
    """Rewrite absolute boost.org documentation URLs as site-local paths.

    Every occurrence of ``https://www.boost.org/doc/libs/`` in *content* is
    replaced with ``/doc/libs/``; the rest of the text is returned unchanged.
    """
    absolute_prefix = "https://www.boost.org/doc/libs/"
    local_prefix = "/doc/libs/"
    return content.replace(absolute_prefix, local_prefix)
241241

242242

243-
def slightly_modernize_legacy_library_doc_page(content):
244-
"""Modernize a legacy Boost library documentation page, but only minimally."""
245-
try:
246-
root = html.fromstring(content)
247-
except Exception:
248-
return content # Not valid HTML
249-
250-
for tag_name, attrs in REMOVE_TAGS:
251-
xpath = build_xpath(tag_name, attrs)
252-
elements = root.xpath(xpath)
253-
if elements:
254-
elements[0].getparent().remove(elements[0]) # Remove only first
243+
def remove_unwanted(content: BeautifulSoup) -> BeautifulSoup:
    """Strip legacy elements from a parsed document, in place.

    For each (tag name, attributes) pair in ``REMOVE_TAGS`` only the first
    matching element is removed; for each pair in ``REMOVE_ALL`` every
    matching element is removed. The same soup object is returned.
    """
    # One-shot removals: legacy header(s) and similar, first match only.
    for name, attrs in REMOVE_TAGS:
        first_match = content.find(name, attrs)
        if not first_match:
            continue
        first_match.decompose()

    # Exhaustive removals: all navbar-like divs, if any.
    for name, attrs in REMOVE_ALL:
        for match in content.find_all(name, attrs):
            match.decompose()

    return content
263256

264257

265258
def build_xpath(tag, attrs):
@@ -345,6 +338,14 @@ def remove_library_boostlook(soup):
345338
return soup
346339

347340

341+
def add_canonical_link(soup, canonical_uri):
    """Append ``<link rel="canonical" href=...>`` to the document head.

    Nothing is added when *canonical_uri* is falsy or the document has no
    ``<head>``. The (possibly modified) soup is returned either way.
    """
    # Check the URI first so that the soup is never touched without one.
    if not canonical_uri or not soup.head:
        return soup

    soup.head.append(soup.new_tag("link", rel="canonical", href=canonical_uri))
    return soup
347+
348+
348349
def modernize_preprocessor_docs(soup: BeautifulSoup) -> tuple[BeautifulSoup, bool]:
349350
"""Special case handling for Boost.Preprocessor docs.
350351
@@ -746,3 +747,35 @@ def modernize_release_notes(html_content):
746747
# Replace all links to boost.org with a local link
747748
content = result.replace("https://www.boost.org/doc/libs/", "/docs/libs/")
748749
return get_body_from_html(content)
750+
751+
752+
def is_in_no_process_libs(path: str) -> bool:
    """Return True when any ``NO_PROCESS_LIBS`` entry occurs within *path*.

    This is plain substring containment, not path-segment matching.
    """
    for lib_slug in NO_PROCESS_LIBS:
        if lib_slug in path:
            return True
    return False
754+
755+
756+
def is_in_fully_modernized_libs(path: str) -> bool:
    """Return True when any ``FULLY_MODERNIZED_LIB_VERSIONS`` entry occurs
    within *path*.

    This is plain substring containment, not path-segment matching.
    """
    for lib_slug in FULLY_MODERNIZED_LIB_VERSIONS:
        if lib_slug in path:
            return True
    return False
758+
759+
760+
def is_in_no_wrapper_libs(path: str) -> bool:
    """Return True when any ``NO_WRAPPER_LIBS`` entry occurs within *path*.

    This is plain substring containment, not path-segment matching, so a
    short entry such as ``libs/io`` also matches ``libs/iostreams``.
    """
    for lib_slug in NO_WRAPPER_LIBS:
        if lib_slug in path:
            return True
    return False
762+
763+
764+
def is_managed_content_type(content_type: str) -> bool:
    """Return True when *content_type* contains one of the HTML content types.

    The check is a case-sensitive substring test, so values carrying extra
    parameters (for example a charset) still match.
    """
    html_types = (
        "text/html",
        "text/html; charset=utf-8",
    )
    return any(html_type in content_type for html_type in html_types)
773+
774+
775+
def is_valid_modernize_value(modernize: str) -> bool:
    """Return True when *modernize* is exactly "max", "med" or "min"."""
    accepted_levels = ("max", "med", "min")
    return modernize in accepted_levels
777+
778+
779+
def get_is_iframe_destination(headers: HttpHeaders) -> bool:
    """Return True when the ``Sec-Fetch-Dest`` header is "iframe" or "frame".

    A missing header is treated as an empty string, which yields False.
    Used to detect requests coming from an iframe so that modernization can
    be disabled for them.
    """
    fetch_destination = headers.get("Sec-Fetch-Dest", "")
    return fetch_destination in ("iframe", "frame")

0 commit comments

Comments
 (0)