cppalliance
diff --git a/core/boostrenderer.py b/core/boostrenderer.py
Lines changed: 5 additions & 0 deletions
diff --git a/core/constants.py b/core/constants.py
Lines changed: 61 additions & 0 deletions
diff --git a/core/htmlhelper.py b/core/htmlhelper.py
Lines changed: 87 additions & 54 deletions
@@ -7,6 +7,7 @@
 import structlog
 from botocore.exceptions import ClientError
 from bs4 import BeautifulSoup, Tag
+import chardet
 from django.conf import settings
 from mistletoe import HtmlRenderer
 from mistletoe.span_token import SpanToken
@@ -23,6 +24,10 @@ def extract_file_data(response, s3_key):
     """Extracts the file content, content type, and last modified date from an S3
     response object."""
     file_content = response["Body"].read()
+    detected_encoding = chardet.detect(file_content)["encoding"] or "utf-8"
+    # re-encoding as utf-8 stops django debug toolbar erroring, e.g. on preprocessor
+    if detected_encoding != "utf-8":
+        file_content = file_content.decode(detected_encoding).encode("utf-8")
     content_type = get_content_type(s3_key, response["ContentType"])
     last_modified = response["LastModified"]
     return {
 
@@ -1,3 +1,4 @@
+import re
 from enum import Enum
 
 
@@ -7,3 +8,63 @@ class SourceDocType(Enum):
 
 
 SLACK_URL = "https://cpplang.slack.com"
+# Matches "<version>/<rest>"; versions look like: boost_1_53_0_beta1, 1_82_0, 1_55_0b1
+BOOST_LIB_PATH_RE = re.compile(r"^(boost_){0,1}([0-9_]*[0-9]+[^/]*)/(.*)")
+NO_PROCESS_LIBS = [
+    # Do nothing with these - just render contents directly
+    "libs/filesystem",
+    "libs/gil",
+    "libs/hana",
+    "libs/locale",
+    "libs/iostreams",
+    "libs/preprocessor",
+    "libs/serialization",
+    "libs/wave",
+]
+NO_WRAPPER_LIBS = [
+    # Add a header to these, but no wrapper.
+    "libs/array",
+    "libs/assert",
+    "libs/bloom",
+    "libs/charconv",
+    "libs/cobalt",
+    "libs/compat",
+    "libs/container_hash",
+    "libs/describe",
+    "libs/endian",
+    "libs/exception",
+    "libs/hash2",
+    "libs/io",
+    "libs/lambda2",
+    "libs/leaf",
+    "libs/mp11",
+    "libs/predef",
+    "libs/process",
+    "doc/html/process",
+    "libs/property_map_parallel",
+    "libs/qvm",
+    "libs/redis",
+    "libs/smart_ptr",
+    "libs/system",
+    "libs/throw_exception",
+    "libs/unordered",
+    "libs/uuid",
+    "libs/variant2",
+]
+FULLY_MODERNIZED_LIB_VERSIONS = [
+    # FIXME: we should have a way to opt-in via a flag on the library/lib-version.
+    #  Hard-coding these here as a quick fix for now.
+    # TODO: create a ticket for this
+    "tools/",  # Not a library version, but tools are somewhat analogous
+    "1_87_0/libs/charconv",
+    "1_88_0/libs/charconv",
+    "1_89_0/libs/charconv",
+    "latest/libs/charconv",
+    "develop/libs/charconv",
+    "master/libs/charconv",
+    "1_89_0/libs/redis",
+    "latest/libs/redis",
+    "develop/libs/redis",
+    "master/libs/redis",
+    "doc/antora/url",
+]
@@ -1,12 +1,20 @@
 import re
 
 from bs4 import BeautifulSoup, Comment, Tag
+from django.http import HttpHeaders
 from django.template.loader import render_to_string
 from django.templatetags.static import static
-from lxml import html
+from structlog import get_logger
 
 from core.boostrenderer import get_body_from_html
-from core.constants import SourceDocType
+from core.constants import (
+    SourceDocType,
+    NO_PROCESS_LIBS,
+    NO_WRAPPER_LIBS,
+    FULLY_MODERNIZED_LIB_VERSIONS,
+)
+
+logger = get_logger()
 
 # List HTML elements (with relevant attributes) to remove the FIRST occurrence
 REMOVE_TAGS = [
@@ -41,6 +49,8 @@
     ("table", {"cellpadding": "2", "width": "100%"}),
     # Remove the first hr from the page
     ("hr", {}),
+    # Remove the first canonical link tag
+    ("link", {"rel": "canonical"}),
 ]
 
 # these tags are only removed on the release page, update REMOVE_TAGS for all pages
@@ -154,44 +164,33 @@ def is_end_comment(html_element):
 
 
 def modernize_legacy_page(
-    content,
-    base_html,
-    head_selector="head",
-    insert_body=True,
+    soup: BeautifulSoup,
+    base_html: str,
+    head_selector: str | dict[str, str] = "head",
+    insert_body: bool = True,
     original_docs_type: SourceDocType | None = None,
-    skip_replace_boostlook=False,
-    show_footer=True,
-    show_navbar=True,
-):
+    skip_replace_boostlook: bool = False,
+    show_footer: bool = True,
+    show_navbar: bool = True,
+) -> str:
     """Modernize a legacy Boost documentation page."""
     HIDE_TAGS_BASE = []
     if not show_navbar:
         HIDE_TAGS_BASE.append(("div", {"class": "header-menu-bar topnavbar"})),
 
-    result = BeautifulSoup(content, "html.parser")
-    if result.html is None:
+    if soup.html is None:
         # Not an HTML file we care about
-        return content
-    # Remove the first occurrence of legacy header(s) and other stuff
-    for tag_name, tag_attrs in REMOVE_TAGS:
-        tag = result.find(tag_name, tag_attrs)
-        if tag:
-            tag.decompose()
-
-    # Remove all navbar-like divs, if any
-    for tag_name, tag_attrs in REMOVE_ALL:
-        for tag in result.find_all(tag_name, tag_attrs):
-            tag.decompose()
+        return soup.prettify(formatter="html")
 
     # Remove CSS classes that produce visual harm
     for tag_name, tag_attrs in REMOVE_CSS_CLASSES:
-        for tag in result.find_all(tag_name, tag_attrs):
+        for tag in soup.find_all(tag_name, tag_attrs):
             tag.attrs.pop("class")
 
-    result = convert_name_to_id(result)
+    soup = convert_name_to_id(soup)
     if not skip_replace_boostlook:
-        result = remove_library_boostlook(result)
-    result = remove_embedded_boostlook(result)
+        soup = remove_library_boostlook(soup)
+    soup = remove_embedded_boostlook(soup)
 
     # Use the base HTML to later extract the <head> and (part of) the <body>
     placeholder = BeautifulSoup(base_html, "html.parser")
@@ -204,62 +203,56 @@ def modernize_legacy_page(
 
     if target_head:
         # Append the <head> taken from the base HTML to the existing (legacy) head
-        _insert_head(result, target_head)
+        _insert_head(soup, target_head)
 
-    original_body = result.body
+    original_body = soup.body
     if original_body is None:
         pass
     elif placeholder.body is not None:
         if insert_body:
             # Beautify the legacy body with structure and classes from the
             # modern one, and embed the original body into a:
             # <div id="boost-legacy-docs-body"></div> block
-            _replace_body(result, original_body, base_body=placeholder.body)
+            _replace_body(soup, original_body, base_body=placeholder.body)
         else:
             _insert_in_doc(
-                result.body,
+                soup.body,
                 placeholder.find("div", {"id": "boost-legacy-docs-header"}),
                 append=False,
             )
-            wrap_main_body_elements(result, original_docs_type)
+            wrap_main_body_elements(soup, original_docs_type)
             if show_footer:
                 rendered_template = render_to_string("includes/_footer.html", {})
                 rendered_template_as_dom = BeautifulSoup(
                     rendered_template, "html.parser"
                 )
-                result.append(rendered_template_as_dom)
+                soup.append(rendered_template_as_dom)
 
     # Remove tags from the base template
-    result = hide_tags(result, HIDE_TAGS_BASE)
+    soup = hide_tags(soup, HIDE_TAGS_BASE)
+
+    return soup.prettify(formatter="html")
 
-    content = str(result)
 
+def minimize_uris(content: str) -> str:
     # Replace all links to boost.org with a local link
     content = content.replace("https://www.boost.org/doc/libs/", "/doc/libs/")
-
     return content
 
 
-def slightly_modernize_legacy_library_doc_page(content):
-    """Modernize a legacy Boost library documentation page, but only minimally."""
-    try:
-        root = html.fromstring(content)
-    except Exception:
-        return content  # Not valid HTML
-
-    for tag_name, attrs in REMOVE_TAGS:
-        xpath = build_xpath(tag_name, attrs)
-        elements = root.xpath(xpath)
-        if elements:
-            elements[0].getparent().remove(elements[0])  # Remove only first
+def remove_unwanted(content: BeautifulSoup) -> BeautifulSoup:
+    # Remove the first occurrence of legacy header(s) and other stuff
+    for tag_name, tag_attrs in REMOVE_TAGS:
+        tag = content.find(tag_name, tag_attrs)
+        if tag:
+            tag.decompose()
 
-    for tag_name, attrs in REMOVE_ALL:
-        xpath = build_xpath(tag_name, attrs)
-        for el in root.xpath(xpath):
-            el.getparent().remove(el)
+    # Remove all navbar-like divs, if any
+    for tag_name, tag_attrs in REMOVE_ALL:
+        for tag in content.find_all(tag_name, tag_attrs):
+            tag.decompose()
 
-    content = html.tostring(root, encoding="unicode", method="html")
-    return content.replace("https://www.boost.org/doc/libs/", "/doc/libs/")
+    return content
 
 
 def build_xpath(tag, attrs):
@@ -345,6 +338,14 @@ def remove_library_boostlook(soup):
     return soup
 
 
+def add_canonical_link(soup, canonical_uri):
+    """Add a canonical link to the head of the document."""
+    if canonical_uri and soup.head:
+        canonical_link = soup.new_tag("link", rel="canonical", href=canonical_uri)
+        soup.head.append(canonical_link)
+    return soup
+
+
 def modernize_preprocessor_docs(soup: BeautifulSoup) -> tuple[BeautifulSoup, bool]:
     """Special case handling for Boost.Preprocessor docs.
 
@@ -746,3 +747,35 @@ def modernize_release_notes(html_content):
     # Replace all links to boost.org with a local link
     content = result.replace("https://www.boost.org/doc/libs/", "/docs/libs/")
     return get_body_from_html(content)
+
+
+def is_in_no_process_libs(path: str) -> bool:
+    return any(lib_slug in path for lib_slug in NO_PROCESS_LIBS)
+
+
+def is_in_fully_modernized_libs(path: str) -> bool:
+    return any(lib_slug in path for lib_slug in FULLY_MODERNIZED_LIB_VERSIONS)
+
+
+def is_in_no_wrapper_libs(path: str) -> bool:
+    return any(lib_slug in path for lib_slug in NO_WRAPPER_LIBS)
+
+
+def is_managed_content_type(content_type: str) -> bool:
+    passthrough_types = [
+        "text/html",
+        "text/html; charset=utf-8",
+    ]
+    for t in passthrough_types:
+        if t in content_type:
+            return True
+    return False
+
+
+def is_valid_modernize_value(modernize: str) -> bool:
+    return modernize in ("max", "med", "min")
+
+
+def get_is_iframe_destination(headers: HttpHeaders) -> bool:
+    # True when Sec-Fetch-Dest is "iframe"/"frame", so modernization can be disabled.
+    return headers.get("Sec-Fetch-Dest", "") in ["iframe", "frame"]