11import re
22
33from bs4 import BeautifulSoup , Comment , Tag
4+ from django .http import HttpHeaders
45from django .template .loader import render_to_string
56from django .templatetags .static import static
6- from lxml import html
7+ from structlog import get_logger
78
89from core .boostrenderer import get_body_from_html
9- from core .constants import SourceDocType
10+ from core .constants import (
11+ SourceDocType ,
12+ NO_PROCESS_LIBS ,
13+ NO_WRAPPER_LIBS ,
14+ FULLY_MODERNIZED_LIB_VERSIONS ,
15+ )
16+
17+ logger = get_logger ()
1018
1119# List HTML elements (with relevant attributes) to remove the FIRST occurrence
1220REMOVE_TAGS = [
4149 ("table" , {"cellpadding" : "2" , "width" : "100%" }),
4250 # Remove the first hr from the page
4351 ("hr" , {}),
52+ # remove canonical tags
53+ ("link" , {"rel" : "canonical" }),
4454]
4555
4656# these tags are only removed on the release page, update REMOVE_TAGS for all pages
@@ -154,44 +164,33 @@ def is_end_comment(html_element):
154164
155165
156166def modernize_legacy_page (
157- content ,
158- base_html ,
159- head_selector = "head" ,
160- insert_body = True ,
167+ soup : BeautifulSoup ,
168+ base_html : str ,
169+ head_selector : str | dict [ str , str ] = "head" ,
170+ insert_body : bool = True ,
161171 original_docs_type : SourceDocType | None = None ,
162- skip_replace_boostlook = False ,
163- show_footer = True ,
164- show_navbar = True ,
165- ):
172+ skip_replace_boostlook : bool = False ,
173+ show_footer : bool = True ,
174+ show_navbar : bool = True ,
175+ ) -> str :
166176 """Modernize a legacy Boost documentation page."""
167177 HIDE_TAGS_BASE = []
168178 if not show_navbar :
169179 HIDE_TAGS_BASE .append (("div" , {"class" : "header-menu-bar topnavbar" })),
170180
171- result = BeautifulSoup (content , "html.parser" )
172- if result .html is None :
181+ if soup .html is None :
173182 # Not an HTML file we care about
174- return content
175- # Remove the first occurrence of legacy header(s) and other stuff
176- for tag_name , tag_attrs in REMOVE_TAGS :
177- tag = result .find (tag_name , tag_attrs )
178- if tag :
179- tag .decompose ()
180-
181- # Remove all navbar-like divs, if any
182- for tag_name , tag_attrs in REMOVE_ALL :
183- for tag in result .find_all (tag_name , tag_attrs ):
184- tag .decompose ()
183+ return soup .prettify (formatter = "html" )
185184
186185 # Remove CSS classes that produce visual harm
187186 for tag_name , tag_attrs in REMOVE_CSS_CLASSES :
188- for tag in result .find_all (tag_name , tag_attrs ):
187+ for tag in soup .find_all (tag_name , tag_attrs ):
189188 tag .attrs .pop ("class" )
190189
191- result = convert_name_to_id (result )
190+ soup = convert_name_to_id (soup )
192191 if not skip_replace_boostlook :
193- result = remove_library_boostlook (result )
194- result = remove_embedded_boostlook (result )
192+ soup = remove_library_boostlook (soup )
193+ soup = remove_embedded_boostlook (soup )
195194
196195 # Use the base HTML to later extract the <head> and (part of) the <body>
197196 placeholder = BeautifulSoup (base_html , "html.parser" )
@@ -204,62 +203,56 @@ def modernize_legacy_page(
204203
205204 if target_head :
206205 # Append the <head> taken from the base HTML to the existing (legacy) head
207- _insert_head (result , target_head )
206+ _insert_head (soup , target_head )
208207
209- original_body = result .body
208+ original_body = soup .body
210209 if original_body is None :
211210 pass
212211 elif placeholder .body is not None :
213212 if insert_body :
214213 # Beautify the legacy body with structure and classes from the
215214 # modern one, and embed the original body into a:
216215 # <div id="boost-legacy-docs-body"></div> block
217- _replace_body (result , original_body , base_body = placeholder .body )
216+ _replace_body (soup , original_body , base_body = placeholder .body )
218217 else :
219218 _insert_in_doc (
220- result .body ,
219+ soup .body ,
221220 placeholder .find ("div" , {"id" : "boost-legacy-docs-header" }),
222221 append = False ,
223222 )
224- wrap_main_body_elements (result , original_docs_type )
223+ wrap_main_body_elements (soup , original_docs_type )
225224 if show_footer :
226225 rendered_template = render_to_string ("includes/_footer.html" , {})
227226 rendered_template_as_dom = BeautifulSoup (
228227 rendered_template , "html.parser"
229228 )
230- result .append (rendered_template_as_dom )
229+ soup .append (rendered_template_as_dom )
231230
232231 # Remove tags from the base template
233- result = hide_tags (result , HIDE_TAGS_BASE )
232+ soup = hide_tags (soup , HIDE_TAGS_BASE )
233+
234+ return soup .prettify (formatter = "html" )
234235
235- content = str (result )
236236
def minimize_uris(content: str) -> str:
    """Rewrite absolute boost.org doc-library links in ``content`` as local links.

    Every occurrence of ``https://www.boost.org/doc/libs/`` is replaced with
    ``/doc/libs/``; all other text is returned unchanged.
    """
    absolute_prefix = "https://www.boost.org/doc/libs/"
    local_prefix = "/doc/libs/"
    return content.replace(absolute_prefix, local_prefix)
241241
242242
243- def slightly_modernize_legacy_library_doc_page (content ):
244- """Modernize a legacy Boost library documentation page, but only minimally."""
245- try :
246- root = html .fromstring (content )
247- except Exception :
248- return content # Not valid HTML
249-
250- for tag_name , attrs in REMOVE_TAGS :
251- xpath = build_xpath (tag_name , attrs )
252- elements = root .xpath (xpath )
253- if elements :
254- elements [0 ].getparent ().remove (elements [0 ]) # Remove only first
def remove_unwanted(content: BeautifulSoup) -> BeautifulSoup:
    """Strip the elements listed in REMOVE_TAGS and REMOVE_ALL from ``content``.

    The tree is modified in place and the same object is returned.
    """
    # REMOVE_TAGS: only the first match of each (name, attrs) entry is dropped.
    for name, attrs in REMOVE_TAGS:
        first_match = content.find(name, attrs)
        if not first_match:
            continue
        first_match.decompose()

    # REMOVE_ALL: every match of each (name, attrs) entry is dropped.
    for name, attrs in REMOVE_ALL:
        for match in content.find_all(name, attrs):
            match.decompose()

    return content
263256
264257
265258def build_xpath (tag , attrs ):
@@ -345,6 +338,14 @@ def remove_library_boostlook(soup):
345338 return soup
346339
347340
def add_canonical_link(soup, canonical_uri):
    """Append a ``<link rel="canonical">`` pointing at ``canonical_uri`` to the head.

    Nothing is added when ``canonical_uri`` is falsy or ``soup.head`` is
    falsy. ``soup`` is returned in either case.
    """
    if not canonical_uri:
        return soup

    head = soup.head
    if head:
        head.append(soup.new_tag("link", rel="canonical", href=canonical_uri))
    return soup
347+
348+
348349def modernize_preprocessor_docs (soup : BeautifulSoup ) -> tuple [BeautifulSoup , bool ]:
349350 """Special case handling for Boost.Preprocessor docs.
350351
@@ -746,3 +747,35 @@ def modernize_release_notes(html_content):
746747 # Replace all links to boost.org with a local link
747748 content = result .replace ("https://www.boost.org/doc/libs/" , "/docs/libs/" )
748749 return get_body_from_html (content )
750+
751+
def is_in_no_process_libs(path: str) -> bool:
    """Return True if any entry of NO_PROCESS_LIBS is a substring of ``path``."""
    for slug in NO_PROCESS_LIBS:
        if slug in path:
            return True
    return False
754+
755+
def is_in_fully_modernized_libs(path: str) -> bool:
    """Return True if any entry of FULLY_MODERNIZED_LIB_VERSIONS is a substring of ``path``."""
    for slug in FULLY_MODERNIZED_LIB_VERSIONS:
        if slug in path:
            return True
    return False
758+
759+
def is_in_no_wrapper_libs(path: str) -> bool:
    """Return True if any entry of NO_WRAPPER_LIBS is a substring of ``path``."""
    for slug in NO_WRAPPER_LIBS:
        if slug in path:
            return True
    return False
762+
763+
def is_managed_content_type(content_type: str) -> bool:
    """Return True when ``content_type`` denotes an HTML document.

    Args:
        content_type: Value of a ``Content-Type`` header, with or without
            parameters (e.g. ``"text/html; charset=utf-8"``). An empty or
            missing (``None``) value is treated as not managed.

    Returns:
        True if ``text/html`` appears in the value, ignoring case.
    """
    if not content_type:
        # No Content-Type at all: nothing to manage, and avoid a TypeError
        # from the membership test below.
        return False
    # Media types are case-insensitive, so compare in lower case. Substring
    # matching is kept so values carrying parameters still match; this also
    # covers "text/html; charset=utf-8" without listing it separately.
    return "text/html" in content_type.lower()
773+
774+
def is_valid_modernize_value(modernize: str) -> bool:
    """Return True when ``modernize`` is one of "max", "med" or "min"."""
    accepted_levels = ("max", "med", "min")
    return modernize in accepted_levels
777+
778+
def get_is_iframe_destination(headers: HttpHeaders) -> bool:
    """Report whether the request's ``Sec-Fetch-Dest`` header is "iframe" or "frame".

    A missing header is treated as an empty value, so the result is False.
    The original comment notes that modernization is disabled for such
    requests.
    """
    fetch_destination = headers.get("Sec-Fetch-Dest", "")
    return fetch_destination in ["iframe", "frame"]
0 commit comments