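From ebed3e136924072c4124201ff2709f14d2f2dd4e Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Wed, 10 Sep 2025 14:01:08 +0900 Subject: [PATCH] Implement anchor permanence checking This adds a postprocess step, written in Rust. It looks for `script` elements of type `text/required-ids`, splits their content on whitespace into a list of required IDs, removes the script from the output, and fails with an error naming any required IDs that are missing from the document.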

+"#.as_bytes()).await.unwrap(); + let mut processor = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| processor.visit(h)); + processor.apply().unwrap(); + let serialized = serialize_for_test(&[document]); + assert!(!serialized.contains("text/required-ids")); + } + + #[tokio::test] + async fn no_script_present_noop() { + let document = parse_document_async( + r#" + +"# + .as_bytes(), + ) + .await + .unwrap(); + let before = serialize_for_test(&[document.clone()]); + let mut processor = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| processor.visit(h)); + processor.apply().unwrap(); + assert_eq!(before, serialize_for_test(&[document])); + } + + #[tokio::test] + async fn whitespace_splitting() { + // Includes indentation, multiple spaces, and newlines in the script content. + let document = parse_document_async(r#"
+"#.as_bytes()).await.unwrap(); + let mut processor = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| processor.visit(h)); + processor.apply().unwrap(); + let serialized = serialize_for_test(&[document]); + assert!(!serialized.contains("text/required-ids")); + } + + #[tokio::test] + async fn errors_on_missing_ids() { + let document = parse_document_async(r#" +
+"#.as_bytes()).await.unwrap(); + let mut processor = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| processor.visit(h)); + let err = processor.apply().expect_err("expected missing IDs error"); + assert!( + err.to_string() + .contains("Missing required IDs for anchor permanence: bar, baz") + ); + } + + #[tokio::test] + #[should_panic(expected = "multiple required-ids scripts encountered")] + async fn panics_on_multiple_required_ids_scripts() { + let document = parse_document_async(r#" + + +
"#.as_bytes()).await.unwrap(); + let mut processor = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| processor.visit(h)); + } +} diff --git a/src/dom_utils.rs b/src/dom_utils.rs index 6a788e8d..dffb9fcb 100644 --- a/src/dom_utils.rs +++ b/src/dom_utils.rs @@ -76,6 +76,11 @@ pub trait NodeHandleExt { where Self: Sized; + /// Removes the node from its parent. + fn remove(&self) + where + Self: Sized; + /// Clones the node and its entire subtree (including template contents). fn deep_clone(&self) -> Self; @@ -326,6 +331,10 @@ impl NodeHandleExt for Handle { self.parent.take(); } + fn remove(&self) { + self.replace_with(Vec::new()); + } + fn deep_clone(&self) -> Handle { use NodeData::*; let new_node_data = match &self.data { diff --git a/src/main.rs b/src/main.rs index 853403ed..a3ad9085 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,6 +8,7 @@ use std::path::{Path, PathBuf}; use markup5ever_rcdom::SerializableHandle; +mod anchor_permanence; mod annotate_attributes; mod boilerplate; mod dom_utils; @@ -21,15 +22,25 @@ mod tag_omission; #[tokio::main] async fn main() -> io::Result<()> { + let is_post = env::args().any(|a| a == "--singlepage-post"); + let result = if is_post { + // --singlepage-post runs the postprocess phase, which is currently only meant to be used on the + // singlepage output from Wattsi. + run_postprocess().await + } else { + // By default we run the preprocess phase, which creates a new input for Wattsi. + run_preprocess().await + }; + // This gives slightly prettier error-printing. - if let Err(e) = run().await { + if let Err(e) = result { eprintln!("{e}"); std::process::exit(1); } Ok(()) } -async fn run() -> io::Result<()> { +async fn run_preprocess() -> io::Result<()> { // Since we're using Rc in the DOM implementation, we must ensure that tasks // which act on it are confined to this thread. @@ -79,6 +90,27 @@ async fn run() -> io::Result<()> { Ok(()) } +// The steps and considerations here are similar to run_preprocess. 
+async fn run_postprocess() -> io::Result<()> { /* Postprocess entry point: parses a document from stdin, lets the anchor permanence processor visit every handle that scan_dom passes to the callback, applies the processor, and then serializes the document to stdout. Errors from parsing, apply, and serialize are propagated to the caller with the ? operator. */ + let document = parser::parse_document_async(tokio::io::stdin()).await?; + + let mut anchor_permanence = anchor_permanence::Processor::new(); + + dom_utils::scan_dom(&document, &mut |h| { + anchor_permanence.visit(h); + }); + + anchor_permanence.apply()?; + + let serializable: SerializableHandle = document.into(); /* NOTE(review): the BufWriter below is a temporary, so its final flush happens when it is dropped at the end of the statement; confirm that losing a flush error on stdout is acceptable here. */ + serialize( + &mut BufWriter::with_capacity(128 * 1024, io::stdout()), + &serializable, + SerializeOpts::default(), + )?; + Ok(()) +} + fn path_from_env<'a, V, D>(var: &V, default: &'a D) -> Cow<'a, Path> where V: AsRef + ?Sized,