Skip to content

Commit 7d15668

Browse files
authored
Merge pull request #2924 from ehuss/html-end-tags
Add better handling for unbalanced HTML tags
2 parents cc7f8be + f0117ec commit 7d15668

File tree

2 files changed

+132
-44
lines changed

2 files changed

+132
-44
lines changed

crates/mdbook-html/src/html/tree.rs

Lines changed: 88 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ use pulldown_cmark::{Alignment, CodeBlockKind, CowStr, Event, LinkType, Tag, Tag
1919
use std::borrow::Cow;
2020
use std::collections::{HashMap, HashSet};
2121
use std::ops::Deref;
22-
use tracing::{error, trace, warn};
22+
use tracing::{trace, warn};
2323

2424
/// Helper to create a [`QualName`].
2525
macro_rules! attr_qual_name {
@@ -307,6 +307,8 @@ where
307307
match event {
308308
Event::Start(tag) => self.start_tag(tag),
309309
Event::End(tag) => {
310+
// TODO: This should validate that the event stack is
311+
// properly synchronized with the tag stack.
310312
self.pop();
311313
match tag {
312314
TagEnd::TableHead => {
@@ -378,6 +380,7 @@ where
378380
}
379381
}
380382
}
383+
self.finish_stack();
381384
self.collect_footnote_defs();
382385
}
383386

@@ -606,40 +609,10 @@ where
606609
trace!("html token={token:?}");
607610
match token {
608611
Token::DoctypeToken(_) => {}
609-
Token::TagToken(tag) => {
610-
match tag.kind {
611-
TagKind::StartTag => {
612-
let is_closed = is_void_element(&tag.name) || tag.self_closing;
613-
is_raw = matches!(&*tag.name, "script" | "style");
614-
let name = QualName::new(None, html5ever::ns!(html), tag.name);
615-
let attrs = tag
616-
.attrs
617-
.into_iter()
618-
.map(|attr| (attr.name, attr.value))
619-
.collect();
620-
let mut el = Element {
621-
name,
622-
attrs,
623-
self_closing: tag.self_closing,
624-
was_raw: true,
625-
};
626-
fix_html_link(&mut el);
627-
self.push(Node::Element(el));
628-
if is_closed {
629-
// No end element.
630-
self.pop();
631-
}
632-
}
633-
TagKind::EndTag => {
634-
is_raw = false;
635-
if self.is_html_tag_matching(&tag.name) {
636-
self.pop();
637-
}
638-
// else the stack is corrupt. I'm not really sure
639-
// what to do here...
640-
}
641-
}
642-
}
612+
Token::TagToken(tag) => match tag.kind {
613+
TagKind::StartTag => self.start_html_tag(tag, &mut is_raw),
614+
TagKind::EndTag => self.end_html_tag(tag, &mut is_raw),
615+
},
643616
Token::CommentToken(comment) => {
644617
self.append(Node::Comment(comment));
645618
}
@@ -664,22 +637,59 @@ where
664637
}
665638
}
666639

640+
/// Adds an open HTML tag.
641+
fn start_html_tag(&mut self, tag: html5ever::tokenizer::Tag, is_raw: &mut bool) {
642+
let is_closed = is_void_element(&tag.name) || tag.self_closing;
643+
*is_raw = matches!(&*tag.name, "script" | "style");
644+
let name = QualName::new(None, html5ever::ns!(html), tag.name);
645+
let attrs = tag
646+
.attrs
647+
.into_iter()
648+
.map(|attr| (attr.name, attr.value))
649+
.collect();
650+
let mut el = Element {
651+
name,
652+
attrs,
653+
self_closing: tag.self_closing,
654+
was_raw: true,
655+
};
656+
fix_html_link(&mut el);
657+
self.push(Node::Element(el));
658+
if is_closed {
659+
// No end element.
660+
self.pop();
661+
}
662+
}
663+
664+
/// Closes the given HTML tag.
665+
fn end_html_tag(&mut self, tag: html5ever::tokenizer::Tag, is_raw: &mut bool) {
666+
*is_raw = false;
667+
if self.is_html_tag_matching(&tag.name) {
668+
self.pop();
669+
} else {
670+
// The proper thing to do here is to recover. However, the HTML
671+
// parsing algorithm for that is quite complex. See
672+
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
673+
// and the adoption agency algorithm.
674+
warn!(
675+
"unexpected HTML end tag `</{}>` found in `{}`\n\
676+
Check that the HTML tags are properly balanced.",
677+
tag.name,
678+
self.options.path.display()
679+
);
680+
}
681+
}
682+
667683
/// This is used to verify HTML parsing keeps the stack of tags in sync.
668684
fn is_html_tag_matching(&self, name: &str) -> bool {
669685
let current = self.tree.get(self.current_node).unwrap().value();
670686
if let Node::Element(el) = current
671687
&& el.name() == name
672688
{
673-
return true;
689+
true
690+
} else {
691+
false
674692
}
675-
error!(
676-
"internal error: HTML tag stack out of sync.\n
677-
path: `{}`\n\
678-
current={current:?}\n\
679-
pop name: {name}",
680-
self.options.path.display()
681-
);
682-
false
683693
}
684694

685695
/// Eats all pulldown-cmark events until the next `End` matching the
@@ -736,6 +746,40 @@ where
736746
output
737747
}
738748

749+
/// Deals with any unclosed elements on the stack.
750+
fn finish_stack(&mut self) {
751+
while let Some(node_id) = self.tag_stack.pop() {
752+
let node = self.tree.get(node_id).unwrap().value();
753+
match node {
754+
Node::Fragment => {}
755+
Node::Element(el) => {
756+
if el.was_raw {
757+
warn!(
758+
"unclosed HTML tag `<{}>` found in `{}`",
759+
el.name.local,
760+
self.options.path.display()
761+
);
762+
} else {
763+
panic!(
764+
"internal error: expected empty tag stack.\n
765+
path: `{}`\n\
766+
element={el:?}",
767+
self.options.path.display()
768+
);
769+
}
770+
}
771+
node => {
772+
panic!(
773+
"internal error: expected empty tag stack.\n
774+
path: `{}`\n\
775+
node={node:?}",
776+
self.options.path.display()
777+
);
778+
}
779+
}
780+
}
781+
}
782+
739783
/// Appends a new footnote reference.
740784
fn footnote_reference(&mut self, name: CowStr<'event>) {
741785
let len = self.footnote_numbers.len() + 1;

tests/testsuite/rendering.rs

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,3 +239,47 @@ fn html_blocks() {
239239
fn code_block_fenced_with_indent() {
240240
BookTest::from_dir("rendering/code_blocks_fenced_with_indent").check_all_main_files();
241241
}
242+
243+
// Unclosed HTML tags.
244+
//
245+
// Note that the HTML parsing algorithm is much more complicated than what
246+
// this is checking.
247+
#[test]
248+
fn unclosed_html_tags() {
249+
BookTest::init(|_| {})
250+
.change_file("src/chapter_1.md", "<div>x<span>foo<i>xyz")
251+
.run("build", |cmd| {
252+
cmd.expect_stderr(str![[r#"
253+
INFO Book building has started
254+
INFO Running the html backend
255+
WARN unclosed HTML tag `<i>` found in `chapter_1.md`
256+
WARN unclosed HTML tag `<span>` found in `chapter_1.md`
257+
WARN unclosed HTML tag `<div>` found in `chapter_1.md`
258+
INFO HTML book written to `[ROOT]/book`
259+
260+
"#]]);
261+
})
262+
.check_main_file(
263+
"book/chapter_1.html",
264+
str!["<div>x<span>foo<i>xyz</i></span></div>"],
265+
);
266+
}
267+
268+
// Test for HTML tags out of sync.
269+
#[test]
270+
fn unbalanced_html_tags() {
271+
BookTest::init(|_| {})
272+
.change_file("src/chapter_1.md", "<div>x<span>foo</div></span>")
273+
.run("build", |cmd| {
274+
cmd.expect_stderr(str![[r#"
275+
INFO Book building has started
276+
INFO Running the html backend
277+
WARN unexpected HTML end tag `</div>` found in `chapter_1.md`
278+
Check that the HTML tags are properly balanced.
279+
WARN unclosed HTML tag `<div>` found in `chapter_1.md`
280+
INFO HTML book written to `[ROOT]/book`
281+
282+
"#]]);
283+
})
284+
.check_main_file("book/chapter_1.html", str!["<div>x<span>foo</span></div>"]);
285+
}

0 commit comments

Comments
 (0)