Skip to content

Commit e83c940

Browse files
author
bors-servo
authored
Auto merge of #272 - servo:utf8, r=nox
Remove broken BytesParser. It cannot be the future API for handling `<meta charset>` since it does not support interrupting for scripts. Update Tendril to a version that doesn’t depend on rust-encoding (unless a Cargo feature is enabled). <!-- Reviewable:start --> --- This change is [<img src="https://reviewable.io/review_button.svg" height="34" align="absmiddle" alt="Reviewable"/>](https://reviewable.io/reviews/servo/html5ever/272) <!-- Reviewable:end -->
2 parents f0decc5 + 2814e6a commit e83c940

File tree

19 files changed

+47
-514
lines changed

19 files changed

+47
-514
lines changed

html5ever/Cargo.toml

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
[package]
22

33
name = "html5ever"
4-
version = "0.16.0"
4+
version = "0.17.0"
55
authors = [ "The html5ever Project Developers" ]
66
license = "MIT / Apache-2.0"
77
repository = "https://github.com/servo/html5ever"
@@ -30,13 +30,12 @@ name = "tokenizer"
3030
harness = false
3131

3232
[features]
33-
unstable = ["tendril/unstable"]
33+
unstable = ["markup5ever/unstable"]
3434
heap_size = ["markup5ever/heap_size"]
3535

3636
[dependencies]
37-
log = "0"
38-
mac = "0"
39-
tendril = "0.2.2"
37+
log = "0.3"
38+
mac = "0.1"
4039
markup5ever = { version = "0.1", path = "../markup5ever" }
4140

4241
[dev-dependencies]

html5ever/examples/html2html.rs

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -15,18 +15,17 @@
1515
//!
1616
//! where htmlparser-1.4.jar comes from http://about.validator.nu/htmlparser/
1717
18-
extern crate tendril;
1918
extern crate html5ever;
2019

2120
use std::io::{self, Write};
2221
use std::default::Default;
2322

24-
use tendril::TendrilSink;
2523

26-
use html5ever::driver::ParseOpts;
27-
use html5ever::tree_builder::TreeBuilderOpts;
2824
use html5ever::{parse_document, serialize};
25+
use html5ever::driver::ParseOpts;
2926
use html5ever::rcdom::RcDom;
27+
use html5ever::tendril::TendrilSink;
28+
use html5ever::tree_builder::TreeBuilderOpts;
3029

3130
fn main() {
3231
let opts = ParseOpts {

html5ever/examples/print-rcdom.rs

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -8,16 +8,15 @@
88
// except according to those terms.
99

1010
#[macro_use] extern crate html5ever;
11-
extern crate tendril;
1211

1312
use std::io;
1413
use std::iter::repeat;
1514
use std::default::Default;
1615
use std::string::String;
1716

18-
use tendril::TendrilSink;
1917
use html5ever::parse_document;
2018
use html5ever::rcdom::{NodeData, RcDom, Handle};
19+
use html5ever::tendril::TendrilSink;
2120

2221
// This is not proper HTML serialization, of course.
2322

html5ever/src/driver.rs

Lines changed: 7 additions & 209 deletions
Original file line number | Diff line number | Diff line change
@@ -15,12 +15,10 @@ use tokenizer::{Tokenizer, TokenizerOpts, TokenizerResult};
1515
use tree_builder::{TreeBuilderOpts, TreeBuilder, TreeSink, create_element};
1616

1717
use std::borrow::Cow;
18-
use std::mem;
1918

20-
use encoding::{self, EncodingRef};
2119
use tendril;
22-
use tendril::{StrTendril, ByteTendril};
23-
use tendril::stream::{TendrilSink, Utf8LossyDecoder, LossyDecoder};
20+
use tendril::StrTendril;
21+
use tendril::stream::{TendrilSink, Utf8LossyDecoder};
2422

2523
/// All-encompassing options struct for the parser.
2624
#[derive(Clone, Default)]
@@ -92,7 +90,7 @@ impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for Parser<Sink> {
9290

9391
// FIXME: Is it too noisy to report every character decoding error?
9492
fn error(&mut self, desc: Cow<'static, str>) {
95-
self.tokenizer.sink_mut().sink_mut().parse_error(desc)
93+
self.tokenizer.sink.sink.parse_error(desc)
9694
}
9795

9896
type Output = Sink::Output;
@@ -102,7 +100,7 @@ impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for Parser<Sink> {
102100
while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {}
103101
assert!(self.input_buffer.is_empty());
104102
self.tokenizer.end();
105-
self.tokenizer.unwrap().unwrap().finish()
103+
self.tokenizer.sink.sink.finish()
106104
}
107105
}
108106

@@ -114,220 +112,20 @@ impl<Sink: TreeSink> Parser<Sink> {
114112
pub fn from_utf8(self) -> Utf8LossyDecoder<Self> {
115113
Utf8LossyDecoder::new(self)
116114
}
117-
118-
/// Wrap this parser into a `TendrilSink` that accepts bytes
119-
/// and tries to detect the correct character encoding.
120-
///
121-
/// Currently this looks for a Byte Order Mark,
122-
/// then uses `BytesOpts::transport_layer_encoding`,
123-
/// then falls back to UTF-8.
124-
///
125-
/// FIXME(https://github.com/servo/html5ever/issues/18): this should look for `<meta>` elements
126-
/// and other data per
127-
/// https://html.spec.whatwg.org/multipage/syntax.html#determining-the-character-encoding
128-
pub fn from_bytes(self, opts: BytesOpts) -> BytesParser<Sink> {
129-
BytesParser {
130-
state: BytesParserState::Initial { parser: self },
131-
opts: opts,
132-
}
133-
}
134-
}
135-
136-
/// Options for choosing a character encoding
137-
#[derive(Clone, Default)]
138-
pub struct BytesOpts {
139-
/// The character encoding specified by the transport layer, if any.
140-
/// In HTTP for example, this is the `charset` parameter of the `Content-Type` response header.
141-
pub transport_layer_encoding: Option<EncodingRef>,
142-
}
143-
144-
/// An HTML parser,
145-
/// ready to receive bytes input through the `tendril::TendrilSink` trait’s methods.
146-
///
147-
/// See `Parser::from_bytes`.
148-
pub struct BytesParser<Sink> where Sink: TreeSink {
149-
state: BytesParserState<Sink>,
150-
opts: BytesOpts,
151-
}
152-
153-
enum BytesParserState<Sink> where Sink: TreeSink {
154-
Initial {
155-
parser: Parser<Sink>,
156-
},
157-
Buffering {
158-
parser: Parser<Sink>,
159-
buffer: ByteTendril
160-
},
161-
Parsing {
162-
decoder: LossyDecoder<Parser<Sink>>,
163-
},
164-
Transient
165-
}
166-
167-
impl<Sink: TreeSink> BytesParser<Sink> {
168-
/// Access the underlying Parser
169-
pub fn str_parser(&self) -> &Parser<Sink> {
170-
match self.state {
171-
BytesParserState::Initial { ref parser } => parser,
172-
BytesParserState::Buffering { ref parser, .. } => parser,
173-
BytesParserState::Parsing { ref decoder } => decoder.inner_sink(),
174-
BytesParserState::Transient => unreachable!(),
175-
}
176-
}
177-
178-
/// Access the underlying Parser
179-
pub fn str_parser_mut(&mut self) -> &mut Parser<Sink> {
180-
match self.state {
181-
BytesParserState::Initial { ref mut parser } => parser,
182-
BytesParserState::Buffering { ref mut parser, .. } => parser,
183-
BytesParserState::Parsing { ref mut decoder } => decoder.inner_sink_mut(),
184-
BytesParserState::Transient => unreachable!(),
185-
}
186-
}
187-
188-
/// Insert a Unicode chunk in the middle of the byte stream.
189-
///
190-
/// This is e.g. for supporting `document.write`.
191-
pub fn process_unicode(&mut self, t: StrTendril) {
192-
if t.is_empty() {
193-
return // Don’t prevent buffering/encoding detection
194-
}
195-
if let BytesParserState::Parsing { ref mut decoder } = self.state {
196-
decoder.inner_sink_mut().process(t)
197-
} else {
198-
match mem::replace(&mut self.state, BytesParserState::Transient) {
199-
BytesParserState::Initial { mut parser } => {
200-
parser.process(t);
201-
self.start_parsing(parser, ByteTendril::new())
202-
}
203-
BytesParserState::Buffering { parser, buffer } => {
204-
self.start_parsing(parser, buffer);
205-
if let BytesParserState::Parsing { ref mut decoder } = self.state {
206-
decoder.inner_sink_mut().process(t)
207-
} else {
208-
unreachable!()
209-
}
210-
}
211-
BytesParserState::Parsing { .. } | BytesParserState::Transient => unreachable!(),
212-
}
213-
}
214-
}
215-
216-
fn start_parsing(&mut self, parser: Parser<Sink>, buffer: ByteTendril) {
217-
let encoding = detect_encoding(&buffer, &self.opts);
218-
let mut decoder = LossyDecoder::new(encoding, parser);
219-
decoder.process(buffer);
220-
self.state = BytesParserState::Parsing { decoder: decoder }
221-
}
222-
}
223-
224-
impl<Sink: TreeSink> TendrilSink<tendril::fmt::Bytes> for BytesParser<Sink> {
225-
fn process(&mut self, t: ByteTendril) {
226-
if let &mut BytesParserState::Parsing { ref mut decoder } = &mut self.state {
227-
return decoder.process(t)
228-
}
229-
let (parser, buffer) = match mem::replace(&mut self.state, BytesParserState::Transient) {
230-
BytesParserState::Initial{ parser } => (parser, t),
231-
BytesParserState::Buffering { parser, mut buffer } => {
232-
buffer.push_tendril(&t);
233-
(parser, buffer)
234-
}
235-
BytesParserState::Parsing { .. } | BytesParserState::Transient => unreachable!(),
236-
};
237-
if buffer.len32() >= PRESCAN_BYTES {
238-
self.start_parsing(parser, buffer)
239-
} else {
240-
self.state = BytesParserState::Buffering {
241-
parser: parser,
242-
buffer: buffer,
243-
}
244-
}
245-
}
246-
247-
fn error(&mut self, desc: Cow<'static, str>) {
248-
match self.state {
249-
BytesParserState::Initial { ref mut parser } => parser.error(desc),
250-
BytesParserState::Buffering { ref mut parser, .. } => parser.error(desc),
251-
BytesParserState::Parsing { ref mut decoder } => decoder.error(desc),
252-
BytesParserState::Transient => unreachable!(),
253-
}
254-
}
255-
256-
type Output = Sink::Output;
257-
258-
fn finish(self) -> Self::Output {
259-
match self.state {
260-
BytesParserState::Initial { parser } => parser.finish(),
261-
BytesParserState::Buffering { parser, buffer } => {
262-
let encoding = detect_encoding(&buffer, &self.opts);
263-
let mut decoder = LossyDecoder::new(encoding, parser);
264-
decoder.process(buffer);
265-
decoder.finish()
266-
},
267-
BytesParserState::Parsing { decoder } => decoder.finish(),
268-
BytesParserState::Transient => unreachable!(),
269-
}
270-
}
271-
}
272-
273-
/// How many bytes does detect_encoding() need
274-
// FIXME(#18): should be 1024 for <meta> elements.
275-
const PRESCAN_BYTES: u32 = 3;
276-
277-
/// https://html.spec.whatwg.org/multipage/syntax.html#determining-the-character-encoding
278-
fn detect_encoding(bytes: &ByteTendril, opts: &BytesOpts) -> EncodingRef {
279-
if bytes.starts_with(b"\xEF\xBB\xBF") {
280-
return encoding::all::UTF_8
281-
}
282-
if bytes.starts_with(b"\xFE\xFF") {
283-
return encoding::all::UTF_16BE
284-
}
285-
if bytes.starts_with(b"\xFF\xFE") {
286-
return encoding::all::UTF_16LE
287-
}
288-
if let Some(encoding) = opts.transport_layer_encoding {
289-
return encoding
290-
}
291-
// FIXME(#18): <meta> etc.
292-
return encoding::all::UTF_8
293115
}
294116

295117
#[cfg(test)]
296118
mod tests {
297119
use rcdom::RcDom;
298120
use serialize::serialize;
299-
use std::iter::repeat;
300121
use tendril::TendrilSink;
301122
use super::*;
302123

303124
#[test]
304125
fn from_utf8() {
305-
assert_serialization(
306-
parse_document(RcDom::default(), ParseOpts::default())
307-
.from_utf8()
308-
.one("<title>Test".as_bytes()));
309-
}
310-
311-
#[test]
312-
fn from_bytes_one() {
313-
assert_serialization(
314-
parse_document(RcDom::default(), ParseOpts::default())
315-
.from_bytes(BytesOpts::default())
316-
.one("<title>Test".as_bytes()));
317-
}
318-
319-
#[test]
320-
fn from_bytes_iter() {
321-
assert_serialization(
322-
parse_document(RcDom::default(), ParseOpts::default())
323-
.from_bytes(BytesOpts::default())
324-
.from_iter([
325-
"<title>Test".as_bytes(),
326-
repeat(' ').take(1200).collect::<String>().as_bytes(),
327-
].iter().cloned()));
328-
}
329-
330-
fn assert_serialization(dom: RcDom) {
126+
let dom = parse_document(RcDom::default(), ParseOpts::default())
127+
.from_utf8()
128+
.one("<title>Test".as_bytes());
331129
let mut serialized = Vec::new();
332130
serialize(&mut serialized, &dom.document, Default::default()).unwrap();
333131
assert_eq!(String::from_utf8(serialized).unwrap().replace(" ", ""),

html5ever/src/lib.rs

Lines changed: 0 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -33,12 +33,3 @@ pub mod serialize;
3333
pub mod tokenizer;
3434
pub mod tree_builder;
3535
pub mod driver;
36-
37-
/// Re-export the tendril crate.
38-
pub mod tendril {
39-
extern crate tendril;
40-
pub use self::tendril::*;
41-
}
42-
43-
/// Re-export the encoding crate.
44-
pub use tendril::encoding;

0 commit comments

Comments
 (0)