Skip to content

Commit 79ed3e3

Browse files
committed
Remove broken BytesParser
It cannot be the future API for handling `<meta charset>` since it does not support interrupting for scripts.
1 parent b6fe2e2 commit 79ed3e3

File tree

6 files changed

+11
-407
lines changed

6 files changed

+11
-407
lines changed

html5ever/src/driver.rs

Lines changed: 5 additions & 207 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,10 @@ use tokenizer::{Tokenizer, TokenizerOpts, TokenizerResult};
1515
use tree_builder::{TreeBuilderOpts, TreeBuilder, TreeSink, create_element};
1616

1717
use std::borrow::Cow;
18-
use std::mem;
1918

20-
use encoding::{self, EncodingRef};
2119
use tendril;
22-
use tendril::{StrTendril, ByteTendril};
23-
use tendril::stream::{TendrilSink, Utf8LossyDecoder, LossyDecoder};
20+
use tendril::StrTendril;
21+
use tendril::stream::{TendrilSink, Utf8LossyDecoder};
2422

2523
/// All-encompassing options struct for the parser.
2624
#[derive(Clone, Default)]
@@ -114,220 +112,20 @@ impl<Sink: TreeSink> Parser<Sink> {
114112
pub fn from_utf8(self) -> Utf8LossyDecoder<Self> {
115113
Utf8LossyDecoder::new(self)
116114
}
117-
118-
/// Wrap this parser into a `TendrilSink` that accepts bytes
119-
/// and tries to detect the correct character encoding.
120-
///
121-
/// Currently this looks for a Byte Order Mark,
122-
/// then uses `BytesOpts::transport_layer_encoding`,
123-
/// then falls back to UTF-8.
124-
///
125-
/// FIXME(https://github.com/servo/html5ever/issues/18): this should look for `<meta>` elements
126-
/// and other data per
127-
/// https://html.spec.whatwg.org/multipage/syntax.html#determining-the-character-encoding
128-
pub fn from_bytes(self, opts: BytesOpts) -> BytesParser<Sink> {
129-
BytesParser {
130-
state: BytesParserState::Initial { parser: self },
131-
opts: opts,
132-
}
133-
}
134-
}
135-
136-
/// Options for choosing a character encoding
137-
#[derive(Clone, Default)]
138-
pub struct BytesOpts {
139-
/// The character encoding specified by the transport layer, if any.
140-
/// In HTTP for example, this is the `charset` parameter of the `Content-Type` response header.
141-
pub transport_layer_encoding: Option<EncodingRef>,
142-
}
143-
144-
/// An HTML parser,
145-
/// ready to receive bytes input through the `tendril::TendrilSink` trait’s methods.
146-
///
147-
/// See `Parser::from_bytes`.
148-
pub struct BytesParser<Sink> where Sink: TreeSink {
149-
state: BytesParserState<Sink>,
150-
opts: BytesOpts,
151-
}
152-
153-
enum BytesParserState<Sink> where Sink: TreeSink {
154-
Initial {
155-
parser: Parser<Sink>,
156-
},
157-
Buffering {
158-
parser: Parser<Sink>,
159-
buffer: ByteTendril
160-
},
161-
Parsing {
162-
decoder: LossyDecoder<Parser<Sink>>,
163-
},
164-
Transient
165-
}
166-
167-
impl<Sink: TreeSink> BytesParser<Sink> {
168-
/// Access the underlying Parser
169-
pub fn str_parser(&self) -> &Parser<Sink> {
170-
match self.state {
171-
BytesParserState::Initial { ref parser } => parser,
172-
BytesParserState::Buffering { ref parser, .. } => parser,
173-
BytesParserState::Parsing { ref decoder } => decoder.inner_sink(),
174-
BytesParserState::Transient => unreachable!(),
175-
}
176-
}
177-
178-
/// Access the underlying Parser
179-
pub fn str_parser_mut(&mut self) -> &mut Parser<Sink> {
180-
match self.state {
181-
BytesParserState::Initial { ref mut parser } => parser,
182-
BytesParserState::Buffering { ref mut parser, .. } => parser,
183-
BytesParserState::Parsing { ref mut decoder } => decoder.inner_sink_mut(),
184-
BytesParserState::Transient => unreachable!(),
185-
}
186-
}
187-
188-
/// Insert a Unicode chunk in the middle of the byte stream.
189-
///
190-
/// This is e.g. for supporting `document.write`.
191-
pub fn process_unicode(&mut self, t: StrTendril) {
192-
if t.is_empty() {
193-
return // Don’t prevent buffering/encoding detection
194-
}
195-
if let BytesParserState::Parsing { ref mut decoder } = self.state {
196-
decoder.inner_sink_mut().process(t)
197-
} else {
198-
match mem::replace(&mut self.state, BytesParserState::Transient) {
199-
BytesParserState::Initial { mut parser } => {
200-
parser.process(t);
201-
self.start_parsing(parser, ByteTendril::new())
202-
}
203-
BytesParserState::Buffering { parser, buffer } => {
204-
self.start_parsing(parser, buffer);
205-
if let BytesParserState::Parsing { ref mut decoder } = self.state {
206-
decoder.inner_sink_mut().process(t)
207-
} else {
208-
unreachable!()
209-
}
210-
}
211-
BytesParserState::Parsing { .. } | BytesParserState::Transient => unreachable!(),
212-
}
213-
}
214-
}
215-
216-
fn start_parsing(&mut self, parser: Parser<Sink>, buffer: ByteTendril) {
217-
let encoding = detect_encoding(&buffer, &self.opts);
218-
let mut decoder = LossyDecoder::new(encoding, parser);
219-
decoder.process(buffer);
220-
self.state = BytesParserState::Parsing { decoder: decoder }
221-
}
222-
}
223-
224-
impl<Sink: TreeSink> TendrilSink<tendril::fmt::Bytes> for BytesParser<Sink> {
225-
fn process(&mut self, t: ByteTendril) {
226-
if let &mut BytesParserState::Parsing { ref mut decoder } = &mut self.state {
227-
return decoder.process(t)
228-
}
229-
let (parser, buffer) = match mem::replace(&mut self.state, BytesParserState::Transient) {
230-
BytesParserState::Initial{ parser } => (parser, t),
231-
BytesParserState::Buffering { parser, mut buffer } => {
232-
buffer.push_tendril(&t);
233-
(parser, buffer)
234-
}
235-
BytesParserState::Parsing { .. } | BytesParserState::Transient => unreachable!(),
236-
};
237-
if buffer.len32() >= PRESCAN_BYTES {
238-
self.start_parsing(parser, buffer)
239-
} else {
240-
self.state = BytesParserState::Buffering {
241-
parser: parser,
242-
buffer: buffer,
243-
}
244-
}
245-
}
246-
247-
fn error(&mut self, desc: Cow<'static, str>) {
248-
match self.state {
249-
BytesParserState::Initial { ref mut parser } => parser.error(desc),
250-
BytesParserState::Buffering { ref mut parser, .. } => parser.error(desc),
251-
BytesParserState::Parsing { ref mut decoder } => decoder.error(desc),
252-
BytesParserState::Transient => unreachable!(),
253-
}
254-
}
255-
256-
type Output = Sink::Output;
257-
258-
fn finish(self) -> Self::Output {
259-
match self.state {
260-
BytesParserState::Initial { parser } => parser.finish(),
261-
BytesParserState::Buffering { parser, buffer } => {
262-
let encoding = detect_encoding(&buffer, &self.opts);
263-
let mut decoder = LossyDecoder::new(encoding, parser);
264-
decoder.process(buffer);
265-
decoder.finish()
266-
},
267-
BytesParserState::Parsing { decoder } => decoder.finish(),
268-
BytesParserState::Transient => unreachable!(),
269-
}
270-
}
271-
}
272-
273-
/// How many bytes does detect_encoding() need
274-
// FIXME(#18): should be 1024 for <meta> elements.
275-
const PRESCAN_BYTES: u32 = 3;
276-
277-
/// https://html.spec.whatwg.org/multipage/syntax.html#determining-the-character-encoding
278-
fn detect_encoding(bytes: &ByteTendril, opts: &BytesOpts) -> EncodingRef {
279-
if bytes.starts_with(b"\xEF\xBB\xBF") {
280-
return encoding::all::UTF_8
281-
}
282-
if bytes.starts_with(b"\xFE\xFF") {
283-
return encoding::all::UTF_16BE
284-
}
285-
if bytes.starts_with(b"\xFF\xFE") {
286-
return encoding::all::UTF_16LE
287-
}
288-
if let Some(encoding) = opts.transport_layer_encoding {
289-
return encoding
290-
}
291-
// FIXME(#18): <meta> etc.
292-
return encoding::all::UTF_8
293115
}
294116

295117
#[cfg(test)]
296118
mod tests {
297119
use rcdom::RcDom;
298120
use serialize::serialize;
299-
use std::iter::repeat;
300121
use tendril::TendrilSink;
301122
use super::*;
302123

303124
#[test]
304125
fn from_utf8() {
305-
assert_serialization(
306-
parse_document(RcDom::default(), ParseOpts::default())
307-
.from_utf8()
308-
.one("<title>Test".as_bytes()));
309-
}
310-
311-
#[test]
312-
fn from_bytes_one() {
313-
assert_serialization(
314-
parse_document(RcDom::default(), ParseOpts::default())
315-
.from_bytes(BytesOpts::default())
316-
.one("<title>Test".as_bytes()));
317-
}
318-
319-
#[test]
320-
fn from_bytes_iter() {
321-
assert_serialization(
322-
parse_document(RcDom::default(), ParseOpts::default())
323-
.from_bytes(BytesOpts::default())
324-
.from_iter([
325-
"<title>Test".as_bytes(),
326-
repeat(' ').take(1200).collect::<String>().as_bytes(),
327-
].iter().cloned()));
328-
}
329-
330-
fn assert_serialization(dom: RcDom) {
126+
let dom = parse_document(RcDom::default(), ParseOpts::default())
127+
.from_utf8()
128+
.one("<title>Test".as_bytes());
331129
let mut serialized = Vec::new();
332130
serialize(&mut serialized, &dom.document, Default::default()).unwrap();
333131
assert_eq!(String::from_utf8(serialized).unwrap().replace(" ", ""),

html5ever/src/lib.rs

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,3 @@ pub mod serialize;
3333
pub mod tokenizer;
3434
pub mod tree_builder;
3535
pub mod driver;
36-
37-
/// Re-export the encoding crate.
38-
pub use tendril::encoding;

html5ever/src/tree_builder/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -500,7 +500,7 @@ mod test {
500500
use ExpandedName;
501501
use QualName;
502502
use tendril::StrTendril;
503-
use tendril::stream::{TendrilSink, Utf8LossyDecoder, LossyDecoder};
503+
use tendril::stream::{TendrilSink, Utf8LossyDecoder};
504504

505505
use tokenizer;
506506
use tokenizer::{Tokenizer, TokenizerOpts};

xml5ever/examples/hello_xml.rs

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,19 +13,15 @@ extern crate xml5ever;
1313
use std::default::Default;
1414

1515
use xml5ever::tendril::TendrilSink;
16-
use xml5ever::driver::{parse_document, BytesOpts};
16+
use xml5ever::driver::parse_document;
1717
use xml5ever::tree_builder::{TreeSink};
1818
use xml5ever::rcdom::{RcDom, NodeData};
1919

2020
fn main() {
2121
// To parse a string into a tree of nodes, we need to invoke
2222
// `parse_document` and supply it with a TreeSink implementation (RcDom).
23-
//
24-
// Since this is a string, it's best to use `from_bytes` to create a
25-
// BytesParser for given string.
2623
let dom: RcDom = parse_document(RcDom::default(), Default::default())
27-
.from_bytes(BytesOpts::default())
28-
.one("<hello>XML</hello>".as_bytes());
24+
.one("<hello>XML</hello>");
2925

3026
// Do some processing
3127
let doc = &dom.document;

0 commit comments

Comments
 (0)