@@ -15,12 +15,10 @@ use tokenizer::{Tokenizer, TokenizerOpts, TokenizerResult};
1515use tree_builder:: { TreeBuilderOpts , TreeBuilder , TreeSink , create_element} ;
1616
1717use std:: borrow:: Cow ;
18- use std:: mem;
1918
20- use encoding:: { self , EncodingRef } ;
2119use tendril;
22- use tendril:: { StrTendril , ByteTendril } ;
23- use tendril:: stream:: { TendrilSink , Utf8LossyDecoder , LossyDecoder } ;
20+ use tendril:: StrTendril ;
21+ use tendril:: stream:: { TendrilSink , Utf8LossyDecoder } ;
2422
2523/// All-encompassing options struct for the parser.
2624#[ derive( Clone , Default ) ]
@@ -114,220 +112,20 @@ impl<Sink: TreeSink> Parser<Sink> {
114112 pub fn from_utf8 ( self ) -> Utf8LossyDecoder < Self > {
115113 Utf8LossyDecoder :: new ( self )
116114 }
117-
118- /// Wrap this parser into a `TendrilSink` that accepts bytes
119- /// and tries to detect the correct character encoding.
120- ///
121- /// Currently this looks for a Byte Order Mark,
122- /// then uses `BytesOpts::transport_layer_encoding`,
123- /// then falls back to UTF-8.
124- ///
125- /// FIXME(https://github.com/servo/html5ever/issues/18): this should look for `<meta>` elements
126- /// and other data per
127- /// https://html.spec.whatwg.org/multipage/syntax.html#determining-the-character-encoding
128- pub fn from_bytes ( self , opts : BytesOpts ) -> BytesParser < Sink > {
129- BytesParser {
130- state : BytesParserState :: Initial { parser : self } ,
131- opts : opts,
132- }
133- }
134- }
135-
136- /// Options for choosing a character encoding
137- #[ derive( Clone , Default ) ]
138- pub struct BytesOpts {
139- /// The character encoding specified by the transport layer, if any.
140- /// In HTTP for example, this is the `charset` parameter of the `Content-Type` response header.
141- pub transport_layer_encoding : Option < EncodingRef > ,
142- }
143-
144- /// An HTML parser,
145- /// ready to receive bytes input through the `tendril::TendrilSink` trait’s methods.
146- ///
147- /// See `Parser::from_bytes`.
148- pub struct BytesParser < Sink > where Sink : TreeSink {
149- state : BytesParserState < Sink > ,
150- opts : BytesOpts ,
151- }
152-
153- enum BytesParserState < Sink > where Sink : TreeSink {
154- Initial {
155- parser : Parser < Sink > ,
156- } ,
157- Buffering {
158- parser : Parser < Sink > ,
159- buffer : ByteTendril
160- } ,
161- Parsing {
162- decoder : LossyDecoder < Parser < Sink > > ,
163- } ,
164- Transient
165- }
166-
167- impl < Sink : TreeSink > BytesParser < Sink > {
168- /// Access the underlying Parser
169- pub fn str_parser ( & self ) -> & Parser < Sink > {
170- match self . state {
171- BytesParserState :: Initial { ref parser } => parser,
172- BytesParserState :: Buffering { ref parser, .. } => parser,
173- BytesParserState :: Parsing { ref decoder } => decoder. inner_sink ( ) ,
174- BytesParserState :: Transient => unreachable ! ( ) ,
175- }
176- }
177-
178- /// Access the underlying Parser
179- pub fn str_parser_mut ( & mut self ) -> & mut Parser < Sink > {
180- match self . state {
181- BytesParserState :: Initial { ref mut parser } => parser,
182- BytesParserState :: Buffering { ref mut parser, .. } => parser,
183- BytesParserState :: Parsing { ref mut decoder } => decoder. inner_sink_mut ( ) ,
184- BytesParserState :: Transient => unreachable ! ( ) ,
185- }
186- }
187-
188- /// Insert a Unicode chunk in the middle of the byte stream.
189- ///
190- /// This is e.g. for supporting `document.write`.
191- pub fn process_unicode ( & mut self , t : StrTendril ) {
192- if t. is_empty ( ) {
193- return // Don’t prevent buffering/encoding detection
194- }
195- if let BytesParserState :: Parsing { ref mut decoder } = self . state {
196- decoder. inner_sink_mut ( ) . process ( t)
197- } else {
198- match mem:: replace ( & mut self . state , BytesParserState :: Transient ) {
199- BytesParserState :: Initial { mut parser } => {
200- parser. process ( t) ;
201- self . start_parsing ( parser, ByteTendril :: new ( ) )
202- }
203- BytesParserState :: Buffering { parser, buffer } => {
204- self . start_parsing ( parser, buffer) ;
205- if let BytesParserState :: Parsing { ref mut decoder } = self . state {
206- decoder. inner_sink_mut ( ) . process ( t)
207- } else {
208- unreachable ! ( )
209- }
210- }
211- BytesParserState :: Parsing { .. } | BytesParserState :: Transient => unreachable ! ( ) ,
212- }
213- }
214- }
215-
216- fn start_parsing ( & mut self , parser : Parser < Sink > , buffer : ByteTendril ) {
217- let encoding = detect_encoding ( & buffer, & self . opts ) ;
218- let mut decoder = LossyDecoder :: new ( encoding, parser) ;
219- decoder. process ( buffer) ;
220- self . state = BytesParserState :: Parsing { decoder : decoder }
221- }
222- }
223-
224- impl < Sink : TreeSink > TendrilSink < tendril:: fmt:: Bytes > for BytesParser < Sink > {
225- fn process ( & mut self , t : ByteTendril ) {
226- if let & mut BytesParserState :: Parsing { ref mut decoder } = & mut self . state {
227- return decoder. process ( t)
228- }
229- let ( parser, buffer) = match mem:: replace ( & mut self . state , BytesParserState :: Transient ) {
230- BytesParserState :: Initial { parser } => ( parser, t) ,
231- BytesParserState :: Buffering { parser, mut buffer } => {
232- buffer. push_tendril ( & t) ;
233- ( parser, buffer)
234- }
235- BytesParserState :: Parsing { .. } | BytesParserState :: Transient => unreachable ! ( ) ,
236- } ;
237- if buffer. len32 ( ) >= PRESCAN_BYTES {
238- self . start_parsing ( parser, buffer)
239- } else {
240- self . state = BytesParserState :: Buffering {
241- parser : parser,
242- buffer : buffer,
243- }
244- }
245- }
246-
247- fn error ( & mut self , desc : Cow < ' static , str > ) {
248- match self . state {
249- BytesParserState :: Initial { ref mut parser } => parser. error ( desc) ,
250- BytesParserState :: Buffering { ref mut parser, .. } => parser. error ( desc) ,
251- BytesParserState :: Parsing { ref mut decoder } => decoder. error ( desc) ,
252- BytesParserState :: Transient => unreachable ! ( ) ,
253- }
254- }
255-
256- type Output = Sink :: Output ;
257-
258- fn finish ( self ) -> Self :: Output {
259- match self . state {
260- BytesParserState :: Initial { parser } => parser. finish ( ) ,
261- BytesParserState :: Buffering { parser, buffer } => {
262- let encoding = detect_encoding ( & buffer, & self . opts ) ;
263- let mut decoder = LossyDecoder :: new ( encoding, parser) ;
264- decoder. process ( buffer) ;
265- decoder. finish ( )
266- } ,
267- BytesParserState :: Parsing { decoder } => decoder. finish ( ) ,
268- BytesParserState :: Transient => unreachable ! ( ) ,
269- }
270- }
271- }
272-
273- /// How many bytes does detect_encoding() need
274- // FIXME(#18): should be 1024 for <meta> elements.
275- const PRESCAN_BYTES : u32 = 3 ;
276-
277- /// https://html.spec.whatwg.org/multipage/syntax.html#determining-the-character-encoding
278- fn detect_encoding ( bytes : & ByteTendril , opts : & BytesOpts ) -> EncodingRef {
279- if bytes. starts_with ( b"\xEF \xBB \xBF " ) {
280- return encoding:: all:: UTF_8
281- }
282- if bytes. starts_with ( b"\xFE \xFF " ) {
283- return encoding:: all:: UTF_16BE
284- }
285- if bytes. starts_with ( b"\xFF \xFE " ) {
286- return encoding:: all:: UTF_16LE
287- }
288- if let Some ( encoding) = opts. transport_layer_encoding {
289- return encoding
290- }
291- // FIXME(#18): <meta> etc.
292- return encoding:: all:: UTF_8
293115}
294116
295117#[ cfg( test) ]
296118mod tests {
297119 use rcdom:: RcDom ;
298120 use serialize:: serialize;
299- use std:: iter:: repeat;
300121 use tendril:: TendrilSink ;
301122 use super :: * ;
302123
303124 #[ test]
304125 fn from_utf8 ( ) {
305- assert_serialization (
306- parse_document ( RcDom :: default ( ) , ParseOpts :: default ( ) )
307- . from_utf8 ( )
308- . one ( "<title>Test" . as_bytes ( ) ) ) ;
309- }
310-
311- #[ test]
312- fn from_bytes_one ( ) {
313- assert_serialization (
314- parse_document ( RcDom :: default ( ) , ParseOpts :: default ( ) )
315- . from_bytes ( BytesOpts :: default ( ) )
316- . one ( "<title>Test" . as_bytes ( ) ) ) ;
317- }
318-
319- #[ test]
320- fn from_bytes_iter ( ) {
321- assert_serialization (
322- parse_document ( RcDom :: default ( ) , ParseOpts :: default ( ) )
323- . from_bytes ( BytesOpts :: default ( ) )
324- . from_iter ( [
325- "<title>Test" . as_bytes ( ) ,
326- repeat ( ' ' ) . take ( 1200 ) . collect :: < String > ( ) . as_bytes ( ) ,
327- ] . iter ( ) . cloned ( ) ) ) ;
328- }
329-
330- fn assert_serialization ( dom : RcDom ) {
126+ let dom = parse_document ( RcDom :: default ( ) , ParseOpts :: default ( ) )
127+ . from_utf8 ( )
128+ . one ( "<title>Test" . as_bytes ( ) ) ;
331129 let mut serialized = Vec :: new ( ) ;
332130 serialize ( & mut serialized, & dom. document , Default :: default ( ) ) . unwrap ( ) ;
333131 assert_eq ! ( String :: from_utf8( serialized) . unwrap( ) . replace( " " , "" ) ,
0 commit comments