Skip to content

Commit d67f426

Browse files
committed
#154: Implement read_text which read all content between tags as a text, including other markup
1 parent 7fafac4 commit d67f426

File tree

3 files changed

+145
-0
lines changed

3 files changed

+145
-0
lines changed

Changelog.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
under the `quick-xml::encoding` namespace.
4242
- [#450]: Added support of asynchronous [tokio](https://tokio.rs/) readers
4343
- [#455]: Change return type of all `read_to_end*` methods to return a span between tags
44+
- [#455]: Added `Reader::read_text` method to return the raw content (including markup) between tags
4445

4546

4647
### Bug Fixes

src/reader/ns_reader.rs

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
//! [qualified names]: https://www.w3.org/TR/xml-names11/#dt-qualname
55
//! [expanded names]: https://www.w3.org/TR/xml-names11/#dt-expname
66
7+
use std::borrow::Cow;
78
use std::fs::File;
89
use std::io::{BufRead, BufReader};
910
use std::ops::Deref;
@@ -750,6 +751,75 @@ impl<'i> NsReader<&'i [u8]> {
750751
// match literally the start name. See `Self::check_end_names` documentation
751752
self.reader.read_to_end(end)
752753
}
754+
755+
/// Reads content between start and end tags, including any markup. This
756+
/// function is supposed to be called after you already read a [`Start`] event.
757+
///
758+
/// Manages nested cases where parent and child elements have the same name.
759+
///
760+
/// This method does not unescape read data, instead it returns content
761+
/// "as is" of the XML document. This is because it has no idea what text
762+
/// it reads, and if, for example, it contains a CDATA section, an attempt to
763+
/// unescape its content will spoil the data.
764+
///
765+
/// Any text will be decoded using the reader's current [`decoder()`].
766+
///
767+
/// Actually, this method performs the following code:
768+
///
769+
/// ```ignore
770+
/// let span = reader.read_to_end(end)?;
771+
/// let text = reader.decoder().decode(&reader.inner_slice[span]);
772+
/// ```
773+
///
774+
/// # Examples
775+
///
776+
/// This example shows how you can read HTML content from your XML document.
777+
///
778+
/// ```
779+
/// # use pretty_assertions::assert_eq;
780+
/// # use std::borrow::Cow;
781+
/// use quick_xml::events::{BytesStart, Event};
782+
/// use quick_xml::NsReader;
783+
///
784+
/// let mut reader = NsReader::from_str(r#"
785+
/// <html>
786+
/// <title>This is a HTML text</title>
787+
/// <p>Usual XML rules does not apply inside it
788+
/// <p>For example, elements not needed to be &quot;closed&quot;
789+
/// </html>
790+
/// "#);
791+
/// reader.trim_text(true);
792+
///
793+
/// let start = BytesStart::new("html");
794+
/// let end = start.to_end().into_owned();
795+
///
796+
/// // First, we read a start event...
797+
/// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
798+
/// // ...and disable checking of end names because we expect HTML further...
799+
/// reader.check_end_names(false);
800+
///
801+
/// // ...then, we could read text content until close tag.
802+
/// // This call will correctly handle nested <html> elements.
803+
/// let text = reader.read_text(end.name()).unwrap();
804+
/// assert_eq!(text, Cow::Borrowed(r#"
805+
/// <title>This is a HTML text</title>
806+
/// <p>Usual XML rules does not apply inside it
807+
/// <p>For example, elements not needed to be &quot;closed&quot;
808+
/// "#));
809+
///
810+
/// // Now we can enable checks again
811+
/// reader.check_end_names(true);
812+
///
813+
/// // At the end we should get an Eof event, because we ate the whole XML
814+
/// assert_eq!(reader.read_event().unwrap(), Event::Eof);
815+
/// ```
816+
///
817+
/// [`Start`]: Event::Start
818+
/// [`decoder()`]: Reader::decoder()
819+
#[inline]
820+
pub fn read_text(&mut self, end: QName) -> Result<Cow<'i, str>> {
    // Pure delegation: the wrapped reader's `read_text` does the actual work.
    let inner = &mut self.reader;
    inner.read_text(end)
}
753823
}
754824

755825
impl<R> Deref for NsReader<R> {

src/reader/slice_reader.rs

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
//! underlying byte stream. This implementation supports not using an
33
//! intermediate buffer as the byte slice itself can be used to borrow from.
44
5+
use std::borrow::Cow;
6+
57
#[cfg(feature = "encoding")]
68
use crate::reader::EncodingRef;
79
#[cfg(feature = "encoding")]
@@ -153,6 +155,78 @@ impl<'a> Reader<&'a [u8]> {
153155
pub fn read_to_end(&mut self, end: QName) -> Result<Span> {
154156
Ok(read_to_end!(self, end, (), read_event_impl, {}))
155157
}
158+
159+
/// Reads content between start and end tags, including any markup. This
160+
/// function is supposed to be called after you already read a [`Start`] event.
161+
///
162+
/// Manages nested cases where parent and child elements have the same name.
163+
///
164+
/// This method does not unescape read data, instead it returns content
165+
/// "as is" of the XML document. This is because it has no idea what text
166+
/// it reads, and if, for example, it contains a CDATA section, an attempt to
167+
/// unescape its content will spoil the data.
168+
///
169+
/// Any text will be decoded using the reader's current [`decoder()`].
170+
///
171+
/// Actually, this method performs the following code:
172+
///
173+
/// ```ignore
174+
/// let span = reader.read_to_end(end)?;
175+
/// let text = reader.decoder().decode(&reader.inner_slice[span]);
176+
/// ```
177+
///
178+
/// # Examples
179+
///
180+
/// This example shows how you can read HTML content from your XML document.
181+
///
182+
/// ```
183+
/// # use pretty_assertions::assert_eq;
184+
/// # use std::borrow::Cow;
185+
/// use quick_xml::events::{BytesStart, Event};
186+
/// use quick_xml::Reader;
187+
///
188+
/// let mut reader = Reader::from_str("
189+
/// <html>
190+
/// <title>This is a HTML text</title>
191+
/// <p>Usual XML rules does not apply inside it
192+
/// <p>For example, elements not needed to be &quot;closed&quot;
193+
/// </html>
194+
/// ");
195+
/// reader.trim_text(true);
196+
///
197+
/// let start = BytesStart::new("html");
198+
/// let end = start.to_end().into_owned();
199+
///
200+
/// // First, we read a start event...
201+
/// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
202+
/// // ...and disable checking of end names because we expect HTML further...
203+
/// reader.check_end_names(false);
204+
///
205+
/// // ...then, we could read text content until close tag.
206+
/// // This call will correctly handle nested <html> elements.
207+
/// let text = reader.read_text(end.name()).unwrap();
208+
/// assert_eq!(text, Cow::Borrowed(r#"
209+
/// <title>This is a HTML text</title>
210+
/// <p>Usual XML rules does not apply inside it
211+
/// <p>For example, elements not needed to be &quot;closed&quot;
212+
/// "#));
213+
///
214+
/// // Now we can enable checks again
215+
/// reader.check_end_names(true);
216+
///
217+
/// // At the end we should get an Eof event, because we ate the whole XML
218+
/// assert_eq!(reader.read_event().unwrap(), Event::Eof);
219+
/// ```
220+
///
221+
/// [`Start`]: Event::Start
222+
/// [`decoder()`]: Self::decoder()
223+
pub fn read_text(&mut self, end: QName) -> Result<Cow<'a, str>> {
    // `read_to_end` changes `self.reader`, so keep a copy of the slice
    // reference as it was before the call.
    let input = self.reader;
    let consumed = self.read_to_end(end)?.len();

    // Decode the prefix of the saved slice whose length equals the length
    // of the span returned by `read_to_end`.
    let content = &input[..consumed];
    self.decoder().decode(content)
}
156230
}
157231

158232
////////////////////////////////////////////////////////////////////////////////////////////////////

0 commit comments

Comments
 (0)