@@ -27,7 +27,7 @@ import org.readium.r2.shared.util.Language
2727import org.readium.r2.shared.util.mediatype.MediaType
2828import org.readium.r2.shared.util.use
2929
30- // FIXME: Support custom skipped elements
30+ // FIXME: Support custom skipped elements?
3131
3232/* *
3333 * Iterates an HTML [resource], starting from the given [locator].
@@ -36,11 +36,14 @@ import org.readium.r2.shared.util.use
3636 * [Locator.Locations] object.
3737 *
3838 * If you want to start from the end of the resource, the [locator] must have a `progression` of 1.0.
39+ *
40+ * Locators will contain a `before` context of up to `beforeMaxLength` characters.
3941 */
4042@ExperimentalReadiumApi
4143class HtmlResourceContentIterator (
4244 private val resource : Resource ,
43- private val locator : Locator
45+ private val locator : Locator ,
46+ private val beforeMaxLength : Int = 50
4447) : Content.Iterator {
4548
4649 companion object {
@@ -57,47 +60,50 @@ class HtmlResourceContentIterator(
5760 /* *
5861 * [Content.Element] loaded with [hasPrevious] or [hasNext], associated with the move delta.
5962 */
60- private data class ContentWithDelta (
63+ private data class ElementWithDelta (
6164 val element : Content .Element ,
6265 val delta : Int
6366 )
6467
65- private var currentElement : ContentWithDelta ? = null
68+ private var requestedElement : ElementWithDelta ? = null
6669
6770 override suspend fun hasPrevious (): Boolean {
68- currentElement = nextBy(- 1 )
69- return currentElement != null
71+ if (requestedElement?.delta == - 1 ) return true
72+
73+ val index = currentIndex() - 1
74+ val element = elements().elements.getOrNull(index) ? : return false
75+ currentIndex = index
76+ requestedElement = ElementWithDelta (element, - 1 )
77+ return true
7078 }
7179
7280 override fun previous (): Content .Element =
73- currentElement
81+ requestedElement
7482 ?.takeIf { it.delta == - 1 }?.element
83+ ?.also { requestedElement = null }
7584 ? : throw IllegalStateException (" Called previous() without a successful call to hasPrevious() first" )
7685
7786 override suspend fun hasNext (): Boolean {
78- currentElement = nextBy(+ 1 )
79- return currentElement != null
87+ if (requestedElement?.delta == 1 ) return true
88+
89+ val index = currentIndex()
90+ val element = elements().elements.getOrNull(index) ? : return false
91+ currentIndex = index + 1
92+ requestedElement = ElementWithDelta (element, + 1 )
93+ return true
8094 }
8195
8296 override fun next (): Content .Element =
83- currentElement
84- ?.takeIf { it.delta == + 1 }?.element
97+ requestedElement
98+ ?.takeIf { it.delta == 1 }?.element
99+ ?.also { requestedElement = null }
85100 ? : throw IllegalStateException (" Called next() without a successful call to hasNext() first" )
86101
87- private suspend fun nextBy (delta : Int ): ContentWithDelta ? {
88- val elements = elements()
89- val index = currentIndex?.let { it + delta }
90- ? : elements.startIndex
91-
92- val content = elements.elements.getOrNull(index)
93- ? : return null
94-
95- currentIndex = index
96- return ContentWithDelta (content, delta)
97- }
98-
99102 private var currentIndex: Int? = null
100103
104+ private suspend fun currentIndex (): Int =
105+ currentIndex ? : elements().startIndex
106+
101107 private suspend fun elements (): ParsedElements =
102108 parsedElements
103109 ? : parseElements().also { parsedElements = it }
@@ -116,14 +122,15 @@ class HtmlResourceContentIterator(
116122 // The JS third-party library used to generate the CSS Selector sometimes adds
117123 // :root >, which doesn't work with JSoup.
118124 tryOrNull { body.selectFirst(it.removePrefix(" :root > " )) }
119- }
125+ },
126+ beforeMaxLength = beforeMaxLength
120127 )
121128 NodeTraversor .traverse(contentParser, body)
122129 return contentParser.result()
123130 }
124131
125132 /* *
126- * Holds the result of parsing the HTML resource into a list of [ ContentElement] .
133+ * Holds the result of parsing the HTML resource into a list of ` ContentElement` .
127134 *
128135 * The [startIndex] will be calculated from the element matched by the base [locator], if
129136 * possible. Defaults to 0.
@@ -136,6 +143,7 @@ class HtmlResourceContentIterator(
136143 private class ContentParser (
137144 private val baseLocator : Locator ,
138145 private val startElement : Element ? ,
146+ private val beforeMaxLength : Int
139147 ) : NodeVisitor {
140148
141149 fun result () = ParsedElements (
@@ -146,51 +154,48 @@ class HtmlResourceContentIterator(
146154
147155 private val elements = mutableListOf<Content .Element >()
148156 private var startIndex = 0
149- private var currentElement: Element ? = null
150157
151158 private val segmentsAcc = mutableListOf<TextElement .Segment >()
152159 private var textAcc = StringBuilder ()
153- private var wholeRawTextAcc: String = " "
160+ private var wholeRawTextAcc: String? = null
154161 private var elementRawTextAcc: String = " "
155162 private var rawTextAcc: String = " "
156163 private var currentLanguage: String? = null
157164 private var currentCssSelector: String? = null
158- private var ignoredNode: Node ? = null
159165
160- override fun head (node : Node , depth : Int ) {
161- if (ignoredNode != null ) return
162-
163- if (node.isHidden) {
164- ignoredNode = node
165- return
166- }
166+ /* * LIFO stack of the current element's block ancestors. */
167+ private val breadcrumbs = mutableListOf<Element >()
167168
169+ override fun head (node : Node , depth : Int ) {
168170 if (node is Element ) {
169- currentElement = node
171+ if (node.isBlock) {
172+ breadcrumbs.add(node)
173+ }
170174
171175 val tag = node.normalName()
172176
177+ val elementLocator: Locator by lazy {
178+ baseLocator.copy(
179+ locations = Locator .Locations (
180+ otherLocations = buildMap {
181+ put(" cssSelector" , node.cssSelector() as Any )
182+ }
183+ )
184+ )
185+ }
186+
173187 when {
174188 tag == " br" -> {
175189 flushText()
176190 }
191+
177192 tag == " img" -> {
178193 flushText()
179194
180- val href = node.attr(" src" )
181- .takeIf { it.isNotBlank() }
182- ?.let { Href (it, baseLocator.href).string }
183-
184- if (href != null ) {
195+ node.srcRelativeToHref(baseLocator.href)?.let { href ->
185196 elements.add(
186- Content .ImageElement (
187- locator = baseLocator.copy(
188- locations = Locator .Locations (
189- otherLocations = buildMap {
190- put(" cssSelector" , node.cssSelector() as Any )
191- }
192- )
193- ),
197+ ImageElement (
198+ locator = elementLocator,
194199 embeddedLink = Link (href = href),
195200 caption = null , // FIXME: Get the caption from figcaption
196201 attributes = buildList {
@@ -203,6 +208,34 @@ class HtmlResourceContentIterator(
203208 )
204209 }
205210 }
211+
212+ tag == " audio" || tag == " video" -> {
213+ flushText()
214+
215+ val href = node.srcRelativeToHref(baseLocator.href)
216+ val link: Link ? =
217+ if (href != null ) {
218+ Link (href = href)
219+ } else {
220+ val sources = node.select(" source" )
221+ .mapNotNull { source ->
222+ source.srcRelativeToHref(baseLocator.href)?.let { href ->
223+ Link (href = href, type = source.attr(" type" ).takeUnless { it.isBlank() })
224+ }
225+ }
226+
227+ sources.firstOrNull()?.copy(alternates = sources.drop(1 ))
228+ }
229+
230+ if (link != null ) {
231+ when (tag) {
232+ " audio" -> elements.add(AudioElement (locator = elementLocator, embeddedLink = link, attributes = emptyList()))
233+ " video" -> elements.add(VideoElement (locator = elementLocator, embeddedLink = link, attributes = emptyList()))
234+ else -> {}
235+ }
236+ }
237+ }
238+
206239 node.isBlock -> {
207240 segmentsAcc.clear()
208241 textAcc.clear()
@@ -214,10 +247,6 @@ class HtmlResourceContentIterator(
214247 }
215248
216249 override fun tail (node : Node , depth : Int ) {
217- if (ignoredNode == node) {
218- ignoredNode = null
219- }
220-
221250 if (node is TextNode ) {
222251 val language = node.language
223252 if (currentLanguage != language) {
@@ -229,7 +258,9 @@ class HtmlResourceContentIterator(
229258 appendNormalisedText(node)
230259 } else if (node is Element ) {
231260 if (node.isBlock) {
261+ assert (breadcrumbs.last() == node)
232262 flushText()
263+ breadcrumbs.removeLast()
233264 }
234265 }
235266 }
@@ -246,7 +277,7 @@ class HtmlResourceContentIterator(
246277 flushSegment()
247278 if (segmentsAcc.isEmpty()) return
248279
249- if (startElement != null && currentElement == startElement) {
280+ if (startElement != null && breadcrumbs.lastOrNull() == startElement) {
250281 startIndex = elements.size
251282 }
252283 elements.add(
@@ -259,7 +290,10 @@ class HtmlResourceContentIterator(
259290 }
260291 }
261292 ),
262- text = Locator .Text (highlight = elementRawTextAcc)
293+ text = Locator .Text (
294+ before = segmentsAcc.firstOrNull()?.locator?.text?.before,
295+ highlight = elementRawTextAcc,
296+ )
263297 ),
264298 role = TextElement .Role .Body ,
265299 segments = segmentsAcc.toList()
@@ -296,7 +330,7 @@ class HtmlResourceContentIterator(
296330 ),
297331 text = Locator .Text (
298332 highlight = rawTextAcc,
299- before = wholeRawTextAcc.takeLast(50 ) // FIXME: custom length
333+ before = wholeRawTextAcc? .takeLast(beforeMaxLength)
300334 )
301335 ),
302336 text = text,
@@ -309,18 +343,22 @@ class HtmlResourceContentIterator(
309343 )
310344 }
311345
312- wholeRawTextAcc + = rawTextAcc
313- elementRawTextAcc + = rawTextAcc
346+ if (rawTextAcc != " " ) {
347+ wholeRawTextAcc = (wholeRawTextAcc ? : " " ) + rawTextAcc
348+ elementRawTextAcc + = rawTextAcc
349+ }
314350 rawTextAcc = " "
315351 textAcc.clear()
316352 }
317353 }
318354}
319355
320- // FIXME: Setup ignore conditions
321- private val Node .isHidden: Boolean get() = false
322-
323356private val Node .language: String? get() =
324357 attr(" xml:lang" ).takeUnless { it.isBlank() }
325358 ? : attr(" lang" ).takeUnless { it.isBlank() }
326359 ? : parent()?.language
360+
361+ private fun Node.srcRelativeToHref (baseHref : String ): String? =
362+ attr(" src" )
363+ .takeIf { it.isNotBlank() }
364+ ?.let { Href (it, baseHref).string }
0 commit comments