Skip to content

Commit 4334390

Browse files
authored
Fix HTML resource content iterator (#338)
1 parent 2fa3d11 commit 4334390

File tree

5 files changed

+524
-73
lines changed

5 files changed

+524
-73
lines changed

CHANGELOG.md

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,23 @@ All notable changes to this project will be documented in this file. Take a look
1717
* Scroll mode: jumping between two EPUB resources with a horizontal swipe triggers the `Navigator.Listener.onJumpToLocator()` callback.
1818
* This can be used to allow the user to go back to their previous location if they swiped across chapters by mistake.
1919

20-
#### Changed
20+
#### Streamer
21+
22+
* The EPUB content iterator now returns `audio` and `video` elements.
23+
24+
### Changed
25+
26+
#### Navigator
2127

2228
* `EpubNavigatorFragment.firstVisibleElementLocator()` now returns the first *block* element that is visible on the screen, even if it starts on previous pages.
2329
* This is used to make sure the user will not miss any context when restoring a TTS session in the middle of a resource.
2430

31+
### Fixed
32+
33+
#### Streamer
34+
35+
* Fix issue with the TTS starting from the beginning of the chapter instead of the current position.
36+
2537
## [2.3.0]
2638

2739
### Added
Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,13 @@
1-
SCRIPTS_PATH := src/main/assets/_scripts
1+
SCRIPTS_PATH := readium/navigator/src/main/assets/_scripts
22

33
help:
44
@echo "Usage: make <target>\n\n\
5-
install\tDownload NPM dependencies\n\
6-
scripts\tBundle EPUB scripts with Webpack\n\
7-
lint-scripts\tCheck quality of EPUB scripts\n\
5+
scripts\tBundle the Navigator EPUB scripts\n\
86
"
97

10-
install:
11-
yarn --cwd "$(SCRIPTS_PATH)" install --frozen-lockfile
12-
8+
.PHONY: scripts
139
scripts:
10+
yarn --cwd "$(SCRIPTS_PATH)" install --frozen-lockfile
1411
yarn --cwd "$(SCRIPTS_PATH)" run format
15-
yarn --cwd "$(SCRIPTS_PATH)" run bundle
16-
17-
lint-scripts:
1812
yarn --cwd "$(SCRIPTS_PATH)" run lint
13+
yarn --cwd "$(SCRIPTS_PATH)" run bundle

readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/Content.kt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,6 @@ interface Content {
242242

243243
/**
244244
* Extracts the full raw text, or returns null if no text content can be found.
245-
*
246245
* @param separator Separator to use between individual elements. Defaults to newline.
247246
*/
248247
suspend fun text(separator: String = "\n"): String? =

readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIterator.kt

Lines changed: 99 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ import org.readium.r2.shared.util.Language
2727
import org.readium.r2.shared.util.mediatype.MediaType
2828
import org.readium.r2.shared.util.use
2929

30-
// FIXME: Support custom skipped elements
30+
// FIXME: Support custom skipped elements?
3131

3232
/**
3333
* Iterates an HTML [resource], starting from the given [locator].
@@ -36,11 +36,14 @@ import org.readium.r2.shared.util.use
3636
* [Locator.Locations] object.
3737
*
3838
* If you want to start from the end of the resource, the [locator] must have a `progression` of 1.0.
39+
*
40+
* Locators will contain a `before` context of up to `beforeMaxLength` characters.
3941
*/
4042
@ExperimentalReadiumApi
4143
class HtmlResourceContentIterator(
4244
private val resource: Resource,
43-
private val locator: Locator
45+
private val locator: Locator,
46+
private val beforeMaxLength: Int = 50
4447
) : Content.Iterator {
4548

4649
companion object {
@@ -57,47 +60,50 @@ class HtmlResourceContentIterator(
5760
/**
5861
* [Content.Element] loaded with [hasPrevious] or [hasNext], associated with the move delta.
5962
*/
60-
private data class ContentWithDelta(
63+
private data class ElementWithDelta(
6164
val element: Content.Element,
6265
val delta: Int
6366
)
6467

65-
private var currentElement: ContentWithDelta? = null
68+
private var requestedElement: ElementWithDelta? = null
6669

6770
override suspend fun hasPrevious(): Boolean {
68-
currentElement = nextBy(-1)
69-
return currentElement != null
71+
if (requestedElement?.delta == -1) return true
72+
73+
val index = currentIndex() - 1
74+
val element = elements().elements.getOrNull(index) ?: return false
75+
currentIndex = index
76+
requestedElement = ElementWithDelta(element, -1)
77+
return true
7078
}
7179

7280
override fun previous(): Content.Element =
73-
currentElement
81+
requestedElement
7482
?.takeIf { it.delta == -1 }?.element
83+
?.also { requestedElement = null }
7584
?: throw IllegalStateException("Called previous() without a successful call to hasPrevious() first")
7685

7786
override suspend fun hasNext(): Boolean {
78-
currentElement = nextBy(+1)
79-
return currentElement != null
87+
if (requestedElement?.delta == 1) return true
88+
89+
val index = currentIndex()
90+
val element = elements().elements.getOrNull(index) ?: return false
91+
currentIndex = index + 1
92+
requestedElement = ElementWithDelta(element, +1)
93+
return true
8094
}
8195

8296
override fun next(): Content.Element =
83-
currentElement
84-
?.takeIf { it.delta == +1 }?.element
97+
requestedElement
98+
?.takeIf { it.delta == 1 }?.element
99+
?.also { requestedElement = null }
85100
?: throw IllegalStateException("Called next() without a successful call to hasNext() first")
86101

87-
private suspend fun nextBy(delta: Int): ContentWithDelta? {
88-
val elements = elements()
89-
val index = currentIndex?.let { it + delta }
90-
?: elements.startIndex
91-
92-
val content = elements.elements.getOrNull(index)
93-
?: return null
94-
95-
currentIndex = index
96-
return ContentWithDelta(content, delta)
97-
}
98-
99102
private var currentIndex: Int? = null
100103

104+
private suspend fun currentIndex(): Int =
105+
currentIndex ?: elements().startIndex
106+
101107
private suspend fun elements(): ParsedElements =
102108
parsedElements
103109
?: parseElements().also { parsedElements = it }
@@ -116,14 +122,15 @@ class HtmlResourceContentIterator(
116122
// The JS third-party library used to generate the CSS Selector sometimes adds
117123
// :root >, which doesn't work with JSoup.
118124
tryOrNull { body.selectFirst(it.removePrefix(":root > ")) }
119-
}
125+
},
126+
beforeMaxLength = beforeMaxLength
120127
)
121128
NodeTraversor.traverse(contentParser, body)
122129
return contentParser.result()
123130
}
124131

125132
/**
126-
* Holds the result of parsing the HTML resource into a list of [ContentElement].
133+
* Holds the result of parsing the HTML resource into a list of `ContentElement`.
127134
*
128135
* The [startIndex] will be calculated from the element matched by the base [locator], if
129136
* possible. Defaults to 0.
@@ -136,6 +143,7 @@ class HtmlResourceContentIterator(
136143
private class ContentParser(
137144
private val baseLocator: Locator,
138145
private val startElement: Element?,
146+
private val beforeMaxLength: Int
139147
) : NodeVisitor {
140148

141149
fun result() = ParsedElements(
@@ -146,51 +154,48 @@ class HtmlResourceContentIterator(
146154

147155
private val elements = mutableListOf<Content.Element>()
148156
private var startIndex = 0
149-
private var currentElement: Element? = null
150157

151158
private val segmentsAcc = mutableListOf<TextElement.Segment>()
152159
private var textAcc = StringBuilder()
153-
private var wholeRawTextAcc: String = ""
160+
private var wholeRawTextAcc: String? = null
154161
private var elementRawTextAcc: String = ""
155162
private var rawTextAcc: String = ""
156163
private var currentLanguage: String? = null
157164
private var currentCssSelector: String? = null
158-
private var ignoredNode: Node? = null
159165

160-
override fun head(node: Node, depth: Int) {
161-
if (ignoredNode != null) return
162-
163-
if (node.isHidden) {
164-
ignoredNode = node
165-
return
166-
}
166+
/** LIFO stack of the current element's block ancestors. */
167+
private val breadcrumbs = mutableListOf<Element>()
167168

169+
override fun head(node: Node, depth: Int) {
168170
if (node is Element) {
169-
currentElement = node
171+
if (node.isBlock) {
172+
breadcrumbs.add(node)
173+
}
170174

171175
val tag = node.normalName()
172176

177+
val elementLocator: Locator by lazy {
178+
baseLocator.copy(
179+
locations = Locator.Locations(
180+
otherLocations = buildMap {
181+
put("cssSelector", node.cssSelector() as Any)
182+
}
183+
)
184+
)
185+
}
186+
173187
when {
174188
tag == "br" -> {
175189
flushText()
176190
}
191+
177192
tag == "img" -> {
178193
flushText()
179194

180-
val href = node.attr("src")
181-
.takeIf { it.isNotBlank() }
182-
?.let { Href(it, baseLocator.href).string }
183-
184-
if (href != null) {
195+
node.srcRelativeToHref(baseLocator.href)?.let { href ->
185196
elements.add(
186-
Content.ImageElement(
187-
locator = baseLocator.copy(
188-
locations = Locator.Locations(
189-
otherLocations = buildMap {
190-
put("cssSelector", node.cssSelector() as Any)
191-
}
192-
)
193-
),
197+
ImageElement(
198+
locator = elementLocator,
194199
embeddedLink = Link(href = href),
195200
caption = null, // FIXME: Get the caption from figcaption
196201
attributes = buildList {
@@ -203,6 +208,34 @@ class HtmlResourceContentIterator(
203208
)
204209
}
205210
}
211+
212+
tag == "audio" || tag == "video" -> {
213+
flushText()
214+
215+
val href = node.srcRelativeToHref(baseLocator.href)
216+
val link: Link? =
217+
if (href != null) {
218+
Link(href = href)
219+
} else {
220+
val sources = node.select("source")
221+
.mapNotNull { source ->
222+
source.srcRelativeToHref(baseLocator.href)?.let { href ->
223+
Link(href = href, type = source.attr("type").takeUnless { it.isBlank() })
224+
}
225+
}
226+
227+
sources.firstOrNull()?.copy(alternates = sources.drop(1))
228+
}
229+
230+
if (link != null) {
231+
when (tag) {
232+
"audio" -> elements.add(AudioElement(locator = elementLocator, embeddedLink = link, attributes = emptyList()))
233+
"video" -> elements.add(VideoElement(locator = elementLocator, embeddedLink = link, attributes = emptyList()))
234+
else -> {}
235+
}
236+
}
237+
}
238+
206239
node.isBlock -> {
207240
segmentsAcc.clear()
208241
textAcc.clear()
@@ -214,10 +247,6 @@ class HtmlResourceContentIterator(
214247
}
215248

216249
override fun tail(node: Node, depth: Int) {
217-
if (ignoredNode == node) {
218-
ignoredNode = null
219-
}
220-
221250
if (node is TextNode) {
222251
val language = node.language
223252
if (currentLanguage != language) {
@@ -229,7 +258,9 @@ class HtmlResourceContentIterator(
229258
appendNormalisedText(node)
230259
} else if (node is Element) {
231260
if (node.isBlock) {
261+
assert(breadcrumbs.last() == node)
232262
flushText()
263+
breadcrumbs.removeLast()
233264
}
234265
}
235266
}
@@ -246,7 +277,7 @@ class HtmlResourceContentIterator(
246277
flushSegment()
247278
if (segmentsAcc.isEmpty()) return
248279

249-
if (startElement != null && currentElement == startElement) {
280+
if (startElement != null && breadcrumbs.lastOrNull() == startElement) {
250281
startIndex = elements.size
251282
}
252283
elements.add(
@@ -259,7 +290,10 @@ class HtmlResourceContentIterator(
259290
}
260291
}
261292
),
262-
text = Locator.Text(highlight = elementRawTextAcc)
293+
text = Locator.Text(
294+
before = segmentsAcc.firstOrNull()?.locator?.text?.before,
295+
highlight = elementRawTextAcc,
296+
)
263297
),
264298
role = TextElement.Role.Body,
265299
segments = segmentsAcc.toList()
@@ -296,7 +330,7 @@ class HtmlResourceContentIterator(
296330
),
297331
text = Locator.Text(
298332
highlight = rawTextAcc,
299-
before = wholeRawTextAcc.takeLast(50) // FIXME: custom length
333+
before = wholeRawTextAcc?.takeLast(beforeMaxLength)
300334
)
301335
),
302336
text = text,
@@ -309,18 +343,22 @@ class HtmlResourceContentIterator(
309343
)
310344
}
311345

312-
wholeRawTextAcc += rawTextAcc
313-
elementRawTextAcc += rawTextAcc
346+
if (rawTextAcc != "") {
347+
wholeRawTextAcc = (wholeRawTextAcc ?: "") + rawTextAcc
348+
elementRawTextAcc += rawTextAcc
349+
}
314350
rawTextAcc = ""
315351
textAcc.clear()
316352
}
317353
}
318354
}
319355

320-
// FIXME: Setup ignore conditions
321-
private val Node.isHidden: Boolean get() = false
322-
323356
private val Node.language: String? get() =
324357
attr("xml:lang").takeUnless { it.isBlank() }
325358
?: attr("lang").takeUnless { it.isBlank() }
326359
?: parent()?.language
360+
361+
private fun Node.srcRelativeToHref(baseHref: String): String? =
362+
attr("src")
363+
.takeIf { it.isNotBlank() }
364+
?.let { Href(it, baseHref).string }

0 commit comments

Comments
 (0)