Skip to content

Commit 550beab

Browse files
authored
Fix HTML resource content iterator (#342)
1 parent 4e2f66d commit 550beab

File tree

3 files changed

+123
-11
lines changed

3 files changed

+123
-11
lines changed

Makefile

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,19 @@ SCRIPTS_PATH := readium/navigator/src/main/assets/_scripts
22

33
help:
44
@echo "Usage: make <target>\n\n\
5+
lint\t\tLint the Kotlin sources with ktlint\n\
6+
format\tFormat the Kotlin sources with ktlint\n\
57
scripts\tBundle the Navigator EPUB scripts\n\
68
"
79

10+
.PHONY: lint
11+
lint:
12+
./gradlew ktlintCheck
13+
14+
.PHONY: format
15+
format:
16+
./gradlew ktlintFormat
17+
818
.PHONY: scripts
919
scripts:
1020
yarn --cwd "$(SCRIPTS_PATH)" install --frozen-lockfile

readium/shared/src/main/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIterator.kt

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -158,12 +158,19 @@ class HtmlResourceContentIterator(
158158
private val elements = mutableListOf<Content.Element>()
159159
private var startIndex = 0
160160

161+
/** Segments accumulated for the current element. */
161162
private val segmentsAcc = mutableListOf<TextElement.Segment>()
163+
/** Text since the beginning of the current segment, after coalescing whitespaces. */
162164
private var textAcc = StringBuilder()
165+
/** Text content since the beginning of the resource, including whitespaces. */
163166
private var wholeRawTextAcc: String? = null
167+
/** Text content since the beginning of the current element, including whitespaces. */
164168
private var elementRawTextAcc: String = ""
169+
/** Text content since the beginning of the current segment, including whitespaces. */
165170
private var rawTextAcc: String = ""
171+
/** Language of the current segment. */
166172
private var currentLanguage: String? = null
173+
/** CSS selector of the current element. */
167174
private var currentCssSelector: String? = null
168175

169176
/** LIFO stack of the current element's block ancestors. */
@@ -240,17 +247,15 @@ class HtmlResourceContentIterator(
240247
}
241248

242249
node.isBlock -> {
243-
segmentsAcc.clear()
244-
textAcc.clear()
245-
rawTextAcc = ""
250+
flushText()
246251
currentCssSelector = node.cssSelector()
247252
}
248253
}
249254
}
250255
}
251256

252257
override fun tail(node: Node, depth: Int) {
253-
if (node is TextNode) {
258+
if (node is TextNode && node.wholeText.isNotBlank()) {
254259
val language = node.language
255260
if (currentLanguage != language) {
256261
flushSegment()
@@ -278,11 +283,17 @@ class HtmlResourceContentIterator(
278283

279284
private fun flushText() {
280285
flushSegment()
281-
if (segmentsAcc.isEmpty()) return
282286

283-
if (startElement != null && breadcrumbs.lastOrNull() == startElement) {
287+
if (startIndex == 0 && startElement != null && breadcrumbs.lastOrNull() == startElement) {
284288
startIndex = elements.size
285289
}
290+
291+
if (segmentsAcc.isEmpty()) return
292+
293+
// Trim the end of the last segment's text to get a cleaner output for the TextElement.
294+
// Only whitespaces between the segments are meaningful.
295+
segmentsAcc[segmentsAcc.size - 1] = segmentsAcc.last().run { copy(text = text.trimEnd()) }
296+
286297
elements.add(
287298
Content.TextElement(
288299
locator = baseLocator.copy(
@@ -293,9 +304,9 @@ class HtmlResourceContentIterator(
293304
}
294305
}
295306
),
296-
text = Locator.Text(
297-
before = segmentsAcc.firstOrNull()?.locator?.text?.before,
298-
highlight = elementRawTextAcc,
307+
text = Locator.Text.trimmingText(
308+
elementRawTextAcc,
309+
before = segmentsAcc.firstOrNull()?.locator?.text?.before
299310
)
300311
),
301312
role = TextElement.Role.Body,
@@ -331,8 +342,8 @@ class HtmlResourceContentIterator(
331342
}
332343
}
333344
),
334-
text = Locator.Text(
335-
highlight = rawTextAcc,
345+
text = Locator.Text.trimmingText(
346+
rawTextAcc,
336347
before = wholeRawTextAcc?.takeLast(beforeMaxLength)
337348
)
338349
),
@@ -356,6 +367,13 @@ class HtmlResourceContentIterator(
356367
}
357368
}
358369

370+
private fun Locator.Text.Companion.trimmingText(text: String, before: String?): Locator.Text =
371+
Locator.Text(
372+
before = ((before ?: "") + text.takeWhile { it.isWhitespace() }).takeUnless { it.isBlank() },
373+
highlight = text.trim(),
374+
after = text.takeLastWhile { it.isWhitespace() }.takeUnless { it.isBlank() }
375+
)
376+
359377
private val Node.language: String? get() =
360378
attr("xml:lang").takeUnless { it.isBlank() }
361379
?: attr("lang").takeUnless { it.isBlank() }

readium/shared/src/test/java/org/readium/r2/shared/publication/services/content/iterators/HtmlResourceContentIteratorTest.kt

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -455,4 +455,88 @@ class HtmlResourceContentIteratorTest {
455455
iterator(html).elements()
456456
)
457457
}
458+
459+
@Test
460+
fun `iterating over an element containing both a text node and child elements`() = runTest {
461+
val html = """
462+
<?xml version="1.0" encoding="UTF-8"?>
463+
<html xmlns="http://www.w3.org/1999/xhtml">
464+
<body>
465+
<ol class="decimal" id="c06-list-0001">
466+
<li id="c06-li-0001">Let&#39;s start at the top&#8212;the <i>source of ideas</i>.
467+
<aside><div class="top hr"><hr/></div>
468+
<section class="feature1">
469+
<p id="c06-para-0019"><i>While almost everyone today claims to be Agile, what I&#39;ve just described is very much a <i>waterfall</i> process.</i></p>
470+
</section>
471+
Trailing text
472+
</li>
473+
</ol>
474+
</body>
475+
</html>
476+
"""
477+
478+
assertEquals(
479+
listOf(
480+
TextElement(
481+
locator = locator(
482+
selector = "#c06-li-0001",
483+
highlight = "Let's start at the top—the source of ideas."
484+
),
485+
role = TextElement.Role.Body,
486+
segments = listOf(
487+
Segment(
488+
locator = locator(
489+
selector = "#c06-li-0001",
490+
highlight = "Let's start at the top—the source of ideas."
491+
),
492+
text = "Let's start at the top—the source of ideas.",
493+
attributes = emptyList()
494+
),
495+
),
496+
attributes = emptyList()
497+
),
498+
TextElement(
499+
locator = locator(
500+
selector = "#c06-para-0019",
501+
before = " top—the source of ideas.\n ",
502+
highlight = "While almost everyone today claims to be Agile, what I've just described is very much a waterfall process."
503+
),
504+
role = TextElement.Role.Body,
505+
segments = listOf(
506+
Segment(
507+
locator = locator(
508+
selector = "#c06-para-0019",
509+
before = " top—the source of ideas.\n ",
510+
highlight = "While almost everyone today claims to be Agile, what I've just described is very much a waterfall process."
511+
),
512+
text = "While almost everyone today claims to be Agile, what I've just described is very much a waterfall process.",
513+
attributes = emptyList()
514+
)
515+
),
516+
attributes = emptyList()
517+
),
518+
TextElement(
519+
locator = locator(
520+
selector = "#c06-para-0019",
521+
before = "e just described is very much a waterfall process.\n \n ",
522+
highlight = "Trailing text"
523+
),
524+
role = TextElement.Role.Body,
525+
segments = listOf(
526+
Segment(
527+
locator = locator(
528+
selector = "#c06-para-0019",
529+
before = "e just described is very much a waterfall process.\n ",
530+
highlight = "Trailing text"
531+
),
532+
text = "Trailing text",
533+
attributes = emptyList()
534+
)
535+
),
536+
attributes = emptyList()
537+
)
538+
),
539+
iterator(html).elements()
540+
)
541+
}
458542
}

0 commit comments

Comments
 (0)