Skip to content

Commit e35e4d8

Browse files
bukzor and claude committed
Fix #160: Add proper text escaping in FormatHtml
HTML text nodes containing &, <, or > were written out without escaping, so xq's output could not be parsed when piped back through xq -j.

This commit adds:
- A new escapeTextContent() function that performs minimal entity escaping
- A change to FormatHtml so text nodes are escaped as &amp;, &lt;, &gt;
- Tests verifying that the output is valid XML

Example issue:

    echo '<html>1 &amp; 2</html>' | xq | xq -j
    # Before: Error - bare & in output
    # After: Success - properly escaped as &amp;

This is a critical fix preventing data corruption when round-tripping HTML through xq.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent ec5a59a commit e35e4d8

File tree

2 files changed

+63
-0
lines changed

2 files changed

+63
-0
lines changed

cmd/root_test.go

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,57 @@ func TestCDATASupport(t *testing.T) {
114114
assert.Equal(t, expected, result)
115115
}
116116

117+
func TestEscapedTextNodes(t *testing.T) {
118+
// Test case 1: ampersand entity - reproduce issue #160
119+
// xq outputs bare & which fails when parsed as XML (used by -j flag)
120+
t.Run("ampersand entity output is valid XML", func(t *testing.T) {
121+
input := "<html>1 &amp; 2</html>"
122+
123+
// First pass: format the HTML
124+
reader1 := strings.NewReader(input)
125+
var output1 bytes.Buffer
126+
err := utils.FormatHtml(reader1, &output1, "", utils.ColorsDisabled)
127+
assert.Nil(t, err)
128+
129+
result1 := strings.TrimSpace(output1.String())
130+
t.Logf("First pass output: %q", result1)
131+
132+
// Second pass: try to parse as XML (this is what `xq -j` does)
133+
reader2 := strings.NewReader(result1)
134+
_, err = xmlquery.Parse(reader2)
135+
assert.Nil(t, err, "xq output should be parseable as XML (for -j flag)")
136+
})
137+
138+
// Test case 2: less-than and greater-than entities - reproduce issue #160
139+
// xq outputs bare < and > which are parsed as tags
140+
t.Run("less-than and greater-than entities output is valid XML", func(t *testing.T) {
141+
input := "<html>is &lt;bold&gt; a valid tag?</html>"
142+
143+
// First pass: format the HTML
144+
reader1 := strings.NewReader(input)
145+
var output1 bytes.Buffer
146+
err := utils.FormatHtml(reader1, &output1, "", utils.ColorsDisabled)
147+
assert.Nil(t, err)
148+
149+
result1 := strings.TrimSpace(output1.String())
150+
t.Logf("First pass output: %q", result1)
151+
152+
// Second pass: try to parse as XML (this is what `xq -j` does)
153+
reader2 := strings.NewReader(result1)
154+
doc, err := xmlquery.Parse(reader2)
155+
assert.Nil(t, err, "xq output should be parseable as XML (for -j flag)")
156+
157+
// Verify the text content is preserved correctly
158+
if doc != nil {
159+
textNode := xmlquery.FindOne(doc, "//html")
160+
if textNode != nil {
161+
assert.Equal(t, "is <bold> a valid tag?", textNode.InnerText(),
162+
"Text content should preserve the literal < and > characters")
163+
}
164+
}
165+
})
166+
}
167+
117168
func TestProcessAsJSON(t *testing.T) {
118169
tests := []struct {
119170
name string

internal/utils/utils.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,9 @@ func FormatHtml(reader io.Reader, writer io.Writer, indent string, colors int) e
338338
case html.TextToken:
339339
str := normalizeSpaces(string(tokenizer.Text()), indent, level)
340340
hasContent = str != ""
341+
if hasContent {
342+
str = escapeTextContent(str)
343+
}
341344
_, _ = fmt.Fprint(writer, str)
342345
case html.StartTagToken, html.SelfClosingTagToken:
343346
if level > 0 {
@@ -585,6 +588,15 @@ func escapeText(input string) (string, error) {
585588
return result, nil
586589
}
587590

591+
// textContentEscaper rewrites the three characters that cannot appear
// literally in text content without breaking XML parsing. It is built once at
// package initialisation; a strings.Replacer is safe for concurrent use.
var textContentEscaper = strings.NewReplacer(
	"&", "&amp;",
	"<", "&lt;",
	">", "&gt;",
)

// escapeTextContent returns input with &, < and > replaced by the entities
// &amp;, &lt; and &gt;. Only this minimal set is escaped; quotes and all other
// characters pass through unchanged.
//
// The replacement happens in a single pass over input, so an ampersand that is
// produced by the escaping itself is never escaped a second time. An ampersand
// already present in input is always escaped, e.g. "&lt;" becomes "&amp;lt;".
func escapeTextContent(input string) string {
	return textContentEscaper.Replace(input)
}
599+
588600
func normalizeSpaces(input string, indent string, level int) string {
589601
if strings.TrimSpace(input) == "" {
590602
input = ""

0 commit comments

Comments
 (0)