Skip to content

Commit 10e5c2a

Browse files
committed
fix: handle mixed-case HTML tags correctly
HTML tag names are case-insensitive by spec, but the library was failing to process tags written in upper or mixed case (e.g., <Br>, <DIV>, <Strong>). This caused translation to stop prematurely, resulting in data loss.

Root cause: With lowerCaseTagName: false, the HTML parser preserved the original case but did not recognize mixed-case void elements such as <Br> as self-closing tags. As a result, content after the tag was incorrectly parsed as children of that tag.

Solution:
1. Set lowerCaseTagName: true in nodeHtmlParserConfig to normalize all tag names.
2. Updated visitor.ts to handle tag names case-insensitively using toUpperCase().
3. Added comprehensive tests for various mixed-case tag scenarios.

All translator lookups and element matching now work regardless of the original HTML tag casing, preventing data loss when processing HTML with inconsistent capitalization.

Resolves #63
1 parent eae5d0d commit 10e5c2a

File tree

3 files changed

+74
-7
lines changed

3 files changed

+74
-7
lines changed

src/utilities.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ export const truthyStr = (v: any, value?: string): string => v ? ((value !== und
128128
* Note: Do not change - values are tuned for performance
129129
*/
130130
export const nodeHtmlParserConfig: NodeHtmlParserOptions = {
131-
lowerCaseTagName: false,
131+
lowerCaseTagName: true,
132132
comment: false,
133133
fixNestedATags: true,
134134
blockTextElements: {

src/visitor.ts

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -111,14 +111,15 @@ export class Visitor {
111111
const { translators } = this.instance;
112112
(function visit(node: HtmlNode): boolean {
113113
let res = false
114-
if (isTextNode(node) || (isElementNode(node) && contentlessElements.includes(node.tagName))) {
114+
if (isTextNode(node) || (isElementNode(node) && contentlessElements.includes((node.tagName || '').toUpperCase()))) {
115115
res = true;
116116
}
117117
else {
118118
const childNodes = getChildNodes(node);
119119
if (!childNodes.length) {
120-
const translator = translators[(node as ElementNode).tagName];
121-
if (translator?.preserveIfEmpty || typeof translator === 'function') res = true;
120+
const elementNode = node as ElementNode;
121+
const translator = elementNode.tagName ? translators.get(elementNode.tagName) : undefined;
122+
if (typeof translator === 'function' || translator?.preserveIfEmpty) res = true;
122123
}
123124
else
124125
for (const child of childNodes) {
@@ -171,11 +172,14 @@ export class Visitor {
171172
if (textOnly || !isElementNode(node)) return;
172173

173174
/* Handle element node */
174-
const translatorCfgOrFactory: TranslatorConfig | TranslatorConfigFactory | undefined =
175-
metadata?.translators ? metadata.translators[node.tagName] : this.instance.translators[node.tagName];
175+
const tagNameUpper = (node.tagName || '').toUpperCase();
176+
177+
const translatorCfgOrFactory: TranslatorConfig | TranslatorConfigFactory | undefined = tagNameUpper
178+
? (metadata?.translators ? metadata.translators[tagNameUpper] : this.instance.translators.get(node.tagName))
179+
: undefined;
176180

177181
/* Update metadata with list detail */
178-
switch (node.tagName) {
182+
switch (tagNameUpper) {
179183
case 'UL':
180184
case 'OL':
181185
metadata = {

test/special-cases.test.ts

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,4 +101,67 @@ describe(`Special Cases`, () => {
101101
const res = translate(html);
102102
expect(res).toBe(expected);
103103
});
104+
105+
// See: https://github.com/crosstype/node-html-markdown/issues/63
106+
describe(`Handles mixed-case HTML tags correctly`, () => {
107+
test(`handles mixed-case <Br> tag`, () => {
108+
const res = translate('Foo<Br>Bar');
109+
expect(res).toBe('Foo \nBar');
110+
});
111+
112+
test(`handles uppercase <BR> tag`, () => {
113+
const res = translate('Hello<BR>World');
114+
expect(res).toBe('Hello \nWorld');
115+
});
116+
117+
test(`handles mixed-case <DIV> tag`, () => {
118+
const res = translate('<DIV>content</DIV>');
119+
expect(res).toBe('content');
120+
});
121+
122+
test(`handles mixed-case <Div> tag`, () => {
123+
const res = translate('<Div>test content</Div>');
124+
expect(res).toBe('test content');
125+
});
126+
127+
test(`handles mixed-case <P> tag`, () => {
128+
const res = translate('<P>Hello</P>');
129+
expect(res).toBe('Hello');
130+
});
131+
132+
test(`handles mixed-case <pArAgRaPh> tag`, () => {
133+
const res = translate('<pArAgRaPh>Strange case</pArAgRaPh>');
134+
expect(res).toBe('Strange case');
135+
});
136+
137+
test(`handles mixed-case formatting tags`, () => {
138+
const res = translate('<Strong>Bold</Strong> and <Em>Italic</Em>');
139+
expect(res).toBe('**Bold** and _Italic_');
140+
});
141+
142+
test(`handles mixed-case <Hr> tag`, () => {
143+
const res = translate('Before<Hr>After');
144+
expect(res).toBe('Before\n\n---\n\nAfter');
145+
});
146+
147+
test(`handles mixed-case list tags`, () => {
148+
const res = translate('<Ul><Li>Item 1</Li><Li>Item 2</Li></Ul>');
149+
expect(res).toBe('* Item 1\n* Item 2');
150+
});
151+
152+
test(`handles mixed-case heading tags`, () => {
153+
const res = translate('<H1>Title</H1><H2>Subtitle</H2>');
154+
expect(res).toBe('# Title\n\n## Subtitle');
155+
});
156+
157+
test(`handles completely lowercase tags`, () => {
158+
const res = translate('<br><div>content</div><strong>bold</strong>');
159+
expect(res).toBe(' \n\ncontent\n\n**bold**');
160+
});
161+
162+
test(`handles mixed-case nested tags without data loss`, () => {
163+
const res = translate('<Div><P>Paragraph 1</P><Br><P>Paragraph 2</P></Div>');
164+
expect(res).toBe('Paragraph 1\n\n \nParagraph 2');
165+
});
166+
});
104167
});

0 commit comments

Comments
 (0)