Skip to content

Commit 14f2c52

Browse files
Page collection support shadow dom and frames (#2002)
* Support shadow DOM
* Add depth filter and refactor to settings object
* Add typing
* Add setting to control iframe inclusion
* Lint fix
* Check with feature setting enabled
1 parent 67f6598 commit 14f2c52

File tree

1 file changed

+86
-14
lines changed

1 file changed

+86
-14
lines changed

injected/src/features/page-context.js

Lines changed: 86 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -30,36 +30,84 @@ function isHtmlElement(node) {
3030
return node.nodeType === Node.ELEMENT_NODE;
3131
}
3232

33+
/**
 * Return an iframe's content document when it is reachable from this context.
 *
 * For a cross-origin frame `contentDocument` normally evaluates to `null`
 * rather than throwing; the try/catch is kept as a defensive measure so that
 * any access error is also reported as "not accessible".
 *
 * @param {HTMLIFrameElement} iframe
 * @returns {Document | null} The document if it exists and has a root element, otherwise null.
 */
function getSameOriginIframeDocument(iframe) {
    try {
        const doc = iframe.contentDocument;
        // Only hand back documents that actually have a root element to walk.
        return doc && doc.documentElement ? doc : null;
    } catch (e) {
        // Access was refused (e.g. a security error) - treat as inaccessible.
        return null;
    }
}
51+
52+
/**
 * Stringify a list of child nodes to markdown by concatenating the result of
 * `domToMarkdown` for each child, in order.
 *
 * Returns an empty string when `depth` exceeds `settings.maxDepth`. As soon as
 * the accumulated output is longer than `settings.maxLength` it is cut to
 * `maxLength` characters, suffixed with '...', and the remaining children are
 * skipped.
 *
 * NOTE(review): each child is converted at `depth + 1`, and `domToMarkdown`
 * itself calls this helper with `depth + 1`, so one DOM level advances the
 * depth counter by two - confirm that `maxDepth` is meant to be counted
 * that way.
 *
 * @param {NodeListOf<ChildNode>} childNodes
 * @param {DomToMarkdownSettings} settings
 * @param {number} depth
 * @returns {string}
 */
function domToMarkdownChildren(childNodes, settings, depth = 0) {
    if (depth > settings.maxDepth) {
        return '';
    }
    let children = '';
    for (const childNode of childNodes) {
        children += domToMarkdown(childNode, settings, depth + 1);
        // Stop early once over budget so large subtrees are not fully serialised.
        if (children.length > settings.maxLength) {
            children = children.substring(0, settings.maxLength) + '...';
            break;
        }
    }
    return children;
}
74+
75+
/**
76+
* @typedef {Object} DomToMarkdownSettings
77+
* @property {number} maxLength - Maximum length of content
78+
* @property {number} maxDepth - Maximum depth to traverse
79+
* @property {string} excludeSelectors - CSS selectors to exclude from processing
80+
* @property {boolean} includeIframes - Whether to include iframe content
81+
*/
82+
3383
/**
3484
* Convert a DOM node to markdown
3585
* @param {Node} node
36-
* @param {number} maxLength
37-
* @param {string} excludeSelectors
86+
* @param {DomToMarkdownSettings} settings
87+
* @param {number} depth
3888
* @returns {string}
3989
*/
40-
function domToMarkdown(node, maxLength = Infinity, excludeSelectors) {
90+
function domToMarkdown(node, settings, depth = 0) {
91+
if (depth > settings.maxDepth) {
92+
return '';
93+
}
4194
if (node.nodeType === Node.TEXT_NODE) {
4295
return collapseWhitespace(node.textContent);
4396
}
4497
if (!isHtmlElement(node)) {
4598
return '';
4699
}
47-
if (!checkNodeIsVisible(node) || node.matches(excludeSelectors)) {
100+
if (!checkNodeIsVisible(node) || node.matches(settings.excludeSelectors)) {
48101
return '';
49102
}
50103

51104
const tag = node.tagName.toLowerCase();
52105

53106
// Build children string incrementally to exit early when maxLength is exceeded
54-
let children = '';
55-
for (const childNode of node.childNodes) {
56-
const childContent = domToMarkdown(childNode, maxLength - children.length, excludeSelectors);
57-
children += childContent;
107+
let children = domToMarkdownChildren(node.childNodes, settings, depth + 1);
58108

59-
if (children.length > maxLength) {
60-
children = children.substring(0, maxLength) + '...';
61-
break;
62-
}
109+
if (node.shadowRoot) {
110+
children += domToMarkdownChildren(node.shadowRoot.childNodes, settings, depth + 1);
63111
}
64112

65113
switch (tag) {
@@ -85,6 +133,19 @@ function domToMarkdown(node, maxLength = Infinity, excludeSelectors) {
85133
return `\n- ${children.trim()}\n`;
86134
case 'a':
87135
return getLinkText(node);
136+
case 'iframe': {
137+
if (!settings.includeIframes) {
138+
return children;
139+
}
140+
// Try to access same-origin iframe content
141+
const iframeDoc = getSameOriginIframeDocument(/** @type {HTMLIFrameElement} */ (node));
142+
if (iframeDoc && iframeDoc.body) {
143+
const iframeContent = domToMarkdown(iframeDoc.body, settings, depth + 1);
144+
return iframeContent ? `\n\n--- Iframe Content ---\n${iframeContent}\n--- End Iframe ---\n\n` : children;
145+
}
146+
// If we can't access the iframe content (cross-origin), return the children or empty string
147+
return children;
148+
}
88149
default:
89150
return children;
90151
}
@@ -355,6 +416,8 @@ export default class PageContext extends ContentFeature {
355416
const maxLength = this.getFeatureSetting('maxContentLength') || 9500;
356417
// Used to avoid large content serialization
357418
const upperLimit = this.getFeatureSetting('upperLimit') || 500000;
419+
// We should refactor to use iteration but for now this just caps overflow.
420+
const maxDepth = this.getFeatureSetting('maxDepth') || 5000;
358421
let excludeSelectors = this.getFeatureSetting('excludeSelectors') || ['.ad', '.sidebar', '.footer', '.nav', '.header'];
359422
const excludedInertElements = this.getFeatureSetting('excludedInertElements') || [
360423
'script',
@@ -380,7 +443,12 @@ export default class PageContext extends ContentFeature {
380443

381444
if (contentRoot) {
382445
this.log.info('Getting main content', contentRoot);
383-
content += domToMarkdown(contentRoot, upperLimit, excludeSelectorsString);
446+
content += domToMarkdown(contentRoot, {
447+
maxLength: upperLimit,
448+
maxDepth,
449+
includeIframes: this.getFeatureSettingEnabled('includeIframes', 'enabled'),
450+
excludeSelectors: excludeSelectorsString,
451+
});
384452
this.log.info('Content markdown', content, contentRoot);
385453
}
386454
content = content.trim();
@@ -390,7 +458,11 @@ export default class PageContext extends ContentFeature {
390458

391459
// Limit content length
392460
if (content.length > maxLength) {
393-
this.log.info('Truncating content', content);
461+
this.log.info('Truncating content', {
462+
content,
463+
contentLength: content.length,
464+
maxLength,
465+
});
394466
content = content.substring(0, maxLength) + '...';
395467
}
396468

0 commit comments

Comments
 (0)