@@ -30,36 +30,84 @@ function isHtmlElement(node) {
3030 return node . nodeType === Node . ELEMENT_NODE ;
3131}
3232
/**
 * Resolve the document hosted inside an iframe, provided the embedding page can read it.
 *
 * Both the `contentDocument` lookup and the `documentElement` probe run inside a
 * single try/catch, so any exception raised while touching the frame (for example
 * an access denial on a cross-origin frame) is reported as "not accessible"
 * instead of propagating to the caller.
 *
 * @param {HTMLIFrameElement} iframe - The iframe element to inspect.
 * @returns {Document | null} The iframe's document when it is readable and has a
 *   root element; `null` when it is missing, empty, or inaccessible.
 */
function getSameOriginIframeDocument(iframe) {
    let accessibleDocument = null;
    try {
        const candidate = iframe.contentDocument;
        // A document without a root element has nothing to serialise, so it is
        // treated the same as an inaccessible one.
        const hasRootElement = Boolean(candidate && candidate.documentElement);
        if (hasRootElement) {
            accessibleDocument = candidate;
        }
    } catch (e) {
        // The frame could not be read (e.g. cross-origin restrictions).
        accessibleDocument = null;
    }
    return accessibleDocument;
}
51+
/**
 * Serialise a list of child nodes to markdown by concatenating the markdown of
 * each child in order.
 *
 * Traversal is bounded in two ways:
 *  - if `depth` is greater than `settings.maxDepth`, nothing is serialised;
 *  - as soon as the accumulated text is longer than `settings.maxLength`, it is
 *    cut to `maxLength` characters, suffixed with `...`, and the remaining
 *    siblings are not visited.
 *
 * @param {NodeListOf<ChildNode>} childNodes - Nodes to serialise, in document order.
 * @param {DomToMarkdownSettings} settings - Traversal limits and options, passed through to `domToMarkdown`.
 * @param {number} depth - Current traversal depth; each child is serialised at `depth + 1`.
 * @returns {string} The concatenated (possibly truncated) markdown.
 */
function domToMarkdownChildren(childNodes, settings, depth = 0) {
    const tooDeep = depth > settings.maxDepth;
    if (tooDeep) {
        return '';
    }

    const nextDepth = depth + 1;
    let markdown = '';
    for (const child of childNodes) {
        markdown = markdown + domToMarkdown(child, settings, nextDepth);
        // Over budget: truncate and stop visiting further siblings.
        if (markdown.length > settings.maxLength) {
            return `${markdown.substring(0, settings.maxLength)}...`;
        }
    }
    return markdown;
}
74+
75+ /**
76+ * @typedef {Object } DomToMarkdownSettings
77+ * @property {number } maxLength - Maximum length of content
78+ * @property {number } maxDepth - Maximum depth to traverse
79+ * @property {string } excludeSelectors - CSS selectors to exclude from processing
80+ * @property {boolean } includeIframes - Whether to include iframe content
81+ */
82+
3383/**
3484 * Convert a DOM node to markdown
3585 * @param {Node } node
36- * @param {number } maxLength
37- * @param {string } excludeSelectors
86+ * @param {DomToMarkdownSettings } settings
87+ * @param {number } depth
3888 * @returns {string }
3989 */
40- function domToMarkdown ( node , maxLength = Infinity , excludeSelectors ) {
90+ function domToMarkdown ( node , settings , depth = 0 ) {
91+ if ( depth > settings . maxDepth ) {
92+ return '' ;
93+ }
4194 if ( node . nodeType === Node . TEXT_NODE ) {
4295 return collapseWhitespace ( node . textContent ) ;
4396 }
4497 if ( ! isHtmlElement ( node ) ) {
4598 return '' ;
4699 }
47- if ( ! checkNodeIsVisible ( node ) || node . matches ( excludeSelectors ) ) {
100+ if ( ! checkNodeIsVisible ( node ) || node . matches ( settings . excludeSelectors ) ) {
48101 return '' ;
49102 }
50103
51104 const tag = node . tagName . toLowerCase ( ) ;
52105
53106 // Build children string incrementally to exit early when maxLength is exceeded
54- let children = '' ;
55- for ( const childNode of node . childNodes ) {
56- const childContent = domToMarkdown ( childNode , maxLength - children . length , excludeSelectors ) ;
57- children += childContent ;
107+ let children = domToMarkdownChildren ( node . childNodes , settings , depth + 1 ) ;
58108
59- if ( children . length > maxLength ) {
60- children = children . substring ( 0 , maxLength ) + '...' ;
61- break ;
62- }
109+ if ( node . shadowRoot ) {
110+ children += domToMarkdownChildren ( node . shadowRoot . childNodes , settings , depth + 1 ) ;
63111 }
64112
65113 switch ( tag ) {
@@ -85,6 +133,19 @@ function domToMarkdown(node, maxLength = Infinity, excludeSelectors) {
85133 return `\n- ${ children . trim ( ) } \n` ;
86134 case 'a' :
87135 return getLinkText ( node ) ;
136+ case 'iframe' : {
137+ if ( ! settings . includeIframes ) {
138+ return children ;
139+ }
140+ // Try to access same-origin iframe content
141+ const iframeDoc = getSameOriginIframeDocument ( /** @type {HTMLIFrameElement } */ ( node ) ) ;
142+ if ( iframeDoc && iframeDoc . body ) {
143+ const iframeContent = domToMarkdown ( iframeDoc . body , settings , depth + 1 ) ;
144+ return iframeContent ? `\n\n--- Iframe Content ---\n${ iframeContent } \n--- End Iframe ---\n\n` : children ;
145+ }
146+ // If we can't access the iframe content (cross-origin), return the children or empty string
147+ return children ;
148+ }
88149 default :
89150 return children ;
90151 }
@@ -355,6 +416,8 @@ export default class PageContext extends ContentFeature {
355416 const maxLength = this . getFeatureSetting ( 'maxContentLength' ) || 9500 ;
356417 // Used to avoid large content serialization
357418 const upperLimit = this . getFeatureSetting ( 'upperLimit' ) || 500000 ;
419+ // We should refactor to use iteration but for now this just caps overflow.
420+ const maxDepth = this . getFeatureSetting ( 'maxDepth' ) || 5000 ;
358421 let excludeSelectors = this . getFeatureSetting ( 'excludeSelectors' ) || [ '.ad' , '.sidebar' , '.footer' , '.nav' , '.header' ] ;
359422 const excludedInertElements = this . getFeatureSetting ( 'excludedInertElements' ) || [
360423 'script' ,
@@ -380,7 +443,12 @@ export default class PageContext extends ContentFeature {
380443
381444 if ( contentRoot ) {
382445 this . log . info ( 'Getting main content' , contentRoot ) ;
383- content += domToMarkdown ( contentRoot , upperLimit , excludeSelectorsString ) ;
446+ content += domToMarkdown ( contentRoot , {
447+ maxLength : upperLimit ,
448+ maxDepth,
449+ includeIframes : this . getFeatureSettingEnabled ( 'includeIframes' , 'enabled' ) ,
450+ excludeSelectors : excludeSelectorsString ,
451+ } ) ;
384452 this . log . info ( 'Content markdown' , content , contentRoot ) ;
385453 }
386454 content = content . trim ( ) ;
@@ -390,7 +458,11 @@ export default class PageContext extends ContentFeature {
390458
391459 // Limit content length
392460 if ( content . length > maxLength ) {
393- this . log . info ( 'Truncating content' , content ) ;
461+ this . log . info ( 'Truncating content' , {
462+ content,
463+ contentLength : content . length ,
464+ maxLength,
465+ } ) ;
394466 content = content . substring ( 0 , maxLength ) + '...' ;
395467 }
396468
0 commit comments