Skip to content

Commit 67f6598

Browse files
Hidden page context removal (#2000)
* Remove hidden content from collection
* Pass a query string for exclusions
* Fix type guard
1 parent e62b752 commit 67f6598

File tree

1 file changed

+41
-17
lines changed

1 file changed

+41
-17
lines changed

injected/src/features/page-context.js

Lines changed: 41 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,48 @@ import { getFaviconList } from './favicon.js';
33
import { isDuckAi, isBeingFramed, getTabUrl } from '../utils.js';
44
const MSG_PAGE_CONTEXT_RESPONSE = 'collectionResult';
55

6+
/**
 * Report whether an element is rendered visibly, judged by its computed style.
 *
 * An element is treated as hidden when its computed `display` is `none`, its
 * computed `visibility` is `hidden`, or its computed `opacity` parses to
 * exactly 0. Only the element's own computed style is consulted here.
 *
 * @param {Element} node - Element whose computed style is inspected.
 * @returns {boolean} `true` when none of the hidden conditions hold; `false`
 *   when one of them does, or when the computed style cannot be read.
 */
function checkNodeIsVisible(node) {
    try {
        const { display, visibility, opacity } = window.getComputedStyle(node);

        // Check primary visibility properties
        const isHidden = display === 'none' || visibility === 'hidden' || parseFloat(opacity) === 0;
        return !isHidden;
    } catch (e) {
        // Fail closed: a node whose style cannot be computed is reported as
        // not visible rather than letting the error propagate to the caller.
        return false;
    }
}
19+
620
/**
 * Collapse every run of whitespace in a string into a single space.
 *
 * @param {unknown} str - Value to normalise.
 * @returns {string} The input with each whitespace run replaced by one space,
 *   or an empty string when the input is not a string.
 */
function collapseWhitespace(str) {
    if (typeof str !== 'string') {
        return '';
    }
    return str.replace(/\s+/g, ' ');
}
923

10-
function domToMarkdown(node, maxLength = Infinity) {
24+
/**
 * Check if a node is an HTML element.
 *
 * The runtime test is `nodeType === Node.ELEMENT_NODE`.
 * NOTE(review): ELEMENT_NODE is also reported by non-HTML elements (e.g. SVG),
 * so the `HTMLElement` narrowing below is wider than the name suggests —
 * confirm callers only rely on members shared by all `Element`s.
 *
 * @param {Node} node - Node to test.
 * @returns {node is HTMLElement} `true` when the node's type is ELEMENT_NODE.
 */
function isHtmlElement(node) {
    return node.nodeType === Node.ELEMENT_NODE;
}
32+
33+
/**
34+
* Convert a DOM node to markdown
35+
* @param {Node} node
36+
* @param {number} maxLength
37+
* @param {string} excludeSelectors
38+
* @returns {string}
39+
*/
40+
function domToMarkdown(node, maxLength = Infinity, excludeSelectors) {
1141
if (node.nodeType === Node.TEXT_NODE) {
1242
return collapseWhitespace(node.textContent);
1343
}
14-
if (node.nodeType !== Node.ELEMENT_NODE) {
44+
if (!isHtmlElement(node)) {
45+
return '';
46+
}
47+
if (!checkNodeIsVisible(node) || node.matches(excludeSelectors)) {
1548
return '';
1649
}
1750

@@ -20,7 +53,7 @@ function domToMarkdown(node, maxLength = Infinity) {
2053
// Build children string incrementally to exit early when maxLength is exceeded
2154
let children = '';
2255
for (const childNode of node.childNodes) {
23-
const childContent = domToMarkdown(childNode, maxLength - children.length);
56+
const childContent = domToMarkdown(childNode, maxLength - children.length, excludeSelectors);
2457
children += childContent;
2558

2659
if (children.length > maxLength) {
@@ -333,6 +366,7 @@ export default class PageContext extends ContentFeature {
333366
'canvas',
334367
];
335368
excludeSelectors = excludeSelectors.concat(excludedInertElements);
369+
const excludeSelectorsString = excludeSelectors.join(',');
336370

337371
let content = '';
338372
// Get content from main content areas
@@ -346,18 +380,8 @@ export default class PageContext extends ContentFeature {
346380

347381
if (contentRoot) {
348382
this.log.info('Getting main content', contentRoot);
349-
// Create a clone to work with
350-
const clone = /** @type {Element} */ (contentRoot.cloneNode(true));
351-
352-
// Remove excluded elements
353-
excludeSelectors.forEach((selector) => {
354-
const elements = clone.querySelectorAll(selector);
355-
elements.forEach((el) => el.remove());
356-
});
357-
358-
this.log.info('Calling domToMarkdown', clone.innerHTML);
359-
content += domToMarkdown(clone, upperLimit);
360-
this.log.info('Content markdown', content, clone, contentRoot);
383+
content += domToMarkdown(contentRoot, upperLimit, excludeSelectorsString);
384+
this.log.info('Content markdown', content, contentRoot);
361385
}
362386
content = content.trim();
363387

@@ -375,8 +399,8 @@ export default class PageContext extends ContentFeature {
375399

376400
getHeadings() {
377401
const headings = [];
378-
const headdingSelector = this.getFeatureSetting('headingSelector') || 'h1, h2, h3, h4, h5, h6';
379-
const headingElements = document.querySelectorAll(headdingSelector);
402+
const headingSelector = this.getFeatureSetting('headingSelector') || 'h1, h2, h3, h4, h5, h6';
403+
const headingElements = document.querySelectorAll(headingSelector);
380404

381405
headingElements.forEach((heading) => {
382406
const level = parseInt(heading.tagName.charAt(1));

0 commit comments

Comments
 (0)