1- import { connectToRemoteBrowser } from "../browser-management/browserConnection " ;
1+ import { Page } from "playwright-core " ;
22import { parseMarkdown } from "./markdown" ;
33import logger from "../logger" ;
44
@@ -21,115 +21,105 @@ async function gotoWithFallback(page: any, url: string) {
/**
 * Fetches a webpage, strips scripts/styles/images/etc,
 * returns clean Markdown using parser.
 *
 * The supplied page is navigated with `gotoWithFallback`, cleaned in place
 * (the removals mutate the live DOM of that page) and its serialized HTML is
 * handed to `parseMarkdown`. This function does not close the page.
 *
 * @param url - The URL to convert
 * @param page - Existing Playwright page instance to use
 * @returns The Markdown produced by `parseMarkdown` for the cleaned HTML
 * @throws Rethrows any error from navigation, evaluation or parsing after logging it
 */
export async function convertPageToMarkdown(url: string, page: Page): Promise<string> {
  try {
    logger.log('info', `[Scrape] Using existing page instance for markdown conversion of ${url}`);

    await gotoWithFallback(page, url);

    // The callback is executed inside the page, so it cannot close over
    // anything from this module; the selector list has to live inside it.
    const cleanedHtml = await page.evaluate(() => {
      const selectors = [
        "script",
        "style",
        "link[rel='stylesheet']",
        "noscript",
        "meta",
        "svg",
        "img",
        "picture",
        "source",
        "video",
        "audio",
        "iframe",
        "object",
        "embed"
      ];

      selectors.forEach(sel => {
        document.querySelectorAll(sel).forEach(e => e.remove());
      });

      // Drop every attribute whose name starts with "on" (onclick, onload…).
      // The attribute list is copied first because it is mutated while walking.
      document.querySelectorAll("*").forEach(el => {
        Array.from(el.attributes).forEach(attr => {
          if (attr.name.startsWith("on")) {
            el.removeAttribute(attr.name);
          }
        });
      });

      return document.documentElement.outerHTML;
    });

    // Awaited inside the try so a parser rejection is logged below as well.
    const markdown = await parseMarkdown(cleanedHtml, url);
    return markdown;
  } catch (error: unknown) {
    // Anything can be thrown; only Error instances are known to carry a message.
    const message = error instanceof Error ? error.message : String(error);
    logger.error(`[Scrape] Error during markdown conversion: ${message}`);
    throw error;
  }
}
7873
/**
 * Fetches a webpage, strips scripts/styles/images/etc,
 * returns clean HTML.
 *
 * The supplied page is navigated with `gotoWithFallback` and cleaned in place
 * (the removals mutate the live DOM of that page); the result is the
 * serialized `document.documentElement.outerHTML` after cleaning. This
 * function does not close the page.
 *
 * @param url - The URL to convert
 * @param page - Existing Playwright page instance to use
 * @returns The cleaned HTML of the whole document
 * @throws Rethrows any error from navigation or evaluation after logging it
 */
export async function convertPageToHTML(url: string, page: Page): Promise<string> {
  try {
    logger.log('info', `[Scrape] Using existing page instance for HTML conversion of ${url}`);

    await gotoWithFallback(page, url);

    // The callback is executed inside the page, so it cannot close over
    // anything from this module; the selector list has to live inside it.
    const cleanedHtml = await page.evaluate(() => {
      const selectors = [
        "script",
        "style",
        "link[rel='stylesheet']",
        "noscript",
        "meta",
        "svg",
        "img",
        "picture",
        "source",
        "video",
        "audio",
        "iframe",
        "object",
        "embed"
      ];

      selectors.forEach(sel => {
        document.querySelectorAll(sel).forEach(e => e.remove());
      });

      // Drop every attribute whose name starts with "on" (onclick, onload…).
      // The attribute list is copied first because it is mutated while walking.
      document.querySelectorAll("*").forEach(el => {
        Array.from(el.attributes).forEach(attr => {
          if (attr.name.startsWith("on")) {
            el.removeAttribute(attr.name);
          }
        });
      });

      return document.documentElement.outerHTML;
    });

    return cleanedHtml;
  } catch (error: unknown) {
    // Anything can be thrown; only Error instances are known to carry a message.
    const message = error instanceof Error ? error.message : String(error);
    logger.error(`[Scrape] Error during HTML conversion: ${message}`);
    throw error;
  }
}
0 commit comments