Skip to content
4 changes: 2 additions & 2 deletions server/src/browser-management/classes/RemoteBrowser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -550,9 +550,9 @@ export class RemoteBrowser {

try {
const blocker = await PlaywrightBlocker.fromLists(fetch, ['https://easylist.to/easylist/easylist.txt']);
await blocker.enableBlockingInPage(this.currentPage);
await blocker.enableBlockingInPage(this.currentPage as any);
this.client = await this.currentPage.context().newCDPSession(this.currentPage);
await blocker.disableBlockingInPage(this.currentPage);
await blocker.disableBlockingInPage(this.currentPage as any);
console.log('Adblocker initialized');
} catch (error: any) {
console.warn('Failed to initialize adblocker, continuing without it:', error.message);
Expand Down
178 changes: 84 additions & 94 deletions server/src/markdownify/scrape.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { connectToRemoteBrowser } from "../browser-management/browserConnection";
import { Page } from "playwright-core";
import { parseMarkdown } from "./markdown";
import logger from "../logger";

Expand All @@ -21,115 +21,105 @@ async function gotoWithFallback(page: any, url: string) {
* Fetches a webpage, strips scripts/styles/images/etc,
* returns clean Markdown using parser.
* @param url - The URL to convert
* @param existingPage - Optional existing Playwright page instance to reuse
* @param page - Existing Playwright page instance to use
*/
export async function convertPageToMarkdown(url: string): Promise<string> {
const browser = await connectToRemoteBrowser();
const page = await browser.newPage();

await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });

const cleanedHtml = await page.evaluate(() => {
const selectors = [
"script",
"style",
"link[rel='stylesheet']",
"noscript",
"meta",
"svg",
"img",
"picture",
"source",
"video",
"audio",
"iframe",
"object",
"embed"
];

selectors.forEach(sel => {
document.querySelectorAll(sel).forEach(e => e.remove());
});
export async function convertPageToMarkdown(url: string, page: Page): Promise<string> {
try {
logger.log('info', `[Scrape] Using existing page instance for markdown conversion of ${url}`);

await gotoWithFallback(page, url);

const cleanedHtml = await page.evaluate(() => {
const selectors = [
"script",
"style",
"link[rel='stylesheet']",
"noscript",
"meta",
"svg",
"img",
"picture",
"source",
"video",
"audio",
"iframe",
"object",
"embed"
];

selectors.forEach(sel => {
document.querySelectorAll(sel).forEach(e => e.remove());
});

// Remove inline event handlers (onclick, onload…)
const all = document.querySelectorAll("*");
all.forEach(el => {
[...el.attributes].forEach(attr => {
if (attr.name.startsWith("on")) {
el.removeAttribute(attr.name);
}
const all = document.querySelectorAll("*");
all.forEach(el => {
[...el.attributes].forEach(attr => {
if (attr.name.startsWith("on")) {
el.removeAttribute(attr.name);
}
});
});
});

return document.documentElement.outerHTML;
});
return document.documentElement.outerHTML;
});

if (shouldCloseBrowser && browser) {
logger.log('info', `[Scrape] Closing browser instance created for markdown conversion`);
await browser.close();
} else {
logger.log('info', `[Scrape] Keeping existing browser instance open after markdown conversion`);
const markdown = await parseMarkdown(cleanedHtml, url);
return markdown;
} catch (error: any) {
logger.error(`[Scrape] Error during markdown conversion: ${error.message}`);
throw error;
}

// Convert cleaned HTML → Markdown
const markdown = await parseMarkdown(cleanedHtml, url);
return markdown;
}

/**
* Fetches a webpage, strips scripts/styles/images/etc,
* returns clean HTML.
* @param url - The URL to convert
* @param existingPage - Optional existing Playwright page instance to reuse
* @param page - Existing Playwright page instance to use
*/
export async function convertPageToHTML(url: string): Promise<string> {
const browser = await connectToRemoteBrowser();
const page = await browser.newPage();

await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });

const cleanedHtml = await page.evaluate(() => {
const selectors = [
"script",
"style",
"link[rel='stylesheet']",
"noscript",
"meta",
"svg",
"img",
"picture",
"source",
"video",
"audio",
"iframe",
"object",
"embed"
];

selectors.forEach(sel => {
document.querySelectorAll(sel).forEach(e => e.remove());
});
export async function convertPageToHTML(url: string, page: Page): Promise<string> {
try {
logger.log('info', `[Scrape] Using existing page instance for HTML conversion of ${url}`);

await gotoWithFallback(page, url);

const cleanedHtml = await page.evaluate(() => {
const selectors = [
"script",
"style",
"link[rel='stylesheet']",
"noscript",
"meta",
"svg",
"img",
"picture",
"source",
"video",
"audio",
"iframe",
"object",
"embed"
];

selectors.forEach(sel => {
document.querySelectorAll(sel).forEach(e => e.remove());
});

// Remove inline event handlers (onclick, onload…)
const all = document.querySelectorAll("*");
all.forEach(el => {
[...el.attributes].forEach(attr => {
if (attr.name.startsWith("on")) {
el.removeAttribute(attr.name);
}
const all = document.querySelectorAll("*");
all.forEach(el => {
[...el.attributes].forEach(attr => {
if (attr.name.startsWith("on")) {
el.removeAttribute(attr.name);
}
});
});
});

return document.documentElement.outerHTML;
});
return document.documentElement.outerHTML;
});

if (shouldCloseBrowser && browser) {
logger.log('info', `[Scrape] Closing browser instance created for HTML conversion`);
await browser.close();
} else {
logger.log('info', `[Scrape] Keeping existing browser instance open after HTML conversion`);
return cleanedHtml;
} catch (error: any) {
logger.error(`[Scrape] Error during HTML conversion: ${error.message}`);
throw error;
}

// Return cleaned HTML directly
return cleanedHtml;
}
2 changes: 0 additions & 2 deletions server/src/routes/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ import { router as record } from './record';
import { router as workflow } from './workflow';
import { router as storage } from './storage';
import { router as auth } from './auth';
import { router as integration } from './integration';
import { router as proxy } from './proxy';
import { router as webhook } from './webhook';

Expand All @@ -11,7 +10,6 @@ export {
workflow,
storage,
auth,
integration,
proxy,
webhook
};
1 change: 0 additions & 1 deletion server/src/routes/storage.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ import { encrypt, decrypt } from '../utils/auth';
import { WorkflowFile } from 'maxun-core';
import { cancelScheduledWorkflow, scheduleWorkflow } from '../storage/schedule';
import { pgBossClient } from '../storage/pgboss';
chromium.use(stealthPlugin());

export const router = Router();

Expand Down
3 changes: 1 addition & 2 deletions server/src/server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import { Server } from "socket.io";
import cors from 'cors';
import dotenv from 'dotenv';
dotenv.config();
import { record, workflow, storage, auth, integration, proxy, webhook } from './routes';
import { record, workflow, storage, auth, proxy, webhook } from './routes';
import { BrowserPool } from "./browser-management/classes/BrowserPool";
import logger from './logger';
import sequelize, { connectDB, syncDB } from './storage/db'
Expand Down Expand Up @@ -107,7 +107,6 @@ app.use('/record', record);
app.use('/workflow', workflow);
app.use('/storage', storage);
app.use('/auth', auth);
app.use('/integration', integration);
app.use('/proxy', proxy);
app.use('/api-docs', swaggerUi.serve, swaggerUi.setup(swaggerSpec));

Expand Down