diff --git a/Dockerfile.backend b/Dockerfile.backend index 85ee4b83c..d0143a3df 100644 --- a/Dockerfile.backend +++ b/Dockerfile.backend @@ -1,4 +1,4 @@ -FROM --platform=$BUILDPLATFORM mcr.microsoft.com/playwright:v1.46.0-noble +FROM --platform=$BUILDPLATFORM node:20-slim # Set working directory WORKDIR /app @@ -18,31 +18,6 @@ COPY server/tsconfig.json ./server/ # Install dependencies RUN npm install --legacy-peer-deps -# Create the Chromium data directory with necessary permissions -RUN mkdir -p /tmp/chromium-data-dir && \ - chmod -R 777 /tmp/chromium-data-dir - -# Install dependencies -RUN apt-get update && apt-get install -y \ - libgbm1 \ - libnss3 \ - libatk1.0-0 \ - libatk-bridge2.0-0 \ - libdrm2 \ - libxkbcommon0 \ - libglib2.0-0 \ - libdbus-1-3 \ - libx11-xcb1 \ - libxcb1 \ - libxcomposite1 \ - libxcursor1 \ - libxdamage1 \ - libxext6 \ - libxi6 \ - libxtst6 \ - && rm -rf /var/lib/apt/lists/* \ - && mkdir -p /tmp/.X11-unix && chmod 1777 /tmp/.X11-unix - # Expose backend port EXPOSE ${BACKEND_PORT:-8080} diff --git a/browser/Dockerfile b/browser/Dockerfile index 9f2ea8385..7dd270338 100644 --- a/browser/Dockerfile +++ b/browser/Dockerfile @@ -6,7 +6,7 @@ WORKDIR /app COPY browser/package*.json ./ # Install dependencies -RUN npm ci +RUN npm install # Copy TypeScript source and config COPY browser/server.ts ./ diff --git a/browser/server.ts b/browser/server.ts index 2a70beef2..e12cd79ad 100644 --- a/browser/server.ts +++ b/browser/server.ts @@ -11,6 +11,7 @@ let browserServer: BrowserServer | null = null; // Configurable ports with defaults const BROWSER_WS_PORT = parseInt(process.env.BROWSER_WS_PORT || '3001', 10); const BROWSER_HEALTH_PORT = parseInt(process.env.BROWSER_HEALTH_PORT || '3002', 10); +const BROWSER_WS_HOST = process.env.BROWSER_WS_HOST || 'localhost'; async function start(): Promise { console.log('Starting Maxun Browser Service...'); @@ -44,17 +45,19 @@ async function start(): Promise { // Health check HTTP server const healthServer = http.createServer((req, res) => { if (req.url === '/health') { + const wsEndpoint = browserServer?.wsEndpoint().replace('localhost', BROWSER_WS_HOST) || ''; res.writeHead(200, { 'Content-Type': 'application/json' }); res.end(JSON.stringify({ status: 'healthy', - wsEndpoint: browserServer?.wsEndpoint(), + wsEndpoint, wsPort: BROWSER_WS_PORT, healthPort: BROWSER_HEALTH_PORT, timestamp: new Date().toISOString() })); } else if (req.url === '/') { res.writeHead(200, { 'Content-Type': 'text/plain' }); - res.end(`Maxun Browser Service\nWebSocket: ${browserServer?.wsEndpoint()}\nHealth: http://localhost:${BROWSER_HEALTH_PORT}/health`); + const wsEndpoint = browserServer?.wsEndpoint().replace('localhost', BROWSER_WS_HOST) || ''; + res.end(`Maxun Browser Service\nWebSocket: ${wsEndpoint}\nHealth: http://localhost:${BROWSER_HEALTH_PORT}/health`); } else { res.writeHead(404); res.end('Not Found'); diff --git a/docker-compose.yml b/docker-compose.yml index dbb147b79..700303fd6 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -30,9 +30,9 @@ services: - minio_data:/data backend: - #build: - #context: . - #dockerfile: server/Dockerfile + # build: + # context: . + # dockerfile: Dockerfile.backend image: getmaxun/maxun-backend:latest restart: unless-stopped ports: @@ -60,9 +60,9 @@ services: - /var/run/dbus:/var/run/dbus frontend: - #build: - #context: . - #dockerfile: Dockerfile + # build: + # context: . + # dockerfile: Dockerfile.frontend image: getmaxun/maxun-frontend:latest restart: unless-stopped ports: @@ -89,6 +89,8 @@ services: - DEBUG=pw:browser* - BROWSER_WS_PORT=${BROWSER_WS_PORT:-3001} - BROWSER_HEALTH_PORT=${BROWSER_HEALTH_PORT:-3002} + - BROWSER_WS_HOST=${BROWSER_WS_HOST:-browser} + - PLAYWRIGHT_BROWSERS_PATH=/ms-playwright restart: unless-stopped healthcheck: test: ["CMD", "curl", "-f", "http://localhost:${BROWSER_HEALTH_PORT:-3002}/health"] diff --git a/maxun-core/package.json b/maxun-core/package.json index 21b51e37e..9a8c003de 100644 --- a/maxun-core/package.json +++ b/maxun-core/package.json @@ -1,6 +1,6 @@ { "name": "maxun-core", - "version": "0.0.27", + "version": "0.0.28", "description": "Core package for Maxun, responsible for data extraction", "main": "build/index.js", "typings": "build/index.d.ts", diff --git a/package.json b/package.json index 79f6f9669..41b1255cd 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "maxun", - "version": "0.0.27", + "version": "0.0.28", "author": "Maxun", "license": "AGPL-3.0-or-later", "dependencies": { @@ -52,7 +52,7 @@ "lodash": "^4.17.21", "loglevel": "^1.8.0", "loglevel-plugin-remote": "^0.6.8", - "maxun-core": "^0.0.27", + "maxun-core": "^0.0.28", "minio": "^8.0.1", "moment-timezone": "^0.5.45", "node-cron": "^3.0.3", @@ -131,4 +131,4 @@ "vite": "^5.4.10", "zod": "^3.25.62" } -} \ No newline at end of file +} diff --git a/server/src/api/record.ts b/server/src/api/record.ts index 04f4ab158..b2c2422a3 100644 --- a/server/src/api/record.ts +++ b/server/src/api/record.ts @@ -11,7 +11,7 @@ import { io, Socket } from "socket.io-client"; import { BinaryOutputService } from "../storage/mino"; import { AuthenticatedRequest } from "../routes/record" import {capture} from "../utils/analytics"; -import { Page } from "playwright"; +import { Page } from "playwright-core"; import { WorkflowFile } from "maxun-core"; import { addGoogleSheetUpdateTask, googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet"; import { addAirtableUpdateTask, airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable"; diff --git a/server/src/browser-management/classes/RemoteBrowser.ts b/server/src/browser-management/classes/RemoteBrowser.ts index 266a0978c..41a59176e 100644 --- a/server/src/browser-management/classes/RemoteBrowser.ts +++ b/server/src/browser-management/classes/RemoteBrowser.ts @@ -550,9 +550,9 @@ export class RemoteBrowser { try { const blocker = await PlaywrightBlocker.fromLists(fetch, ['https://easylist.to/easylist/easylist.txt']); - await blocker.enableBlockingInPage(this.currentPage); + await blocker.enableBlockingInPage(this.currentPage as any); this.client = await this.currentPage.context().newCDPSession(this.currentPage); - await blocker.disableBlockingInPage(this.currentPage); + await blocker.disableBlockingInPage(this.currentPage as any); console.log('Adblocker initialized'); } catch (error: any) { console.warn('Failed to initialize adblocker, continuing without it:', error.message); diff --git a/server/src/markdownify/scrape.ts b/server/src/markdownify/scrape.ts index 52ae19bf5..09df42767 100644 --- a/server/src/markdownify/scrape.ts +++ b/server/src/markdownify/scrape.ts @@ -1,4 +1,4 @@ -import { connectToRemoteBrowser } from "../browser-management/browserConnection"; +import { Page } from "playwright-core"; import { parseMarkdown } from "./markdown"; import logger from "../logger"; @@ -21,115 +21,105 @@ async function gotoWithFallback(page: any, url: string) { * Fetches a webpage, strips scripts/styles/images/etc, * returns clean Markdown using parser. * @param url - The URL to convert - * @param existingPage - Optional existing Playwright page instance to reuse + * @param page - Existing Playwright page instance to use */ -export async function convertPageToMarkdown(url: string): Promise { - const browser = await connectToRemoteBrowser(); - const page = await browser.newPage(); - - await page.goto(url, { waitUntil: "networkidle", timeout: 100000 }); - - const cleanedHtml = await page.evaluate(() => { - const selectors = [ - "script", - "style", - "link[rel='stylesheet']", - "noscript", - "meta", - "svg", - "img", - "picture", - "source", - "video", - "audio", - "iframe", - "object", - "embed" - ]; - - selectors.forEach(sel => { - document.querySelectorAll(sel).forEach(e => e.remove()); - }); +export async function convertPageToMarkdown(url: string, page: Page): Promise { + try { + logger.log('info', `[Scrape] Using existing page instance for markdown conversion of ${url}`); + + await gotoWithFallback(page, url); + + const cleanedHtml = await page.evaluate(() => { + const selectors = [ + "script", + "style", + "link[rel='stylesheet']", + "noscript", + "meta", + "svg", + "img", + "picture", + "source", + "video", + "audio", + "iframe", + "object", + "embed" + ]; + + selectors.forEach(sel => { + document.querySelectorAll(sel).forEach(e => e.remove()); + }); - // Remove inline event handlers (onclick, onload…) - const all = document.querySelectorAll("*"); - all.forEach(el => { - [...el.attributes].forEach(attr => { - if (attr.name.startsWith("on")) { - el.removeAttribute(attr.name); - } + const all = document.querySelectorAll("*"); + all.forEach(el => { + [...el.attributes].forEach(attr => { + if (attr.name.startsWith("on")) { + el.removeAttribute(attr.name); + } + }); }); - }); - return document.documentElement.outerHTML; - }); + return document.documentElement.outerHTML; + }); - if (shouldCloseBrowser && browser) { - logger.log('info', `[Scrape] Closing browser instance created for markdown conversion`); - await browser.close(); - } else { - logger.log('info', `[Scrape] Keeping existing browser instance open after markdown conversion`); + const markdown = await parseMarkdown(cleanedHtml, url); + return markdown; + } catch (error: any) { + logger.error(`[Scrape] Error during markdown conversion: ${error.message}`); + throw error; } - - // Convert cleaned HTML → Markdown - const markdown = await parseMarkdown(cleanedHtml, url); - return markdown; } /** * Fetches a webpage, strips scripts/styles/images/etc, * returns clean HTML. * @param url - The URL to convert - * @param existingPage - Optional existing Playwright page instance to reuse + * @param page - Existing Playwright page instance to use */ -export async function convertPageToHTML(url: string): Promise { - const browser = await connectToRemoteBrowser(); - const page = await browser.newPage(); - - await page.goto(url, { waitUntil: "networkidle", timeout: 100000 }); - - const cleanedHtml = await page.evaluate(() => { - const selectors = [ - "script", - "style", - "link[rel='stylesheet']", - "noscript", - "meta", - "svg", - "img", - "picture", - "source", - "video", - "audio", - "iframe", - "object", - "embed" - ]; - - selectors.forEach(sel => { - document.querySelectorAll(sel).forEach(e => e.remove()); - }); +export async function convertPageToHTML(url: string, page: Page): Promise { + try { + logger.log('info', `[Scrape] Using existing page instance for HTML conversion of ${url}`); + + await gotoWithFallback(page, url); + + const cleanedHtml = await page.evaluate(() => { + const selectors = [ + "script", + "style", + "link[rel='stylesheet']", + "noscript", + "meta", + "svg", + "img", + "picture", + "source", + "video", + "audio", + "iframe", + "object", + "embed" + ]; + + selectors.forEach(sel => { + document.querySelectorAll(sel).forEach(e => e.remove()); + }); - // Remove inline event handlers (onclick, onload…) - const all = document.querySelectorAll("*"); - all.forEach(el => { - [...el.attributes].forEach(attr => { - if (attr.name.startsWith("on")) { - el.removeAttribute(attr.name); - } + const all = document.querySelectorAll("*"); + all.forEach(el => { + [...el.attributes].forEach(attr => { + if (attr.name.startsWith("on")) { + el.removeAttribute(attr.name); + } + }); }); - }); - return document.documentElement.outerHTML; - }); + return document.documentElement.outerHTML; + }); - if (shouldCloseBrowser && browser) { - logger.log('info', `[Scrape] Closing browser instance created for HTML conversion`); - await browser.close(); - } else { - logger.log('info', `[Scrape] Keeping existing browser instance open after HTML conversion`); + return cleanedHtml; + } catch (error: any) { + logger.error(`[Scrape] Error during HTML conversion: ${error.message}`); + throw error; } - - // Return cleaned HTML directly - return cleanedHtml; } diff --git a/server/src/routes/index.ts b/server/src/routes/index.ts index 3d8a36449..a6ec206fc 100644 --- a/server/src/routes/index.ts +++ b/server/src/routes/index.ts @@ -2,7 +2,6 @@ import { router as record } from './record'; import { router as workflow } from './workflow'; import { router as storage } from './storage'; import { router as auth } from './auth'; -import { router as integration } from './integration'; import { router as proxy } from './proxy'; import { router as webhook } from './webhook'; @@ -11,7 +10,6 @@ export { workflow, storage, auth, - integration, proxy, webhook }; diff --git a/server/src/routes/storage.ts b/server/src/routes/storage.ts index 72518c7b9..45d4bc532 100644 --- a/server/src/routes/storage.ts +++ b/server/src/routes/storage.ts @@ -15,7 +15,6 @@ import { encrypt, decrypt } from '../utils/auth'; import { WorkflowFile } from 'maxun-core'; import { cancelScheduledWorkflow, scheduleWorkflow } from '../storage/schedule'; import { pgBossClient } from '../storage/pgboss'; -chromium.use(stealthPlugin()); export const router = Router(); diff --git a/server/src/server.ts b/server/src/server.ts index 88dc74c55..61f577560 100644 --- a/server/src/server.ts +++ b/server/src/server.ts @@ -5,7 +5,7 @@ import { Server } from "socket.io"; import cors from 'cors'; import dotenv from 'dotenv'; dotenv.config(); -import { record, workflow, storage, auth, integration, proxy, webhook } from './routes'; +import { record, workflow, storage, auth, proxy, webhook } from './routes'; import { BrowserPool } from "./browser-management/classes/BrowserPool"; import logger from './logger'; import sequelize, { connectDB, syncDB } from './storage/db' @@ -107,7 +107,6 @@ app.use('/record', record); app.use('/workflow', workflow); app.use('/storage', storage); app.use('/auth', auth); -app.use('/integration', integration); app.use('/proxy', proxy); app.use('/api-docs', swaggerUi.serve, swaggerUi.setup(swaggerSpec)); @@ -179,8 +178,6 @@ if (require.main === module) { await startWorkers(); - io = new Server(server); - io.of('/queued-run').on('connection', (socket) => { const userId = socket.handshake.query.userId as string; diff --git a/src/components/browser/BrowserWindow.tsx b/src/components/browser/BrowserWindow.tsx index 7dd8b2cf3..769e9048e 100644 --- a/src/components/browser/BrowserWindow.tsx +++ b/src/components/browser/BrowserWindow.tsx @@ -1686,6 +1686,12 @@ export const BrowserWindow = () => { } }, [paginationMode, resetPaginationSelector]); + useEffect(() => { + if (!paginationMode || !getList) { + setHighlighterData(null); + } + }, [paginationMode, getList]); + useEffect(() => { if (paginationMode && currentListActionId) { const currentListStep = browserSteps.find( @@ -1841,7 +1847,7 @@ export const BrowserWindow = () => { > {/* Individual element highlight (for non-group or hovered element) */} {((getText && !listSelector) || - (getList && paginationMode && paginationType !== "" && + (getList && paginationMode && !paginationSelector && paginationType !== "" && !["none", "scrollDown", "scrollUp"].includes(paginationType))) && (
{ listSelector && !paginationMode && !limitMode && + captureStage === 'initial' && highlighterData.similarElements?.rects?.map((rect, index) => (
= ({ shouldHighlight = false; } else if ( paginationMode && + !paginationSelector && paginationType !== "" && !["none", "scrollDown", "scrollUp"].includes(paginationType) ) { @@ -353,7 +354,7 @@ export const DOMBrowserRenderer: React.FC = ({ const options: boolean | AddEventListenerOptions = ['wheel', 'touchstart', 'touchmove'].includes(event) ? { passive: false } : false; - iframeDoc.removeEventListener(event, handler as EventListener, options); + iframeDoc.removeEventListener(event, handler as EventListener, options); }); } @@ -588,7 +589,7 @@ export const DOMBrowserRenderer: React.FC = ({ const elementRect = element.getBoundingClientRect(); const relativeX = iframeX - elementRect.left; const relativeY = iframeY - elementRect.top; - + socket.emit("dom:click", { selector, url: snapshot.baseUrl, @@ -636,7 +637,7 @@ export const DOMBrowserRenderer: React.FC = ({ if (iframe) { const focusedElement = iframeDoc.activeElement as HTMLElement; let coordinates = { x: 0, y: 0 }; - + if (focusedElement && focusedElement !== iframeDoc.body) { // Get coordinates from the focused element const rect = focusedElement.getBoundingClientRect(); diff --git a/src/components/recorder/RightSidePanel.tsx b/src/components/recorder/RightSidePanel.tsx index 8159e149d..a596e8f9b 100644 --- a/src/components/recorder/RightSidePanel.tsx +++ b/src/components/recorder/RightSidePanel.tsx @@ -415,6 +415,46 @@ export const RightSidePanel: React.FC = ({ onFinishCapture }, [stopGetList, resetListState]); const stopCaptureAndEmitGetListSettings = useCallback(() => { + if (autoDetectedPagination?.selector) { + const iframeElement = document.querySelector('#browser-window iframe') as HTMLIFrameElement; + if (iframeElement?.contentDocument) { + try { + function evaluateSelector(selector: string, doc: Document): Element[] { + if (selector.startsWith('//') || selector.startsWith('(//')) { + try { + const result = doc.evaluate(selector, doc, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null); + const elements: Element[] = []; + for (let i = 0; i < result.snapshotLength; i++) { + const node = result.snapshotItem(i); + if (node && node.nodeType === Node.ELEMENT_NODE) { + elements.push(node as Element); + } + } + return elements; + } catch (err) { + return []; + } + } else { + try { + return Array.from(doc.querySelectorAll(selector)); + } catch (err) { + return []; + } + } + } + + const elements = evaluateSelector(autoDetectedPagination.selector, iframeElement.contentDocument); + elements.forEach((el: Element) => { + (el as HTMLElement).style.outline = ''; + (el as HTMLElement).style.outlineOffset = ''; + (el as HTMLElement).style.zIndex = ''; + }); + } catch (error) { + console.error('Error removing pagination highlight on completion:', error); + } + } + } + const latestListStep = getLatestListStep(browserSteps); if (latestListStep) { extractDataClientSide(latestListStep.listSelector!, latestListStep.fields, latestListStep.id); @@ -423,7 +463,7 @@ export const RightSidePanel: React.FC = ({ onFinishCapture ...currentWorkflowActionsState, hasScrapeListAction: true }); - + emitActionForStep(latestListStep); handleStopGetList(); @@ -441,7 +481,7 @@ export const RightSidePanel: React.FC = ({ onFinishCapture onFinishCapture(); clientSelectorGenerator.cleanup(); } - }, [socket, notify, handleStopGetList, resetInterpretationLog, finishAction, onFinishCapture, t, browserSteps, extractDataClientSide, setCurrentWorkflowActionsState, currentWorkflowActionsState, emitActionForStep]); + }, [socket, notify, handleStopGetList, resetInterpretationLog, finishAction, onFinishCapture, t, browserSteps, extractDataClientSide, setCurrentWorkflowActionsState, currentWorkflowActionsState, emitActionForStep, autoDetectedPagination]); const getLatestListStep = (steps: BrowserStep[]) => { const listSteps = steps.filter(step => step.type === 'list');