Skip to content

Commit 7d0ab5f

Browse files
authored
Merge pull request #907 from getmaxun/pre-release-28
chore: pre-release v0.0.28
2 parents 3d7ae1e + 068a28a commit 7d0ab5f

File tree

15 files changed

+161
-149
lines changed

15 files changed

+161
-149
lines changed

Dockerfile.backend

Lines changed: 1 addition & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
FROM --platform=$BUILDPLATFORM mcr.microsoft.com/playwright:v1.46.0-noble
1+
FROM --platform=$BUILDPLATFORM node:20-slim
22

33
# Set working directory
44
WORKDIR /app
@@ -18,31 +18,6 @@ COPY server/tsconfig.json ./server/
1818
# Install dependencies
1919
RUN npm install --legacy-peer-deps
2020

21-
# Create the Chromium data directory with necessary permissions
22-
RUN mkdir -p /tmp/chromium-data-dir && \
23-
chmod -R 777 /tmp/chromium-data-dir
24-
25-
# Install dependencies
26-
RUN apt-get update && apt-get install -y \
27-
libgbm1 \
28-
libnss3 \
29-
libatk1.0-0 \
30-
libatk-bridge2.0-0 \
31-
libdrm2 \
32-
libxkbcommon0 \
33-
libglib2.0-0 \
34-
libdbus-1-3 \
35-
libx11-xcb1 \
36-
libxcb1 \
37-
libxcomposite1 \
38-
libxcursor1 \
39-
libxdamage1 \
40-
libxext6 \
41-
libxi6 \
42-
libxtst6 \
43-
&& rm -rf /var/lib/apt/lists/* \
44-
&& mkdir -p /tmp/.X11-unix && chmod 1777 /tmp/.X11-unix
45-
4621
# Expose backend port
4722
EXPOSE ${BACKEND_PORT:-8080}
4823

browser/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@ WORKDIR /app
66
COPY browser/package*.json ./
77

88
# Install dependencies
9-
RUN npm ci
9+
RUN npm install
1010

1111
# Copy TypeScript source and config
1212
COPY browser/server.ts ./

browser/server.ts

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@ let browserServer: BrowserServer | null = null;
1111
// Configurable ports with defaults
1212
const BROWSER_WS_PORT = parseInt(process.env.BROWSER_WS_PORT || '3001', 10);
1313
const BROWSER_HEALTH_PORT = parseInt(process.env.BROWSER_HEALTH_PORT || '3002', 10);
14+
const BROWSER_WS_HOST = process.env.BROWSER_WS_HOST || 'localhost';
1415

1516
async function start(): Promise<void> {
1617
console.log('Starting Maxun Browser Service...');
@@ -44,17 +45,19 @@ async function start(): Promise<void> {
4445
// Health check HTTP server
4546
const healthServer = http.createServer((req, res) => {
4647
if (req.url === '/health') {
48+
const wsEndpoint = browserServer?.wsEndpoint().replace('localhost', BROWSER_WS_HOST) || '';
4749
res.writeHead(200, { 'Content-Type': 'application/json' });
4850
res.end(JSON.stringify({
4951
status: 'healthy',
50-
wsEndpoint: browserServer?.wsEndpoint(),
52+
wsEndpoint,
5153
wsPort: BROWSER_WS_PORT,
5254
healthPort: BROWSER_HEALTH_PORT,
5355
timestamp: new Date().toISOString()
5456
}));
5557
} else if (req.url === '/') {
5658
res.writeHead(200, { 'Content-Type': 'text/plain' });
57-
res.end(`Maxun Browser Service\nWebSocket: ${browserServer?.wsEndpoint()}\nHealth: http://localhost:${BROWSER_HEALTH_PORT}/health`);
59+
const wsEndpoint = browserServer?.wsEndpoint().replace('localhost', BROWSER_WS_HOST) || '';
60+
res.end(`Maxun Browser Service\nWebSocket: ${wsEndpoint}\nHealth: http://localhost:${BROWSER_HEALTH_PORT}/health`);
5861
} else {
5962
res.writeHead(404);
6063
res.end('Not Found');

docker-compose.yml

Lines changed: 8 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -30,9 +30,9 @@ services:
3030
- minio_data:/data
3131

3232
backend:
33-
#build:
34-
#context: .
35-
#dockerfile: server/Dockerfile
33+
# build:
34+
# context: .
35+
# dockerfile: Dockerfile.backend
3636
image: getmaxun/maxun-backend:latest
3737
restart: unless-stopped
3838
ports:
@@ -60,9 +60,9 @@ services:
6060
- /var/run/dbus:/var/run/dbus
6161

6262
frontend:
63-
#build:
64-
#context: .
65-
#dockerfile: Dockerfile
63+
# build:
64+
# context: .
65+
# dockerfile: Dockerfile.frontend
6666
image: getmaxun/maxun-frontend:latest
6767
restart: unless-stopped
6868
ports:
@@ -89,6 +89,8 @@ services:
8989
- DEBUG=pw:browser*
9090
- BROWSER_WS_PORT=${BROWSER_WS_PORT:-3001}
9191
- BROWSER_HEALTH_PORT=${BROWSER_HEALTH_PORT:-3002}
92+
- BROWSER_WS_HOST=${BROWSER_WS_HOST:-browser}
93+
- PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
9294
restart: unless-stopped
9395
healthcheck:
9496
test: ["CMD", "curl", "-f", "http://localhost:${BROWSER_HEALTH_PORT:-3002}/health"]

maxun-core/package.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "maxun-core",
3-
"version": "0.0.27",
3+
"version": "0.0.28",
44
"description": "Core package for Maxun, responsible for data extraction",
55
"main": "build/index.js",
66
"typings": "build/index.d.ts",

package.json

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "maxun",
3-
"version": "0.0.27",
3+
"version": "0.0.28",
44
"author": "Maxun",
55
"license": "AGPL-3.0-or-later",
66
"dependencies": {
@@ -52,7 +52,7 @@
5252
"lodash": "^4.17.21",
5353
"loglevel": "^1.8.0",
5454
"loglevel-plugin-remote": "^0.6.8",
55-
"maxun-core": "^0.0.27",
55+
"maxun-core": "^0.0.28",
5656
"minio": "^8.0.1",
5757
"moment-timezone": "^0.5.45",
5858
"node-cron": "^3.0.3",
@@ -131,4 +131,4 @@
131131
"vite": "^5.4.10",
132132
"zod": "^3.25.62"
133133
}
134-
}
134+
}

server/src/api/record.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ import { io, Socket } from "socket.io-client";
1111
import { BinaryOutputService } from "../storage/mino";
1212
import { AuthenticatedRequest } from "../routes/record"
1313
import {capture} from "../utils/analytics";
14-
import { Page } from "playwright";
14+
import { Page } from "playwright-core";
1515
import { WorkflowFile } from "maxun-core";
1616
import { addGoogleSheetUpdateTask, googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet";
1717
import { addAirtableUpdateTask, airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable";

server/src/browser-management/classes/RemoteBrowser.ts

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -550,9 +550,9 @@ export class RemoteBrowser {
550550

551551
try {
552552
const blocker = await PlaywrightBlocker.fromLists(fetch, ['https://easylist.to/easylist/easylist.txt']);
553-
await blocker.enableBlockingInPage(this.currentPage);
553+
await blocker.enableBlockingInPage(this.currentPage as any);
554554
this.client = await this.currentPage.context().newCDPSession(this.currentPage);
555-
await blocker.disableBlockingInPage(this.currentPage);
555+
await blocker.disableBlockingInPage(this.currentPage as any);
556556
console.log('Adblocker initialized');
557557
} catch (error: any) {
558558
console.warn('Failed to initialize adblocker, continuing without it:', error.message);

server/src/markdownify/scrape.ts

Lines changed: 84 additions & 94 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
import { connectToRemoteBrowser } from "../browser-management/browserConnection";
1+
import { Page } from "playwright-core";
22
import { parseMarkdown } from "./markdown";
33
import logger from "../logger";
44

@@ -21,115 +21,105 @@ async function gotoWithFallback(page: any, url: string) {
2121
* Fetches a webpage, strips scripts/styles/images/etc,
2222
* returns clean Markdown using parser.
2323
* @param url - The URL to convert
24-
* @param existingPage - Optional existing Playwright page instance to reuse
24+
* @param page - Existing Playwright page instance to use
2525
*/
26-
export async function convertPageToMarkdown(url: string): Promise<string> {
27-
const browser = await connectToRemoteBrowser();
28-
const page = await browser.newPage();
29-
30-
await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });
31-
32-
const cleanedHtml = await page.evaluate(() => {
33-
const selectors = [
34-
"script",
35-
"style",
36-
"link[rel='stylesheet']",
37-
"noscript",
38-
"meta",
39-
"svg",
40-
"img",
41-
"picture",
42-
"source",
43-
"video",
44-
"audio",
45-
"iframe",
46-
"object",
47-
"embed"
48-
];
49-
50-
selectors.forEach(sel => {
51-
document.querySelectorAll(sel).forEach(e => e.remove());
52-
});
26+
export async function convertPageToMarkdown(url: string, page: Page): Promise<string> {
27+
try {
28+
logger.log('info', `[Scrape] Using existing page instance for markdown conversion of ${url}`);
29+
30+
await gotoWithFallback(page, url);
31+
32+
const cleanedHtml = await page.evaluate(() => {
33+
const selectors = [
34+
"script",
35+
"style",
36+
"link[rel='stylesheet']",
37+
"noscript",
38+
"meta",
39+
"svg",
40+
"img",
41+
"picture",
42+
"source",
43+
"video",
44+
"audio",
45+
"iframe",
46+
"object",
47+
"embed"
48+
];
49+
50+
selectors.forEach(sel => {
51+
document.querySelectorAll(sel).forEach(e => e.remove());
52+
});
5353

54-
// Remove inline event handlers (onclick, onload…)
55-
const all = document.querySelectorAll("*");
56-
all.forEach(el => {
57-
[...el.attributes].forEach(attr => {
58-
if (attr.name.startsWith("on")) {
59-
el.removeAttribute(attr.name);
60-
}
54+
const all = document.querySelectorAll("*");
55+
all.forEach(el => {
56+
[...el.attributes].forEach(attr => {
57+
if (attr.name.startsWith("on")) {
58+
el.removeAttribute(attr.name);
59+
}
60+
});
6161
});
62-
});
6362

64-
return document.documentElement.outerHTML;
65-
});
63+
return document.documentElement.outerHTML;
64+
});
6665

67-
if (shouldCloseBrowser && browser) {
68-
logger.log('info', `[Scrape] Closing browser instance created for markdown conversion`);
69-
await browser.close();
70-
} else {
71-
logger.log('info', `[Scrape] Keeping existing browser instance open after markdown conversion`);
66+
const markdown = await parseMarkdown(cleanedHtml, url);
67+
return markdown;
68+
} catch (error: any) {
69+
logger.error(`[Scrape] Error during markdown conversion: ${error.message}`);
70+
throw error;
7271
}
73-
74-
// Convert cleaned HTML → Markdown
75-
const markdown = await parseMarkdown(cleanedHtml, url);
76-
return markdown;
7772
}
7873

7974
/**
8075
* Fetches a webpage, strips scripts/styles/images/etc,
8176
* returns clean HTML.
8277
* @param url - The URL to convert
83-
* @param existingPage - Optional existing Playwright page instance to reuse
78+
* @param page - Existing Playwright page instance to use
8479
*/
85-
export async function convertPageToHTML(url: string): Promise<string> {
86-
const browser = await connectToRemoteBrowser();
87-
const page = await browser.newPage();
88-
89-
await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });
90-
91-
const cleanedHtml = await page.evaluate(() => {
92-
const selectors = [
93-
"script",
94-
"style",
95-
"link[rel='stylesheet']",
96-
"noscript",
97-
"meta",
98-
"svg",
99-
"img",
100-
"picture",
101-
"source",
102-
"video",
103-
"audio",
104-
"iframe",
105-
"object",
106-
"embed"
107-
];
108-
109-
selectors.forEach(sel => {
110-
document.querySelectorAll(sel).forEach(e => e.remove());
111-
});
80+
export async function convertPageToHTML(url: string, page: Page): Promise<string> {
81+
try {
82+
logger.log('info', `[Scrape] Using existing page instance for HTML conversion of ${url}`);
83+
84+
await gotoWithFallback(page, url);
85+
86+
const cleanedHtml = await page.evaluate(() => {
87+
const selectors = [
88+
"script",
89+
"style",
90+
"link[rel='stylesheet']",
91+
"noscript",
92+
"meta",
93+
"svg",
94+
"img",
95+
"picture",
96+
"source",
97+
"video",
98+
"audio",
99+
"iframe",
100+
"object",
101+
"embed"
102+
];
103+
104+
selectors.forEach(sel => {
105+
document.querySelectorAll(sel).forEach(e => e.remove());
106+
});
112107

113-
// Remove inline event handlers (onclick, onload…)
114-
const all = document.querySelectorAll("*");
115-
all.forEach(el => {
116-
[...el.attributes].forEach(attr => {
117-
if (attr.name.startsWith("on")) {
118-
el.removeAttribute(attr.name);
119-
}
108+
const all = document.querySelectorAll("*");
109+
all.forEach(el => {
110+
[...el.attributes].forEach(attr => {
111+
if (attr.name.startsWith("on")) {
112+
el.removeAttribute(attr.name);
113+
}
114+
});
120115
});
121-
});
122116

123-
return document.documentElement.outerHTML;
124-
});
117+
return document.documentElement.outerHTML;
118+
});
125119

126-
if (shouldCloseBrowser && browser) {
127-
logger.log('info', `[Scrape] Closing browser instance created for HTML conversion`);
128-
await browser.close();
129-
} else {
130-
logger.log('info', `[Scrape] Keeping existing browser instance open after HTML conversion`);
120+
return cleanedHtml;
121+
} catch (error: any) {
122+
logger.error(`[Scrape] Error during HTML conversion: ${error.message}`);
123+
throw error;
131124
}
132-
133-
// Return cleaned HTML directly
134-
return cleanedHtml;
135125
}

server/src/routes/index.ts

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,6 @@ import { router as record } from './record';
22
import { router as workflow } from './workflow';
33
import { router as storage } from './storage';
44
import { router as auth } from './auth';
5-
import { router as integration } from './integration';
65
import { router as proxy } from './proxy';
76
import { router as webhook } from './webhook';
87

@@ -11,7 +10,6 @@ export {
1110
workflow,
1211
storage,
1312
auth,
14-
integration,
1513
proxy,
1614
webhook
1715
};

0 commit comments

Comments
 (0)