Skip to content

Commit e6df945

Browse files
authored
Merge pull request #726 from getmaxun/develop
chore: release v0.0.21
2 parents d57d7a4 + dff2892 commit e6df945

File tree

26 files changed

+716
-219
lines changed

26 files changed

+716
-219
lines changed

docker-compose.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
services:
22
postgres:
33
image: postgres:13
4+
restart: unless-stopped
45
environment:
56
POSTGRES_USER: ${DB_USER}
67
POSTGRES_PASSWORD: ${DB_PASSWORD}
@@ -17,6 +18,7 @@ services:
1718

1819
minio:
1920
image: minio/minio
21+
restart: unless-stopped
2022
environment:
2123
MINIO_ROOT_USER: ${MINIO_ACCESS_KEY}
2224
MINIO_ROOT_PASSWORD: ${MINIO_SECRET_KEY}
@@ -32,6 +34,7 @@ services:
3234
#context: .
3335
#dockerfile: server/Dockerfile
3436
image: getmaxun/maxun-backend:latest
37+
restart: unless-stopped
3538
ports:
3639
- "${BACKEND_PORT:-8080}:${BACKEND_PORT:-8080}"
3740
env_file: .env
@@ -58,6 +61,7 @@ services:
5861
#context: .
5962
#dockerfile: Dockerfile
6063
image: getmaxun/maxun-frontend:latest
64+
restart: unless-stopped
6165
ports:
6266
- "${FRONTEND_PORT:-5173}:${FRONTEND_PORT:-5173}"
6367
env_file: .env

docs/nginx.conf

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# Robust maxun nginx config file
2+
# DO NOT uncomment commented lines unless YOU know what they mean and YOU know what YOU are doing!
3+
### HTTP server block ###
4+
server {
5+
server_name maxun.my.domain;
6+
root /usr/share/nginx/html;
7+
listen 80;
8+
server_tokens off;
9+
return 301 https://$server_name$request_uri;
10+
}
11+
### HTTPS server block ###
12+
server {
13+
### Default config ###
14+
server_name maxun.my.domain;
15+
root /usr/share/nginx/html;
16+
access_log /var/log/nginx/maxun_access.log;
17+
error_log /var/log/nginx/maxun_error.log info;
18+
listen 443 ssl;
19+
http2 on;
20+
server_tokens off;
21+
### SSL config ###
22+
ssl_certificate /etc/letsencrypt/live/my.domain/fullchain.pem;
23+
ssl_certificate_key /etc/letsencrypt/live/my.domain/privkey.pem;
24+
ssl_trusted_certificate /etc/letsencrypt/live/my.domain/chain.pem;
25+
ssl_protocols TLSv1.2 TLSv1.3;
26+
#ssl_ecdh_curve X25519MLKEM768:X25519:prime256v1:secp384r1;
27+
ssl_ecdh_curve X25519:prime256v1:secp384r1;
28+
ssl_prefer_server_ciphers off;
29+
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384:DHE-RSA-CHACHA20-POLY1305;
30+
ssl_stapling off;
31+
ssl_stapling_verify off;
32+
ssl_session_cache shared:MozSSL:10m;
33+
ssl_session_tickets off;
34+
ssl_session_timeout 1d;
35+
ssl_dhparam dh.pem;
36+
#ssl_conf_command Options KTLS;
37+
### Performance tuning config ###
38+
client_max_body_size 512M;
39+
client_body_timeout 300s;
40+
client_body_buffer_size 256k;
41+
#pagespeed off;
42+
### Compression ###
43+
## gzip ##
44+
gzip on;
45+
gzip_vary on;
46+
gzip_comp_level 5;
47+
gzip_min_length 256;
48+
gzip_disable msie6;
49+
gzip_proxied expired no-cache no-store private no_last_modified no_etag auth;
50+
gzip_buffers 16 8k;
51+
gzip_types application/atom+xml text/javascript application/javascript application/json application/ld+json application/manifest+json application/rss+xml application/vnd.geo+json application/vnd.ms-fontobject application/wasm application/x-font-ttf application/x-web-app-manifest+json application/xhtml+xml application/xml font/opentype image/bmp image/svg+xml image/x-icon text/cache-manifest text/css text/plain text/vcard text/vnd.rim.location.xloc text/vtt text/x-component text/x-cross-domain-policy;
52+
## brotli: enable only if you have compiled nginx with brotli support!!! ##
53+
#brotli on;
54+
#brotli_static on;
55+
#brotli_comp_level 6;
56+
#brotli_types application/atom+xml application/javascript application/json application/rss+xml
57+
# application/vnd.ms-fontobject application/x-font-opentype application/x-font-truetype
58+
# application/x-font-ttf application/x-javascript application/xhtml+xml application/xml
59+
# font/eot font/opentype font/otf font/truetype image/svg+xml image/vnd.microsoft.icon
60+
# image/x-icon image/x-win-bitmap text/css text/javascript text/plain text/xml;
61+
### Default headers ###
62+
add_header Referrer-Policy "no-referrer" always;
63+
add_header X-Content-Type-Options "nosniff" always;
64+
add_header X-Frame-Options "SAMEORIGIN" always;
65+
add_header X-Permitted-Cross-Domain-Policies "none" always;
66+
add_header X-Robots-Tag "noindex, nofollow" always;
67+
add_header X-XSS-Protection "1; mode=block" always;
68+
add_header Permissions-Policy "geolocation=(self), midi=(self), sync-xhr=(self), microphone=(self), camera=(self), magnetometer=(self), gyroscope=(self), fullscreen=(self), payment=(self), interest-cohort=()";
69+
### Proxy rules ###
70+
# Backend web traffic and websockets
71+
location ~ ^/(auth|storage|record|workflow|robot|proxy|api-docs|api|webhook|socket.io)(/|$) {
72+
proxy_pass http://localhost:8080; #Change the port number to match .env file BACKEND_PORT variable
73+
proxy_http_version 1.1;
74+
proxy_set_header Upgrade $http_upgrade;
75+
proxy_set_header Connection 'upgrade';
76+
proxy_set_header Host $host;
77+
proxy_set_header X-Real-IP $remote_addr;
78+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
79+
proxy_set_header X-Forwarded-Proto $scheme;
80+
}
81+
# Frontend web traffic
82+
location / {
83+
proxy_pass http://localhost:5173; #Change the port number to match .env file FRONTEND_PORT variable
84+
proxy_http_version 1.1;
85+
proxy_set_header Upgrade $http_upgrade;
86+
proxy_set_header Connection 'upgrade';
87+
proxy_set_header Host $host;
88+
proxy_set_header X-Real-IP $remote_addr;
89+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
90+
proxy_set_header X-Forwarded-Proto $scheme;
91+
}
92+
}

docs/self-hosting-docker.md

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
# Self hosting docker guide
2+
3+
So you want to self-host Maxun with Docker? Let's get you started!
4+
5+
## Requirements (not covered)
6+
- Webserver (Apache2, nginx, etc.)
7+
- SSL certificates (Let's Encrypt, ZeroSSL, etc.)
8+
- A sub-domain to host maxun, e.g. maxun.my.domain
9+
- Docker
10+
- Docker compose
11+
- Probably others...
12+
13+
## Guide
14+
For this guide, we assume that before you start, you have a dedicated docker folder to house config files and everything else we need for persistence between docker container reboots and updates. The path in this guide is `/home/$USER/Docker/maxun`.
15+
1. Change directory into your docker folder `cd /home/$USER/Docker/`
16+
2. Create a new directory for maxun and all the required sub-folders for our docker services `mkdir -p maxun/{db,minio,redis}`
17+
3. Change directory to enter the newly created folder `cd maxun`
18+
4. Create an environment file to save your variables `nano .env` with the following contents:
19+
```
20+
NODE_ENV=production
21+
JWT_SECRET=openssl rand -base64 48
22+
DB_NAME=maxun
23+
DB_USER=postgres
24+
DB_PASSWORD=openssl rand -base64 24
25+
DB_HOST=postgres
26+
DB_PORT=5432
27+
ENCRYPTION_KEY=openssl rand -base64 64
28+
SESSION_SECRET=openssl rand -base64 48
29+
MINIO_ENDPOINT=minio
30+
MINIO_PORT=9000
31+
MINIO_CONSOLE_PORT=9001
32+
MINIO_ACCESS_KEY=minio
33+
MINIO_SECRET_KEY=openssl rand -base64 24
34+
REDIS_HOST=maxun-redis
35+
REDIS_PORT=6379
36+
REDIS_PASSWORD=
37+
BACKEND_PORT=8080
38+
FRONTEND_PORT=5173
39+
BACKEND_URL=https://maxun.my.domain
40+
PUBLIC_URL=https://maxun.my.domain
41+
VITE_BACKEND_URL=https://maxun.my.domain
42+
VITE_PUBLIC_URL=https://maxun.my.domain
43+
GOOGLE_CLIENT_ID=
44+
GOOGLE_CLIENT_SECRET=
45+
GOOGLE_REDIRECT_URI=
46+
AIRTABLE_CLIENT_ID=
47+
AIRTABLE_REDIRECT_URI=
48+
MAXUN_TELEMETRY=true
49+
```
50+
5. Ctrl + x, Y, Enter will save your changes
51+
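6. Please be sure to READ this file and change the variables to match your environment!!! e.g. BACKEND_PORT=30000. Every value written as `openssl rand -base64 N` is a placeholder: an env file does not execute commands, so run that command in a shell and paste its output as the value.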
52+
7. Create a file for docker compose `nano docker-compose.yml` with the following contents:
53+
```yml
54+
services:
55+
postgres:
56+
image: postgres:17
57+
container_name: maxun-postgres
58+
mem_limit: 512M
59+
environment:
60+
POSTGRES_USER: ${DB_USER}
61+
POSTGRES_PASSWORD: ${DB_PASSWORD}
62+
POSTGRES_DB: ${DB_NAME}
63+
volumes:
64+
- /home/$USER/Docker/maxun/db:/var/lib/postgresql/data
65+
healthcheck:
66+
test: ["CMD-SHELL", "pg_isready -U postgres"]
67+
interval: 10s
68+
timeout: 5s
69+
retries: 5
70+
71+
redis:
72+
image: docker.io/library/redis:7
73+
container_name: maxun-redis
74+
restart: always
75+
mem_limit: 128M
76+
volumes:
77+
- /home/$USER/Docker/maxun/redis:/data
78+
79+
minio:
80+
image: minio/minio
81+
container_name: maxun-minio
82+
mem_limit: 512M
83+
environment:
84+
MINIO_ROOT_USER: ${MINIO_ACCESS_KEY}
85+
MINIO_ROOT_PASSWORD: ${MINIO_SECRET_KEY}
86+
command: server /data --console-address :${MINIO_CONSOLE_PORT:-9001}
87+
volumes:
88+
- /home/$USER/Docker/maxun/minio:/data
89+
90+
backend:
91+
image: getmaxun/maxun-backend:latest
92+
container_name: maxun-backend
93+
ports:
94+
- "127.0.0.1:${BACKEND_PORT:-8080}:${BACKEND_PORT:-8080}"
95+
env_file: .env
96+
environment:
97+
BACKEND_URL: ${BACKEND_URL}
98+
PLAYWRIGHT_BROWSERS_PATH: /ms-playwright
99+
PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD: 0
100+
# DEBUG: pw:api
101+
# PWDEBUG: 1 # Enables debugging
102+
CHROMIUM_FLAGS: '--disable-gpu --no-sandbox --headless=new'
103+
security_opt:
104+
- seccomp=unconfined # This might help with browser sandbox issues
105+
shm_size: '2gb'
106+
mem_limit: 4g
107+
depends_on:
108+
- postgres
109+
- minio
110+
volumes:
111+
- /var/run/dbus:/var/run/dbus
112+
113+
frontend:
114+
image: getmaxun/maxun-frontend:latest
115+
container_name: maxun-frontend
116+
mem_limit: 512M
117+
ports:
118+
- "127.0.0.1:${FRONTEND_PORT:-5173}:5173"
119+
env_file: .env
120+
environment:
121+
PUBLIC_URL: ${PUBLIC_URL}
122+
BACKEND_URL: ${BACKEND_URL}
123+
depends_on:
124+
- backend
125+
```
126+
8. Ctrl + x, Y, Enter will save your changes
127+
9. This particular setup is "production ready", meaning that the maxun backend and frontend ports are bound to 127.0.0.1 and are only reachable from the host itself. You must configure a reverse proxy to access it from other machines!
128+
10. Start maxun `sudo docker compose up -d` or `sudo docker-compose up -d`
129+
11. Wait 30 seconds for everything to come up
130+
12. Access your maxun instance at http://localhost:5173 if using defaults
131+
132+
## Next steps
133+
You will want to configure a reverse proxy. Click on a link below to check out some examples.
134+
- [Nginx](nginx.conf)

maxun-core/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "maxun-core",
3-
"version": "0.0.20",
3+
"version": "0.0.21",
44
"description": "Core package for Maxun, responsible for data extraction",
55
"main": "build/index.js",
66
"typings": "build/index.d.ts",

maxun-core/src/browserSide/scraper.js

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -537,6 +537,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
537537

538538
const evaluateXPath = (document, xpath, isShadow = false) => {
539539
try {
540+
if (!document || !xpath) {
541+
console.warn('Invalid document or xpath provided to evaluateXPath');
542+
return null;
543+
}
544+
540545
const result = document.evaluate(
541546
xpath,
542547
document,
@@ -632,6 +637,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
632637
return null;
633638
} catch (err) {
634639
console.error("Critical XPath failure:", xpath, err);
640+
// Return null instead of throwing to prevent crashes
635641
return null;
636642
}
637643
};
@@ -694,16 +700,25 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
694700
for (let i = 0; i < parts.length; i++) {
695701
if (!currentElement) return null;
696702

697-
// Handle iframe and frame traversal
703+
// Handle iframe and frame traversal with enhanced safety
698704
if (
699705
currentElement.tagName === "IFRAME" ||
700706
currentElement.tagName === "FRAME"
701707
) {
702708
try {
709+
// Check if frame is accessible
710+
if (!currentElement.contentDocument && !currentElement.contentWindow) {
711+
console.warn('Frame is not accessible (cross-origin or unloaded)');
712+
return null;
713+
}
714+
703715
const frameDoc =
704716
currentElement.contentDocument ||
705-
currentElement.contentWindow.document;
706-
if (!frameDoc) return null;
717+
currentElement.contentWindow?.document;
718+
if (!frameDoc) {
719+
console.warn('Frame document is not available');
720+
return null;
721+
}
707722

708723
if (isXPathSelector(parts[i])) {
709724
currentElement = evaluateXPath(frameDoc, parts[i]);

0 commit comments

Comments
 (0)