Skip to content

Commit 6f33f63

Browse files
committed
Initial eval setup
1 parent ccc914d commit 6f33f63

File tree

164 files changed

+9693
-51
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

164 files changed

+9693
-51
lines changed

Makefile

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Makefile for kernel-browser local development
22
# Using kernel-images native build system
33

4-
.PHONY: help build run stop logs clean dev status shell test
4+
.PHONY: help build rebuild run stop logs clean dev status shell test
55

66
# Default target
77
help: ## Show this help message
@@ -57,11 +57,24 @@ rebuild-devtools-full: ## Force complete rebuild from scratch (slow, rarely need
5757
docker build -f Dockerfile.devtools --no-cache --target devtools-server -t browser-operator-devtools:latest .
5858
@echo "✅ DevTools completely rebuilt"
5959

60-
build: init build-devtools ## Build extended image with DevTools frontend
60+
build: init ## Build extended image with DevTools frontend (smart: only builds DevTools if needed)
6161
@echo "🔨 Building extended kernel-browser with DevTools frontend..."
62+
@if ! docker images | grep -q "browser-operator-devtools.*latest"; then \
63+
echo "📦 DevTools image not found, building it first..."; \
64+
echo " This is a one-time operation and will take ~30 minutes..."; \
65+
$(MAKE) --no-print-directory build-devtools; \
66+
else \
67+
echo "✅ Using existing DevTools image"; \
68+
fi
6269
docker build -f Dockerfile.local -t kernel-browser:extended .
6370
@echo "✅ Extended build complete"
6471

72+
rebuild: init ## Force complete rebuild (including DevTools)
73+
@echo "🔄 Force rebuilding everything from scratch..."
74+
$(MAKE) --no-print-directory build-devtools
75+
docker build -f Dockerfile.local -t kernel-browser:extended .
76+
@echo "✅ Complete rebuild finished"
77+
6578
run: ## Run extended container with DevTools (interactive)
6679
@echo "🚀 Starting extended kernel-browser with DevTools..."
6780
@if [ -n "$(URLS)" ]; then echo "📄 Opening URLs: $(URLS)"; fi

build-local.sh

Lines changed: 0 additions & 43 deletions
This file was deleted.

eval-server/nodejs/src/api-server.js

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -362,12 +362,18 @@ class APIServer {
362362
// Handle nested model configuration directly
363363
const nestedModelConfig = this.processNestedModelConfig(requestBody);
364364

365+
// Extract optional URL and wait timeout
366+
const targetUrl = requestBody.url || 'about:blank';
367+
const waitTimeout = requestBody.wait_timeout || 5000;
368+
365369
const redact = (mk) => ({
366370
...mk,
367371
api_key: mk?.api_key ? `${String(mk.api_key).slice(0, 4)}...` : undefined
368372
});
369373
logger.info('Processing responses request:', {
370374
input: requestBody.input,
375+
url: targetUrl,
376+
wait_timeout: targetUrl !== 'about:blank' ? waitTimeout : 0,
371377
modelConfig: {
372378
main_model: redact(nestedModelConfig.main_model),
373379
mini_model: redact(nestedModelConfig.mini_model),
@@ -378,10 +384,10 @@ class APIServer {
378384
// Find a client with existing tabs (not the dummy client)
379385
const baseClientId = this.findClientWithTabs();
380386

381-
// Open a new tab for this request
382-
logger.info('Opening new tab for responses request', { baseClientId });
387+
// Open a new tab for this request at the specified URL
388+
logger.info('Opening new tab for responses request', { baseClientId, url: targetUrl });
383389
const tabResult = await this.evaluationServer.openTab(baseClientId, {
384-
url: 'about:blank',
390+
url: targetUrl,
385391
background: false
386392
});
387393

@@ -393,6 +399,12 @@ class APIServer {
393399
// Wait for the new tab's DevTools to connect
394400
const tabClient = await this.waitForClientConnection(tabResult.compositeClientId);
395401

402+
// Wait for page to load if a custom URL was provided
403+
if (targetUrl !== 'about:blank') {
404+
logger.info('Waiting for page to load', { waitTimeout });
405+
await new Promise(resolve => setTimeout(resolve, waitTimeout));
406+
}
407+
396408
// Create a dynamic evaluation for this request
397409
const evaluation = this.createDynamicEvaluationNested(requestBody.input, nestedModelConfig);
398410

@@ -484,6 +496,7 @@ class APIServer {
484496
findClientWithTabs() {
485497
const clients = this.evaluationServer.getClientManager().getAllClients();
486498

499+
// First, try to find a client with existing tabs
487500
for (const client of clients) {
488501
const tabs = this.evaluationServer.getClientManager().getClientTabs(client.id);
489502
if (tabs.length > 0) {
@@ -492,7 +505,13 @@ class APIServer {
492505
}
493506
}
494507

495-
throw new Error('No client with existing tabs found. Please ensure at least one DevTools client with a tab is connected.');
508+
// If no client with tabs, use the first available client (even with 0 tabs)
509+
if (clients.length > 0) {
510+
logger.info('No clients with tabs found, using first available client', { clientId: clients[0].id });
511+
return clients[0].id;
512+
}
513+
514+
throw new Error('No clients found. Please ensure at least one DevTools client is registered.');
496515
}
497516

498517
/**
@@ -540,7 +559,7 @@ class APIServer {
540559
description: 'Dynamic evaluation created from API request',
541560
enabled: true,
542561
tool: 'chat',
543-
timeout: 1500000, // 25 minutes
562+
timeout: 7200000, // 2 hours (increased for slow custom API)
544563
input: {
545564
message: input
546565
},

eval-server/nodejs/src/config.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ export const CONFIG = {
4545
},
4646

4747
rpc: {
48-
timeout: parseInt(process.env.RPC_TIMEOUT) || 1500000, // 25 minutes default
48+
timeout: parseInt(process.env.RPC_TIMEOUT) || 7200000, // 2 hours default (increased for slow custom API)
4949
maxConcurrentEvaluations: parseInt(process.env.MAX_CONCURRENT_EVALUATIONS) || 10
5050
},
5151

evals/.env.example

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Evaluation Framework Environment Variables
2+
# Copy this file to .env and fill in your actual API keys
3+
4+
# Required: OpenAI API key for LLM judge and main model
5+
OPENAI_API_KEY=sk-your-openai-api-key-here
6+
7+
# Optional: Groq API key (if using Groq models)
8+
GROQ_API_KEY=gsk-your-groq-api-key-here
9+
10+
# Optional: OpenRouter API key (if using OpenRouter)
11+
OPENROUTER_API_KEY=your-openrouter-api-key-here
12+
13+
# Optional: LiteLLM configuration (if using LiteLLM)
14+
LITELLM_API_KEY=your-litellm-api-key-here
15+
LITELLM_ENDPOINT=http://localhost:8000

evals/.gitignore

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Python
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
*.so
6+
.Python
7+
*.egg-info/
8+
dist/
9+
build/
10+
*.egg
11+
12+
# Virtual environments
13+
.venv/
14+
venv/
15+
ENV/
16+
env/
17+
18+
# uv
19+
.uv/
20+
uv.lock
21+
22+
# IDE
23+
.vscode/
24+
.idea/
25+
*.swp
26+
*.swo
27+
*~
28+
29+
# Reports
30+
reports/*.csv
31+
32+
# Environment variables
33+
.env
34+
.env.local
35+
36+
# OS
37+
.DS_Store
38+
Thumbs.db

evals/.python-version

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
3.9

0 commit comments

Comments
 (0)