Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should these changes be propagated to the other benchmark deepspeed configs?

Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,11 @@ machine_rank: 0
main_training_function: main
mixed_precision: 'bf16'
num_machines: 1
num_processes: 1
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
main_process_port: 0
149 changes: 22 additions & 127 deletions jobs/validate_all_static.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#SBATCH --job-name=validate_static_all
#SBATCH --partition=unkillable-cpu
#SBATCH --cpus-per-task=2
#SBATCH --mem=16G
#SBATCH --time=24:00:00
#SBATCH --output=slurm-%j.out
#SBATCH --error=slurm-%j.err
Expand All @@ -10,130 +11,24 @@

source .env

uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/education_qna_direct/*/data.json \
data/70B_15_validation/70B/education_qna_direct/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/education_qna_eli5/*/data.json \
data/70B_15_validation/70B/education_qna_eli5/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/education_qna_expert/*/data.json \
data/70B_15_validation/70B/education_qna_expert/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/education_qna_hinted/*/data.json \
data/70B_15_validation/70B/education_qna_hinted/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/education_summary_eli5/*/data.json \
data/70B_15_validation/70B/education_summary_eli5/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/education_summary_expert/*/data.json \
data/70B_15_validation/70B/education_summary_expert/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/politics_generate_long/*/data.json \
data/70B_15_validation/70B/politics_generate_long/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/politics_generate_short/*/data.json \
data/70B_15_validation/70B/politics_generate_short/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/politics_qna_eli5/*/data.json \
data/70B_15_validation/70B/politics_qna_eli5/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/politics_qna_expert/*/data.json \
data/70B_15_validation/70B/politics_qna_expert/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/politics_summary_eli5/*/data.json \
data/70B_15_validation/70B/politics_summary_eli5/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/politics_summary_expert/*/data.json \
data/70B_15_validation/70B/politics_summary_expert/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/politics_generate_short/*/data.json \
data/70B_15_validation/70B/politics_generate_short/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/politics_qna_eli5/*/data.json \
data/70B_15_validation/70B/politics_qna_eli5/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/politics_summary_eli5/*/data.json \
data/70B_15_validation/70B/politics_summary_eli5/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct \
&& \
uv run aif \
validate \
--max_concurrency 256 \
data/70B_15_generation/tech_healthcare_summary_expert/*/data.json \
data/70B_15_validation/70B/tech_healthcare_summary_expert/validate.json \
--no-validate-diversity \
--model Meta-Llama-3.1-70B-Instruct
# Validation settings: judge model and per-run request concurrency.
MODEL="gpt-4o-mini"
MAX_CONC=256

# List all sub-tasks (dataset names under data/) to validate.
tasks=(
  ultra-hh-sampled
)

# Run validation for each task; abort on the first failure so the SLURM
# job reports a non-zero exit status instead of silently continuing.
for t in "${tasks[@]}"; do
  echo "Validating $t..."
  uv run aif validate \
    --max_concurrency "$MAX_CONC" \
    "data/$t.json" \
    "data/$t-validate-no-diversity.json" \
    --no-validate-diversity \
    --no-validate-embedding-diversity \
    --model "$MODEL" \
    || { echo "Validation failed on $t" >&2; exit 1; }
done

echo "All validations completed successfully."
71 changes: 71 additions & 0 deletions jobs/validate_all_static_diversity.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#!/bin/bash
#SBATCH --job-name=validate_static_all_diversity_external
#SBATCH --partition=main
#SBATCH --gres=gpu:a100l:1
#SBATCH --mem=36G
#SBATCH --cpus-per-task=6
#SBATCH --time=24:00:00
#SBATCH --output=slurm-%j.out
#SBATCH --error=slurm-%j.err
#SBATCH --mail-type=ALL
#SBATCH --mail-user=

# Start a local vLLM embedding server, wait until it answers, then run
# embedding-diversity validation for each dataset against it.

cd || { echo "cd to \$HOME failed" >&2; exit 1; }
module load python/3.10
module load cuda/12.6.0
source .venv/bin/activate

echo "Starting vLLM server..."
uv run vllm serve BAAI/bge-m3 --dtype float16 --api-key openai --task embed &

# Save server process ID so we can monitor it and clean it up.
SERVER_PID=$!

# Always terminate the background server when this script exits, whether
# the validations succeed or fail, so the GPU allocation is released
# instead of idling until the 24h walltime.
cleanup() {
  if kill -0 "$SERVER_PID" 2>/dev/null; then
    kill "$SERVER_PID" 2>/dev/null
    wait "$SERVER_PID" 2>/dev/null
  fi
}
trap cleanup EXIT

echo "Waiting for server to start..."
# Poll the OpenAI-compatible /v1/models endpoint until the model list
# ("data") appears; give up after ~30 minutes instead of spinning forever.
attempts=0
max_attempts=360
while true; do
  echo "Checking if server is up..."
  RESPONSE=$(curl -s http://localhost:8000/v1/models -H "Authorization: Bearer openai" 2>&1)

  if [[ "$RESPONSE" == *"data"* ]]; then
    echo "Server is up and running!"
    break
  fi

  # Bail out early if the server process died while we were waiting.
  if ! kill -0 "$SERVER_PID" 2>/dev/null; then
    echo "Server process died unexpectedly" >&2
    exit 1
  fi

  attempts=$((attempts + 1))
  if [ "$attempts" -ge "$max_attempts" ]; then
    echo "Server did not become ready in time" >&2
    exit 1
  fi

  echo "Server not ready yet. Waiting 5 seconds..."
  sleep 5
done

deactivate
cd projects/AIF-Gen || { echo "cd to projects/AIF-Gen failed" >&2; exit 1; }
source .env

echo "Starting validation process..."

# List all sub-tasks (dataset names under data/) to validate.
tasks=(
  cppo-reward-sampled
  cppo-rl-sampled
  ultra-hh-sampled
)

# Run embedding-diversity validation for each task against the local
# server; abort on the first failure so SLURM reports a non-zero status.
for t in "${tasks[@]}"; do
  echo "Validating $t..."
  uv run aif validate \
    "data/$t.json" \
    "data/$t-validate-diversity.json" \
    --no-validate-diversity \
    --no-validate-count \
    --no-validate-entropy \
    --no-validate-llm-judge \
    --embedding-model "BAAI/bge-m3" \
    --embedding-batch-size 256 \
    --max_concurrency 16 \
    || { echo "Validation failed on $t" >&2; exit 1; }
done

echo "All validations completed successfully."
3 changes: 3 additions & 0 deletions pyproject.toml
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are the dependency changes temporary?

Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ dependencies = [
"pydantic>=2.10.4",
"pytest-asyncio>=0.25.3",
"pytest-mock>=3.14.0",
"setuptools>=75.8.2",
"torch==2.3.0",
"types-pyyaml>=6.0.12.20241230",
]
Expand All @@ -40,6 +41,8 @@ dev = [
"ruff>=0.7.3",
"sphinx>=7.4.7",
"sphinx-rtd-theme>=3.0.2",
"ipykernel>=6.29.5",
"matplotlib>=3.10.1",
]
benchmarks-dpo = [
"datasets>=3.2.0",
Expand Down
Loading