Commit dce56da

source and data for ICLR2023 Language Models can teach themselves to program better
1 parent 506099b commit dce56da

19 files changed: +2540 −0 lines
6 binary files changed (contents not shown): 27.2 MB, 25.6 MB, 27 MB, 39.5 MB, 47.9 MB, 46.6 MB

ICLR2023/src/babysit.sh

Lines changed: 161 additions & 0 deletions
@@ -0,0 +1,161 @@
#!/bin/bash
# All experiment settings - constant through the experiment run - passed to gen.sh and fine_tune.sh as needed
GPU=0                # which GPU to use
MODEL="125M"         # size of the model: 125M, 1.3B, 2.7B
EXPERIMENT=$MODEL"_PAPER"  # name of the experiment directory under data/* and models/base-model/* to store results
TEST_LOCAL=1         # 0 means run gen/fine_tune remotely on the cluster, 1 means run gen/fine_tune locally
TARGET_NUM_FILES=1   # how many files to generate in each iteration before starting fine-tuning. A count of unique examples would have been better.
ITER_START=0         # inclusive index to start processing at - creates iter_# under data & models at each iteration. Continue a previous run by starting at the previous ITER_MAX.
ITER_MAX=5           # exclusive index to stop processing iterations at
EPOCHS_START=1       # inclusive index of epochs to start processing at - continue a previous run by starting at the previous EPOCHS_MAX+1. The 0th epoch is the default model, so epochs start at 1.
EPOCHS_MAX=4         # inclusive index of epochs to stop processing at
EPOCHS_PER_STEP=1    # how many epochs through the data to do in each step
TRAIN_INCREMENTAL=0  # 1 means train only on data from the latest iteration, starting from the last fine-tuned model - otherwise start from scratch and use all the data generated
TRAIN_BOOST=0        # initial generation of data from the default model is slow - 1 means look in 125M_RL_ALL and reuse previously generated initial data to bootstrap
PASS_AT_K=100        # number of trials K per puzzle when computing Pass@K
LINE_LOG_K=11        # how many lines of each solve results file to copy into the summary log

echo babysit args: $# $0 $1 $2 $3 $4

if (( $# != 1 ))
then
    echo babysit.sh only takes 1 argument, unless called by another script to initialize configuration variables
    return
fi

if (( $# >= 1 ))
then
    GPU=$1
fi

echo babysit GPU $GPU

for (( iteration=$ITER_START; iteration<$ITER_MAX; iteration++ ))
do
    FULLNAME="${EXPERIMENT}---${iteration}"
    echo FULLNAME $FULLNAME
    export FULLNAME  # needed to pass the variable off to the yaml job
    DATAPATH=data/${EXPERIMENT}/iter_$iteration
    echo DATAPATH $DATAPATH

    if (( $TEST_LOCAL > 0 ))
    then
        count=`ls -lt ../${DATAPATH} | grep json | wc -l`
    else
        count=`amlt sto list ${DATAPATH} | grep json | wc -l`
    fi
    echo count $count

    # Instead of the file count we might want to check whether the amount of data from preprocess is sufficient.
    # If not, we call gen to generate more.

    if (( $count > 0 ))
    then
        echo "$FULLNAME has already been started"
        echo "You are resuming at iteration $iteration"
        echo "You already have $count files of data this iteration"
    else
        echo "$FULLNAME is starting generation for iteration $iteration"
    fi

    if (( $count < $TARGET_NUM_FILES ))
    then
        if (( $TEST_LOCAL > 0 ))
        then
            # ./gen.sh $GPU 2560 100 $FULLNAME -1
            # 2.7B 384 100 runs ~10 hours
            # 2.7B 160 100 runs ~4.5 hours
            ./gen.sh $GPU 256000 100 $FULLNAME -1
        else
            amlt run hyper_gen_octows.yaml $FULLNAME -d "$FULLNAME"
            exit
        fi
    fi

    # Running locally you are done at this point; launching on the cloud, you have to wait.
    for (( poll=0; poll<500; poll++ ))
    do
        if (( $TEST_LOCAL > 0 ))
        then
            count=`ls -lt ../${DATAPATH} | grep json | wc -l`
        else
            count=`amlt sto list ${DATAPATH} | grep json | wc -l`
        fi

        echo "gen wait - Iteration: $iteration, Poll: $poll, Count: $count"

        if (( $count >= $TARGET_NUM_FILES ))
        then
            echo "Finished generation iteration $iteration after $poll polls"
            break
        fi
        sleep 3m
    done

    # Start a fine-tune job
    if (( $TEST_LOCAL > 0 ))
    then
        ./fine_tune.sh $GPU $FULLNAME
    else
        # Pass environment variable FULLNAME to amlt.yaml
        amlt run amlt_octo.yaml $FULLNAME -d "$FULLNAME"
        exit
    fi

    # On the cluster we need to wait for the fine-tune job to finish; run locally it is already done.
    # Check that the log files from running solve have been created for the last epoch of training.

    MODELPATH=models/gpt-neo-$MODEL/${EXPERIMENT}/iter_$iteration
    SOLVE_PATH=$MODELPATH/"epoch_"$EPOCHS_MAX/"solve_"$PASS_AT_K
    echo babysit.sh SOLVE_PATH $SOLVE_PATH

    for (( poll=0; poll<500; poll++ ))
    do
        if (( $TEST_LOCAL > 0 ))
        then
            count=`ls -lt ../$SOLVE_PATH | grep json | wc -l`
        else
            count=`amlt sto list $SOLVE_PATH | grep json | wc -l`
        fi

        echo "fine_tune wait - Iteration: $iteration, Poll: $poll, Count: $count"

        if (( $count >= 1 ))
        then
            echo "Finished fine_tune iteration $iteration after $poll polls"
            break
        fi
        sleep 3m
    done

done

# Pull all the results into one log file so they are easier to look at

if [[ -z "${AMLT_DATA_DIR}" ]];
then
    # Running locally on torch2020 the AMLT environment variables are not defined, so set them up here
    AMLT_DATA_DIR=../data
else
    # On the remote we don't have access to the log files - maybe we could amlt sto download them to build the summary below?
    exit
fi

BASE_MODEL_PATH=$AMLT_DATA_DIR/../models/gpt-neo-$MODEL
LOG_FILE=$BASE_MODEL_PATH/$EXPERIMENT/"solve_"$PASS_AT_K".txt"
echo solve LOG_FILE for babysit.sh is $LOG_FILE
rm $LOG_FILE

for (( iteration=$ITER_START; iteration<$ITER_MAX; iteration++ ))
do
    for (( epochs=$EPOCHS_START; epochs<=$EPOCHS_MAX; epochs++ ))
    do
        EPOCH_NAME="epoch_"$epochs
        STEP_PATH=$BASE_MODEL_PATH/$EXPERIMENT/iter_$iteration/$EPOCH_NAME
        MODEL_PATH=$STEP_PATH/finetuned
        echo iteration $iteration epoch $epochs >> $LOG_FILE
        head -$LINE_LOG_K $STEP_PATH/"solve_"$PASS_AT_K/results.json >> $LOG_FILE
    done
done

cat $LOG_FILE
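
For orientation, a minimal sketch of a local run and of the directory layout the polling loops above watch. The exact paths depend on MODEL and EXPERIMENT (here the defaults 125M and 125M_PAPER); this is an assumption derived from the variables in the script, not documented repository usage.

# Minimal local run (assumes gen.sh and fine_tune.sh sit alongside babysit.sh);
# the single argument is the GPU index, and TEST_LOCAL=1 keeps everything on this machine.
./babysit.sh 0

# Hypothetical layout polled by the loops above, following the path variables in this script:
#   ../data/125M_PAPER/iter_0/*.json                                        <- generated puzzle/solution files
#   ../models/gpt-neo-125M/125M_PAPER/iter_0/epoch_4/solve_100/results.json <- solve results per epoch
#   ../models/gpt-neo-125M/125M_PAPER/solve_100.txt                         <- summary log written at the end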

ICLR2023/src/ds_config_gptneo.json

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": true,
        "cpu_offload": true
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
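
The "auto" entries are placeholders that the Hugging Face Trainer's DeepSpeed integration resolves from its own TrainingArguments when the config is passed via --deepspeed, as fine_tune.py does below. A sketch of the resulting launch with placeholder paths and a single local GPU, mirroring the command assembled in fine_tune.py:

deepspeed --master_port=29600 --include=localhost:0 neo_train.py \
    --model_name_or_path=EleutherAI/gpt-neo-125M \
    --train_file=../outputs/out/train.csv \
    --output_dir=../outputs/out/finetuned \
    --deepspeed ds_config_gptneo.json \
    --do_train --fp16 --num_train_epochs=4 \
    --per_device_train_batch_size=4 --gradient_accumulation_steps=1 \
    --learning_rate=5e-06 --warmup_steps=10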

ICLR2023/src/fine_tune.py

Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
from strictfire import StrictFire as Fire  # aborts early on invalid arguments
import os
import csv
import subprocess
import shlex
import random
import numpy as np
import torch
import utils

def fine_tune(
    train_txt="../data/generated_sol_100.txt",
    output_dir="../outputs/",
    subdir="out",
    model_path="EleutherAI/gpt-neo-2.7B",
    gpu=0,
    num_gpus=1,
    epochs=4,
    seed=0,
):
    """
    Fine-tune the model on the puzzles in the train_txt file and save the results to output_dir/subdir.

    train_txt: the (possibly gzipped) file containing the text to fine-tune on (default: ../data/generated_sol_100.txt)
    output_dir: the root directory for outputs (default "../outputs/")
    subdir: the subdirectory to save the results to (default "out")
    model_path: the path to the model to fine-tune (default "EleutherAI/gpt-neo-2.7B")
    gpu: which GPU(s) to use, e.g. 0,1 (default 0)
    num_gpus: how many GPUs to train on (default 1)
    epochs: how many epochs to train for (default 4)
    seed: the random seed to use; not sure if this affects fine-tuning (default 0)
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # create the output dir if necessary
    output_path = os.path.join(output_dir, subdir)
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    text = utils.load_text_file(train_txt)  # decompresses if the file ends in .gz
    tokenizer = utils.load_tokenizer(model_path)
    num_toks = utils.num_tokens(text, tokenizer, verbose=True)
    assert num_toks > 1024, "Not enough tokens in text to fine tune"

    # create the training csv
    train_file = os.path.join(output_path, "train.csv")
    with open(train_file, mode="w", encoding="utf-8") as csv_file:
        fieldnames = ["text"]
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerow({"text": text})

    output_path_finetuned = os.path.join(output_path, "finetuned")

    # Keep gradient_accumulation_steps at 1 because setting it to 2 effectively doubles the batch
    # size, which gets tricky when batch sizes are small (ft_tokens would no longer be accurate).
    gradient_accumulation_steps = 1
    per_device_train_batch_size = 4

    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "")
    if len(cuda_visible_devices):
        print("os.environ(CUDA_VISIBLE_DEVICES)", cuda_visible_devices)
        del os.environ["CUDA_VISIBLE_DEVICES"]
        print("os.environ(CUDA_VISIBLE_DEVICES)", os.environ.get("CUDA_VISIBLE_DEVICES", ""))

    master_port = 29600  # DeepSpeed uses a port to synchronize during training; two jobs need different ports to run in parallel
    if type(gpu) in [list, tuple]:
        master_port += gpu[0]
        gpu = ",".join([str(g) for g in gpu])
    else:
        master_port += gpu

    gpu_string = f"--include=localhost:{gpu}"

    if num_gpus > 1:
        gpu_string = f"--num_nodes=1 --num_gpus={num_gpus}"
    # If gpu is passed in as a negative integer, it's the count of GPUs to use - a bit of a hack
    if isinstance(gpu, int) and gpu < 0:
        num_gpus = abs(gpu)
        gpu_string = f"--num_nodes=1 --num_gpus={num_gpus}"

    print("gpu_string", gpu_string)

    cmd = " ".join(
        [
            "deepspeed",
            f"--master_port={master_port}",
            gpu_string,
            # f'--include=localhost:{gpu}',
            # "--num_nodes=1",
            # f"--num_gpus={num_gpus}",
            "neo_train.py",
            f"--model_name_or_path={model_path}",
            f"--train_file={train_file}",
            f"--output_dir={output_path_finetuned}",
            "--overwrite_output_dir",
            "--ignore_data_skip",
            "--deepspeed",
            "ds_config_gptneo.json",
            "--save_strategy=no",  # ATK remove checkpointing for large datasets
            # pretty sure this is just the dataset cache
            "--overwrite_cache",
            # logging frequency
            "--logging_steps=5",
            "--do_train",
            "--report_to none",  # turns off report_to, e.g. WANDB
            "--fp16",
            f"--num_train_epochs={epochs}",
            # max_steps, if set to a positive value, would override num_train_epochs; it is the total number of gradient steps
            f"--per_device_train_batch_size={per_device_train_batch_size}",
            "--use_fast_tokenizer=False",
            f"--gradient_accumulation_steps={gradient_accumulation_steps}",
            "--learning_rate=5e-06",
            # warmup: linear increase up to learning_rate, then the LR schedule (linear decrease until max_steps) takes over
            "--warmup_steps=10",
        ]
    )

    utils.info(f"running command: {cmd}")
    print(f"Command to run:{cmd}")  # differs from what utils.info prints because utils.info truncates it
    # exit()
    res = subprocess.run(shlex.split(cmd), check=True)
    utils.info(str(res))


if __name__ == "__main__":
    Fire(fine_tune)
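
Because the entry point is wrapped in Fire, every keyword argument of fine_tune() can be set from the command line. A hypothetical invocation with placeholder paths and the smallest GPT-Neo model (the paths are illustrative, not files shipped in this commit):

python fine_tune.py \
    --train_txt=../data/125M_PAPER/iter_0/generated_sol_100.txt.gz \
    --output_dir=../outputs/ --subdir=125M_PAPER_iter_0 \
    --model_path=EleutherAI/gpt-neo-125M \
    --gpu=0 --num_gpus=1 --epochs=4 --seed=0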
