#!/bin/bash
# All experiment settings - constant through the experiment run - passed to gen.sh and fine_tune.sh as needed
GPU=0                # which GPU to use
MODEL="125M"         # size of the model: 125M, 13B, 27B
EXPERIMENT=$MODEL"_PAPER" # name of the experiment directory under data/* and models/base-model/* where results are stored
TEST_LOCAL=1         # 0 means run gen/fine_tune remotely on the cluster, 1 means run gen/fine_tune locally
TARGET_NUM_FILES=1   # how many files to generate in each iteration before starting fine-tuning. A count of unique examples would have been better.
ITER_START=0         # inclusive iteration index to start at - creates iter_# under data & models each iteration. Continue a previous run by starting at its ITER_MAX.
ITER_MAX=5           # exclusive iteration index to stop at
EPOCHS_START=1       # inclusive epoch index to start at - continue a previous run by starting at its EPOCHS_MAX+1. Epoch 0 is the default model, so epochs start at 1.
EPOCHS_MAX=4         # inclusive epoch index to stop at
EPOCHS_PER_STEP=1    # how many epochs through the data to do in each step
TRAIN_INCREMENTAL=0  # 1 means train only on data from the latest iteration, starting from the last finetuned model - otherwise start from scratch and use all the data generated so far
TRAIN_BOOST=0        # initial generation of data from the default model is slow - 1 means look in 125M_RL_ALL and reuse previously generated initial data to bootstrap
PASS_AT_K=100        # number of solve attempts (K) per problem used to compute Pass@K
LINE_LOG_K=11        # how many lines from each solve results file to copy into the summary log

echo babysit args: $# $0 $1 $2 $3 $4

if (( $# != 1 ))
then
  echo "babysit.sh only takes 1 argument, unless called by another script to initialize the configuration variables"
  # return ends a sourced invocation cleanly; exit covers direct execution, where return is not allowed
  return 0 2>/dev/null || exit 1
fi
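# Usage sketch (inferred from the argument check above):
#   ./babysit.sh 0        - run the full generate / fine-tune loop on GPU 0
#   source babysit.sh     - from another script, with no arguments, just to pick up the settings above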

if (( $# >= 1 ))
then
  GPU=$1
fi

echo babysit GPU $GPU

for (( iteration=$ITER_START; iteration<$ITER_MAX; iteration++ ))
do
  FULLNAME="${EXPERIMENT}---${iteration}"
  echo FULLNAME $FULLNAME
  export FULLNAME # needed to pass the variable to the yaml job
  DATAPATH=data/${EXPERIMENT}/iter_$iteration
  echo DATAPATH $DATAPATH
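  # With the settings above this expands to e.g. data/125M_PAPER/iter_0 on the first iteration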

  if (( $TEST_LOCAL > 0 ))
  then
    count=$(ls -lt ../${DATAPATH} | grep json | wc -l)
  else
    count=$(amlt sto list ${DATAPATH} | grep json | wc -l)
  fi
  echo count $count

  # Instead of the file count we might want to check whether the amount of data from preprocess is sufficient,
  # and call gen to generate more if it is not.
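  # A rough sketch of that alternative check, assuming each generated .json file is JSON-lines with one
  # example per line; the threshold would be a new setting, and nothing below depends on this count yet.
  example_count=$(cat ../${DATAPATH}/*.json 2>/dev/null | wc -l)
  echo approximate example_count $example_count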

  if (( $count > 0 ))
  then
    echo "$FULLNAME has already been started"
    echo "You are resuming at iteration $iteration"
    echo "You already have $count files of data this iteration"
  else
    echo "$FULLNAME is starting generation for iteration $iteration"
  fi

  if (( $count < $TARGET_NUM_FILES ))
  then
    if (( $TEST_LOCAL > 0 ))
    then
      # ./gen.sh $GPU 2560 100 $FULLNAME -1
      # 2.7B 384 100 runs ~10 hours
      # 2.7B 160 100 runs ~4.5 hours
      ./gen.sh $GPU 256000 100 $FULLNAME -1
    else
      amlt run hyper_gen_octows.yaml $FULLNAME -d "$FULLNAME"
      exit
    fi
  fi

  # Running locally the generation is already finished; launched on the cluster we have to poll and wait for it
  for (( poll=0; poll<500; poll++ ))
  do
    if (( $TEST_LOCAL > 0 ))
    then
      count=$(ls -lt ../${DATAPATH} | grep json | wc -l)
    else
      count=$(amlt sto list ${DATAPATH} | grep json | wc -l)
    fi

    echo "gen wait - Iteration: $iteration, Poll: $poll, Count: $count"

    if (( $count >= $TARGET_NUM_FILES ))
    then
      echo "Finished generation iteration $iteration after $poll polls"
      break
    fi
    sleep 3m
  done

  # Start a fine-tune job
  if (( $TEST_LOCAL > 0 ))
  then
    ./fine_tune.sh $GPU $FULLNAME
  else
    # Pass environment variable FULLNAME to the yaml job
    amlt run amlt_octo.yaml $FULLNAME -d "$FULLNAME"
    exit
  fi

  # On the cluster we need to wait for the fine-tune job to finish; run locally it is already done.
  # Check that the output files from running solve have been created for the last epoch of training.

  MODELPATH=models/gpt-neo-$MODEL/${EXPERIMENT}/iter_$iteration
  SOLVE_PATH=$MODELPATH/epoch_${EPOCHS_MAX}/solve_${PASS_AT_K}
  echo babysit.sh SOLVE_PATH $SOLVE_PATH
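  # With the settings above this expands to e.g. models/gpt-neo-125M/125M_PAPER/iter_0/epoch_4/solve_100 on the first iteration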

  for (( poll=0; poll<500; poll++ ))
  do
    if (( $TEST_LOCAL > 0 ))
    then
      count=$(ls -lt ../$SOLVE_PATH | grep json | wc -l)
    else
      count=$(amlt sto list $SOLVE_PATH | grep json | wc -l)
    fi

    echo "fine_tune wait - Iteration: $iteration, Poll: $poll, Count: $count"

    if (( $count >= 1 ))
    then
      echo "Finished fine_tune iteration $iteration after $poll polls"
      break
    fi
    sleep 3m
  done

done

# Pull all the results into one log file so they are easier to review

if [[ -z "${AMLT_DATA_DIR}" ]]
then
  # Running locally on torch2020 the AMLT environment variables are not defined, so set them up here
  AMLT_DATA_DIR=../data
else
  # On the remote cluster we do not have access to the log files - maybe an amlt sto download could produce the summary below?
  exit
fi

BASE_MODEL_PATH=$AMLT_DATA_DIR/../models/gpt-neo-$MODEL
LOG_FILE=$BASE_MODEL_PATH/$EXPERIMENT/solve_${PASS_AT_K}.txt
echo solve LOG_FILE for babysit.sh is $LOG_FILE
rm -f $LOG_FILE

for (( iteration=$ITER_START; iteration<$ITER_MAX; iteration++ ))
do
  for (( epochs=$EPOCHS_START; epochs<=$EPOCHS_MAX; epochs++ ))
  do
    EPOCH_NAME=epoch_$epochs
    STEP_PATH=$BASE_MODEL_PATH/$EXPERIMENT/iter_$iteration/$EPOCH_NAME
    MODEL_PATH=$STEP_PATH/finetuned
    echo iteration $iteration epoch $epochs >> $LOG_FILE
    head -n $LINE_LOG_K $STEP_PATH/solve_${PASS_AT_K}/results.json >> $LOG_FILE
  done
done

cat $LOG_FILE