Commit dce56da

source and data for ICLR2023 Language Models can teach themselves to program better
1 parent 506099b commit dce56da

19 files changed: +2540 −0 lines
6 binary files changed (contents not shown): 27.2 MB, 25.6 MB, 27 MB, 39.5 MB, 47.9 MB, 46.6 MB

ICLR2023/src/babysit.sh

Lines changed: 161 additions & 0 deletions
@@ -0,0 +1,161 @@
#!/bin/bash
# All experiment settings - constant through the experiment run - passed to gen.sh and fine_tune.sh as needed
GPU=0                # which GPU to use
MODEL="125M"         # size of the model: 125M, 1.3B, 2.7B
EXPERIMENT=$MODEL"_PAPER"  # name of the experiment directory under data/* and models/base-model/* to store results
TEST_LOCAL=1         # 0 means run gen/fine_tune remotely on the cluster, 1 means run gen/fine_tune locally
TARGET_NUM_FILES=1   # how many files to generate in each iteration before starting fine-tuning. A count of unique examples would have been better.
ITER_START=0         # inclusive index to start processing at - creates iter_# under data & models at each iteration. Continue a previous run by starting at the previous ITER_MAX.
ITER_MAX=5           # exclusive index to stop processing iterations at
EPOCHS_START=1       # inclusive index of epochs to start processing at - continue a previous run by starting at the previous EPOCHS_MAX+1. The 0th epoch is the default model, so epochs start at 1.
EPOCHS_MAX=4         # inclusive index of epochs to stop processing at
EPOCHS_PER_STEP=1    # how many epochs through the data to do in each step
TRAIN_INCREMENTAL=0  # 1 means train only on data from the latest iteration, starting from the last fine-tuned model - otherwise start from scratch and use all the data generated
TRAIN_BOOST=0        # initial generation of data from the default model is slow - 1 means look in 125M_RL_ALL and reuse previously generated initial data to bootstrap
PASS_AT_K=100        # number of trials K per puzzle when computing Pass@K
LINE_LOG_K=11        # how many lines of each solve results file to copy into the summary log

echo babysit args: $# $0 $1 $2 $3 $4

if (( $# != 1 ))
then
    echo babysit.sh only takes 1 argument, unless called by another script to initialize configuration variables
    return
fi

if (( $# >= 1 ))
then
    GPU=$1
fi

echo babysit GPU $GPU

for (( iteration=$ITER_START; iteration<$ITER_MAX; iteration++ ))
do
    FULLNAME="${EXPERIMENT}---${iteration}"
    echo FULLNAME $FULLNAME
    export FULLNAME  # needed to pass the variable off to the yaml job
    DATAPATH=data/${EXPERIMENT}/iter_$iteration
    echo DATAPATH $DATAPATH

    if (( $TEST_LOCAL > 0 ))
    then
        count=`ls -lt ../${DATAPATH} | grep json | wc -l`
    else
        count=`amlt sto list ${DATAPATH} | grep json | wc -l`
    fi
    echo count $count

    # Instead of the file count we might want to check whether the amount of data from preprocess is sufficient.
    # If not, we call gen to generate more.

    if (( $count > 0 ))
    then
        echo "$FULLNAME has already been started"
        echo "You are resuming at iteration $iteration"
        echo "You already have $count files of data this iteration"
    else
        echo "$FULLNAME is starting generation for iteration $iteration"
    fi

    if (( $count < $TARGET_NUM_FILES ))
    then
        if (( $TEST_LOCAL > 0 ))
        then
            # ./gen.sh $GPU 2560 100 $FULLNAME -1
            # 2.7B 384 100 runs ~10 hours
            # 2.7B 160 100 runs ~4.5 hours
            ./gen.sh $GPU 256000 100 $FULLNAME -1
        else
            amlt run hyper_gen_octows.yaml $FULLNAME -d "$FULLNAME"
            exit
        fi
    fi

    # Running locally you are done at this point; launching on the cloud, you have to wait.
    for (( poll=0; poll<500; poll++ ))
    do
        if (( $TEST_LOCAL > 0 ))
        then
            count=`ls -lt ../${DATAPATH} | grep json | wc -l`
        else
            count=`amlt sto list ${DATAPATH} | grep json | wc -l`
        fi

        echo "gen wait - Iteration: $iteration, Poll: $poll, Count: $count"

        if (( $count >= $TARGET_NUM_FILES ))
        then
            echo "Finished generation iteration $iteration after $poll polls"
            break
        fi
        sleep 3m
    done

    # Start a fine-tune job
    if (( $TEST_LOCAL > 0 ))
    then
        ./fine_tune.sh $GPU $FULLNAME
    else
        # Pass environment variable FULLNAME to amlt.yaml
        amlt run amlt_octo.yaml $FULLNAME -d "$FULLNAME"
        exit
    fi

    # On the cluster we need to wait for the fine-tune job to finish; run locally it is already done.
    # Check that the log files from running solve have been created for the last epoch of training.

    MODELPATH=models/gpt-neo-$MODEL/${EXPERIMENT}/iter_$iteration
    SOLVE_PATH=$MODELPATH/"epoch_"$EPOCHS_MAX/"solve_"$PASS_AT_K
    echo babysit.sh SOLVE_PATH $SOLVE_PATH

    for (( poll=0; poll<500; poll++ ))
    do
        if (( $TEST_LOCAL > 0 ))
        then
            count=`ls -lt ../$SOLVE_PATH | grep json | wc -l`
        else
            count=`amlt sto list $SOLVE_PATH | grep json | wc -l`
        fi

        echo "fine_tune wait - Iteration: $iteration, Poll: $poll, Count: $count"

        if (( $count >= 1 ))
        then
            echo "Finished fine_tune iteration $iteration after $poll polls"
            break
        fi
        sleep 3m
    done

done

# Pull all the results into one log file so they are easier to look at

if [[ -z "${AMLT_DATA_DIR}" ]];
then
    # Running locally on torch2020 the AMLT environment variables are not defined, so set them up here
    AMLT_DATA_DIR=../data
else
    # On the remote we don't have access to the log files - maybe we could amlt sto download them to build the summary below?
    exit
fi

BASE_MODEL_PATH=$AMLT_DATA_DIR/../models/gpt-neo-$MODEL
LOG_FILE=$BASE_MODEL_PATH/$EXPERIMENT/"solve_"$PASS_AT_K".txt"
echo solve LOG_FILE for babysit.sh is $LOG_FILE
rm $LOG_FILE

for (( iteration=$ITER_START; iteration<$ITER_MAX; iteration++ ))
do
    for (( epochs=$EPOCHS_START; epochs<=$EPOCHS_MAX; epochs++ ))
    do
        EPOCH_NAME="epoch_"$epochs
        STEP_PATH=$BASE_MODEL_PATH/$EXPERIMENT/iter_$iteration/$EPOCH_NAME
        MODEL_PATH=$STEP_PATH/finetuned
        echo iteration $iteration epoch $epochs >> $LOG_FILE
        head -$LINE_LOG_K $STEP_PATH/"solve_"$PASS_AT_K/results.json >> $LOG_FILE
    done
done

cat $LOG_FILE
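
For orientation, a minimal sketch of a local run and of the directory layout the polling loops above watch. The exact paths depend on MODEL and EXPERIMENT (here the defaults 125M and 125M_PAPER); this is an assumption derived from the variables in the script, not documented repository usage.

# Minimal local run (assumes gen.sh and fine_tune.sh sit alongside babysit.sh);
# the single argument is the GPU index, and TEST_LOCAL=1 keeps everything on this machine.
./babysit.sh 0

# Hypothetical layout polled by the loops above, following the path variables in this script:
#   ../data/125M_PAPER/iter_0/*.json                                        <- generated puzzle/solution files
#   ../models/gpt-neo-125M/125M_PAPER/iter_0/epoch_4/solve_100/results.json <- solve results per epoch
#   ../models/gpt-neo-125M/125M_PAPER/solve_100.txt                         <- summary log written at the end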

ICLR2023/src/ds_config_gptneo.json

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": true,
        "cpu_offload": true
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
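
The "auto" entries are placeholders that the Hugging Face Trainer's DeepSpeed integration resolves from its own TrainingArguments when the config is passed via --deepspeed, as fine_tune.py does below. A sketch of the resulting launch with placeholder paths and a single local GPU, mirroring the command assembled in fine_tune.py:

deepspeed --master_port=29600 --include=localhost:0 neo_train.py \
    --model_name_or_path=EleutherAI/gpt-neo-125M \
    --train_file=../outputs/out/train.csv \
    --output_dir=../outputs/out/finetuned \
    --deepspeed ds_config_gptneo.json \
    --do_train --fp16 --num_train_epochs=4 \
    --per_device_train_batch_size=4 --gradient_accumulation_steps=1 \
    --learning_rate=5e-06 --warmup_steps=10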

ICLR2023/src/fine_tune.py

Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
from strictfire import StrictFire as Fire  # aborts early on invalid arguments
import os
import csv
import subprocess
import shlex
import random
import numpy as np
import torch
import utils

def fine_tune(
    train_txt="../data/generated_sol_100.txt",
    output_dir="../outputs/",
    subdir="out",
    model_path="EleutherAI/gpt-neo-2.7B",
    gpu=0,
    num_gpus=1,
    epochs=4,
    seed=0,
):
    """
    Fine-tune the model on the puzzles in the train_txt file and save the results to output_dir/subdir.

    train_txt: the (possibly gzipped) file containing the text to fine-tune on (default: ../data/generated_sol_100.txt)
    output_dir: the root directory for outputs (default "../outputs/")
    subdir: the subdirectory to save the results to (default "out")
    model_path: the path to the model to fine-tune (default "EleutherAI/gpt-neo-2.7B")
    gpu: which GPU(s) to use, e.g. 0,1 (default 0)
    num_gpus: how many GPUs to train on (default 1)
    epochs: how many epochs to train for (default 4)
    seed: the random seed to use; not sure if this affects fine-tuning (default 0)
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # create the output dir if necessary
    output_path = os.path.join(output_dir, subdir)
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    text = utils.load_text_file(train_txt)  # decompresses if the file ends in .gz
    tokenizer = utils.load_tokenizer(model_path)
    num_toks = utils.num_tokens(text, tokenizer, verbose=True)
    assert num_toks > 1024, "Not enough tokens in text to fine tune"

    # create the training csv
    train_file = os.path.join(output_path, "train.csv")
    with open(train_file, mode="w", encoding="utf-8") as csv_file:
        fieldnames = ["text"]
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerow({"text": text})

    output_path_finetuned = os.path.join(output_path, "finetuned")

    # Keep gradient_accumulation_steps at 1 because setting it to 2 effectively doubles the batch
    # size, which gets tricky when batch sizes are small (ft_tokens would no longer be accurate).
    gradient_accumulation_steps = 1
    per_device_train_batch_size = 4

    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "")
    if len(cuda_visible_devices):
        print("os.environ(CUDA_VISIBLE_DEVICES)", cuda_visible_devices)
        del os.environ["CUDA_VISIBLE_DEVICES"]
        print("os.environ(CUDA_VISIBLE_DEVICES)", os.environ.get("CUDA_VISIBLE_DEVICES", ""))

    master_port = 29600  # DeepSpeed uses a port to synchronize during training; two jobs need different ports to run in parallel
    if type(gpu) in [list, tuple]:
        master_port += gpu[0]
        gpu = ",".join([str(g) for g in gpu])
    else:
        master_port += gpu

    gpu_string = f"--include=localhost:{gpu}"

    if num_gpus > 1:
        gpu_string = f"--num_nodes=1 --num_gpus={num_gpus}"
    # If gpu is passed in as a negative integer, it's the count of GPUs to use - a bit of a hack
    if isinstance(gpu, int) and gpu < 0:
        num_gpus = abs(gpu)
        gpu_string = f"--num_nodes=1 --num_gpus={num_gpus}"

    print("gpu_string", gpu_string)

    cmd = " ".join(
        [
            "deepspeed",
            f"--master_port={master_port}",
            gpu_string,
            # f'--include=localhost:{gpu}',
            # "--num_nodes=1",
            # f"--num_gpus={num_gpus}",
            "neo_train.py",
            f"--model_name_or_path={model_path}",
            f"--train_file={train_file}",
            f"--output_dir={output_path_finetuned}",
            "--overwrite_output_dir",
            "--ignore_data_skip",
            "--deepspeed",
            "ds_config_gptneo.json",
            "--save_strategy=no",  # ATK remove checkpointing for large datasets
            # pretty sure this is just the dataset cache
            "--overwrite_cache",
            # logging frequency
            "--logging_steps=5",
            "--do_train",
            "--report_to none",  # turns off report_to, e.g. WANDB
            "--fp16",
            f"--num_train_epochs={epochs}",
            # max_steps, if set to a positive value, would override num_train_epochs; it is the total number of gradient steps
            f"--per_device_train_batch_size={per_device_train_batch_size}",
            "--use_fast_tokenizer=False",
            f"--gradient_accumulation_steps={gradient_accumulation_steps}",
            "--learning_rate=5e-06",
            # warmup: linear increase up to learning_rate, then the LR schedule (linear decrease until max_steps) takes over
            "--warmup_steps=10",
        ]
    )

    utils.info(f"running command: {cmd}")
    print(f"Command to run:{cmd}")  # differs from what utils.info prints because utils.info truncates it
    # exit()
    res = subprocess.run(shlex.split(cmd), check=True)
    utils.info(str(res))


if __name__ == "__main__":
    Fire(fine_tune)
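
Because the entry point is wrapped in Fire, every keyword argument of fine_tune() can be set from the command line. A hypothetical invocation with placeholder paths and the smallest GPT-Neo model (the paths are illustrative, not files shipped in this commit):

python fine_tune.py \
    --train_txt=../data/125M_PAPER/iter_0/generated_sol_100.txt.gz \
    --output_dir=../outputs/ --subdir=125M_PAPER_iter_0 \
    --model_path=EleutherAI/gpt-neo-125M \
    --gpu=0 --num_gpus=1 --epochs=4 --seed=0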
