@@ -46,7 +46,9 @@ NOTE: the above script will export PYTHONPATH, so sourcing will make it to take
## LLaMA
### Get official llama weights from meta-llama

- Following instructions here: https://github.com/meta-llama/llama#download
+ Follow the instructions here:
+ * Llama-2: https://github.com/meta-llama/llama#download
+ * Llama-3: https://github.com/meta-llama/llama3/#download

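For reference, the Llama-2 download flow from the linked repo looks roughly like the sketch below; the presigned URL comes from Meta's approval email, and the Llama-3 repo follows the same pattern (verify the exact steps against the upstream README):

```bash
# Rough sketch of the meta-llama download flow; verify against the linked README.
git clone https://github.com/meta-llama/llama
cd llama
./download.sh   # paste the presigned URL from Meta's approval email when prompted
```
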
After you have downloaded the weights, you will also have a `tokenizer.model` file;
this is the tokenizer that we will use.
@@ -68,7 +70,7 @@ Need to manually modify the `config.json` in the checkpoint folder to make it a
export input_ckpt_dir=Original llama weights directory
export output_ckpt_dir=The output directory
export quantize=True # whether to quantize
- export model_name="llama-2" # or "gemma"
+ export model_name="llama-3" # or "llama-2", "gemma"
python -m convert_checkpoints --model_name=$model_name --input_checkpoint_dir=$input_ckpt_dir --output_checkpoint_dir=$output_ckpt_dir --quantize=$quantize
```
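
For example, a hypothetical end-to-end invocation for a Llama-3 8b checkpoint (the directory paths below are made-up placeholders; substitute your own):

```bash
# Hypothetical paths; substitute the directories you actually used.
export input_ckpt_dir=/data/llama-3-8b/original
export output_ckpt_dir=/data/llama-3-8b/converted
export quantize=True
export model_name="llama-3"
python -m convert_checkpoints --model_name=$model_name --input_checkpoint_dir=$input_ckpt_dir --output_checkpoint_dir=$output_ckpt_dir --quantize=$quantize
```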
@@ -80,16 +82,20 @@ Set tokenizer path
export tokenizer_path=tokenizer model file path
```
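
Because the download step leaves `tokenizer.model` next to the original weights, one plausible value (an assumption about your layout, not a requirement) is:

```bash
# Assumes tokenizer.model sits in the downloaded weights directory.
export tokenizer_path=$input_ckpt_dir/tokenizer.model
```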

- ## Llama 7b
+ ## Llama-2 7b
```bash
- python run_interactive.py --size=7b --batch_size=128 --max_cache_length=2048 --quantize_weights=$quantize --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --sharding_config=default_shardings/$model_name.yaml
+ python run_interactive.py --size=7b --model_name=$model_name --batch_size=128 --max_cache_length=2048 --quantize_weights=$quantize --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --sharding_config=default_shardings/llama.yaml
```

- ## Llama 13b
+ ## Llama-2 13b
```bash
- python run_interactive.py --size=13b --batch_size=64 --max_cache_length=2048 --quantize_weights=$quantize --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --sharding_config=default_shardings/$model_name.yaml
+ python run_interactive.py --size=13b --model_name=$model_name --batch_size=64 --max_cache_length=2048 --quantize_weights=$quantize --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --sharding_config=default_shardings/llama.yaml
```

+ ## Llama-3 8b
+ ```bash
+ python run_interactive.py --size=8b --model_name=$model_name --batch_size=128 --max_cache_length=2048 --quantize_weights=$quantize --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --sharding_config=default_shardings/llama.yaml
+ ```

## Gemma 7b
```bash
@@ -101,7 +107,7 @@ python run_interactive.py --model_name=$model_name --size=7b --batch_size=64 --m
NOTE: the `--platform=tpu=8` flag needs to specify the number of TPU devices (4 for v4-8 and 8 for v5light-8).

```bash
- python run_server.py --param_size=7b --batch_size=128 --max_cache_length=2048 --quantize_weights=$quantize --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --platform=tpu=8 --model=$model_name
+ python run_server.py --param_size=7b --model_name=$model_name --batch_size=128 --max_cache_length=2048 --quantize_weights=$quantize --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --platform=tpu=8 --model=$model_name
```
Now you can send gRPC requests to it.
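
As a quick smoke test, you can probe the server with a generic gRPC client such as grpcurl; the port number and the availability of gRPC reflection are assumptions here, so check the `run_server.py` flags for the actual values:

```bash
# List the exposed services via gRPC reflection (assumed to be enabled).
# The port is an assumption; check run_server.py's flags for the real one.
grpcurl -plaintext localhost:9000 list
```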