Commit e5bfef2

add fmbench llama2-7b config

1 parent 2517c6a commit e5bfef2

1 file changed: +181 −0

@@ -0,0 +1,181 @@
general:
  name: "llama2-7b-v1"
  model_name: "Llama2-7b"

# AWS and SageMaker settings
aws:
  region: {region}
  # uncomment and set the Role ARN if not running on SageMaker
  sagemaker_execution_role: {role_arn}
  ## these are the buckets/resources you will create in your account below:
  bucket: {write_bucket} ## add the name of your desired bucket

## WRITE BUCKET -- results, data, metrics, endpoint.json and payloads are written to these directories in the bucket above
dir_paths:
  data_prefix: data ## add the prefix for all your data management/storage
  prompts_prefix: prompts
  all_prompts_file: all_prompts.csv
  metrics_dir: metrics
  models_dir: models
  metadata_dir: metadata ## add a file here to dynamically track the metrics dir

## READ BUCKET -- scripts, source data and the tokenizer are read from a separate S3 bucket, segregating reads from writes
s3_read_data:
  read_bucket: {read_bucket}
  scripts_prefix: scripts ## add your own scripts in case you are using anything that is not on JumpStart
  script_files:
  - hf_token.txt ## add the script files you keep in S3 (including inference files and serving stacks, if any)
  source_data_prefix: source_data ## add a source_data folder to store your raw data at an S3 path configured by you
  source_data_files:
  # - rajpurkar/squad_v2.jsonl
  - 2wikimqa_e.jsonl
  - 2wikimqa.jsonl
  - hotpotqa_e.jsonl
  - hotpotqa.jsonl
  - narrativeqa.jsonl
  - triviaqa_e.jsonl
  - triviaqa.jsonl
  tokenizer_prefix: tokenizer ## add the tokenizer.json and config.json for your specific tokenizer type
  prompt_template_dir: prompt_template
  prompt_template_file: prompt_template_llama2.txt ## add your desired prompt template type

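The read bucket has to be populated before a run: the hf_token.txt listed under script_files must exist under the scripts prefix, and the dataset files under source_data. A minimal sketch of that one-off setup with boto3 (the bucket name stands in for {read_bucket}; this script is illustrative, not part of the commit):

import boto3

s3 = boto3.client("s3")

# Hypothetical setup step: stage the Hugging Face token and one dataset
# under the prefixes configured above.
s3.upload_file("hf_token.txt", "my-read-bucket", "scripts/hf_token.txt")
s3.upload_file("2wikimqa.jsonl", "my-read-bucket", "source_data/2wikimqa.jsonl")
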
## section that enables the container to run notebooks and python scripts automatically
run_steps:
  0_setup.ipynb: yes
  1_generate_data.ipynb: yes
  2_deploy_model.ipynb: yes
  3_run_inference.ipynb: yes
  4_model_metric_analysis.ipynb: yes
  5_cleanup.ipynb: yes

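These flags suggest a driver that runs each step in order and skips disabled ones. A minimal sketch of such a driver, assuming PyYAML and jupyter nbconvert; the config file name and the whole script are assumptions, not part of this commit:

import subprocess

import yaml  # PyYAML

with open("config-llama2-7b.yml") as f:  # file name assumed
    cfg = yaml.safe_load(f)

for notebook, enabled in cfg["run_steps"].items():
    if not enabled:  # PyYAML loads the bare `yes` above as boolean True
        continue
    # Execute the notebook in place and save the executed copy.
    subprocess.run(
        ["jupyter", "nbconvert", "--to", "notebook", "--execute",
         "--inplace", notebook],
        check=True,
    )
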
datasets:
  prompt_template_keys:
  - input
  - context
  filters:
  - language: en
    min_length_in_tokens: 1
    max_length_in_tokens: 500
    payload_file: payload_en_1-500.jsonl
  - language: en
    min_length_in_tokens: 500
    max_length_in_tokens: 1000
    payload_file: payload_en_500-1000.jsonl
  - language: en
    min_length_in_tokens: 1000
    max_length_in_tokens: 2000
    payload_file: payload_en_1000-2000.jsonl
  - language: en
    min_length_in_tokens: 2000
    max_length_in_tokens: 3000
    payload_file: payload_en_2000-3000.jsonl
  - language: en
    min_length_in_tokens: 3000
    max_length_in_tokens: 4000
    payload_file: payload_en_3000-4000.jsonl
  - language: en
    min_length_in_tokens: 305
    max_length_in_tokens: 3997
    payload_file: payload_en_305-3997.jsonl

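Each filter selects prompts whose token count falls in one band and routes them to the named payload file (the last band, 305-3997, spans nearly the full context window). A sketch of the bucketing rule, treating the bounds as a half-open interval (an assumption) and taking a count_tokens helper (hypothetical) built on the tokenizer from the read bucket:

# Hypothetical sketch of the filter logic above (first three bands shown).
filters = [
    {"min": 1,    "max": 500,  "file": "payload_en_1-500.jsonl"},
    {"min": 500,  "max": 1000, "file": "payload_en_500-1000.jsonl"},
    {"min": 1000, "max": 2000, "file": "payload_en_1000-2000.jsonl"},
]

def bucket_for(prompt: str, count_tokens) -> str | None:
    n = count_tokens(prompt)
    for band in filters:
        if band["min"] <= n < band["max"]:
            return band["file"]
    return None  # prompt falls outside every configured band
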
metrics:
  dataset_of_interest: en_1000-2000
  weights:
    price_per_tx_wt: 0.65
    latency_wt: 0.35

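A plausible reading of these weights (an assumption; this commit does not show the scoring code) is a weighted sum that trades cost per transaction against latency, with both normalized so lower is better:

# Assumed scoring rule: inputs normalized to [0, 1], lower is better.
price_per_tx_wt, latency_wt = 0.65, 0.35

def composite_score(norm_price_per_txn: float, norm_latency: float) -> float:
    return price_per_tx_wt * norm_price_per_txn + latency_wt * norm_latency

# e.g. a config at 40% of the worst price and 80% of the worst latency:
print(composite_score(0.40, 0.80))  # 0.54
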
pricing:
  ml.g5.xlarge: 1.006
  ml.g5.2xlarge: 1.212
  ml.g5.12xlarge: 7.09
  ml.g5.24xlarge: 10.18
  ml.g5.48xlarge: 20.36
  ml.inf2.24xlarge: 7.79
  ml.inf2.48xlarge: 15.58
  ml.p4d.24xlarge: 37.688

inference_parameters:
  do_sample: yes
  temperature: 0.1
  top_p: 0.92
  top_k: 120
  max_new_tokens: 100
  truncate: at-prompt-token-length

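These values follow the Hugging Face TGI generation-parameter names, so the request body that sagemaker_predictor.py presumably builds would look like the sketch below. The prompt is illustrative; note that TGI's own truncate parameter takes an integer token count, so the at-prompt-token-length setting above is presumably resolved by the harness before the request is sent.

import json

# Request body in the TGI format: prompt under "inputs",
# generation settings under "parameters".
body = json.dumps({
    "inputs": "What is Amazon SageMaker?",  # example prompt, not from the config
    "parameters": {
        "do_sample": True,
        "temperature": 0.1,
        "top_p": 0.92,
        "top_k": 120,
        "max_new_tokens": 100,
    },
})
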
# Model configurations for deploying llama2-7b on g5.xlarge and g5.2xlarge instances
experiments:
- name: llama2-7b-g5.xlarge-huggingface-pytorch-tgi-inference-2.0.1-tgi1.1.0
  model_id: meta-textgeneration-llama-2-7b-f
  model_version: "3.*"
  model_name: llama2-7b-f
  ep_name: llama-2-7b-g5xlarge
  instance_type: "ml.g5.xlarge"
  image_uri: '763104351884.dkr.ecr.{region}.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04'
  deploy: yes
  instance_count: 1
  deployment_script: jumpstart.py
  inference_script: sagemaker_predictor.py
  payload_files:
  - payload_en_1-500.jsonl
  - payload_en_500-1000.jsonl
  - payload_en_1000-2000.jsonl
  - payload_en_2000-3000.jsonl
  concurrency_levels:
  - 1
  - 2
  - 4
  accept_eula: true
  env:
    SAGEMAKER_PROGRAM: "inference.py"
    ENDPOINT_SERVER_TIMEOUT: "3600"
    MODEL_CACHE_ROOT: "/opt/ml/model"
    SAGEMAKER_ENV: "1"
    HF_MODEL_ID: "/opt/ml/model"
    MAX_INPUT_LENGTH: "4095"
    MAX_TOTAL_TOKENS: "4096"
    SM_NUM_GPUS: "1"
    SAGEMAKER_MODEL_SERVER_WORKERS: "1"

- name: llama2-7b-g5.2xlarge-huggingface-pytorch-tgi-inference-2.0.1-tgi1.1.0
  model_id: meta-textgeneration-llama-2-7b-f
  model_version: "3.*"
  model_name: llama2-7b-f
  ep_name: llama-2-7b-g5-2xlarge
  instance_type: "ml.g5.2xlarge"
  image_uri: '763104351884.dkr.ecr.{region}.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04'
  deploy: yes
  instance_count: 1
  deployment_script: jumpstart.py
  inference_script: sagemaker_predictor.py
  payload_files:
  - payload_en_1-500.jsonl
  - payload_en_500-1000.jsonl
  - payload_en_1000-2000.jsonl
  - payload_en_2000-3000.jsonl
  concurrency_levels:
  - 1
  - 2
  - 4
  accept_eula: true
  env:
    SAGEMAKER_PROGRAM: "inference.py"
    ENDPOINT_SERVER_TIMEOUT: "3600"
    MODEL_CACHE_ROOT: "/opt/ml/model"
    SAGEMAKER_ENV: "1"
    HF_MODEL_ID: "/opt/ml/model"
    MAX_INPUT_LENGTH: "4095"
    MAX_TOTAL_TOKENS: "4096"
    SM_NUM_GPUS: "1"
    SAGEMAKER_MODEL_SERVER_WORKERS: "1"

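Both experiments are exercised at concurrency 1, 2 and 4 against their endpoints. A minimal sketch of driving one concurrency level with boto3's sagemaker-runtime client (endpoint name taken from the first experiment's ep_name; the harness's real logic lives in sagemaker_predictor.py, which this commit does not show):

import concurrent.futures
import time

import boto3

runtime = boto3.client("sagemaker-runtime")

def invoke(body: str) -> float:
    """Send one request and return its latency in seconds."""
    start = time.perf_counter()
    runtime.invoke_endpoint(
        EndpointName="llama-2-7b-g5xlarge",  # ep_name of the first experiment
        ContentType="application/json",
        Body=body,
    )
    return time.perf_counter() - start

def run_at_concurrency(level: int, payloads: list[str]) -> list[float]:
    # Keep `level` requests in flight and collect per-request latencies.
    with concurrent.futures.ThreadPoolExecutor(max_workers=level) as pool:
        return list(pool.map(invoke, payloads))
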
report:
  per_inference_request_file: per_inference_request_results.csv
  all_metrics_file: all_metrics.csv
  txn_count_for_showing_cost: 10000
  v_shift_w_single_instance: 0.025
  v_shift_w_gt_one_instance: 0.025
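txn_count_for_showing_cost, combined with the pricing table above, implies cost is reported per 10,000 transactions. A worked example, assuming the pricing values are on-demand USD per instance-hour (which matches published SageMaker rates for these instance types):

# Assumed units: USD per instance-hour; throughput comes from the run itself.
price_per_hour = 1.006         # ml.g5.xlarge, from the pricing section
transactions_per_hour = 3600   # e.g. one request per second sustained

cost_per_txn = price_per_hour / transactions_per_hour
print(f"cost per 10,000 transactions: ${cost_per_txn * 10_000:.2f}")
# -> cost per 10,000 transactions: $2.79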
