1717CUTOFF_INPUT = 1024
1818CUTOFF_OUTPUT = 1024
1919
20- # batch size 60, full cache, bfloat
21- prefill_bucket_size_to_s = {
22- 64 : 0.007696230011060834 ,
23- 128 : 0.011508351005613805 ,
24- 256 : 0.01721684739459306 ,
25- 512 : 0.03257157760672271 ,
26- 1024 : 0.08185497261583805 ,
27- }
28-
29- # batch size 96, full cache, quantized
30- prefill_bucket_size_to_s = {
31- 64 : 0.006911616190336645 ,
32- 128 : 0.011646182998083532 ,
33- 256 : 0.01875854718964547 ,
34- 512 : 0.0334438294172287 ,
35- 1024 : 0.0643601292045787 ,
36- }
37-
38- # batch size 96, rolling, bfloat
39- prefill_bucket_size_to_s = {
40- 64 : 0.007730783987790346 ,
41- 128 : 0.011515899002552033 ,
42- 256 : 0.01780580161139369 ,
43- 512 : 0.03115477201063186 ,
44- 1024 : 0.07443338260054588 ,
45- }
46-
47- # batch size 160, rolling, quantized
48- prefill_bucket_size_to_s = {
49- 64 : 0.006821704190224409 ,
50- 128 : 0.01175499300006777 ,
51- 256 : 0.018776051187887787 ,
52- 512 : 0.03392685519065708 ,
53- 1024 : 0.06476318498607725 ,
54- }
55-
56- prefill_bucket_size_to_ms = {
57- k : p * 1000 for k , p in prefill_bucket_size_to_s .items ()
58- }
59-
60- # batch size 60, full cache, bfloat
61- SYSTEM_TIME_PER_DECODE_TOKEN_MS = 26.55 / 60
62-
63- # batch size 96, full cache, quantized
64- SYSTEM_TIME_PER_DECODE_TOKEN_MS = 26.0 / 96
65-
66- # batch size 96, rolling, bfloat
67- SYSTEM_TIME_PER_DECODE_TOKEN_MS = 28.18 / 96
68-
69- # batch size 160, rolling, quantized
70- SYSTEM_TIME_PER_DECODE_TOKEN_MS = 30 / 160
71-
7220
7321# pylint: disable-next=all
74- def do_simulation (prefill_bucket_size_to_ms , system_time_per_decode_token_ms ):
22+ def do_simulation (
23+ sharegpt_path , prefill_bucket_size_to_ms , system_time_per_decode_token_ms
24+ ):
7525 def next_power_of_2 (x ):
7626 return 1 if x == 0 else 2 ** (x - 1 ).bit_length ()
7727
@@ -82,10 +32,9 @@ def tokens_in_input_str(s):
8232
8333 convo_numbers = []
8434 # Please update with your own data file path
85- loaded_share_gpt = json .load (
86- # pylint: disable-next=all
87- open ("~/data/ShareGPT_V3_unfiltered_cleaned_split.json" , "r" )
88- )
35+
36+ with open (sharegpt_path , "r" , encoding = "utf-8" ) as f :
37+ loaded_share_gpt = json .load (f )
8938 for example in loaded_share_gpt :
9039 if len (example ["conversations" ]) < 2 :
9140 continue
0 commit comments