|
14 | 14 | "model_size_gb": 47.98, |
15 | 15 | "total_model_gb": 69.46 |
16 | 16 | }, |
17 | | - "recommendation": "No override PARAMS needed. \n\nModel fits well within the allowed compute shape (69.5GB used / 1128.0GB allowed)." |
| 17 | + "recommendation": "No override PARAMS needed. \n\nModel fits well within the allowed compute shape (69.5GB used / 96.0GB allowed)." |
18 | 18 | } |
19 | 19 | ], |
20 | 20 | "shape_details": { |
21 | 21 | "available": false, |
22 | 22 | "core_count": null, |
23 | 23 | "gpu_specs": { |
24 | | - "gpu_count": 8, |
25 | | - "gpu_memory_in_gbs": 1128, |
26 | | - "gpu_type": "H200", |
| 24 | + "gpu_count": 4, |
| 25 | + "gpu_memory_in_gbs": 96, |
| 26 | + "gpu_type": "A10", |
27 | 27 | "quantization": [ |
28 | 28 | "awq", |
29 | 29 | "gptq", |
30 | 30 | "marlin", |
31 | | - "fp8", |
32 | 31 | "int8", |
33 | 32 | "bitblas", |
34 | 33 | "aqlm", |
|
37 | 36 | "gguf" |
38 | 37 | ], |
39 | 38 | "ranking": { |
40 | | - "cost": 100, |
41 | | - "performance": 110 |
| 39 | + "cost": 50, |
| 40 | + "performance": 50 |
42 | 41 | } |
43 | 42 | }, |
44 | 43 | "memory_in_gbs": null, |
45 | | - "name": "BM.GPU.H200.8", |
| 44 | + "name": "BM.GPU.A10.4", |
46 | 45 | "shape_series": "GPU" |
47 | 46 | } |
48 | 47 | }, |
|
59 | 58 | "model_size_gb": 47.98, |
60 | 59 | "total_model_gb": 69.46 |
61 | 60 | }, |
62 | | - "recommendation": "No override PARAMS needed. \n\nModel fits well within the allowed compute shape (69.5GB used / 1536.0GB allowed)." |
| 61 | + "recommendation": "No override PARAMS needed. \n\nModel fits well within the allowed compute shape (69.5GB used / 1128.0GB allowed)." |
63 | 62 | } |
64 | 63 | ], |
65 | 64 | "shape_details": { |
66 | 65 | "available": false, |
67 | 66 | "core_count": null, |
68 | 67 | "gpu_specs": { |
69 | 68 | "gpu_count": 8, |
70 | | - "gpu_memory_in_gbs": 1536, |
71 | | - "gpu_type": "MI300X", |
| 69 | + "gpu_memory_in_gbs": 1128, |
| 70 | + "gpu_type": "H200", |
72 | 71 | "quantization": [ |
| 72 | + "awq", |
| 73 | + "gptq", |
| 74 | + "marlin", |
73 | 75 | "fp8", |
| 76 | + "int8", |
| 77 | + "bitblas", |
| 78 | + "aqlm", |
| 79 | + "bitsandbytes", |
| 80 | + "deepspeedfp", |
74 | 81 | "gguf" |
75 | 82 | ], |
76 | 83 | "ranking": { |
77 | | - "cost": 90, |
78 | | - "performance": 90 |
| 84 | + "cost": 100, |
| 85 | + "performance": 110 |
79 | 86 | } |
80 | 87 | }, |
81 | 88 | "memory_in_gbs": null, |
82 | | - "name": "BM.GPU.MI300X.8", |
| 89 | + "name": "BM.GPU.H200.8", |
83 | 90 | "shape_series": "GPU" |
84 | 91 | } |
85 | 92 | }, |
|
177 | 184 | "configurations": [ |
178 | 185 | { |
179 | 186 | "deployment_params": { |
180 | | - "max_model_len": 32768, |
181 | | - "params": "--max-model-len 32768 --quantization bitsandbytes --load-format bitsandbytes", |
182 | | - "quantization": "4bit" |
| 187 | + "max_model_len": 131072, |
| 188 | + "params": "", |
| 189 | + "quantization": "bfloat16" |
183 | 190 | }, |
184 | 191 | "model_details": { |
185 | | - "kv_cache_size_gb": 5.37, |
186 | | - "model_size_gb": 12.0, |
187 | | - "total_model_gb": 17.36 |
| 192 | + "kv_cache_size_gb": 21.47, |
| 193 | + "model_size_gb": 47.98, |
| 194 | + "total_model_gb": 69.46 |
188 | 195 | }, |
189 | | - "recommendation": "No override PARAMS needed. \n\nModel fits well within the allowed compute shape (17.4GB used / 24.0GB allowed)." |
| 196 | + "recommendation": "No override PARAMS needed. \n\nModel fits well within the allowed compute shape (69.5GB used / 1536.0GB allowed)." |
190 | 197 | } |
191 | 198 | ], |
192 | 199 | "shape_details": { |
193 | 200 | "available": false, |
194 | 201 | "core_count": null, |
195 | 202 | "gpu_specs": { |
196 | | - "gpu_count": 1, |
197 | | - "gpu_memory_in_gbs": 24, |
198 | | - "gpu_type": "A10", |
| 203 | + "gpu_count": 8, |
| 204 | + "gpu_memory_in_gbs": 1536, |
| 205 | + "gpu_type": "MI300X", |
199 | 206 | "quantization": [ |
200 | | - "awq", |
201 | | - "gptq", |
202 | | - "marlin", |
203 | | - "int8", |
204 | | - "bitblas", |
205 | | - "aqlm", |
206 | | - "bitsandbytes", |
207 | | - "deepspeedfp", |
| 207 | + "fp8", |
208 | 208 | "gguf" |
209 | 209 | ], |
210 | 210 | "ranking": { |
211 | | - "cost": 20, |
212 | | - "performance": 30 |
| 211 | + "cost": 90, |
| 212 | + "performance": 90 |
213 | 213 | } |
214 | 214 | }, |
215 | 215 | "memory_in_gbs": null, |
216 | | - "name": "VM.GPU.A10.1", |
| 216 | + "name": "BM.GPU.MI300X.8", |
217 | 217 | "shape_series": "GPU" |
218 | 218 | } |
219 | 219 | }, |
220 | 220 | { |
221 | 221 | "configurations": [ |
222 | 222 | { |
223 | 223 | "deployment_params": { |
224 | | - "max_model_len": 131072, |
225 | | - "params": " --quantization bitsandbytes --load-format bitsandbytes", |
| 224 | + "max_model_len": 32768, |
| 225 | + "params": "--max-model-len 32768 --quantization bitsandbytes --load-format bitsandbytes", |
226 | 226 | "quantization": "4bit" |
227 | 227 | }, |
228 | 228 | "model_details": { |
229 | | - "kv_cache_size_gb": 21.47, |
| 229 | + "kv_cache_size_gb": 5.37, |
230 | 230 | "model_size_gb": 12.0, |
231 | | - "total_model_gb": 33.47 |
| 231 | + "total_model_gb": 17.36 |
232 | 232 | }, |
233 | | - "recommendation": "No override PARAMS needed. \n\nModel fits well within the allowed compute shape (33.5GB used / 48.0GB allowed)." |
| 233 | + "recommendation": "No override PARAMS needed. \n\nModel fits well within the allowed compute shape (17.4GB used / 24.0GB allowed)." |
234 | 234 | } |
235 | 235 | ], |
236 | 236 | "shape_details": { |
237 | 237 | "available": false, |
238 | 238 | "core_count": null, |
239 | 239 | "gpu_specs": { |
240 | | - "gpu_count": 2, |
241 | | - "gpu_memory_in_gbs": 48, |
| 240 | + "gpu_count": 1, |
| 241 | + "gpu_memory_in_gbs": 24, |
242 | 242 | "gpu_type": "A10", |
243 | 243 | "quantization": [ |
244 | 244 | "awq", |
|
252 | 252 | "gguf" |
253 | 253 | ], |
254 | 254 | "ranking": { |
255 | | - "cost": 40, |
256 | | - "performance": 40 |
| 255 | + "cost": 20, |
| 256 | + "performance": 30 |
257 | 257 | } |
258 | 258 | }, |
259 | 259 | "memory_in_gbs": null, |
260 | | - "name": "VM.GPU.A10.2", |
| 260 | + "name": "VM.GPU.A10.1", |
261 | 261 | "shape_series": "GPU" |
262 | 262 | } |
263 | 263 | }, |
|
266 | 266 | { |
267 | 267 | "deployment_params": { |
268 | 268 | "max_model_len": 131072, |
269 | | - "params": "", |
270 | | - "quantization": "bfloat16" |
| 269 | + "params": "--quantization bitsandbytes --load-format bitsandbytes", |
| 270 | + "quantization": "4bit" |
271 | 271 | }, |
272 | 272 | "model_details": { |
273 | 273 | "kv_cache_size_gb": 21.47, |
274 | | - "model_size_gb": 47.98, |
275 | | - "total_model_gb": 69.46 |
| 274 | + "model_size_gb": 12.0, |
| 275 | + "total_model_gb": 33.47 |
276 | 276 | }, |
277 | | - "recommendation": "No override PARAMS needed. \n\nModel fits well within the allowed compute shape (69.5GB used / 96.0GB allowed)." |
| 277 | + "recommendation": "No override PARAMS needed. \n\nModel fits well within the allowed compute shape (33.5GB used / 48.0GB allowed)." |
278 | 278 | } |
279 | 279 | ], |
280 | 280 | "shape_details": { |
281 | 281 | "available": false, |
282 | 282 | "core_count": null, |
283 | 283 | "gpu_specs": { |
284 | | - "gpu_count": 4, |
285 | | - "gpu_memory_in_gbs": 96, |
| 284 | + "gpu_count": 2, |
| 285 | + "gpu_memory_in_gbs": 48, |
286 | 286 | "gpu_type": "A10", |
287 | 287 | "quantization": [ |
288 | 288 | "awq", |
|
296 | 296 | "gguf" |
297 | 297 | ], |
298 | 298 | "ranking": { |
299 | | - "cost": 50, |
300 | | - "performance": 50 |
| 299 | + "cost": 40, |
| 300 | + "performance": 40 |
301 | 301 | } |
302 | 302 | }, |
303 | 303 | "memory_in_gbs": null, |
304 | | - "name": "BM.GPU.A10.4", |
| 304 | + "name": "VM.GPU.A10.2", |
305 | 305 | "shape_series": "GPU" |
306 | 306 | } |
307 | 307 | } |
|
0 commit comments