@@ -133,93 +133,13 @@ Our API aims to be compatible with the `sharded_state` load format in vLLM.
Thus, for first-time users, you have to load the model from another backend and then convert it to the ServerlessLLM format.

1. Download the model from HuggingFace and save it in the ServerlessLLM format:
-```python
-import os
-import shutil
-from typing import Optional
-
-class VllmModelDownloader:
-    def __init__(self):
-        pass
-
-    def download_vllm_model(
-        self,
-        model_name: str,
-        torch_dtype: str,
-        tensor_parallel_size: int = 1,
-        pattern: Optional[str] = None,
-        max_size: Optional[int] = None,
-    ):
-        import gc
-        import shutil
-        from tempfile import TemporaryDirectory
-
-        import torch
-        from huggingface_hub import snapshot_download
-        from vllm import LLM
-        from vllm.config import LoadFormat
-
-        # set the model storage path
-        storage_path = os.getenv("STORAGE_PATH", "./models")
-
-        def _run_writer(input_dir, model_name):
-            # load models from the input directory
-            llm_writer = LLM(
-                model=input_dir,
-                download_dir=input_dir,
-                dtype=torch_dtype,
-                tensor_parallel_size=tensor_parallel_size,
-                num_gpu_blocks_override=1,
-                enforce_eager=True,
-                max_model_len=1,
-            )
-            model_path = os.path.join(storage_path, model_name)
-            model_executer = llm_writer.llm_engine.model_executor
-            # save the models in the ServerlessLLM format
-            model_executer.save_serverless_llm_state(
-                path=model_path, pattern=pattern, max_size=max_size
-            )
-            for file in os.listdir(input_dir):
-                # Copy the metadata files into the output directory
-                if os.path.splitext(file)[1] not in (
-                    ".bin",
-                    ".pt",
-                    ".safetensors",
-                ):
-                    src_path = os.path.join(input_dir, file)
-                    dest_path = os.path.join(model_path, file)
-                    if os.path.isdir(src_path):
-                        shutil.copytree(src_path, dest_path)
-                    else:
-                        shutil.copy(src_path, dest_path)
-            del model_executer
-            del llm_writer
-            gc.collect()
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-                torch.cuda.synchronize()
-
-        try:
-            with TemporaryDirectory() as cache_dir:
-                # download from huggingface
-                input_dir = snapshot_download(
-                    model_name,
-                    cache_dir=cache_dir,
-                    allow_patterns=["*.safetensors", "*.bin", "*.json", "*.txt"],
-                )
-                _run_writer(input_dir, model_name)
-        except Exception as e:
-            print(f"An error occurred while saving the model: {e}")
-            # remove the output dir
-            shutil.rmtree(os.path.join(storage_path, model_name))
-            raise RuntimeError(
-                f"Failed to save {model_name} for vllm backend: {e}"
-            )
-
-downloader = VllmModelDownloader()
-downloader.download_vllm_model("facebook/opt-1.3b", "float16", 1)
+```bash
+python3 examples/sllm_store/save_vllm_model.py --model_name facebook/opt-1.3b --storage_path $PWD/models --tensor_parallel_size 1
+
```

+You can also load the model from a local path instead of downloading it over the network by passing the `--local_model_path` argument.
+
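For instance, a hypothetical invocation that converts a checkpoint already on disk might look like the following sketch; the local directory path is illustrative, and it assumes the script still accepts `--model_name` to name the saved checkpoint:

```bash
# Assumed usage: read the checkpoint from a local directory instead of downloading it
python3 examples/sllm_store/save_vllm_model.py \
  --model_name facebook/opt-1.3b \
  --local_model_path /path/to/local/opt-1.3b \
  --storage_path $PWD/models \
  --tensor_parallel_size 1
```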
After downloading the model, you can launch the checkpoint store server and load the model in vLLM through the `sllm` load format.

2. Launch the checkpoint store server in a separate process: