From acb14ed40fa69a627024046707a53e9a50ef7fef Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Fri, 7 Nov 2025 18:21:26 +0800 Subject: [PATCH 1/7] no message --- swift/cli/pt.py | 5 +++++ swift/cli/rlhf.py | 5 +++++ swift/cli/sft.py | 2 ++ swift/cli/utils.py | 23 +++++++++++++++++++++++ swift/utils/env.py | 23 +++++++++++++++++++---- 5 files changed, 54 insertions(+), 4 deletions(-) create mode 100644 swift/cli/utils.py diff --git a/swift/cli/pt.py b/swift/cli/pt.py index 1ca2aabd8a..414b842c37 100644 --- a/swift/cli/pt.py +++ b/swift/cli/pt.py @@ -2,4 +2,9 @@ from swift.llm import pt_main if __name__ == '__main__': + from swift.cli.utils import fix_ppu + fix_ppu() + from swift.ray import try_init_ray + try_init_ray() + from swift.llm import pt_main pt_main() diff --git a/swift/cli/rlhf.py b/swift/cli/rlhf.py index 4f0fd6a0ab..c86e15f44b 100644 --- a/swift/cli/rlhf.py +++ b/swift/cli/rlhf.py @@ -2,4 +2,9 @@ from swift.llm import rlhf_main if __name__ == '__main__': + from swift.cli.utils import fix_ppu + fix_ppu() + from swift.ray import try_init_ray + try_init_ray() + from swift.llm import rlhf_main rlhf_main() diff --git a/swift/cli/sft.py b/swift/cli/sft.py index 4e780e141b..7cfa0db777 100644 --- a/swift/cli/sft.py +++ b/swift/cli/sft.py @@ -11,6 +11,8 @@ def try_init_unsloth(): if __name__ == '__main__': + from swift.cli.utils import fix_ppu + fix_ppu() try_init_unsloth() from swift.ray import try_init_ray try_init_ray() diff --git a/swift/cli/utils.py b/swift/cli/utils.py new file mode 100644 index 0000000000..04ee25c11d --- /dev/null +++ b/swift/cli/utils.py @@ -0,0 +1,23 @@ +def is_ppu(): + import subprocess + result = subprocess.run( + ['nvidia-smi'], + capture_output=True, + text=True, + timeout=10 + ) + + if result.returncode == 0: + output = result.stdout + return 'PPU-' in output + else: + return False + + +def fix_ppu(): + if is_ppu(): + import os + visible_devices = os.environ['CUDA_VISIBLE_DEVICES'].split(',') + visible_device = 
visible_devices[int(os.environ['LOCAL_RANK'])] + os.environ['CUDA_VISIBLE_DEVICES'] = str(visible_device) + os.environ['LOCAL_RANK'] = '0' \ No newline at end of file diff --git a/swift/utils/env.py b/swift/utils/env.py index 3553492e97..03b789b167 100644 --- a/swift/utils/env.py +++ b/swift/utils/env.py @@ -61,15 +61,30 @@ def is_dist(): return rank >= 0 and local_rank >= 0 +def is_ppu(): + import subprocess + result = subprocess.run(['nvidia-smi'], capture_output=True, text=True, timeout=10) + + if result.returncode == 0: + output = result.stdout + return 'PPU-' in output + else: + return False + + def is_mp() -> bool: from swift.utils import get_device_count n_gpu = get_device_count() local_world_size = get_dist_setting()[3] - assert n_gpu % local_world_size == 0, f'n_gpu: {n_gpu}, local_world_size: {local_world_size}' - if n_gpu // local_world_size >= 2: - return True - return False + if not is_ppu(): + assert n_gpu % local_world_size == 0, f'n_gpu: {n_gpu}, local_world_size: {local_world_size}' + if n_gpu // local_world_size >= 2: + return True + return False + else: + # We do not support mp for PPU + return False def is_mp_ddp() -> bool: From bd8384bcec38532a4a64013d889cefe0de8c6e80 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Fri, 7 Nov 2025 18:25:03 +0800 Subject: [PATCH 2/7] fix --- swift/cli/pt.py | 1 - swift/cli/rlhf.py | 1 - swift/cli/utils.py | 9 ++------- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/swift/cli/pt.py b/swift/cli/pt.py index 414b842c37..90409577c5 100644 --- a/swift/cli/pt.py +++ b/swift/cli/pt.py @@ -1,5 +1,4 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from swift.llm import pt_main if __name__ == '__main__': from swift.cli.utils import fix_ppu diff --git a/swift/cli/rlhf.py b/swift/cli/rlhf.py index c86e15f44b..e15d0a895a 100644 --- a/swift/cli/rlhf.py +++ b/swift/cli/rlhf.py @@ -1,5 +1,4 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-from swift.llm import rlhf_main if __name__ == '__main__': from swift.cli.utils import fix_ppu diff --git a/swift/cli/utils.py b/swift/cli/utils.py index 04ee25c11d..9854f3d08d 100644 --- a/swift/cli/utils.py +++ b/swift/cli/utils.py @@ -1,11 +1,6 @@ def is_ppu(): import subprocess - result = subprocess.run( - ['nvidia-smi'], - capture_output=True, - text=True, - timeout=10 - ) + result = subprocess.run(['nvidia-smi'], capture_output=True, text=True, timeout=10) if result.returncode == 0: output = result.stdout @@ -20,4 +15,4 @@ def fix_ppu(): visible_devices = os.environ['CUDA_VISIBLE_DEVICES'].split(',') visible_device = visible_devices[int(os.environ['LOCAL_RANK'])] os.environ['CUDA_VISIBLE_DEVICES'] = str(visible_device) - os.environ['LOCAL_RANK'] = '0' \ No newline at end of file + os.environ['LOCAL_RANK'] = '0' From add7fc9e5f5bf892e8884ab8bbf1897717c6b5a8 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Fri, 7 Nov 2025 18:26:04 +0800 Subject: [PATCH 3/7] fix --- swift/cli/pt.py | 2 -- swift/cli/rlhf.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/swift/cli/pt.py b/swift/cli/pt.py index 90409577c5..f8c37510ff 100644 --- a/swift/cli/pt.py +++ b/swift/cli/pt.py @@ -3,7 +3,5 @@ if __name__ == '__main__': from swift.cli.utils import fix_ppu fix_ppu() - from swift.ray import try_init_ray - try_init_ray() from swift.llm import pt_main pt_main() diff --git a/swift/cli/rlhf.py b/swift/cli/rlhf.py index e15d0a895a..a8154ee10c 100644 --- a/swift/cli/rlhf.py +++ b/swift/cli/rlhf.py @@ -3,7 +3,5 @@ if __name__ == '__main__': from swift.cli.utils import fix_ppu fix_ppu() - from swift.ray import try_init_ray - try_init_ray() from swift.llm import rlhf_main rlhf_main() From e6e7f77b3abef6dba77f356e18c1e7208c38ce6b Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Fri, 7 Nov 2025 18:38:04 +0800 Subject: [PATCH 4/7] fix --- swift/cli/utils.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/swift/cli/utils.py b/swift/cli/utils.py 
index 9854f3d08d..578c270669 100644 --- a/swift/cli/utils.py +++ b/swift/cli/utils.py @@ -1,11 +1,13 @@ def is_ppu(): import subprocess - result = subprocess.run(['nvidia-smi'], capture_output=True, text=True, timeout=10) - - if result.returncode == 0: - output = result.stdout - return 'PPU-' in output - else: + try: + result = subprocess.run(['nvidia-smi'], capture_output=True, text=True, timeout=10) + if result.returncode == 0: + output = result.stdout + return 'PPU-' in output + else: + return False + except: # noqa return False From 2ddbfc75a3f2db989274a29e669507e142ea22f5 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Fri, 7 Nov 2025 18:48:08 +0800 Subject: [PATCH 5/7] fix --- .../Instruction/Command-line-parameters.md | 1 + .../Instruction/Command-line-parameters.md | 1 + swift/cli/pt.py | 4 ++-- swift/cli/rlhf.py | 4 ++-- swift/cli/sft.py | 4 ++-- swift/cli/utils.py | 17 +++-------------- swift/utils/env.py | 14 +------------- 7 files changed, 12 insertions(+), 33 deletions(-) diff --git a/docs/source/Instruction/Command-line-parameters.md b/docs/source/Instruction/Command-line-parameters.md index a17ae3e1e9..18f35113e3 100644 --- a/docs/source/Instruction/Command-line-parameters.md +++ b/docs/source/Instruction/Command-line-parameters.md @@ -843,3 +843,4 @@ qwen2_5_omni除了包含qwen2_5_vl和qwen2_audio的模型特定参数外,还 - VLLM_USE_V1: 用于切换vLLM使用V0/V1版本。 - SWIFT_TIMEOUT: (ms-swift>=3.10) 若多模态数据集中存在图像URL,该参数用于控制获取图片的timeout,默认为20s。 - ROOT_IMAGE_DIR: (ms-swift>=3.8) 图像(多模态)资源的根目录。通过设置该参数,可以在数据集中使用相对于 `ROOT_IMAGE_DIR` 的相对路径。默认情况下,是相对于运行目录的相对路径。 +- SWIFT_SINGLE_DEVICE_MODE: (ms-swift>=3.10) 单设备模式,在此模式下,所有进程只能看到一个设备,目前用于兼容PPU设备 diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index d4e0b9319d..5baaaa5d78 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -868,3 +868,4 @@ The meanings of the following parameters can be found 
in the example code [here] - VLLM_USE_V1: Used to switch between V0 and V1 versions of vLLM. - SWIFT_TIMEOUT: (ms-swift >= 3.10) If the multimodal dataset contains image URLs, this parameter controls the timeout for fetching images, defaulting to 20 seconds. - ROOT_IMAGE_DIR: (ms-swift>=3.8) The root directory for image (multimodal) resources. By setting this parameter, relative paths in the dataset can be interpreted relative to `ROOT_IMAGE_DIR`. By default, paths are relative to the current working directory. +- SWIFT_SINGLE_DEVICE_MODE: (ms-swift>=3.10) Single device mode. Set to `1` to enable. In this mode, each process can see only one device: the entry of `CUDA_VISIBLE_DEVICES` selected by its `LOCAL_RANK`. Currently used for compatibility with PPU devices. diff --git a/swift/cli/pt.py b/swift/cli/pt.py index f8c37510ff..60477214b1 100644 --- a/swift/cli/pt.py +++ b/swift/cli/pt.py @@ -1,7 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
if __name__ == '__main__': - from swift.cli.utils import fix_ppu - fix_ppu() + from swift.cli.utils import try_use_single_device_mode + try_use_single_device_mode() from swift.llm import rlhf_main rlhf_main() diff --git a/swift/cli/sft.py b/swift/cli/sft.py index 7cfa0db777..27076381da 100644 --- a/swift/cli/sft.py +++ b/swift/cli/sft.py @@ -11,8 +11,8 @@ def try_init_unsloth(): if __name__ == '__main__': - from swift.cli.utils import fix_ppu - fix_ppu() + from swift.cli.utils import try_use_single_device_mode + try_use_single_device_mode() try_init_unsloth() from swift.ray import try_init_ray try_init_ray() diff --git a/swift/cli/utils.py b/swift/cli/utils.py index 578c270669..8656496375 100644 --- a/swift/cli/utils.py +++ b/swift/cli/utils.py @@ -1,19 +1,8 @@ -def is_ppu(): - import subprocess - try: - result = subprocess.run(['nvidia-smi'], capture_output=True, text=True, timeout=10) - if result.returncode == 0: - output = result.stdout - return 'PPU-' in output - else: - return False - except: # noqa - return False +import os -def fix_ppu(): - if is_ppu(): - import os +def try_use_single_device_mode(): + if os.environ.get('SWIFT_SINGLE_DEVICE_MODE', '0') == '1': visible_devices = os.environ['CUDA_VISIBLE_DEVICES'].split(',') visible_device = visible_devices[int(os.environ['LOCAL_RANK'])] os.environ['CUDA_VISIBLE_DEVICES'] = str(visible_device) diff --git a/swift/utils/env.py b/swift/utils/env.py index 03b789b167..bfc5c1d07f 100644 --- a/swift/utils/env.py +++ b/swift/utils/env.py @@ -61,29 +61,17 @@ def is_dist(): return rank >= 0 and local_rank >= 0 -def is_ppu(): - import subprocess - result = subprocess.run(['nvidia-smi'], capture_output=True, text=True, timeout=10) - - if result.returncode == 0: - output = result.stdout - return 'PPU-' in output - else: - return False - - def is_mp() -> bool: from swift.utils import get_device_count n_gpu = get_device_count() local_world_size = get_dist_setting()[3] - if not is_ppu(): + if 
os.environ.get('SWIFT_SINGLE_DEVICE_MODE', '0') != '1': assert n_gpu % local_world_size == 0, f'n_gpu: {n_gpu}, local_world_size: {local_world_size}' if n_gpu // local_world_size >= 2: return True return False else: - # We do not support mp for PPU return False From ad4721954088d8a86bb87039d3420910e5b4173e Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Fri, 7 Nov 2025 18:57:18 +0800 Subject: [PATCH 6/7] fix --- swift/cli/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/swift/cli/utils.py b/swift/cli/utils.py index 8656496375..43fd600e64 100644 --- a/swift/cli/utils.py +++ b/swift/cli/utils.py @@ -4,6 +4,9 @@ def try_use_single_device_mode(): if os.environ.get('SWIFT_SINGLE_DEVICE_MODE', '0') == '1': visible_devices = os.environ['CUDA_VISIBLE_DEVICES'].split(',') - visible_device = visible_devices[int(os.environ['LOCAL_RANK'])] + local_rank = os.environ.get('LOCAL_RANK') + if local_rank is None or not visible_devices: + return + visible_device = visible_devices[int(local_rank)] os.environ['CUDA_VISIBLE_DEVICES'] = str(visible_device) os.environ['LOCAL_RANK'] = '0' From f98b3f5a0f71c9121360b59e530d20d745774933 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Fri, 7 Nov 2025 18:59:01 +0800 Subject: [PATCH 7/7] fix --- swift/cli/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/swift/cli/utils.py b/swift/cli/utils.py index 43fd600e64..f0c8002ff2 100644 --- a/swift/cli/utils.py +++ b/swift/cli/utils.py @@ -3,10 +3,11 @@ def try_use_single_device_mode(): if os.environ.get('SWIFT_SINGLE_DEVICE_MODE', '0') == '1': - visible_devices = os.environ['CUDA_VISIBLE_DEVICES'].split(',') + visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES') local_rank = os.environ.get('LOCAL_RANK') if local_rank is None or not visible_devices: return + visible_devices = visible_devices.split(',') visible_device = visible_devices[int(local_rank)] os.environ['CUDA_VISIBLE_DEVICES'] = str(visible_device) os.environ['LOCAL_RANK'] = '0'