From a138498e350dc7cd1d03e013b64131601f2529b4 Mon Sep 17 00:00:00 2001 From: Kusaanko <39370373+kusaanko@users.noreply.github.com> Date: Tue, 11 Nov 2025 17:21:49 +0900 Subject: [PATCH 01/12] Add ability to select backend devices --- examples/simple/src/main.rs | 56 ++++++++++++++- llama-cpp-2/src/lib.rs | 74 +++++++++++++++++++ llama-cpp-2/src/model/params.rs | 123 ++++++++++++++++++++++++++++++++ 3 files changed, 251 insertions(+), 2 deletions(-) diff --git a/examples/simple/src/main.rs b/examples/simple/src/main.rs index e64ece79..c773a841 100644 --- a/examples/simple/src/main.rs +++ b/examples/simple/src/main.rs @@ -13,7 +13,7 @@ use llama_cpp_2::context::params::LlamaContextParams; use llama_cpp_2::llama_backend::LlamaBackend; use llama_cpp_2::llama_batch::LlamaBatch; use llama_cpp_2::model::params::kv_overrides::ParamOverrideValue; -use llama_cpp_2::model::params::LlamaModelParams; +use llama_cpp_2::model::params::{LlamaModelParams, LlamaSplitMode}; use llama_cpp_2::model::LlamaModel; use llama_cpp_2::model::{AddBos, Special}; use llama_cpp_2::sampling::LlamaSampler; @@ -48,6 +48,23 @@ struct Args { #[cfg(any(feature = "cuda", feature = "vulkan"))] #[clap(long)] disable_gpu: bool, + /// Set main GPU device index (default: 0) + /// + /// By setting this option, multiple GPU is disabled. + #[arg( + long, + help = "Set main GPU device id (default: 0). Disables multi-GPU." + )] + main_gpu: Option, + /// Set devices to use by index + /// + /// This option overrides `main-gpu` and enables multi-GPU. + #[arg( + long, + value_delimiter = ',', + help = "Set devices to use by index, separated by commas (e.g. --devices 0,1,2). Overrides main-gpu and enables multi-GPU." + )] + devices: Option>, #[cfg(any(feature = "cuda", feature = "vulkan"))] #[arg(long, help = "Keep MoE layers on CPU")] cmoe: bool, @@ -72,6 +89,8 @@ struct Args { ctx_size: Option, #[arg(short = 'v', long, help = "enable verbose llama.cpp logs")] verbose: bool, + #[arg(long, help = "list backend devices")] + list_devices: bool, } /// Parse a single key-value pair @@ -132,6 +151,8 @@ fn main() -> Result<()> { file, #[cfg(any(feature = "cuda", feature = "vulkan"))] disable_gpu, + main_gpu, + devices, #[cfg(any(feature = "cuda", feature = "vulkan"))] cmoe, key_value_overrides, @@ -140,6 +161,7 @@ fn main() -> Result<()> { threads_batch, ctx_size, verbose, + list_devices, } = Args::parse(); if verbose { @@ -151,8 +173,25 @@ fn main() -> Result<()> { // init LLM let backend = LlamaBackend::init()?; + if list_devices { + let devices = llama_cpp_2::list_llama_ggml_backend_devices(); + for (i, dev) in devices.iter().enumerate() { + println!("Device {i:>2}: {}", dev.name); + println!(" Description: {}", dev.description); + println!(" Backend: {}", dev.backend); + println!( + " Memory total: {:?} MiB", + dev.memory_total / 1024 / 1024 + ); + println!( + " Memory free: {:?} MiB", + dev.memory_free / 1024 / 1024 + ); + } + } + // offload all layers to the gpu - let model_params = { + let mut model_params = { #[cfg(any(feature = "cuda", feature = "vulkan"))] if !disable_gpu { LlamaModelParams::default().with_n_gpu_layers(1000) @@ -163,6 +202,19 @@ fn main() -> Result<()> { LlamaModelParams::default() }; + if let Some(devices) = devices { + model_params = model_params + .with_devices(&devices) + .with_context(|| "invalid device index in --devices")?; + if main_gpu.is_some() { + eprintln!("warning: --devices overrides --main-gpu"); + } + } else if let Some(main_gpu) = main_gpu { + model_params = model_params.with_main_gpu(main_gpu); + // 
Enable single GPU mode + model_params = model_params.with_split_mode(LlamaSplitMode::None); + } + let prompt = if let Some(str) = prompt { if file.is_some() { bail!("either prompt or file must be specified, but not both") diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs index 2038b5e0..403b8a2c 100644 --- a/llama-cpp-2/src/lib.rs +++ b/llama-cpp-2/src/lib.rs @@ -66,6 +66,12 @@ pub enum LLamaCppError { #[error(transparent)] EmbeddingError(#[from] EmbeddingsError), // See [`LlamaSamplerError`] + /// Backend device not found + #[error("Backend device {0} not found")] + BackendDeviceNotFound(usize), + /// Max devices exceeded + #[error("Max devices exceeded. Max devices is {0}")] + MaxDevicesExceeded(usize), } /// There was an error while getting the chat template from a model. @@ -349,6 +355,74 @@ pub fn llama_supports_mlock() -> bool { unsafe { llama_cpp_sys_2::llama_supports_mlock() } } +/// A ggml backend device +/// +/// The index is can be used from `LlamaModelParams::with_devices` to select specific devices. +#[derive(Debug, Clone)] +pub struct LlamaBackendDevice { + /// The index of the device + /// + /// The index is can be used from `LlamaModelParams::with_devices` to select specific devices. + pub index: usize, + /// The name of the device (e.g. "Vulkan0") + pub name: String, + /// A description of the device (e.g. "NVIDIA GeForce RTX 3080") + pub description: String, + /// The backend of the device (e.g. "Vulkan", "CUDA", "CPU") + pub backend: String, + /// Total memory of the device in bytes + pub memory_total: usize, + /// Free memory of the device in bytes + pub memory_free: usize, +} + +/// List ggml backend devices +#[must_use] +pub fn list_llama_ggml_backend_devices() -> Vec { + let mut devices = Vec::new(); + for i in 0..unsafe { llama_cpp_sys_2::ggml_backend_dev_count() } { + unsafe { + let dev = llama_cpp_sys_2::ggml_backend_dev_get(i); + let mut props = std::mem::zeroed(); + llama_cpp_sys_2::ggml_backend_dev_get_props(dev, &raw mut props); + let name = props.name; + let name = if name.is_null() { + String::new() + } else { + std::ffi::CStr::from_ptr(name).to_string_lossy().to_string() + }; + let description = props.description; + let description = if description.is_null() { + String::new() + } else { + std::ffi::CStr::from_ptr(description) + .to_string_lossy() + .to_string() + }; + let backend = llama_cpp_sys_2::ggml_backend_dev_backend_reg(dev); + let backend_name = llama_cpp_sys_2::ggml_backend_reg_name(backend); + let backend = if backend_name.is_null() { + String::new() + } else { + std::ffi::CStr::from_ptr(backend_name) + .to_string_lossy() + .to_string() + }; + let memory_total = props.memory_total; + let memory_free = props.memory_free; + devices.push(LlamaBackendDevice { + index: i, + name, + description, + backend, + memory_total, + memory_free, + }); + } + } + devices +} + /// Options to configure how llama.cpp logs are intercepted. #[derive(Default, Debug, Clone)] pub struct LogOptions { diff --git a/llama-cpp-2/src/model/params.rs b/llama-cpp-2/src/model/params.rs index 822bc69f..a8f92887 100644 --- a/llama-cpp-2/src/model/params.rs +++ b/llama-cpp-2/src/model/params.rs @@ -1,6 +1,7 @@ //! A safe wrapper around `llama_model_params`. use crate::model::params::kv_overrides::KvOverrides; +use crate::LLamaCppError; use std::ffi::{c_char, CStr}; use std::fmt::{Debug, Formatter}; use std::pin::Pin; @@ -8,12 +9,56 @@ use std::ptr::null; pub mod kv_overrides; +/// A rusty wrapper around `llama_split_mode`. 
+#[repr(i8)] +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[allow(clippy::cast_possible_truncation)] +pub enum LlamaSplitMode { + /// Single GPU + None = llama_cpp_sys_2::LLAMA_SPLIT_MODE_NONE as i8, + /// Split layers and KV across GPUs + Layer = llama_cpp_sys_2::LLAMA_SPLIT_MODE_LAYER as i8, + /// Split layers and KV across GPUs, use tensor parallelism if supported + Row = llama_cpp_sys_2::LLAMA_SPLIT_MODE_ROW as i8, +} + +/// Create a `LlamaSplitMode` from a `c_int` - returns `LlamaSplitMode::LAYER` if +/// the value is not recognized. +impl From for LlamaSplitMode { + fn from(value: i32) -> Self { + match value { + x if x == llama_cpp_sys_2::LLAMA_SPLIT_MODE_NONE => Self::None, + x if x == llama_cpp_sys_2::LLAMA_SPLIT_MODE_LAYER => Self::Layer, + x if x == llama_cpp_sys_2::LLAMA_SPLIT_MODE_ROW => Self::Row, + _ => Self::Layer, + } + } +} + +/// Create a `c_int` from a `LlamaSplitMode`. +impl From for i32 { + fn from(value: LlamaSplitMode) -> Self { + match value { + LlamaSplitMode::None => 0, + LlamaSplitMode::Layer => 1, + LlamaSplitMode::Row => 2, + } + } +} + +/// The maximum number of devices supported. +/// +/// The real maximum number of devices is the lesser one of this value and the value returned by +/// `llama_cpp_2::max_devices()`. +pub const LLAMA_CPP_MAX_DEVICES: usize = 16; + /// A safe wrapper around `llama_model_params`. #[allow(clippy::module_name_repetitions)] pub struct LlamaModelParams { pub(crate) params: llama_cpp_sys_2::llama_model_params, kv_overrides: Vec, buft_overrides: Vec, + devices: Pin>, } impl Debug for LlamaModelParams { @@ -24,6 +69,8 @@ impl Debug for LlamaModelParams { .field("vocab_only", &self.params.vocab_only) .field("use_mmap", &self.params.use_mmap) .field("use_mlock", &self.params.use_mlock) + .field("split_mode", &self.split_mode()) + .field("devices", &self.devices) .field("kv_overrides", &"vec of kv_overrides") .finish() } @@ -181,6 +228,36 @@ impl LlamaModelParams { self.params.use_mlock } + /// get the split mode + #[must_use] + pub fn split_mode(&self) -> LlamaSplitMode { + LlamaSplitMode::from(self.params.split_mode) + } + + /// get the devices + #[must_use] + pub fn devices(&self) -> Vec { + let mut backend_devices = Vec::new(); + for i in 0..unsafe { llama_cpp_sys_2::ggml_backend_dev_count() } { + let dev = unsafe { llama_cpp_sys_2::ggml_backend_dev_get(i) }; + backend_devices.push(dev); + } + let mut devices = Vec::new(); + for &dev in self.devices.iter() { + if dev.is_null() { + break; + } + if let Some((index, _)) = backend_devices + .iter() + .enumerate() + .find(|&(_i, &d)| d == dev) + { + devices.push(index); + } + } + devices + } + /// sets the number of gpu layers to offload to the GPU. /// ``` /// # use llama_cpp_2::model::params::LlamaModelParams; @@ -198,6 +275,8 @@ impl LlamaModelParams { } /// sets the main GPU + /// + /// To enable this option, you must set `split_mode` to `LlamaSplitMode::None` to enable single GPU mode. #[must_use] pub fn with_main_gpu(mut self, main_gpu: i32) -> Self { self.params.main_gpu = main_gpu; @@ -217,6 +296,47 @@ impl LlamaModelParams { self.params.use_mlock = use_mlock; self } + + /// sets `split_mode` + #[must_use] + pub fn with_split_mode(mut self, split_mode: LlamaSplitMode) -> Self { + self.params.split_mode = split_mode.into(); + self + } + + /// sets `devices` + /// + /// The devices are specified as indices that correspond to the ggml backend device indices. + /// + /// The maximum number of devices is 16. + /// + /// You don't need to specify CPU or ACCEL devices. 
+ /// + /// # Errors + /// Returns `LLamaCppError::BackendDeviceNotFound` if any device index is invalid. + pub fn with_devices(mut self, devices: &[usize]) -> Result { + for dev in self.devices.iter_mut() { + *dev = std::ptr::null_mut(); + } + // Check device count + let max_devices = crate::max_devices().min(LLAMA_CPP_MAX_DEVICES); + if devices.len() > max_devices { + return Err(LLamaCppError::MaxDevicesExceeded(max_devices)); + } + for (i, &dev) in devices.iter().enumerate() { + if dev >= unsafe { llama_cpp_sys_2::ggml_backend_dev_count() } { + return Err(LLamaCppError::BackendDeviceNotFound(dev)); + } + let backend_dev = unsafe { llama_cpp_sys_2::ggml_backend_dev_get(dev) }; + self.devices[i] = backend_dev; + } + if self.devices.is_empty() { + self.params.devices = std::ptr::null_mut(); + } else { + self.params.devices = self.devices.as_mut_ptr(); + } + Ok(self) + } } /// Default parameters for `LlamaModel`. (as defined in llama.cpp by `llama_model_default_params`) @@ -228,6 +348,8 @@ impl LlamaModelParams { /// assert_eq!(params.vocab_only(), false, "vocab_only should be false"); /// assert_eq!(params.use_mmap(), true, "use_mmap should be true"); /// assert_eq!(params.use_mlock(), false, "use_mlock should be false"); +/// assert_eq!(params.split_mode(), LlamaSplitMode::Layer, "split_mode should be LAYER"); +/// assert_eq!(params.devices().len(), 0, "devices should be empty"); /// ``` impl Default for LlamaModelParams { fn default() -> Self { @@ -246,6 +368,7 @@ impl Default for LlamaModelParams { pattern: std::ptr::null(), buft: std::ptr::null_mut(), }], + devices: Box::pin([std::ptr::null_mut(); 16]), } } } From f93eb1b1bf5d2f6ec67196c8b183911e0633acb7 Mon Sep 17 00:00:00 2001 From: Kusaanko <39370373+kusaanko@users.noreply.github.com> Date: Tue, 11 Nov 2025 17:33:25 +0900 Subject: [PATCH 02/12] Add device type to LlamaBackendDevice --- examples/simple/src/main.rs | 1 + llama-cpp-2/src/lib.rs | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/examples/simple/src/main.rs b/examples/simple/src/main.rs index c773a841..e7299938 100644 --- a/examples/simple/src/main.rs +++ b/examples/simple/src/main.rs @@ -178,6 +178,7 @@ fn main() -> Result<()> { for (i, dev) in devices.iter().enumerate() { println!("Device {i:>2}: {}", dev.name); println!(" Description: {}", dev.description); + println!(" Device Type: {:?}", dev.device_type); println!(" Backend: {}", dev.backend); println!( " Memory total: {:?} MiB", diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs index 403b8a2c..8da69872 100644 --- a/llama-cpp-2/src/lib.rs +++ b/llama-cpp-2/src/lib.rs @@ -355,6 +355,21 @@ pub fn llama_supports_mlock() -> bool { unsafe { llama_cpp_sys_2::llama_supports_mlock() } } +/// Backend device type +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum LlamaBackendDeviceType { + /// CPU device + CPU, + /// ACCEL device + ACCEL, + /// GPU device + GPU, + /// iGPU device + IGPU, + /// Unknown device type + UNKNOWN, +} + /// A ggml backend device /// /// The index is can be used from `LlamaModelParams::with_devices` to select specific devices. 
@@ -374,6 +389,8 @@ pub struct LlamaBackendDevice { pub memory_total: usize, /// Free memory of the device in bytes pub memory_free: usize, + /// Device type + pub device_type: LlamaBackendDeviceType, } /// List ggml backend devices @@ -410,6 +427,13 @@ pub fn list_llama_ggml_backend_devices() -> Vec { }; let memory_total = props.memory_total; let memory_free = props.memory_free; + let device_type = match props.type_ { + llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_CPU => LlamaBackendDeviceType::CPU, + llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_ACCEL => LlamaBackendDeviceType::ACCEL, + llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_GPU => LlamaBackendDeviceType::GPU, + llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_IGPU => LlamaBackendDeviceType::IGPU, + _ => LlamaBackendDeviceType::UNKNOWN, + }; devices.push(LlamaBackendDevice { index: i, name, @@ -417,6 +441,7 @@ pub fn list_llama_ggml_backend_devices() -> Vec { backend, memory_total, memory_free, + device_type, }); } } From 750ed12824edd6782aac3ec1f221b74f2cbf85d4 Mon Sep 17 00:00:00 2001 From: Kusaanko <39370373+kusaanko@users.noreply.github.com> Date: Tue, 11 Nov 2025 17:36:02 +0900 Subject: [PATCH 03/12] Rename LlamaBackendDeviceType enums to PascalCase --- llama-cpp-2/src/lib.rs | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs index 8da69872..50d21552 100644 --- a/llama-cpp-2/src/lib.rs +++ b/llama-cpp-2/src/lib.rs @@ -359,15 +359,15 @@ pub fn llama_supports_mlock() -> bool { #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum LlamaBackendDeviceType { /// CPU device - CPU, + Cpu, /// ACCEL device - ACCEL, + Accelerator, /// GPU device - GPU, + Gpu, /// iGPU device - IGPU, + IntegratedGpu, /// Unknown device type - UNKNOWN, + Unknown, } /// A ggml backend device @@ -428,11 +428,15 @@ pub fn list_llama_ggml_backend_devices() -> Vec { let memory_total = props.memory_total; let memory_free = props.memory_free; let device_type = match props.type_ { - llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_CPU => LlamaBackendDeviceType::CPU, - llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_ACCEL => LlamaBackendDeviceType::ACCEL, - llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_GPU => LlamaBackendDeviceType::GPU, - llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_IGPU => LlamaBackendDeviceType::IGPU, - _ => LlamaBackendDeviceType::UNKNOWN, + llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_CPU => LlamaBackendDeviceType::Gpu, + llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_ACCEL => { + LlamaBackendDeviceType::Accelerator + } + llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_GPU => LlamaBackendDeviceType::Gpu, + llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_IGPU => { + LlamaBackendDeviceType::IntegratedGpu + } + _ => LlamaBackendDeviceType::Unknown, }; devices.push(LlamaBackendDevice { index: i, From 7d12f3063682ca583f662135d68e1cbccfb1224d Mon Sep 17 00:00:00 2001 From: Kusaanko <39370373+kusaanko@users.noreply.github.com> Date: Wed, 12 Nov 2025 14:29:54 +0900 Subject: [PATCH 04/12] Fix invalid device type convertion to Gpu --- llama-cpp-2/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs index 50d21552..4f7ecd48 100644 --- a/llama-cpp-2/src/lib.rs +++ b/llama-cpp-2/src/lib.rs @@ -428,7 +428,7 @@ pub fn list_llama_ggml_backend_devices() -> Vec { let memory_total = props.memory_total; let memory_free = props.memory_free; let device_type = match props.type_ { - llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_CPU => LlamaBackendDeviceType::Gpu, + 
llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_CPU => LlamaBackendDeviceType::Cpu, llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_ACCEL => { LlamaBackendDeviceType::Accelerator } From 99ad8083fdebe81a2bedcdd0e9ce47257082cd32 Mon Sep 17 00:00:00 2001 From: Kusaanko <39370373+kusaanko@users.noreply.github.com> Date: Wed, 12 Nov 2025 14:34:02 +0900 Subject: [PATCH 05/12] Split unsafe block into small parts in list_llama_ggml_backend_devices --- llama-cpp-2/src/lib.rs | 78 ++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 45 deletions(-) diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs index 4f7ecd48..16f05e31 100644 --- a/llama-cpp-2/src/lib.rs +++ b/llama-cpp-2/src/lib.rs @@ -398,56 +398,44 @@ pub struct LlamaBackendDevice { pub fn list_llama_ggml_backend_devices() -> Vec { let mut devices = Vec::new(); for i in 0..unsafe { llama_cpp_sys_2::ggml_backend_dev_count() } { - unsafe { - let dev = llama_cpp_sys_2::ggml_backend_dev_get(i); - let mut props = std::mem::zeroed(); - llama_cpp_sys_2::ggml_backend_dev_get_props(dev, &raw mut props); - let name = props.name; - let name = if name.is_null() { - String::new() - } else { - std::ffi::CStr::from_ptr(name).to_string_lossy().to_string() - }; - let description = props.description; - let description = if description.is_null() { - String::new() - } else { - std::ffi::CStr::from_ptr(description) - .to_string_lossy() - .to_string() - }; - let backend = llama_cpp_sys_2::ggml_backend_dev_backend_reg(dev); - let backend_name = llama_cpp_sys_2::ggml_backend_reg_name(backend); - let backend = if backend_name.is_null() { + fn cstr_to_string(ptr: *const i8) -> String { + if ptr.is_null() { String::new() } else { - std::ffi::CStr::from_ptr(backend_name) + unsafe { std::ffi::CStr::from_ptr(ptr) } .to_string_lossy() .to_string() - }; - let memory_total = props.memory_total; - let memory_free = props.memory_free; - let device_type = match props.type_ { - llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_CPU => LlamaBackendDeviceType::Cpu, - llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_ACCEL => { - LlamaBackendDeviceType::Accelerator - } - llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_GPU => LlamaBackendDeviceType::Gpu, - llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_IGPU => { - LlamaBackendDeviceType::IntegratedGpu - } - _ => LlamaBackendDeviceType::Unknown, - }; - devices.push(LlamaBackendDevice { - index: i, - name, - description, - backend, - memory_total, - memory_free, - device_type, - }); + } } + let dev = unsafe { llama_cpp_sys_2::ggml_backend_dev_get(i) }; + let props = unsafe { + let mut props = std::mem::zeroed(); + llama_cpp_sys_2::ggml_backend_dev_get_props(dev, &raw mut props); + props + }; + let name = cstr_to_string(props.name); + let description = cstr_to_string(props.description); + let backend = unsafe { llama_cpp_sys_2::ggml_backend_dev_backend_reg(dev) }; + let backend_name = unsafe { llama_cpp_sys_2::ggml_backend_reg_name(backend) }; + let backend = cstr_to_string(backend_name); + let memory_total = props.memory_total; + let memory_free = props.memory_free; + let device_type = match props.type_ { + llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_CPU => LlamaBackendDeviceType::Cpu, + llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_ACCEL => LlamaBackendDeviceType::Accelerator, + llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_GPU => LlamaBackendDeviceType::Gpu, + llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_IGPU => LlamaBackendDeviceType::IntegratedGpu, + _ => LlamaBackendDeviceType::Unknown, + }; + devices.push(LlamaBackendDevice { + index: i, + name, + 
description, + backend, + memory_total, + memory_free, + device_type, + }); } devices } From d364092ece0117321ba02ef3b72cd5aae55ece6e Mon Sep 17 00:00:00 2001 From: Kusaanko <39370373+kusaanko@users.noreply.github.com> Date: Wed, 12 Nov 2025 14:39:58 +0900 Subject: [PATCH 06/12] Refactor LlamaSplitMode parsing to use TryFrom with error Replaces the From implementation for LlamaSplitMode with TryFrom, returning a custom LlamaSplitModeParseError on invalid values. Updates LlamaModelParams::split_mode to return a Result, improving error handling for unknown split modes. --- llama-cpp-2/src/model/params.rs | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/llama-cpp-2/src/model/params.rs b/llama-cpp-2/src/model/params.rs index a8f92887..635f439f 100644 --- a/llama-cpp-2/src/model/params.rs +++ b/llama-cpp-2/src/model/params.rs @@ -22,15 +22,23 @@ pub enum LlamaSplitMode { Row = llama_cpp_sys_2::LLAMA_SPLIT_MODE_ROW as i8, } -/// Create a `LlamaSplitMode` from a `c_int` - returns `LlamaSplitMode::LAYER` if -/// the value is not recognized. -impl From for LlamaSplitMode { - fn from(value: i32) -> Self { +/// An error that occurs when unknown split mode is encountered. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct LlamaSplitModeParseError(i32); + +/// Create a `LlamaSplitMode` from a `c_int`. +/// +/// # Errors +/// Returns `()` if the value does not correspond to a valid `LlamaSplitMode`. +impl TryFrom for LlamaSplitMode { + type Error = LlamaSplitModeParseError; + + fn try_from(value: i32) -> Result { match value { - x if x == llama_cpp_sys_2::LLAMA_SPLIT_MODE_NONE => Self::None, - x if x == llama_cpp_sys_2::LLAMA_SPLIT_MODE_LAYER => Self::Layer, - x if x == llama_cpp_sys_2::LLAMA_SPLIT_MODE_ROW => Self::Row, - _ => Self::Layer, + llama_cpp_sys_2::LLAMA_SPLIT_MODE_NONE => Ok(Self::None), + llama_cpp_sys_2::LLAMA_SPLIT_MODE_LAYER => Ok(Self::Layer), + llama_cpp_sys_2::LLAMA_SPLIT_MODE_ROW => Ok(Self::Row), + _ => Err(LlamaSplitModeParseError(value)), } } } @@ -229,9 +237,11 @@ impl LlamaModelParams { } /// get the split mode - #[must_use] - pub fn split_mode(&self) -> LlamaSplitMode { - LlamaSplitMode::from(self.params.split_mode) + /// + /// # Errors + /// Returns `LlamaSplitModeParseError` if the unknown split mode is encountered. + pub fn split_mode(&self) -> Result { + LlamaSplitMode::try_from(self.params.split_mode) } /// get the devices From 3468ae7df9232567eaf398385baab4c48e8d124f Mon Sep 17 00:00:00 2001 From: Kusaanko <39370373+kusaanko@users.noreply.github.com> Date: Wed, 12 Nov 2025 14:43:59 +0900 Subject: [PATCH 07/12] Implement Default for LlamaSplitMode --- llama-cpp-2/src/model/params.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/llama-cpp-2/src/model/params.rs b/llama-cpp-2/src/model/params.rs index 635f439f..0f6dc272 100644 --- a/llama-cpp-2/src/model/params.rs +++ b/llama-cpp-2/src/model/params.rs @@ -54,6 +54,13 @@ impl From for i32 { } } +/// The default split mode is `Layer` in llama.cpp. +impl Default for LlamaSplitMode { + fn default() -> Self { + LlamaSplitMode::Layer + } +} + /// The maximum number of devices supported. 
/// /// The real maximum number of devices is the lesser one of this value and the value returned by From e2a3b5738a34f13aafd4272053a4da1104980b7b Mon Sep 17 00:00:00 2001 From: Kusaanko <39370373+kusaanko@users.noreply.github.com> Date: Thu, 13 Nov 2025 17:09:15 +0900 Subject: [PATCH 08/12] Refactor LlamaSplitMode to use explicit integer values Replaces enum variant values for LlamaSplitMode with explicit integers (0, 1, 2) instead of referencing llama_cpp_sys_2 constants. Adds TryFrom implementation for LlamaSplitMode to improve type conversion and error handling. Updates documentation and error types for clarity. --- llama-cpp-2/src/model/params.rs | 38 ++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/llama-cpp-2/src/model/params.rs b/llama-cpp-2/src/model/params.rs index 0f6dc272..82cd51e0 100644 --- a/llama-cpp-2/src/model/params.rs +++ b/llama-cpp-2/src/model/params.rs @@ -15,34 +15,55 @@ pub mod kv_overrides; #[allow(clippy::cast_possible_truncation)] pub enum LlamaSplitMode { /// Single GPU - None = llama_cpp_sys_2::LLAMA_SPLIT_MODE_NONE as i8, + None = 0, /// Split layers and KV across GPUs - Layer = llama_cpp_sys_2::LLAMA_SPLIT_MODE_LAYER as i8, + Layer = 1, /// Split layers and KV across GPUs, use tensor parallelism if supported - Row = llama_cpp_sys_2::LLAMA_SPLIT_MODE_ROW as i8, + Row = 2, } /// An error that occurs when unknown split mode is encountered. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct LlamaSplitModeParseError(i32); -/// Create a `LlamaSplitMode` from a `c_int`. +/// Create a `LlamaSplitMode` from a `i32`. /// /// # Errors -/// Returns `()` if the value does not correspond to a valid `LlamaSplitMode`. +/// Returns `LlamaSplitModeParseError` if the value does not correspond to a valid `LlamaSplitMode`. impl TryFrom for LlamaSplitMode { type Error = LlamaSplitModeParseError; fn try_from(value: i32) -> Result { match value { - llama_cpp_sys_2::LLAMA_SPLIT_MODE_NONE => Ok(Self::None), - llama_cpp_sys_2::LLAMA_SPLIT_MODE_LAYER => Ok(Self::Layer), - llama_cpp_sys_2::LLAMA_SPLIT_MODE_ROW => Ok(Self::Row), + 0 => Ok(Self::None), + 1 => Ok(Self::Layer), + 2 => Ok(Self::Row), _ => Err(LlamaSplitModeParseError(value)), } } } +/// Create a `LlamaSplitMode` from a `u32`. +/// +/// # Errors +/// Returns `LlamaSplitModeParseError` if the value does not correspond to a valid `LlamaSplitMode`. +impl TryFrom for LlamaSplitMode { + type Error = LlamaSplitModeParseError; + + fn try_from(value: u32) -> Result { + match value { + 0 => Ok(Self::None), + 1 => Ok(Self::Layer), + 2 => Ok(Self::Row), + _ => { + // Convert u32 to i32 without allowing a wrap; if it overflows, use i32::MAX. + let v = i32::try_from(value).unwrap_or(i32::MAX); + Err(LlamaSplitModeParseError(v)) + } + } + } +} + /// Create a `c_int` from a `LlamaSplitMode`. impl From for i32 { fn from(value: LlamaSplitMode) -> Self { @@ -359,6 +380,7 @@ impl LlamaModelParams { /// Default parameters for `LlamaModel`. 
(as defined in llama.cpp by `llama_model_default_params`) /// ``` /// # use llama_cpp_2::model::params::LlamaModelParams; +/// use llama_cpp_2::model::params::LlamaSplitMode; /// let params = LlamaModelParams::default(); /// assert_eq!(params.n_gpu_layers(), 999, "n_gpu_layers should be 999"); /// assert_eq!(params.main_gpu(), 0, "main_gpu should be 0"); From b61e44896478ae7d3459e236bfed9c8a43888ce1 Mon Sep 17 00:00:00 2001 From: Kusaanko <39370373+kusaanko@users.noreply.github.com> Date: Fri, 14 Nov 2025 02:50:47 +0900 Subject: [PATCH 09/12] Use llama_cpp_sys_2 constants for LlamaSplitMode values Replaces hardcoded integer values in the LlamaSplitMode enum and its conversions with corresponding constants from llama_cpp_sys_2. Adds From for u32 and updates TryFrom implementations for better consistency with external definitions. --- llama-cpp-2/src/model/params.rs | 42 +++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/llama-cpp-2/src/model/params.rs b/llama-cpp-2/src/model/params.rs index 82cd51e0..40c08578 100644 --- a/llama-cpp-2/src/model/params.rs +++ b/llama-cpp-2/src/model/params.rs @@ -15,11 +15,11 @@ pub mod kv_overrides; #[allow(clippy::cast_possible_truncation)] pub enum LlamaSplitMode { /// Single GPU - None = 0, + None = llama_cpp_sys_2::LLAMA_SPLIT_MODE_NONE as i8, /// Split layers and KV across GPUs - Layer = 1, + Layer = llama_cpp_sys_2::LLAMA_SPLIT_MODE_LAYER as i8, /// Split layers and KV across GPUs, use tensor parallelism if supported - Row = 2, + Row = llama_cpp_sys_2::LLAMA_SPLIT_MODE_ROW as i8, } /// An error that occurs when unknown split mode is encountered. @@ -30,14 +30,15 @@ pub struct LlamaSplitModeParseError(i32); /// /// # Errors /// Returns `LlamaSplitModeParseError` if the value does not correspond to a valid `LlamaSplitMode`. +#[allow(clippy::unnecessary_cast)] impl TryFrom for LlamaSplitMode { type Error = LlamaSplitModeParseError; fn try_from(value: i32) -> Result { match value { - 0 => Ok(Self::None), - 1 => Ok(Self::Layer), - 2 => Ok(Self::Row), + x if x == llama_cpp_sys_2::LLAMA_SPLIT_MODE_NONE as i32 => Ok(Self::None), + x if x == llama_cpp_sys_2::LLAMA_SPLIT_MODE_LAYER as i32 => Ok(Self::Layer), + x if x == llama_cpp_sys_2::LLAMA_SPLIT_MODE_ROW as i32 => Ok(Self::Row), _ => Err(LlamaSplitModeParseError(value)), } } @@ -47,14 +48,16 @@ impl TryFrom for LlamaSplitMode { /// /// # Errors /// Returns `LlamaSplitModeParseError` if the value does not correspond to a valid `LlamaSplitMode`. +#[allow(clippy::cast_possible_wrap)] +#[allow(clippy::unnecessary_cast)] impl TryFrom for LlamaSplitMode { type Error = LlamaSplitModeParseError; fn try_from(value: u32) -> Result { match value { - 0 => Ok(Self::None), - 1 => Ok(Self::Layer), - 2 => Ok(Self::Row), + x if x == llama_cpp_sys_2::LLAMA_SPLIT_MODE_NONE as u32 => Ok(Self::None), + x if x == llama_cpp_sys_2::LLAMA_SPLIT_MODE_LAYER as u32 => Ok(Self::Layer), + x if x == llama_cpp_sys_2::LLAMA_SPLIT_MODE_ROW as u32 => Ok(Self::Row), _ => { // Convert u32 to i32 without allowing a wrap; if it overflows, use i32::MAX. let v = i32::try_from(value).unwrap_or(i32::MAX); @@ -64,13 +67,26 @@ impl TryFrom for LlamaSplitMode { } } -/// Create a `c_int` from a `LlamaSplitMode`. +/// Create a `i32` from a `LlamaSplitMode`. 
+#[allow(clippy::cast_possible_wrap)] impl From for i32 { fn from(value: LlamaSplitMode) -> Self { match value { - LlamaSplitMode::None => 0, - LlamaSplitMode::Layer => 1, - LlamaSplitMode::Row => 2, + LlamaSplitMode::None => llama_cpp_sys_2::LLAMA_SPLIT_MODE_NONE as _, + LlamaSplitMode::Layer => llama_cpp_sys_2::LLAMA_SPLIT_MODE_LAYER as _, + LlamaSplitMode::Row => llama_cpp_sys_2::LLAMA_SPLIT_MODE_ROW as _, + } + } +} + +/// Create a `u32` from a `LlamaSplitMode`. +#[allow(clippy::cast_possible_wrap)] +impl From for u32 { + fn from(value: LlamaSplitMode) -> Self { + match value { + LlamaSplitMode::None => llama_cpp_sys_2::LLAMA_SPLIT_MODE_NONE as _, + LlamaSplitMode::Layer => llama_cpp_sys_2::LLAMA_SPLIT_MODE_LAYER as _, + LlamaSplitMode::Row => llama_cpp_sys_2::LLAMA_SPLIT_MODE_ROW as _, } } } From 6c1a58e3d7e9a2929aedce5b7bc0b80684b0905a Mon Sep 17 00:00:00 2001 From: Kusaanko <39370373+kusaanko@users.noreply.github.com> Date: Fri, 14 Nov 2025 03:02:48 +0900 Subject: [PATCH 10/12] Define LLAMA_SPLIT_MODE constants to resolve u32 or i32 Introduces constants for split mode values with explicit clippy lint allowances and updates the LlamaSplitMode enum to use these constants. Refactors TryFrom implementations for i32 and u32 to improve error handling and type safety, and makes LlamaSplitModeParseError's field public. --- llama-cpp-2/src/model/params.rs | 48 +++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/llama-cpp-2/src/model/params.rs b/llama-cpp-2/src/model/params.rs index 40c08578..f7e1e1e3 100644 --- a/llama-cpp-2/src/model/params.rs +++ b/llama-cpp-2/src/model/params.rs @@ -9,22 +9,32 @@ use std::ptr::null; pub mod kv_overrides; +#[allow(clippy::cast_possible_wrap)] +#[allow(clippy::cast_possible_truncation)] +const LLAMA_SPLIT_MODE_NONE: i8 = llama_cpp_sys_2::LLAMA_SPLIT_MODE_NONE as i8; +#[allow(clippy::cast_possible_wrap)] +#[allow(clippy::cast_possible_truncation)] +const LLAMA_SPLIT_MODE_LAYER: i8 = llama_cpp_sys_2::LLAMA_SPLIT_MODE_LAYER as i8; +#[allow(clippy::cast_possible_wrap)] +#[allow(clippy::cast_possible_truncation)] +const LLAMA_SPLIT_MODE_ROW: i8 = llama_cpp_sys_2::LLAMA_SPLIT_MODE_ROW as i8; + /// A rusty wrapper around `llama_split_mode`. #[repr(i8)] #[derive(Copy, Clone, Debug, PartialEq, Eq)] #[allow(clippy::cast_possible_truncation)] pub enum LlamaSplitMode { /// Single GPU - None = llama_cpp_sys_2::LLAMA_SPLIT_MODE_NONE as i8, + None = LLAMA_SPLIT_MODE_NONE, /// Split layers and KV across GPUs - Layer = llama_cpp_sys_2::LLAMA_SPLIT_MODE_LAYER as i8, + Layer = LLAMA_SPLIT_MODE_LAYER, /// Split layers and KV across GPUs, use tensor parallelism if supported - Row = llama_cpp_sys_2::LLAMA_SPLIT_MODE_ROW as i8, + Row = LLAMA_SPLIT_MODE_ROW, } /// An error that occurs when unknown split mode is encountered. #[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub struct LlamaSplitModeParseError(i32); +pub struct LlamaSplitModeParseError(pub i32); /// Create a `LlamaSplitMode` from a `i32`. 
/// @@ -35,10 +45,13 @@ impl TryFrom for LlamaSplitMode { type Error = LlamaSplitModeParseError; fn try_from(value: i32) -> Result { - match value { - x if x == llama_cpp_sys_2::LLAMA_SPLIT_MODE_NONE as i32 => Ok(Self::None), - x if x == llama_cpp_sys_2::LLAMA_SPLIT_MODE_LAYER as i32 => Ok(Self::Layer), - x if x == llama_cpp_sys_2::LLAMA_SPLIT_MODE_ROW as i32 => Ok(Self::Row), + let i8_value = value + .try_into() + .map_err(|_| LlamaSplitModeParseError(value))?; + match i8_value { + LLAMA_SPLIT_MODE_NONE => Ok(Self::None), + LLAMA_SPLIT_MODE_LAYER => Ok(Self::Layer), + LLAMA_SPLIT_MODE_ROW => Ok(Self::Row), _ => Err(LlamaSplitModeParseError(value)), } } @@ -54,15 +67,16 @@ impl TryFrom for LlamaSplitMode { type Error = LlamaSplitModeParseError; fn try_from(value: u32) -> Result { - match value { - x if x == llama_cpp_sys_2::LLAMA_SPLIT_MODE_NONE as u32 => Ok(Self::None), - x if x == llama_cpp_sys_2::LLAMA_SPLIT_MODE_LAYER as u32 => Ok(Self::Layer), - x if x == llama_cpp_sys_2::LLAMA_SPLIT_MODE_ROW as u32 => Ok(Self::Row), - _ => { - // Convert u32 to i32 without allowing a wrap; if it overflows, use i32::MAX. - let v = i32::try_from(value).unwrap_or(i32::MAX); - Err(LlamaSplitModeParseError(v)) - } + let i8_value = value + .try_into() + .map_err(|_| LlamaSplitModeParseError(value.try_into().unwrap_or(i32::MAX)))?; + match i8_value { + LLAMA_SPLIT_MODE_NONE => Ok(Self::None), + LLAMA_SPLIT_MODE_LAYER => Ok(Self::Layer), + LLAMA_SPLIT_MODE_ROW => Ok(Self::Row), + _ => Err(LlamaSplitModeParseError( + value.try_into().unwrap_or(i32::MAX), + )), } } } From 86e1fc540030f91836275122cdc9942a0be09c50 Mon Sep 17 00:00:00 2001 From: Kusaanko <39370373+kusaanko@users.noreply.github.com> Date: Fri, 14 Nov 2025 03:05:00 +0900 Subject: [PATCH 11/12] Use LLAMA_SPLIT_MODE constants defined in params.rs Cleaned up the LlamaSplitMode enum and related conversions by removing redundant clippy allow attributes. Also simplified some type conversions for clarity. --- llama-cpp-2/src/model/params.rs | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/llama-cpp-2/src/model/params.rs b/llama-cpp-2/src/model/params.rs index f7e1e1e3..5a67e70d 100644 --- a/llama-cpp-2/src/model/params.rs +++ b/llama-cpp-2/src/model/params.rs @@ -22,7 +22,6 @@ const LLAMA_SPLIT_MODE_ROW: i8 = llama_cpp_sys_2::LLAMA_SPLIT_MODE_ROW as i8; /// A rusty wrapper around `llama_split_mode`. #[repr(i8)] #[derive(Copy, Clone, Debug, PartialEq, Eq)] -#[allow(clippy::cast_possible_truncation)] pub enum LlamaSplitMode { /// Single GPU None = LLAMA_SPLIT_MODE_NONE, @@ -40,7 +39,6 @@ pub struct LlamaSplitModeParseError(pub i32); /// /// # Errors /// Returns `LlamaSplitModeParseError` if the value does not correspond to a valid `LlamaSplitMode`. -#[allow(clippy::unnecessary_cast)] impl TryFrom for LlamaSplitMode { type Error = LlamaSplitModeParseError; @@ -61,8 +59,6 @@ impl TryFrom for LlamaSplitMode { /// /// # Errors /// Returns `LlamaSplitModeParseError` if the value does not correspond to a valid `LlamaSplitMode`. -#[allow(clippy::cast_possible_wrap)] -#[allow(clippy::unnecessary_cast)] impl TryFrom for LlamaSplitMode { type Error = LlamaSplitModeParseError; @@ -82,25 +78,23 @@ impl TryFrom for LlamaSplitMode { } /// Create a `i32` from a `LlamaSplitMode`. 
-#[allow(clippy::cast_possible_wrap)] impl From for i32 { fn from(value: LlamaSplitMode) -> Self { match value { - LlamaSplitMode::None => llama_cpp_sys_2::LLAMA_SPLIT_MODE_NONE as _, - LlamaSplitMode::Layer => llama_cpp_sys_2::LLAMA_SPLIT_MODE_LAYER as _, - LlamaSplitMode::Row => llama_cpp_sys_2::LLAMA_SPLIT_MODE_ROW as _, + LlamaSplitMode::None => LLAMA_SPLIT_MODE_NONE.into(), + LlamaSplitMode::Layer => LLAMA_SPLIT_MODE_LAYER.into(), + LlamaSplitMode::Row => LLAMA_SPLIT_MODE_ROW.into(), } } } /// Create a `u32` from a `LlamaSplitMode`. -#[allow(clippy::cast_possible_wrap)] impl From for u32 { fn from(value: LlamaSplitMode) -> Self { match value { - LlamaSplitMode::None => llama_cpp_sys_2::LLAMA_SPLIT_MODE_NONE as _, - LlamaSplitMode::Layer => llama_cpp_sys_2::LLAMA_SPLIT_MODE_LAYER as _, - LlamaSplitMode::Row => llama_cpp_sys_2::LLAMA_SPLIT_MODE_ROW as _, + LlamaSplitMode::None => LLAMA_SPLIT_MODE_NONE as u32, + LlamaSplitMode::Layer => LLAMA_SPLIT_MODE_LAYER as u32, + LlamaSplitMode::Row => LLAMA_SPLIT_MODE_ROW as u32, } } } From 86b19963557f4d38f96d4a79fe90d4893c7d96d9 Mon Sep 17 00:00:00 2001 From: Kusaanko <39370373+kusaanko@users.noreply.github.com> Date: Fri, 14 Nov 2025 03:07:26 +0900 Subject: [PATCH 12/12] Fix split_mode assert Update the doc test for LlamaModelParams to assert that split_mode returns Ok(LlamaSplitMode::Layer) instead of just LlamaSplitMode::Layer, reflecting the actual return type. --- llama-cpp-2/src/model/params.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama-cpp-2/src/model/params.rs b/llama-cpp-2/src/model/params.rs index 5a67e70d..47f4c257 100644 --- a/llama-cpp-2/src/model/params.rs +++ b/llama-cpp-2/src/model/params.rs @@ -411,7 +411,7 @@ impl LlamaModelParams { /// assert_eq!(params.vocab_only(), false, "vocab_only should be false"); /// assert_eq!(params.use_mmap(), true, "use_mmap should be true"); /// assert_eq!(params.use_mlock(), false, "use_mlock should be false"); -/// assert_eq!(params.split_mode(), LlamaSplitMode::Layer, "split_mode should be LAYER"); +/// assert_eq!(params.split_mode(), Ok(LlamaSplitMode::Layer), "split_mode should be LAYER"); /// assert_eq!(params.devices().len(), 0, "devices should be empty"); /// ``` impl Default for LlamaModelParams {
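
Usage sketch (not part of the patches above): a minimal example of how a downstream caller might combine the APIs introduced in this series — `list_llama_ggml_backend_devices()`, `LlamaModelParams::with_devices()`, `with_main_gpu()`, and `with_split_mode()`. The helper name `gpu_model_params` and the layer count are illustrative choices, not anything defined by the patches; per the `with_devices()` docs, CPU and ACCEL devices are filtered out before selecting devices.

    use llama_cpp_2::{list_llama_ggml_backend_devices, LLamaCppError, LlamaBackendDeviceType};
    use llama_cpp_2::model::params::{LlamaModelParams, LlamaSplitMode};

    fn gpu_model_params() -> Result<LlamaModelParams, LLamaCppError> {
        // Enumerate ggml backend devices and keep only the GPU indices;
        // CPU and ACCEL devices do not need to be passed to with_devices().
        let gpus: Vec<usize> = list_llama_ggml_backend_devices()
            .iter()
            .filter(|d| d.device_type == LlamaBackendDeviceType::Gpu)
            .map(|d| d.index)
            .collect();

        let params = LlamaModelParams::default().with_n_gpu_layers(1000);
        let params = if gpus.len() > 1 {
            // Multi-GPU: select the devices explicitly; the default split
            // mode (Layer) spreads layers and KV cache across them.
            params.with_devices(&gpus)?
        } else {
            // Single GPU (or none detected): pin main_gpu and disable
            // splitting, as the with_main_gpu() docs recommend.
            params
                .with_main_gpu(0)
                .with_split_mode(LlamaSplitMode::None)
        };
        Ok(params)
    }

The returned params can then be used for model loading exactly as in examples/simple; only the device selection differs from the defaults.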