diff --git a/examples/simple/src/main.rs b/examples/simple/src/main.rs
index e64ece79..e7299938 100644
--- a/examples/simple/src/main.rs
+++ b/examples/simple/src/main.rs
@@ -13,7 +13,7 @@ use llama_cpp_2::context::params::LlamaContextParams;
 use llama_cpp_2::llama_backend::LlamaBackend;
 use llama_cpp_2::llama_batch::LlamaBatch;
 use llama_cpp_2::model::params::kv_overrides::ParamOverrideValue;
-use llama_cpp_2::model::params::LlamaModelParams;
+use llama_cpp_2::model::params::{LlamaModelParams, LlamaSplitMode};
 use llama_cpp_2::model::LlamaModel;
 use llama_cpp_2::model::{AddBos, Special};
 use llama_cpp_2::sampling::LlamaSampler;
@@ -48,6 +48,23 @@ struct Args {
     #[cfg(any(feature = "cuda", feature = "vulkan"))]
     #[clap(long)]
     disable_gpu: bool,
+    /// Set main GPU device index (default: 0)
+    ///
+    /// Setting this option disables multi-GPU.
+    #[arg(
+        long,
+        help = "Set main GPU device index (default: 0). Disables multi-GPU."
+    )]
+    main_gpu: Option<i32>,
+    /// Set devices to use by index
+    ///
+    /// This option overrides `main-gpu` and enables multi-GPU.
+    #[arg(
+        long,
+        value_delimiter = ',',
+        help = "Set devices to use by index, separated by commas (e.g. --devices 0,1,2). Overrides main-gpu and enables multi-GPU."
+    )]
+    devices: Option<Vec<usize>>,
     #[cfg(any(feature = "cuda", feature = "vulkan"))]
     #[arg(long, help = "Keep MoE layers on CPU")]
     cmoe: bool,
@@ -72,6 +89,8 @@ struct Args {
     ctx_size: Option<NonZeroU32>,
     #[arg(short = 'v', long, help = "enable verbose llama.cpp logs")]
     verbose: bool,
+    #[arg(long, help = "list backend devices")]
+    list_devices: bool,
 }
 
 /// Parse a single key-value pair
@@ -132,6 +151,8 @@ fn main() -> Result<()> {
         file,
         #[cfg(any(feature = "cuda", feature = "vulkan"))]
         disable_gpu,
+        main_gpu,
+        devices,
         #[cfg(any(feature = "cuda", feature = "vulkan"))]
         cmoe,
         key_value_overrides,
@@ -140,6 +161,7 @@ fn main() -> Result<()> {
         threads_batch,
         ctx_size,
        verbose,
+        list_devices,
     } = Args::parse();
 
     if verbose {
@@ -151,8 +173,26 @@ fn main() -> Result<()> {
     // init LLM
     let backend = LlamaBackend::init()?;
 
+    if list_devices {
+        let devices = llama_cpp_2::list_llama_ggml_backend_devices();
+        for (i, dev) in devices.iter().enumerate() {
+            println!("Device {i:>2}: {}", dev.name);
+            println!(" Description: {}", dev.description);
+            println!(" Device Type: {:?}", dev.device_type);
+            println!(" Backend: {}", dev.backend);
+            println!(
+                " Memory total: {:?} MiB",
+                dev.memory_total / 1024 / 1024
+            );
+            println!(
+                " Memory free: {:?} MiB",
+                dev.memory_free / 1024 / 1024
+            );
+        }
+    }
+
     // offload all layers to the gpu
-    let model_params = {
+    let mut model_params = {
         #[cfg(any(feature = "cuda", feature = "vulkan"))]
         if !disable_gpu {
             LlamaModelParams::default().with_n_gpu_layers(1000)
@@ -163,6 +203,19 @@ fn main() -> Result<()> {
         LlamaModelParams::default()
     };
 
+    if let Some(devices) = devices {
+        model_params = model_params
+            .with_devices(&devices)
+            .with_context(|| "invalid device index in --devices")?;
+        if main_gpu.is_some() {
+            eprintln!("warning: --devices overrides --main-gpu");
+        }
+    } else if let Some(main_gpu) = main_gpu {
+        model_params = model_params.with_main_gpu(main_gpu);
+        // Enable single GPU mode
+        model_params = model_params.with_split_mode(LlamaSplitMode::None);
+    }
+
     let prompt = if let Some(str) = prompt {
         if file.is_some() {
             bail!("either prompt or file must be specified, but not both")
diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs
index 2038b5e0..16f05e31 100644
--- a/llama-cpp-2/src/lib.rs
+++ b/llama-cpp-2/src/lib.rs
@@ -66,6 +66,12 @@ pub enum LLamaCppError {
     #[error(transparent)]
     EmbeddingError(#[from] EmbeddingsError),
     // See [`LlamaSamplerError`]
+    /// Backend device not found
+    #[error("Backend device {0} not found")]
+    BackendDeviceNotFound(usize),
+    /// Max devices exceeded
+    #[error("Max devices exceeded. Max devices is {0}")]
+    MaxDevicesExceeded(usize),
 }
 
 /// There was an error while getting the chat template from a model.
@@ -349,6 +355,91 @@ pub fn llama_supports_mlock() -> bool {
     unsafe { llama_cpp_sys_2::llama_supports_mlock() }
 }
 
+/// Backend device type
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum LlamaBackendDeviceType {
+    /// CPU device
+    Cpu,
+    /// ACCEL device
+    Accelerator,
+    /// GPU device
+    Gpu,
+    /// iGPU device
+    IntegratedGpu,
+    /// Unknown device type
+    Unknown,
+}
+
+/// A ggml backend device
+///
+/// The index can be used with `LlamaModelParams::with_devices` to select specific devices.
+#[derive(Debug, Clone)]
+pub struct LlamaBackendDevice {
+    /// The index of the device
+    ///
+    /// The index can be used with `LlamaModelParams::with_devices` to select specific devices.
+    pub index: usize,
+    /// The name of the device (e.g. "Vulkan0")
+    pub name: String,
+    /// A description of the device (e.g. "NVIDIA GeForce RTX 3080")
+    pub description: String,
+    /// The backend of the device (e.g. "Vulkan", "CUDA", "CPU")
+    pub backend: String,
+    /// Total memory of the device in bytes
+    pub memory_total: usize,
+    /// Free memory of the device in bytes
+    pub memory_free: usize,
+    /// Device type
+    pub device_type: LlamaBackendDeviceType,
+}
+
+/// List ggml backend devices
+#[must_use]
+pub fn list_llama_ggml_backend_devices() -> Vec<LlamaBackendDevice> {
+    let mut devices = Vec::new();
+    for i in 0..unsafe { llama_cpp_sys_2::ggml_backend_dev_count() } {
+        fn cstr_to_string(ptr: *const i8) -> String {
+            if ptr.is_null() {
+                String::new()
+            } else {
+                unsafe { std::ffi::CStr::from_ptr(ptr) }
+                    .to_string_lossy()
+                    .to_string()
+            }
+        }
+        let dev = unsafe { llama_cpp_sys_2::ggml_backend_dev_get(i) };
+        let props = unsafe {
+            let mut props = std::mem::zeroed();
+            llama_cpp_sys_2::ggml_backend_dev_get_props(dev, &raw mut props);
+            props
+        };
+        let name = cstr_to_string(props.name);
+        let description = cstr_to_string(props.description);
+        let backend = unsafe { llama_cpp_sys_2::ggml_backend_dev_backend_reg(dev) };
+        let backend_name = unsafe { llama_cpp_sys_2::ggml_backend_reg_name(backend) };
+        let backend = cstr_to_string(backend_name);
+        let memory_total = props.memory_total;
+        let memory_free = props.memory_free;
+        let device_type = match props.type_ {
+            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_CPU => LlamaBackendDeviceType::Cpu,
+            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_ACCEL => LlamaBackendDeviceType::Accelerator,
+            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_GPU => LlamaBackendDeviceType::Gpu,
+            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_IGPU => LlamaBackendDeviceType::IntegratedGpu,
+            _ => LlamaBackendDeviceType::Unknown,
+        };
+        devices.push(LlamaBackendDevice {
+            index: i,
+            name,
+            description,
+            backend,
+            memory_total,
+            memory_free,
+            device_type,
+        });
+    }
+    devices
+}
+
 /// Options to configure how llama.cpp logs are intercepted.
 #[derive(Default, Debug, Clone)]
 pub struct LogOptions {
diff --git a/llama-cpp-2/src/model/params.rs b/llama-cpp-2/src/model/params.rs
index 822bc69f..47f4c257 100644
--- a/llama-cpp-2/src/model/params.rs
+++ b/llama-cpp-2/src/model/params.rs
@@ -1,6 +1,7 @@
 //! A safe wrapper around `llama_model_params`.
 
 use crate::model::params::kv_overrides::KvOverrides;
+use crate::LLamaCppError;
 use std::ffi::{c_char, CStr};
 use std::fmt::{Debug, Formatter};
 use std::pin::Pin;
@@ -8,12 +9,116 @@ use std::ptr::null;
 
 pub mod kv_overrides;
 
+#[allow(clippy::cast_possible_wrap)]
+#[allow(clippy::cast_possible_truncation)]
+const LLAMA_SPLIT_MODE_NONE: i8 = llama_cpp_sys_2::LLAMA_SPLIT_MODE_NONE as i8;
+#[allow(clippy::cast_possible_wrap)]
+#[allow(clippy::cast_possible_truncation)]
+const LLAMA_SPLIT_MODE_LAYER: i8 = llama_cpp_sys_2::LLAMA_SPLIT_MODE_LAYER as i8;
+#[allow(clippy::cast_possible_wrap)]
+#[allow(clippy::cast_possible_truncation)]
+const LLAMA_SPLIT_MODE_ROW: i8 = llama_cpp_sys_2::LLAMA_SPLIT_MODE_ROW as i8;
+
+/// A rusty wrapper around `llama_split_mode`.
+#[repr(i8)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum LlamaSplitMode {
+    /// Single GPU
+    None = LLAMA_SPLIT_MODE_NONE,
+    /// Split layers and KV across GPUs
+    Layer = LLAMA_SPLIT_MODE_LAYER,
+    /// Split layers and KV across GPUs, use tensor parallelism if supported
+    Row = LLAMA_SPLIT_MODE_ROW,
+}
+
+/// An error that occurs when an unknown split mode is encountered.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct LlamaSplitModeParseError(pub i32);
+
+/// Create a `LlamaSplitMode` from an `i32`.
+///
+/// # Errors
+/// Returns `LlamaSplitModeParseError` if the value does not correspond to a valid `LlamaSplitMode`.
+impl TryFrom<i32> for LlamaSplitMode {
+    type Error = LlamaSplitModeParseError;
+
+    fn try_from(value: i32) -> Result<Self, Self::Error> {
+        let i8_value = value
+            .try_into()
+            .map_err(|_| LlamaSplitModeParseError(value))?;
+        match i8_value {
+            LLAMA_SPLIT_MODE_NONE => Ok(Self::None),
+            LLAMA_SPLIT_MODE_LAYER => Ok(Self::Layer),
+            LLAMA_SPLIT_MODE_ROW => Ok(Self::Row),
+            _ => Err(LlamaSplitModeParseError(value)),
+        }
+    }
+}
+
+/// Create a `LlamaSplitMode` from a `u32`.
+///
+/// # Errors
+/// Returns `LlamaSplitModeParseError` if the value does not correspond to a valid `LlamaSplitMode`.
+impl TryFrom<u32> for LlamaSplitMode {
+    type Error = LlamaSplitModeParseError;
+
+    fn try_from(value: u32) -> Result<Self, Self::Error> {
+        let i8_value = value
+            .try_into()
+            .map_err(|_| LlamaSplitModeParseError(value.try_into().unwrap_or(i32::MAX)))?;
+        match i8_value {
+            LLAMA_SPLIT_MODE_NONE => Ok(Self::None),
+            LLAMA_SPLIT_MODE_LAYER => Ok(Self::Layer),
+            LLAMA_SPLIT_MODE_ROW => Ok(Self::Row),
+            _ => Err(LlamaSplitModeParseError(
+                value.try_into().unwrap_or(i32::MAX),
+            )),
+        }
+    }
+}
+
+/// Create an `i32` from a `LlamaSplitMode`.
+impl From<LlamaSplitMode> for i32 {
+    fn from(value: LlamaSplitMode) -> Self {
+        match value {
+            LlamaSplitMode::None => LLAMA_SPLIT_MODE_NONE.into(),
+            LlamaSplitMode::Layer => LLAMA_SPLIT_MODE_LAYER.into(),
+            LlamaSplitMode::Row => LLAMA_SPLIT_MODE_ROW.into(),
+        }
+    }
+}
+
+/// Create a `u32` from a `LlamaSplitMode`.
+impl From<LlamaSplitMode> for u32 {
+    fn from(value: LlamaSplitMode) -> Self {
+        match value {
+            LlamaSplitMode::None => LLAMA_SPLIT_MODE_NONE as u32,
+            LlamaSplitMode::Layer => LLAMA_SPLIT_MODE_LAYER as u32,
+            LlamaSplitMode::Row => LLAMA_SPLIT_MODE_ROW as u32,
+        }
+    }
+}
+
+/// The default split mode is `Layer` in llama.cpp.
+impl Default for LlamaSplitMode {
+    fn default() -> Self {
+        LlamaSplitMode::Layer
+    }
+}
+
+/// The maximum number of devices supported.
+///
+/// The effective maximum is the lesser of this value and the value returned by
+/// `llama_cpp_2::max_devices()`.
+pub const LLAMA_CPP_MAX_DEVICES: usize = 16;
+
 /// A safe wrapper around `llama_model_params`.
 #[allow(clippy::module_name_repetitions)]
 pub struct LlamaModelParams {
     pub(crate) params: llama_cpp_sys_2::llama_model_params,
     kv_overrides: Vec<llama_cpp_sys_2::llama_model_kv_override>,
     buft_overrides: Vec<llama_cpp_sys_2::llama_model_tensor_buft_override>,
+    devices: Pin<Box<[llama_cpp_sys_2::ggml_backend_dev_t; 16]>>,
 }
 
 impl Debug for LlamaModelParams {
@@ -24,6 +129,8 @@ impl Debug for LlamaModelParams {
             .field("vocab_only", &self.params.vocab_only)
             .field("use_mmap", &self.params.use_mmap)
             .field("use_mlock", &self.params.use_mlock)
+            .field("split_mode", &self.split_mode())
+            .field("devices", &self.devices)
             .field("kv_overrides", &"vec of kv_overrides")
             .finish()
     }
@@ -181,6 +288,38 @@ impl LlamaModelParams {
         self.params.use_mlock
     }
 
+    /// get the split mode
+    ///
+    /// # Errors
+    /// Returns `LlamaSplitModeParseError` if an unknown split mode is encountered.
+    pub fn split_mode(&self) -> Result<LlamaSplitMode, LlamaSplitModeParseError> {
+        LlamaSplitMode::try_from(self.params.split_mode)
+    }
+
+    /// get the devices
+    #[must_use]
+    pub fn devices(&self) -> Vec<usize> {
+        let mut backend_devices = Vec::new();
+        for i in 0..unsafe { llama_cpp_sys_2::ggml_backend_dev_count() } {
+            let dev = unsafe { llama_cpp_sys_2::ggml_backend_dev_get(i) };
+            backend_devices.push(dev);
+        }
+        let mut devices = Vec::new();
+        for &dev in self.devices.iter() {
+            if dev.is_null() {
+                break;
+            }
+            if let Some((index, _)) = backend_devices
+                .iter()
+                .enumerate()
+                .find(|&(_i, &d)| d == dev)
+            {
+                devices.push(index);
+            }
+        }
+        devices
+    }
+
     /// sets the number of gpu layers to offload to the GPU.
     /// ```
     /// # use llama_cpp_2::model::params::LlamaModelParams;
@@ -198,6 +337,8 @@ impl LlamaModelParams {
     }
 
     /// sets the main GPU
+    ///
+    /// For this setting to take effect, `split_mode` must be set to `LlamaSplitMode::None` (single GPU mode).
     #[must_use]
     pub fn with_main_gpu(mut self, main_gpu: i32) -> Self {
         self.params.main_gpu = main_gpu;
@@ -217,17 +358,61 @@ impl LlamaModelParams {
         self.params.use_mlock = use_mlock;
         self
     }
+
+    /// sets `split_mode`
+    #[must_use]
+    pub fn with_split_mode(mut self, split_mode: LlamaSplitMode) -> Self {
+        self.params.split_mode = split_mode.into();
+        self
+    }
+
+    /// sets `devices`
+    ///
+    /// The devices are specified as indices that correspond to the ggml backend device indices.
+    ///
+    /// The maximum number of devices is 16.
+    ///
+    /// You don't need to specify CPU or ACCEL devices.
+    ///
+    /// # Errors
+    /// Returns `LLamaCppError::BackendDeviceNotFound` if any device index is invalid.
+    pub fn with_devices(mut self, devices: &[usize]) -> Result<Self, LLamaCppError> {
+        for dev in self.devices.iter_mut() {
+            *dev = std::ptr::null_mut();
+        }
+        // Check device count
+        let max_devices = crate::max_devices().min(LLAMA_CPP_MAX_DEVICES);
+        if devices.len() > max_devices {
+            return Err(LLamaCppError::MaxDevicesExceeded(max_devices));
+        }
+        for (i, &dev) in devices.iter().enumerate() {
+            if dev >= unsafe { llama_cpp_sys_2::ggml_backend_dev_count() } {
+                return Err(LLamaCppError::BackendDeviceNotFound(dev));
+            }
+            let backend_dev = unsafe { llama_cpp_sys_2::ggml_backend_dev_get(dev) };
+            self.devices[i] = backend_dev;
+        }
+        if devices.is_empty() {
+            self.params.devices = std::ptr::null_mut();
+        } else {
+            self.params.devices = self.devices.as_mut_ptr();
+        }
+        Ok(self)
+    }
 }
 
 /// Default parameters for `LlamaModel`. (as defined in llama.cpp by `llama_model_default_params`)
 /// ```
 /// # use llama_cpp_2::model::params::LlamaModelParams;
+/// use llama_cpp_2::model::params::LlamaSplitMode;
 /// let params = LlamaModelParams::default();
 /// assert_eq!(params.n_gpu_layers(), 999, "n_gpu_layers should be 999");
 /// assert_eq!(params.main_gpu(), 0, "main_gpu should be 0");
 /// assert_eq!(params.vocab_only(), false, "vocab_only should be false");
 /// assert_eq!(params.use_mmap(), true, "use_mmap should be true");
 /// assert_eq!(params.use_mlock(), false, "use_mlock should be false");
+/// assert_eq!(params.split_mode(), Ok(LlamaSplitMode::Layer), "split_mode should be LAYER");
+/// assert_eq!(params.devices().len(), 0, "devices should be empty");
 /// ```
 impl Default for LlamaModelParams {
     fn default() -> Self {
@@ -246,6 +431,7 @@ impl Default for LlamaModelParams {
                 pattern: std::ptr::null(),
                 buft: std::ptr::null_mut(),
             }],
+            devices: Box::pin([std::ptr::null_mut(); 16]),
         }
     }
 }
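
For reviewers, a minimal, self-contained sketch of how the pieces introduced above fit together outside the `simple` example. It only uses the API added in this diff plus `LlamaBackend::init`; the device-selection policy (use all GPUs when more than one is present, otherwise pin to device 0 in single-GPU mode) is illustrative, not something this PR prescribes.

```rust
// Sketch only: assumes the API added in this PR; the GPU-selection policy is illustrative.
use llama_cpp_2::llama_backend::LlamaBackend;
use llama_cpp_2::model::params::{LlamaModelParams, LlamaSplitMode};
use llama_cpp_2::{list_llama_ggml_backend_devices, LlamaBackendDeviceType};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Initialize the backend first so that ggml has registered its devices.
    let _backend = LlamaBackend::init()?;

    // Collect the indices of all GPU-type backend devices.
    let gpus: Vec<usize> = list_llama_ggml_backend_devices()
        .into_iter()
        .filter(|d| d.device_type == LlamaBackendDeviceType::Gpu)
        .map(|d| d.index)
        .collect();

    let model_params = if gpus.len() > 1 {
        // Multi-GPU: pass explicit device indices; `with_devices` validates them
        // and returns an error on a bad index or too many devices.
        LlamaModelParams::default().with_devices(&gpus)?
    } else {
        // Single GPU (or CPU-only build): pin to device 0 and disable splitting,
        // mirroring what the updated example does for `--main-gpu`.
        LlamaModelParams::default()
            .with_main_gpu(0)
            .with_split_mode(LlamaSplitMode::None)
    };

    println!("split mode: {:?}", model_params.split_mode());
    println!("device indices: {:?}", model_params.devices());
    Ok(())
}
```

This mirrors the flag handling in the updated `simple` example: an explicit device list takes precedence over `main_gpu`, which per the new `with_main_gpu` docs only takes effect together with `LlamaSplitMode::None`.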