diff --git a/examples/simple/src/main.rs b/examples/simple/src/main.rs
index e64ece79..e7299938 100644
--- a/examples/simple/src/main.rs
+++ b/examples/simple/src/main.rs
@@ -13,7 +13,7 @@ use llama_cpp_2::context::params::LlamaContextParams;
 use llama_cpp_2::llama_backend::LlamaBackend;
 use llama_cpp_2::llama_batch::LlamaBatch;
 use llama_cpp_2::model::params::kv_overrides::ParamOverrideValue;
-use llama_cpp_2::model::params::LlamaModelParams;
+use llama_cpp_2::model::params::{LlamaModelParams, LlamaSplitMode};
 use llama_cpp_2::model::LlamaModel;
 use llama_cpp_2::model::{AddBos, Special};
 use llama_cpp_2::sampling::LlamaSampler;
@@ -48,6 +48,23 @@ struct Args {
     #[cfg(any(feature = "cuda", feature = "vulkan"))]
     #[clap(long)]
     disable_gpu: bool,
+    /// Set main GPU device index (default: 0)
+    ///
+    /// Setting this option disables multi-GPU.
+    #[arg(
+        long,
+        help = "Set main GPU device index (default: 0). Disables multi-GPU."
+    )]
+    main_gpu: Option<i32>,
+    /// Set devices to use by index
+    ///
+    /// This option overrides `main-gpu` and enables multi-GPU.
+    #[arg(
+        long,
+        value_delimiter = ',',
+        help = "Set devices to use by index, separated by commas (e.g. --devices 0,1,2). Overrides main-gpu and enables multi-GPU."
+    )]
+    devices: Option<Vec<usize>>,
     #[cfg(any(feature = "cuda", feature = "vulkan"))]
     #[arg(long, help = "Keep MoE layers on CPU")]
     cmoe: bool,
@@ -72,6 +89,8 @@ struct Args {
     ctx_size: Option<NonZeroU32>,
     #[arg(short = 'v', long, help = "enable verbose llama.cpp logs")]
     verbose: bool,
+    #[arg(long, help = "list backend devices")]
+    list_devices: bool,
 }
 
 /// Parse a single key-value pair
@@ -132,6 +151,8 @@ fn main() -> Result<()> {
         file,
         #[cfg(any(feature = "cuda", feature = "vulkan"))]
         disable_gpu,
+        main_gpu,
+        devices,
         #[cfg(any(feature = "cuda", feature = "vulkan"))]
         cmoe,
         key_value_overrides,
@@ -140,6 +161,7 @@ fn main() -> Result<()> {
         threads_batch,
         ctx_size,
        verbose,
+        list_devices,
     } = Args::parse();
 
     if verbose {
@@ -151,8 +173,26 @@ fn main() -> Result<()> {
     // init LLM
     let backend = LlamaBackend::init()?;
 
+    if list_devices {
+        let devices = llama_cpp_2::list_llama_ggml_backend_devices();
+        for (i, dev) in devices.iter().enumerate() {
+            println!("Device {i:>2}: {}", dev.name);
+            println!(" Description: {}", dev.description);
+            println!(" Device Type: {:?}", dev.device_type);
+            println!(" Backend: {}", dev.backend);
+            println!(
+                " Memory total: {:?} MiB",
+                dev.memory_total / 1024 / 1024
+            );
+            println!(
+                " Memory free: {:?} MiB",
+                dev.memory_free / 1024 / 1024
+            );
+        }
+    }
+
     // offload all layers to the gpu
-    let model_params = {
+    let mut model_params = {
         #[cfg(any(feature = "cuda", feature = "vulkan"))]
         if !disable_gpu {
             LlamaModelParams::default().with_n_gpu_layers(1000)
@@ -163,6 +203,19 @@ fn main() -> Result<()> {
         LlamaModelParams::default()
     };
 
+    if let Some(devices) = devices {
+        model_params = model_params
+            .with_devices(&devices)
+            .with_context(|| "invalid device index in --devices")?;
+        if main_gpu.is_some() {
+            eprintln!("warning: --devices overrides --main-gpu");
+        }
+    } else if let Some(main_gpu) = main_gpu {
+        model_params = model_params.with_main_gpu(main_gpu);
+        // Enable single GPU mode
+        model_params = model_params.with_split_mode(LlamaSplitMode::None);
+    }
+
     let prompt = if let Some(str) = prompt {
         if file.is_some() {
             bail!("either prompt or file must be specified, but not both")
diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs
index 2038b5e0..16f05e31 100644
--- a/llama-cpp-2/src/lib.rs
+++ b/llama-cpp-2/src/lib.rs
@@ -66,6 +66,12 @@ pub enum LLamaCppError {
     #[error(transparent)]
     EmbeddingError(#[from] EmbeddingsError),
     // See [`LlamaSamplerError`]
+    /// Backend device not found
+    #[error("Backend device {0} not found")]
+    BackendDeviceNotFound(usize),
+    /// Max devices exceeded
+    #[error("Max devices exceeded. Max devices is {0}")]
+    MaxDevicesExceeded(usize),
 }
 
 /// There was an error while getting the chat template from a model.
@@ -349,6 +355,91 @@ pub fn llama_supports_mlock() -> bool {
     unsafe { llama_cpp_sys_2::llama_supports_mlock() }
 }
 
+/// Backend device type
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum LlamaBackendDeviceType {
+    /// CPU device
+    Cpu,
+    /// ACCEL device
+    Accelerator,
+    /// GPU device
+    Gpu,
+    /// iGPU device
+    IntegratedGpu,
+    /// Unknown device type
+    Unknown,
+}
+
+/// A ggml backend device
+///
+/// The index can be used with `LlamaModelParams::with_devices` to select specific devices.
+#[derive(Debug, Clone)]
+pub struct LlamaBackendDevice {
+    /// The index of the device
+    ///
+    /// The index can be used with `LlamaModelParams::with_devices` to select specific devices.
+    pub index: usize,
+    /// The name of the device (e.g. "Vulkan0")
+    pub name: String,
+    /// A description of the device (e.g. "NVIDIA GeForce RTX 3080")
+    pub description: String,
+    /// The backend of the device (e.g. "Vulkan", "CUDA", "CPU")
+    pub backend: String,
+    /// Total memory of the device in bytes
+    pub memory_total: usize,
+    /// Free memory of the device in bytes
+    pub memory_free: usize,
+    /// Device type
+    pub device_type: LlamaBackendDeviceType,
+}
+
+/// List ggml backend devices
+#[must_use]
+pub fn list_llama_ggml_backend_devices() -> Vec<LlamaBackendDevice> {
+    let mut devices = Vec::new();
+    for i in 0..unsafe { llama_cpp_sys_2::ggml_backend_dev_count() } {
+        fn cstr_to_string(ptr: *const i8) -> String {
+            if ptr.is_null() {
+                String::new()
+            } else {
+                unsafe { std::ffi::CStr::from_ptr(ptr) }
+                    .to_string_lossy()
+                    .to_string()
+            }
+        }
+        let dev = unsafe { llama_cpp_sys_2::ggml_backend_dev_get(i) };
+        let props = unsafe {
+            let mut props = std::mem::zeroed();
+            llama_cpp_sys_2::ggml_backend_dev_get_props(dev, &raw mut props);
+            props
+        };
+        let name = cstr_to_string(props.name);
+        let description = cstr_to_string(props.description);
+        let backend = unsafe { llama_cpp_sys_2::ggml_backend_dev_backend_reg(dev) };
+        let backend_name = unsafe { llama_cpp_sys_2::ggml_backend_reg_name(backend) };
+        let backend = cstr_to_string(backend_name);
+        let memory_total = props.memory_total;
+        let memory_free = props.memory_free;
+        let device_type = match props.type_ {
+            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_CPU => LlamaBackendDeviceType::Cpu,
+            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_ACCEL => LlamaBackendDeviceType::Accelerator,
+            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_GPU => LlamaBackendDeviceType::Gpu,
+            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_IGPU => LlamaBackendDeviceType::IntegratedGpu,
+            _ => LlamaBackendDeviceType::Unknown,
+        };
+        devices.push(LlamaBackendDevice {
+            index: i,
+            name,
+            description,
+            backend,
+            memory_total,
+            memory_free,
+            device_type,
+        });
+    }
+    devices
+}
+
 /// Options to configure how llama.cpp logs are intercepted.
 #[derive(Default, Debug, Clone)]
 pub struct LogOptions {
diff --git a/llama-cpp-2/src/model/params.rs b/llama-cpp-2/src/model/params.rs
index 822bc69f..47f4c257 100644
--- a/llama-cpp-2/src/model/params.rs
+++ b/llama-cpp-2/src/model/params.rs
@@ -1,6 +1,7 @@
 //! A safe wrapper around `llama_model_params`.
 
 use crate::model::params::kv_overrides::KvOverrides;
+use crate::LLamaCppError;
 use std::ffi::{c_char, CStr};
 use std::fmt::{Debug, Formatter};
 use std::pin::Pin;
@@ -8,12 +9,116 @@ use std::ptr::null;
 
 pub mod kv_overrides;
 
+#[allow(clippy::cast_possible_wrap)]
+#[allow(clippy::cast_possible_truncation)]
+const LLAMA_SPLIT_MODE_NONE: i8 = llama_cpp_sys_2::LLAMA_SPLIT_MODE_NONE as i8;
+#[allow(clippy::cast_possible_wrap)]
+#[allow(clippy::cast_possible_truncation)]
+const LLAMA_SPLIT_MODE_LAYER: i8 = llama_cpp_sys_2::LLAMA_SPLIT_MODE_LAYER as i8;
+#[allow(clippy::cast_possible_wrap)]
+#[allow(clippy::cast_possible_truncation)]
+const LLAMA_SPLIT_MODE_ROW: i8 = llama_cpp_sys_2::LLAMA_SPLIT_MODE_ROW as i8;
+
+/// A rusty wrapper around `llama_split_mode`.
+#[repr(i8)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum LlamaSplitMode {
+    /// Single GPU
+    None = LLAMA_SPLIT_MODE_NONE,
+    /// Split layers and KV across GPUs
+    Layer = LLAMA_SPLIT_MODE_LAYER,
+    /// Split layers and KV across GPUs, use tensor parallelism if supported
+    Row = LLAMA_SPLIT_MODE_ROW,
+}
+
+/// An error that occurs when an unknown split mode is encountered.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct LlamaSplitModeParseError(pub i32);
+
+/// Create a `LlamaSplitMode` from an `i32`.
+///
+/// # Errors
+/// Returns `LlamaSplitModeParseError` if the value does not correspond to a valid `LlamaSplitMode`.
+impl TryFrom<i32> for LlamaSplitMode {
+    type Error = LlamaSplitModeParseError;
+
+    fn try_from(value: i32) -> Result<Self, Self::Error> {
+        let i8_value = value
+            .try_into()
+            .map_err(|_| LlamaSplitModeParseError(value))?;
+        match i8_value {
+            LLAMA_SPLIT_MODE_NONE => Ok(Self::None),
+            LLAMA_SPLIT_MODE_LAYER => Ok(Self::Layer),
+            LLAMA_SPLIT_MODE_ROW => Ok(Self::Row),
+            _ => Err(LlamaSplitModeParseError(value)),
+        }
+    }
+}
+
+/// Create a `LlamaSplitMode` from a `u32`.
+///
+/// # Errors
+/// Returns `LlamaSplitModeParseError` if the value does not correspond to a valid `LlamaSplitMode`.
+impl TryFrom<u32> for LlamaSplitMode {
+    type Error = LlamaSplitModeParseError;
+
+    fn try_from(value: u32) -> Result<Self, Self::Error> {
+        let i8_value = value
+            .try_into()
+            .map_err(|_| LlamaSplitModeParseError(value.try_into().unwrap_or(i32::MAX)))?;
+        match i8_value {
+            LLAMA_SPLIT_MODE_NONE => Ok(Self::None),
+            LLAMA_SPLIT_MODE_LAYER => Ok(Self::Layer),
+            LLAMA_SPLIT_MODE_ROW => Ok(Self::Row),
+            _ => Err(LlamaSplitModeParseError(
+                value.try_into().unwrap_or(i32::MAX),
+            )),
+        }
+    }
+}
+
+/// Create an `i32` from a `LlamaSplitMode`.
+impl From<LlamaSplitMode> for i32 {
+    fn from(value: LlamaSplitMode) -> Self {
+        match value {
+            LlamaSplitMode::None => LLAMA_SPLIT_MODE_NONE.into(),
+            LlamaSplitMode::Layer => LLAMA_SPLIT_MODE_LAYER.into(),
+            LlamaSplitMode::Row => LLAMA_SPLIT_MODE_ROW.into(),
+        }
+    }
+}
+
+/// Create a `u32` from a `LlamaSplitMode`.
+impl From<LlamaSplitMode> for u32 {
+    fn from(value: LlamaSplitMode) -> Self {
+        match value {
+            LlamaSplitMode::None => LLAMA_SPLIT_MODE_NONE as u32,
+            LlamaSplitMode::Layer => LLAMA_SPLIT_MODE_LAYER as u32,
+            LlamaSplitMode::Row => LLAMA_SPLIT_MODE_ROW as u32,
+        }
+    }
+}
+
+/// The default split mode is `Layer` in llama.cpp.
+impl Default for LlamaSplitMode {
+    fn default() -> Self {
+        LlamaSplitMode::Layer
+    }
+}
+
+/// The maximum number of devices supported.
+///
+/// The effective maximum is the lesser of this value and the value returned by
+/// `llama_cpp_2::max_devices()`.
+pub const LLAMA_CPP_MAX_DEVICES: usize = 16;
+
 /// A safe wrapper around `llama_model_params`.
 #[allow(clippy::module_name_repetitions)]
 pub struct LlamaModelParams {
     pub(crate) params: llama_cpp_sys_2::llama_model_params,
     kv_overrides: Vec<llama_cpp_sys_2::llama_model_kv_override>,
     buft_overrides: Vec<llama_cpp_sys_2::llama_model_tensor_buft_override>,
+    devices: Pin<Box<[llama_cpp_sys_2::ggml_backend_dev_t; 16]>>,
 }
 
 impl Debug for LlamaModelParams {
@@ -24,6 +129,8 @@ impl Debug for LlamaModelParams {
             .field("vocab_only", &self.params.vocab_only)
             .field("use_mmap", &self.params.use_mmap)
             .field("use_mlock", &self.params.use_mlock)
+            .field("split_mode", &self.split_mode())
+            .field("devices", &self.devices)
             .field("kv_overrides", &"vec of kv_overrides")
             .finish()
     }
@@ -181,6 +288,38 @@ impl LlamaModelParams {
         self.params.use_mlock
     }
 
+    /// get the split mode
+    ///
+    /// # Errors
+    /// Returns `LlamaSplitModeParseError` if an unknown split mode is encountered.
+    pub fn split_mode(&self) -> Result<LlamaSplitMode, LlamaSplitModeParseError> {
+        LlamaSplitMode::try_from(self.params.split_mode)
+    }
+
+    /// get the devices
+    #[must_use]
+    pub fn devices(&self) -> Vec<usize> {
+        let mut backend_devices = Vec::new();
+        for i in 0..unsafe { llama_cpp_sys_2::ggml_backend_dev_count() } {
+            let dev = unsafe { llama_cpp_sys_2::ggml_backend_dev_get(i) };
+            backend_devices.push(dev);
+        }
+        let mut devices = Vec::new();
+        for &dev in self.devices.iter() {
+            if dev.is_null() {
+                break;
+            }
+            if let Some((index, _)) = backend_devices
+                .iter()
+                .enumerate()
+                .find(|&(_i, &d)| d == dev)
+            {
+                devices.push(index);
+            }
+        }
+        devices
+    }
+
     /// sets the number of gpu layers to offload to the GPU.
     /// ```
     /// # use llama_cpp_2::model::params::LlamaModelParams;
@@ -198,6 +337,8 @@ impl LlamaModelParams {
     }
 
     /// sets the main GPU
+    ///
+    /// For this setting to take effect, `split_mode` must be set to `LlamaSplitMode::None` (single GPU mode).
     #[must_use]
     pub fn with_main_gpu(mut self, main_gpu: i32) -> Self {
         self.params.main_gpu = main_gpu;
@@ -217,17 +358,61 @@ impl LlamaModelParams {
         self.params.use_mlock = use_mlock;
         self
     }
+
+    /// sets `split_mode`
+    #[must_use]
+    pub fn with_split_mode(mut self, split_mode: LlamaSplitMode) -> Self {
+        self.params.split_mode = split_mode.into();
+        self
+    }
+
+    /// sets `devices`
+    ///
+    /// The devices are specified as indices that correspond to the ggml backend device indices.
+    ///
+    /// The maximum number of devices is 16.
+    ///
+    /// You don't need to specify CPU or ACCEL devices.
+    ///
+    /// # Errors
+    /// Returns `LLamaCppError::BackendDeviceNotFound` if any device index is invalid.
+    pub fn with_devices(mut self, devices: &[usize]) -> Result<Self, LLamaCppError> {
+        for dev in self.devices.iter_mut() {
+            *dev = std::ptr::null_mut();
+        }
+        // Check device count
+        let max_devices = crate::max_devices().min(LLAMA_CPP_MAX_DEVICES);
+        if devices.len() > max_devices {
+            return Err(LLamaCppError::MaxDevicesExceeded(max_devices));
+        }
+        for (i, &dev) in devices.iter().enumerate() {
+            if dev >= unsafe { llama_cpp_sys_2::ggml_backend_dev_count() } {
+                return Err(LLamaCppError::BackendDeviceNotFound(dev));
+            }
+            let backend_dev = unsafe { llama_cpp_sys_2::ggml_backend_dev_get(dev) };
+            self.devices[i] = backend_dev;
+        }
+        if devices.is_empty() {
+            self.params.devices = std::ptr::null_mut();
+        } else {
+            self.params.devices = self.devices.as_mut_ptr();
+        }
+        Ok(self)
+    }
 }
 
 /// Default parameters for `LlamaModel`. (as defined in llama.cpp by `llama_model_default_params`)
 /// ```
 /// # use llama_cpp_2::model::params::LlamaModelParams;
+/// use llama_cpp_2::model::params::LlamaSplitMode;
 /// let params = LlamaModelParams::default();
 /// assert_eq!(params.n_gpu_layers(), 999, "n_gpu_layers should be 999");
 /// assert_eq!(params.main_gpu(), 0, "main_gpu should be 0");
 /// assert_eq!(params.vocab_only(), false, "vocab_only should be false");
 /// assert_eq!(params.use_mmap(), true, "use_mmap should be true");
 /// assert_eq!(params.use_mlock(), false, "use_mlock should be false");
+/// assert_eq!(params.split_mode(), Ok(LlamaSplitMode::Layer), "split_mode should be LAYER");
+/// assert_eq!(params.devices().len(), 0, "devices should be empty");
 /// ```
 impl Default for LlamaModelParams {
     fn default() -> Self {
@@ -246,6 +431,7 @@ impl Default for LlamaModelParams {
                 pattern: std::ptr::null(),
                 buft: std::ptr::null_mut(),
             }],
+            devices: Box::pin([std::ptr::null_mut(); 16]),
         }
     }
 }
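
For reviewers, a minimal, self-contained sketch of how the pieces introduced above fit together outside the `simple` example. It only uses the API added in this diff plus `LlamaBackend::init`; the device-selection policy (use all GPUs when more than one is present, otherwise pin to device 0 in single-GPU mode) is illustrative, not something this PR prescribes.

```rust
// Sketch only: assumes the API added in this PR; the GPU-selection policy is illustrative.
use llama_cpp_2::llama_backend::LlamaBackend;
use llama_cpp_2::model::params::{LlamaModelParams, LlamaSplitMode};
use llama_cpp_2::{list_llama_ggml_backend_devices, LlamaBackendDeviceType};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Initialize the backend first so that ggml has registered its devices.
    let _backend = LlamaBackend::init()?;

    // Collect the indices of all GPU-type backend devices.
    let gpus: Vec<usize> = list_llama_ggml_backend_devices()
        .into_iter()
        .filter(|d| d.device_type == LlamaBackendDeviceType::Gpu)
        .map(|d| d.index)
        .collect();

    let model_params = if gpus.len() > 1 {
        // Multi-GPU: pass explicit device indices; `with_devices` validates them
        // and returns an error on a bad index or too many devices.
        LlamaModelParams::default().with_devices(&gpus)?
    } else {
        // Single GPU (or CPU-only build): pin to device 0 and disable splitting,
        // mirroring what the updated example does for `--main-gpu`.
        LlamaModelParams::default()
            .with_main_gpu(0)
            .with_split_mode(LlamaSplitMode::None)
    };

    println!("split mode: {:?}", model_params.split_mode());
    println!("device indices: {:?}", model_params.devices());
    Ok(())
}
```

This mirrors the flag handling in the updated `simple` example: an explicit device list takes precedence over `main_gpu`, which per the new `with_main_gpu` docs only takes effect together with `LlamaSplitMode::None`.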