@@ -256,7 +256,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
     return nullptr;
 }
 
-// CPU: ACCEL -> CPU extra -> GPU host -> CPU
+// CPU: ACCEL -> GPU host -> CPU extra -> CPU
 static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
     buft_list_t buft_list;
 
@@ -272,32 +272,6 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
         }
     }
 
-    bool has_gpu_device = false;
-    for (auto * dev : devices) {
-        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
-            has_gpu_device = true;
-            break;
-        }
-    }
-
-    // add extra buffer types, only if no GPU device is present
-    // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
-    if (!has_gpu_device) {
-        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
-        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
-            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
-        if (ggml_backend_dev_get_extra_bufts_fn) {
-            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
-            while (extra_bufts && *extra_bufts) {
-                buft_list.emplace_back(cpu_dev, *extra_bufts);
-                ++extra_bufts;
-            }
-        }
-    } else {
-        LLAMA_LOG_WARN("%s: disabling extra buffer types (i.e. repacking) since a GPU device is available\n", __func__);
-    }
-
     // add a host buffer type
     // storing the tensors in a host buffer is useful when the processing of large batches
     // is offloaded to a GPU device, since it reduces the time spent on data transfers
@@ -312,6 +286,20 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
         }
     }
 
+    // add extra buffer types
+    // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
+    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+    if (ggml_backend_dev_get_extra_bufts_fn) {
+        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+        while (extra_bufts && *extra_bufts) {
+            buft_list.emplace_back(cpu_dev, *extra_bufts);
+            ++extra_bufts;
+        }
+    }
+
     // add the CPU buffer type
     for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
         ggml_backend_dev_t dev = ggml_backend_dev_get(i);
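
For context, the list built by `make_cpu_buft_list` is a priority list: callers such as `select_weight_buft` walk it front to back and take the first buffer type that works, so the reordering above (GPU host before CPU extra) is the entire behavioral change. Below is a minimal standalone sketch of that selection pattern; all types and names in it are hypothetical stand-ins, not the real ggml/llama.cpp API:

```cpp
// Sketch of a priority-ordered buffer-type list: the first entry the backend
// supports wins, so list order alone encodes the allocation policy.
// buft_kind, select_first_supported, etc. are illustrative stand-ins.
#include <cstdio>
#include <utility>
#include <vector>

enum class buft_kind { accel, gpu_host, cpu_extra, cpu };

using buft_list_t = std::vector<std::pair<const char *, buft_kind>>;

// stand-in for the selection step: walk the list in priority order and
// return the name of the first buffer type the (mock) backend supports
static const char * select_first_supported(const buft_list_t & list,
                                           bool (*supported)(buft_kind)) {
    for (const auto & [name, kind] : list) {
        if (supported(kind)) {
            return name;
        }
    }
    return nullptr;
}

int main() {
    // order after this patch: ACCEL -> GPU host -> CPU extra -> CPU
    buft_list_t list = {
        {"ACCEL",     buft_kind::accel},
        {"GPU host",  buft_kind::gpu_host},
        {"CPU extra", buft_kind::cpu_extra},
        {"CPU",       buft_kind::cpu},
    };
    // pretend no accelerator is present but everything else is usable:
    // with this order, "GPU host" is chosen before "CPU extra"
    auto supported = [](buft_kind k) { return k != buft_kind::accel; };
    std::printf("selected: %s\n", select_first_supported(list, +supported));
    return 0;
}
```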