diff --git a/CMakePresets.json b/CMakePresets.json index b5afeb3c0f2f9..77c654089abc7 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -30,6 +30,8 @@ { "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } }, { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } }, { "name": "vulkan", "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } }, + { "name": "remoting_frontend", "hidden": true, "cacheVariables": { "GGML_REMOTING_FRONTEND": "ON" } }, + { "name": "remoting_backend", "hidden": true, "cacheVariables": { "GGML_REMOTING_BACKEND": "ON" } }, { "name": "x64-windows-llvm", "hidden": true, diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 181f179ed171c..6aff611728ac5 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -226,6 +226,8 @@ option(GGML_WEBGPU_CPU_PROFILE "ggml: enable WebGPU profiling (CPU) option(GGML_WEBGPU_GPU_PROFILE "ggml: enable WebGPU profiling (GPU)" OFF) option(GGML_ZDNN "ggml: use zDNN" OFF) +option(GGML_REMOTING_FRONTEND "ggml: use the API Remoting frontend" OFF) +option(GGML_REMOTING_BACKEND "ggml: use the API Remoting backend" OFF) option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT}) option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF) option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF) @@ -317,6 +319,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-sycl.h include/ggml-vulkan.h include/ggml-webgpu.h + include/ggml-remoting-frontend.h include/gguf.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") diff --git a/ggml/include/ggml-remoting-frontend.h b/ggml/include/ggml-remoting-frontend.h new file mode 100644 index 0000000000000..4c7cd585ea4af --- /dev/null +++ b/ggml/include/ggml-remoting-frontend.h @@ -0,0 +1,16 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_REMOTING_FRONTEND_NAME "RemotingFrontend" + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_remoting_frontend_reg(); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index f30e4ac9020fa..3cdd142a42655 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -406,6 +406,8 @@ ggml_add_backend(WebGPU) ggml_add_backend(zDNN) ggml_add_backend(OpenCL) ggml_add_backend(Hexagon) +ggml_add_backend(RemotingFrontend) +ggml_add_backend(RemotingBackend) foreach (target ggml-base ggml) target_include_directories(${target} PUBLIC $ $) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index e96b5c403dd3f..ea26b3560a593 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -73,6 +73,10 @@ #include "ggml-cann.h" #endif +#ifdef GGML_USE_REMOTINGFRONTEND +#include "ggml-remoting-frontend.h" +#endif + // disable C++17 deprecation warning for std::codecvt_utf8 #if defined(__clang__) # pragma clang diagnostic push @@ -200,6 +204,10 @@ struct ggml_backend_registry { #ifdef GGML_USE_ZDNN register_backend(ggml_backend_zdnn_reg()); #endif +#ifdef GGML_USE_REMOTINGFRONTEND + register_backend(ggml_backend_remoting_frontend_reg()); +#endif + #ifdef GGML_USE_OPENCL register_backend(ggml_backend_opencl_reg()); #endif @@ -604,6 +612,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) { ggml_backend_load_best("rpc", silent, dir_path); ggml_backend_load_best("sycl", silent, dir_path); ggml_backend_load_best("vulkan", silent, dir_path); + ggml_backend_load_best("remoting_frontend", silent, dir_path); 
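+    // assumption: with dynamic backend loading (GGML_BACKEND_DL), ggml_backend_load_best() is
+    // expected to resolve this string to a ggml-remoting_frontend.* library; a statically linked
+    // frontend is instead registered via ggml_backend_remoting_frontend_reg() (see the registry
+    // constructor above). Minimal usage sketch from application code, using only the public API:
+    //   ggml_backend_reg_t remoting = ggml_backend_remoting_frontend_reg();
+    //   ggml_backend_dev_t dev0     = ggml_backend_reg_dev_get(remoting, 0);
+    //   ggml_backend_t     backend  = ggml_backend_dev_init(dev0, /*params=*/nullptr);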
ggml_backend_load_best("opencl", silent, dir_path); ggml_backend_load_best("hexagon", silent, dir_path); ggml_backend_load_best("musa", silent, dir_path); diff --git a/ggml/src/ggml-metal/CMakeLists.txt b/ggml/src/ggml-metal/CMakeLists.txt index 63418fe143083..e41ac8839bfa1 100644 --- a/ggml/src/ggml-metal/CMakeLists.txt +++ b/ggml/src/ggml-metal/CMakeLists.txt @@ -11,6 +11,7 @@ ggml_add_backend_library(ggml-metal ggml-metal-common.cpp ggml-metal-context.m ggml-metal-ops.cpp + ggml-metal-remoting.cpp ) target_link_libraries(ggml-metal PRIVATE diff --git a/ggml/src/ggml-metal/ggml-metal-remoting.cpp b/ggml/src/ggml-metal/ggml-metal-remoting.cpp new file mode 100644 index 0000000000000..420f5fa4e1a06 --- /dev/null +++ b/ggml/src/ggml-metal/ggml-metal-remoting.cpp @@ -0,0 +1,28 @@ +#include "ggml-backend.h" +#include "ggml-backend-impl.h" +#include "ggml-impl.h" + +#include "ggml-metal-device.h" +#include "ggml-metal-impl.h" +#include "ggml-metal-context.h" + +extern "C" { + GGML_BACKEND_API void ggml_backend_metal_get_device_context(ggml_backend_dev_t dev, + bool *has_simdgroup_mm, + bool *has_simdgroup_reduction, + bool *use_bfloat); + + GGML_BACKEND_API void + ggml_backend_metal_get_device_context(ggml_backend_dev_t dev, + bool *has_simdgroup_mm, + bool *has_simdgroup_reduction, + bool *has_bfloat) { + ggml_metal_device_t dev_ctx = (ggml_metal_device_t)dev->context; + + const struct ggml_metal_device_props *props = ggml_metal_device_get_props(dev_ctx); + + *has_bfloat = props->has_bfloat; + *has_simdgroup_reduction = props->has_simdgroup_reduction; + *has_simdgroup_mm = props->has_simdgroup_mm; + } +} diff --git a/ggml/src/ggml-remotingbackend/CMakeLists.txt b/ggml/src/ggml-remotingbackend/CMakeLists.txt new file mode 100644 index 0000000000000..4b796ff42bd4b --- /dev/null +++ b/ggml/src/ggml-remotingbackend/CMakeLists.txt @@ -0,0 +1,21 @@ +cmake_minimum_required(VERSION 3.19) +cmake_policy(SET CMP0114 NEW) + +message(STATUS "Enable API Remoting backend") + +ggml_add_backend_library(ggml-remotingbackend + backend.cpp + backend-dispatched.cpp + backend-dispatched-backend.cpp + backend-dispatched-device.cpp + backend-dispatched-buffer.cpp + backend-dispatched-buffer-type.cpp + backend-dispatched-metal.cpp + backend-utils.cpp + shared/api_remoting.h + shared/apir_backend.h + shared/venus_cs.h + venus_cs_ggml-rpc-back.cpp + ) + +target_compile_options(ggml-remotingbackend PRIVATE -std=c++20) diff --git a/ggml/src/ggml-remotingbackend/backend-convert.h b/ggml/src/ggml-remotingbackend/backend-convert.h new file mode 100644 index 0000000000000..b45c2784160ac --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-convert.h @@ -0,0 +1,15 @@ +#include "shared/apir_backend.h" + +#define BUFFER_TO_HOST_HANDLE(name) ggml_buffer_to_apir_handle(name) + +static inline apir_buffer_host_handle_t +ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) { + // in the backend, the buffer handle is the buffer pointer + return (apir_buffer_host_handle_t) buffer; +} + +static inline apir_buffer_type_host_handle_t +ggml_buffer_type_to_apir_handle(ggml_backend_buffer_type_t buft) { + // in the backend, the buffer handle is the buffer pointer + return (apir_buffer_type_host_handle_t) buft; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp new file mode 100644 index 0000000000000..9c73981c7bad4 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp @@ -0,0 +1,58 @@ +#include +#include 
"backend-internal.h" +#include "backend-dispatched.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +#include "shared/apir_backend.h" + +struct timer_data graph_compute_timer = {0, 0, 0, "compute_timer"}; + +uint32_t +backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(enc); + + start_timer(&graph_compute_timer); + + uint32_t shmem_res_id; + vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); + + const void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); + if (!shmem_data) { + FATAL("Couldn't get the shmem addr from virgl :/"); + } + size_t cgraph_size; + vn_decode_size_t(dec, &cgraph_size); + + struct vn_cs_decoder secondary_dec = vn_cs_new_decoder((const char *) shmem_data, cgraph_size); + + ggml_cgraph *cgraph = vn_decode_ggml_cgraph(&secondary_dec, cgraph_size); + + ggml_status status; +#if APIR_BACKEND_CHECK_SUPPORTS_OP == 1 + for (int idx = 0; idx < cgraph->n_nodes; idx++) { + ggml_tensor *op = ggml_graph_node(cgraph, idx); + if (dev->iface.supports_op(dev, op)) { + continue; + } + ERROR("Graph node %d (%s) not supported by the backend :/", idx, ggml_op_desc(op)); + + status = GGML_STATUS_ABORTED; + vn_encode_ggml_status(enc, &status); + + stop_timer(&graph_compute_timer); + return 0; + } +#endif + status = bck->iface.graph_compute(bck, cgraph); + bck->iface.synchronize(bck); + + vn_encode_ggml_status(enc, &status); + + stop_timer(&graph_compute_timer); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp new file mode 100644 index 0000000000000..f925d1e066fc0 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp @@ -0,0 +1,81 @@ +#include +#include "backend-internal.h" +#include "backend-dispatched.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +uint32_t +backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + ggml_backend_buffer_type_t buft; + buft = vn_decode_ggml_buffer_type(dec); + + const char *string = buft->iface.get_name(buft); + + const size_t string_size = strlen(string) + 1; + vn_encode_array_size(enc, string_size); + vn_encode_char_array(enc, string, string_size); + + return 0; +} + +uint32_t +backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + ggml_backend_buffer_type_t buft; + buft = vn_decode_ggml_buffer_type(dec); + + size_t value = buft->iface.get_alignment(buft); + vn_encode_size_t(enc, &value); + + return 0; +} + +uint32_t +backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + ggml_backend_buffer_type_t buft; + buft = vn_decode_ggml_buffer_type(dec); + + size_t value = buft->iface.get_max_size(buft); + vn_encode_size_t(enc, &value); + + return 0; +} + +uint32_t +backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + ggml_backend_buffer_type_t buft; + buft = vn_decode_ggml_buffer_type(dec); + + bool is_host = buft->iface.is_host(buft); + vn_encode_bool_t(enc, &is_host); + + return 0; +} + +uint32_t +backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + 
UNUSED(ctx); + + ggml_backend_buffer_type_t buft; + buft = vn_decode_ggml_buffer_type(dec); + + size_t size; + vn_decode_size_t(dec, &size); + + ggml_backend_buffer_t buffer; + + buffer = buft->iface.alloc_buffer(buft, size); + + vn_encode_ggml_buffer(enc, buffer); + + if (buffer) { + track_backend_buffer(buffer); + } + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp new file mode 100644 index 0000000000000..fc1ccaef6748d --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp @@ -0,0 +1,143 @@ +#include +#include "backend-internal.h" +#include "backend-dispatched.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +struct timer_data get_tensor_timer = {0, 0, 0, "get_tensor"}; +struct timer_data set_tensor_timer = {0, 0, 0, "set_tensor"}; + +uint32_t +backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + ggml_backend_buffer_t buffer; + buffer = vn_decode_ggml_buffer(dec); + + uintptr_t base = (uintptr_t) buffer->iface.get_base(buffer); + vn_encode_uintptr_t(enc, &base); + + return 0; +} + +uint32_t +backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(enc); + + start_timer(&set_tensor_timer); + + ggml_backend_buffer_t buffer; + buffer = vn_decode_ggml_buffer(dec); + + ggml_tensor *tensor; + // safe to remove the const qualifier here + tensor = (ggml_tensor *) (uintptr_t) vn_decode_ggml_tensor(dec); + + uint32_t shmem_res_id; + vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); + + size_t offset; + vn_decode_size_t(dec, &offset); + + size_t size; + vn_decode_size_t(dec, &size); + + void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); + + if (!shmem_data) { + FATAL("Couldn't get the shmem addr from virgl :/"); + } + +#if 0 + INFO("Calling (%p)->set_tensor(tensor=%p, data=%p, offset=%lu, size=%lu", + buffer, tensor, shmem_data, offset, size); +#endif +#if 0 + void **addr = (void **)(uintptr_t) shmem_data; + for (int i = 0; i <= 10; i++) { + INFO("%s: %p | %llx", __func__, addr, *addr); + addr++; + } + INFO("\n"); +#endif + + buffer->iface.set_tensor(buffer, tensor, shmem_data, offset, size); + + stop_timer(&set_tensor_timer); + + return 0; +} + +uint32_t +backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(enc); + + start_timer(&get_tensor_timer); + + ggml_backend_buffer_t buffer; + buffer = vn_decode_ggml_buffer(dec); + + + const ggml_tensor *tensor; + // safe to remove the const qualifier here + tensor = vn_decode_ggml_tensor(dec); + + uint32_t shmem_res_id; + vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); + + size_t offset; + vn_decode_size_t(dec, &offset); + + size_t size; + vn_decode_size_t(dec, &size); + + void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); + if (!shmem_data) { + FATAL("Couldn't get the shmem addr from virgl :/"); + } + + UNUSED(buffer); + UNUSED(tensor); + buffer->iface.get_tensor(buffer, tensor, shmem_data, offset, size); + + stop_timer(&get_tensor_timer); + + return 0; +} + +uint32_t +backend_buffer_clear(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(enc); + + ggml_backend_buffer_t buffer; + buffer = vn_decode_ggml_buffer(dec); + + uint8_t value; + 
vn_decode_uint8_t(dec, &value);
+
+  buffer->iface.clear(buffer, value);
+
+  return 0;
+}
+
+uint32_t
+backend_buffer_free_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  UNUSED(ctx);
+  UNUSED(enc);
+
+  ggml_backend_buffer_t buffer;
+  buffer = vn_decode_ggml_buffer(dec);
+
+  if (!untrack_backend_buffer(buffer)) {
+    WARNING("%s: unknown buffer %p", __func__, (void *) buffer);
+    return 1;
+  }
+
+  buffer->iface.free_buffer(buffer);
+
+  return 0;
+}
diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp
new file mode 100644
index 0000000000000..473e9d2db7089
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp
@@ -0,0 +1,142 @@
+#include
+#include "backend-internal.h"
+#include "backend-dispatched.h"
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+
+uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  UNUSED(ctx);
+  UNUSED(dec);
+
+  int32_t dev_count = reg->iface.get_device_count(reg);
+  vn_encode_int32_t(enc, &dev_count);
+
+  return 0;
+}
+
+uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  UNUSED(ctx);
+  UNUSED(dec);
+
+  const char *string = dev->iface.get_name(dev);
+
+  const size_t string_size = strlen(string) + 1;
+  vn_encode_array_size(enc, string_size);
+  vn_encode_char_array(enc, string, string_size);
+
+  return 0;
+}
+
+uint32_t
+backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  UNUSED(ctx);
+  UNUSED(dec);
+
+  const char *string = dev->iface.get_description(dev);
+
+  const size_t string_size = strlen(string) + 1;
+  vn_encode_array_size(enc, string_size);
+  vn_encode_char_array(enc, string, string_size);
+
+  return 0;
+}
+
+uint32_t
+backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  UNUSED(ctx);
+  UNUSED(dec);
+
+  uint32_t type = dev->iface.get_type(dev);
+  vn_encode_uint32_t(enc, &type);
+
+  return 0;
+}
+
+uint32_t
+backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  UNUSED(ctx);
+  UNUSED(dec);
+
+  size_t free, total;
+  dev->iface.get_memory(dev, &free, &total);
+
+  vn_encode_size_t(enc, &free);
+  vn_encode_size_t(enc, &total);
+
+  return 0;
+}
+
+uint32_t
+backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  UNUSED(ctx);
+
+  const ggml_tensor *op = vn_decode_ggml_tensor_inplace(dec);
+
+  bool supports_op = dev->iface.supports_op(dev, op);
+
+  vn_encode_bool_t(enc, &supports_op);
+
+  return 0;
+}
+
+uint32_t
+backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  UNUSED(ctx);
+  UNUSED(dec);
+
+  ggml_backend_buffer_type_t bufft = dev->iface.get_buffer_type(dev);
+
+  vn_encode_ggml_buffer_type(enc, bufft);
+
+  return 0;
+}
+
+uint32_t
+backend_device_get_props(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  UNUSED(ctx);
+  UNUSED(dec);
+
+  struct ggml_backend_dev_props props;
+  dev->iface.get_props(dev, &props);
+
+  vn_encode_bool_t(enc, &props.caps.async);
+  vn_encode_bool_t(enc, &props.caps.host_buffer);
+  vn_encode_bool_t(enc, &props.caps.buffer_from_host_ptr);
+
vn_encode_bool_t(enc, &props.caps.events); + + return 0; +} + +uint32_t +backend_device_buffer_from_ptr(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(dec); + + uint32_t shmem_res_id; + vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); + + void *shmem_ptr = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); + if (!shmem_ptr) { + FATAL("Couldn't get the shmem addr from virgl :/"); + } + + size_t size; + vn_decode_size_t(dec, &size); + size_t max_tensor_size; + vn_decode_size_t(dec, &max_tensor_size); + + ggml_backend_buffer_t buffer; + buffer = dev->iface.buffer_from_host_ptr(dev, shmem_ptr, size, max_tensor_size); + + vn_encode_ggml_buffer(enc, buffer); + vn_encode_ggml_buffer_type(enc, buffer->buft); + + if (buffer) { + track_backend_buffer(buffer); + } + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-metal.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-metal.cpp new file mode 100644 index 0000000000000..38f02c07002d0 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-metal.cpp @@ -0,0 +1,41 @@ +#include +#include "backend-internal.h" +#include "backend-dispatched.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +void (*ggml_backend_metal_get_device_context_fct)(ggml_backend_dev_t dev, + bool *has_simdgroup_mm, + bool *has_simdgroup_reduction, + bool *has_bfloat) = NULL; + +uint32_t +backend_metal_get_device_context(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(dec); + + bool has_simdgroup_mm; + bool has_simdgroup_reduction; + bool has_bfloat; + + uint32_t ret = 0; + if (ggml_backend_metal_get_device_context_fct) { + + ggml_backend_metal_get_device_context_fct(dev, + &has_simdgroup_mm, + &has_simdgroup_reduction, + &has_bfloat + ); + } else { + ERROR("ggml_backend_metal_get_device_context not available :/"); + ret = 1; + } + + vn_encode_bool_t(enc, &has_simdgroup_mm); + vn_encode_bool_t(enc, &has_simdgroup_reduction); + vn_encode_bool_t(enc, &has_bfloat); + + return ret; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp new file mode 100644 index 0000000000000..e93f5bccea709 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp @@ -0,0 +1,47 @@ +#include +#include "backend-internal.h" +#include "backend-dispatched.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +#include "ggml-metal.h" + +ggml_backend_reg_t reg = NULL; +ggml_backend_dev_t dev = NULL; +ggml_backend_t bck = NULL; + +long long timer_start = 0; +long long timer_total = 0; +long long timer_count = 0; + +uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_backend_init_fct_p) { + if (reg != NULL) { + FATAL("%s: already initialized :/", __func__); + } + ggml_backend_reg_t (* ggml_backend_reg_fct)(void) = (ggml_backend_reg_t (*)()) ggml_backend_reg_fct_p; + + reg = ggml_backend_reg_fct(); + if (reg == NULL) { + FATAL("%s: backend registration failed :/", __func__); + } + + if (reg->iface.get_device_count(reg)) { + dev = reg->iface.get_device(reg, 0); + } + + ggml_backend_t (* ggml_backend_fct)(int) = (ggml_backend_t (*)(int)) ggml_backend_init_fct_p; + + bck = ggml_backend_fct(0); + if (!bck) { + ERROR("%s: backend initialization failed :/", __func__); + return APIR_BACKEND_INITIALIZE_BACKEND_FAILED; + } + + size_t free, total; + 
dev->iface.get_memory(dev, &free, &total); + INFO("%s: free memory: %ld MB", __func__, (size_t) free/1024/1024); + + return APIR_BACKEND_INITIALIZE_SUCCESS; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h new file mode 100644 index 0000000000000..3420735cae8ec --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -0,0 +1,121 @@ +#pragma once + +#include +#include + +#include + +#include "backend-utils.h" +#include "backend-convert.h" +#include "shared/apir_backend.h" +#include "shared/venus_cs.h" +#include "shared/venus_cs_ggml.h" + +uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_backend_init_fct_p); + +typedef uint32_t (*backend_dispatch_t)(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); + +/* *** */ + +uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); + +/* device */ +uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_get_props(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_buffer_from_ptr(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); + +/* buffer-type */ +uint32_t backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); + +/* buffer */ +uint32_t backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_clear(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_free_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); + +/* backend */ +uint32_t backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); + +/* metal */ +uint32_t backend_metal_get_device_context(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct 
virgl_apir_context *ctx); + +static inline const char *backend_dispatch_command_name(ApirBackendCommandType type) +{ + switch (type) { + /* device */ + case APIR_COMMAND_TYPE_DEVICE_GET_COUNT: return "backend_get_device_count"; + case APIR_COMMAND_TYPE_DEVICE_GET_NAME: return "backend_get_device_name"; + case APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION: return "backend_get_device_description"; + case APIR_COMMAND_TYPE_DEVICE_GET_TYPE: return "backend_device_get_type"; + case APIR_COMMAND_TYPE_DEVICE_GET_MEMORY: return "backend_get_device_memory"; + case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP: return "backend_device_supports_op"; + case APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE: return "backend_get_buffer_type"; + case APIR_COMMAND_TYPE_DEVICE_GET_PROPS: return "backend_get_props"; + case APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR: return "backend_buffer_from_ptr"; + + /* buffer-type */ + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME: return "backend_buffer_type_get_name"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT: return "backend_buffer_type_get_alignment"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE: return "backend_buffer_type_get_max_size"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST: return "backend_buffer_type_is_host"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER: return "backend_buffer_type_alloc_buffer"; + + /* buffer */ + case APIR_COMMAND_TYPE_BUFFER_GET_BASE: return "backend_buffer_get_base"; + case APIR_COMMAND_TYPE_BUFFER_SET_TENSOR: return "backend_buffer_set_tensor"; + case APIR_COMMAND_TYPE_BUFFER_GET_TENSOR: return "backend_buffer_get_tensor"; + case APIR_COMMAND_TYPE_BUFFER_CLEAR: return "backend_buffer_clear"; + case APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER: return "backend_buffer_free_buffer"; + + /* backend */ + case APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE: return "backend_graph_compute"; + + /* metal */ + case APIR_COMMAND_TYPE_METAL_GET_DEVICE_CONTEXT: return "metal_get_device_context"; + + default: return "unknown"; + } +} + +static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = { + /* device */ + [APIR_COMMAND_TYPE_DEVICE_GET_COUNT] = backend_reg_get_device_count, + [APIR_COMMAND_TYPE_DEVICE_GET_NAME] = backend_device_get_name, + [APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION] = backend_device_get_description, + [APIR_COMMAND_TYPE_DEVICE_GET_TYPE] = backend_device_get_type, + [APIR_COMMAND_TYPE_DEVICE_GET_MEMORY] = backend_device_get_memory, + [APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP] = backend_device_supports_op, + [APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE] = backend_device_get_buffer_type, + [APIR_COMMAND_TYPE_DEVICE_GET_PROPS] = backend_device_get_props, + [APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR] = backend_device_buffer_from_ptr, + + /* buffer-type */ + [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME] = backend_buffer_type_get_name, + [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT] = backend_buffer_type_get_alignment, + [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE] = backend_buffer_type_get_max_size, + [APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST] = backend_buffer_type_is_host, + [APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER] = backend_buffer_type_alloc_buffer, + + /* buffer */ + [APIR_COMMAND_TYPE_BUFFER_GET_BASE] = backend_buffer_get_base, + [APIR_COMMAND_TYPE_BUFFER_SET_TENSOR] = backend_buffer_set_tensor, + [APIR_COMMAND_TYPE_BUFFER_GET_TENSOR] = backend_buffer_get_tensor, + [APIR_COMMAND_TYPE_BUFFER_CLEAR] = backend_buffer_clear, + [APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER] = backend_buffer_free_buffer, + + /* backend */ + 
[APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE] = backend_graph_compute, + + /* metal */ + [APIR_COMMAND_TYPE_METAL_GET_DEVICE_CONTEXT] = backend_metal_get_device_context, +}; diff --git a/ggml/src/ggml-remotingbackend/backend-internal.h b/ggml/src/ggml-remotingbackend/backend-internal.h new file mode 100644 index 0000000000000..4d7ef19e8a2c6 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-internal.h @@ -0,0 +1,35 @@ +#include +#include +#include + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include "shared/api_remoting.h" + +extern ggml_backend_reg_t reg; +extern ggml_backend_dev_t dev; +extern ggml_backend_t bck; + +#define NOT_IMPLEMENTED \ + do { \ + static bool first = true; \ + if (first) { \ + printf("\nWARN: ###\nWARN: ### reached unimplemented function %s\nWARN: ###\n\n", __func__); \ + first = false; \ + } \ + } while(0) + +extern "C" { + ApirLoadLibraryReturnCode apir_backend_initialize(); + void apir_backend_deinit(void); + uint32_t apir_backend_dispatcher(uint32_t cmd_type, struct virgl_apir_context *ctx, + char *dec_cur, const char *dec_end, + char *enc_cur, const char *enc_end, + char **enc_cur_after); +} + +extern void (*ggml_backend_metal_get_device_context_fct)(ggml_backend_dev_t dev, + bool *has_simdgroup_mm, + bool *has_simdgroup_reduction, + bool *use_bfloat); diff --git a/ggml/src/ggml-remotingbackend/backend-utils.cpp b/ggml/src/ggml-remotingbackend/backend-utils.cpp new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/ggml/src/ggml-remotingbackend/backend-utils.h b/ggml/src/ggml-remotingbackend/backend-utils.h new file mode 100644 index 0000000000000..cf2898a71eb8a --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-utils.h @@ -0,0 +1,61 @@ +#pragma once + +#include +#include +#include + +#include + +#define UNUSED GGML_UNUSED +#define APIR_LLAMA_CPP_LOG_TO_FILE_ENV "APIR_LLAMA_CPP_LOG_TO_FILE" + +static FILE * +get_log_dest(void) +{ + static FILE *dest = NULL; + if (dest) { + return dest; + } + const char *apir_log_to_file = getenv(APIR_LLAMA_CPP_LOG_TO_FILE_ENV); + if (!apir_log_to_file) { + dest = stderr; + return dest; + } + + dest = fopen(apir_log_to_file, "w"); + + return dest; +} + +#define APIR_VA_PRINT(prefix, format) \ + do { \ + FILE *dest = get_log_dest(); \ + fprintf(dest, prefix); \ + va_list argptr; \ + va_start(argptr, format); \ + vfprintf(dest, format, argptr); \ + fprintf(dest, "\n"); \ + va_end(argptr); \ + fflush(dest); \ + } while (0) + +inline void +INFO(const char *format, ...) { + APIR_VA_PRINT("INFO: ", format); +} + +inline void +WARNING(const char *format, ...) { + APIR_VA_PRINT("WARNING: ", format); +} + +inline void +ERROR(const char *format, ...) { + APIR_VA_PRINT("ERROR: ", format); +} + +[[noreturn]] inline void +FATAL(const char *format, ...) 
{
+  APIR_VA_PRINT("FATAL: ", format);
+  abort();
+}
diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp
new file mode 100644
index 0000000000000..a7695834d5687
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/backend.cpp
@@ -0,0 +1,151 @@
+#include
+#include
+
+#include
+
+#include "backend-utils.h"
+#include "backend-internal.h"
+#include "backend-dispatched.h"
+
+#include "shared/api_remoting.h"
+#include "shared/apir_backend.h"
+#include "shared/venus_cs.h"
+
+#define GGML_BACKEND_LIBRARY_PATH_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_PATH"
+#define GGML_BACKEND_LIBRARY_REG_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_REG"
+#define GGML_BACKEND_LIBRARY_INIT_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_INIT"
+
+#define GGML_BACKEND_LIBRARY_METAL_DEVICE_CONTEXT "ggml_backend_metal_get_device_context"
+
+static void *backend_library_handle = NULL;
+
+extern "C" {
+  void apir_backend_deinit(void) {
+    auto buffers = get_track_backend_buffers();
+    for (const auto& buffer: buffers) {
+      untrack_backend_buffer(buffer);
+      buffer->iface.free_buffer(buffer);
+    }
+
+    if (dev) {
+      size_t free, total;
+      dev->iface.get_memory(dev, &free, &total);
+      INFO("%s: free memory: %ld MB", __func__, (size_t) free/1024/1024);
+    }
+
+    show_timer(&graph_compute_timer);
+    show_timer(&set_tensor_timer);
+    show_timer(&get_tensor_timer);
+    /* *** */
+
+    if (backend_library_handle) {
+      INFO("%s: The GGML backend library was loaded. Unloading it.", __func__);
+      dlclose(backend_library_handle);
+    }
+
+    INFO("%s: bye-bye", __func__);
+  }
+
+  ApirLoadLibraryReturnCode apir_backend_initialize() {
+    const char* dlsym_error;
+
+    const char* library_name = getenv(GGML_BACKEND_LIBRARY_PATH_ENV);
+    const char* library_reg = getenv(GGML_BACKEND_LIBRARY_REG_ENV);
+    const char* library_init = getenv(GGML_BACKEND_LIBRARY_INIT_ENV);
+
+    INFO("%s: loading %s (%s|%s)", __func__, library_name, library_reg, library_init);
+
+    if (!library_name) {
+      ERROR("cannot open the GGML library: env var '%s' not defined\n", GGML_BACKEND_LIBRARY_PATH_ENV);
+
+      return APIR_LOAD_LIBRARY_ENV_VAR_MISSING;
+    }
+
+    backend_library_handle = dlopen(library_name, RTLD_LAZY);
+
+    if (!backend_library_handle) {
+      ERROR("cannot open the GGML library: %s", dlerror());
+
+      return APIR_LOAD_LIBRARY_CANNOT_OPEN;
+    }
+
+    if (!library_reg) {
+      ERROR("cannot register the GGML library: env var '%s' not defined", GGML_BACKEND_LIBRARY_REG_ENV);
+
+      return APIR_LOAD_LIBRARY_ENV_VAR_MISSING;
+    }
+
+    void *ggml_backend_reg_fct = dlsym(backend_library_handle, library_reg);
+    dlsym_error = dlerror();
+    if (dlsym_error) {
+      ERROR("cannot find the GGML backend registration symbol '%s' (from %s): %s",
+            library_reg, GGML_BACKEND_LIBRARY_REG_ENV, dlsym_error);
+
+      return APIR_LOAD_LIBRARY_SYMBOL_MISSING;
+    }
+
+    if (!library_init) {
+      ERROR("cannot initialize the GGML library: env var '%s' not defined", GGML_BACKEND_LIBRARY_INIT_ENV);
+
+      return APIR_LOAD_LIBRARY_ENV_VAR_MISSING;
+    }
+
+    void *ggml_backend_init_fct = dlsym(backend_library_handle, library_init);
+    dlsym_error = dlerror();
+    if (dlsym_error) {
+      ERROR("cannot find the GGML backend init symbol '%s' (from %s): %s",
+            library_init, GGML_BACKEND_LIBRARY_INIT_ENV, dlsym_error);
+
+      return APIR_LOAD_LIBRARY_SYMBOL_MISSING;
+    }
+
+    ggml_backend_metal_get_device_context_fct = (void (*)(ggml_backend_dev_t, bool *, bool *, bool *)) dlsym(backend_library_handle, GGML_BACKEND_LIBRARY_METAL_DEVICE_CONTEXT);
+    dlsym_error = dlerror();
+    if (dlsym_error) {
+      ERROR("cannot find the GGML device context symbol '%s': %s\n",
+
GGML_BACKEND_LIBRARY_METAL_DEVICE_CONTEXT, dlsym_error); + + return APIR_LOAD_LIBRARY_SYMBOL_MISSING; + } + + uint32_t ret = backend_dispatch_initialize(ggml_backend_reg_fct, ggml_backend_init_fct); + + return (ApirLoadLibraryReturnCode) (APIR_LOAD_LIBRARY_INIT_BASE_INDEX + ret); + } + + uint32_t apir_backend_dispatcher(uint32_t cmd_type, struct virgl_apir_context *ctx, + char *dec_cur, const char *dec_end, + char *enc_cur, const char *enc_end, + char **enc_cur_after) { + struct vn_cs_encoder _enc = { + .cur = enc_cur, + .end = enc_end, + }; + struct vn_cs_encoder *enc = &_enc; + + struct vn_cs_decoder _dec = { + .cur = dec_cur, + .end = dec_end, + }; + struct vn_cs_decoder *dec = &_dec; + + + if (cmd_type >= APIR_BACKEND_DISPATCH_TABLE_COUNT) { + ERROR("Received an invalid dispatch index (%d >= %d)\n", + cmd_type, APIR_BACKEND_DISPATCH_TABLE_COUNT); + return APIR_BACKEND_FORWARD_INDEX_INVALID; + } + +#if 0 + static long long count = 0; + INFO("[%lld] Calling %s", count, backend_dispatch_command_name((ApirBackendCommandType) cmd_type)); + count += 1; +#endif + backend_dispatch_t forward_fct = apir_backend_dispatch_table[cmd_type]; + uint32_t ret = forward_fct(enc, dec, ctx); + + *enc_cur_after = enc->cur; + + return ret; + } +} diff --git a/ggml/src/ggml-remotingbackend/shared/api_remoting.h b/ggml/src/ggml-remotingbackend/shared/api_remoting.h new file mode 100644 index 0000000000000..fe9d89bdcf577 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/shared/api_remoting.h @@ -0,0 +1,88 @@ +#pragma once + +/* the rest of this file must match virglrenderer/src/apir-protocol.h */ + +#include + +#define VENUS_COMMAND_TYPE_LENGTH 331 + +#define APIR_PROTOCOL_MAJOR 0 +#define APIR_PROTOCOL_MINOR 1 + +#define APIR_HANDSHAKE_MAGIC 0xab1e + +typedef enum { + APIR_COMMAND_TYPE_HandShake = 0, + APIR_COMMAND_TYPE_LoadLibrary = 1, + APIR_COMMAND_TYPE_Forward = 2, + + APIR_COMMAND_TYPE_LENGTH = 3, +} ApirCommandType; + +typedef uint64_t ApirCommandFlags; + +typedef enum { + APIR_LOAD_LIBRARY_SUCCESS = 0, + APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR = 1, + APIR_LOAD_LIBRARY_ALREADY_LOADED = 2, + APIR_LOAD_LIBRARY_ENV_VAR_MISSING = 3, + APIR_LOAD_LIBRARY_CANNOT_OPEN = 4, + APIR_LOAD_LIBRARY_SYMBOL_MISSING = 5, + APIR_LOAD_LIBRARY_INIT_BASE_INDEX = 6, // anything above this is a APIR backend library initialization return code +} ApirLoadLibraryReturnCode; + +typedef enum { + APIR_FORWARD_SUCCESS = 0, + APIR_FORWARD_NO_DISPATCH_FCT = 1, + APIR_FORWARD_TIMEOUT = 2, + + APIR_FORWARD_BASE_INDEX = 3, // anything above this is a APIR backend library forward return code +} ApirForwardReturnCode; + +__attribute__((unused)) +static inline const char *apir_command_name(ApirCommandType type) +{ + switch (type) { + case APIR_COMMAND_TYPE_HandShake: return "HandShake"; + case APIR_COMMAND_TYPE_LoadLibrary: return "LoadLibrary"; + case APIR_COMMAND_TYPE_Forward: return "Forward"; + default: return "unknown"; + } +} + +__attribute__((unused)) +static const char *apir_load_library_error(ApirLoadLibraryReturnCode code) { +#define APIR_LOAD_LIBRARY_ERROR(code_name) \ + do { \ + if (code == code_name) return #code_name; \ + } while (0) \ + + APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_SUCCESS); + APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR); + APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_ALREADY_LOADED); + APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_ENV_VAR_MISSING); + APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_CANNOT_OPEN); + APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_SYMBOL_MISSING); + 
APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_INIT_BASE_INDEX); + + return "Unknown APIR_COMMAND_TYPE_LoadLibrary error"; + +#undef APIR_LOAD_LIBRARY_ERROR +} + +__attribute__((unused)) +static const char *apir_forward_error(ApirForwardReturnCode code) { +#define APIR_FORWARD_ERROR(code_name) \ + do { \ + if (code == code_name) return #code_name; \ + } while (0) \ + + APIR_FORWARD_ERROR(APIR_FORWARD_SUCCESS); + APIR_FORWARD_ERROR(APIR_FORWARD_NO_DISPATCH_FCT); + APIR_FORWARD_ERROR(APIR_FORWARD_TIMEOUT); + APIR_FORWARD_ERROR(APIR_FORWARD_BASE_INDEX); + + return "Unknown APIR_COMMAND_TYPE_Forward error"; + +#undef APIR_FORWARD_ERROR +} diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h new file mode 100644 index 0000000000000..32553e49ebb58 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -0,0 +1,139 @@ +#pragma once + +#define APIR_BACKEND_INITIALIZE_SUCCESS 0 +#define APIR_BACKEND_INITIALIZE_CANNOT_OPEN_BACKEND_LIBRARY 1 +#define APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY 2 +#define APIR_BACKEND_INITIALIZE_MISSING_BACKEND_SYMBOLS 3 +#define APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS 4 + +#define APIR_BACKEND_INITIALIZE_BACKEND_FAILED 5 +// new entries here need to be added to the apir_backend_initialize_error function below + +#define APIR_BACKEND_FORWARD_INDEX_INVALID 6 + +// 0 is fast, 1 avoids the backend to crash if an unsupported tensor is received +#define APIR_BACKEND_CHECK_SUPPORTS_OP 0 + +typedef uintptr_t apir_buffer_type_host_handle_t; +typedef uintptr_t apir_buffer_host_handle_t; + +typedef struct { + apir_buffer_host_handle_t host_handle; + + struct vn_renderer_shmem *shmem; + apir_buffer_type_host_handle_t buft_host_handle; +} apir_buffer_context_t; + +struct vn_dispatch_context; +struct virgl_apir_context; + +typedef enum ApirBackendCommandType { + /* device */ + APIR_COMMAND_TYPE_DEVICE_GET_COUNT = 0, + APIR_COMMAND_TYPE_DEVICE_GET_NAME = 1, + APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION = 2, + APIR_COMMAND_TYPE_DEVICE_GET_TYPE = 3, + APIR_COMMAND_TYPE_DEVICE_GET_MEMORY = 4, + APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP = 5, + APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE = 6, + APIR_COMMAND_TYPE_DEVICE_GET_PROPS = 7, + APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR = 8, + + /* buffer-type */ + APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME = 9, + APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT = 10, + APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = 11, + APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = 12, + APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER = 13, + + /* buffer */ + APIR_COMMAND_TYPE_BUFFER_GET_BASE = 14, + APIR_COMMAND_TYPE_BUFFER_SET_TENSOR = 15, + APIR_COMMAND_TYPE_BUFFER_GET_TENSOR = 16, + APIR_COMMAND_TYPE_BUFFER_CLEAR = 17, + APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER = 18, + + /* backend */ + APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE = 19, + + /* metal */ + APIR_COMMAND_TYPE_METAL_GET_DEVICE_CONTEXT = 20, + + // last command_type index + 1 + APIR_BACKEND_DISPATCH_TABLE_COUNT = 21, +} ApirBackendCommandType; + + +struct virgl_apir_callbacks { + void *(*get_shmem_ptr)(struct vn_dispatch_context *ctx, uint32_t res_id); +}; + +struct virgl_apir_context { + struct vn_dispatch_context *virgl_ctx; + + struct virgl_apir_callbacks iface; +}; + +struct timer_data { + long long start; + long long total; + long long count; + const char *name; +}; + +extern struct timer_data graph_compute_timer; +extern struct timer_data get_tensor_timer; +extern struct timer_data set_tensor_timer; +extern struct timer_data 
wait_host_reply_timer; +extern struct timer_data get_tensor_from_ptr_timer; +extern struct timer_data set_tensor_from_ptr_timer; + +static inline void start_timer(struct timer_data *timer) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + timer->start = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; +} + +// returns the duration in ns +static inline long long stop_timer(struct timer_data *timer) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + long long timer_end = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; + + long long duration = (timer_end - timer->start); + timer->total += duration; + timer->count += 1; + + return duration; +} + +static inline void show_timer(struct timer_data *timer) { + double ms = timer->total/1000000; + double itl = ms/timer->count; + double speed = 1/itl * 1000; + + if (!timer->total) { + return; + } + + INFO("%15s [%9.0f] ms for %4ld invocations | ITL %2.2f ms | throughput = %4.2f t/s (%4.2f ms/call)", + timer->name, ms, timer->count, itl, speed, ms/timer->count); +} + +static const char *apir_backend_initialize_error(int code) { +#define APIR_BACKEND_INITIALIZE_ERROR(code_name) \ + do { \ + if (code == code_name) return #code_name; \ + } while (0) \ + + APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_SUCCESS); + APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_CANNOT_OPEN_BACKEND_LIBRARY); + APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY); + APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_MISSING_BACKEND_SYMBOLS); + APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS); + APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_BACKEND_FAILED); + + return "Unknown APIR_BACKEND_INITIALIZE error:/"; + +#undef APIR_BACKEND_INITIALIZE_ERROR +} diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h new file mode 100644 index 0000000000000..e67c99a46b5b6 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -0,0 +1,554 @@ +#pragma once + +#include +#include + +// needs UNUSED to be defined +// needs FATAL to be defined + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +struct vn_cs_encoder { + char* cur; + const char *start; + const char* end; +}; + +struct vn_cs_decoder { + const char* cur; + const char* end; +}; + +/* + * new encoder and decoder + */ + +static struct vn_cs_decoder +vn_cs_new_decoder(const char *ptr, size_t size) { + struct vn_cs_decoder dec = { + .cur = ptr, + .end = ptr + size, + }; + + return dec; +} + +static struct vn_cs_encoder +vn_cs_new_encoder(char *ptr, size_t size) { + struct vn_cs_encoder enc = { + .cur = ptr, + .start = ptr, + .end = ptr + size, + }; + + return enc; +} + +/* + * encode peek + */ + +static inline bool +vn_cs_decoder_peek_internal(const struct vn_cs_decoder *dec, + size_t size, + void *val, + size_t val_size) +{ + assert(val_size <= size); + + if (unlikely(size > (size_t) (dec->end - dec->cur))) { + FATAL("READING TOO MUCH FROM THE DECODER :/"); + //vn_cs_decoder_set_fatal(dec); + memset(val, 0, val_size); + return false; + } + + /* we should not rely on the compiler to optimize away memcpy... 
*/ + memcpy(val, dec->cur, val_size); + return true; +} + +static inline void +vn_cs_decoder_peek(const struct vn_cs_decoder *dec, + size_t size, + void *val, + size_t val_size) +{ + vn_cs_decoder_peek_internal(dec, size, val, val_size); +} + +static inline const void * +vn_cs_decoder_use_inplace(struct vn_cs_decoder *dec, + size_t size) +{ + if (unlikely(size > (size_t) (dec->end - dec->cur))) { + FATAL("READING TOO MUCH FROM THE DECODER :/"); + } + const void *addr = dec->cur; + dec->cur += size; + + return addr; +} + +/* + * read/write + */ + +static inline void +vn_cs_decoder_read(struct vn_cs_decoder *dec, + size_t size, + void *val, + size_t val_size) +{ + if (vn_cs_decoder_peek_internal(dec, size, val, val_size)) + dec->cur += size; +} + +static inline char * +vn_cs_encoder_write(struct vn_cs_encoder *enc, + size_t size, + const void *val, + size_t val_size) +{ + assert(val_size <= size); + assert(size <= ((size_t) (enc->end - enc->cur))); + + char *write_addr = enc->cur; + /* we should not rely on the compiler to optimize away memcpy... */ + memcpy(write_addr, val, val_size); + enc->cur += size; + + return write_addr; +} + +/* + * encode/decode + */ + +static inline void +vn_decode(struct vn_cs_decoder *dec, size_t size, void *data, size_t data_size) +{ + assert(size % 4 == 0); + vn_cs_decoder_read(dec, size, data, data_size); +} + +static inline void +vn_encode(struct vn_cs_encoder *enc, size_t size, const void *data, size_t data_size) +{ + assert(size % 4 == 0); + /* TODO check if the generated code is optimal */ + vn_cs_encoder_write(enc, size, data, data_size); +} + +/* + * typed encode/decode + */ + +/* uint8_t */ + +static inline void +vn_encode_uint8_t(struct vn_cs_encoder *enc, const uint8_t *val) +{ + vn_encode(enc, sizeof(int), val, sizeof(*val)); +} + +static inline void +vn_decode_uint8_t(struct vn_cs_decoder *dec, uint8_t *val) +{ + vn_decode(dec, sizeof(int), val, sizeof(*val)); +} + +/* uint64_t */ + +static inline size_t +vn_sizeof_uint64_t(const uint64_t *val) +{ + assert(sizeof(*val) == 8); +#ifdef NDEBUG + UNUSED(val); +#endif + return 8; +} + +static inline void +vn_encode_uint64_t(struct vn_cs_encoder *enc, const uint64_t *val) +{ + vn_encode(enc, 8, val, sizeof(*val)); +} + +static inline void +vn_decode_uint64_t(struct vn_cs_decoder *dec, uint64_t *val) +{ + vn_decode(dec, 8, val, sizeof(*val)); +} + +static inline size_t +vn_sizeof_uint64_t_array(const uint64_t *val, uint32_t count) +{ + assert(sizeof(*val) == 8); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; +} + +static inline void +vn_encode_uint64_t_array(struct vn_cs_encoder *enc, const uint64_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); +} + +static inline void +vn_decode_uint64_t_array(struct vn_cs_decoder *dec, uint64_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); +} + +static inline const uint64_t * +vn_decode_uint64_t_array_inplace(struct vn_cs_decoder *dec, uint32_t count) +{ + return (uint64_t *)(uintptr_t) vn_cs_decoder_use_inplace(dec, count * sizeof(uint64_t)); +} + +/* int32_t */ + +static inline size_t +vn_sizeof_int32_t(const int32_t *val) +{ + assert(sizeof(*val) == 4); +#ifdef NDEBUG + UNUSED(val); +#endif + return 4; +} + +static inline void +vn_encode_int32_t(struct vn_cs_encoder *enc, const int32_t *val) +{ + vn_encode(enc, 4, val, sizeof(*val)); +} + +static inline void 
+vn_decode_int32_t(struct vn_cs_decoder *dec, int32_t *val) +{ + vn_decode(dec, 4, val, sizeof(*val)); +} + +static inline size_t +vn_sizeof_int32_t_array(const int32_t *val, uint32_t count) +{ + assert(sizeof(*val) == 4); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; +} + +static inline void +vn_encode_int32_t_array(struct vn_cs_encoder *enc, const int32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); +} + +static inline void +vn_decode_int32_t_array(struct vn_cs_decoder *dec, int32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); +} + +/* array size (uint64_t) */ + +static inline size_t +vn_sizeof_array_size(uint64_t size) +{ + return vn_sizeof_uint64_t(&size); +} + +static inline void +vn_encode_array_size(struct vn_cs_encoder *enc, uint64_t size) +{ + vn_encode_uint64_t(enc, &size); +} + +static inline uint64_t +vn_decode_array_size(struct vn_cs_decoder *dec, uint64_t expected_size) +{ + uint64_t size; + vn_decode_uint64_t(dec, &size); + if (size != expected_size) { + FATAL("ENCODER IS FULL :/"); + //vn_cs_decoder_set_fatal(dec); + size = 0; + } + return size; +} + +static inline uint64_t +vn_decode_array_size_unchecked(struct vn_cs_decoder *dec) +{ + uint64_t size; + vn_decode_uint64_t(dec, &size); + return size; +} + +static inline uint64_t +vn_peek_array_size(struct vn_cs_decoder *dec) +{ + uint64_t size; + vn_cs_decoder_peek(dec, sizeof(size), &size, sizeof(size)); + return size; +} + +/* non-array pointer */ + +static inline size_t +vn_sizeof_simple_pointer(const void *val) +{ + return vn_sizeof_array_size(val ? 1 : 0); +} + +static inline bool +vn_encode_simple_pointer(struct vn_cs_encoder *enc, const void *val) +{ + vn_encode_array_size(enc, val ? 
1 : 0); + return val; +} + +static inline bool +vn_decode_simple_pointer(struct vn_cs_decoder *dec) +{ + return vn_decode_array_size_unchecked(dec); +} + +/* uint32_t */ + +static inline size_t +vn_sizeof_uint32_t(const uint32_t *val) +{ + assert(sizeof(*val) == 4); +#ifdef NDEBUG + UNUSED(val); +#endif + return 4; +} + +static inline void +vn_encode_uint32_t(struct vn_cs_encoder *enc, const uint32_t *val) +{ + vn_encode(enc, 4, val, sizeof(*val)); +} + +static inline void +vn_decode_uint32_t(struct vn_cs_decoder *dec, uint32_t *val) +{ + vn_decode(dec, 4, val, sizeof(*val)); +} + +static inline size_t +vn_sizeof_uint32_t_array(const uint32_t *val, uint32_t count) +{ + assert(sizeof(*val) == 4); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; +} + +static inline void +vn_encode_uint32_t_array(struct vn_cs_encoder *enc, const uint32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); +} + +static inline void +vn_decode_uint32_t_array(struct vn_cs_decoder *dec, uint32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); +} + +/* size_t */ + +static inline size_t +vn_sizeof_size_t(const size_t *val) +{ + return sizeof(*val); +} + +static inline void +vn_encode_size_t(struct vn_cs_encoder *enc, const size_t *val) +{ + const uint64_t tmp = *val; + vn_encode_uint64_t(enc, &tmp); +} + +static inline void +vn_decode_size_t(struct vn_cs_decoder *dec, size_t *val) +{ + uint64_t tmp; + vn_decode_uint64_t(dec, &tmp); + *val = tmp; +} + +static inline size_t +vn_sizeof_size_t_array(const size_t *val, uint32_t count) +{ + return vn_sizeof_size_t(val) * count; +} + +static inline void +vn_encode_size_t_array(struct vn_cs_encoder *enc, const size_t *val, uint32_t count) +{ + if (sizeof(size_t) == sizeof(uint64_t)) { + vn_encode_uint64_t_array(enc, (const uint64_t *)val, count); + } else { + for (uint32_t i = 0; i < count; i++) + vn_encode_size_t(enc, &val[i]); + } +} + +static inline void +vn_decode_size_t_array(struct vn_cs_decoder *dec, size_t *val, uint32_t count) +{ + if (sizeof(size_t) == sizeof(uint64_t)) { + vn_decode_uint64_t_array(dec, (uint64_t *)val, count); + } else { + for (uint32_t i = 0; i < count; i++) + vn_decode_size_t(dec, &val[i]); + } +} + +/* opaque blob */ + +static inline size_t +vn_sizeof_blob_array(const void *val, size_t size) +{ + UNUSED(val); + return (size + 3) & ~3; +} + +static inline void +vn_encode_blob_array(struct vn_cs_encoder *enc, const void *val, size_t size) +{ + vn_encode(enc, (size + 3) & ~3, val, size); +} + +static inline void +vn_decode_blob_array(struct vn_cs_decoder *dec, void *val, size_t size) +{ + vn_decode(dec, (size + 3) & ~3, val, size); +} + +/* string */ + +static inline size_t +vn_sizeof_char_array(const char *val, size_t size) +{ + return vn_sizeof_blob_array(val, size); +} + +static inline void +vn_encode_char_array(struct vn_cs_encoder *enc, const char *val, size_t size) +{ + assert(size && strlen(val) < size); + vn_encode_blob_array(enc, val, size); +} + +static inline void +vn_decode_char_array(struct vn_cs_decoder *dec, char *val, size_t size) +{ + vn_decode_blob_array(dec, val, size); + if (size) + val[size - 1] = '\0'; + else { + //vn_cs_decoder_set_fatal(dec); + FATAL("Couldn't decode the blog array"); + } +} + +/* (temp) buffer allocation */ + +static inline void * +vkr_cs_decoder_alloc_array(struct vkr_cs_decoder *dec, size_t size, size_t count) +{ 
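+  // note: the size*count product is checked for overflow before the allocation; the caller
+  // presumably owns the returned array and must free() it, since the decoder keeps no pool.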
+ UNUSED(dec); + size_t alloc_size; + if (unlikely(__builtin_mul_overflow(size, count, &alloc_size))) { + FATAL("overflow in array allocation of %zu * %zu bytes", size, count); + return NULL; + } + + return malloc(alloc_size); +} + +static inline void * +vn_cs_decoder_alloc_array(struct vn_cs_decoder *dec, size_t size, size_t count) +{ + struct vkr_cs_decoder *d = (struct vkr_cs_decoder *)dec; + return vkr_cs_decoder_alloc_array(d, size, count); +} + +/* bool */ + +static inline void +vn_encode_bool_t(struct vn_cs_encoder *enc, const bool *val) +{ + vn_encode(enc, sizeof(int), val, sizeof(bool)); +} + +static inline void +vn_decode_bool_t(struct vn_cs_decoder *dec, bool *val) +{ + vn_decode(dec, sizeof(int), val, sizeof(bool)); +} + +/* apir_buffer_type_host_handle_t */ + +static inline void +vn_encode_apir_buffer_type_host_handle_t(struct vn_cs_encoder *enc, const apir_buffer_type_host_handle_t *val) +{ + vn_encode(enc, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t)); +} + +static inline void +vn_decode_apir_buffer_type_host_handle_t(struct vn_cs_decoder *dec, apir_buffer_type_host_handle_t *val) +{ + vn_decode(dec, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t)); +} + +/* apir_buffer_host_handle_t */ + +static inline void +vn_encode_apir_buffer_host_handle_t(struct vn_cs_encoder *enc, const apir_buffer_host_handle_t *val) +{ + vn_encode(enc, sizeof(apir_buffer_host_handle_t), val, sizeof(apir_buffer_host_handle_t)); +} + +static inline void +vn_decode_apir_buffer_host_handle_t(struct vn_cs_decoder *dec, apir_buffer_host_handle_t *val) +{ + vn_decode(dec, sizeof(apir_buffer_host_handle_t), val, sizeof(apir_buffer_host_handle_t)); +} + +/* uintptr_t */ + +static inline void +vn_encode_uintptr_t(struct vn_cs_encoder *enc, const uintptr_t *val) +{ + vn_encode(enc, sizeof(*val), val, sizeof(*val)); +} + +static inline void +vn_decode_uintptr_t(struct vn_cs_decoder *dec, uintptr_t *val) +{ + vn_decode(dec, sizeof(*val), val, sizeof(*val)); +} diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.cpp b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.cpp new file mode 100644 index 0000000000000..196cd70958745 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.cpp @@ -0,0 +1,167 @@ +#include +#include +#include +#include + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "venus_cs_ggml-rpc.h" + +std::unordered_set backend_buffers; + +void +track_backend_buffer(ggml_backend_buffer_t buffer) { + backend_buffers.insert(buffer); +} + +rpc_tensor +serialize_tensor(const ggml_tensor * tensor) { + rpc_tensor result; + result.id = reinterpret_cast(tensor); + result.type = tensor->type; + if (tensor->buffer) { + ggml_backend_buffer_t buffer = tensor->buffer; + + result.buffer = BUFFER_TO_HANDLE(buffer); + } else { + result.buffer = 0; + } + for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) { + result.ne[i] = tensor->ne[i]; + result.nb[i] = tensor->nb[i]; + } + result.op = tensor->op; + for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) { + result.op_params[i] = tensor->op_params[i]; + } + result.flags = tensor->flags; + for (uint32_t i = 0; i < GGML_MAX_SRC; i++) { + result.src[i] = reinterpret_cast(tensor->src[i]); + } + result.view_src = reinterpret_cast(tensor->view_src); + result.view_offs = tensor->view_offs; + result.data = reinterpret_cast(tensor->data); + snprintf(result.name, GGML_MAX_NAME, "%s", tensor->name); + return result; +} + +ggml_tensor * 
+deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) { + ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type, + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); + for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) { + result->nb[i] = tensor->nb[i]; + } + result->buffer = reinterpret_cast(tensor->buffer); + if (result->buffer && backend_buffers.find(result->buffer) == backend_buffers.end()) { + printf("WARNING: BUFFER NOT FOUND | %p\n", (void *)result->buffer); + result->buffer = nullptr; + } + + if (result->buffer) { + // require that the tensor data does not go beyond the buffer end + uint64_t tensor_size = (uint64_t) ggml_nbytes(result); + uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer); + uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer); + GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow + GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size); + } + + result->op = (ggml_op) tensor->op; + for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) { + result->op_params[i] = tensor->op_params[i]; + } + result->flags = tensor->flags; + result->data = reinterpret_cast(tensor->data); + ggml_set_name(result, tensor->name); + return result; +} + +void +add_tensor(ggml_tensor * tensor, std::vector & tensors, std::unordered_set & visited) { + if (tensor == nullptr) { + return; + } + if (visited.find(tensor) != visited.end()) { + return; + } + visited.insert(tensor); + for (int i = 0; i < GGML_MAX_SRC; i++) { + add_tensor(tensor->src[i], tensors, visited); + } + add_tensor(tensor->view_src, tensors, visited); + tensors.push_back(serialize_tensor(tensor)); +} + +void +serialize_graph(const ggml_cgraph * cgraph, std::vector & output) { + uint32_t n_nodes = cgraph->n_nodes; + std::vector tensors; + std::unordered_set visited; + for (uint32_t i = 0; i < n_nodes; i++) { + add_tensor(cgraph->nodes[i], tensors, visited); + } + // serialization format: + // | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) | + uint32_t n_tensors = tensors.size(); + int output_size = sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(rpc_tensor); + output.resize(output_size, 0); + memcpy(output.data(), &n_nodes, sizeof(n_nodes)); + for (uint32_t i = 0; i < n_nodes; i++) { + memcpy(output.data() + sizeof(n_nodes) + i * sizeof(uint64_t), &cgraph->nodes[i], sizeof(uint64_t)); + } + uint32_t * out_ntensors = (uint32_t *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t)); + *out_ntensors = n_tensors; + rpc_tensor * out_tensors = (rpc_tensor *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t)); + memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor)); +} + +ggml_tensor * +create_node(uint64_t id, + struct ggml_context * ctx, + const std::unordered_map & tensor_ptrs, + std::unordered_map & tensor_map) { + if (id == 0) { + return nullptr; + } + if (tensor_map.find(id) != tensor_map.end()) { + return tensor_map[id]; + } + const rpc_tensor * tensor = tensor_ptrs.at(id); + struct ggml_tensor * result = deserialize_tensor(ctx, tensor); + if (result == nullptr) { + return nullptr; + } + tensor_map[id] = result; + for (int i = 0; i < GGML_MAX_SRC; i++) { + result->src[i] = create_node(tensor->src[i], ctx, tensor_ptrs, tensor_map); + } + result->view_src = create_node(tensor->view_src, 
ctx, tensor_ptrs, tensor_map); + result->view_offs = tensor->view_offs; + return result; +} + +ggml_cgraph * +deserialize_graph(uint32_t n_nodes, uint32_t n_tensors, const rpc_tensor * tensors, const uint64_t * nodes) { + size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false); + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx = ggml_init(params); + struct ggml_cgraph * graph = ggml_new_graph_custom(ctx, n_nodes, false); + graph->n_nodes = n_nodes; + std::unordered_map tensor_ptrs; + for (uint32_t i = 0; i < n_tensors; i++) { + tensor_ptrs[tensors[i].id] = &tensors[i]; + } + std::unordered_map tensor_map; + for (uint32_t i = 0; i < n_nodes; i++) { + int64_t id; + memcpy(&id, &nodes[i], sizeof(id)); + graph->nodes[i] = create_node(id, ctx, tensor_ptrs, tensor_map); + } + + return graph; +} diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h new file mode 100644 index 0000000000000..96402287af7fc --- /dev/null +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h @@ -0,0 +1,45 @@ +#include +#include +#include + +// ggml_tensor is serialized into rpc_tensor +struct rpc_tensor { + uint64_t id; + uint32_t type; + uint64_t buffer; + uint32_t ne[GGML_MAX_DIMS]; + uint32_t nb[GGML_MAX_DIMS]; + uint32_t op; + int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; + int32_t flags; + uint64_t src[GGML_MAX_SRC]; + uint64_t view_src; + uint64_t view_offs; + uint64_t data; + char name[GGML_MAX_NAME]; + + char padding[4]; +}; + +/* frontend */ + +rpc_tensor serialize_tensor(const ggml_tensor * tensor); + +void serialize_graph(const ggml_cgraph * cgraph, std::vector & output); + +/* backend */ + +void track_backend_buffer(ggml_backend_buffer_t buffer); +bool untrack_backend_buffer(ggml_backend_buffer_t buffer); +std::unordered_set get_track_backend_buffers(); + +void add_tensor(ggml_tensor * tensor, std::vector & tensors, std::unordered_set & visited); + +ggml_tensor *deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor); + +ggml_tensor *create_node(uint64_t id, + struct ggml_context * ctx, + const std::unordered_map & tensor_ptrs, + std::unordered_map & tensor_map); + +ggml_cgraph *deserialize_graph(uint32_t n_nodes, uint32_t n_tensors, const rpc_tensor * tensors, const uint64_t * nodes); diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h new file mode 100644 index 0000000000000..71c9b3f3ed820 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -0,0 +1,236 @@ +// needs the ggml-backend-impl.h definition +// needs venus_cs.h definition + +#include "venus_cs_ggml-rpc.h" + +// needs +// ggml_buffer_to_apir_host_handle(ggml_backend_buffer_t buffer); + +static inline void +vn_encode_ggml_buffer_host_handle(struct vn_cs_encoder *enc, const apir_buffer_host_handle_t *handle); + +static inline ggml_backend_buffer_t +vn_decode_ggml_buffer(struct vn_cs_decoder *dec); + +/* rpc_tensor */ + +static inline void +vn_encode_rcp_tensor(struct vn_cs_encoder *enc, const rpc_tensor *rpc_tensor) { + size_t rpc_tensor_size = sizeof(*rpc_tensor); + vn_encode(enc, rpc_tensor_size, rpc_tensor, rpc_tensor_size); +} + +static inline rpc_tensor * +vn_decode_rpc_tensor_inplace(struct vn_cs_decoder *dec) { + size_t rpc_tensor_size = sizeof(rpc_tensor); + + return (rpc_tensor *)(uintptr_t) 
vn_cs_decoder_use_inplace(dec, rpc_tensor_size); +} + +static inline rpc_tensor * +vn_decode_rpc_tensor_array_inplace(struct vn_cs_decoder *dec, uint32_t n_tensors) { + size_t rpc_tensor_size = sizeof(rpc_tensor) * n_tensors; + + return (rpc_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, rpc_tensor_size); +} + +/* ggml_tensor */ + +static inline void +vn_encode_ggml_tensor(struct vn_cs_encoder *enc, const ggml_tensor *tensor) { + rpc_tensor serialized = serialize_tensor(tensor); + + vn_encode_rcp_tensor(enc, &serialized); +} + +static inline const ggml_tensor * +vn_decode_ggml_tensor(struct vn_cs_decoder *dec) { + const rpc_tensor *rpc_tensor = vn_decode_rpc_tensor_inplace(dec); + struct ggml_init_params params { + /*.mem_size =*/ ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx = ggml_init(params); + + const ggml_tensor *tensor = deserialize_tensor(ctx, rpc_tensor); + + return tensor; +} + +/* *** ggml_backend_buffer_type_t *** */ + +// ggml_backend_buffer_type_t is a POINTER (to a struct). +// Only the host pointer is shared between the host and guest. +// The guest stores it in `buft->context`. +// The host simply writes the pointer address in the buffer variable. + + +static inline void +vn_encode_ggml_buffer_type(struct vn_cs_encoder *enc, ggml_backend_buffer_type_t buft) { + apir_buffer_type_host_handle_t handle = ggml_buffer_type_to_apir_handle(buft); + vn_cs_encoder_write(enc, sizeof(handle), &handle, sizeof(handle)); +} + +static inline ggml_backend_buffer_type_t +vn_decode_ggml_buffer_type(struct vn_cs_decoder *dec) { + apir_buffer_type_host_handle_t handle; + + vn_cs_decoder_read(dec, sizeof(handle), &handle, sizeof(handle)); + + return (ggml_backend_buffer_type_t) handle; +} + +static inline apir_buffer_type_host_handle_t +vn_decode_apir_buffer_type_host_handle(struct vn_cs_decoder *dec) { + apir_buffer_type_host_handle_t handle; + + vn_cs_decoder_read(dec, sizeof(handle), &handle, sizeof(handle)); + + return handle; +} + +/* *** ggml_backend_type_t *** */ + +// ggml_backend_buffer_t is a POINTER. 
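+// For illustration, the handle round-trip described above works like this
+// (a sketch of the existing flow, no new API assumed):
+//   guest: vn_encode_ggml_buffer_type(enc, buft)  writes buft->context, i.e. the
+//          host-side pointer, as an apir_buffer_type_host_handle_t
+//   host:  vn_decode_ggml_buffer_type(dec)        reads the handle back and casts
+//          it to ggml_backend_buffer_type_t
+// The guest only carries the value; it never dereferences it.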
+// same logic as for ggml_backend_buffer_type_t + +static inline void +vn_encode_ggml_buffer(struct vn_cs_encoder *enc, const ggml_backend_buffer_t buffer) { + apir_buffer_host_handle_t handle = BUFFER_TO_HOST_HANDLE(buffer); + vn_cs_encoder_write(enc, sizeof(handle), &handle, sizeof(handle)); +} + +static inline ggml_backend_buffer_t +vn_decode_ggml_buffer(struct vn_cs_decoder *dec) { + ggml_backend_buffer_t buffer; + size_t buffer_ptr_size = sizeof(buffer); + + vn_cs_decoder_read(dec, buffer_ptr_size, &buffer, buffer_ptr_size); + + return buffer; +} + +/* enum ggml_status */ + +static inline void +vn_encode_ggml_status(struct vn_cs_encoder *enc, const enum ggml_status *status) { + vn_cs_encoder_write(enc, sizeof(*status), status, sizeof(*status)); +} + +static inline void +vn_decode_ggml_status(struct vn_cs_decoder *dec, enum ggml_status *status) { + vn_cs_decoder_read(dec, sizeof(*status), status, sizeof(*status)); +} + +/* vn_renderer_shmem */ + +static inline void +vn_encode_virtgpu_shmem_res_id(struct vn_cs_encoder *enc, uint32_t shmem_res_id) { + vn_encode_uint32_t(enc, &shmem_res_id); +} + +static inline void +vn_decode_virtgpu_shmem_res_id(struct vn_cs_decoder *dec, uint32_t *shmem_res_id) { + vn_decode_uint32_t(dec, shmem_res_id); +} + +/* ggml_cgraph */ + +static inline size_t +vn_serialize_ggml_cgraph(ggml_cgraph *cgraph, std::vector & cgraph_data) { + serialize_graph(cgraph, cgraph_data); + + return cgraph_data.size(); +} + +static inline void +vn_encode_cgraph_data(struct vn_cs_encoder *enc, std::vector & cgraph_data) { + size_t cgraph_size = cgraph_data.size(); + + vn_encode(enc, cgraph_size, cgraph_data.data(), cgraph_size); +} + +static inline ggml_cgraph * +vn_decode_ggml_cgraph(struct vn_cs_decoder *dec, size_t cgraph_size) { + UNUSED(cgraph_size); + + uint32_t n_nodes; + vn_decode_uint32_t(dec, &n_nodes); + const uint64_t * nodes = vn_decode_uint64_t_array_inplace(dec, n_nodes); + + uint32_t n_tensors; + vn_decode_uint32_t(dec, &n_tensors); + const rpc_tensor *tensors = vn_decode_rpc_tensor_array_inplace(dec, n_tensors); + + return deserialize_graph(n_nodes, n_tensors, tensors, nodes); +} + +static inline void +vn_encode_ggml_buffer_handle(struct vn_cs_encoder *enc, const apir_buffer_host_handle_t *handle) { + vn_cs_encoder_write(enc, sizeof(*handle), &handle, sizeof(*handle)); +} + +static inline void +vn_encode_ggml_tensor_inline(struct vn_cs_encoder *enc, const ggml_tensor *tensor) { + size_t tensor_size = sizeof(*tensor); + + if (tensor->extra) { + FATAL("Cannot pass tensors with extra"); + } + + if (tensor->src[0] && tensor->buffer) { + static int first = 1; + if (first) { + // not sure if the buffer needs to be updated inside the src tensors or not + WARNING("Cannot pass tensors with src and buffer"); + first = 0; + } + } + + vn_cs_encoder_write(enc, tensor_size, tensor, tensor_size); + + // tensor->data is a pointer inside the device buffer. No need to touch it + // tensor->buffer is a pointer to a buffer. Encoding the buffer handle in sequence. + // (could also make a copy of the tensor, and update locally.) 
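+  // For illustration, the wire layout produced here (and consumed in the same
+  // order by vn_decode_ggml_tensor_inplace on the host side) is, schematically:
+  //
+  //   | ggml_tensor struct (raw copy)                       |
+  //   | apir_buffer_host_handle_t     (only if buffer)      |
+  //   | ggml_tensor view_src copy     (only if view_src)    |
+  //   | ggml_tensor src[i] copies     (until src[i] == NULL)|
+  //
+  // The decoder replaces tensor->buffer with the decoded handle and patches
+  // view_src / src[i] to point at the in-place copies in shared memory.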
+ + if (tensor->buffer) { + apir_buffer_host_handle_t buffer_handle = ggml_buffer_to_apir_handle(tensor->buffer); + vn_encode_ggml_buffer_handle(enc, &buffer_handle); + } + + if (tensor->view_src) { + vn_cs_encoder_write(enc, tensor_size, tensor->view_src, tensor_size); + } + + for (int i = 0; tensor->src[i]; i++) { + const ggml_tensor *tensor_src = tensor->src[i]; + vn_cs_encoder_write(enc, tensor_size, tensor_src, tensor_size); + } +} + +static inline const ggml_tensor * +vn_decode_ggml_tensor_inplace(struct vn_cs_decoder *dec) { + + // it safe to remove the `const` qualifier here, we *do* want to + // modify the shared memory data to fix the `src` pointers. + ggml_tensor *tensor = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); + + // tensor->data is a pointer inside the device buffer. No need to touch it + // tensor->buffer is a pointer to a buffer. Decode the buffer handle encoded in sequence. + if (tensor->buffer) { + tensor->buffer = vn_decode_ggml_buffer(dec); + } + + if (tensor->view_src) { + ggml_tensor *tensor_view_src = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); + tensor->view_src = tensor_view_src; + } + + for (int i = 0; tensor->src[i]; i++) { + ggml_tensor *tensor_src = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); + tensor->src[i] = tensor_src; // overwrite op->src[i] pointer with the actual location of the src tensor + } + + return tensor; +} diff --git a/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp b/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp new file mode 100644 index 0000000000000..30ae511aa95e8 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp @@ -0,0 +1,118 @@ +#include +#include +#include +#include + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "shared/venus_cs_ggml-rpc.h" + +std::unordered_set backend_buffers; + +void +track_backend_buffer(ggml_backend_buffer_t buffer) { + backend_buffers.insert(buffer); +} + +bool +untrack_backend_buffer(ggml_backend_buffer_t buffer) { + auto it = backend_buffers.find(buffer); + if (it == backend_buffers.end()) { + return false; + } + + backend_buffers.erase(it); + return true; +} + +std::unordered_set +get_track_backend_buffers() { + return backend_buffers; +} + +ggml_tensor * +deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) { + ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type, + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); + for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) { + result->nb[i] = tensor->nb[i]; + } + result->buffer = reinterpret_cast(tensor->buffer); + if (result->buffer && backend_buffers.find(result->buffer) == backend_buffers.end()) { + printf("WARNING: BUFFER NOT FOUND | %p\n", (void *)result->buffer); + result->buffer = nullptr; + } + + uint64_t tensor_data = tensor->data; + if (result->buffer) { + // require that the tensor data does not go beyond the buffer end + uint64_t tensor_size = (uint64_t) ggml_nbytes(result); + uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer); + uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer); + + // tensor->data is serialized as an offset to the buffer base address + tensor_data += buffer_start; + + GGML_ASSERT(tensor_data + tensor_size >= tensor_data); // check for overflow + GGML_ASSERT(tensor_data >= buffer_start && tensor_data + tensor_size <= buffer_start + buffer_size); + } + + 
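+  // Worked example (illustrative numbers only): with buffer_start = 0x10000000,
+  // buffer_size = 64 MiB, a serialized data offset of 0x2000 and
+  // ggml_nbytes(result) == 4096, tensor_data becomes 0x10002000 and the asserts
+  // above require 0x10002000 + 4096 <= 0x10000000 + 64 MiB, i.e. the tensor
+  // must lie entirely within the buffer it claims to belong to.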
result->op = (ggml_op) tensor->op; + for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) { + result->op_params[i] = tensor->op_params[i]; + } + result->flags = tensor->flags; + result->data = reinterpret_cast(tensor_data); + ggml_set_name(result, tensor->name); + return result; +} + +ggml_tensor * +create_node(uint64_t id, + struct ggml_context * ctx, + const std::unordered_map & tensor_ptrs, + std::unordered_map & tensor_map) { + if (id == 0) { + return nullptr; + } + if (tensor_map.find(id) != tensor_map.end()) { + return tensor_map[id]; + } + const rpc_tensor * tensor = tensor_ptrs.at(id); + struct ggml_tensor * result = deserialize_tensor(ctx, tensor); + if (result == nullptr) { + return nullptr; + } + tensor_map[id] = result; + for (int i = 0; i < GGML_MAX_SRC; i++) { + result->src[i] = create_node(tensor->src[i], ctx, tensor_ptrs, tensor_map); + } + result->view_src = create_node(tensor->view_src, ctx, tensor_ptrs, tensor_map); + result->view_offs = tensor->view_offs; + return result; +} + +ggml_cgraph * +deserialize_graph(uint32_t n_nodes, uint32_t n_tensors, const rpc_tensor * tensors, const uint64_t * nodes) { + size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false); + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx = ggml_init(params); + struct ggml_cgraph * graph = ggml_new_graph_custom(ctx, n_nodes, false); + graph->n_nodes = n_nodes; + std::unordered_map tensor_ptrs; + for (uint32_t i = 0; i < n_tensors; i++) { + tensor_ptrs[tensors[i].id] = &tensors[i]; + } + std::unordered_map tensor_map; + for (uint32_t i = 0; i < n_nodes; i++) { + int64_t id; + memcpy(&id, &nodes[i], sizeof(id)); + graph->nodes[i] = create_node(id, ctx, tensor_ptrs, tensor_map); + } + + return graph; +} diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt new file mode 100644 index 0000000000000..430d17ad9158b --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -0,0 +1,34 @@ +cmake_minimum_required(VERSION 3.19) +cmake_policy(SET CMP0114 NEW) + +message(STATUS "Enable API Remoting frontend") + +ggml_add_backend_library(ggml-remotingfrontend + ggml-backend-buffer.cpp + ggml-backend.cpp + ggml-backend-device.cpp + ggml-backend-reg.cpp + ggml-backend-buffer-type.cpp + ggml-backend-host-buffer-type.cpp + ggml-metal-remoting.cpp + virtgpu.cpp + virtgpu-shm.cpp + virtgpu-utils.cpp + virtgpu-forward-device.cpp + virtgpu-forward-buffer-type.cpp + virtgpu-forward-buffer.cpp + virtgpu-forward-backend.cpp + virtgpu-forward-metal.cpp + virtgpu-forward-impl.h + ../../include/ggml-remoting-frontend.h + venus_cs_ggml-rpc-front.cpp + ) + +# dnf install -y libdrm-devel +target_link_libraries(ggml-remotingfrontend PUBLIC drm) +target_include_directories(ggml-remotingfrontend PUBLIC /usr/include/libdrm/) +target_include_directories(ggml-remotingfrontend PUBLIC ./include) + +target_include_directories(ggml-remotingfrontend PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) + +target_compile_options(ggml-remotingfrontend PRIVATE -std=c++20) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp new file mode 100644 index 0000000000000..b655b8018f80d --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -0,0 +1,98 @@ +#include "ggml-remoting.h" + +#define BUFT_TO_GPU(name) \ + ((struct 
ggml_backend_remoting_device_context *) (name)->device->context)->gpu + +static ggml_backend_buffer_t +ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + IMPLEMENTED_ONCE; + struct virtgpu *gpu = BUFT_TO_GPU(buft); + + struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) malloc(sizeof(*context)); + if (!context) { + FATAL("Couldn't allocate the buffer context ..."); + } + + context->gpu = gpu; + + const int USE_FROM_PTR = true; + + if (USE_FROM_PTR) { + context->apir_context = apir_device_buffer_from_ptr(gpu, size, size); + context->base = context->apir_context.shmem->mmap_ptr; + context->is_from_ptr = true; + } else { + context->apir_context = apir_buffer_type_alloc_buffer(gpu, buft, size); + context->is_from_ptr = false; + context->base = NULL; + } + context->is_host_buffer = false; + + ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); + + return buffer; +} + +static const char * +ggml_backend_remoting_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + IMPLEMENTED_ONCE; + + struct virtgpu *gpu = BUFT_TO_GPU(buft); + + return apir_buffer_type_get_name(gpu, buft); +} + +static size_t +ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + IMPLEMENTED_ONCE; + struct virtgpu *gpu = BUFT_TO_GPU(buft); + + static size_t align = 0; + + if (align == 0) { + align = apir_buffer_type_get_alignment(gpu, buft); + } + + return align; +} + +static size_t +ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + IMPLEMENTED_ONCE; + struct virtgpu *gpu = BUFT_TO_GPU(buft); + + static size_t max_size = 0; + if (max_size == 0) { + max_size = apir_buffer_type_get_max_size(gpu, buft); + } + + return max_size; +} + +static bool +ggml_backend_remoting_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + IMPLEMENTED; + struct virtgpu *gpu = BUFT_TO_GPU(buft); + + return apir_buffer_type_is_host(gpu, buft); +} + +const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { + /* .get_name = */ ggml_backend_remoting_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_remoting_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .is_host = */ NULL, +}; + +const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_from_ptr_type_interface = { + /* .get_name = */ ggml_backend_remoting_buffer_type_get_name, + /* .alloc_buffer = */ NULL, + /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .is_host = */ NULL, +}; + +/****************************************************************************************/ diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp new file mode 100644 index 0000000000000..e720efcf47c69 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -0,0 +1,167 @@ +#include "ggml-remoting.h" + +#define BUFFER_TO_GPU(name) \ + ((struct ggml_backend_remoting_buffer_context *) (name)->context)->gpu + +struct timer_data get_tensor_timer = {0, 0, 0, "get_tensor"}; +struct timer_data set_tensor_timer = {0, 0, 0, 
"set_tensor"}; + +struct timer_data get_tensor_from_ptr_timer = {0, 0, 0, "get_tensor_from_ptr"}; +struct timer_data set_tensor_from_ptr_timer = {0, 0, 0, "set_tensor_from_ptr"}; + +static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { + IMPLEMENTED_ONCE; + + struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) buffer->context; + if (context->base) { + return context->base; + } + + context->base = apir_buffer_get_base(BUFFER_TO_GPU(buffer), + BUFFER_TO_APIR_CONTEXT(buffer)); + + return context->base; +} + +static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { + NOT_IMPLEMENTED; + + STOP_HERE; + + UNUSED(buffer); + UNUSED(tensor); + UNUSED(value); + UNUSED(offset); + UNUSED(size); +} + +static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + IMPLEMENTED_ONCE; + + start_timer(&set_tensor_timer); + + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); +#if 0 + INFO("%s: data=%p, offset=%lu, size=%lu\n", __func__, data, offset, size); +#endif +#if 0 + void **addr = (void **)(uintptr_t)data; + for (int i = 0; i <= 10; i++) { + INFO("%s: %p | %llx", __func__, addr, *addr); + addr++; + } + INFO("\n"); +#endif + struct ggml_backend_remoting_buffer_context *context = BUFFER_TO_GGML_CONTEXT(buffer); + if (context->is_from_ptr) { + memcpy((char *)tensor->data + offset, data, size); + } else { + apir_buffer_set_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size); + } + + stop_timer(&set_tensor_timer); + + return; +} + +static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + IMPLEMENTED_ONCE; + + start_timer(&get_tensor_timer); + + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); + struct ggml_backend_remoting_buffer_context *context = BUFFER_TO_GGML_CONTEXT(buffer); + if (context->is_from_ptr) { + memcpy(data, (const char *)tensor->data + offset, size); + } else { + apir_buffer_get_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size); + } + + stop_timer(&get_tensor_timer); +} + +static void ggml_backend_remoting_buffer_set_tensor_from_ptr(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + IMPLEMENTED_ONCE; + + start_timer(&set_tensor_from_ptr_timer); + + UNUSED(buffer); + + memcpy((char *)tensor->data + offset, data, size); + + stop_timer(&set_tensor_from_ptr_timer); + + return; +} + +static void ggml_backend_remoting_buffer_get_tensor_from_ptr(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + IMPLEMENTED_ONCE; + + UNUSED(buffer); + + start_timer(&get_tensor_from_ptr_timer); + + memcpy(data, (const char *)tensor->data + offset, size); + + stop_timer(&get_tensor_from_ptr_timer); +} + +static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { + NOT_IMPLEMENTED; + + STOP_HERE; + + return true; + + UNUSED(buffer); + UNUSED(src); + UNUSED(dst); +} + +static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + IMPLEMENTED_ONCE; + + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); + + apir_buffer_clear(gpu, BUFFER_TO_APIR_CONTEXT(buffer), value); + + return; +} + +static void 
ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) { + UNUSED(buffer); + + IMPLEMENTED_ONCE; + + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); + + apir_buffer_free_buffer(gpu, BUFFER_TO_APIR_CONTEXT(buffer)); + + struct ggml_backend_remoting_buffer_context *context = BUFFER_TO_GGML_CONTEXT(buffer); + free(context); + buffer->context = NULL; +} + +const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { + /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, + /* .get_base = */ ggml_backend_remoting_buffer_get_base, + /* .init_tensor = */ NULL, + /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor, + /* .clear = */ ggml_backend_remoting_buffer_clear, + /* .reset = */ NULL, +}; + +const ggml_backend_buffer_i ggml_backend_remoting_buffer_from_ptr_interface = { + /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, + /* .get_base = */ ggml_backend_remoting_buffer_get_base, + /* .init_tensor = */ NULL, + /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor_from_ptr, + /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor_from_ptr, + /* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor, + /* .clear = */ ggml_backend_remoting_buffer_clear, + /* .reset = */ NULL, +}; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp new file mode 100644 index 0000000000000..f326a554c509e --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -0,0 +1,216 @@ +#include "ggml-remoting.h" + +static const char * +ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { + IMPLEMENTED_ONCE; + + struct virtgpu *gpu = DEV_TO_GPU(dev); + + return apir_device_get_name(gpu); +} + +static const char * +ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) { + IMPLEMENTED; + + struct virtgpu *gpu = DEV_TO_GPU(dev); + + return apir_device_get_description(gpu); +} + +static enum ggml_backend_dev_type +ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { + IMPLEMENTED_ONCE; + struct virtgpu *gpu = DEV_TO_GPU(dev); + + static enum ggml_backend_dev_type type; + static bool has_type = false; + if (!has_type) { + has_type = true; + type = (enum ggml_backend_dev_type) apir_device_get_type(gpu); + } + + return type; +} + +static void +ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + IMPLEMENTED; + + struct virtgpu *gpu = DEV_TO_GPU(dev); + + return apir_device_get_memory(gpu, free, total); +} + +static bool +ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { +#if USE_ALWAYS_TRUE_SUPPORTS_OP == 1 + /* ggml-rpc cheats it like this */ + /* with the current implementation of serialize_tensor, the src/view aren't properly passed */ + UNUSED(dev); + UNUSED(op); + + return true; +#elif USE_METAL_GUEST_SUPPORTS_OP == 1 + UNUSED(dev); + + struct ggml_backend_remoting_device_context *device_ctx = GET_DEVICE_CONTEXT(); + + return ggml_metal_device_supports_op(device_ctx->metal_dev_ctx, op); +#else + struct virtgpu *gpu = DEV_TO_GPU(dev); + + return apir_device_supports_op(gpu, op); +#endif +} + +static bool +ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, 
ggml_backend_buffer_type_t buft) { + //IMPLEMENTED_ONCE; + +#if 1 + bool supported = buft->device == dev; + if (!supported) { + //WARNING("%s: unsupported buffer type (%s). Double check.", __func__, buft->iface.get_name(buft)); + } + + return supported; +#else + UNUSED(dev); + UNUSED(buft); + + return true; +#endif +} + +static bool +ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + //IMPLEMENTED_ONCE; + + UNUSED(dev); + UNUSED(op); + + // related to supports_buft, need to confirm + + return false; // same as ggml-metal +} + +static void +ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + IMPLEMENTED; + + props->name = ggml_backend_remoting_device_get_name(dev); + props->description = ggml_backend_remoting_device_get_description(dev); + props->type = ggml_backend_remoting_device_get_type(dev); + ggml_backend_remoting_device_get_memory(dev, &props->memory_free, &props->memory_total); + +#if 0 + struct virtgpu *gpu = DEV_TO_GPU(dev); + apir_device_get_props(gpu, + &props->caps.async, + &props->caps.host_buffer, + &props->caps.buffer_from_host_ptr, + &props->caps.events + ); +#else + // ignore the actual backend answers and set it as we provide it in + // the API Remoting frontend + props->caps.async = false; + props->caps.host_buffer = false; + props->caps.buffer_from_host_ptr = false; + props->caps.events = false; +#endif + + INFO("%s: async=%d, host_buffer=%d!, buffer_from_host_ptr=%d!, events=%d", + __func__, props->caps.async, props->caps.host_buffer, + props->caps.buffer_from_host_ptr, props->caps.events); +} + +ggml_backend_buffer_type_t +ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { + IMPLEMENTED_ONCE; + + struct virtgpu *gpu = DEV_TO_GPU(dev); + + apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu); + + static struct ggml_backend_buffer_type buft { + /* .iface = */ ggml_backend_remoting_buffer_type_interface, + /* .device = */ dev, + /* .context = */ (void *) ctx, + }; + + return &buft; +} + +static ggml_backend_buffer_type_t +ggml_backend_remoting_device_get_buffer_from_ptr_type(ggml_backend_dev_t dev) { + IMPLEMENTED_ONCE; + + struct virtgpu *gpu = DEV_TO_GPU(dev); + + apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu); + + static struct ggml_backend_buffer_type buft { + /* .iface = */ ggml_backend_remoting_buffer_from_ptr_type_interface, + /* .device = */ dev, + /* .context = */ (void *) ctx, + }; + + return &buft; +} + +static ggml_backend_buffer_t +ggml_backend_remoting_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { + + struct virtgpu *gpu = DEV_TO_GPU(dev); + + struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) malloc(sizeof(*context)); + if (!context) { + FATAL("Couldn't allocate the buffer context ..."); + } + + context->gpu = gpu; + context->apir_context = apir_device_buffer_from_ptr(gpu, size, max_tensor_size); + context->base = ptr; + context->is_from_ptr = true; + + ggml_backend_buffer_t buffer = ggml_backend_buffer_init(ggml_backend_remoting_device_get_buffer_from_ptr_type(dev), ggml_backend_remoting_buffer_from_ptr_interface, (void *) context, size); + + INFO("#"); + INFO("# %s(%p, %llx) --> %p", __func__, ptr, size, buffer); + INFO("#\n"); + + return buffer; +} + +static ggml_backend_buffer_type_t +ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { + IMPLEMENTED_ONCE; + + static struct 
ggml_backend_buffer_type host_bufft = { + /* .iface = */ ggml_backend_remoting_host_buffer_type_interface, + /* .device = */ dev, + /* .context = */ nullptr, + }; + + return &host_bufft; +} + +const struct ggml_backend_device_i ggml_backend_remoting_device_interface = { + /* .get_name = */ ggml_backend_remoting_device_get_name, + /* .get_description = */ ggml_backend_remoting_device_get_description, + /* .get_memory = */ ggml_backend_remoting_device_get_memory, + /* .get_type = */ ggml_backend_remoting_device_get_type, + /* .get_props = */ ggml_backend_remoting_device_get_props, + /* .init_backend = */ ggml_backend_remoting_device_init, + /* .get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ ggml_backend_remoting_device_buffer_from_ptr, + /* .supports_op = */ ggml_backend_remoting_device_supports_op, + /* .supports_buft = */ ggml_backend_remoting_device_supports_buft, + /* .offload_op = */ ggml_backend_remoting_device_offload_op, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp new file mode 100644 index 0000000000000..c09c80d6472f5 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp @@ -0,0 +1,110 @@ +#include "ggml-remoting.h" + +#define BUFT_TO_GPU(name) \ + ((struct ggml_backend_remoting_device_context *) (name)->device->context)->gpu + +extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; + +static void +ggml_backend_remoting_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { + BEING_IMPLEMENTED; + + void *ptr = buffer->context; + + if (ptr == nullptr) { + return; + } + struct ggml_backend_remoting_device_context *device_ctx = GET_DEVICE_CONTEXT(); + + struct vn_renderer_shmem *shmem = nullptr; + size_t index; + + for (size_t i = 0; i < device_ctx->shared_memory.size(); i++) { + const uint8_t* addr = (const uint8_t*) std::get<0>(device_ctx->shared_memory[i]) /* ptr */; + const uint8_t* endr = addr + std::get<1>(device_ctx->shared_memory[i]) /* size */; + if (ptr >= addr && ptr < endr) { + shmem = std::get<2>(device_ctx->shared_memory[i]) /* shmem */; + index = i; + break; + } + } + + if (shmem == nullptr) { + WARNING("failed to free host shared memory: memory not in map\n"); + return; + } + + virtgpu_shmem_destroy(device_ctx->gpu, shmem->shmem); + + device_ctx->shared_memory.erase(device_ctx->shared_memory.begin() + index); +} + +static ggml_backend_buffer_t +ggml_backend_remoting_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + IMPLEMENTED; + + struct virtgpu *gpu = BUFT_TO_GPU(buft); + + struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) malloc(sizeof(*context)); + if (!context) { + FATAL("Couldn't allocate the buffer context ..."); + } + + context->gpu = gpu; + context->apir_context = apir_device_buffer_from_ptr(gpu, size, size); + context->base = context->apir_context.shmem->mmap_ptr; + context->is_host_buffer = true; + + ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); + INFO("##"); + INFO("## %s(%llx) --> %p <======================", __func__, size, buffer); + INFO("##\n"); + + return buffer; +} + +static const char * 
+ggml_backend_remoting_host_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + + IMPLEMENTED_ONCE; + + return "GUEST host buffer"; +} + +static size_t +ggml_backend_remoting_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + + IMPLEMENTED_ONCE; + + return 64; // not 100% sure ... +} + +static bool +ggml_backend_remoting_host_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + + IMPLEMENTED_ONCE; + + return true; +} + +static size_t +ggml_backend_remoting_host_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + + IMPLEMENTED; + STOP_HERE; + + return SIZE_MAX; +} + +const ggml_backend_buffer_type_i ggml_backend_remoting_host_buffer_type_interface = { + /* .get_name = */ ggml_backend_remoting_host_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_remoting_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_remoting_host_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_remoting_host_buffer_type_get_max_size, + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .is_host = */ ggml_backend_remoting_host_buffer_type_is_host, + }; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp new file mode 100644 index 0000000000000..3d20d8c04a727 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -0,0 +1,159 @@ +#include +#include + +#include "ggml-remoting.h" +#include "ggml-metal-remoting.h" + +static struct virtgpu *apir_initialize() { + static struct virtgpu *apir_gpu_instance = NULL; + static bool apir_initialized = false; + + if (apir_initialized) { + return apir_gpu_instance; + } + apir_initialized = true; + + apir_gpu_instance = create_virtgpu(); + if (!apir_gpu_instance) { + FATAL("failed to initialize the virtgpu :/"); + return NULL; + } + + apir_initialized = true; + + return apir_gpu_instance; +} + +static int ggml_backend_remoting_get_device_count() { + IMPLEMENTED; + + struct virtgpu *gpu = apir_initialize(); + if (!gpu) { + WARNING("apir_initialize failed :/"); + return 0; + } + + return apir_device_get_count(gpu); +} + +static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { + UNUSED(reg); + + IMPLEMENTED; + + return ggml_backend_remoting_get_device_count(); +} + +static std::vector devices; + +ggml_backend_dev_t ggml_backend_remoting_get_device(size_t device) { + GGML_ASSERT(device < devices.size()); + return devices[device]; +} + +static void ggml_backend_remoting_reg_init_devices(ggml_backend_reg_t reg) { + IMPLEMENTED; + + if (devices.size() > 0) { + INFO("%s: already initialized", __func__); + } + + struct virtgpu *gpu = apir_initialize(); + if (!gpu) { + FATAL("apir_initialize failed :/"); + return; + } + + static bool initialized = false; + + { + static std::mutex mutex; + std::lock_guard lock(mutex); + if (!initialized) { + + for (int i = 0; i < ggml_backend_remoting_get_device_count(); i++) { + ggml_backend_remoting_device_context *ctx = new ggml_backend_remoting_device_context; + char desc[256] = "API Remoting device"; + + ctx->device = i; + ctx->name = GGML_REMOTING_FRONTEND_NAME + std::to_string(i); + ctx->description = desc; + ctx->gpu = gpu; + + ggml_backend_dev_t dev = new ggml_backend_device { + /* .iface = */ ggml_backend_remoting_device_interface, + /* .reg = */ reg, + /* .context = */ ctx, + }; + + ctx->metal_dev_ctx = get_metal_dev_context(dev); + + devices.push_back(dev); + } + 
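+      // Usage sketch (illustrative; only public ggml-backend API calls): once
+      // registered, the devices created above are reached through the regular
+      // backend registry, e.g.:
+      //   ggml_backend_reg_t reg = ggml_backend_remoting_frontend_reg();
+      //   for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
+      //       ggml_backend_dev_t d = ggml_backend_reg_dev_get(reg, i);
+      //       ggml_backend_t backend = ggml_backend_dev_init(d, /*params=*/NULL);
+      //   }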
initialized = true; + } + } +} + +static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) { + UNUSED(reg); + + IMPLEMENTED; + + return ggml_backend_remoting_get_device(device); +} + +static const char *ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { + UNUSED(reg); + + return GGML_REMOTING_FRONTEND_NAME; +} + +static const struct ggml_backend_reg_i ggml_backend_remoting_reg_i = { + /* .get_name = */ ggml_backend_remoting_reg_get_name, + /* .get_device_count = */ ggml_backend_remoting_reg_get_device_count, + /* .get_device = */ ggml_backend_remoting_reg_get_device, + /* .get_proc_address = */ NULL, +}; + + +static void showTime() { + show_timer(&graph_compute_timer); + show_timer(&get_tensor_timer); + show_timer(&set_tensor_timer); + show_timer(&wait_host_reply_timer); + + if (get_tensor_from_ptr_timer.count) { + show_timer(&get_tensor_from_ptr_timer); + show_timer(&set_tensor_from_ptr_timer); + } +} + +ggml_backend_reg_t ggml_backend_remoting_frontend_reg() { + struct virtgpu *gpu = apir_initialize(); + if (!gpu) { + FATAL("apir_initialize failed :/"); + return NULL; + } + + static ggml_backend_reg reg = { + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_remoting_reg_i, + /* .context = */ gpu, + }; + + static bool initialized = false; + if (initialized) { + return ® + } + initialized = true; + + ggml_backend_remoting_reg_init_devices(®); + + int cr = atexit(showTime); + GGML_ASSERT(cr == 0); + + MESSAGE("%s: initialzed", __func__); + + return ® +} diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp new file mode 100644 index 0000000000000..d1847a7583b94 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp @@ -0,0 +1,87 @@ +#include "ggml-remoting.h" + +static const char * ggml_backend_remoting_get_name(ggml_backend_t backend) { + UNUSED(backend); + + //IMPLEMENTED_ONCE; + + return "API Remoting backend"; +} + +static void ggml_backend_remoting_free(ggml_backend_t backend) { + IMPLEMENTED; + + delete backend; +} + +struct timer_data graph_compute_timer = {0, 0, 0, "compute_timer"}; + +static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + struct virtgpu *gpu = DEV_TO_GPU(backend->device); + + IMPLEMENTED_ONCE; + + start_timer(&graph_compute_timer); + + ggml_status status = apir_backend_graph_compute(gpu, cgraph); + + stop_timer(&graph_compute_timer); + + return status; +} + +static void ggml_backend_remoting_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) { + struct virtgpu *gpu = DEV_TO_GPU(backend->device); +#if true + UNUSED(gpu); + UNUSED(cgraph); + + NOT_IMPLEMENTED; +#else + start_timer(&graph_compute_timer); + + apir_backend_graph_optimize(gpu, cgraph); + + stop_timer(&graph_compute_timer); +#endif +} + +static ggml_backend_i ggml_backend_remoting_interface = { + /* .get_name = */ ggml_backend_remoting_get_name, + /* .free = */ ggml_backend_remoting_free, + /* .set_tensor_async = */ NULL, // ggml_backend_remoting_set_tensor_async, + /* .get_tensor_async = */ NULL, // ggml_backend_remoting_get_tensor_async, + /* .cpy_tensor_async = */ NULL, // ggml_backend_remoting_cpy_tensor_async, + /* .synchronize = */ NULL, // ggml_backend_remoting_synchronize, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_remoting_graph_compute, + /* .event_record 
= */ NULL, + /* .event_wait = */ NULL, + /* .graph_optimize = */ ggml_backend_remoting_graph_optimize, +}; + +static ggml_guid_t ggml_backend_remoting_guid() { + static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x14, 0x03, 0x86, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b }; + + return &guid; +} + + +ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params) { + UNUSED(params); + IMPLEMENTED; + + ggml_backend_remoting_device_context * ctx = (ggml_backend_remoting_device_context *)dev->context; + + ggml_backend_t remoting_backend = new ggml_backend { + /* .guid = */ ggml_backend_remoting_guid(), + /* .interface = */ ggml_backend_remoting_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_frontend_reg(), ctx->device), + /* .context = */ ctx, + }; + + return remoting_backend; +} diff --git a/ggml/src/ggml-remotingfrontend/ggml-metal-remoting.cpp b/ggml/src/ggml-remotingfrontend/ggml-metal-remoting.cpp new file mode 100644 index 0000000000000..2927186b80b24 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-metal-remoting.cpp @@ -0,0 +1,254 @@ +#include "ggml-remoting.h" +#include "ggml-metal-remoting.h" + +const struct ggml_backend_metal_device_context *get_metal_dev_context(const ggml_backend_dev_t dev) { + static struct ggml_backend_metal_device_context metal_dev_ctx; + static bool has_metal_dev_ctx = false; + + if (has_metal_dev_ctx) { + return &metal_dev_ctx; + } + + has_metal_dev_ctx = true; + struct virtgpu *gpu = DEV_TO_GPU(dev); + + apir_metal_get_device_context(gpu, &metal_dev_ctx); + + return &metal_dev_ctx; +} + +bool ggml_metal_device_supports_op(const struct ggml_backend_metal_device_context *dev_ctx, const struct ggml_tensor * op) { + const bool has_simdgroup_mm = dev_ctx->has_simdgroup_mm; + const bool has_simdgroup_reduction = dev_ctx->has_simdgroup_reduction; + const bool has_bfloat = dev_ctx->has_bfloat; + + if (!has_bfloat) { + if (op->type == GGML_TYPE_BF16) { + return false; + } + + for (size_t i = 0, n = 3; i < n; ++i) { + if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) { + return false; + } + } + } + + switch (op->op) { + case GGML_OP_UNARY: + switch (ggml_get_unary_op(op)) { + case GGML_UNARY_OP_TANH: + case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_SIGMOID: + case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_ERF: + case GGML_UNARY_OP_GELU_QUICK: + case GGML_UNARY_OP_SILU: + case GGML_UNARY_OP_ELU: + case GGML_UNARY_OP_NEG: + case GGML_UNARY_OP_ABS: + case GGML_UNARY_OP_SGN: + case GGML_UNARY_OP_STEP: + case GGML_UNARY_OP_HARDSWISH: + case GGML_UNARY_OP_HARDSIGMOID: + case GGML_UNARY_OP_EXP: + return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; + default: + return false; + } + case GGML_OP_GLU: + switch (ggml_get_glu_op(op)) { + case GGML_GLU_OP_REGLU: + case GGML_GLU_OP_GEGLU: + case GGML_GLU_OP_SWIGLU: + case GGML_GLU_OP_SWIGLU_OAI: + case GGML_GLU_OP_GEGLU_ERF: + case GGML_GLU_OP_GEGLU_QUICK: + return ggml_is_contiguous_1(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; + default: + return false; + } + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_TRANSPOSE: + case GGML_OP_PERMUTE: + case GGML_OP_CONCAT: + return true; + case GGML_OP_ADD: + case GGML_OP_SUB: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_ADD_ID: + return op->src[0]->type == GGML_TYPE_F32; + case GGML_OP_ACC: + case GGML_OP_REPEAT: + case GGML_OP_SCALE: + case GGML_OP_CONV_TRANSPOSE_1D: + return true; + case GGML_OP_CONV_TRANSPOSE_2D: + return 
ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]) && + (op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32) && + op->src[1]->type == GGML_TYPE_F32 && + op->type == GGML_TYPE_F32; + case GGML_OP_CLAMP: + return op->src[0]->type == GGML_TYPE_F32; + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_SIN: + case GGML_OP_COS: + case GGML_OP_LOG: + return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; + case GGML_OP_SUM: + return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]); + case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: + case GGML_OP_SOFT_MAX: + case GGML_OP_GROUP_NORM: + return has_simdgroup_reduction && ggml_is_contiguous_rows(op->src[0]); + case GGML_OP_L2_NORM: + return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0])); + case GGML_OP_ARGMAX: + return has_simdgroup_reduction; + case GGML_OP_NORM: + case GGML_OP_RMS_NORM: + return has_simdgroup_reduction && (ggml_is_contiguous_rows(op->src[0])); + case GGML_OP_ROPE: + return true; + case GGML_OP_IM2COL: + return ggml_is_contiguous(op->src[1]) && op->src[1]->type == GGML_TYPE_F32 && (op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32); + case GGML_OP_POOL_1D: + return false; + case GGML_OP_UPSCALE: + return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST; + case GGML_OP_POOL_2D: + return op->src[0]->type == GGML_TYPE_F32; + case GGML_OP_PAD: + return (ggml_get_op_params_i32(op, 0) == 0) && (ggml_get_op_params_i32(op, 2) == 0) && + (ggml_get_op_params_i32(op, 4) == 0) && (ggml_get_op_params_i32(op, 6) == 0); + case GGML_OP_PAD_REFLECT_1D: + case GGML_OP_TIMESTEP_EMBEDDING: + case GGML_OP_LEAKY_RELU: + return op->src[0]->type == GGML_TYPE_F32; + case GGML_OP_ARGSORT: + // TODO: Support arbitrary column width + return op->src[0]->ne[0] <= 1024; + case GGML_OP_ARANGE: + return true; + case GGML_OP_FLASH_ATTN_EXT: + // for new head sizes, add checks here + if (op->src[0]->ne[0] != 32 && + op->src[0]->ne[0] != 40 && + op->src[0]->ne[0] != 64 && + op->src[0]->ne[0] != 72 && + op->src[0]->ne[0] != 80 && + op->src[0]->ne[0] != 96 && + op->src[0]->ne[0] != 112 && + op->src[0]->ne[0] != 128 && + op->src[0]->ne[0] != 192 && + op->src[0]->ne[0] != 256) { + return false; + } + if (op->src[0]->ne[0] == 576) { + // DeepSeek sizes + // TODO: disabled for now, until optmized + return false; + } + if (op->src[1]->type != op->src[2]->type) { + return false; + } + return has_simdgroup_mm; // TODO: over-restricted for vec-kernels + case GGML_OP_SSM_CONV: + case GGML_OP_SSM_SCAN: + return has_simdgroup_reduction; + case GGML_OP_RWKV_WKV6: + case GGML_OP_RWKV_WKV7: + return true; + case GGML_OP_MUL_MAT: + case GGML_OP_MUL_MAT_ID: + return has_simdgroup_reduction; + case GGML_OP_CPY: + case GGML_OP_DUP: + case GGML_OP_CONT: + { + switch (op->src[0]->type) { + case GGML_TYPE_F32: + switch (op->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_BF16: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_I32: + return true; + default: + return false; + } + case GGML_TYPE_F16: + switch (op->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + return true; + default: + return false; + } + case GGML_TYPE_BF16: + switch (op->type) { + case GGML_TYPE_F32: + case GGML_TYPE_BF16: + return true; + default: + return false; + } + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: 
+ case GGML_TYPE_Q8_0: + switch (op->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + return true; + default: + return false; + } + case GGML_TYPE_I32: + return op->type == GGML_TYPE_F32; + default: + return false; + }; + } + case GGML_OP_GET_ROWS: + return true; + case GGML_OP_SET_ROWS: + { + if (op->src[0]->type != GGML_TYPE_F32) { + return false; + } + + switch (op->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_BF16: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_IQ4_NL: + return true; + default: + return false; + }; + } + case GGML_OP_OPT_STEP_ADAMW: + case GGML_OP_OPT_STEP_SGD: + return has_simdgroup_reduction; + default: + return false; + } +} diff --git a/ggml/src/ggml-remotingfrontend/ggml-metal-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-metal-remoting.h new file mode 100644 index 0000000000000..4463414694665 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-metal-remoting.h @@ -0,0 +1,16 @@ +#pragma once + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +struct ggml_backend_metal_device_context { + bool has_simdgroup_mm; + bool has_simdgroup_reduction; + bool has_bfloat; +}; + + +const struct ggml_backend_metal_device_context *get_metal_dev_context(const ggml_backend_dev_t dev); + +bool ggml_metal_device_supports_op(const struct ggml_backend_metal_device_context *dev_ctx, const struct ggml_tensor * op); diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp b/ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp new file mode 100644 index 0000000000000..87679fe59a8d3 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp @@ -0,0 +1,26 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ggml-remoting-frontend.h" +#include "remoting.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" + + + +int ggml_backend_remoting_get_device_count(); + + + + +struct remoting_device_struct { + std::mutex mutex; +}; diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h new file mode 100644 index 0000000000000..c6f39a533da4a --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -0,0 +1,139 @@ +#pragma once + +#include +#include + +#include "ggml-remoting-frontend.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include "ggml-metal-remoting.h" +#include "virtgpu.h" + + +// 1 is fast, 0 avoid micro-benchmark crashes +#define USE_ALWAYS_TRUE_SUPPORTS_OP 0 +#define USE_METAL_GUEST_SUPPORTS_OP 1 + +#define DEV_TO_GPU(name) \ + ((struct ggml_backend_remoting_device_context *) (name)->context)->gpu + +#define BUFFER_TO_GGML_CONTEXT(name) \ + ((struct ggml_backend_remoting_buffer_context *) (name)->context) + +#define BUFFER_TO_APIR_CONTEXT(name) \ + &((struct ggml_backend_remoting_buffer_context *) (name)->context)->apir_context + +#define BUFFER_TO_HOST_HANDLE(name) \ + ((struct ggml_backend_remoting_buffer_context *) (name)->context)->apir_context.host_handle + +#define GET_DEVICE_CONTEXT() \ + (struct ggml_backend_remoting_device_context *) ggml_backend_remoting_get_device(0)->context + +static inline apir_buffer_type_host_handle_t +ggml_buffer_type_to_apir_handle(ggml_backend_buffer_type_t buft) { + // in the backend, the buffer handle is the buffer pointer + return (apir_buffer_type_host_handle_t) buft->context; +} + +#define 
NOT_IMPLEMENTED \ + do { \ + static bool first = true; \ + if (first) { \ + printf("\nWARN: ###\nWARN: ### reached unimplemented function %s\nWARN: ###\n\n", __func__); \ + first = false; \ + } \ + } while(0) + +#define BEING_IMPLEMENTED \ + do { \ + printf("\nINFO: ###\nINFO: ### function being implemented: %s\nINFO: ###\n\n", __func__); \ + } while(0) + +#define NEXT + +#define STOP_HERE \ + thks_bye() + +#define BREAKPOINT \ + breakpoint() + +#ifndef NDEBUG +#define IMPLEMENTED \ + printf("INFO: ### reached implemented function %s\n", __func__) +#else +#define IMPLEMENTED \ + do {} while(0) +#endif + +#ifndef NDEBUG +#define IMPLEMENTED_ONCE \ + do { \ + static bool first = true; \ + if (first) { \ + printf("INFO: ### reached implemented function %s\n", __func__); \ + first = false; \ + } \ + } while(0) +#else +#define IMPLEMENTED_ONCE \ + do {} while(0) +#endif + +#define RMT_LOG_DEBUG(msg) std::cerr << msg << std::endl + +struct ggml_backend_remoting_device_context { + size_t device; + std::string name; + std::string description; + + std::vector> shared_memory; + + struct virtgpu *gpu; + + const struct ggml_backend_metal_device_context *metal_dev_ctx; +}; + +struct ggml_backend_remoting_buffer_context { + apir_buffer_context_t apir_context; + + struct virtgpu *gpu; + + void *base; + + bool is_host_buffer; + bool is_from_ptr; +}; + +extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface; +extern const struct ggml_backend_device_i ggml_backend_remoting_device_interface; +extern const ggml_backend_buffer_type_i ggml_backend_remoting_host_buffer_type_interface; +extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; +extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_from_ptr_type_interface; +extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_from_ptr_interface; + +ggml_backend_dev_t ggml_backend_remoting_get_device(size_t device); +ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type(); +ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params); +ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev); + +struct remoting_buffer_struct; +typedef std::shared_ptr remoting_buffer; +typedef std::weak_ptr remoting_buffer_ref; + +void ggml_remoting_destroy_buffer(remoting_buffer& buf); + +struct remoting_device_struct; +typedef std::shared_ptr remoting_device; +typedef std::weak_ptr remoting_device_ref; + +struct remoting_context_struct { + int i; +}; +typedef std::shared_ptr remoting_context; +typedef std::weak_ptr remoting_context_ref; + +static inline apir_buffer_host_handle_t ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) { + return BUFFER_TO_HOST_HANDLE(buffer); +} diff --git a/ggml/src/ggml-remotingfrontend/include/drm-uapi/drm.h b/ggml/src/ggml-remotingfrontend/include/drm-uapi/drm.h new file mode 100644 index 0000000000000..4e4f7c2c39e4f --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/include/drm-uapi/drm.h @@ -0,0 +1,1408 @@ +/* + * Header for the Direct Rendering Manager + * + * Author: Rickard E. (Rik) Faith + * + * Acknowledgments: + * Dec 1999, Richard Henderson , move to generic cmpxchg. + */ + +/* + * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas. + * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. + * All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _DRM_H_ +#define _DRM_H_ + +#if defined(__linux__) + +#include +#include +typedef unsigned int drm_handle_t; + +#else /* One of the BSDs */ + +#include +#include +#include +typedef int8_t __s8; +typedef uint8_t __u8; +typedef int16_t __s16; +typedef uint16_t __u16; +typedef int32_t __s32; +typedef uint32_t __u32; +typedef int64_t __s64; +typedef uint64_t __u64; +typedef size_t __kernel_size_t; +typedef unsigned long drm_handle_t; + +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +#define DRM_NAME "drm" /**< Name in kernel, /dev, and /proc */ +#define DRM_MIN_ORDER 5 /**< At least 2^5 bytes = 32 bytes */ +#define DRM_MAX_ORDER 22 /**< Up to 2^22 bytes = 4MB */ +#define DRM_RAM_PERCENT 10 /**< How much system ram can we lock? */ + +#define _DRM_LOCK_HELD 0x80000000U /**< Hardware lock is held */ +#define _DRM_LOCK_CONT 0x40000000U /**< Hardware lock is contended */ +#define _DRM_LOCK_IS_HELD(lock) ((lock) & _DRM_LOCK_HELD) +#define _DRM_LOCK_IS_CONT(lock) ((lock) & _DRM_LOCK_CONT) +#define _DRM_LOCKING_CONTEXT(lock) ((lock) & ~(_DRM_LOCK_HELD|_DRM_LOCK_CONT)) + +typedef unsigned int drm_context_t; +typedef unsigned int drm_drawable_t; +typedef unsigned int drm_magic_t; + +/* + * Cliprect. + * + * \warning: If you change this structure, make sure you change + * XF86DRIClipRectRec in the server as well + * + * \note KW: Actually it's illegal to change either for + * backwards-compatibility reasons. + */ +struct drm_clip_rect { + unsigned short x1; + unsigned short y1; + unsigned short x2; + unsigned short y2; +}; + +/* + * Drawable information. + */ +struct drm_drawable_info { + unsigned int num_rects; + struct drm_clip_rect *rects; +}; + +/* + * Texture region, + */ +struct drm_tex_region { + unsigned char next; + unsigned char prev; + unsigned char in_use; + unsigned char padding; + unsigned int age; +}; + +/* + * Hardware lock. + * + * The lock structure is a simple cache-line aligned integer. To avoid + * processor bus contention on a multiprocessor system, there should not be any + * other data stored in the same cache line. + */ +struct drm_hw_lock { + __volatile__ unsigned int lock; /**< lock variable */ + char padding[60]; /**< Pad to cache line */ +}; + +/* + * DRM_IOCTL_VERSION ioctl argument type. + * + * \sa drmGetVersion(). 
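The DRM_IOCTL_VERSION query documented here (struct drm_version just below) is a two-pass call: the first ioctl only fills in the `*_len` fields, the caller then allocates string buffers and repeats the call. A minimal sketch, with an invented helper name and only the driver name fetched; libdrm's drmGetVersion() wraps the same dance:

#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include "drm.h"

// Return the kernel driver name for an opened DRM fd (e.g. a render node),
// or NULL on failure. The caller frees the result.
static char *drm_driver_name(int fd) {
    struct drm_version ver;
    memset(&ver, 0, sizeof(ver));
    if (ioctl(fd, DRM_IOCTL_VERSION, &ver) != 0)      // pass 1: query lengths only
        return NULL;
    char *name = (char *) calloc(ver.name_len + 1, 1);
    if (!name)
        return NULL;
    ver.name = name;                                   // pass 2: fetch the name string
    ver.date_len = 0;                                  // skip the fields we don't need
    ver.desc_len = 0;
    if (ioctl(fd, DRM_IOCTL_VERSION, &ver) != 0) {
        free(name);
        return NULL;
    }
    return name;
}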
+ */ +struct drm_version { + int version_major; /**< Major version */ + int version_minor; /**< Minor version */ + int version_patchlevel; /**< Patch level */ + __kernel_size_t name_len; /**< Length of name buffer */ + char *name; /**< Name of driver */ + __kernel_size_t date_len; /**< Length of date buffer */ + char *date; /**< User-space buffer to hold date */ + __kernel_size_t desc_len; /**< Length of desc buffer */ + char *desc; /**< User-space buffer to hold desc */ +}; + +/* + * DRM_IOCTL_GET_UNIQUE ioctl argument type. + * + * \sa drmGetBusid() and drmSetBusId(). + */ +struct drm_unique { + __kernel_size_t unique_len; /**< Length of unique */ + char *unique; /**< Unique name for driver instantiation */ +}; + +struct drm_list { + int count; /**< Length of user-space structures */ + struct drm_version *version; +}; + +struct drm_block { + int unused; +}; + +/* + * DRM_IOCTL_CONTROL ioctl argument type. + * + * \sa drmCtlInstHandler() and drmCtlUninstHandler(). + */ +struct drm_control { + enum { + DRM_ADD_COMMAND, + DRM_RM_COMMAND, + DRM_INST_HANDLER, + DRM_UNINST_HANDLER + } func; + int irq; +}; + +/* + * Type of memory to map. + */ +enum drm_map_type { + _DRM_FRAME_BUFFER = 0, /**< WC (no caching), no core dump */ + _DRM_REGISTERS = 1, /**< no caching, no core dump */ + _DRM_SHM = 2, /**< shared, cached */ + _DRM_AGP = 3, /**< AGP/GART */ + _DRM_SCATTER_GATHER = 4, /**< Scatter/gather memory for PCI DMA */ + _DRM_CONSISTENT = 5 /**< Consistent memory for PCI DMA */ +}; + +/* + * Memory mapping flags. + */ +enum drm_map_flags { + _DRM_RESTRICTED = 0x01, /**< Cannot be mapped to user-virtual */ + _DRM_READ_ONLY = 0x02, + _DRM_LOCKED = 0x04, /**< shared, cached, locked */ + _DRM_KERNEL = 0x08, /**< kernel requires access */ + _DRM_WRITE_COMBINING = 0x10, /**< use write-combining if available */ + _DRM_CONTAINS_LOCK = 0x20, /**< SHM page that contains lock */ + _DRM_REMOVABLE = 0x40, /**< Removable mapping */ + _DRM_DRIVER = 0x80 /**< Managed by driver */ +}; + +struct drm_ctx_priv_map { + unsigned int ctx_id; /**< Context requesting private mapping */ + void *handle; /**< Handle of map */ +}; + +/* + * DRM_IOCTL_GET_MAP, DRM_IOCTL_ADD_MAP and DRM_IOCTL_RM_MAP ioctls + * argument type. + * + * \sa drmAddMap(). + */ +struct drm_map { + unsigned long offset; /**< Requested physical address (0 for SAREA)*/ + unsigned long size; /**< Requested physical size (bytes) */ + enum drm_map_type type; /**< Type of memory to map */ + enum drm_map_flags flags; /**< Flags */ + void *handle; /**< User-space: "Handle" to pass to mmap() */ + /**< Kernel-space: kernel-virtual address */ + int mtrr; /**< MTRR slot used */ + /* Private data */ +}; + +/* + * DRM_IOCTL_GET_CLIENT ioctl argument type. + */ +struct drm_client { + int idx; /**< Which client desired? */ + int auth; /**< Is client authenticated? 
*/ + unsigned long pid; /**< Process ID */ + unsigned long uid; /**< User ID */ + unsigned long magic; /**< Magic */ + unsigned long iocs; /**< Ioctl count */ +}; + +enum drm_stat_type { + _DRM_STAT_LOCK, + _DRM_STAT_OPENS, + _DRM_STAT_CLOSES, + _DRM_STAT_IOCTLS, + _DRM_STAT_LOCKS, + _DRM_STAT_UNLOCKS, + _DRM_STAT_VALUE, /**< Generic value */ + _DRM_STAT_BYTE, /**< Generic byte counter (1024bytes/K) */ + _DRM_STAT_COUNT, /**< Generic non-byte counter (1000/k) */ + + _DRM_STAT_IRQ, /**< IRQ */ + _DRM_STAT_PRIMARY, /**< Primary DMA bytes */ + _DRM_STAT_SECONDARY, /**< Secondary DMA bytes */ + _DRM_STAT_DMA, /**< DMA */ + _DRM_STAT_SPECIAL, /**< Special DMA (e.g., priority or polled) */ + _DRM_STAT_MISSED /**< Missed DMA opportunity */ + /* Add to the *END* of the list */ +}; + +/* + * DRM_IOCTL_GET_STATS ioctl argument type. + */ +struct drm_stats { + unsigned long count; + struct { + unsigned long value; + enum drm_stat_type type; + } data[15]; +}; + +/* + * Hardware locking flags. + */ +enum drm_lock_flags { + _DRM_LOCK_READY = 0x01, /**< Wait until hardware is ready for DMA */ + _DRM_LOCK_QUIESCENT = 0x02, /**< Wait until hardware quiescent */ + _DRM_LOCK_FLUSH = 0x04, /**< Flush this context's DMA queue first */ + _DRM_LOCK_FLUSH_ALL = 0x08, /**< Flush all DMA queues first */ + /* These *HALT* flags aren't supported yet + -- they will be used to support the + full-screen DGA-like mode. */ + _DRM_HALT_ALL_QUEUES = 0x10, /**< Halt all current and future queues */ + _DRM_HALT_CUR_QUEUES = 0x20 /**< Halt all current queues */ +}; + +/* + * DRM_IOCTL_LOCK, DRM_IOCTL_UNLOCK and DRM_IOCTL_FINISH ioctl argument type. + * + * \sa drmGetLock() and drmUnlock(). + */ +struct drm_lock { + int context; + enum drm_lock_flags flags; +}; + +/* + * DMA flags + * + * \warning + * These values \e must match xf86drm.h. + * + * \sa drm_dma. + */ +enum drm_dma_flags { + /* Flags for DMA buffer dispatch */ + _DRM_DMA_BLOCK = 0x01, /**< + * Block until buffer dispatched. + * + * \note The buffer may not yet have + * been processed by the hardware -- + * getting a hardware lock with the + * hardware quiescent will ensure + * that the buffer has been + * processed. + */ + _DRM_DMA_WHILE_LOCKED = 0x02, /**< Dispatch while lock held */ + _DRM_DMA_PRIORITY = 0x04, /**< High priority dispatch */ + + /* Flags for DMA buffer request */ + _DRM_DMA_WAIT = 0x10, /**< Wait for free buffers */ + _DRM_DMA_SMALLER_OK = 0x20, /**< Smaller-than-requested buffers OK */ + _DRM_DMA_LARGER_OK = 0x40 /**< Larger-than-requested buffers OK */ +}; + +/* + * DRM_IOCTL_ADD_BUFS and DRM_IOCTL_MARK_BUFS ioctl argument type. + * + * \sa drmAddBufs(). + */ +struct drm_buf_desc { + int count; /**< Number of buffers of this size */ + int size; /**< Size in bytes */ + int low_mark; /**< Low water mark */ + int high_mark; /**< High water mark */ + enum { + _DRM_PAGE_ALIGN = 0x01, /**< Align on page boundaries for DMA */ + _DRM_AGP_BUFFER = 0x02, /**< Buffer is in AGP space */ + _DRM_SG_BUFFER = 0x04, /**< Scatter/gather memory buffer */ + _DRM_FB_BUFFER = 0x08, /**< Buffer is in frame buffer */ + _DRM_PCI_BUFFER_RO = 0x10 /**< Map PCI DMA buffer read-only */ + } flags; + unsigned long agp_start; /**< + * Start address of where the AGP buffers are + * in the AGP aperture + */ +}; + +/* + * DRM_IOCTL_INFO_BUFS ioctl argument type. + */ +struct drm_buf_info { + int count; /**< Entries in list */ + struct drm_buf_desc *list; +}; + +/* + * DRM_IOCTL_FREE_BUFS ioctl argument type. 
+ */ +struct drm_buf_free { + int count; + int *list; +}; + +/* + * Buffer information + * + * \sa drm_buf_map. + */ +struct drm_buf_pub { + int idx; /**< Index into the master buffer list */ + int total; /**< Buffer size */ + int used; /**< Amount of buffer in use (for DMA) */ + void *address; /**< Address of buffer */ +}; + +/* + * DRM_IOCTL_MAP_BUFS ioctl argument type. + */ +struct drm_buf_map { + int count; /**< Length of the buffer list */ +#ifdef __cplusplus + void *virt; +#else + void *virtual; /**< Mmap'd area in user-virtual */ +#endif + struct drm_buf_pub *list; /**< Buffer information */ +}; + +/* + * DRM_IOCTL_DMA ioctl argument type. + * + * Indices here refer to the offset into the buffer list in drm_buf_get. + * + * \sa drmDMA(). + */ +struct drm_dma { + int context; /**< Context handle */ + int send_count; /**< Number of buffers to send */ + int *send_indices; /**< List of handles to buffers */ + int *send_sizes; /**< Lengths of data to send */ + enum drm_dma_flags flags; /**< Flags */ + int request_count; /**< Number of buffers requested */ + int request_size; /**< Desired size for buffers */ + int *request_indices; /**< Buffer information */ + int *request_sizes; + int granted_count; /**< Number of buffers granted */ +}; + +enum drm_ctx_flags { + _DRM_CONTEXT_PRESERVED = 0x01, + _DRM_CONTEXT_2DONLY = 0x02 +}; + +/* + * DRM_IOCTL_ADD_CTX ioctl argument type. + * + * \sa drmCreateContext() and drmDestroyContext(). + */ +struct drm_ctx { + drm_context_t handle; + enum drm_ctx_flags flags; +}; + +/* + * DRM_IOCTL_RES_CTX ioctl argument type. + */ +struct drm_ctx_res { + int count; + struct drm_ctx *contexts; +}; + +/* + * DRM_IOCTL_ADD_DRAW and DRM_IOCTL_RM_DRAW ioctl argument type. + */ +struct drm_draw { + drm_drawable_t handle; +}; + +/* + * DRM_IOCTL_UPDATE_DRAW ioctl argument type. + */ +typedef enum { + DRM_DRAWABLE_CLIPRECTS +} drm_drawable_info_type_t; + +struct drm_update_draw { + drm_drawable_t handle; + unsigned int type; + unsigned int num; + unsigned long long data; +}; + +/* + * DRM_IOCTL_GET_MAGIC and DRM_IOCTL_AUTH_MAGIC ioctl argument type. + */ +struct drm_auth { + drm_magic_t magic; +}; + +/* + * DRM_IOCTL_IRQ_BUSID ioctl argument type. + * + * \sa drmGetInterruptFromBusID(). 
+ */ +struct drm_irq_busid { + int irq; /**< IRQ number */ + int busnum; /**< bus number */ + int devnum; /**< device number */ + int funcnum; /**< function number */ +}; + +enum drm_vblank_seq_type { + _DRM_VBLANK_ABSOLUTE = 0x0, /**< Wait for specific vblank sequence number */ + _DRM_VBLANK_RELATIVE = 0x1, /**< Wait for given number of vblanks */ + /* bits 1-6 are reserved for high crtcs */ + _DRM_VBLANK_HIGH_CRTC_MASK = 0x0000003e, + _DRM_VBLANK_EVENT = 0x4000000, /**< Send event instead of blocking */ + _DRM_VBLANK_FLIP = 0x8000000, /**< Scheduled buffer swap should flip */ + _DRM_VBLANK_NEXTONMISS = 0x10000000, /**< If missed, wait for next vblank */ + _DRM_VBLANK_SECONDARY = 0x20000000, /**< Secondary display controller */ + _DRM_VBLANK_SIGNAL = 0x40000000 /**< Send signal instead of blocking, unsupported */ +}; +#define _DRM_VBLANK_HIGH_CRTC_SHIFT 1 + +#define _DRM_VBLANK_TYPES_MASK (_DRM_VBLANK_ABSOLUTE | _DRM_VBLANK_RELATIVE) +#define _DRM_VBLANK_FLAGS_MASK (_DRM_VBLANK_EVENT | _DRM_VBLANK_SIGNAL | \ + _DRM_VBLANK_SECONDARY | _DRM_VBLANK_NEXTONMISS) + +struct drm_wait_vblank_request { + enum drm_vblank_seq_type type; + unsigned int sequence; + unsigned long signal; +}; + +struct drm_wait_vblank_reply { + enum drm_vblank_seq_type type; + unsigned int sequence; + long tval_sec; + long tval_usec; +}; + +/* + * DRM_IOCTL_WAIT_VBLANK ioctl argument type. + * + * \sa drmWaitVBlank(). + */ +union drm_wait_vblank { + struct drm_wait_vblank_request request; + struct drm_wait_vblank_reply reply; +}; + +#define _DRM_PRE_MODESET 1 +#define _DRM_POST_MODESET 2 + +/* + * DRM_IOCTL_MODESET_CTL ioctl argument type + * + * \sa drmModesetCtl(). + */ +struct drm_modeset_ctl { + __u32 crtc; + __u32 cmd; +}; + +/* + * DRM_IOCTL_AGP_ENABLE ioctl argument type. + * + * \sa drmAgpEnable(). + */ +struct drm_agp_mode { + unsigned long mode; /**< AGP mode */ +}; + +/* + * DRM_IOCTL_AGP_ALLOC and DRM_IOCTL_AGP_FREE ioctls argument type. + * + * \sa drmAgpAlloc() and drmAgpFree(). + */ +struct drm_agp_buffer { + unsigned long size; /**< In bytes -- will round to page boundary */ + unsigned long handle; /**< Used for binding / unbinding */ + unsigned long type; /**< Type of memory to allocate */ + unsigned long physical; /**< Physical used by i810 */ +}; + +/* + * DRM_IOCTL_AGP_BIND and DRM_IOCTL_AGP_UNBIND ioctls argument type. + * + * \sa drmAgpBind() and drmAgpUnbind(). + */ +struct drm_agp_binding { + unsigned long handle; /**< From drm_agp_buffer */ + unsigned long offset; /**< In bytes -- will round to page boundary */ +}; + +/* + * DRM_IOCTL_AGP_INFO ioctl argument type. + * + * \sa drmAgpVersionMajor(), drmAgpVersionMinor(), drmAgpGetMode(), + * drmAgpBase(), drmAgpSize(), drmAgpMemoryUsed(), drmAgpMemoryAvail(), + * drmAgpVendorId() and drmAgpDeviceId(). + */ +struct drm_agp_info { + int agp_version_major; + int agp_version_minor; + unsigned long mode; + unsigned long aperture_base; /* physical address */ + unsigned long aperture_size; /* bytes */ + unsigned long memory_allowed; /* bytes */ + unsigned long memory_used; + + /* PCI information */ + unsigned short id_vendor; + unsigned short id_device; +}; + +/* + * DRM_IOCTL_SG_ALLOC ioctl argument type. + */ +struct drm_scatter_gather { + unsigned long size; /**< In bytes -- will round to page boundary */ + unsigned long handle; /**< Used for mapping / unmapping */ +}; + +/* + * DRM_IOCTL_SET_VERSION ioctl argument type. 
+ */ +struct drm_set_version { + int drm_di_major; + int drm_di_minor; + int drm_dd_major; + int drm_dd_minor; +}; + +/* DRM_IOCTL_GEM_CLOSE ioctl argument type */ +struct drm_gem_close { + /** Handle of the object to be closed. */ + __u32 handle; + __u32 pad; +}; + +/* DRM_IOCTL_GEM_FLINK ioctl argument type */ +struct drm_gem_flink { + /** Handle for the object being named */ + __u32 handle; + + /** Returned global name */ + __u32 name; +}; + +/* DRM_IOCTL_GEM_OPEN ioctl argument type */ +struct drm_gem_open { + /** Name of object being opened */ + __u32 name; + + /** Returned handle for the object */ + __u32 handle; + + /** Returned size of the object */ + __u64 size; +}; + +/** + * DRM_CAP_DUMB_BUFFER + * + * If set to 1, the driver supports creating dumb buffers via the + * &DRM_IOCTL_MODE_CREATE_DUMB ioctl. + */ +#define DRM_CAP_DUMB_BUFFER 0x1 +/** + * DRM_CAP_VBLANK_HIGH_CRTC + * + * If set to 1, the kernel supports specifying a :ref:`CRTC index` + * in the high bits of &drm_wait_vblank_request.type. + * + * Starting kernel version 2.6.39, this capability is always set to 1. + */ +#define DRM_CAP_VBLANK_HIGH_CRTC 0x2 +/** + * DRM_CAP_DUMB_PREFERRED_DEPTH + * + * The preferred bit depth for dumb buffers. + * + * The bit depth is the number of bits used to indicate the color of a single + * pixel excluding any padding. This is different from the number of bits per + * pixel. For instance, XRGB8888 has a bit depth of 24 but has 32 bits per + * pixel. + * + * Note that this preference only applies to dumb buffers, it's irrelevant for + * other types of buffers. + */ +#define DRM_CAP_DUMB_PREFERRED_DEPTH 0x3 +/** + * DRM_CAP_DUMB_PREFER_SHADOW + * + * If set to 1, the driver prefers userspace to render to a shadow buffer + * instead of directly rendering to a dumb buffer. For best speed, userspace + * should do streaming ordered memory copies into the dumb buffer and never + * read from it. + * + * Note that this preference only applies to dumb buffers, it's irrelevant for + * other types of buffers. + */ +#define DRM_CAP_DUMB_PREFER_SHADOW 0x4 +/** + * DRM_CAP_PRIME + * + * Bitfield of supported PRIME sharing capabilities. See &DRM_PRIME_CAP_IMPORT + * and &DRM_PRIME_CAP_EXPORT. + * + * Starting from kernel version 6.6, both &DRM_PRIME_CAP_IMPORT and + * &DRM_PRIME_CAP_EXPORT are always advertised. + * + * PRIME buffers are exposed as dma-buf file descriptors. + * See :ref:`prime_buffer_sharing`. + */ +#define DRM_CAP_PRIME 0x5 +/** + * DRM_PRIME_CAP_IMPORT + * + * If this bit is set in &DRM_CAP_PRIME, the driver supports importing PRIME + * buffers via the &DRM_IOCTL_PRIME_FD_TO_HANDLE ioctl. + * + * Starting from kernel version 6.6, this bit is always set in &DRM_CAP_PRIME. + */ +#define DRM_PRIME_CAP_IMPORT 0x1 +/** + * DRM_PRIME_CAP_EXPORT + * + * If this bit is set in &DRM_CAP_PRIME, the driver supports exporting PRIME + * buffers via the &DRM_IOCTL_PRIME_HANDLE_TO_FD ioctl. + * + * Starting from kernel version 6.6, this bit is always set in &DRM_CAP_PRIME. + */ +#define DRM_PRIME_CAP_EXPORT 0x2 +/** + * DRM_CAP_TIMESTAMP_MONOTONIC + * + * If set to 0, the kernel will report timestamps with ``CLOCK_REALTIME`` in + * struct drm_event_vblank. If set to 1, the kernel will report timestamps with + * ``CLOCK_MONOTONIC``. See ``clock_gettime(2)`` for the definition of these + * clocks. + * + * Starting from kernel version 2.6.39, the default value for this capability + * is 1. Starting kernel version 4.15, this capability is always set to 1. 
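The DRM_CAP_* values above are read back through the DRM_IOCTL_GET_CAP ioctl defined further down in this header. A small sketch (helper name invented) of checking whether PRIME export is advertised:

// True if the opened DRM device advertises PRIME export support.
static bool drm_supports_prime_export(int fd) {
    struct drm_get_cap cap = {0};
    cap.capability = DRM_CAP_PRIME;
    if (ioctl(fd, DRM_IOCTL_GET_CAP, &cap) != 0)
        return false;
    return (cap.value & DRM_PRIME_CAP_EXPORT) != 0;
}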
+ */ +#define DRM_CAP_TIMESTAMP_MONOTONIC 0x6 +/** + * DRM_CAP_ASYNC_PAGE_FLIP + * + * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for legacy + * page-flips. + */ +#define DRM_CAP_ASYNC_PAGE_FLIP 0x7 +/** + * DRM_CAP_CURSOR_WIDTH + * + * The ``CURSOR_WIDTH`` and ``CURSOR_HEIGHT`` capabilities return a valid + * width x height combination for the hardware cursor. The intention is that a + * hardware agnostic userspace can query a cursor plane size to use. + * + * Note that the cross-driver contract is to merely return a valid size; + * drivers are free to attach another meaning on top, eg. i915 returns the + * maximum plane size. + */ +#define DRM_CAP_CURSOR_WIDTH 0x8 +/** + * DRM_CAP_CURSOR_HEIGHT + * + * See &DRM_CAP_CURSOR_WIDTH. + */ +#define DRM_CAP_CURSOR_HEIGHT 0x9 +/** + * DRM_CAP_ADDFB2_MODIFIERS + * + * If set to 1, the driver supports supplying modifiers in the + * &DRM_IOCTL_MODE_ADDFB2 ioctl. + */ +#define DRM_CAP_ADDFB2_MODIFIERS 0x10 +/** + * DRM_CAP_PAGE_FLIP_TARGET + * + * If set to 1, the driver supports the &DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE and + * &DRM_MODE_PAGE_FLIP_TARGET_RELATIVE flags in + * &drm_mode_crtc_page_flip_target.flags for the &DRM_IOCTL_MODE_PAGE_FLIP + * ioctl. + */ +#define DRM_CAP_PAGE_FLIP_TARGET 0x11 +/** + * DRM_CAP_CRTC_IN_VBLANK_EVENT + * + * If set to 1, the kernel supports reporting the CRTC ID in + * &drm_event_vblank.crtc_id for the &DRM_EVENT_VBLANK and + * &DRM_EVENT_FLIP_COMPLETE events. + * + * Starting kernel version 4.12, this capability is always set to 1. + */ +#define DRM_CAP_CRTC_IN_VBLANK_EVENT 0x12 +/** + * DRM_CAP_SYNCOBJ + * + * If set to 1, the driver supports sync objects. See :ref:`drm_sync_objects`. + */ +#define DRM_CAP_SYNCOBJ 0x13 +/** + * DRM_CAP_SYNCOBJ_TIMELINE + * + * If set to 1, the driver supports timeline operations on sync objects. See + * :ref:`drm_sync_objects`. + */ +#define DRM_CAP_SYNCOBJ_TIMELINE 0x14 +/** + * DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP + * + * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for atomic + * commits. + */ +#define DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP 0x15 + +/* DRM_IOCTL_GET_CAP ioctl argument type */ +struct drm_get_cap { + __u64 capability; + __u64 value; +}; + +/** + * DRM_CLIENT_CAP_STEREO_3D + * + * If set to 1, the DRM core will expose the stereo 3D capabilities of the + * monitor by advertising the supported 3D layouts in the flags of struct + * drm_mode_modeinfo. See ``DRM_MODE_FLAG_3D_*``. + * + * This capability is always supported for all drivers starting from kernel + * version 3.13. + */ +#define DRM_CLIENT_CAP_STEREO_3D 1 + +/** + * DRM_CLIENT_CAP_UNIVERSAL_PLANES + * + * If set to 1, the DRM core will expose all planes (overlay, primary, and + * cursor) to userspace. + * + * This capability has been introduced in kernel version 3.15. Starting from + * kernel version 3.17, this capability is always supported for all drivers. + */ +#define DRM_CLIENT_CAP_UNIVERSAL_PLANES 2 + +/** + * DRM_CLIENT_CAP_ATOMIC + * + * If set to 1, the DRM core will expose atomic properties to userspace. This + * implicitly enables &DRM_CLIENT_CAP_UNIVERSAL_PLANES and + * &DRM_CLIENT_CAP_ASPECT_RATIO. + * + * If the driver doesn't support atomic mode-setting, enabling this capability + * will fail with -EOPNOTSUPP. + * + * This capability has been introduced in kernel version 4.0. Starting from + * kernel version 4.2, this capability is always supported for atomic-capable + * drivers. 
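Client capabilities are the write-side counterpart: a client opts into them with DRM_IOCTL_SET_CLIENT_CAP and the DRM_CLIENT_CAP_* values defined just below. A sketch (helper name invented) of enabling atomic modesetting, which as noted also implies universal planes and aspect-ratio support:

// Opt the calling client into atomic modesetting; the ioctl fails with
// EOPNOTSUPP when the driver is not atomic-capable.
static int drm_enable_atomic(int fd) {
    struct drm_set_client_cap cap = {0};
    cap.capability = DRM_CLIENT_CAP_ATOMIC;
    cap.value      = 1;
    return ioctl(fd, DRM_IOCTL_SET_CLIENT_CAP, &cap);
}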
+ */ +#define DRM_CLIENT_CAP_ATOMIC 3 + +/** + * DRM_CLIENT_CAP_ASPECT_RATIO + * + * If set to 1, the DRM core will provide aspect ratio information in modes. + * See ``DRM_MODE_FLAG_PIC_AR_*``. + * + * This capability is always supported for all drivers starting from kernel + * version 4.18. + */ +#define DRM_CLIENT_CAP_ASPECT_RATIO 4 + +/** + * DRM_CLIENT_CAP_WRITEBACK_CONNECTORS + * + * If set to 1, the DRM core will expose special connectors to be used for + * writing back to memory the scene setup in the commit. The client must enable + * &DRM_CLIENT_CAP_ATOMIC first. + * + * This capability is always supported for atomic-capable drivers starting from + * kernel version 4.19. + */ +#define DRM_CLIENT_CAP_WRITEBACK_CONNECTORS 5 + +/** + * DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT + * + * Drivers for para-virtualized hardware (e.g. vmwgfx, qxl, virtio and + * virtualbox) have additional restrictions for cursor planes (thus + * making cursor planes on those drivers not truly universal,) e.g. + * they need cursor planes to act like one would expect from a mouse + * cursor and have correctly set hotspot properties. + * If this client cap is not set the DRM core will hide cursor plane on + * those virtualized drivers because not setting it implies that the + * client is not capable of dealing with those extra restictions. + * Clients which do set cursor hotspot and treat the cursor plane + * like a mouse cursor should set this property. + * The client must enable &DRM_CLIENT_CAP_ATOMIC first. + * + * Setting this property on drivers which do not special case + * cursor planes (i.e. non-virtualized drivers) will return + * EOPNOTSUPP, which can be used by userspace to gauge + * requirements of the hardware/drivers they're running on. + * + * This capability is always supported for atomic-capable virtualized + * drivers starting from kernel version 6.6. + */ +#define DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT 6 + +/* DRM_IOCTL_SET_CLIENT_CAP ioctl argument type */ +struct drm_set_client_cap { + __u64 capability; + __u64 value; +}; + +#define DRM_RDWR O_RDWR +#define DRM_CLOEXEC O_CLOEXEC +struct drm_prime_handle { + __u32 handle; + + /** Flags.. only applicable for handle->fd */ + __u32 flags; + + /** Returned dmabuf file descriptor */ + __s32 fd; +}; + +struct drm_syncobj_create { + __u32 handle; +#define DRM_SYNCOBJ_CREATE_SIGNALED (1 << 0) + __u32 flags; +}; + +struct drm_syncobj_destroy { + __u32 handle; + __u32 pad; +}; + +#define DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE (1 << 0) +#define DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE (1 << 0) +struct drm_syncobj_handle { + __u32 handle; + __u32 flags; + + __s32 fd; + __u32 pad; +}; + +struct drm_syncobj_transfer { + __u32 src_handle; + __u32 dst_handle; + __u64 src_point; + __u64 dst_point; + __u32 flags; + __u32 pad; +}; + +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL (1 << 0) +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT (1 << 1) +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE (1 << 2) /* wait for time point to become available */ +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE (1 << 3) /* set fence deadline to deadline_nsec */ +struct drm_syncobj_wait { + __u64 handles; + /* absolute timeout */ + __s64 timeout_nsec; + __u32 count_handles; + __u32 flags; + __u32 first_signaled; /* only valid when not waiting all */ + __u32 pad; + /** + * @deadline_nsec - fence deadline hint + * + * Deadline hint, in absolute CLOCK_MONOTONIC, to set on backing + * fence(s) if the DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE flag is + * set. 
+ */ + __u64 deadline_nsec; +}; + +struct drm_syncobj_timeline_wait { + __u64 handles; + /* wait on specific timeline point for every handles*/ + __u64 points; + /* absolute timeout */ + __s64 timeout_nsec; + __u32 count_handles; + __u32 flags; + __u32 first_signaled; /* only valid when not waiting all */ + __u32 pad; + /** + * @deadline_nsec - fence deadline hint + * + * Deadline hint, in absolute CLOCK_MONOTONIC, to set on backing + * fence(s) if the DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE flag is + * set. + */ + __u64 deadline_nsec; +}; + +/** + * struct drm_syncobj_eventfd + * @handle: syncobj handle. + * @flags: Zero to wait for the point to be signalled, or + * &DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE to wait for a fence to be + * available for the point. + * @point: syncobj timeline point (set to zero for binary syncobjs). + * @fd: Existing eventfd to sent events to. + * @pad: Must be zero. + * + * Register an eventfd to be signalled by a syncobj. The eventfd counter will + * be incremented by one. + */ +struct drm_syncobj_eventfd { + __u32 handle; + __u32 flags; + __u64 point; + __s32 fd; + __u32 pad; +}; + + +struct drm_syncobj_array { + __u64 handles; + __u32 count_handles; + __u32 pad; +}; + +#define DRM_SYNCOBJ_QUERY_FLAGS_LAST_SUBMITTED (1 << 0) /* last available point on timeline syncobj */ +struct drm_syncobj_timeline_array { + __u64 handles; + __u64 points; + __u32 count_handles; + __u32 flags; +}; + + +/* Query current scanout sequence number */ +struct drm_crtc_get_sequence { + __u32 crtc_id; /* requested crtc_id */ + __u32 active; /* return: crtc output is active */ + __u64 sequence; /* return: most recent vblank sequence */ + __s64 sequence_ns; /* return: most recent time of first pixel out */ +}; + +/* Queue event to be delivered at specified sequence. Time stamp marks + * when the first pixel of the refresh cycle leaves the display engine + * for the display + */ +#define DRM_CRTC_SEQUENCE_RELATIVE 0x00000001 /* sequence is relative to current */ +#define DRM_CRTC_SEQUENCE_NEXT_ON_MISS 0x00000002 /* Use next sequence if we've missed */ + +struct drm_crtc_queue_sequence { + __u32 crtc_id; + __u32 flags; + __u64 sequence; /* on input, target sequence. on output, actual sequence */ + __u64 user_data; /* user data passed to event */ +}; + +#if defined(__cplusplus) +} +#endif + +#include "drm_mode.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +#define DRM_IOCTL_BASE 'd' +#define DRM_IO(nr) _IO(DRM_IOCTL_BASE,nr) +#define DRM_IOR(nr,type) _IOR(DRM_IOCTL_BASE,nr,type) +#define DRM_IOW(nr,type) _IOW(DRM_IOCTL_BASE,nr,type) +#define DRM_IOWR(nr,type) _IOWR(DRM_IOCTL_BASE,nr,type) + +#define DRM_IOCTL_VERSION DRM_IOWR(0x00, struct drm_version) +#define DRM_IOCTL_GET_UNIQUE DRM_IOWR(0x01, struct drm_unique) +#define DRM_IOCTL_GET_MAGIC DRM_IOR( 0x02, struct drm_auth) +#define DRM_IOCTL_IRQ_BUSID DRM_IOWR(0x03, struct drm_irq_busid) +#define DRM_IOCTL_GET_MAP DRM_IOWR(0x04, struct drm_map) +#define DRM_IOCTL_GET_CLIENT DRM_IOWR(0x05, struct drm_client) +#define DRM_IOCTL_GET_STATS DRM_IOR( 0x06, struct drm_stats) +#define DRM_IOCTL_SET_VERSION DRM_IOWR(0x07, struct drm_set_version) +#define DRM_IOCTL_MODESET_CTL DRM_IOW(0x08, struct drm_modeset_ctl) +/** + * DRM_IOCTL_GEM_CLOSE - Close a GEM handle. + * + * GEM handles are not reference-counted by the kernel. User-space is + * responsible for managing their lifetime. 
For example, if user-space imports + * the same memory object twice on the same DRM file description, the same GEM + * handle is returned by both imports, and user-space needs to ensure + * &DRM_IOCTL_GEM_CLOSE is performed once only. The same situation can happen + * when a memory object is allocated, then exported and imported again on the + * same DRM file description. The &DRM_IOCTL_MODE_GETFB2 IOCTL is an exception + * and always returns fresh new GEM handles even if an existing GEM handle + * already refers to the same memory object before the IOCTL is performed. + */ +#define DRM_IOCTL_GEM_CLOSE DRM_IOW (0x09, struct drm_gem_close) +#define DRM_IOCTL_GEM_FLINK DRM_IOWR(0x0a, struct drm_gem_flink) +#define DRM_IOCTL_GEM_OPEN DRM_IOWR(0x0b, struct drm_gem_open) +#define DRM_IOCTL_GET_CAP DRM_IOWR(0x0c, struct drm_get_cap) +#define DRM_IOCTL_SET_CLIENT_CAP DRM_IOW( 0x0d, struct drm_set_client_cap) + +#define DRM_IOCTL_SET_UNIQUE DRM_IOW( 0x10, struct drm_unique) +#define DRM_IOCTL_AUTH_MAGIC DRM_IOW( 0x11, struct drm_auth) +#define DRM_IOCTL_BLOCK DRM_IOWR(0x12, struct drm_block) +#define DRM_IOCTL_UNBLOCK DRM_IOWR(0x13, struct drm_block) +#define DRM_IOCTL_CONTROL DRM_IOW( 0x14, struct drm_control) +#define DRM_IOCTL_ADD_MAP DRM_IOWR(0x15, struct drm_map) +#define DRM_IOCTL_ADD_BUFS DRM_IOWR(0x16, struct drm_buf_desc) +#define DRM_IOCTL_MARK_BUFS DRM_IOW( 0x17, struct drm_buf_desc) +#define DRM_IOCTL_INFO_BUFS DRM_IOWR(0x18, struct drm_buf_info) +#define DRM_IOCTL_MAP_BUFS DRM_IOWR(0x19, struct drm_buf_map) +#define DRM_IOCTL_FREE_BUFS DRM_IOW( 0x1a, struct drm_buf_free) + +#define DRM_IOCTL_RM_MAP DRM_IOW( 0x1b, struct drm_map) + +#define DRM_IOCTL_SET_SAREA_CTX DRM_IOW( 0x1c, struct drm_ctx_priv_map) +#define DRM_IOCTL_GET_SAREA_CTX DRM_IOWR(0x1d, struct drm_ctx_priv_map) + +#define DRM_IOCTL_SET_MASTER DRM_IO(0x1e) +#define DRM_IOCTL_DROP_MASTER DRM_IO(0x1f) + +#define DRM_IOCTL_ADD_CTX DRM_IOWR(0x20, struct drm_ctx) +#define DRM_IOCTL_RM_CTX DRM_IOWR(0x21, struct drm_ctx) +#define DRM_IOCTL_MOD_CTX DRM_IOW( 0x22, struct drm_ctx) +#define DRM_IOCTL_GET_CTX DRM_IOWR(0x23, struct drm_ctx) +#define DRM_IOCTL_SWITCH_CTX DRM_IOW( 0x24, struct drm_ctx) +#define DRM_IOCTL_NEW_CTX DRM_IOW( 0x25, struct drm_ctx) +#define DRM_IOCTL_RES_CTX DRM_IOWR(0x26, struct drm_ctx_res) +#define DRM_IOCTL_ADD_DRAW DRM_IOWR(0x27, struct drm_draw) +#define DRM_IOCTL_RM_DRAW DRM_IOWR(0x28, struct drm_draw) +#define DRM_IOCTL_DMA DRM_IOWR(0x29, struct drm_dma) +#define DRM_IOCTL_LOCK DRM_IOW( 0x2a, struct drm_lock) +#define DRM_IOCTL_UNLOCK DRM_IOW( 0x2b, struct drm_lock) +#define DRM_IOCTL_FINISH DRM_IOW( 0x2c, struct drm_lock) + +/** + * DRM_IOCTL_PRIME_HANDLE_TO_FD - Convert a GEM handle to a DMA-BUF FD. + * + * User-space sets &drm_prime_handle.handle with the GEM handle to export and + * &drm_prime_handle.flags, and gets back a DMA-BUF file descriptor in + * &drm_prime_handle.fd. + * + * The export can fail for any driver-specific reason, e.g. because export is + * not supported for this specific GEM handle (but might be for others). + * + * Support for exporting DMA-BUFs is advertised via &DRM_PRIME_CAP_EXPORT. + */ +#define DRM_IOCTL_PRIME_HANDLE_TO_FD DRM_IOWR(0x2d, struct drm_prime_handle) +/** + * DRM_IOCTL_PRIME_FD_TO_HANDLE - Convert a DMA-BUF FD to a GEM handle. + * + * User-space sets &drm_prime_handle.fd with a DMA-BUF file descriptor to + * import, and gets back a GEM handle in &drm_prime_handle.handle. + * &drm_prime_handle.flags is unused. 
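Because GEM handles are not reference-counted, a common pattern on the export side is to convert a handle to a DMA-BUF FD and immediately close the handle, keeping only the FD alive. A minimal sketch (function name invented; DRM_CLOEXEC/DRM_RDWR come from the O_* flags in fcntl.h):

// Export a GEM handle as a DMA-BUF fd and drop the handle; returns the fd or -1.
static int drm_export_and_close(int fd, __u32 gem_handle) {
    struct drm_prime_handle prime = {0};
    prime.handle = gem_handle;
    prime.flags  = DRM_CLOEXEC | DRM_RDWR;
    if (ioctl(fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &prime) != 0)
        return -1;

    struct drm_gem_close gem_close = {0};
    gem_close.handle = gem_handle;          // handles are not ref-counted: close once exported
    ioctl(fd, DRM_IOCTL_GEM_CLOSE, &gem_close);
    return prime.fd;
}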
+ * + * If an existing GEM handle refers to the memory object backing the DMA-BUF, + * that GEM handle is returned. Therefore user-space which needs to handle + * arbitrary DMA-BUFs must have a user-space lookup data structure to manually + * reference-count duplicated GEM handles. For more information see + * &DRM_IOCTL_GEM_CLOSE. + * + * The import can fail for any driver-specific reason, e.g. because import is + * only supported for DMA-BUFs allocated on this DRM device. + * + * Support for importing DMA-BUFs is advertised via &DRM_PRIME_CAP_IMPORT. + */ +#define DRM_IOCTL_PRIME_FD_TO_HANDLE DRM_IOWR(0x2e, struct drm_prime_handle) + +#define DRM_IOCTL_AGP_ACQUIRE DRM_IO( 0x30) +#define DRM_IOCTL_AGP_RELEASE DRM_IO( 0x31) +#define DRM_IOCTL_AGP_ENABLE DRM_IOW( 0x32, struct drm_agp_mode) +#define DRM_IOCTL_AGP_INFO DRM_IOR( 0x33, struct drm_agp_info) +#define DRM_IOCTL_AGP_ALLOC DRM_IOWR(0x34, struct drm_agp_buffer) +#define DRM_IOCTL_AGP_FREE DRM_IOW( 0x35, struct drm_agp_buffer) +#define DRM_IOCTL_AGP_BIND DRM_IOW( 0x36, struct drm_agp_binding) +#define DRM_IOCTL_AGP_UNBIND DRM_IOW( 0x37, struct drm_agp_binding) + +#define DRM_IOCTL_SG_ALLOC DRM_IOWR(0x38, struct drm_scatter_gather) +#define DRM_IOCTL_SG_FREE DRM_IOW( 0x39, struct drm_scatter_gather) + +#define DRM_IOCTL_WAIT_VBLANK DRM_IOWR(0x3a, union drm_wait_vblank) + +#define DRM_IOCTL_CRTC_GET_SEQUENCE DRM_IOWR(0x3b, struct drm_crtc_get_sequence) +#define DRM_IOCTL_CRTC_QUEUE_SEQUENCE DRM_IOWR(0x3c, struct drm_crtc_queue_sequence) + +#define DRM_IOCTL_UPDATE_DRAW DRM_IOW(0x3f, struct drm_update_draw) + +#define DRM_IOCTL_MODE_GETRESOURCES DRM_IOWR(0xA0, struct drm_mode_card_res) +#define DRM_IOCTL_MODE_GETCRTC DRM_IOWR(0xA1, struct drm_mode_crtc) +#define DRM_IOCTL_MODE_SETCRTC DRM_IOWR(0xA2, struct drm_mode_crtc) +#define DRM_IOCTL_MODE_CURSOR DRM_IOWR(0xA3, struct drm_mode_cursor) +#define DRM_IOCTL_MODE_GETGAMMA DRM_IOWR(0xA4, struct drm_mode_crtc_lut) +#define DRM_IOCTL_MODE_SETGAMMA DRM_IOWR(0xA5, struct drm_mode_crtc_lut) +#define DRM_IOCTL_MODE_GETENCODER DRM_IOWR(0xA6, struct drm_mode_get_encoder) +#define DRM_IOCTL_MODE_GETCONNECTOR DRM_IOWR(0xA7, struct drm_mode_get_connector) +#define DRM_IOCTL_MODE_ATTACHMODE DRM_IOWR(0xA8, struct drm_mode_mode_cmd) /* deprecated (never worked) */ +#define DRM_IOCTL_MODE_DETACHMODE DRM_IOWR(0xA9, struct drm_mode_mode_cmd) /* deprecated (never worked) */ + +#define DRM_IOCTL_MODE_GETPROPERTY DRM_IOWR(0xAA, struct drm_mode_get_property) +#define DRM_IOCTL_MODE_SETPROPERTY DRM_IOWR(0xAB, struct drm_mode_connector_set_property) +#define DRM_IOCTL_MODE_GETPROPBLOB DRM_IOWR(0xAC, struct drm_mode_get_blob) +#define DRM_IOCTL_MODE_GETFB DRM_IOWR(0xAD, struct drm_mode_fb_cmd) +#define DRM_IOCTL_MODE_ADDFB DRM_IOWR(0xAE, struct drm_mode_fb_cmd) +/** + * DRM_IOCTL_MODE_RMFB - Remove a framebuffer. + * + * This removes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL + * argument is a framebuffer object ID. + * + * Warning: removing a framebuffer currently in-use on an enabled plane will + * disable that plane. The CRTC the plane is linked to may also be disabled + * (depending on driver capabilities). + */ +#define DRM_IOCTL_MODE_RMFB DRM_IOWR(0xAF, unsigned int) +#define DRM_IOCTL_MODE_PAGE_FLIP DRM_IOWR(0xB0, struct drm_mode_crtc_page_flip) +#define DRM_IOCTL_MODE_DIRTYFB DRM_IOWR(0xB1, struct drm_mode_fb_dirty_cmd) + +/** + * DRM_IOCTL_MODE_CREATE_DUMB - Create a new dumb buffer object. 
+ * + * KMS dumb buffers provide a very primitive way to allocate a buffer object + * suitable for scanout and map it for software rendering. KMS dumb buffers are + * not suitable for hardware-accelerated rendering nor video decoding. KMS dumb + * buffers are not suitable to be displayed on any other device than the KMS + * device where they were allocated from. Also see + * :ref:`kms_dumb_buffer_objects`. + * + * The IOCTL argument is a struct drm_mode_create_dumb. + * + * User-space is expected to create a KMS dumb buffer via this IOCTL, then add + * it as a KMS framebuffer via &DRM_IOCTL_MODE_ADDFB and map it via + * &DRM_IOCTL_MODE_MAP_DUMB. + * + * &DRM_CAP_DUMB_BUFFER indicates whether this IOCTL is supported. + * &DRM_CAP_DUMB_PREFERRED_DEPTH and &DRM_CAP_DUMB_PREFER_SHADOW indicate + * driver preferences for dumb buffers. + */ +#define DRM_IOCTL_MODE_CREATE_DUMB DRM_IOWR(0xB2, struct drm_mode_create_dumb) +#define DRM_IOCTL_MODE_MAP_DUMB DRM_IOWR(0xB3, struct drm_mode_map_dumb) +#define DRM_IOCTL_MODE_DESTROY_DUMB DRM_IOWR(0xB4, struct drm_mode_destroy_dumb) +#define DRM_IOCTL_MODE_GETPLANERESOURCES DRM_IOWR(0xB5, struct drm_mode_get_plane_res) +#define DRM_IOCTL_MODE_GETPLANE DRM_IOWR(0xB6, struct drm_mode_get_plane) +#define DRM_IOCTL_MODE_SETPLANE DRM_IOWR(0xB7, struct drm_mode_set_plane) +#define DRM_IOCTL_MODE_ADDFB2 DRM_IOWR(0xB8, struct drm_mode_fb_cmd2) +#define DRM_IOCTL_MODE_OBJ_GETPROPERTIES DRM_IOWR(0xB9, struct drm_mode_obj_get_properties) +#define DRM_IOCTL_MODE_OBJ_SETPROPERTY DRM_IOWR(0xBA, struct drm_mode_obj_set_property) +#define DRM_IOCTL_MODE_CURSOR2 DRM_IOWR(0xBB, struct drm_mode_cursor2) +#define DRM_IOCTL_MODE_ATOMIC DRM_IOWR(0xBC, struct drm_mode_atomic) +#define DRM_IOCTL_MODE_CREATEPROPBLOB DRM_IOWR(0xBD, struct drm_mode_create_blob) +#define DRM_IOCTL_MODE_DESTROYPROPBLOB DRM_IOWR(0xBE, struct drm_mode_destroy_blob) + +#define DRM_IOCTL_SYNCOBJ_CREATE DRM_IOWR(0xBF, struct drm_syncobj_create) +#define DRM_IOCTL_SYNCOBJ_DESTROY DRM_IOWR(0xC0, struct drm_syncobj_destroy) +#define DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD DRM_IOWR(0xC1, struct drm_syncobj_handle) +#define DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE DRM_IOWR(0xC2, struct drm_syncobj_handle) +#define DRM_IOCTL_SYNCOBJ_WAIT DRM_IOWR(0xC3, struct drm_syncobj_wait) +#define DRM_IOCTL_SYNCOBJ_RESET DRM_IOWR(0xC4, struct drm_syncobj_array) +#define DRM_IOCTL_SYNCOBJ_SIGNAL DRM_IOWR(0xC5, struct drm_syncobj_array) + +#define DRM_IOCTL_MODE_CREATE_LEASE DRM_IOWR(0xC6, struct drm_mode_create_lease) +#define DRM_IOCTL_MODE_LIST_LESSEES DRM_IOWR(0xC7, struct drm_mode_list_lessees) +#define DRM_IOCTL_MODE_GET_LEASE DRM_IOWR(0xC8, struct drm_mode_get_lease) +#define DRM_IOCTL_MODE_REVOKE_LEASE DRM_IOWR(0xC9, struct drm_mode_revoke_lease) + +#define DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT DRM_IOWR(0xCA, struct drm_syncobj_timeline_wait) +#define DRM_IOCTL_SYNCOBJ_QUERY DRM_IOWR(0xCB, struct drm_syncobj_timeline_array) +#define DRM_IOCTL_SYNCOBJ_TRANSFER DRM_IOWR(0xCC, struct drm_syncobj_transfer) +#define DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL DRM_IOWR(0xCD, struct drm_syncobj_timeline_array) + +/** + * DRM_IOCTL_MODE_GETFB2 - Get framebuffer metadata. + * + * This queries metadata about a framebuffer. User-space fills + * &drm_mode_fb_cmd2.fb_id as the input, and the kernels fills the rest of the + * struct as the output. + * + * If the client is DRM master or has &CAP_SYS_ADMIN, &drm_mode_fb_cmd2.handles + * will be filled with GEM buffer handles. 
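The create, map, and (optionally) ADDFB flow described above looks roughly like the sketch below; the helper name is invented, error handling is reduced to the minimum, and struct drm_mode_create_dumb / drm_mode_map_dumb are declared in the included drm_mode.h:

#include <sys/mman.h>

// Allocate a 32bpp dumb buffer and map it into the caller's address space.
// Returns MAP_FAILED on error; out_create receives handle/pitch/size.
static void *drm_map_dumb(int fd, __u32 width, __u32 height,
                          struct drm_mode_create_dumb *out_create) {
    struct drm_mode_create_dumb create = {0};
    create.width  = width;
    create.height = height;
    create.bpp    = 32;
    if (ioctl(fd, DRM_IOCTL_MODE_CREATE_DUMB, &create) != 0)
        return MAP_FAILED;

    struct drm_mode_map_dumb map = {0};
    map.handle = create.handle;             // ask the kernel for an mmap offset
    if (ioctl(fd, DRM_IOCTL_MODE_MAP_DUMB, &map) != 0)
        return MAP_FAILED;

    *out_create = create;                   // the handle can later be passed to ADDFB/ADDFB2
    return mmap(NULL, create.size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, map.offset);
}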
Fresh new GEM handles are always + * returned, even if another GEM handle referring to the same memory object + * already exists on the DRM file description. The caller is responsible for + * removing the new handles, e.g. via the &DRM_IOCTL_GEM_CLOSE IOCTL. The same + * new handle will be returned for multiple planes in case they use the same + * memory object. Planes are valid until one has a zero handle -- this can be + * used to compute the number of planes. + * + * Otherwise, &drm_mode_fb_cmd2.handles will be zeroed and planes are valid + * until one has a zero &drm_mode_fb_cmd2.pitches. + * + * If the framebuffer has a format modifier, &DRM_MODE_FB_MODIFIERS will be set + * in &drm_mode_fb_cmd2.flags and &drm_mode_fb_cmd2.modifier will contain the + * modifier. Otherwise, user-space must ignore &drm_mode_fb_cmd2.modifier. + * + * To obtain DMA-BUF FDs for each plane without leaking GEM handles, user-space + * can export each handle via &DRM_IOCTL_PRIME_HANDLE_TO_FD, then immediately + * close each unique handle via &DRM_IOCTL_GEM_CLOSE, making sure to not + * double-close handles which are specified multiple times in the array. + */ +#define DRM_IOCTL_MODE_GETFB2 DRM_IOWR(0xCE, struct drm_mode_fb_cmd2) + +#define DRM_IOCTL_SYNCOBJ_EVENTFD DRM_IOWR(0xCF, struct drm_syncobj_eventfd) + +/** + * DRM_IOCTL_MODE_CLOSEFB - Close a framebuffer. + * + * This closes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL + * argument is a framebuffer object ID. + * + * This IOCTL is similar to &DRM_IOCTL_MODE_RMFB, except it doesn't disable + * planes and CRTCs. As long as the framebuffer is used by a plane, it's kept + * alive. When the plane no longer uses the framebuffer (because the + * framebuffer is replaced with another one, or the plane is disabled), the + * framebuffer is cleaned up. + * + * This is useful to implement flicker-free transitions between two processes. + * + * Depending on the threat model, user-space may want to ensure that the + * framebuffer doesn't expose any sensitive user information: closed + * framebuffers attached to a plane can be read back by the next DRM master. + */ +#define DRM_IOCTL_MODE_CLOSEFB DRM_IOWR(0xD0, struct drm_mode_closefb) + +/* + * Device specific ioctls should only be in their respective headers + * The device specific ioctl range is from 0x40 to 0x9f. + * Generic IOCTLS restart at 0xA0. + * + * \sa drmCommandNone(), drmCommandRead(), drmCommandWrite(), and + * drmCommandReadWrite(). + */ +#define DRM_COMMAND_BASE 0x40 +#define DRM_COMMAND_END 0xA0 + +/** + * struct drm_event - Header for DRM events + * @type: event type. + * @length: total number of payload bytes (including header). + * + * This struct is a header for events written back to user-space on the DRM FD. + * A read on the DRM FD will always only return complete events: e.g. if the + * read buffer is 100 bytes large and there are two 64 byte events pending, + * only one will be returned. + * + * Event types 0 - 0x7fffffff are generic DRM events, 0x80000000 and + * up are chipset specific. Generic DRM events include &DRM_EVENT_VBLANK, + * &DRM_EVENT_FLIP_COMPLETE and &DRM_EVENT_CRTC_SEQUENCE. + */ +struct drm_event { + __u32 type; + __u32 length; +}; + +/** + * DRM_EVENT_VBLANK - vertical blanking event + * + * This event is sent in response to &DRM_IOCTL_WAIT_VBLANK with the + * &_DRM_VBLANK_EVENT flag set. + * + * The event payload is a struct drm_event_vblank. 
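Events delivered on the DRM fd follow the framing documented for struct drm_event below: a read() returns only whole events, each prefixed by its type and total length. A sketch of a drain loop (names invented, poll() on the fd omitted):

#include <unistd.h>

// Read and dispatch any pending DRM events; assumes the fd is readable.
static void drm_drain_events(int fd) {
    char buf[4096];
    ssize_t len = read(fd, buf, sizeof(buf));   // the kernel only returns complete events
    ssize_t off = 0;
    while (len > 0 && off + (ssize_t) sizeof(struct drm_event) <= len) {
        const struct drm_event *ev = (const struct drm_event *) (buf + off);
        if (ev->type == DRM_EVENT_VBLANK || ev->type == DRM_EVENT_FLIP_COMPLETE) {
            const struct drm_event_vblank *vb = (const struct drm_event_vblank *) ev;
            (void) vb;                          // vb->sequence, vb->tv_sec/tv_usec, vb->crtc_id
        }
        off += ev->length;                      // length includes the header
    }
}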
+ */ +#define DRM_EVENT_VBLANK 0x01 +/** + * DRM_EVENT_FLIP_COMPLETE - page-flip completion event + * + * This event is sent in response to an atomic commit or legacy page-flip with + * the &DRM_MODE_PAGE_FLIP_EVENT flag set. + * + * The event payload is a struct drm_event_vblank. + */ +#define DRM_EVENT_FLIP_COMPLETE 0x02 +/** + * DRM_EVENT_CRTC_SEQUENCE - CRTC sequence event + * + * This event is sent in response to &DRM_IOCTL_CRTC_QUEUE_SEQUENCE. + * + * The event payload is a struct drm_event_crtc_sequence. + */ +#define DRM_EVENT_CRTC_SEQUENCE 0x03 + +struct drm_event_vblank { + struct drm_event base; + __u64 user_data; + __u32 tv_sec; + __u32 tv_usec; + __u32 sequence; + __u32 crtc_id; /* 0 on older kernels that do not support this */ +}; + +/* Event delivered at sequence. Time stamp marks when the first pixel + * of the refresh cycle leaves the display engine for the display + */ +struct drm_event_crtc_sequence { + struct drm_event base; + __u64 user_data; + __s64 time_ns; + __u64 sequence; +}; + +/* typedef area */ +typedef struct drm_clip_rect drm_clip_rect_t; +typedef struct drm_drawable_info drm_drawable_info_t; +typedef struct drm_tex_region drm_tex_region_t; +typedef struct drm_hw_lock drm_hw_lock_t; +typedef struct drm_version drm_version_t; +typedef struct drm_unique drm_unique_t; +typedef struct drm_list drm_list_t; +typedef struct drm_block drm_block_t; +typedef struct drm_control drm_control_t; +typedef enum drm_map_type drm_map_type_t; +typedef enum drm_map_flags drm_map_flags_t; +typedef struct drm_ctx_priv_map drm_ctx_priv_map_t; +typedef struct drm_map drm_map_t; +typedef struct drm_client drm_client_t; +typedef enum drm_stat_type drm_stat_type_t; +typedef struct drm_stats drm_stats_t; +typedef enum drm_lock_flags drm_lock_flags_t; +typedef struct drm_lock drm_lock_t; +typedef enum drm_dma_flags drm_dma_flags_t; +typedef struct drm_buf_desc drm_buf_desc_t; +typedef struct drm_buf_info drm_buf_info_t; +typedef struct drm_buf_free drm_buf_free_t; +typedef struct drm_buf_pub drm_buf_pub_t; +typedef struct drm_buf_map drm_buf_map_t; +typedef struct drm_dma drm_dma_t; +typedef union drm_wait_vblank drm_wait_vblank_t; +typedef struct drm_agp_mode drm_agp_mode_t; +typedef enum drm_ctx_flags drm_ctx_flags_t; +typedef struct drm_ctx drm_ctx_t; +typedef struct drm_ctx_res drm_ctx_res_t; +typedef struct drm_draw drm_draw_t; +typedef struct drm_update_draw drm_update_draw_t; +typedef struct drm_auth drm_auth_t; +typedef struct drm_irq_busid drm_irq_busid_t; +typedef enum drm_vblank_seq_type drm_vblank_seq_type_t; + +typedef struct drm_agp_buffer drm_agp_buffer_t; +typedef struct drm_agp_binding drm_agp_binding_t; +typedef struct drm_agp_info drm_agp_info_t; +typedef struct drm_scatter_gather drm_scatter_gather_t; +typedef struct drm_set_version drm_set_version_t; + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/ggml/src/ggml-remotingfrontend/include/drm-uapi/virtgpu_drm.h b/ggml/src/ggml-remotingfrontend/include/drm-uapi/virtgpu_drm.h new file mode 100644 index 0000000000000..9debb320c34be --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/include/drm-uapi/virtgpu_drm.h @@ -0,0 +1,276 @@ +/* + * Copyright 2013 Red Hat + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef VIRTGPU_DRM_H +#define VIRTGPU_DRM_H + +#include "drm.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +/* Please note that modifications to all structs defined here are + * subject to backwards-compatibility constraints. + * + * Do not use pointers, use __u64 instead for 32 bit / 64 bit user/kernel + * compatibility Keep fields aligned to their size + */ + +#define DRM_VIRTGPU_MAP 0x01 +#define DRM_VIRTGPU_EXECBUFFER 0x02 +#define DRM_VIRTGPU_GETPARAM 0x03 +#define DRM_VIRTGPU_RESOURCE_CREATE 0x04 +#define DRM_VIRTGPU_RESOURCE_INFO 0x05 +#define DRM_VIRTGPU_TRANSFER_FROM_HOST 0x06 +#define DRM_VIRTGPU_TRANSFER_TO_HOST 0x07 +#define DRM_VIRTGPU_WAIT 0x08 +#define DRM_VIRTGPU_GET_CAPS 0x09 +#define DRM_VIRTGPU_RESOURCE_CREATE_BLOB 0x0a +#define DRM_VIRTGPU_CONTEXT_INIT 0x0b + +#define VIRTGPU_EXECBUF_FENCE_FD_IN 0x01 +#define VIRTGPU_EXECBUF_FENCE_FD_OUT 0x02 +#define VIRTGPU_EXECBUF_RING_IDX 0x04 +#define VIRTGPU_EXECBUF_FLAGS (\ + VIRTGPU_EXECBUF_FENCE_FD_IN |\ + VIRTGPU_EXECBUF_FENCE_FD_OUT |\ + VIRTGPU_EXECBUF_RING_IDX |\ + 0) + +struct drm_virtgpu_map { + __u64 offset; /* use for mmap system call */ + __u32 handle; + __u32 pad; +}; + +#define VIRTGPU_EXECBUF_SYNCOBJ_RESET 0x01 +#define VIRTGPU_EXECBUF_SYNCOBJ_FLAGS ( \ + VIRTGPU_EXECBUF_SYNCOBJ_RESET | \ + 0) +struct drm_virtgpu_execbuffer_syncobj { + __u32 handle; + __u32 flags; + __u64 point; +}; + +/* fence_fd is modified on success if VIRTGPU_EXECBUF_FENCE_FD_OUT flag is set. 
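The DRM_VIRTGPU_EXECBUFFER command listed above is what carries guest command streams to the host; the struct it takes is defined just below. A reduced sketch of a raw submission that requests an out-fence on a specific ring (helper name invented, this is the bare ioctl usage, not the wrapper used by the remoting frontend):

#include <stdint.h>

// Submit an opaque command buffer on ring `ring_idx`; returns an out-fence fd or -1.
static int virtgpu_submit(int fd, const void *cmd, __u32 cmd_size, __u32 ring_idx) {
    struct drm_virtgpu_execbuffer exbuf = {0};
    exbuf.flags    = VIRTGPU_EXECBUF_RING_IDX | VIRTGPU_EXECBUF_FENCE_FD_OUT;
    exbuf.command  = (__u64) (uintptr_t) cmd;
    exbuf.size     = cmd_size;
    exbuf.ring_idx = ring_idx;
    exbuf.fence_fd = -1;
    if (ioctl(fd, DRM_IOCTL_VIRTGPU_EXECBUFFER, &exbuf) != 0)
        return -1;
    return exbuf.fence_fd;                  // poll()able; signals when the host consumed the job
}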
*/ +struct drm_virtgpu_execbuffer { + __u32 flags; + __u32 size; + __u64 command; /* void* */ + __u64 bo_handles; + __u32 num_bo_handles; + __s32 fence_fd; /* in/out fence fd (see VIRTGPU_EXECBUF_FENCE_FD_IN/OUT) */ + __u32 ring_idx; /* command ring index (see VIRTGPU_EXECBUF_RING_IDX) */ + __u32 syncobj_stride; /* size of @drm_virtgpu_execbuffer_syncobj */ + __u32 num_in_syncobjs; + __u32 num_out_syncobjs; + __u64 in_syncobjs; + __u64 out_syncobjs; +}; + +#define VIRTGPU_PARAM_3D_FEATURES 1 /* do we have 3D features in the hw */ +#define VIRTGPU_PARAM_CAPSET_QUERY_FIX 2 /* do we have the capset fix */ +#define VIRTGPU_PARAM_RESOURCE_BLOB 3 /* DRM_VIRTGPU_RESOURCE_CREATE_BLOB */ +#define VIRTGPU_PARAM_HOST_VISIBLE 4 /* Host blob resources are mappable */ +#define VIRTGPU_PARAM_CROSS_DEVICE 5 /* Cross virtio-device resource sharing */ +#define VIRTGPU_PARAM_CONTEXT_INIT 6 /* DRM_VIRTGPU_CONTEXT_INIT */ +#define VIRTGPU_PARAM_SUPPORTED_CAPSET_IDs 7 /* Bitmask of supported capability set ids */ +#define VIRTGPU_PARAM_EXPLICIT_DEBUG_NAME 8 /* Ability to set debug name from userspace */ + +struct drm_virtgpu_getparam { + __u64 param; + __u64 value; +}; + +/* NO_BO flags? NO resource flag? */ +/* resource flag for y_0_top */ +struct drm_virtgpu_resource_create { + __u32 target; + __u32 format; + __u32 bind; + __u32 width; + __u32 height; + __u32 depth; + __u32 array_size; + __u32 last_level; + __u32 nr_samples; + __u32 flags; + __u32 bo_handle; /* if this is set - recreate a new resource attached to this bo ? */ + __u32 res_handle; /* returned by kernel */ + __u32 size; /* validate transfer in the host */ + __u32 stride; /* validate transfer in the host */ +}; + +struct drm_virtgpu_resource_info { + __u32 bo_handle; + __u32 res_handle; + __u32 size; + __u32 blob_mem; +}; + +struct drm_virtgpu_3d_box { + __u32 x; + __u32 y; + __u32 z; + __u32 w; + __u32 h; + __u32 d; +}; + +struct drm_virtgpu_3d_transfer_to_host { + __u32 bo_handle; + struct drm_virtgpu_3d_box box; + __u32 level; + __u32 offset; + __u32 stride; + __u32 layer_stride; +}; + +struct drm_virtgpu_3d_transfer_from_host { + __u32 bo_handle; + struct drm_virtgpu_3d_box box; + __u32 level; + __u32 offset; + __u32 stride; + __u32 layer_stride; +}; + +#define VIRTGPU_WAIT_NOWAIT 1 /* like it */ +struct drm_virtgpu_3d_wait { + __u32 handle; /* 0 is an invalid handle */ + __u32 flags; +}; + +#define VIRTGPU_DRM_CAPSET_VIRGL 1 +#define VIRTGPU_DRM_CAPSET_VIRGL2 2 +#define VIRTGPU_DRM_CAPSET_GFXSTREAM_VULKAN 3 +#define VIRTGPU_DRM_CAPSET_VENUS 4 +#define VIRTGPU_DRM_CAPSET_CROSS_DOMAIN 5 +#define VIRTGPU_DRM_CAPSET_DRM 6 +struct drm_virtgpu_get_caps { + __u32 cap_set_id; + __u32 cap_set_ver; + __u64 addr; + __u32 size; + __u32 pad; +}; + +struct drm_virtgpu_resource_create_blob { +#define VIRTGPU_BLOB_MEM_GUEST 0x0001 +#define VIRTGPU_BLOB_MEM_HOST3D 0x0002 +#define VIRTGPU_BLOB_MEM_HOST3D_GUEST 0x0003 + +#define VIRTGPU_BLOB_FLAG_USE_MAPPABLE 0x0001 +#define VIRTGPU_BLOB_FLAG_USE_SHAREABLE 0x0002 +#define VIRTGPU_BLOB_FLAG_USE_CROSS_DEVICE 0x0004 + /* zero is invalid blob_mem */ + __u32 blob_mem; + __u32 blob_flags; + __u32 bo_handle; + __u32 res_handle; + __u64 size; + + /* + * for 3D contexts with VIRTGPU_BLOB_MEM_HOST3D_GUEST and + * VIRTGPU_BLOB_MEM_HOST3D otherwise, must be zero. 
+ */ + __u32 pad; + __u32 cmd_size; + __u64 cmd; + __u64 blob_id; +}; + +#define VIRTGPU_CONTEXT_PARAM_CAPSET_ID 0x0001 +#define VIRTGPU_CONTEXT_PARAM_NUM_RINGS 0x0002 +#define VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK 0x0003 +#define VIRTGPU_CONTEXT_PARAM_DEBUG_NAME 0x0004 +struct drm_virtgpu_context_set_param { + __u64 param; + __u64 value; +}; + +struct drm_virtgpu_context_init { + __u32 num_params; + __u32 pad; + + /* pointer to drm_virtgpu_context_set_param array */ + __u64 ctx_set_params; +}; + +/* + * Event code that's given when VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK is in + * effect. The event size is sizeof(drm_event), since there is no additional + * payload. + */ +#define VIRTGPU_EVENT_FENCE_SIGNALED 0x90000000 + +#define DRM_IOCTL_VIRTGPU_MAP \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_MAP, struct drm_virtgpu_map) + +#define DRM_IOCTL_VIRTGPU_EXECBUFFER \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_EXECBUFFER,\ + struct drm_virtgpu_execbuffer) + +#define DRM_IOCTL_VIRTGPU_GETPARAM \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_GETPARAM,\ + struct drm_virtgpu_getparam) + +#define DRM_IOCTL_VIRTGPU_RESOURCE_CREATE \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_CREATE, \ + struct drm_virtgpu_resource_create) + +#define DRM_IOCTL_VIRTGPU_RESOURCE_INFO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_INFO, \ + struct drm_virtgpu_resource_info) + +#define DRM_IOCTL_VIRTGPU_TRANSFER_FROM_HOST \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_TRANSFER_FROM_HOST, \ + struct drm_virtgpu_3d_transfer_from_host) + +#define DRM_IOCTL_VIRTGPU_TRANSFER_TO_HOST \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_TRANSFER_TO_HOST, \ + struct drm_virtgpu_3d_transfer_to_host) + +#define DRM_IOCTL_VIRTGPU_WAIT \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_WAIT, \ + struct drm_virtgpu_3d_wait) + +#define DRM_IOCTL_VIRTGPU_GET_CAPS \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_GET_CAPS, \ + struct drm_virtgpu_get_caps) + +#define DRM_IOCTL_VIRTGPU_RESOURCE_CREATE_BLOB \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_CREATE_BLOB, \ + struct drm_virtgpu_resource_create_blob) + +#define DRM_IOCTL_VIRTGPU_CONTEXT_INIT \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_CONTEXT_INIT, \ + struct drm_virtgpu_context_init) + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/ggml/src/ggml-remotingfrontend/include/venus_hw.h b/ggml/src/ggml-remotingfrontend/include/venus_hw.h new file mode 100644 index 0000000000000..3ef774b8259d3 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/include/venus_hw.h @@ -0,0 +1,74 @@ +/* + * Copyright 2020 Chromium + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef VENUS_HW_H +#define VENUS_HW_H + +#include + +struct virgl_renderer_capset_venus { + uint32_t wire_format_version; + uint32_t vk_xml_version; + uint32_t vk_ext_command_serialization_spec_version; + uint32_t vk_mesa_venus_protocol_spec_version; + + /* This flag indicates render server config, and will be needed until drm + * virtio-gpu blob mem gets fixed to attach_resource before resource_map. + */ + uint32_t supports_blob_id_0; + + /* Extension number N, where N is defined by the Vulkan spec, corresponds + * to bit [N / 32] & (1 << N % 32). The below mask1 covers the first 1023 + * Vulkan extensions (numbered from 1 to 1023). + * + * Bit (mask1[0] & 0x1) is used for backward compatibility purpose. When + * that bit is set, the extension mask(s) are valid. Otherwise, all the + * extensions are assumed to be supported by the renderer side protocol. + */ + uint32_t vk_extension_mask1[32]; + + /* The single-threaded renderer cannot afford potential blocking calls. It + * also leads to GPU lost if the wait depends on a following command. This + * capset allows such blocking calls to passthrough from the clients, and + * shifts the responsibilities to the client drivers. + */ + uint32_t allow_vk_wait_syncs; + + /* This flag indicates that the renderer supports multiple fencing + * timelines. The client driver is expected to associate each VkQueue with + * one of these timelines at queue creation by binding it with an unused + * ring_idx. Queues created without a ring_idx binding are associated to a + * shared legacy timeline. The special ring_idx==0 is reserved for CPU + * fences that are signaled by the renderer immediately upon consumption of + * the associated renderer submission. + */ + uint32_t supports_multiple_timelines; + + /* This flag indicates to the guest that hypervisor does not support memory + * pages injections and blob allocations must be done by guest from the + * dedicated heap (Host visible memory). 
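The extension-mask encoding described above (extension N maps to bit N % 32 of word N / 32, with bit 0 of mask1[0] acting as a validity flag) can be decoded as follows; the helper name is invented for illustration:

#include <stdbool.h>
#include <stdint.h>

// True if the renderer advertises Vulkan extension number `ext_nr` (1..1023).
static bool venus_has_vk_extension(const struct virgl_renderer_capset_venus *caps,
                                   uint32_t ext_nr) {
    if (!(caps->vk_extension_mask1[0] & 0x1))
        return true;                        // masks not populated: all extensions assumed supported
    if (ext_nr / 32 >= 32)
        return false;                       // outside the range covered by mask1
    return (caps->vk_extension_mask1[ext_nr / 32] & (1u << (ext_nr % 32))) != 0;
}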
+ */ + uint32_t use_guest_vram; +}; + +#endif /* VENUS_HW_H */ diff --git a/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp new file mode 100644 index 0000000000000..7ce0dbb7fbc67 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp @@ -0,0 +1,87 @@ +#include +#include +#include +#include + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "../ggml-remotingbackend/shared/venus_cs_ggml-rpc.h" + +#include "ggml-remoting.h" + +rpc_tensor +serialize_tensor(const ggml_tensor * tensor) { + rpc_tensor result; + result.id = reinterpret_cast(tensor); + result.type = tensor->type; + if (tensor->buffer) { + ggml_backend_buffer_t buffer = tensor->buffer; + + result.buffer = BUFFER_TO_HOST_HANDLE(buffer); + } else { + result.buffer = 0; + } + for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) { + result.ne[i] = tensor->ne[i]; + result.nb[i] = tensor->nb[i]; + } + result.op = tensor->op; + for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) { + result.op_params[i] = tensor->op_params[i]; + } + result.flags = tensor->flags; + for (uint32_t i = 0; i < GGML_MAX_SRC; i++) { + result.src[i] = reinterpret_cast(tensor->src[i]); + } + result.view_src = reinterpret_cast(tensor->view_src); + result.view_offs = tensor->view_offs; + result.data = reinterpret_cast(tensor->data); + if (tensor->data) { + if (!tensor->buffer) { + FATAL("tensor has data but not buffer :/"); + } + // tensor->data is serialized as an offset to the buffer base address + result.data -= reinterpret_cast(BUFFER_TO_GGML_CONTEXT(tensor->buffer)->base); + } + snprintf(result.name, GGML_MAX_NAME, "%s", tensor->name); + return result; +} + +void +add_tensor(ggml_tensor * tensor, std::vector & tensors, std::unordered_set & visited) { + if (tensor == nullptr) { + return; + } + if (visited.find(tensor) != visited.end()) { + return; + } + visited.insert(tensor); + for (int i = 0; i < GGML_MAX_SRC; i++) { + add_tensor(tensor->src[i], tensors, visited); + } + add_tensor(tensor->view_src, tensors, visited); + tensors.push_back(serialize_tensor(tensor)); +} + +void +serialize_graph(const ggml_cgraph * cgraph, std::vector & output) { + uint32_t n_nodes = cgraph->n_nodes; + std::vector tensors; + std::unordered_set visited; + for (uint32_t i = 0; i < n_nodes; i++) { + add_tensor(cgraph->nodes[i], tensors, visited); + } + // serialization format: + // | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) | + uint32_t n_tensors = tensors.size(); + int output_size = sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(rpc_tensor); + output.resize(output_size, 0); + memcpy(output.data(), &n_nodes, sizeof(n_nodes)); + for (uint32_t i = 0; i < n_nodes; i++) { + memcpy(output.data() + sizeof(n_nodes) + i * sizeof(uint64_t), &cgraph->nodes[i], sizeof(uint64_t)); + } + uint32_t * out_ntensors = (uint32_t *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t)); + *out_ntensors = n_tensors; + rpc_tensor * out_tensors = (rpc_tensor *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t)); + memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor)); +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp new file mode 100644 index 0000000000000..8a7c9bea60212 --- /dev/null +++ 
b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp @@ -0,0 +1,54 @@ +#include "virtgpu-forward-impl.h" + +static long long current_time_ms() { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time + return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; +} + +ggml_status +apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE); + + std::vector cgraph_data; + size_t cgraph_size = vn_serialize_ggml_cgraph(cgraph, cgraph_data); + + struct vn_renderer_shmem *shmem; + if (cgraph_size > gpu->data_shmem->mmap_size) { + shmem = virtgpu_shmem_create(gpu, cgraph_size); + WARNING("%s: 0x%lx | %dkB | %dMB", __func__, cgraph_size, (int)cgraph_size/1024, (int)cgraph_size/1024/1024); + if (!shmem) { + FATAL("Couldn't allocate the guest-host shared buffer :/"); + } + } else { + shmem = gpu->data_shmem; + } + + //INFO("Send shmem ID %d", shmem->res_id); + vn_encode_virtgpu_shmem_res_id(encoder, shmem->res_id); + //INFO("Send shmem size %lu", cgraph_size); + vn_encode_size_t(encoder, &cgraph_size); + + char *shmem_data = (char *) shmem->mmap_ptr; + struct vn_cs_encoder secondary_enc = vn_cs_new_encoder(shmem_data, cgraph_size); + + vn_encode_cgraph_data(&secondary_enc, cgraph_data); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + ggml_status status = GGML_STATUS_ABORTED; + vn_decode_ggml_status(decoder, &status); + //INFO("Received status %u", status); + + remote_call_finish(gpu, encoder, decoder); + + if (shmem != gpu->data_shmem) { + virtgpu_shmem_destroy(gpu, shmem->shmem); + } + + return status; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp new file mode 100644 index 0000000000000..4b635f21a18c2 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp @@ -0,0 +1,119 @@ +#include "virtgpu-forward-impl.h" + +const char * +apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME); + + vn_encode_ggml_buffer_type(encoder, buft); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + const size_t string_size = vn_decode_array_size_unchecked(decoder); + char *string = (char *) vn_cs_decoder_alloc_array(decoder, sizeof(char), string_size); + if (!string) { + FATAL("%s: Could not allocate the device name buffer", __func__); + } + vn_decode_char_array(decoder, string, string_size); + + //INFO("%s: Forward BUFT NAME --> %s", __func__, string); + + /* *** */ + + remote_call_finish(gpu, encoder, decoder); + + return string; +} + +size_t +apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT); + + vn_encode_ggml_buffer_type(encoder, buft); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + size_t alignment; + vn_decode_size_t(decoder, &alignment); + + INFO("%s: Forward BUFT ALIGNMENT --> %zu ", __func__, alignment); + + remote_call_finish(gpu, encoder, decoder); + + return alignment; +} + +size_t +apir_buffer_type_get_max_size(struct virtgpu *gpu, 
ggml_backend_buffer_type_t buft) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE); + + vn_encode_ggml_buffer_type(encoder, buft); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + size_t max_size; + vn_decode_size_t(decoder, &max_size); + + INFO("%s: Forward BUFT MAX SIZE --> %zu ", __func__, max_size); + + remote_call_finish(gpu, encoder, decoder); + + return max_size; +} + +bool +apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST); + + vn_encode_ggml_buffer_type(encoder, buft); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + bool is_host; + vn_decode_bool_t(decoder, &is_host); + + INFO("%s: buffer is host? %d", __func__, is_host); + + remote_call_finish(gpu, encoder, decoder); + + return is_host; +} + +apir_buffer_context_t +apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t buft, size_t size) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + ApirForwardReturnCode ret; + + apir_buffer_context_t buffer_context; + INFO("%s: allocate device memory (%lu)", __func__, size); + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER); + + vn_encode_ggml_buffer_type(encoder, buft); + + vn_encode_size_t(encoder, &size); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + vn_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle); + + remote_call_finish(gpu, encoder, decoder); + + return buffer_context; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp new file mode 100644 index 0000000000000..cf160b133b04e --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -0,0 +1,148 @@ +#include "virtgpu-forward-impl.h" + +void * +apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_context_t *buffer_context) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_BASE); + + vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + uintptr_t base; + vn_decode_uintptr_t(decoder, &base); + + remote_call_finish(gpu, encoder, decoder); + + return (void *) base; +} + +void +apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, + ggml_tensor *tensor, const void *data, size_t offset, size_t size) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + ApirForwardReturnCode ret; + +#if 0 + INFO("Calling (%p)->set_tensor(tensor=%p, data=%p, offset=%lu, size=%lu", + buffer_context->host_handle, tensor, data, offset, size); +#endif + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_SET_TENSOR); + + vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); + vn_encode_ggml_tensor(encoder, tensor); + + struct vn_renderer_shmem *shmem; + if (size > gpu->data_shmem->mmap_size) { + shmem = virtgpu_shmem_create(gpu, size); + //WARNING("%s: 0x%lx | %dkB | %dMB", __func__, size, (int)size/1024, (int)size/1024/1024); + if (!shmem) { + FATAL("Couldn't allocate the guest-host shared buffer :/"); + } + } else { + shmem = gpu->data_shmem; + } + + memcpy(shmem->mmap_ptr, data, 
size); + vn_encode_virtgpu_shmem_res_id(encoder, shmem->res_id); + + vn_encode_size_t(encoder, &offset); + vn_encode_size_t(encoder, &size); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + remote_call_finish(gpu, encoder, decoder); + + if (shmem != gpu->data_shmem) { + virtgpu_shmem_destroy(gpu, shmem->shmem); + } + + return; +} + +#if false +void +apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, + const ggml_tensor *tensor, void *data, size_t offset, size_t size) { + UNUSED(gpu); + UNUSED(tensor); + char *buffer_base_addr = (char *) buffer_context->shmem->mmap_ptr; + + memcpy(data, buffer_base_addr+offset, size); +} +#else +void +apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, + const ggml_tensor *tensor, void *data, size_t offset, size_t size) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_TENSOR); + + vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); + vn_encode_ggml_tensor(encoder, tensor); + + struct vn_renderer_shmem *shmem; + if (size > gpu->data_shmem->mmap_size) { + shmem = virtgpu_shmem_create(gpu, size); + WARNING("%s: 0x%lx | %dkB | %dMB", __func__, size, (int)size/1024, (int)size/1024/1024); + if (!shmem) { + FATAL("Couldn't allocate the guest-host shared buffer :/"); + } + } else { + shmem = gpu->data_shmem; + } + + vn_encode_virtgpu_shmem_res_id(encoder, shmem->res_id); + vn_encode_size_t(encoder, &offset); + vn_encode_size_t(encoder, &size); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + memcpy(data, shmem->mmap_ptr, size); + + remote_call_finish(gpu, encoder, decoder); + + if (shmem != gpu->data_shmem) { + virtgpu_shmem_destroy(gpu, shmem->shmem); + } +} +#endif + +void +apir_buffer_clear(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, + uint8_t value) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_CLEAR); + + vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); + vn_encode_uint8_t(encoder, &value); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + remote_call_finish(gpu, encoder, decoder); +} + + +void +apir_buffer_free_buffer(struct virtgpu *gpu, apir_buffer_context_t *buffer_context) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER); + + vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + remote_call_finish(gpu, encoder, decoder); +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp new file mode 100644 index 0000000000000..e025483f1df52 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp @@ -0,0 +1,237 @@ +#include "virtgpu-forward-impl.h" + +int +apir_device_get_count(struct virtgpu *gpu) { + static int32_t dev_count = -1; + if (dev_count != -1) { + CACHED; + return dev_count; + } + + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_COUNT); + REMOTE_CALL(gpu, encoder, decoder, ret); + + vn_decode_int32_t(decoder, &dev_count); + + INFO("%s: Forward DEV COUNT --> %d ", __func__, dev_count); + + 
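+  // The decoded value stays in the function-local static dev_count, so later calls
+  // take the CACHED early-return at the top of the function without a new remote call.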
remote_call_finish(gpu, encoder, decoder); + + return dev_count; +} + +const char * +apir_device_get_name(struct virtgpu *gpu) { + static char *string = nullptr; + if (string) { + CACHED; + return string; + } + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_NAME); + REMOTE_CALL(gpu, encoder, decoder, ret); + + const size_t string_size = vn_decode_array_size_unchecked(decoder); + string = (char *) vn_cs_decoder_alloc_array(decoder, sizeof(char), string_size); + if (!string) { + FATAL("%s: Could not allocate the device name buffer", __func__); + } + vn_decode_char_array(decoder, string, string_size); + + INFO("%s: Forward DEV NAME --> %s", __func__, string); + + remote_call_finish(gpu, encoder, decoder); + + return string; +} + +const char * +apir_device_get_description(struct virtgpu *gpu) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + const size_t string_size = vn_decode_array_size_unchecked(decoder); + char *string = (char *) vn_cs_decoder_alloc_array(decoder, sizeof(char), string_size); + if (!string) { + FATAL("%s: Could not allocate the device description buffer", __func__); + } + vn_decode_char_array(decoder, string, string_size); + + //INFO("%s: Forward DEV DESCR --> %s", __func__, string); + + remote_call_finish(gpu, encoder, decoder); + + return string; +} + +uint32_t +apir_device_get_type(struct virtgpu *gpu) { + static uint32_t dev_type = 255; + if (dev_type != 255) { + CACHED; + return dev_type; + } + + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_TYPE); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + vn_decode_uint32_t(decoder, &dev_type); + + INFO("%s: Forward DEV TYPE --> %d ", __func__, dev_type); + + remote_call_finish(gpu, encoder, decoder); + + return dev_type; +} + +void +apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total) { + static size_t dev_free = 0; + static size_t dev_total = 0; + /* + if (dev_total != 0) { + WARNING("Not sure if llama.cpp expects fresh information for the free memory ..."); + *free = dev_free; + *total = dev_total; + + CACHED; + return; + } + */ + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_MEMORY); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + vn_decode_size_t(decoder, &dev_free); + vn_decode_size_t(decoder, &dev_total); + + *free = dev_free; + *total = dev_total; + + //INFO("%s: Forward DEV FREE mem --> %zu MB", __func__, dev_free / 1024 / 1024); + //INFO("%s: Forward DEV TOTAL mem --> %zu MB", __func__, dev_total / 1024 / 1024); + + remote_call_finish(gpu, encoder, decoder); + + return; +} + +bool +apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP); + + vn_encode_ggml_tensor_inline(encoder, op); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + bool supports_op; + vn_decode_bool_t(decoder, &supports_op); + + remote_call_finish(gpu, encoder, decoder); + + return supports_op; +} + +apir_buffer_type_host_handle_t 
+apir_device_get_buffer_type(struct virtgpu *gpu) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + apir_buffer_type_host_handle_t buft_handle; + vn_decode_apir_buffer_type_host_handle_t(decoder, &buft_handle); + + remote_call_finish(gpu, encoder, decoder); + + return buft_handle; +} + +void +apir_device_get_props(struct virtgpu *gpu, + bool *async, + bool *host_buffer, + bool *buffer_from_host_ptr, + bool *events) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_PROPS); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + vn_decode_bool_t(decoder, async); + vn_decode_bool_t(decoder, host_buffer); + vn_decode_bool_t(decoder, buffer_from_host_ptr); + vn_decode_bool_t(decoder, events); + + /* *** */ + remote_call_finish(gpu, encoder, decoder); + + return; +} + +apir_buffer_context_t +apir_device_buffer_from_ptr(struct virtgpu *gpu, + size_t size, + size_t max_tensor_size) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + ApirForwardReturnCode ret; + + apir_buffer_context_t buffer_context; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR); + + /* *** */ + + buffer_context.shmem = virtgpu_shmem_create(gpu, size); + if (!buffer_context.shmem) { + FATAL("Couldn't allocate the guest-host shared buffer :/"); + } + + vn_encode_virtgpu_shmem_res_id(encoder, buffer_context.shmem->res_id); + + vn_encode_size_t(encoder, &size); + vn_encode_size_t(encoder, &max_tensor_size); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + vn_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle); + buffer_context.buft_host_handle = vn_decode_apir_buffer_type_host_handle(decoder); + + /* *** */ + + remote_call_finish(gpu, encoder, decoder); + + return buffer_context; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h new file mode 100644 index 0000000000000..7edae38e775ee --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h @@ -0,0 +1,31 @@ +#include "ggml-backend-impl.h" +#include "ggml-remoting.h" +#include "virtgpu.h" +#include "../ggml-remotingbackend/shared/apir_backend.h" +#include "../ggml-remotingbackend/shared/venus_cs_ggml.h" + +#define CACHED +// printf("INFO: ### found response in the cache %s\n", __func__)o + + +#define REMOTE_CALL_PREPARE(gpu_dev_name, encoder_name, apir_command_type__) \ + do { \ + int32_t forward_flag = (int32_t) apir_command_type__; \ + encoder_name = remote_call_prepare(gpu_dev_name, APIR_COMMAND_TYPE_Forward, forward_flag); \ + if (!encoder_name) { \ + FATAL("%s: failed to prepare the remote call encoder :/", __func__); \ + } \ + } while(0) + + +#define REMOTE_CALL(gpu_dev_name, encoder_name, decoder_name, ret_name) \ + do { \ + ret_name = (ApirForwardReturnCode) remote_call(gpu_dev_name, encoder_name, &decoder_name, 0, NULL); \ + if (!decoder_name) { \ + FATAL("%s: failed to kick the remote call :/", __func__); \ + } \ + if (ret_name < APIR_FORWARD_BASE_INDEX) { \ + FATAL("%s: failed to forward the API call: %s: code %d", __func__, apir_forward_error(ret_name), ret_name); \ + } \ + ret_name = (ApirForwardReturnCode) (ret_name - APIR_FORWARD_BASE_INDEX); \ + } while(0) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-metal.cpp 
b/ggml/src/ggml-remotingfrontend/virtgpu-forward-metal.cpp new file mode 100644 index 0000000000000..e7bdc10f94481 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-metal.cpp @@ -0,0 +1,20 @@ +#include "virtgpu-forward-impl.h" + +bool +apir_metal_get_device_context(struct virtgpu *gpu, struct ggml_backend_metal_device_context *metal_dev_ctx) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_METAL_GET_DEVICE_CONTEXT); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + vn_decode_bool_t(decoder, &metal_dev_ctx->has_simdgroup_mm); + vn_decode_bool_t(decoder, &metal_dev_ctx->has_simdgroup_reduction); + vn_decode_bool_t(decoder, &metal_dev_ctx->has_bfloat); + + remote_call_finish(gpu, encoder, decoder); + + return true; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h new file mode 100644 index 0000000000000..4cbb6341ebb07 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -0,0 +1,50 @@ +#include "ggml.h" +#include "ggml-impl.h" +#include "ggml-alloc.h" + +#include "virtgpu-utils.h" + +#include "../ggml-remotingbackend/shared/apir_backend.h" + +/* device */ +int apir_device_get_count(struct virtgpu *gpu); +const char *apir_device_get_name(struct virtgpu *gpu); +const char *apir_device_get_description(struct virtgpu *gpu); +uint32_t apir_device_get_type(struct virtgpu *gpu); +void apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total); +bool apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op); +apir_buffer_type_host_handle_t apir_device_get_buffer_type(struct virtgpu *gpu); +void apir_device_get_props(struct virtgpu *gpu, + bool *async, + bool *host_buffer, + bool *buffer_from_host_ptr, + bool *events); +apir_buffer_context_t apir_device_buffer_from_ptr(struct virtgpu *gpu, + size_t size, + size_t max_tensor_size); +/* buffer-type */ +const char *apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); +size_t apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); +size_t apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); +bool apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); +apir_buffer_context_t apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t buffer_buft, size_t size); + +/* buffer */ + +void *apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_context_t *buffer_context); +enum ggml_status apir_buffer_init_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, ggml_tensor *tensor); +void apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, + ggml_tensor *tensor, const void *data, size_t offset, size_t size); +void apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, + const ggml_tensor *tensor, void *data, size_t offset, size_t size); +void apir_buffer_clear(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, + uint8_t value); +void apir_buffer_free_buffer(struct virtgpu *gpu, apir_buffer_context_t *buffer_context); + +/* backend */ + +ggml_status apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph); + +/* metal */ + +bool apir_metal_get_device_context(struct virtgpu *gpu, struct ggml_backend_metal_device_context *metal_dev_ctx); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp 
b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp new file mode 100644 index 0000000000000..a09fd22371a8c --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp @@ -0,0 +1,111 @@ +#include + +#include "virtgpu-shm.h" + +static uint32_t +virtgpu_ioctl_resource_create_blob(struct virtgpu *gpu, + uint32_t blob_mem, + uint32_t blob_flags, + size_t blob_size, + uint64_t blob_id, + uint32_t *res_id) +{ +#ifdef SIMULATE_BO_SIZE_FIX + blob_size = align64(blob_size, 4096); +#endif + + struct drm_virtgpu_resource_create_blob args = { + .blob_mem = blob_mem, + .blob_flags = blob_flags, + .bo_handle = 0, + .res_handle = 0, + .size = blob_size, + .pad = 0, + .cmd_size = 0, + .cmd = 0, + .blob_id = blob_id, + }; + + if (virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_RESOURCE_CREATE_BLOB, &args)) + return 0; + + *res_id = args.res_handle; + return args.bo_handle; +} + +static void +virtgpu_ioctl_gem_close(struct virtgpu *gpu, uint32_t gem_handle) +{ + struct drm_gem_close args = { + .handle = gem_handle, + .pad = 0, + }; + + const int ret = virtgpu_ioctl(gpu, DRM_IOCTL_GEM_CLOSE, &args); + assert(!ret); +#ifdef NDEBUG + UNUSED(ret); +#endif +} + +static void * +virtgpu_ioctl_map(struct virtgpu *gpu, uint32_t gem_handle, size_t size) +{ + struct drm_virtgpu_map args = { + .offset = 0, + .handle = gem_handle, + .pad = 0, + }; + + if (virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_MAP, &args)) + return NULL; + + void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, gpu->fd, + args.offset); + if (ptr == MAP_FAILED) + return NULL; + + return ptr; +} + +void +virtgpu_shmem_destroy(struct virtgpu *gpu, + struct virtgpu_shmem *shmem) +{ + munmap(shmem->base.mmap_ptr, shmem->base.mmap_size); + virtgpu_ioctl_gem_close(gpu, shmem->gem_handle); +} + +struct vn_renderer_shmem * +virtgpu_shmem_create(struct virtgpu *gpu, size_t size) +{ + size = align64(size, 16384); + + uint32_t res_id; + uint32_t gem_handle = virtgpu_ioctl_resource_create_blob( + gpu, gpu->shmem_blob_mem, VIRTGPU_BLOB_FLAG_USE_MAPPABLE, size, 0, + &res_id); + if (!gem_handle) + return NULL; + + void *ptr = virtgpu_ioctl_map(gpu, gem_handle, size); + if (!ptr) { + virtgpu_ioctl_gem_close(gpu, gem_handle); + return NULL; + } + if (gpu->shmem_array.elem_size == 0) { + INFO("gpu->shmem_array.elem_size == 0 | Not working :/\n"); + assert(false); + } + struct virtgpu_shmem *shmem = (struct virtgpu_shmem *) util_sparse_array_get(&gpu->shmem_array, gem_handle); + + shmem->gem_handle = gem_handle; + shmem->base.res_id = res_id; + shmem->base.mmap_size = size; + shmem->base.mmap_ptr = ptr; + shmem->base.refcount.count = 1; + shmem->base.gem_handle = gem_handle; + shmem->base.shmem = shmem; + + return &shmem->base; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-shm.h b/ggml/src/ggml-remotingfrontend/virtgpu-shm.h new file mode 100644 index 0000000000000..52217f5b7e857 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-shm.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "virtgpu.h" +#include "virtgpu-utils.h" + +struct vn_refcount { + int count; //atomic_int +}; + + +struct vn_renderer_shmem { + struct vn_refcount refcount; + + uint32_t res_id; + size_t mmap_size; /* for internal use only (i.e., munmap) */ + void *mmap_ptr; + + struct list_head cache_head; + int64_t cache_timestamp; + + uint32_t gem_handle; + + struct virtgpu_shmem *shmem; +}; + +struct vn_renderer_shmem *virtgpu_shmem_create(struct virtgpu *gpu, size_t size); +void virtgpu_shmem_destroy(struct virtgpu *gpu, struct virtgpu_shmem 
*shmem); + + +struct virtgpu_shmem { + struct vn_renderer_shmem base; + uint32_t gem_handle; +}; diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp new file mode 100644 index 0000000000000..833f0e4680103 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp @@ -0,0 +1,200 @@ +#include "virtgpu-utils.h" +#include +#include +#include + +#define NODE_ALLOC_ALIGN 64 +#define NODE_PTR_MASK (~((uintptr_t)NODE_ALLOC_ALIGN - 1)) +#define NODE_LEVEL_MASK ((uintptr_t)NODE_ALLOC_ALIGN - 1) +#define NULL_NODE 0 + +#define os_malloc_aligned(_size, _align) _aligned_malloc(_size, _align) +#define os_free_aligned(_ptr) free(_ptr) +#define p_atomic_cmpxchg(v, old, _new) \ + __sync_val_compare_and_swap((v), (old), (_new)) + +static inline uint64_t +util_logbase2_64(uint64_t n) +{ +#if defined(HAVE___BUILTIN_CLZLL) + return ((sizeof(uint64_t) * 8 - 1) - __builtin_clzll(n | 1)); +#else + uint64_t pos = 0ull; + if (n >= 1ull<<32) { n >>= 32; pos += 32; } + if (n >= 1ull<<16) { n >>= 16; pos += 16; } + if (n >= 1ull<< 8) { n >>= 8; pos += 8; } + if (n >= 1ull<< 4) { n >>= 4; pos += 4; } + if (n >= 1ull<< 2) { n >>= 2; pos += 2; } + if (n >= 1ull<< 1) { pos += 1; } + return pos; +#endif +} + +void +util_sparse_array_init(struct util_sparse_array *arr, + size_t elem_size, size_t node_size) +{ + memset(arr, 0, sizeof(*arr)); + arr->elem_size = elem_size; + arr->node_size_log2 = util_logbase2_64(node_size); + assert(node_size >= 2 && node_size == (1ull << arr->node_size_log2)); +} + +static inline void * +os_malloc_aligned(size_t size, size_t alignment) +{ + void *ptr; + alignment = (alignment + sizeof(void*) - 1) & ~(sizeof(void*) - 1); + if(posix_memalign(&ptr, alignment, size) != 0) + return NULL; + return ptr; +} + +static inline void * +_util_sparse_array_node_data(uintptr_t handle) +{ + return (void *)(handle & NODE_PTR_MASK); +} + +static inline unsigned +_util_sparse_array_node_level(uintptr_t handle) +{ + return handle & NODE_LEVEL_MASK; +} + +static inline void +_util_sparse_array_node_finish(struct util_sparse_array *arr, + uintptr_t node) +{ + if (_util_sparse_array_node_level(node) > 0) { + uintptr_t *children = (uintptr_t *) _util_sparse_array_node_data(node); + size_t node_size = 1ull << arr->node_size_log2; + for (size_t i = 0; i < node_size; i++) { + if (children[i]) + _util_sparse_array_node_finish(arr, children[i]); + } + } + + os_free_aligned(_util_sparse_array_node_data(node)); +} + +static inline uintptr_t +_util_sparse_array_node(void *data, unsigned level) +{ + assert(data != NULL); + assert(((uintptr_t)data & NODE_LEVEL_MASK) == 0); + assert((level & NODE_PTR_MASK) == 0); + return (uintptr_t)data | level; +} + +inline uintptr_t +_util_sparse_array_node_alloc(struct util_sparse_array *arr, + unsigned level) +{ + size_t size; + if (level == 0) { + size = arr->elem_size << arr->node_size_log2; + } else { + size = sizeof(uintptr_t) << arr->node_size_log2; + } + + void *data = os_malloc_aligned(size, NODE_ALLOC_ALIGN); + memset(data, 0, size); + + return _util_sparse_array_node(data, level); +} + +static inline uintptr_t +_util_sparse_array_set_or_free_node(uintptr_t *node_ptr, + uintptr_t cmp_node, + uintptr_t node) +{ + uintptr_t prev_node = p_atomic_cmpxchg(node_ptr, cmp_node, node); + + if (prev_node != cmp_node) { + /* We lost the race. Free this one and return the one that was already + * allocated. 
+ */ + os_free_aligned(_util_sparse_array_node_data(node)); + return prev_node; + } else { + return node; + } +} + +void * +util_sparse_array_get(struct util_sparse_array *arr, uint64_t idx) +{ + const unsigned node_size_log2 = arr->node_size_log2; + uintptr_t root = p_atomic_read(&arr->root); + if (unlikely(!root)) { + unsigned root_level = 0; + uint64_t idx_iter = idx >> node_size_log2; + while (idx_iter) { + idx_iter >>= node_size_log2; + root_level++; + } + uintptr_t new_root = _util_sparse_array_node_alloc(arr, root_level); + root = _util_sparse_array_set_or_free_node(&arr->root, + NULL_NODE, new_root); + } + + while (1) { + unsigned root_level = _util_sparse_array_node_level(root); + uint64_t root_idx = idx >> (root_level * node_size_log2); + if (likely(root_idx < (1ull << node_size_log2))) + break; + + /* In this case, we have a root but its level is low enough that the + * requested index is out-of-bounds. + */ + uintptr_t new_root = _util_sparse_array_node_alloc(arr, root_level + 1); + + uintptr_t *new_root_children = (uintptr_t *) _util_sparse_array_node_data(new_root); + new_root_children[0] = root; + + /* We only add one at a time instead of the whole tree because it's + * easier to ensure correctness of both the tree building and the + * clean-up path. Because we're only adding one node we never have to + * worry about trying to free multiple things without freeing the old + * things. + */ + root = _util_sparse_array_set_or_free_node(&arr->root, root, new_root); + } + + void *node_data = _util_sparse_array_node_data(root); + unsigned node_level = _util_sparse_array_node_level(root); + while (node_level > 0) { + uint64_t child_idx = (idx >> (node_level * node_size_log2)) & + ((1ull << node_size_log2) - 1); + + uintptr_t *children = (uintptr_t *) node_data; + uintptr_t child = p_atomic_read(&children[child_idx]); + + if (unlikely(!child)) { + child = _util_sparse_array_node_alloc(arr, node_level - 1); + child = _util_sparse_array_set_or_free_node(&children[child_idx], + NULL_NODE, child); + } + + node_data = _util_sparse_array_node_data(child); + node_level = _util_sparse_array_node_level(child); + } + + uint64_t elem_idx = idx & ((1ull << node_size_log2) - 1); + return (void *)((char *)node_data + (elem_idx * arr->elem_size)); +} + +void *something = NULL; +void thks_bye () { + // break here + INFO("thks bye, stopping early and happilly :)"); + if (!something) { // avoid the [[noreturn]] detection mechanism + exit(0); + } +} + +void breakpoint() { + // break here + INFO("breakpoint here :)"); +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h new file mode 100644 index 0000000000000..dd911a63b59e7 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h @@ -0,0 +1,133 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define unlikely(x) __builtin_expect(!!(x), 0) +#define likely(x) __builtin_expect(!!(x), 1) + +#ifndef UNUSED +#define UNUSED(x) (void)(x) +#endif + +/** Checks is a value is a power of two. Does not handle zero. */ +#define IS_POT(v) (((v) & ((v) - 1)) == 0) + +/** Checks is a value is a power of two. Zero handled. 
*/ +#define IS_POT_NONZERO(v) ((v) != 0 && IS_POT(v)) + +/** Align a value to a power of two */ +#define ALIGN_POT(x, pot_align) (((x) + (pot_align) - 1) & ~((pot_align) - 1)) + +#define p_atomic_read(_v) __atomic_load_n((_v), __ATOMIC_ACQUIRE) + +void thks_bye(); +void breakpoint(); + +#ifndef NDEBUG +inline void +INFO(const char *format, ...) { + fprintf(stderr, "INFO: "); + + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); +} +#else +inline void +INFO(...) {} +#endif + +inline void +MESSAGE(const char *format, ...) { + fprintf(stderr, "APIR: "); + + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); +} + +inline void +WARNING(const char *format, ...) { + fprintf(stderr, "WARNING: "); + + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); +} + +inline void +ERROR(const char *format, ...) { + fprintf(stderr, "ERROR: "); + + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); +} + +inline void +FATAL(const char *format, ...) { + fprintf(stderr, "FATAL: "); + + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); + + abort(); +} + +static inline bool +util_is_power_of_two_nonzero64(uint64_t v) +{ + return IS_POT_NONZERO(v); +} + +static inline uint64_t +align64(uint64_t value, uint64_t alignment) +{ + assert(util_is_power_of_two_nonzero64(alignment)); + return ALIGN_POT(value, alignment); +} + +struct list_head +{ + struct list_head *prev; + struct list_head *next; +}; + +struct util_sparse_array { + size_t elem_size; + unsigned node_size_log2; + + uintptr_t root; +}; + +void *util_sparse_array_get(struct util_sparse_array *arr, uint64_t idx); +void util_sparse_array_init(struct util_sparse_array *arr, + size_t elem_size, size_t node_size); + +inline void +os_time_sleep(int64_t usecs) +{ + struct timespec time; + time.tv_sec = usecs / 1000000; + time.tv_nsec = (usecs % 1000000) * 1000; + while (clock_nanosleep(CLOCK_MONOTONIC, 0, &time, &time) == EINTR); +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp new file mode 100644 index 0000000000000..b3b0ab2dc68cc --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -0,0 +1,681 @@ +#include +#include +#include +#include + +#include + +#include "virtgpu.h" + +static virt_gpu_result_t virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev); +static virt_gpu_result_t virtgpu_open(struct virtgpu *gpu); + + +static virt_gpu_result_t virtgpu_init_params(struct virtgpu *gpu); +static virt_gpu_result_t virtgpu_init_capset(struct virtgpu *gpu); +static virt_gpu_result_t virtgpu_init_context(struct virtgpu *gpu); + +static int virtgpu_ioctl_context_init(struct virtgpu *gpu, + enum virgl_renderer_capset capset_id); +static int +virtgpu_ioctl_get_caps(struct virtgpu *gpu, + enum virgl_renderer_capset id, + uint32_t version, + void *capset, + size_t capset_size); +static uint64_t virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param); +static void virtgpu_init_renderer_info(struct virtgpu *gpu); + +struct timer_data wait_host_reply_timer = {0, 0, 0, "wait_host_reply"}; + +static void log_call_duration(long long call_duration_ns, const char *name); + +const uint64_t APIR_HANDSHAKE_MAX_WAIT_MS = 15*1000; // 15s +const uint64_t 
APIR_LOADLIBRARY_MAX_WAIT_MS = 60*1000; // 60s + +static inline void +virtgpu_init_shmem_blob_mem(struct virtgpu *gpu) +{ + /* VIRTGPU_BLOB_MEM_GUEST allocates from the guest system memory. They are + * logically contiguous in the guest but are sglists (iovecs) in the host. + * That makes them slower to process in the host. With host process + * isolation, it also becomes impossible for the host to access sglists + * directly. + * + * While there are ideas (and shipped code in some cases) such as creating + * udmabufs from sglists, or having a dedicated guest heap, it seems the + * easiest way is to reuse VIRTGPU_BLOB_MEM_HOST3D. That is, when the + * renderer sees a request to export a blob where + * + * - blob_mem is VIRTGPU_BLOB_MEM_HOST3D + * - blob_flags is VIRTGPU_BLOB_FLAG_USE_MAPPABLE + * - blob_id is 0 + * + * it allocates a host shmem. + * + * supports_blob_id_0 has been enforced by mandated render server config. + */ + assert(gpu->capset.data.supports_blob_id_0); + gpu->shmem_blob_mem = VIRTGPU_BLOB_MEM_HOST3D; +} + +static int +virtgpu_handshake(struct virtgpu *gpu) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + encoder = remote_call_prepare(gpu, APIR_COMMAND_TYPE_HandShake, 0); + if (!encoder) { + FATAL("%s: failed to prepare the remote call encoder :/", __func__); + return 1; + } + + /* write handshake props */ + + uint32_t guest_major = APIR_PROTOCOL_MAJOR; + uint32_t guest_minor = APIR_PROTOCOL_MINOR; + vn_encode_uint32_t(encoder, &guest_major); + vn_encode_uint32_t(encoder, &guest_minor); + + /* *** */ + + + uint32_t ret_magic; + long long call_duration_ns; + ret_magic = remote_call(gpu, encoder, &decoder, APIR_HANDSHAKE_MAX_WAIT_MS, &call_duration_ns); + log_call_duration(call_duration_ns, "API Remoting handshake"); + + if (!decoder) { + FATAL("%s: failed to initiate the communication with the virglrenderer library. 
" + "Most likely, the wrong virglrenderer library was loaded in the hypervisor.", __func__); + return 1; + } + + /* read handshake return values */ + + uint32_t host_major; + uint32_t host_minor; + + if (ret_magic != APIR_HANDSHAKE_MAGIC) { + FATAL("%s: handshake with the virglrenderer failed (code=%d | %s):/", + __func__, ret_magic, apir_backend_initialize_error(ret_magic)); + } else { + vn_decode_uint32_t(decoder, &host_major); + vn_decode_uint32_t(decoder, &host_minor); + } + + /* *** */ + + remote_call_finish(gpu, encoder, decoder); + + if (ret_magic != APIR_HANDSHAKE_MAGIC) { + return 1; + } + + /* *** */ + + INFO("%s: Guest is running with %u.%u", __func__, guest_major, guest_minor); + INFO("%s: Host is running with %u.%u", __func__, host_major, host_minor); + + if (guest_major != host_major) { + ERROR("Host major (%d) and guest major (%d) version differ", host_major, guest_major); + } else if (guest_minor != host_minor) { + WARNING("Host minor (%d) and guest minor (%d) version differ", host_minor, guest_minor); + } + + INFO("Handshake with the host virglrenderer library completed."); + + return 0; +} + +static ApirLoadLibraryReturnCode +virtgpu_load_library(struct virtgpu *gpu) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + ApirLoadLibraryReturnCode ret; + + encoder = remote_call_prepare(gpu, APIR_COMMAND_TYPE_LoadLibrary, 0); + if (!encoder) { + FATAL("%s: hypercall error: failed to prepare the remote call encoder :/", __func__); + return APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR; + } + + long long call_duration_ns; + + ret = (ApirLoadLibraryReturnCode) remote_call(gpu, encoder, &decoder, + APIR_LOADLIBRARY_MAX_WAIT_MS, &call_duration_ns); + log_call_duration(call_duration_ns, "API Remoting LoadLibrary"); + + if (!decoder) { + FATAL("%s: hypercall error: failed to kick the API remoting hypercall. :/", __func__); + return APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR; + } + + remote_call_finish(gpu, encoder, decoder); + + if (ret == APIR_LOAD_LIBRARY_SUCCESS) { + INFO("%s: The API Remoting backend was successfully loaded and initialized", __func__); + + return ret; + } + + // something wrong happened, find out what. 
+ + if (ret < APIR_LOAD_LIBRARY_INIT_BASE_INDEX) { + FATAL("%s: virglrenderer could not load the API Remoting backend library: %s (code %d)", + __func__, apir_load_library_error(ret), ret); + return ret; + } + + INFO("%s: virglrenderer successfully loaded the API Remoting backend library", __func__); + + ApirLoadLibraryReturnCode apir_ret = (ApirLoadLibraryReturnCode) (ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX); + + if (apir_ret < APIR_LOAD_LIBRARY_INIT_BASE_INDEX) { + FATAL("%s: the API Remoting backend library couldn't load the backend library: apir code=%d | %s):/", + __func__, apir_ret, apir_load_library_error(apir_ret)); + } else { + uint32_t lib_ret = apir_ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX; + FATAL("%s: the API Remoting backend library initialize its backend library: apir code=%d):/", + __func__, lib_ret); + } + return ret; +} + +struct virtgpu * +create_virtgpu() { + struct virtgpu *gpu = new struct virtgpu(); + + util_sparse_array_init(&gpu->shmem_array, sizeof(struct virtgpu_shmem), + 1024); + + virt_gpu_result_t result = virtgpu_open(gpu); + if (result != APIR_SUCCESS) { + FATAL("%s: failed to create the open the virtgpu device :/", __func__); + return NULL; + } + + result = virtgpu_init_params(gpu); + assert(result == APIR_SUCCESS); + + result = virtgpu_init_capset(gpu); + assert(result == APIR_SUCCESS); + + result = virtgpu_init_context(gpu); + assert(result == APIR_SUCCESS); + +#ifdef NDEBUG + UNUSED(result); +#endif + + virtgpu_init_shmem_blob_mem(gpu); + + gpu->reply_shmem = virtgpu_shmem_create(gpu, 0x4000); + gpu->data_shmem = virtgpu_shmem_create(gpu, 0x1830000); // 24MiB + + if (!gpu->reply_shmem) { + FATAL("%s: failed to create the shared reply memory pages :/", __func__); + return NULL; + } + + if (!gpu->data_shmem) { + FATAL("%s: failed to create the shared data memory pages :/", __func__); + return NULL; + } + + if (virtgpu_handshake(gpu)) { + FATAL("%s: failed to handshake with the virglrenderer library :/", __func__); + return NULL; + } + + if (virtgpu_load_library(gpu) != APIR_LOAD_LIBRARY_SUCCESS) { + FATAL("%s: failed to load the backend library :/", __func__); + return NULL; + } + + return gpu; +} + +static virt_gpu_result_t +virtgpu_open(struct virtgpu *gpu) +{ + drmDevicePtr devs[8]; + int count = drmGetDevices2(0, devs, ARRAY_SIZE(devs)); + if (count < 0) { + ERROR("%s: failed to enumerate DRM devices", __func__); + return APIR_ERROR_INITIALIZATION_FAILED; + } + + virt_gpu_result_t result = APIR_ERROR_INITIALIZATION_FAILED; + for (int i = 0; i < count; i++) { + result = virtgpu_open_device(gpu, devs[i]); + if (result == APIR_SUCCESS) + break; + } + + drmFreeDevices(devs, count); + + return result; +} + +static virt_gpu_result_t +virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev) +{ + bool supported_bus = false; + + switch (dev->bustype) { + case DRM_BUS_PCI: + if (dev->deviceinfo.pci->vendor_id == VIRTGPU_PCI_VENDOR_ID && + dev->deviceinfo.pci->device_id == VIRTGPU_PCI_DEVICE_ID) + supported_bus = true; + break; + case DRM_BUS_PLATFORM: + supported_bus = true; + break; + default: + break; + } + + if (!supported_bus || !(dev->available_nodes & (1 << DRM_NODE_RENDER))) { + if (VN_DEBUG(INIT)) { + const char *name = "unknown"; + for (uint32_t i = 0; i < DRM_NODE_MAX; i++) { + if (dev->available_nodes & (1 << i)) { + name = dev->nodes[i]; + break; + } + } + vn_log(gpu->instance, "skipping DRM device %s", name); + } + return APIR_ERROR_INITIALIZATION_FAILED; + } + + const char *primary_path = dev->nodes[DRM_NODE_PRIMARY]; + const char *node_path 
= dev->nodes[DRM_NODE_RENDER]; + + int fd = open(node_path, O_RDWR | O_CLOEXEC); + if (fd < 0) { + if (VN_DEBUG(INIT)) + vn_log(gpu->instance, "failed to open %s", node_path); + return APIR_ERROR_INITIALIZATION_FAILED; + } + + drmVersionPtr version = drmGetVersion(fd); + if (!version || strcmp(version->name, "virtio_gpu") || + version->version_major != 0) { + if (VN_DEBUG(INIT)) { + if (version) { + vn_log(gpu->instance, "unknown DRM driver %s version %d", + version->name, version->version_major); + } else { + vn_log(gpu->instance, "failed to get DRM driver version"); + } + } + if (version) + drmFreeVersion(version); + close(fd); + return APIR_ERROR_INITIALIZATION_FAILED; + } + + gpu->fd = fd; + + struct stat st; + if (stat(primary_path, &st) == 0) { + gpu->has_primary = true; + gpu->primary_major = major(st.st_rdev); + gpu->primary_minor = minor(st.st_rdev); + } else { + gpu->has_primary = false; + gpu->primary_major = 0; + gpu->primary_minor = 0; + } + stat(node_path, &st); + gpu->render_major = major(st.st_rdev); + gpu->render_minor = minor(st.st_rdev); + + gpu->bustype = dev->bustype; + if (dev->bustype == DRM_BUS_PCI) + gpu->pci_bus_info = *dev->businfo.pci; + + drmFreeVersion(version); + + MESSAGE("using DRM device %s", node_path); + + return APIR_SUCCESS; +} + +void +vn_log(struct remoting_dev_instance *instance, const char *format, ...) +{ + if (instance) { + printf(""); + } + + va_list ap; + + va_start(ap, format); + vprintf(format, ap); + va_end(ap); + + /* instance may be NULL or partially initialized */ +} + +static virt_gpu_result_t +virtgpu_init_context(struct virtgpu *gpu) +{ + assert(!gpu->capset.version); + const int ret = virtgpu_ioctl_context_init(gpu, gpu->capset.id); + if (ret) { + if (VN_DEBUG(INIT)) { + vn_log(gpu->instance, "failed to initialize context: %s", + strerror(errno)); + } + return APIR_ERROR_INITIALIZATION_FAILED; + } + + return APIR_SUCCESS; +} + +static virt_gpu_result_t +virtgpu_init_capset(struct virtgpu *gpu) +{ + gpu->capset.id = VIRGL_RENDERER_CAPSET_VENUS; + gpu->capset.version = 0; + + const int ret = + virtgpu_ioctl_get_caps(gpu, gpu->capset.id, gpu->capset.version, + &gpu->capset.data, sizeof(gpu->capset.data)); + if (ret) { + if (VN_DEBUG(INIT)) { + vn_log(gpu->instance, "failed to get venus v%d capset: %s", + gpu->capset.version, strerror(errno)); + } + return APIR_ERROR_INITIALIZATION_FAILED; + } + + return APIR_SUCCESS; +} + +static virt_gpu_result_t +virtgpu_init_params(struct virtgpu *gpu) +{ + const uint64_t required_params[] = { + VIRTGPU_PARAM_3D_FEATURES, VIRTGPU_PARAM_CAPSET_QUERY_FIX, + VIRTGPU_PARAM_RESOURCE_BLOB, VIRTGPU_PARAM_CONTEXT_INIT, + }; + uint64_t val; + for (uint32_t i = 0; i < ARRAY_SIZE(required_params); i++) { + val = virtgpu_ioctl_getparam(gpu, required_params[i]); + if (!val) { + if (VN_DEBUG(INIT)) { + vn_log(gpu->instance, "required kernel param %d is missing", + (int)required_params[i]); + } + return APIR_ERROR_INITIALIZATION_FAILED; + } + } + + val = virtgpu_ioctl_getparam(gpu, VIRTGPU_PARAM_HOST_VISIBLE); + if (val) { + gpu->bo_blob_mem = VIRTGPU_BLOB_MEM_HOST3D; + } else { + val = virtgpu_ioctl_getparam(gpu, VIRTGPU_PARAM_GUEST_VRAM); + if (val) { + gpu->bo_blob_mem = VIRTGPU_BLOB_MEM_GUEST_VRAM; + } + } + + if (!val) { + vn_log(gpu->instance, + "one of required kernel params (%d or %d) is missing", + (int)VIRTGPU_PARAM_HOST_VISIBLE, (int)VIRTGPU_PARAM_GUEST_VRAM); + return APIR_ERROR_INITIALIZATION_FAILED; + } + + /* Cross-device feature is optional. 
It enables sharing dma-bufs + * with other virtio devices, like virtio-wl or virtio-video used + * by ChromeOS VMs. Qemu doesn't support cross-device sharing. + */ + val = virtgpu_ioctl_getparam(gpu, VIRTGPU_PARAM_CROSS_DEVICE); + if (val) + gpu->supports_cross_device = true; + + /* implied by CONTEXT_INIT uapi */ + gpu->max_timeline_count = 64; + + return APIR_SUCCESS; +} + +static int +virtgpu_ioctl_context_init(struct virtgpu *gpu, + enum virgl_renderer_capset capset_id) +{ + struct drm_virtgpu_context_set_param ctx_set_params[3] = { + { + .param = VIRTGPU_CONTEXT_PARAM_CAPSET_ID, + .value = capset_id, + }, + { + .param = VIRTGPU_CONTEXT_PARAM_NUM_RINGS, + .value = 64, + }, + { + .param = VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK, + .value = 0, /* don't generate drm_events on fence signaling */ + }, + }; + + struct drm_virtgpu_context_init args = { + .num_params = ARRAY_SIZE(ctx_set_params), + .pad = 0, + .ctx_set_params = (uintptr_t)&ctx_set_params, + }; + + return virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_CONTEXT_INIT, &args); +} + +static int +virtgpu_ioctl_get_caps(struct virtgpu *gpu, + enum virgl_renderer_capset id, + uint32_t version, + void *capset, + size_t capset_size) +{ + struct drm_virtgpu_get_caps args = { + .cap_set_id = id, + .cap_set_ver = version, + .addr = (uintptr_t)capset, + .size = (__u32) capset_size, + .pad = 0, + }; + + return virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_GET_CAPS, &args); +} + +static uint64_t +virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param) +{ + /* val must be zeroed because kernel only writes the lower 32 bits */ + uint64_t val = 0; + struct drm_virtgpu_getparam args = { + .param = param, + .value = (uintptr_t)&val, + }; + + const int ret = virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_GETPARAM, &args); + return ret ? 
0 : val; +} + + +struct vn_cs_encoder * +remote_call_prepare( + struct virtgpu *gpu, + ApirCommandType apir_cmd_type, + int32_t cmd_flags) +{ + + if (!gpu->reply_shmem) { + FATAL("%s: the reply shmem page can't be null", __func__); + } + + /* + * Prepare the command encoder and its buffer + */ + + static char encoder_buffer[4096]; + + static struct vn_cs_encoder enc; + enc = { + encoder_buffer, + encoder_buffer, + encoder_buffer + sizeof(encoder_buffer), + }; + + /* + * Fill the command encoder with the common args: + * - cmd_type (int32_t) + * - cmd_flags (int32_t) + * - reply res id (uint32_t) + */ + + int32_t cmd_type = VENUS_COMMAND_TYPE_LENGTH + apir_cmd_type; + vn_encode_int32_t(&enc, &cmd_type); + vn_encode_int32_t(&enc, &cmd_flags); + + uint32_t reply_res_id = gpu->reply_shmem->res_id; + vn_encode_uint32_t(&enc, &reply_res_id); + + return &enc; +} + +void +remote_call_finish( + struct virtgpu *gpu, + struct vn_cs_encoder *enc, + struct vn_cs_decoder *dec) { + UNUSED(gpu); + + if (!enc) { + ERROR("Invalid (null) encoder :/"); + } + + if (!dec) { + ERROR("Invalid (null) decoder :/"); + } + + // encoder and decoder are statically allocated, nothing to do to release them +} + +uint32_t +remote_call( + struct virtgpu *gpu, + struct vn_cs_encoder *encoder, + struct vn_cs_decoder **decoder, + float max_wait_ms, + long long *call_duration_ns) +{ + /* + * Prepare the reply notification pointer + */ + + volatile std::atomic_uint *atomic_reply_notif = (volatile std::atomic_uint *) gpu->reply_shmem->mmap_ptr; + *atomic_reply_notif = 0; + + /* + * Trigger the execbuf ioctl + */ + + struct drm_virtgpu_execbuffer args = { + .flags = VIRTGPU_EXECBUF_RING_IDX, + .size = (uint32_t) (encoder->cur - encoder->start), + .command = (uintptr_t) encoder->start, + + .bo_handles = 0, + .num_bo_handles = 0, + + .fence_fd = 0, + .ring_idx = 0, + .syncobj_stride = 0, + .num_in_syncobjs = 0, + .num_out_syncobjs = 0, + .in_syncobjs = 0, + .out_syncobjs = 0, + }; + + *decoder = NULL; + + int ret = drmIoctl(gpu->fd, DRM_IOCTL_VIRTGPU_EXECBUFFER, &args); + + if (ret != 0) { + FATAL("%s: the virtgpu EXECBUFFER ioctl failed (%d) :/ \n", ret); + } + + /* + * Wait for the response notification + */ + + start_timer(&wait_host_reply_timer); + + struct timespec ts_start, ts_end; + clock_gettime(CLOCK_MONOTONIC, &ts_start); + long long start_time = (long long)ts_start.tv_sec * 1000000000LL + ts_start.tv_nsec; + + bool timedout = false; + uint32_t notif_value = 0; + while (true) { + notif_value = std::atomic_load_explicit(atomic_reply_notif, std::memory_order_acquire); + + if (notif_value != 0) { + break; + } + + int64_t base_sleep_us = 15; + + os_time_sleep(base_sleep_us); + + if (max_wait_ms) { + clock_gettime(CLOCK_MONOTONIC, &ts_end); + long long end_time = (long long)ts_end.tv_sec * 1000000000LL + ts_end.tv_nsec; + float duration_ms = (end_time - start_time) / 1000000; + + if (duration_ms > max_wait_ms) { + timedout = true; + break; + } + } + } + + if (call_duration_ns) { + *call_duration_ns = stop_timer(&wait_host_reply_timer); + } + + if (max_wait_ms && timedout) { + ERROR("timed out waiting for the host answer..."); + return APIR_FORWARD_TIMEOUT; + } + + /* + * Prepare the decoder + */ + static struct vn_cs_decoder response_dec; + response_dec.cur = (char *) gpu->reply_shmem->mmap_ptr + sizeof(*atomic_reply_notif); + response_dec.end = (char *) gpu->reply_shmem->mmap_ptr + gpu->reply_shmem->mmap_size; + *decoder = &response_dec; + + // extract the actual return value from the notif flag + uint32_t returned_value = 
notif_value - 1; + return returned_value; +} + +static void log_call_duration(long long call_duration_ns, const char *name) { + double call_duration_ms = (double) call_duration_ns / 1e6; // 1 millisecond = 1e6 nanoseconds + double call_duration_s = (double) call_duration_ns / 1e9; // 1 second = 1e9 nanoseconds + + if (call_duration_s > 1) { + MESSAGE("%s: waited %.2fs for the %s host reply...", __func__, call_duration_s, name); + } else if (call_duration_ms > 1) { + MESSAGE("%s: waited %.2fms for the %s host reply...", __func__, call_duration_ms, name); + } else { + MESSAGE("%s: waited %lldns for the %s host reply...", __func__, call_duration_ns, name); + } +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h new file mode 100644 index 0000000000000..7a8cfc3d7c5a6 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu.h @@ -0,0 +1,125 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "virtgpu-forward.h" +#include "virtgpu-utils.h" +#include "../ggml-remotingbackend/shared/api_remoting.h" +#include "../ggml-remotingbackend/shared/venus_cs.h" + +#include "virtgpu-shm.h" + +#define VIRGL_RENDERER_UNSTABLE_APIS 1 +#include "drm-uapi/virtgpu_drm.h" +#include "venus_hw.h" + +// must match https://gitlab.freedesktop.org/kpouget/virglrenderer/-/blob/main/src/virglrenderer_hw.h?ref_type=heads +enum virgl_renderer_capset { + VIRGL_RENDERER_CAPSET_VIRGL = 1, + VIRGL_RENDERER_CAPSET_VIRGL2 = 2, + /* 3 is reserved for gfxstream */ + VIRGL_RENDERER_CAPSET_VENUS = 4, + /* 5 is reserved for cross-domain */ + VIRGL_RENDERER_CAPSET_DRM = 6, +}; + +/* from src/virtio/vulkan/vn_renderer_virtgpu.c */ +#define VIRTGPU_PCI_VENDOR_ID 0x1af4 +#define VIRTGPU_PCI_DEVICE_ID 0x1050 +#define VIRTGPU_BLOB_MEM_GUEST_VRAM 0x0004 +#define VIRTGPU_PARAM_GUEST_VRAM 9 + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +#define VN_DEBUG(what) true + +typedef enum virt_gpu_result_t { + APIR_SUCCESS = 0, + APIR_ERROR_INITIALIZATION_FAILED = -1, +} virt_gpu_result_t; + + +struct remoting_dev_instance { + int yes; +}; + +#define PRINTFLIKE(f, a) __attribute__ ((format(__printf__, f, a))) + +inline void +vn_log(struct remoting_dev_instance *instance, const char *format, ...) 
+ PRINTFLIKE(2, 3); + + +struct virtgpu { + struct remoting_dev_instance *instance; + + int fd; + + bool has_primary; + int primary_major; + int primary_minor; + int render_major; + int render_minor; + + int bustype; + drmPciBusInfo pci_bus_info; + + uint32_t max_timeline_count; + + struct { + enum virgl_renderer_capset id; + uint32_t version; + struct virgl_renderer_capset_venus data; + } capset; + + uint32_t shmem_blob_mem; + uint32_t bo_blob_mem; + + /* note that we use gem_handle instead of res_id to index because + * res_id is monotonically increasing by default (see + * virtio_gpu_resource_id_get) + */ + struct util_sparse_array shmem_array; + + mtx_t dma_buf_import_mutex; + + bool supports_cross_device; + + /* APIR */ + struct vn_renderer_shmem *reply_shmem; + struct vn_renderer_shmem *data_shmem; +}; + + +static inline int +virtgpu_ioctl(struct virtgpu *gpu, unsigned long request, void *args) +{ + return drmIoctl(gpu->fd, request, args); +} + +struct virtgpu *create_virtgpu(); + +struct vn_cs_encoder *remote_call_prepare( + struct virtgpu *gpu, + ApirCommandType apir_cmd_type, + int32_t cmd_flags); + +uint32_t remote_call( + struct virtgpu *gpu, + struct vn_cs_encoder *enc, + struct vn_cs_decoder **dec, + float max_wait_ms, + long long *call_duration_ns +); + +void remote_call_finish( + struct virtgpu *gpu, + struct vn_cs_encoder *enc, + struct vn_cs_decoder *dec);
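
For reference, below is a minimal sketch of how the apir_* forward helpers compose the three primitives declared above, with the REMOTE_CALL_PREPARE / REMOTE_CALL macros from virtgpu-forward-impl.h expanded by hand. The device-count command serves as the example; the example_forward_device_get_count name is illustrative, every other identifier comes from the patch, and error reporting is abbreviated.

    // Sketch only: an apir_* forward call with the macros expanded by hand.
    static int32_t
    example_forward_device_get_count(struct virtgpu *gpu)
    {
        // REMOTE_CALL_PREPARE: wrap the forwarded command type into the Forward hypercall
        int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_GET_COUNT;
        struct vn_cs_encoder *encoder =
            remote_call_prepare(gpu, APIR_COMMAND_TYPE_Forward, forward_flag);
        if (!encoder)
            FATAL("%s: failed to prepare the remote call encoder :/", __func__);

        /* command-specific arguments would be vn_encode_*()'d here */

        // REMOTE_CALL: kick the hypercall and check the forward return code
        struct vn_cs_decoder *decoder;
        ApirForwardReturnCode ret =
            (ApirForwardReturnCode) remote_call(gpu, encoder, &decoder, 0, NULL);
        if (!decoder)
            FATAL("%s: failed to kick the remote call :/", __func__);
        if (ret < APIR_FORWARD_BASE_INDEX)
            FATAL("%s: failed to forward the API call: code %d", __func__, ret);
        // (the macro then rebases ret by subtracting APIR_FORWARD_BASE_INDEX)

        // reply payload follows in the decoder
        int32_t dev_count;
        vn_decode_int32_t(decoder, &dev_count);

        remote_call_finish(gpu, encoder, decoder);
        return dev_count;
    }

Internally, remote_call() submits the encoded command through DRM_IOCTL_VIRTGPU_EXECBUFFER and busy-waits on the atomic notification word at the start of the reply shmem, as implemented in virtgpu.cpp above.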