ggml-org · kpouget · Aug 29, 2025 · Nov 3, 2025
diff --git a/CMakePresets.json b/CMakePresets.json
@@ -30,6 +30,8 @@
     { "name": "static",   "hidden": true, "cacheVariables": { "GGML_STATIC":      "ON" } },
     { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16":    "ON" } },
     { "name": "vulkan",   "hidden": true, "cacheVariables": { "GGML_VULKAN":      "ON" } },
+    { "name": "remoting_frontend",   "hidden": true, "cacheVariables": { "GGML_REMOTING_FRONTEND":      "ON" } },
+    { "name": "remoting_backend",   "hidden": true, "cacheVariables": { "GGML_REMOTING_BACKEND":      "ON" } },
 
     {
         "name": "x64-windows-llvm", "hidden": true,

diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
@@ -226,6 +226,8 @@ option(GGML_WEBGPU_CPU_PROFILE              "ggml: enable WebGPU profiling (CPU)
 option(GGML_WEBGPU_GPU_PROFILE              "ggml: enable WebGPU profiling (GPU)"             OFF)
 
 option(GGML_ZDNN                            "ggml: use zDNN"                                  OFF)
+option(GGML_REMOTING_FRONTEND               "ggml: use the API Remoting frontend"             OFF)
+option(GGML_REMOTING_BACKEND                "ggml: use the API Remoting backend"              OFF)
 option(GGML_METAL                           "ggml: use Metal"                                 ${GGML_METAL_DEFAULT})
 option(GGML_METAL_NDEBUG                    "ggml: disable Metal debugging"                   OFF)
 option(GGML_METAL_SHADER_DEBUG              "ggml: compile Metal with -fno-fast-math"         OFF)
@@ -317,6 +319,7 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-sycl.h
     include/ggml-vulkan.h
     include/ggml-webgpu.h
+    include/ggml-remoting-frontend.h
     include/gguf.h)
 
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")

@@ -0,0 +1,16 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#define GGML_REMOTING_FRONTEND_NAME "RemotingFrontend"
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_remoting_frontend_reg();
+
+#ifdef  __cplusplus
+}
+#endif
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
@@ -406,6 +406,8 @@ ggml_add_backend(WebGPU)
 ggml_add_backend(zDNN)
 ggml_add_backend(OpenCL)
 ggml_add_backend(Hexagon)
+ggml_add_backend(RemotingFrontend)
+ggml_add_backend(RemotingBackend)
 
 foreach (target ggml-base ggml)
     target_include_directories(${target} PUBLIC    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)

@@ -73,6 +73,10 @@
 #include "ggml-cann.h"
 #endif
 
+#ifdef GGML_USE_REMOTINGFRONTEND
+#include "ggml-remoting-frontend.h"
+#endif
+
 // disable C++17 deprecation warning for std::codecvt_utf8
 #if defined(__clang__)
 #    pragma clang diagnostic push
@@ -200,6 +204,10 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_ZDNN
         register_backend(ggml_backend_zdnn_reg());
 #endif
+#ifdef GGML_USE_REMOTINGFRONTEND
+        register_backend(ggml_backend_remoting_frontend_reg());
+#endif
+
 #ifdef GGML_USE_OPENCL
         register_backend(ggml_backend_opencl_reg());
 #endif
@@ -604,6 +612,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
     ggml_backend_load_best("rpc", silent, dir_path);
     ggml_backend_load_best("sycl", silent, dir_path);
     ggml_backend_load_best("vulkan", silent, dir_path);
+    ggml_backend_load_best("remoting_frontend", silent, dir_path);
     ggml_backend_load_best("opencl", silent, dir_path);
     ggml_backend_load_best("hexagon", silent, dir_path);
     ggml_backend_load_best("musa", silent, dir_path);

@@ -11,6 +11,7 @@ ggml_add_backend_library(ggml-metal
                          ggml-metal-common.cpp
                          ggml-metal-context.m
                          ggml-metal-ops.cpp
+                         ggml-metal-remoting.cpp
                         )
 
 target_link_libraries(ggml-metal PRIVATE

@@ -0,0 +1,28 @@
+#include "ggml-backend.h"
+#include "ggml-backend-impl.h"
+#include "ggml-impl.h"
+
+#include "ggml-metal-device.h"
+#include "ggml-metal-impl.h"
+#include "ggml-metal-context.h"
+
+extern "C" {
+  GGML_BACKEND_API void ggml_backend_metal_get_device_context(ggml_backend_dev_t dev,
+							      bool *has_simdgroup_mm,
+							      bool *has_simdgroup_reduction,
+							      bool *use_bfloat);
+
+  GGML_BACKEND_API void
+  ggml_backend_metal_get_device_context(ggml_backend_dev_t dev,
+					bool *has_simdgroup_mm,
+					bool *has_simdgroup_reduction,
+					bool *has_bfloat) {
+    ggml_metal_device_t dev_ctx = (ggml_metal_device_t)dev->context;
+
+    const struct ggml_metal_device_props *props = ggml_metal_device_get_props(dev_ctx);
+
+    *has_bfloat = props->has_bfloat;
+    *has_simdgroup_reduction = props->has_simdgroup_reduction;
+    *has_simdgroup_mm = props->has_simdgroup_mm;
+  }
+}
diff --git a/ggml/src/ggml-remotingbackend/CMakeLists.txt b/ggml/src/ggml-remotingbackend/CMakeLists.txt
@@ -0,0 +1,21 @@
+cmake_minimum_required(VERSION 3.19)
+cmake_policy(SET CMP0114 NEW)
+
+message(STATUS "Enable API Remoting backend")
+
+ggml_add_backend_library(ggml-remotingbackend
+                         backend.cpp
+                         backend-dispatched.cpp
+                         backend-dispatched-backend.cpp
+                         backend-dispatched-device.cpp
+                         backend-dispatched-buffer.cpp
+                         backend-dispatched-buffer-type.cpp
+                         backend-dispatched-metal.cpp
+                         backend-utils.cpp
+                         shared/api_remoting.h
+                         shared/apir_backend.h
+                         shared/venus_cs.h
+                         venus_cs_ggml-rpc-back.cpp
+                        )
+
+target_compile_options(ggml-remotingbackend PRIVATE -std=c++20)
diff --git a/ggml/src/ggml-remotingbackend/backend-convert.h b/ggml/src/ggml-remotingbackend/backend-convert.h
@@ -0,0 +1,15 @@
+#include "shared/apir_backend.h"
+
+#define BUFFER_TO_HOST_HANDLE(name) ggml_buffer_to_apir_handle(name)
+
+static inline apir_buffer_host_handle_t
+ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) {
+  // in the backend, the buffer handle is the buffer pointer
+  return (apir_buffer_host_handle_t) buffer;
+}
+
+static inline apir_buffer_type_host_handle_t
+ggml_buffer_type_to_apir_handle(ggml_backend_buffer_type_t buft) {
+  // in the backend, the buffer handle is the buffer pointer
+  return (apir_buffer_type_host_handle_t) buft;
+}
diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp
@@ -0,0 +1,58 @@
+#include <cstdint>
+#include "backend-internal.h"
+#include "backend-dispatched.h"
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+
+#include "shared/apir_backend.h"
+
+struct timer_data graph_compute_timer = {0, 0, 0, "compute_timer"};
+
+uint32_t
+backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  UNUSED(ctx);
+  UNUSED(enc);
+
+  start_timer(&graph_compute_timer);
+
+  uint32_t shmem_res_id;
+  vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id);
+
+  const void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id);
+  if (!shmem_data) {
+    FATAL("Couldn't get the shmem addr from virgl :/");
+  }
+  size_t cgraph_size;
+  vn_decode_size_t(dec, &cgraph_size);
+
+  struct vn_cs_decoder secondary_dec = vn_cs_new_decoder((const char *) shmem_data, cgraph_size);
+
+  ggml_cgraph *cgraph = vn_decode_ggml_cgraph(&secondary_dec, cgraph_size);
+
+  ggml_status status;
+#if APIR_BACKEND_CHECK_SUPPORTS_OP == 1
+  for (int idx = 0; idx < cgraph->n_nodes; idx++) {
+    ggml_tensor *op = ggml_graph_node(cgraph, idx);
+    if (dev->iface.supports_op(dev, op)) {
+      continue;
+    }
+    ERROR("Graph node %d (%s) not supported by the backend :/", idx, ggml_op_desc(op));
+
+    status = GGML_STATUS_ABORTED;
+    vn_encode_ggml_status(enc, &status);
+
+    stop_timer(&graph_compute_timer);
+    return 0;
+  }
+#endif
+  status = bck->iface.graph_compute(bck, cgraph);
+  bck->iface.synchronize(bck);
+
+  vn_encode_ggml_status(enc, &status);
+
+  stop_timer(&graph_compute_timer);
+
+  return 0;
+}
diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp
@@ -0,0 +1,81 @@
+#include <cstdint>
+#include "backend-internal.h"
+#include "backend-dispatched.h"
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+
+uint32_t
+backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  UNUSED(ctx);
+  ggml_backend_buffer_type_t buft;
+  buft = vn_decode_ggml_buffer_type(dec);
+
+  const char *string = buft->iface.get_name(buft);
+
+  const size_t string_size = strlen(string) + 1;
+  vn_encode_array_size(enc, string_size);
+  vn_encode_char_array(enc, string, string_size);
+
+  return 0;
+}
+
+uint32_t
+backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  UNUSED(ctx);
+  ggml_backend_buffer_type_t buft;
+  buft = vn_decode_ggml_buffer_type(dec);
+
+  size_t value = buft->iface.get_alignment(buft);
+  vn_encode_size_t(enc, &value);
+
+  return 0;
+}
+
+uint32_t
+backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  UNUSED(ctx);
+  ggml_backend_buffer_type_t buft;
+  buft = vn_decode_ggml_buffer_type(dec);
+
+  size_t value = buft->iface.get_max_size(buft);
+  vn_encode_size_t(enc, &value);
+
+  return 0;
+}
+
+uint32_t
+backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  UNUSED(ctx);
+  ggml_backend_buffer_type_t buft;
+  buft = vn_decode_ggml_buffer_type(dec);
+
+  bool is_host = buft->iface.is_host(buft);
+  vn_encode_bool_t(enc, &is_host);
+
+  return 0;
+}
+
+uint32_t
+backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  UNUSED(ctx);
+
+  ggml_backend_buffer_type_t buft;
+  buft = vn_decode_ggml_buffer_type(dec);
+
+  size_t size;
+  vn_decode_size_t(dec, &size);
+
+  ggml_backend_buffer_t buffer;
+
+  buffer = buft->iface.alloc_buffer(buft, size);
+
+  vn_encode_ggml_buffer(enc, buffer);
+
+  if (buffer) {
+    track_backend_buffer(buffer);
+  }
+
+  return 0;
+}