75 changes: 75 additions & 0 deletions custom_ops/gpu_ops/cpp_extensions.cc
@@ -12,6 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <stdexcept>
#include <string>

#include "paddle/extension.h"
#include "pybind11/pybind11.h"
namespace py = pybind11;
@@ -49,6 +55,59 @@ void cuda_host_free(uintptr_t ptr) {
  check_cuda_error(cudaFreeHost(reinterpret_cast<void*>(ptr)));
}

// Create a shared memory region and register it with CUDA
// The pinned shm can be shared between processes
uintptr_t create_pinned_shm(const char* shm_name, size_t byte_size) {
  int fd = shm_open(shm_name, O_CREAT | O_RDWR, 0666);
  if (fd < 0) throw std::runtime_error("shm_open failed");

  if (ftruncate(fd, byte_size) != 0) {
    close(fd);
    throw std::runtime_error("ftruncate failed");
  }

  void* addr =
      mmap(nullptr, byte_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  if (addr == MAP_FAILED) {
    close(fd);
    throw std::runtime_error("mmap failed");
  }

  check_cuda_error(cudaHostRegister(addr, byte_size, cudaHostRegisterPortable));

  close(fd);
Comment on lines +75 to +78
Copilot AI Dec 1, 2025

[nitpick] The close(fd) call should be done before line 76 (after the mmap), not after cudaHostRegister. File descriptors from shm_open can be closed immediately after mmap completes, as the mapping maintains its own reference. The current placement is correct but the ordering could be improved for clarity. Consider moving it right after mmap (line 70-75 area) to follow best practices.

Suggested change
-  check_cuda_error(cudaHostRegister(addr, byte_size, cudaHostRegisterPortable));
-  close(fd);
+  close(fd);
+  check_cuda_error(cudaHostRegister(addr, byte_size, cudaHostRegisterPortable));

  return reinterpret_cast<uintptr_t>(addr);
}
Comment on lines +60 to +80
Copilot AI Dec 1, 2025

Resource leak potential: If cudaHostRegister fails (line 76), the shared memory created by shm_open, ftruncate, and mmap is not cleaned up. Consider adding error handling to unmap and unlink the shared memory on failure:

if (cudaHostRegister(addr, byte_size, cudaHostRegisterPortable) != cudaSuccess) {
  munmap(addr, byte_size);
  shm_unlink(shm_name);
  close(fd);
  throw std::runtime_error("cudaHostRegister failed");
}

Note: close(fd) should also be called before line 76 to avoid leaking the file descriptor if registration fails.


uintptr_t open_pinned_shm(const char* shm_name, size_t byte_size) {
  int fd = shm_open(shm_name, O_RDWR, 0666);
  if (fd < 0) throw std::runtime_error("shm_open failed");

  void* addr =
      mmap(nullptr, byte_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  if (addr == MAP_FAILED) {
    close(fd);
    throw std::runtime_error("mmap failed");
  }

  check_cuda_error(cudaHostRegister(addr, byte_size, cudaHostRegisterPortable));

  close(fd);
  return reinterpret_cast<uintptr_t>(addr);
}

void free_pinned_shm(const char* shm_name,
                     uintptr_t addr_uint,
                     size_t byte_size) {
  void* addr = reinterpret_cast<void*>(addr_uint);

  check_cuda_error(cudaHostUnregister(addr));

  if (munmap(addr, byte_size) != 0) throw std::runtime_error("munmap failed");

  if (shm_unlink(shm_name) != 0) throw std::runtime_error("shm_unlink failed");
}
Comment on lines +58 to +109
Copilot AI Dec 1, 2025

The new C++ functions create_pinned_shm, open_pinned_shm, and free_pinned_shm in cpp_extensions.cc lack unit test coverage. Consider adding tests to verify:

  1. Successful creation and opening of shared memory
  2. Error handling when operations fail (e.g., shm_open, mmap, cudaHostRegister failures)
  3. Proper cleanup and resource release
  4. Concurrent access scenarios

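A minimal pytest-style sketch of the kind of test suggested above. The import path of the compiled extension (here assumed to be fastdeploy_ops), the shm name, and the presence of a CUDA runtime are assumptions; concurrent-access and failure-injection scenarios would still need separate coverage.

import ctypes

import pytest

fastdeploy_ops = pytest.importorskip("fastdeploy_ops")


def test_pinned_shm_create_open_free():
    name = "/fd_pinned_shm_unittest"  # hypothetical shm name
    size = 4096

    addr = fastdeploy_ops.create_pinned_shm(name, size)
    # Second mapping of the same region, as a consumer process would obtain it.
    peer = fastdeploy_ops.open_pinned_shm(name, size)

    # Bytes written through one mapping must be visible through the other.
    ctypes.memset(addr, 0xAB, size)
    view = (ctypes.c_ubyte * size).from_address(peer)
    assert view[0] == 0xAB and view[size - 1] == 0xAB

    # Unregister/unmap/unlink via the creator's mapping; the name is gone
    # afterwards, so the second mapping is deliberately not freed the same way.
    fastdeploy_ops.free_pinned_shm(name, addr, size)


def test_open_missing_shm_raises():
    # shm_open failure surfaces as std::runtime_error, which pybind11 maps to RuntimeError.
    with pytest.raises(RuntimeError):
        fastdeploy_ops.open_pinned_shm("/fd_pinned_shm_missing", 4096)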

std::vector<paddle::Tensor> AppendAttention(
    const paddle::Tensor& qkv,
    const paddle::Tensor& key_cache,
@@ -1146,6 +1205,22 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
py::arg("flags") = cudaHostAllocDefault);
m.def(
"cuda_host_free", &cuda_host_free, "Free pinned memory", py::arg("ptr"));
m.def("create_pinned_shm",
&create_pinned_shm,
"Allocate pinned memory for supporting inter process communication",
py::arg("name"),
py::arg("byte_size"));
m.def("open_pinned_shm",
&open_pinned_shm,
"Open pinned memory which has been allocated by another process",
py::arg("name"),
py::arg("byte_size"));
m.def("free_pinned_shm",
&free_pinned_shm,
"Free pinned memory which supports inter process communication",
py::arg("name"),
py::arg("addr_uint"),
py::arg("byte_size"));
py::register_exception<CudaError>(m, "CudaError");
/**
* append_attention.cu
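For context, a rough sketch of how these bindings might be used from Python across two cooperating processes. The fastdeploy_ops import path, the shm name, the buffer size, and the numpy view helper are illustrative assumptions, not part of this diff.

import ctypes

import numpy as np

from fastdeploy_ops import create_pinned_shm, open_pinned_shm, free_pinned_shm

SHM_NAME = "/fd_kv_swap_buffer"  # hypothetical; both processes must agree on it
NBYTES = 1 << 20                 # 1 MiB


def as_uint8_array(addr: int, nbytes: int) -> np.ndarray:
    # Zero-copy numpy view over the raw pinned host address.
    return np.frombuffer((ctypes.c_ubyte * nbytes).from_address(addr), dtype=np.uint8)


# Process A (producer): create the pinned, CUDA-registered region and stage data in it.
addr = create_pinned_shm(SHM_NAME, NBYTES)
as_uint8_array(addr, NBYTES)[:] = 0

# Process B (consumer): map the same region; writes from A are visible here, and the
# pointer is page-locked, so async host/device copies can take the pinned fast path.
peer_addr = open_pinned_shm(SHM_NAME, NBYTES)
data = as_uint8_array(peer_addr, NBYTES)

# Whichever process owns the lifetime unregisters, unmaps, and unlinks the region once.
free_pinned_shm(SHM_NAME, addr, NBYTES)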
1 change: 1 addition & 0 deletions fastdeploy/cache_manager/cache_data.py
@@ -42,6 +42,7 @@ class CacheStatus(Enum):
    SWAP2CPU = 1
    SWAP2GPU = 2
    CPU = 3
    SPLITWISE_CPU2GPU = 4


class BlockNode: