-
Notifications
You must be signed in to change notification settings - Fork 661
[PD Disaggregation] [tmp] decode use cpu buffer to receive cache from prefill #5308
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,6 +12,12 @@ | |
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
|
|
||
| #include <fcntl.h> | ||
| #include <sys/mman.h> | ||
| #include <unistd.h> | ||
| #include <stdexcept> | ||
| #include <string> | ||
|
|
||
| #include "paddle/extension.h" | ||
| #include "pybind11/pybind11.h" | ||
| namespace py = pybind11; | ||
|
|
@@ -49,6 +55,59 @@ void cuda_host_free(uintptr_t ptr) { | |
| check_cuda_error(cudaFreeHost(reinterpret_cast<void*>(ptr))); | ||
| } | ||
|
|
||
| // Create a shared memory region and register it with CUDA | ||
| // The pinned shm can be shared between processes | ||
| uintptr_t create_pinned_shm(const char* shm_name, size_t byte_size) { | ||
| int fd = shm_open(shm_name, O_CREAT | O_RDWR, 0666); | ||
| if (fd < 0) throw std::runtime_error("shm_open failed"); | ||
|
|
||
| if (ftruncate(fd, byte_size) != 0) { | ||
| close(fd); | ||
| throw std::runtime_error("ftruncate failed"); | ||
| } | ||
|
|
||
| void* addr = | ||
| mmap(nullptr, byte_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); | ||
| if (addr == MAP_FAILED) { | ||
| close(fd); | ||
| throw std::runtime_error("mmap failed"); | ||
| } | ||
|
|
||
| check_cuda_error(cudaHostRegister(addr, byte_size, cudaHostRegisterPortable)); | ||
|
|
||
| close(fd); | ||
| return reinterpret_cast<uintptr_t>(addr); | ||
| } | ||
|
Comment on lines
+60
to
+80
|
||
|
|
||
| uintptr_t open_pinned_shm(const char* shm_name, size_t byte_size) { | ||
| int fd = shm_open(shm_name, O_RDWR, 0666); | ||
| if (fd < 0) throw std::runtime_error("shm_open failed"); | ||
|
|
||
| void* addr = | ||
| mmap(nullptr, byte_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); | ||
| if (addr == MAP_FAILED) { | ||
| close(fd); | ||
| throw std::runtime_error("mmap failed"); | ||
| } | ||
|
|
||
| check_cuda_error(cudaHostRegister(addr, byte_size, cudaHostRegisterPortable)); | ||
juncaipeng marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| close(fd); | ||
| return reinterpret_cast<uintptr_t>(addr); | ||
| } | ||
|
|
||
| void free_pinned_shm(const char* shm_name, | ||
| uintptr_t addr_uint, | ||
| size_t byte_size) { | ||
| void* addr = reinterpret_cast<void*>(addr_uint); | ||
|
|
||
| check_cuda_error(cudaHostUnregister(addr)); | ||
|
|
||
| if (munmap(addr, byte_size) != 0) throw std::runtime_error("munmap failed"); | ||
|
|
||
| if (shm_unlink(shm_name) != 0) throw std::runtime_error("shm_unlink failed"); | ||
| } | ||
|
Comment on lines
+58
to
+109
|
||
|
|
||
| std::vector<paddle::Tensor> AppendAttention( | ||
| const paddle::Tensor& qkv, | ||
| const paddle::Tensor& key_cache, | ||
|
|
@@ -1146,6 +1205,22 @@ PYBIND11_MODULE(fastdeploy_ops, m) { | |
| py::arg("flags") = cudaHostAllocDefault); | ||
| m.def( | ||
| "cuda_host_free", &cuda_host_free, "Free pinned memory", py::arg("ptr")); | ||
| m.def("create_pinned_shm", | ||
| &create_pinned_shm, | ||
| "Allocate pinned memory for supporting inter process communication", | ||
| py::arg("name"), | ||
| py::arg("byte_size")); | ||
| m.def("open_pinned_shm", | ||
| &open_pinned_shm, | ||
| "Open pinned memory which has been allocated by another process", | ||
| py::arg("name"), | ||
| py::arg("byte_size")); | ||
| m.def("free_pinned_shm", | ||
| &free_pinned_shm, | ||
| "Free pinned memory which supports inter process communication", | ||
| py::arg("name"), | ||
| py::arg("addr_uint"), | ||
| py::arg("byte_size")); | ||
| py::register_exception<CudaError>(m, "CudaError"); | ||
| /** | ||
| * append_attention.cu | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -42,6 +42,7 @@ class CacheStatus(Enum): | |
| SWAP2CPU = 1 | ||
| SWAP2GPU = 2 | ||
| CPU = 3 | ||
| SPLITWISE_CPU2GPU = 4 | ||
|
|
||
|
|
||
| class BlockNode: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[nitpick] The `close(fd)` call could be moved to directly after the `mmap` call (around lines 70-75) instead of after `cudaHostRegister`. A file descriptor from `shm_open` can be closed as soon as `mmap` completes, because the mapping maintains its own reference to the shm object. The current placement is functionally correct, but closing earlier is clearer and also avoids leaking the fd if `cudaHostRegister` throws.