2525#include " ibuffered_device.h"
2626#include " logger/logger.h"
2727
// Bytes moved per thread per iteration: one pair of 64-bit words.
#define CUDA_TRANS_UNIT_SIZE (2 * sizeof(uint64_t))
// Launch geometry of the batched copy kernels: blocks per grid ...
#define CUDA_TRANS_BLOCK_NUMBER (32)
// ... and threads per block.
#define CUDA_TRANS_BLOCK_SIZE (256)
// Total number of threads in one launch with the geometry above.
#define CUDA_TRANS_THREAD_NUMBER (CUDA_TRANS_BLOCK_NUMBER * CUDA_TRANS_BLOCK_SIZE)
32+
// Copies one 16-byte unit (two 64-bit words) from `src` to `dst` with a single
// vectorized load and a single vectorized store.
//
// The load uses the `.cs` cache operator and the store uses `.cg`; both address
// the global state space. Vectorized v2.u64 accesses require `src` and `dst` to
// be 16-byte aligned.
// NOTE(review): going by the name, `src` is presumably host memory that is
// mapped into the device address space - confirm with the callers.
inline __device__ void H2DUnit(uint8_t* __restrict__ dst, const volatile uint8_t* __restrict__ src)
{
    uint64_t a, b;
    // The "memory" clobber tells the compiler that these statements touch memory
    // through their pointer operands, so it must not cache or reorder other
    // memory accesses across them.
    asm volatile("ld.global.cs.v2.u64 {%0, %1}, [%2];" : "=l"(a), "=l"(b) : "l"(src) : "memory");
    asm volatile("st.global.cg.v2.u64 [%0], {%1, %2};" ::"l"(dst), "l"(a), "l"(b) : "memory");
}
39+
// Copies one 16-byte unit (two 64-bit words) from `src` to `dst` with a single
// vectorized load and a single vectorized volatile store.
//
// The load uses the `.cs` cache operator; the store is `st.volatile`. Both
// address the global state space. Vectorized v2.u64 accesses require `src` and
// `dst` to be 16-byte aligned.
// NOTE(review): going by the name, `dst` is presumably host memory that is
// mapped into the device address space - confirm with the callers.
inline __device__ void D2HUnit(volatile uint8_t* __restrict__ dst, const uint8_t* __restrict__ src)
{
    uint64_t a, b;
    // The "memory" clobber tells the compiler that these statements touch memory
    // through their pointer operands, so it must not cache or reorder other
    // memory accesses across them.
    asm volatile("ld.global.cs.v2.u64 {%0, %1}, [%2];" : "=l"(a), "=l"(b) : "l"(src) : "memory");
    asm volatile("st.volatile.global.v2.u64 [%0], {%1, %2};" ::"l"(dst), "l"(a), "l"(b) : "memory");
}
46+
// Batched copy kernel: for every i in [0, num) copies `size` bytes from the
// buffer whose address is stored in src[i] to the buffer whose address is
// stored in dst[i].
//
// The `num` buffers are treated as one logical range of num * size bytes that
// is walked in 16-byte units with a grid-stride loop, so any 1D launch
// configuration covers the whole range.
//
// Preconditions (not checked here):
//   - `size` is a multiple of CUDA_TRANS_UNIT_SIZE; otherwise a unit would
//     straddle two buffers, reading/writing past the end of one and leaving
//     the head of the next one untouched;
//   - every address in `dst` and `src` is 16-byte aligned (required by the
//     vectorized accesses in H2DUnit);
//   - `dst` and `src` themselves are readable from the device.
__global__ void H2DKernel(uintptr_t* dst, const volatile uintptr_t* src, size_t num, size_t size)
{
    const size_t length = num * size;
    // Widen before multiplying so the index math is done in size_t throughout.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x * CUDA_TRANS_UNIT_SIZE;
    size_t offset = (static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x) * CUDA_TRANS_UNIT_SIZE;
    // When size == 0 the range is empty and the loop body (and its division by
    // size) is never reached.
    for (; offset + CUDA_TRANS_UNIT_SIZE <= length; offset += stride) {
        const size_t idx = offset / size;  // which buffer this unit belongs to
        const size_t off = offset % size;  // byte offset inside that buffer
        H2DUnit(((uint8_t*)dst[idx]) + off, ((const uint8_t*)src[idx]) + off);
    }
}
58+
// Batched copy kernel: for every i in [0, num) copies `size` bytes from the
// buffer whose address is stored in src[i] to the buffer whose address is
// stored in dst[i], using D2HUnit for each 16-byte unit.
//
// The `num` buffers are treated as one logical range of num * size bytes that
// is walked in 16-byte units with a grid-stride loop, so any 1D launch
// configuration covers the whole range.
//
// Preconditions (not checked here):
//   - `size` is a multiple of CUDA_TRANS_UNIT_SIZE; otherwise a unit would
//     straddle two buffers, reading/writing past the end of one and leaving
//     the head of the next one untouched;
//   - every address in `dst` and `src` is 16-byte aligned (required by the
//     vectorized accesses in D2HUnit);
//   - `dst` and `src` themselves are readable from the device.
__global__ void D2HKernel(volatile uintptr_t* dst, const uintptr_t* src, size_t num, size_t size)
{
    const size_t length = num * size;
    // Widen before multiplying so the index math is done in size_t throughout.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x * CUDA_TRANS_UNIT_SIZE;
    size_t offset = (static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x) * CUDA_TRANS_UNIT_SIZE;
    // When size == 0 the range is empty and the loop body (and its division by
    // size) is never reached.
    for (; offset + CUDA_TRANS_UNIT_SIZE <= length; offset += stride) {
        const size_t idx = offset / size;  // which buffer this unit belongs to
        const size_t off = offset % size;  // byte offset inside that buffer
        D2HUnit(((uint8_t*)dst[idx]) + off, ((const uint8_t*)src[idx]) + off);
    }
}
70+
// Enqueues H2DKernel on `stream` with the fixed CUDA_TRANS_BLOCK_NUMBER x
// CUDA_TRANS_BLOCK_SIZE launch geometry.
//
// dst / src : tables of `num` buffer addresses, passed through to the kernel.
// size      : number of bytes to copy per buffer.
//
// The call only enqueues work; it does not wait for the kernel. A kernel launch
// does not return an error code by itself, so the last-error state is peeked
// (not cleared, so callers can still observe it) and logged when it is set.
inline __host__ void H2DBatch(uintptr_t* dst, const volatile uintptr_t* src, size_t num,
                              size_t size, cudaStream_t stream)
{
    H2DKernel<<<CUDA_TRANS_BLOCK_NUMBER, CUDA_TRANS_BLOCK_SIZE, 0, stream>>>(dst, src, num, size);
    const cudaError_t err = cudaPeekAtLastError();
    if (err != cudaSuccess) {
        UC_ERROR("CUDA error({},{}) pending after launching H2DKernel.", static_cast<int>(err),
                 cudaGetErrorString(err));
    }
}
76+
// Enqueues D2HKernel on `stream` with the fixed CUDA_TRANS_BLOCK_NUMBER x
// CUDA_TRANS_BLOCK_SIZE launch geometry.
//
// dst / src : tables of `num` buffer addresses, passed through to the kernel.
// size      : number of bytes to copy per buffer.
//
// The call only enqueues work; it does not wait for the kernel. A kernel launch
// does not return an error code by itself, so the last-error state is peeked
// (not cleared, so callers can still observe it) and logged when it is set.
inline __host__ void D2HBatch(volatile uintptr_t* dst, const uintptr_t* src, size_t num,
                              size_t size, cudaStream_t stream)
{
    D2HKernel<<<CUDA_TRANS_BLOCK_NUMBER, CUDA_TRANS_BLOCK_SIZE, 0, stream>>>(dst, src, num, size);
    const cudaError_t err = cudaPeekAtLastError();
    if (err != cudaSuccess) {
        UC_ERROR("CUDA error({},{}) pending after launching D2HKernel.", static_cast<int>(err),
                 cudaGetErrorString(err));
    }
}
82+
2883template <>
2984struct fmt ::formatter<cudaError_t> : formatter<int32_t > {
3085 auto format (cudaError_t err, format_context& ctx) const -> format_context::iterator
@@ -39,7 +94,7 @@ template <typename Api, typename... Args>
3994Status CudaApi (const char * caller, const char * file, const size_t line, const char * name, Api&& api,
4095 Args&&... args)
4196{
42- auto ret = api ( args...);
97+ auto ret = std::invoke (api, args...);
4398 if (ret != cudaSuccess) {
4499 UC_ERROR (" CUDA ERROR: api={}, code={}, err={}, caller={},{}:{}." , name, ret,
45100 cudaGetErrorString (ret), caller, basename (file), line);
@@ -62,6 +117,22 @@ class CudaDevice : public IBufferedDevice {
62117 c->cb (ret == cudaSuccess);
63118 delete c;
64119 }
// Allocates a device-side table and fills it with the `number` pointers stored
// in the host array `hostArray`.
//
// Returns the device allocation on success, or nullptr when the allocation or
// the host-to-device copy fails (the allocation is freed in the latter case).
// The caller owns the result and must hand it to ReleaseDeviceArray.
static void* MakeDeviceArray(const void* hostArray[], const size_t number)
{
    const auto size = sizeof(void*) * number;
    // Always request room for at least one entry: a zero-byte cudaMalloc may
    // hand back a null pointer, which callers could not tell apart from the
    // nullptr that signals failure.
    const auto capacity = size != 0 ? size : sizeof(void*);
    void* deviceArray = nullptr;
    auto ret = cudaMalloc(&deviceArray, capacity);
    if (ret != cudaSuccess) {
        UC_ERROR(" Failed({},{}) to alloc({}) on device.", ret, cudaGetErrorString(ret), capacity);
        return nullptr;
    }
    // An empty table has nothing to upload.
    if (size == 0) { return deviceArray; }
    if (CUDA_API(cudaMemcpy, deviceArray, hostArray, size, cudaMemcpyHostToDevice).Success()) {
        return deviceArray;
    }
    ReleaseDeviceArray(deviceArray);
    return nullptr;
}
// Frees a device allocation through cudaFree. The resulting status is not
// propagated to the caller.
static void ReleaseDeviceArray(void* deviceArray)
{
    CUDA_API(cudaFree, deviceArray);
}
65136
66137public:
67138 CudaDevice (const int32_t deviceId, const size_t bufferSize, const size_t bufferNumber)
@@ -88,13 +159,11 @@ class CudaDevice : public IBufferedDevice {
88159 }
// Enqueues a host-to-device cudaMemcpyAsync of `count` bytes from `src` to
// `dst` on this device's stream and returns the status of the enqueue call.
Status H2DAsync(std::byte* dst, const std::byte* src, const size_t count) override
{
    auto status = CUDA_API(cudaMemcpyAsync, dst, src, count, cudaMemcpyHostToDevice, this->stream_);
    return status;
}
// Enqueues a device-to-host cudaMemcpyAsync of `count` bytes from `src` to
// `dst` on this device's stream and returns the status of the enqueue call.
Status D2HAsync(std::byte* dst, const std::byte* src, const size_t count) override
{
    auto status = CUDA_API(cudaMemcpyAsync, dst, src, count, cudaMemcpyDeviceToHost, this->stream_);
    return status;
}
99168 Status AppendCallback (std::function<void (bool )> cb) override
100169 {
@@ -103,14 +172,42 @@ class CudaDevice : public IBufferedDevice {
103172 UC_ERROR (" Failed to make closure for append cb." );
104173 return Status::OutOfMemory ();
105174 }
106- auto status =
107- CUDA_API (cudaStreamAddCallback, (cudaStream_t)this ->stream_ , Trampoline, (void *)c, 0 );
175+ auto status = CUDA_API (cudaStreamAddCallback, this ->stream_ , Trampoline, (void *)c, 0 );
108176 if (status.Failure ()) { delete c; }
109177 return status;
110178 }
// Blocks until all work queued on this device's stream has finished and
// returns the status reported by cudaStreamSynchronize.
Status Synchronized() override
{
    auto status = CUDA_API(cudaStreamSynchronize, this->stream_);
    return status;
}
// Copies `count` bytes from hArr[i] to dArr[i] for every i in [0, number) and
// waits for this device's stream before returning.
//
// dArr / hArr : host arrays holding `number` destination / source addresses.
// Returns the status of the final stream synchronization, the status of a
// failed per-buffer copy, or Status::OutOfMemory() when a device-side address
// table cannot be created.
//
// The batched kernel moves data in CUDA_TRANS_UNIT_SIZE-byte units, so it is
// only used when `count` is a multiple of that unit; other sizes fall back to
// one H2DAsync per buffer so that no tail bytes are skipped and no access runs
// past the end of a buffer.
// NOTE(review): the kernel path also needs every address to be 16-byte aligned
// and the host buffers to be addressable from the device - confirm with callers.
Status H2DBatchSync(std::byte* dArr[], const std::byte* hArr[], const size_t number,
                    const size_t count) override
{
    // Nothing to copy: an empty batch is not an allocation failure.
    if (number == 0 || count == 0) { return this->Synchronized(); }
    if (count % CUDA_TRANS_UNIT_SIZE != 0) {
        for (size_t i = 0; i < number; ++i) {
            auto copied = this->H2DAsync(dArr[i], hArr[i], count);
            if (copied.Failure()) {
                // Drain copies that were already queued before reporting the failure.
                (void)this->Synchronized();
                return copied;
            }
        }
        return this->Synchronized();
    }
    auto src = MakeDeviceArray((const void**)hArr, number);
    if (!src) { return Status::OutOfMemory(); }
    auto dst = MakeDeviceArray((const void**)dArr, number);
    if (!dst) {
        ReleaseDeviceArray(src);
        return Status::OutOfMemory();
    }
    H2DBatch((uintptr_t*)dst, (const volatile uintptr_t*)src, number, count, this->stream_);
    // The address tables must stay alive until the kernel has finished.
    auto status = this->Synchronized();
    ReleaseDeviceArray(src);
    ReleaseDeviceArray(dst);
    return status;
}
// Copies `count` bytes from dArr[i] to hArr[i] for every i in [0, number) and
// waits for this device's stream before returning.
//
// hArr / dArr : host arrays holding `number` destination / source addresses.
// Returns the status of the final stream synchronization, the status of a
// failed per-buffer copy, or Status::OutOfMemory() when a device-side address
// table cannot be created.
//
// The batched kernel moves data in CUDA_TRANS_UNIT_SIZE-byte units, so it is
// only used when `count` is a multiple of that unit; other sizes fall back to
// one D2HAsync per buffer so that no tail bytes are skipped and no access runs
// past the end of a buffer.
// NOTE(review): the kernel path also needs every address to be 16-byte aligned
// and the host buffers to be addressable from the device - confirm with callers.
Status D2HBatchSync(std::byte* hArr[], const std::byte* dArr[], const size_t number,
                    const size_t count) override
{
    // Nothing to copy: an empty batch is not an allocation failure.
    if (number == 0 || count == 0) { return this->Synchronized(); }
    if (count % CUDA_TRANS_UNIT_SIZE != 0) {
        for (size_t i = 0; i < number; ++i) {
            auto copied = this->D2HAsync(hArr[i], dArr[i], count);
            if (copied.Failure()) {
                // Drain copies that were already queued before reporting the failure.
                (void)this->Synchronized();
                return copied;
            }
        }
        return this->Synchronized();
    }
    auto src = MakeDeviceArray((const void**)dArr, number);
    if (!src) { return Status::OutOfMemory(); }
    auto dst = MakeDeviceArray((const void**)hArr, number);
    if (!dst) {
        ReleaseDeviceArray(src);
        return Status::OutOfMemory();
    }
    D2HBatch((volatile uintptr_t*)dst, (const uintptr_t*)src, number, count, this->stream_);
    // The address tables must stay alive until the kernel has finished.
    auto status = this->Synchronized();
    ReleaseDeviceArray(src);
    ReleaseDeviceArray(dst);
    return status;
}
115212
116213protected:
@@ -126,7 +223,7 @@ class CudaDevice : public IBufferedDevice {
126223 }
127224
128225private:
129- void * stream_;
226+ cudaStream_t stream_;
130227};
131228
132229std::unique_ptr<IDevice> DeviceFactory::Make (const int32_t deviceId, const size_t bufferSize,
0 commit comments