@@ -30,7 +30,8 @@ DEFINE_bool(use_nvprof, false, "Start / stop nvprof");
3030
3131namespace {
3232
33- std::tuple<std::vector<std::string>, std::vector<size_t >> init () {
33+ std::tuple<std::vector<std::string>, std::vector<size_t >, std::vector<size_t >>
34+ init () {
3435 int deviceCount = 0 ;
3536 auto err_id = cudaGetDeviceCount (&deviceCount);
3637 if (err_id == 35 or err_id == 30 ) {
@@ -44,14 +45,16 @@ std::tuple<std::vector<std::string>, std::vector<size_t>> init() {
4445 }
4546 std::vector<std::string> gpuNames;
4647 std::vector<size_t > sharedMemSizes;
48+ std::vector<size_t > registersPerBlock;
4749 gpuNames.reserve (deviceCount);
4850 for (int i = 0 ; i < deviceCount; ++i) {
4951 cudaDeviceProp deviceProp;
5052 TC_CUDA_RUNTIMEAPI_ENFORCE (cudaGetDeviceProperties (&deviceProp, i));
5153 gpuNames.emplace_back (deviceProp.name );
5254 sharedMemSizes.emplace_back (deviceProp.sharedMemPerBlock );
55+ registersPerBlock.emplace_back (deviceProp.regsPerBlock );
5356 }
54- return std::make_tuple (gpuNames, sharedMemSizes);
57+ return std::make_tuple (gpuNames, sharedMemSizes, registersPerBlock );
5558}
5659
5760} // namespace
@@ -61,8 +64,8 @@ CudaGPUInfo& CudaGPUInfo::GPUInfo() {
6164 static thread_local bool inited = false ;
6265 if (!inited) {
6366 auto infos = init ();
64- pInfo = std::unique_ptr<CudaGPUInfo>(
65- new CudaGPUInfo ( std::get<0 >(infos), std::get<1 >(infos)));
67+ pInfo = std::unique_ptr<CudaGPUInfo>(new CudaGPUInfo (
68+ std::get<0 >(infos), std::get<1 >(infos), std::get< 2 >(infos)));
6669 inited = true ;
6770 }
6871 return *pInfo;
@@ -102,4 +105,11 @@ size_t CudaGPUInfo::SharedMemorySize() const {
102105 }
103106 return sharedMemSizes_.at (CurrentGPUId ());
104107}
108+
109+ size_t CudaGPUInfo::RegistersPerBlock () const {
110+ if (NumberGPUs () == 0 ) {
111+ return 0 ; // no registers if no GPUs
112+ }
113+ return registersPerBlock_.at (CurrentGPUId ());
114+ }
105115} // namespace tc
0 commit comments