
Commit f39d729

Author: Hexu Zhao (committed)
1. Update the rank and world size handling: now that data parallelism (DP) has been added and runs span multiple nodes, there are more types of ranks and corresponding process groups. 2. Delete the timer output to the screen.
1 parent dd2fcd8 commit f39d729
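
For readers unfamiliar with the rank terminology this commit moves to: the sketch below is a minimal, hypothetical illustration of how a global rank, a node-local rank, and a model-parallel (MP) rank and world size can be derived with torch.distributed once DP is added and a run spans several nodes. The group layout, environment variables, and function name are assumptions for illustration, not this repository's actual setup.

# Hypothetical sketch of the rank types involved when DP is added and
# training spans multiple nodes. The group layout is an assumption for
# illustration, not the exact setup used by this repository.
import os
import torch.distributed as dist

def setup_ranks(mp_size):
    dist.init_process_group(backend="nccl")      # reads RANK / WORLD_SIZE / MASTER_ADDR from the launcher
    global_rank = dist.get_rank()                # unique across all nodes and processes
    world_size = dist.get_world_size()           # total number of processes
    local_rank = int(os.environ["LOCAL_RANK"])   # index of this process on its own node (its GPU index)

    # Partition the global ranks into model-parallel (MP) groups of size mp_size;
    # processes in the same MP group cooperate on one model replica, and the
    # replicas themselves form the data-parallel (DP) dimension.
    mp_group = None
    for start in range(0, world_size, mp_size):
        ranks = list(range(start, start + mp_size))
        group = dist.new_group(ranks)            # every process must call new_group for every group
        if global_rank in ranks:
            mp_group = group
    mp_rank = dist.get_rank(group=mp_group)
    mp_world_size = dist.get_world_size(group=mp_group)
    return global_rank, local_rank, mp_rank, mp_world_size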

File tree: 7 files changed, +49 −63 lines


cuda_rasterizer/forward.cu

Lines changed: 3 additions & 9 deletions
@@ -178,9 +178,7 @@ __global__ void preprocessCUDA(int P, int D, int M,
     float4* conic_opacity,
     const dim3 grid,
     uint32_t* tiles_touched,
-    bool prefiltered,
-    int local_rank,
-    int world_size)
+    bool prefiltered)
 {
     auto idx = cg::this_grid().thread_rank();
     if (idx >= P)
@@ -468,9 +466,7 @@ void FORWARD::preprocess(int P, int D, int M,
     float4* conic_opacity,
     const dim3 grid,
     uint32_t* tiles_touched,
-    bool prefiltered,
-    int local_rank,
-    int world_size)
+    bool prefiltered)
 {
     preprocessCUDA<NUM_CHANNELS> << <(P + ONE_DIM_BLOCK_SIZE - 1) / ONE_DIM_BLOCK_SIZE, ONE_DIM_BLOCK_SIZE >> > (
         P, D, M,
@@ -497,8 +493,6 @@ void FORWARD::preprocess(int P, int D, int M,
         conic_opacity,
         grid,
         tiles_touched,
-        prefiltered,
-        local_rank,
-        world_size
+        prefiltered
     );
 }

cuda_rasterizer/forward.h

Lines changed: 1 addition & 3 deletions
@@ -45,9 +45,7 @@ namespace FORWARD
     float4* conic_opacity,
     const dim3 grid,
     uint32_t* tiles_touched,
-    bool prefiltered,
-    int local_rank,
-    int world_size);
+    bool prefiltered);

     // Main rasterization method.
     void render(

cuda_rasterizer/rasterizer_impl.cu

Lines changed: 24 additions & 30 deletions
@@ -80,9 +80,7 @@ __global__ void duplicateWithKeys(
     uint32_t* gaussian_values_unsorted,
     int* radii,
     bool* compute_locally,
-    dim3 grid,
-    int local_rank,
-    int world_size)
+    dim3 grid)
 {
     auto idx = cg::this_grid().thread_rank();
     if (idx >= P)
@@ -289,9 +287,9 @@ __global__ void updateTileTouched(
     tiles_touched[idx] = cnt;
 }

-void save_log_in_file(int iteration, int local_rank, int world_size, std::string log_folder, const char* filename_prefix, const char* log_content) {
-    char* filename = new char[128];
-    sprintf(filename, "%s/%s_ws=%d_rk=%d.log", log_folder.c_str(), filename_prefix, world_size, local_rank);
+void save_log_in_file(int iteration, int global_rank, int world_size, std::string log_folder, const char* filename_prefix, const char* log_content) {
+    char* filename = new char[256];
+    sprintf(filename, "%s/%s_ws=%d_rk=%d.log", log_folder.c_str(), filename_prefix, world_size, global_rank);
     std::ofstream outfile;
     outfile.open(filename, std::ios_base::app);
     outfile << "iteration: " << iteration << ", " << log_content << "\n";
@@ -307,7 +305,7 @@ void save_log_in_file(int iteration, int local_rank, int world_size, std::string
 std::tuple<int, int, int, int, int, bool, bool, std::string, std::string, std::string>
 prepareArgs(const pybind11::dict &args) {
     std::string mode = args["mode"].cast<std::string>();
-    std::string local_rank_str = args["local_rank"].cast<std::string>();
+    std::string global_rank_str = args["global_rank"].cast<std::string>();
     std::string world_size_str = args["world_size"].cast<std::string>();
     std::string iteration_str = args["iteration"].cast<std::string>();
     std::string log_interval_str = args["log_interval"].cast<std::string>();
@@ -317,7 +315,7 @@ std::tuple<int, int, int, int, int, bool, bool, std::string, std::string, std::s
     // std::string dist_division_mode_str = args["dist_division_mode"].cast<std::string>();
     std::string dist_division_mode_str = "";

-    int local_rank = std::stoi(local_rank_str);
+    int global_rank = std::stoi(global_rank_str);
     int world_size = std::stoi(world_size_str);
     int iteration = std::stoi(iteration_str);
     int log_interval = std::stoi(log_interval_str);
@@ -328,7 +326,7 @@ std::tuple<int, int, int, int, int, bool, bool, std::string, std::string, std::s
     cudaError_t status = cudaGetDevice(&device);

     // Pack and return the variables in a tuple
-    return std::make_tuple(local_rank, world_size, iteration, log_interval, device,
+    return std::make_tuple(global_rank, world_size, iteration, log_interval, device,
         zhx_debug, zhx_time,
         mode, dist_division_mode_str, log_folder_str);
 }
@@ -359,14 +357,14 @@ int CudaRasterizer::Rasterizer::preprocessForward(
     bool debug,//raster_settings
     const pybind11::dict &args)
 {
-    auto [local_rank, world_size, iteration, log_interval, device, zhx_debug, zhx_time, mode, dist_division_mode, log_folder] = prepareArgs(args);
+    auto [global_rank, world_size, iteration, log_interval, device, zhx_debug, zhx_time, mode, dist_division_mode, log_folder] = prepareArgs(args);
     char* log_tmp = new char[500];

     // print out the environment variables
     if (mode == "train" && zhx_debug && iteration % log_interval == 1) {
-        sprintf(log_tmp, "world_size: %d, local_rank: %d, iteration: %d, log_folder: %s, zhx_debug: %d, zhx_time: %d, device: %d, log_interval: %d, dist_division_mode: %s",
-            world_size, local_rank, iteration, log_folder.c_str(), zhx_debug, zhx_time, device, log_interval, dist_division_mode.c_str());
-        save_log_in_file(iteration, local_rank, world_size, log_folder, "cuda", log_tmp);
+        sprintf(log_tmp, "world_size: %d, global_rank: %d, iteration: %d, log_folder: %s, zhx_debug: %d, zhx_time: %d, device: %d, log_interval: %d, dist_division_mode: %s",
+            world_size, global_rank, iteration, log_folder.c_str(), zhx_debug, zhx_time, device, log_interval, dist_division_mode.c_str());
+        save_log_in_file(iteration, global_rank, world_size, log_folder, "cuda", log_tmp);
     }

     MyTimerOnGPU timer;
@@ -410,17 +408,15 @@ int CudaRasterizer::Rasterizer::preprocessForward(
         conic_opacity,
         tile_grid,
         tiles_touched_temp_buffer,
-        prefiltered,
-        local_rank,
-        world_size
+        prefiltered
     ), debug)
     timer.stop("10 preprocess");

     int num_rendered = 0;//TODO: should I calculate this here?

     // Print out timing information
     if (zhx_time && iteration % log_interval == 1) {
-        timer.printAllTimes(iteration, world_size, local_rank, log_folder, true);
+        timer.printAllTimes(iteration, world_size, global_rank, log_folder, true);
     }
     delete log_tmp;
     // free temporary buffer for tiles_touched. TODO: remove it.
@@ -454,7 +450,7 @@ void CudaRasterizer::Rasterizer::preprocessBackward(
     bool debug,
     const pybind11::dict &args)
 {
-    auto [local_rank, world_size, iteration, log_interval, device, zhx_debug, zhx_time, mode, dist_division_mode, log_folder] = prepareArgs(args);
+    auto [global_rank, world_size, iteration, log_interval, device, zhx_debug, zhx_time, mode, dist_division_mode, log_folder] = prepareArgs(args);

     MyTimerOnGPU timer;
     const float focal_y = height / (2.0f * tan_fovy);
@@ -488,7 +484,7 @@ void CudaRasterizer::Rasterizer::preprocessBackward(

     // Print out timing information
     if (zhx_time && iteration % log_interval == 1) {
-        timer.printAllTimes(iteration, world_size, local_rank, log_folder, false);
+        timer.printAllTimes(iteration, world_size, global_rank, log_folder, false);
     }
 }

@@ -537,7 +533,7 @@ int CudaRasterizer::Rasterizer::renderForward(
     bool debug,
     const pybind11::dict &args)
 {
-    auto [local_rank, world_size, iteration, log_interval, device, zhx_debug, zhx_time, mode, dist_division_mode, log_folder] = prepareArgs(args);
+    auto [global_rank, world_size, iteration, log_interval, device, zhx_debug, zhx_time, mode, dist_division_mode, log_folder] = prepareArgs(args);
     char* log_tmp = new char[500];

     MyTimerOnGPU timer;
@@ -593,9 +589,7 @@ int CudaRasterizer::Rasterizer::renderForward(
         binningState.point_list_unsorted,
         radii,
         compute_locally,
-        tile_grid,
-        local_rank,
-        world_size)
+        tile_grid)
     CHECK_CUDA(, debug)
     timer.stop("40 duplicateWithKeys");

@@ -675,7 +669,7 @@ int CudaRasterizer::Rasterizer::renderForward(
     //////////////////////////// Logging && Save Statictis ////////////////////////////////////////////
     // DEBUG: print out timing information
     if (mode == "train" && zhx_time && iteration % log_interval == 1) {
-        timer.printAllTimes(iteration, world_size, local_rank, log_folder, false);
+        timer.printAllTimes(iteration, world_size, global_rank, log_folder, false);
     }

     // DEBUG: print out the number of Gaussians contributing to each pixel.
@@ -731,7 +725,7 @@ int CudaRasterizer::Rasterizer::renderForward(
            ave_n_contrib2loss,
            contrib2loss_ratio);

-        save_log_in_file(iteration, local_rank, world_size, log_folder, "n_contrib", log_tmp);
+        save_log_in_file(iteration, global_rank, world_size, log_folder, "n_contrib", log_tmp);
         global_sum_n_rendered += n_rendered;
         global_sum_n_considered += sum_n_considered;
         global_sum_n_contrib2loss += sum_n_contrib2loss;
@@ -742,8 +736,8 @@ int CudaRasterizer::Rasterizer::renderForward(
         float global_ave_n_considered_per_pix = global_sum_n_considered / (float)total_pixels;
         float global_ave_n_contrib2loss_per_pix = global_sum_n_contrib2loss / (float)total_pixels;

-        sprintf(log_tmp, "local_rank: %d, world_size: %d, num_tiles: %d, num_pixels: %d, num_rendered: %d, global_ave_n_rendered_per_pix: %f, global_ave_n_considered_per_pix: %f, global_ave_n_contrib2loss_per_pix: %f",
-            (int)local_rank,
+        sprintf(log_tmp, "global_rank: %d, world_size: %d, num_tiles: %d, num_pixels: %d, num_rendered: %d, global_ave_n_rendered_per_pix: %f, global_ave_n_considered_per_pix: %f, global_ave_n_contrib2loss_per_pix: %f",
+            (int)global_rank,
             (int)world_size,
             (int)num_local_tiles,
             (int)total_pixels,
@@ -752,7 +746,7 @@ int CudaRasterizer::Rasterizer::renderForward(
             global_ave_n_considered_per_pix,
             global_ave_n_contrib2loss_per_pix
         );
-        save_log_in_file(iteration, local_rank, world_size, log_folder, "n_contrib", log_tmp);
+        save_log_in_file(iteration, global_rank, world_size, log_folder, "n_contrib", log_tmp);

         delete[] cpu_ranges;
         delete[] cpu_n_considered;
@@ -784,7 +778,7 @@ void CudaRasterizer::Rasterizer::renderBackward(
     bool debug,
     const pybind11::dict &args)
 {
-    auto [local_rank, world_size, iteration, log_interval, device, zhx_debug, zhx_time, mode, dist_division_mode, log_folder] = prepareArgs(args);
+    auto [global_rank, world_size, iteration, log_interval, device, zhx_debug, zhx_time, mode, dist_division_mode, log_folder] = prepareArgs(args);

     MyTimerOnGPU timer;

@@ -825,6 +819,6 @@ void CudaRasterizer::Rasterizer::renderBackward(

     // Print out timing information
     if (zhx_time && iteration % log_interval == 1) {
-        timer.printAllTimes(iteration, world_size, local_rank, log_folder, false);
+        timer.printAllTimes(iteration, world_size, global_rank, log_folder, false);
     }
 }
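
The renamed save_log_in_file keeps the same per-rank file naming, now keyed by the global rank: <log_folder>/<prefix>_ws=<world_size>_rk=<global_rank>.log. Below is a minimal Python sketch for locating and reading one of these logs, assuming only the format string visible above; the helper name and example values are hypothetical.

# Minimal sketch (assumption): rebuild the path that save_log_in_file above
# writes to, e.g. "./logs/cuda_ws=8_rk=3.log", and read its appended lines.
from pathlib import Path

def read_rank_log(log_folder, prefix, world_size, global_rank):
    # Mirrors the C++ format string "%s/%s_ws=%d_rk=%d.log".
    path = Path(log_folder) / f"{prefix}_ws={world_size}_rk={global_rank}.log"
    return path.read_text().splitlines() if path.exists() else []

# Each line starts with "iteration: <n>, " followed by the logged content.
for line in read_rank_log("./logs", "cuda", world_size=8, global_rank=3):
    print(line)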

cuda_rasterizer/timers.cu

Lines changed: 6 additions & 6 deletions
@@ -51,12 +51,12 @@ public:
             return a.first < b.first;
         }
     );
-    if (print_prefix) {
-        std::cout << prefix << std::endl;
-    }
-    for (const auto& pair : sortedTimes) {
-        std::cout << pair.first << " time: " << elapsedMilliseconds(pair.first, "sum") << " ms" << std::endl;
-    }
+    // if (print_prefix) {
+    //     std::cout << prefix << std::endl;
+    // }
+    // for (const auto& pair : sortedTimes) {
+    //     std::cout << pair.first << " time: " << elapsedMilliseconds(pair.first, "sum") << " ms" << std::endl;
+    // }
     //save in file
     FILE *fp;
     fp = fopen(filename, "a");

diff_gaussian_rasterization/__init__.py

Lines changed: 9 additions & 9 deletions
@@ -174,13 +174,13 @@ def render_gaussians(
 )

 def get_extended_compute_locally(cuda_args, image_height, image_width):
-    local_rank = int(cuda_args["local_rank"])
+    mp_rank = int(cuda_args["mp_rank"])
     dist_global_strategy = [int(x) for x in cuda_args["dist_global_strategy"].split(",")]

     num_tile_y = (image_height + 16 - 1) // 16 #TODO: this is dangerous because 16 may change.
     num_tile_x = (image_width + 16 - 1) // 16
-    tile_l = max(dist_global_strategy[local_rank]-num_tile_x-1, 0)
-    tile_r = min(dist_global_strategy[local_rank+1]+num_tile_x+1, num_tile_y*num_tile_x)
+    tile_l = max(dist_global_strategy[mp_rank]-num_tile_x-1, 0)
+    tile_r = min(dist_global_strategy[mp_rank+1]+num_tile_x+1, num_tile_y*num_tile_x)

     extended_compute_locally = torch.zeros(num_tile_y*num_tile_x, dtype=torch.bool, device="cuda")
     extended_compute_locally[tile_l:tile_r] = True
@@ -368,20 +368,20 @@ def render_gaussians(self, means2D, conic_opacity, rgb, depths, radii, compute_l
     def get_local2j_ids(self, means2D, radii, cuda_args):

         raster_settings = self.raster_settings
-        world_size = int(cuda_args["world_size"])
-        local_rank = int(cuda_args["local_rank"])
+        mp_world_size = int(cuda_args["mp_world_size"])
+        mp_rank = int(cuda_args["mp_rank"])

         # TODO: make it more general.
         dist_global_strategy = [int(x) for x in cuda_args["dist_global_strategy"].split(",")]
-        assert len(dist_global_strategy) == world_size+1, "dist_global_strategy should have length WORLD_SIZE+1"
+        assert len(dist_global_strategy) == mp_world_size+1, "dist_global_strategy should have length WORLD_SIZE+1"
         assert dist_global_strategy[0] == 0, "dist_global_strategy[0] should be 0"
         dist_global_strategy = torch.tensor(dist_global_strategy, dtype=torch.int, device=means2D.device)

         args = (
             raster_settings.image_height,
             raster_settings.image_width,
-            local_rank,
-            world_size,
+            mp_rank,
+            mp_world_size,
             means2D,
             radii,
             dist_global_strategy,
@@ -391,7 +391,7 @@ def get_local2j_ids(self, means2D, radii, cuda_args):
         local2j_ids_bool = _C.get_local2j_ids_bool(*args) # local2j_ids_bool is (P, world_size) bool tensor

         local2j_ids = []
-        for rk in range(world_size):
+        for rk in range(mp_world_size):
             local2j_ids.append(local2j_ids_bool[:, rk].nonzero())

         return local2j_ids, local2j_ids_bool
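
With the keys renamed as above, callers are expected to populate cuda_args with model-parallel identifiers instead of the old local_rank/world_size. The dict below is a hypothetical example: the numbers and the tile split are invented, values are kept as strings where the C++ prepareArgs casts them with .cast<std::string>(), and any key not visible in the hunks above is an assumption.

# Hypothetical cuda_args for a run with 8 processes in total, split into
# model-parallel groups of 4; values are invented for illustration.
cuda_args = {
    "mode": "train",
    "global_rank": "3",          # rank across all nodes; used in log file names
    "world_size": "8",           # total number of processes
    "mp_rank": "1",              # rank inside this model-parallel group
    "mp_world_size": "4",        # size of the model-parallel group
    "iteration": "1001",
    "log_interval": "50",
    "log_folder": "./logs",      # assumed key name (only log_folder_str is visible above)
    # Cumulative tile boundaries per MP rank; must have length mp_world_size + 1
    # and start at 0, as asserted in get_local2j_ids.
    "dist_global_strategy": "0,25,50,75,100",
    "avoid_pixel_all2all": False,  # read as bool by GetLocal2jIdsBoolCUDA
}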

rasterize_points.cu

Lines changed: 4 additions & 4 deletions
@@ -453,8 +453,8 @@ __global__ void getTouchedIdsBool(
 torch::Tensor GetLocal2jIdsBoolCUDA(
     int image_height,
     int image_width,
-    int local_rank,
-    int world_size,
+    int mp_rank,
+    int mp_world_size,
     const torch::Tensor& means2D,
     const torch::Tensor& radii,
     const torch::Tensor& dist_global_strategy,
@@ -465,13 +465,13 @@ torch::Tensor GetLocal2jIdsBoolCUDA(
     const int W = image_width;
     bool avoid_pixel_all2all = args["avoid_pixel_all2all"].cast<bool>();

-    torch::Tensor local2jIdsBool = torch::full({P, world_size}, false, means2D.options().dtype(torch::kBool));
+    torch::Tensor local2jIdsBool = torch::full({P, mp_world_size}, false, means2D.options().dtype(torch::kBool));

     getTouchedIdsBool << <(P + ONE_DIM_BLOCK_SIZE - 1) / ONE_DIM_BLOCK_SIZE, ONE_DIM_BLOCK_SIZE >> >(
         P,
         H,
         W,
-        world_size,
+        mp_world_size,
         reinterpret_cast<float2*>(means2D.contiguous().data<float>()),
         radii.contiguous().data<int>(),
         dist_global_strategy.contiguous().data<int>(),

rasterize_points.h

Lines changed: 2 additions & 2 deletions
@@ -170,8 +170,8 @@ RenderGaussiansBackwardCUDA(
 torch::Tensor GetLocal2jIdsBoolCUDA(
     int image_height,
     int image_width,
-    int local_rank,
-    int world_size,
+    int mp_rank,
+    int mp_world_size,
     const torch::Tensor& means2D,
     const torch::Tensor& radii,
     const torch::Tensor& dist_global_strategy,
