Commit a15ef08

Author: Hexu Zhao (committed)
refactor adjust_mode to render_distribution_mode: delete GetDistributionStrategyCUDA; delete dist_division_mode; change avoid_pixel_all2all to render_distribution_mode==
1 parent becfda1 commit a15ef08
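
The trailing "render_distribution_mode==" in the commit message is truncated, so the exact mode value is unknown. As a rough illustration of the rename, call sites that previously branched on a dedicated avoid_pixel_all2all flag would now derive it from the single mode string, along these lines (C++ sketch; "<mode>" is a placeholder, not the repo's actual setting):

    #include <string>

    // Hypothetical sketch only: the concrete mode string is cut off in the
    // commit message above, so "<mode>" stands in for it.
    bool should_avoid_pixel_all2all(const std::string& render_distribution_mode) {
        return render_distribution_mode == "<mode>";
    }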

File tree: 3 files changed (+9, -297 lines)


.gitignore

Lines changed: 2 additions & 0 deletions

@@ -1,3 +1,5 @@
 build/
 diff_gaussian_rasterization.egg-info/
 dist/
+diff_gaussian_rasterization/__pycache__/
+*so

cuda_rasterizer/rasterizer_impl.cu

Lines changed: 4 additions & 296 deletions

@@ -263,77 +263,6 @@ __global__ void reduce_data_per_block(
 	}
 }

-__global__ void getComputeLocally(//TODO: this function is not heavy enough to be parallelized.
-	const int tile_num,
-	uint32_t* gs_on_tiles_offsets,
-	bool* compute_locally,
-	int last_local_num_rendered_end,
-	int local_num_rendered_end
-) {
-	auto idx = cg::this_grid().thread_rank();
-	if (idx >= tile_num)
-		return;
-
-	int x = (int)gs_on_tiles_offsets[idx];
-	if (x > last_local_num_rendered_end && x <= local_num_rendered_end)
-		compute_locally[idx] = true;
-	else
-		compute_locally[idx] = false;
-}
-
-__global__ void getComputeLocallyByTileNum(//TODO: this function is not heavy enough to be parallelized.
-	const int tile_num,
-	bool* compute_locally,
-	int last_local_num_rendered_end,
-	int local_num_rendered_end
-) {
-	auto idx = cg::this_grid().thread_rank();
-	if (idx >= tile_num)
-		return;
-
-	if (idx >= last_local_num_rendered_end && idx < local_num_rendered_end)
-		compute_locally[idx] = true;
-	else
-		compute_locally[idx] = false;
-}
-
-__global__ void getComputeLocallyByTileId(
-	const int tile_num,
-	bool* compute_locally,
-	int tile_id_start,
-	int tile_id_end
-) {
-	auto idx = cg::this_grid().thread_rank();
-	if (idx >= tile_num)
-		return;
-
-	if (idx >= tile_id_start && idx < tile_id_end)
-		compute_locally[idx] = true;
-	else
-		compute_locally[idx] = false;
-}
-
-__global__ void getComputeLocallyByRowId(
-	const int tile_num,
-	bool* compute_locally,
-	int tile_grid_x,
-	int tile_grid_y,
-	int row_id_start,
-	int row_id_end
-) {
-	auto idx = cg::this_grid().thread_rank();
-	if (idx >= tile_num)
-		return;
-
-	int tile_x = idx % tile_grid_x;
-	int tile_y = idx / tile_grid_x;
-	if (tile_y >= row_id_start && tile_y < row_id_end)
-		compute_locally[idx] = true;
-	else
-		compute_locally[idx] = false;
-}
-
-
 __global__ void updateTileTouched(
 	const int P,
 	const dim3 tile_grid,
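
All four deleted getComputeLocally* kernels share one pattern: one thread per tile, and compute_locally[idx] is set by testing the tile's index (or its row, or its cumulative-workload offset) against a half-open range owned by the current rank. A minimal self-contained sketch of that pattern, with illustrative names rather than the repo's API:

    #include <algorithm>
    #include <cuda_runtime.h>

    // One thread per tile: mark the tiles in [start, end) as computed locally.
    // Mirrors the range test used by the deleted getComputeLocally* kernels.
    __global__ void markLocalTiles(int tile_num, bool* compute_locally,
                                   int start, int end)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx >= tile_num)
            return;
        compute_locally[idx] = (idx >= start && idx < end);
    }

    // Host-side launch splitting tile_num tiles evenly across world_size ranks,
    // as the deleted "tile_num" division mode did.
    void markTilesForRank(int tile_num, bool* d_compute_locally,
                          int local_rank, int world_size)
    {
        int per_rank = tile_num / world_size + 1;
        int start = per_rank * local_rank;
        int end = std::min(per_rank * (local_rank + 1), tile_num);
        const int BLOCK = 256;
        markLocalTiles<<<(tile_num + BLOCK - 1) / BLOCK, BLOCK>>>(
            tile_num, d_compute_locally, start, end);
    }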
@@ -360,156 +289,6 @@ __global__ void updateTileTouched(
 	tiles_touched[idx] = cnt;
 }

-__global__ void getGlobalGaussianOnTiles(//TODO: maybe this could take a significant amount of time.
-	const int P,
-	const float2* means2D,
-	int* radii,
-	const dim3 tile_grid,
-	uint32_t* gs_on_tiles
-) {
-	auto idx = cg::this_grid().thread_rank();
-	if (idx >= P)
-		return;
-
-	if (radii[idx] > 0)
-	{
-		uint2 rect_min, rect_max;
-		getRect(means2D[idx], radii[idx], rect_min, rect_max, tile_grid);
-		for (int y = rect_min.y; y < rect_max.y; y++)
-			for (int x = rect_min.x; x < rect_max.x; x++)
-			{
-				atomicAdd(&gs_on_tiles[y * tile_grid.x + x], 1);
-				//TODO: Do I have to use atomicAdd? This is slow, honestly.
-			}
-	}
-}
-
-// NOTE: This method should also deal with world_size == 1 safely.
-void updateDistributedStatLocally(//TODO: optimize implementations for all these kernels.
-	const int P,
-	const int width,
-	const int height,
-	const dim3 tile_grid,
-	int* radii,
-	float2* means2D,
-	CudaRasterizer::DistributedState& distState,
-	const int local_rank,
-	const int world_size,
-	const char * dist_division_mode,
-	MyTimerOnGPU& timer
-){
-	int tile_num = tile_grid.x * tile_grid.y;
-	timer.start("21 updateDistributedStatLocally.getGlobalGaussianOnTiles");
-	cudaMemset(distState.gs_on_tiles, 0, tile_num * sizeof(uint32_t));
-	getGlobalGaussianOnTiles <<<(P + ONE_DIM_BLOCK_SIZE - 1) / ONE_DIM_BLOCK_SIZE, ONE_DIM_BLOCK_SIZE >>> (
-		P,
-		means2D,
-		radii,
-		tile_grid,
-		distState.gs_on_tiles
-	);
-	timer.stop("21 updateDistributedStatLocally.getGlobalGaussianOnTiles");
-
-	// getComputeLocally
-	if (world_size >= 1) {
-		timer.start("22 updateDistributedStatLocally.InclusiveSum");
-		cub::DeviceScan::InclusiveSum(distState.scanning_space, distState.scan_size, distState.gs_on_tiles, distState.gs_on_tiles_offsets, tile_num);
-		timer.stop("22 updateDistributedStatLocally.InclusiveSum");
-
-		int num_rendered;
-		cudaMemcpy(&num_rendered, distState.gs_on_tiles_offsets + tile_num - 1, sizeof(int), cudaMemcpyDeviceToHost);
-
-		timer.start("23 updateDistributedStatLocally.getComputeLocally");
-		// find the position by binary search or a customized kernel function?
-		// printf("dist_division_mode: %s, length: %d\n", dist_division_mode, strlen(dist_division_mode));
-		if (strcmp(dist_division_mode, "rendered_num") == 0) {
-			int num_rendered_per_device = num_rendered / world_size + 1;
-			int last_local_num_rendered_end = num_rendered_per_device * local_rank;
-			int local_num_rendered_end = min(num_rendered_per_device * (local_rank + 1), num_rendered);
-			getComputeLocally <<<(tile_num + ONE_DIM_BLOCK_SIZE - 1) / ONE_DIM_BLOCK_SIZE, ONE_DIM_BLOCK_SIZE >>> (
-				tile_num,
-				distState.gs_on_tiles_offsets,
-				distState.compute_locally,
-				last_local_num_rendered_end,
-				local_num_rendered_end
-			);
-			distState.last_local_num_rendered_end = last_local_num_rendered_end;
-			distState.local_num_rendered_end = local_num_rendered_end;
-		} else if (strcmp(dist_division_mode, "tile_num") == 0) {
-			int num_tiles_per_device = tile_num / world_size + 1;
-			int last_local_num_rendered_end = num_tiles_per_device * local_rank;
-			int local_num_rendered_end = min(num_tiles_per_device * (local_rank + 1), tile_num);
-			//TODO: optimize this; in some cases, it will not be divided evenly -> 2170 will be split into 1086 and 1084
-			getComputeLocallyByTileNum <<<(tile_num + ONE_DIM_BLOCK_SIZE - 1) / ONE_DIM_BLOCK_SIZE, ONE_DIM_BLOCK_SIZE >>> (
-				tile_num,
-				distState.compute_locally,
-				last_local_num_rendered_end,
-				local_num_rendered_end
-			);
-			distState.last_local_num_rendered_end = last_local_num_rendered_end;
-			distState.local_num_rendered_end = local_num_rendered_end;
-		} else if (dist_division_mode[0] == 'T') {
-			// dist_division_mode example: "T:0,1" or "T:10,20"
-			char* dist_division_mode_left = new char[strlen(dist_division_mode) + 1];
-			char* dist_division_mode_right = new char[strlen(dist_division_mode) + 1];
-			strcpy(dist_division_mode_left, dist_division_mode);
-			strcpy(dist_division_mode_right, dist_division_mode);
-
-			char* pch = strtok(dist_division_mode_left, ":");
-			pch = strtok(NULL, ":");
-			pch = strtok(pch, ",");
-			int tile_id_start = atoi(pch);
-			pch = strtok(NULL, ",");
-			int tile_id_end = atoi(pch);
-			delete[] dist_division_mode_left;
-			delete[] dist_division_mode_right;
-			// printf("dist_division_mode is %s, tile_id_start is %d, tile_id_end is %d\n", dist_division_mode, tile_id_start, tile_id_end);
-
-			getComputeLocallyByTileId <<<(tile_num + ONE_DIM_BLOCK_SIZE - 1) / ONE_DIM_BLOCK_SIZE, ONE_DIM_BLOCK_SIZE >>> (
-				tile_num,
-				distState.compute_locally,
-				tile_id_start,
-				tile_id_end
-			);
-			distState.last_local_num_rendered_end = tile_id_start;
-			distState.local_num_rendered_end = tile_id_end;
-
-		} else {
-			// dist_division_mode example: "0,1" or "10,20"
-			// TODO: refactor: change this into "R:0,1" or "R:10,20" later.
-
-			char* dist_division_mode_left = new char[strlen(dist_division_mode) + 1];
-			char* dist_division_mode_right = new char[strlen(dist_division_mode) + 1];
-			strcpy(dist_division_mode_left, dist_division_mode);
-			strcpy(dist_division_mode_right, dist_division_mode);
-			char* pch = strtok(dist_division_mode_left, ",");
-			int row_id_start = atoi(pch);
-			pch = strtok(NULL, ",");
-			int row_id_end = atoi(pch);
-			delete[] dist_division_mode_left;
-			delete[] dist_division_mode_right;
-			// printf("dist_division_mode is %s, row_id_start is %d, row_id_end is %d\n", dist_division_mode, row_id_start, row_id_end);
-
-			getComputeLocallyByRowId <<<(tile_num + ONE_DIM_BLOCK_SIZE - 1) / ONE_DIM_BLOCK_SIZE, ONE_DIM_BLOCK_SIZE >>> (
-				tile_num,
-				distState.compute_locally,
-				tile_grid.x,
-				tile_grid.y,
-				row_id_start,
-				row_id_end
-			);
-			distState.last_local_num_rendered_end = row_id_start * tile_grid.x;
-			distState.local_num_rendered_end = row_id_end * tile_grid.x;
-
-			// printf("division_mode: %s is not supported.\n", dist_division_mode);
-		}
-		timer.stop("23 updateDistributedStatLocally.getComputeLocally");
-	}
-	else {
-		cudaMemset(distState.compute_locally, true, tile_num * sizeof(bool));
-	}
-}
-
 void save_log_in_file(int iteration, int local_rank, int world_size, std::string log_folder, const char* filename_prefix, const char* log_content) {
 	char* filename = new char[128];
 	sprintf(filename, "%s/%s_ws=%d_rk=%d.log", log_folder.c_str(), filename_prefix, world_size, local_rank);
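
The deleted load balancer boils down to three steps: count how many gaussians touch each tile (getGlobalGaussianOnTiles, via atomicAdd), run an inclusive prefix sum with CUB so each tile carries the cumulative workload up to and including itself, then assign each rank the contiguous run of tiles whose cumulative count falls inside its share (the "rendered_num" mode). A condensed, self-contained sketch of that pipeline; the buffer and function names are illustrative:

    #include <algorithm>
    #include <cub/cub.cuh>
    #include <cuda_runtime.h>

    // Prefix-sum per-tile gaussian counts, then derive this rank's share of the
    // total workload, following the deleted "rendered_num" division mode.
    void splitByRenderedCount(const uint32_t* d_counts, uint32_t* d_offsets,
                              int tile_num, int local_rank, int world_size,
                              int* range_start, int* range_end)
    {
        // CUB two-phase call: first query temp-storage size, then run the scan.
        void* d_temp = nullptr;
        size_t temp_bytes = 0;
        cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_counts, d_offsets, tile_num);
        cudaMalloc(&d_temp, temp_bytes);
        cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_counts, d_offsets, tile_num);

        // Total workload = last entry of the inclusive sum.
        uint32_t num_rendered = 0;
        cudaMemcpy(&num_rendered, d_offsets + tile_num - 1, sizeof(uint32_t),
                   cudaMemcpyDeviceToHost);

        // Rank r owns the tiles whose cumulative count lies in
        // (per_rank * r, per_rank * (r + 1)], as in the deleted getComputeLocally.
        int per_rank = (int)num_rendered / world_size + 1;
        *range_start = per_rank * local_rank;
        *range_end = std::min(per_rank * (local_rank + 1), (int)num_rendered);
        cudaFree(d_temp);
    }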
@@ -535,7 +314,8 @@ std::tuple<int, int, int, int, int, bool, bool, std::string, std::string, std::s
 	std::string log_folder_str = args["log_folder"].cast<std::string>();
 	std::string zhx_debug_str = args["zhx_debug"].cast<std::string>();
 	std::string zhx_time_str = args["zhx_time"].cast<std::string>();
-	std::string dist_division_mode_str = args["dist_division_mode"].cast<std::string>();
+	// std::string dist_division_mode_str = args["dist_division_mode"].cast<std::string>();
+	std::string dist_division_mode_str = "";

 	int local_rank = std::stoi(local_rank_str);
 	int world_size = std::stoi(world_size_str);
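
With the Python side no longer passing dist_division_mode, the cast above is replaced by a hard-coded empty string. An alternative that would tolerate both old and new callers is to read the key only when it is present; a hedged sketch of that pybind11 pattern (not the repo's code):

    #include <pybind11/pybind11.h>
    #include <string>

    namespace py = pybind11;

    // Read an optional string entry from the args dict, falling back to a
    // default when the caller no longer supplies the key.
    std::string get_str_arg(const py::dict& args, const char* key,
                            const std::string& fallback = "")
    {
        if (args.contains(key))
            return args[key].cast<std::string>();
        return fallback;
    }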
@@ -728,80 +508,8 @@ void CudaRasterizer::Rasterizer::getDistributionStrategy(
 	bool debug,
 	const pybind11::dict &args)
 {
-	auto [local_rank, world_size, iteration, log_interval, device, zhx_debug, zhx_time, mode, dist_division_mode, log_folder] = prepareArgs(args);
-	char* log_tmp = new char[500];
-
-	MyTimerOnGPU timer;
-
-	dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1);
-	int tile_num = tile_grid.x * tile_grid.y;
-
-	size_t dist_chunk_size = required<DistributedState>(tile_grid.x * tile_grid.y);
-	char* dist_chunkptr = distBuffer(dist_chunk_size);
-	DistributedState distState = DistributedState::fromChunk(dist_chunkptr, tile_grid.x * tile_grid.y, true);
-
-	distState.compute_locally = compute_locally;
-	// NOTE: do not allocate memory for distState.compute_locally in fromChunk for sep_rendering mode,
-	// but use the compute_locally from Python.
-
-	// Use means2D and radii to decide how to evenly distribute the workloads.
-	timer.start("20 updateDistributedStatLocally");
-	updateDistributedStatLocally(// FIXME: in memory_distribution mode, this function's calculation is not correct.
-		P,
-		width,
-		height,
-		tile_grid,
-		radii,
-		means2D,
-		distState,
-		local_rank,
-		world_size,
-		dist_division_mode.c_str(),
-		timer
-	);
-	timer.stop("20 updateDistributedStatLocally");
-
-	// DEBUG: print out compute_locally information
-	if (mode == "train" && zhx_debug && iteration % log_interval == 1) {
-		int last_local_num_rendered_end = distState.last_local_num_rendered_end;
-		int local_num_rendered_end = distState.local_num_rendered_end;
-		uint32_t* gs_on_tiles_cpu = new uint32_t[tile_grid.x * tile_grid.y];
-		CHECK_CUDA(cudaMemcpy(gs_on_tiles_cpu, distState.gs_on_tiles, tile_grid.x * tile_grid.y * sizeof(uint32_t), cudaMemcpyDeviceToHost), debug);
-
-		// copy distState.compute_locally to the CPU
-		bool* compute_locally_cpu = new bool[tile_grid.x * tile_grid.y];
-		CHECK_CUDA(cudaMemcpy(compute_locally_cpu, distState.compute_locally, tile_grid.x * tile_grid.y * sizeof(bool), cudaMemcpyDeviceToHost), debug);
-
-		int num_local_tiles = 0;
-		int local_tiles_left_idx = 999999999;
-		int local_tiles_right_idx = 0;
-		int num_rendered_from_distState = 0;
-		for (int i = 0; i < tile_grid.x * tile_grid.y; i++)
-		{
-			if (compute_locally_cpu[i])
-			{
-				if (local_tiles_left_idx == 999999999)
-					local_tiles_left_idx = i;
-				local_tiles_right_idx = i;
-				num_local_tiles++;
-				num_rendered_from_distState += (int)gs_on_tiles_cpu[i];
-			}
-		}
-
-		sprintf(log_tmp, "num_local_tiles: %d, local_tiles_left_idx: %d, local_tiles_right_idx: %d, last_local_num_rendered_end: %d, local_num_rendered_end: %d, num_rendered_from_distState: %d",
-			(int)num_local_tiles, (int)local_tiles_left_idx, (int)local_tiles_right_idx, (int)last_local_num_rendered_end, (int)local_num_rendered_end, (int)num_rendered_from_distState);
-		save_log_in_file(iteration, local_rank, world_size, log_folder, "num_rendered", log_tmp);
-
-		delete[] compute_locally_cpu;
-		delete[] gs_on_tiles_cpu;
-	}
-
-	// Print out timing information
-	if (zhx_time && iteration % log_interval == 1) {
-		timer.printAllTimes(iteration, world_size, local_rank, log_folder, false);
-	}
-
-	delete[] log_tmp;
+	// This function is deprecated for now, but I keep the structure of the code here for potential future use.
+	throw std::runtime_error("getDistributionStrategy is deprecated.");
 }

 /////////////////////////////// Render ///////////////////////////////

diff_gaussian_rasterization/__init__.py

Lines changed: 3 additions & 1 deletion

@@ -383,7 +383,9 @@ def get_local2j_ids(self, means2D, radii, cuda_args):
         return local2j_ids, local2j_ids_bool

     def get_distribution_strategy(self, means2D, radii, cuda_args):
-
+
+        assert False, "This function is not used in the current version."
+
         raster_settings = self.raster_settings

         return _C.get_distribution_strategy(

0 commit comments

Comments
 (0)