@@ -263,77 +263,6 @@ __global__ void reduce_data_per_block(
     }
 }
 
-__global__ void getComputeLocally( // TODO: this function is not heavy enough to be parallelized.
-    const int tile_num,
-    uint32_t* gs_on_tiles_offsets,
-    bool* compute_locally,
-    int last_local_num_rendered_end,
-    int local_num_rendered_end
-) {
-    auto idx = cg::this_grid().thread_rank();
-    if (idx >= tile_num)
-        return;
-
-    int x = (int)gs_on_tiles_offsets[idx];
-    if (x > last_local_num_rendered_end && x <= local_num_rendered_end)
-        compute_locally[idx] = true;
-    else
-        compute_locally[idx] = false;
-}
-
-__global__ void getComputeLocallyByTileNum( // TODO: this function is not heavy enough to be parallelized.
-    const int tile_num,
-    bool* compute_locally,
-    int last_local_num_rendered_end,
-    int local_num_rendered_end
-) {
-    auto idx = cg::this_grid().thread_rank();
-    if (idx >= tile_num)
-        return;
-
-    if (idx >= last_local_num_rendered_end && idx < local_num_rendered_end)
-        compute_locally[idx] = true;
-    else
-        compute_locally[idx] = false;
-}
-
-__global__ void getComputeLocallyByTileId(
-    const int tile_num,
-    bool* compute_locally,
-    int tile_id_start,
-    int tile_id_end
-) {
-    auto idx = cg::this_grid().thread_rank();
-    if (idx >= tile_num)
-        return;
-
-    if (idx >= tile_id_start && idx < tile_id_end)
-        compute_locally[idx] = true;
-    else
-        compute_locally[idx] = false;
-}
-
-__global__ void getComputeLocallyByRowId(
-    const int tile_num,
-    bool* compute_locally,
-    int tile_grid_x,
-    int tile_grid_y,
-    int row_id_start,
-    int row_id_end
-) {
-    auto idx = cg::this_grid().thread_rank();
-    if (idx >= tile_num)
-        return;
-
-    int tile_x = idx % tile_grid_x;
-    int tile_y = idx / tile_grid_x;
-    if (tile_y >= row_id_start && tile_y < row_id_end)
-        compute_locally[idx] = true;
-    else
-        compute_locally[idx] = false;
-}
-
-
 __global__ void updateTileTouched(
     const int P,
     const dim3 tile_grid,
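
The four kernels removed above all follow the same one-thread-per-tile pattern and differ only in the interval test that sets `compute_locally[idx]`. The least obvious is `getComputeLocally`, which partitions tiles by cumulative Gaussian count rather than by tile index. Below is a minimal host-side sketch of that partition rule, written only to clarify the half-open interval `(last_local_num_rendered_end, local_num_rendered_end]`; the function name and vector types are illustrative and not taken from the file.

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Hedged sketch (not from the commit): the balanced-workload rule behind
// getComputeLocally, written on the host for clarity. offsets[i] plays the
// role of gs_on_tiles_offsets[i], i.e. the inclusive prefix sum of the
// number of Gaussians touching tiles 0..i.
std::vector<bool> assign_tiles_by_rendered_num(const std::vector<uint32_t>& offsets,
                                               int local_rank, int world_size)
{
    const int num_rendered = offsets.empty() ? 0 : static_cast<int>(offsets.back());
    const int per_device   = num_rendered / world_size + 1;
    const int lo = per_device * local_rank;                               // last_local_num_rendered_end
    const int hi = std::min(per_device * (local_rank + 1), num_rendered); // local_num_rendered_end

    std::vector<bool> compute_locally(offsets.size());
    for (std::size_t i = 0; i < offsets.size(); ++i) {
        const int x = static_cast<int>(offsets[i]);
        // A tile is local iff its cumulative count falls in (lo, hi]; each rank
        // therefore owns one contiguous run of tiles with roughly equal work.
        compute_locally[i] = (x > lo && x <= hi);
    }
    return compute_locally;
}
```

Because `offsets` is an inclusive prefix sum, an empty tile carries the same cumulative value as its predecessor and lands on the same rank, so every rank receives one contiguous run of tiles.
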
@@ -360,156 +289,6 @@ __global__ void updateTileTouched(
     tiles_touched[idx] = cnt;
 }
 
-__global__ void getGlobalGaussianOnTiles( // TODO: maybe this could take a significant amount of time.
-    const int P,
-    const float2* means2D,
-    int* radii,
-    const dim3 tile_grid,
-    uint32_t* gs_on_tiles
-) {
-    auto idx = cg::this_grid().thread_rank();
-    if (idx >= P)
-        return;
-
-    if (radii[idx] > 0)
-    {
-        uint2 rect_min, rect_max;
-        getRect(means2D[idx], radii[idx], rect_min, rect_max, tile_grid);
-        for (int y = rect_min.y; y < rect_max.y; y++)
-            for (int x = rect_min.x; x < rect_max.x; x++)
-            {
-                atomicAdd(&gs_on_tiles[y * tile_grid.x + x], 1);
-                // TODO: Do I have to use atomicAdd? This is slow, honestly.
-            }
-    }
-}
-
-// NOTE: This method should also deal with world_size == 1 safely.
-void updateDistributedStatLocally( // TODO: optimize the implementations of all these kernels.
-    const int P,
-    const int width,
-    const int height,
-    const dim3 tile_grid,
-    int* radii,
-    float2* means2D,
-    CudaRasterizer::DistributedState& distState,
-    const int local_rank,
-    const int world_size,
-    const char* dist_division_mode,
-    MyTimerOnGPU& timer
-) {
-    int tile_num = tile_grid.x * tile_grid.y;
-    timer.start("21 updateDistributedStatLocally.getGlobalGaussianOnTiles");
-    cudaMemset(distState.gs_on_tiles, 0, tile_num * sizeof(uint32_t));
-    getGlobalGaussianOnTiles<<<(P + ONE_DIM_BLOCK_SIZE - 1) / ONE_DIM_BLOCK_SIZE, ONE_DIM_BLOCK_SIZE>>>(
-        P,
-        means2D,
-        radii,
-        tile_grid,
-        distState.gs_on_tiles
-    );
-    timer.stop("21 updateDistributedStatLocally.getGlobalGaussianOnTiles");
-
-    // getComputeLocally
-    if (world_size >= 1) {
-        timer.start("22 updateDistributedStatLocally.InclusiveSum");
-        cub::DeviceScan::InclusiveSum(distState.scanning_space, distState.scan_size, distState.gs_on_tiles, distState.gs_on_tiles_offsets, tile_num);
-        timer.stop("22 updateDistributedStatLocally.InclusiveSum");
-
-        int num_rendered;
-        cudaMemcpy(&num_rendered, distState.gs_on_tiles_offsets + tile_num - 1, sizeof(int), cudaMemcpyDeviceToHost);
-
-        timer.start("23 updateDistributedStatLocally.getComputeLocally");
-        // Find the position by binary search or a customized kernel function?
-        // printf("dist_division_mode: %s, length: %d\n", dist_division_mode, strlen(dist_division_mode));
-        if (strcmp(dist_division_mode, "rendered_num") == 0) {
-            int num_rendered_per_device = num_rendered / world_size + 1;
-            int last_local_num_rendered_end = num_rendered_per_device * local_rank;
-            int local_num_rendered_end = min(num_rendered_per_device * (local_rank + 1), num_rendered);
-            getComputeLocally<<<(tile_num + ONE_DIM_BLOCK_SIZE - 1) / ONE_DIM_BLOCK_SIZE, ONE_DIM_BLOCK_SIZE>>>(
-                tile_num,
-                distState.gs_on_tiles_offsets,
-                distState.compute_locally,
-                last_local_num_rendered_end,
-                local_num_rendered_end
-            );
-            distState.last_local_num_rendered_end = last_local_num_rendered_end;
-            distState.local_num_rendered_end = local_num_rendered_end;
-        } else if (strcmp(dist_division_mode, "tile_num") == 0) {
-            int num_tiles_per_device = tile_num / world_size + 1;
-            int last_local_num_rendered_end = num_tiles_per_device * local_rank;
-            int local_num_rendered_end = min(num_tiles_per_device * (local_rank + 1), tile_num);
-            // TODO: optimize this; in some cases the tiles are not divided evenly, e.g. 2170 is split into 1086 and 1084.
-            getComputeLocallyByTileNum<<<(tile_num + ONE_DIM_BLOCK_SIZE - 1) / ONE_DIM_BLOCK_SIZE, ONE_DIM_BLOCK_SIZE>>>(
-                tile_num,
-                distState.compute_locally,
-                last_local_num_rendered_end,
-                local_num_rendered_end
-            );
-            distState.last_local_num_rendered_end = last_local_num_rendered_end;
-            distState.local_num_rendered_end = local_num_rendered_end;
-        } else if (dist_division_mode[0] == 'T') {
-            // dist_division_mode example: "T:0,1" or "T:10,20"
-            char* dist_division_mode_left = new char[strlen(dist_division_mode) + 1];
-            char* dist_division_mode_right = new char[strlen(dist_division_mode) + 1];
-            strcpy(dist_division_mode_left, dist_division_mode);
-            strcpy(dist_division_mode_right, dist_division_mode);
-
-            char* pch = strtok(dist_division_mode_left, ":");
-            pch = strtok(NULL, ":");
-            pch = strtok(pch, ",");
-            int tile_id_start = atoi(pch);
-            pch = strtok(NULL, ",");
-            int tile_id_end = atoi(pch);
-            delete[] dist_division_mode_left;
-            delete[] dist_division_mode_right;
-            // printf("dist_division_mode is %s, tile_id_start is %d, tile_id_end is %d\n", dist_division_mode, tile_id_start, tile_id_end);
-
-            getComputeLocallyByTileId<<<(tile_num + ONE_DIM_BLOCK_SIZE - 1) / ONE_DIM_BLOCK_SIZE, ONE_DIM_BLOCK_SIZE>>>(
-                tile_num,
-                distState.compute_locally,
-                tile_id_start,
-                tile_id_end
-            );
-            distState.last_local_num_rendered_end = tile_id_start;
-            distState.local_num_rendered_end = tile_id_end;
-
-        } else {
-            // dist_division_mode example: "0,1" or "10,20"
-            // TODO: refactor: change this format to "R:0,1" or "R:10,20" later.
-
-            char* dist_division_mode_left = new char[strlen(dist_division_mode) + 1];
-            char* dist_division_mode_right = new char[strlen(dist_division_mode) + 1];
-            strcpy(dist_division_mode_left, dist_division_mode);
-            strcpy(dist_division_mode_right, dist_division_mode);
-            char* pch = strtok(dist_division_mode_left, ",");
-            int row_id_start = atoi(pch);
-            pch = strtok(NULL, ",");
-            int row_id_end = atoi(pch);
-            delete[] dist_division_mode_left;
-            delete[] dist_division_mode_right;
-            // printf("dist_division_mode is %s, row_id_start is %d, row_id_end is %d\n", dist_division_mode, row_id_start, row_id_end);
-
-            getComputeLocallyByRowId<<<(tile_num + ONE_DIM_BLOCK_SIZE - 1) / ONE_DIM_BLOCK_SIZE, ONE_DIM_BLOCK_SIZE>>>(
-                tile_num,
-                distState.compute_locally,
-                tile_grid.x,
-                tile_grid.y,
-                row_id_start,
-                row_id_end
-            );
-            distState.last_local_num_rendered_end = row_id_start * tile_grid.x;
-            distState.local_num_rendered_end = row_id_end * tile_grid.x;
-
-            // printf("division_mode: %s is not supported.\n", dist_division_mode);
-        }
-        timer.stop("23 updateDistributedStatLocally.getComputeLocally");
-    }
-    else {
-        cudaMemset(distState.compute_locally, true, tile_num * sizeof(bool));
-    }
-}
-
 void save_log_in_file(int iteration, int local_rank, int world_size, std::string log_folder, const char* filename_prefix, const char* log_content) {
     char* filename = new char[128];
     sprintf(filename, "%s/%s_ws=%d_rk=%d.log", log_folder.c_str(), filename_prefix, world_size, local_rank);
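
The removed `updateDistributedStatLocally` above relies on `cub::DeviceScan::InclusiveSum` to turn per-tile Gaussian counts into cumulative offsets, passing a scratch buffer (`scanning_space` / `scan_size`) that appears to be sized elsewhere in `DistributedState`. For readers unfamiliar with CUB's usual two-phase calling convention, here is a self-contained sketch; all names in it are local to the example rather than taken from the file, and it must be compiled with nvcc.

```cpp
// Hedged sketch (not from the commit) of CUB's two-phase calling convention
// for DeviceScan::InclusiveSum.
#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <cstdint>

void inclusive_sum_example(const uint32_t* d_in, uint32_t* d_out, int num_items)
{
    void*  d_temp_storage     = nullptr;
    size_t temp_storage_bytes = 0;

    // Phase 1: with d_temp_storage == nullptr, CUB only reports how many
    // scratch bytes it needs and writes nothing to d_out.
    cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Phase 2: run the scan; d_out[i] = d_in[0] + ... + d_in[i].
    cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    cudaDeviceSynchronize();

    cudaFree(d_temp_storage);
}
```

The last output element, `d_out[num_items - 1]`, is the grand total, which is exactly the value the removed code copied back to the host as `num_rendered`.
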
@@ -535,7 +314,8 @@ std::tuple<int, int, int, int, int, bool, bool, std::string, std::string, std::s
     std::string log_folder_str = args["log_folder"].cast<std::string>();
     std::string zhx_debug_str = args["zhx_debug"].cast<std::string>();
     std::string zhx_time_str = args["zhx_time"].cast<std::string>();
-    std::string dist_division_mode_str = args["dist_division_mode"].cast<std::string>();
+    // std::string dist_division_mode_str = args["dist_division_mode"].cast<std::string>();
+    std::string dist_division_mode_str = "";
 
     int local_rank = std::stoi(local_rank_str);
     int world_size = std::stoi(world_size_str);
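
The added lines above hard-code `dist_division_mode_str` to an empty string and keep the old dict lookup only as a comment. If the `dist_division_mode` key might still be supplied from the Python side, a slightly more defensive variant (an assumption on my part, not what this commit does) is to read it only when the key is present:

```cpp
// Hedged alternative (not what the commit does): read dist_division_mode from
// the pybind11 dict only if the Python caller still passes it, otherwise fall
// back to the same empty-string default the commit now uses unconditionally.
#include <pybind11/pybind11.h>
#include <string>

std::string read_dist_division_mode(const pybind11::dict& args)
{
    if (args.contains("dist_division_mode"))
        return args["dist_division_mode"].cast<std::string>();
    return "";
}
```
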
@@ -728,80 +508,8 @@ void CudaRasterizer::Rasterizer::getDistributionStrategy(
     bool debug,
     const pybind11::dict &args)
 {
-    auto [local_rank, world_size, iteration, log_interval, device, zhx_debug, zhx_time, mode, dist_division_mode, log_folder] = prepareArgs(args);
-    char* log_tmp = new char[500];
-
-    MyTimerOnGPU timer;
-
-    dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1);
-    int tile_num = tile_grid.x * tile_grid.y;
-
-    size_t dist_chunk_size = required<DistributedState>(tile_grid.x * tile_grid.y);
-    char* dist_chunkptr = distBuffer(dist_chunk_size);
-    DistributedState distState = DistributedState::fromChunk(dist_chunkptr, tile_grid.x * tile_grid.y, true);
-
-    distState.compute_locally = compute_locally;
-    // NOTE: do not allocate memory for distState.compute_locally in fromChunk for sep_rendering mode,
-    // but use the compute_locally from python.
-
-    // Use means2D and radii to decide how to evenly distribute the workloads.
-    timer.start("20 updateDistributedStatLocally");
-    updateDistributedStatLocally( // FIXME: in memory_distribution mode, this function's calculation is not correct.
-        P,
-        width,
-        height,
-        tile_grid,
-        radii,
-        means2D,
-        distState,
-        local_rank,
-        world_size,
-        dist_division_mode.c_str(),
-        timer
-    );
-    timer.stop("20 updateDistributedStatLocally");
-
-    // DEBUG: print out compute_locally information
-    if (mode == "train" && zhx_debug && iteration % log_interval == 1) {
-        int last_local_num_rendered_end = distState.last_local_num_rendered_end;
-        int local_num_rendered_end = distState.local_num_rendered_end;
-        uint32_t* gs_on_tiles_cpu = new uint32_t[tile_grid.x * tile_grid.y];
-        CHECK_CUDA(cudaMemcpy(gs_on_tiles_cpu, distState.gs_on_tiles, tile_grid.x * tile_grid.y * sizeof(uint32_t), cudaMemcpyDeviceToHost), debug);
-
-        // distState.compute_locally to cpu
-        bool* compute_locally_cpu = new bool[tile_grid.x * tile_grid.y];
-        CHECK_CUDA(cudaMemcpy(compute_locally_cpu, distState.compute_locally, tile_grid.x * tile_grid.y * sizeof(bool), cudaMemcpyDeviceToHost), debug);
-
-        int num_local_tiles = 0;
-        int local_tiles_left_idx = 999999999;
-        int local_tiles_right_idx = 0;
-        int num_rendered_from_distState = 0;
-        for (int i = 0; i < tile_grid.x * tile_grid.y; i++)
-        {
-            if (compute_locally_cpu[i])
-            {
-                if (local_tiles_left_idx == 999999999)
-                    local_tiles_left_idx = i;
-                local_tiles_right_idx = i;
-                num_local_tiles++;
-                num_rendered_from_distState += (int)gs_on_tiles_cpu[i];
-            }
-        }
-
-        sprintf(log_tmp, "num_local_tiles: %d, local_tiles_left_idx: %d, local_tiles_right_idx: %d, last_local_num_rendered_end: %d, local_num_rendered_end: %d, num_rendered_from_distState: %d",
-            (int)num_local_tiles, (int)local_tiles_left_idx, (int)local_tiles_right_idx, (int)last_local_num_rendered_end, (int)local_num_rendered_end, (int)num_rendered_from_distState);
-        save_log_in_file(iteration, local_rank, world_size, log_folder, "num_rendered", log_tmp);
-
-        delete[] compute_locally_cpu;
-        delete[] gs_on_tiles_cpu;
-    }
-
-    // Print out timing information
-    if (zhx_time && iteration % log_interval == 1) {
-        timer.printAllTimes(iteration, world_size, local_rank, log_folder, false);
-    }
-
-    delete[] log_tmp;
+    // This function is deprecated for now, but I keep the structure of the code here for potential future use.
+    throw std::runtime_error("getDistributionStrategy is deprecated.");
 }
 
 /////////////////////////////// Render ///////////////////////////////
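
The replacement body of `getDistributionStrategy` keeps the exported signature but throws unconditionally. If the function must stay callable from the bindings while C++ callers are steered away, one optional addition (a suggestion, not part of this commit) is to pair the runtime error with the standard `[[deprecated]]` attribute so call sites also get a compile-time warning:

```cpp
// Hypothetical stub (not part of the commit): combine the runtime error with
// the [[deprecated]] attribute. The name getDistributionStrategy_stub is
// illustrative only.
#include <stdexcept>

[[deprecated("getDistributionStrategy is deprecated")]]
inline void getDistributionStrategy_stub()
{
    throw std::runtime_error("getDistributionStrategy is deprecated.");
}
```
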