Support adjust_mode==6. However, I do not yet know why it is slower than adjust_mode==5.

Hexu Zhao · Hexu Zhao · commit d802de42976d · 2024-03-31T17:29:23.000-04:00
diff --git a/diff_gaussian_rasterization/__init__.py b/diff_gaussian_rasterization/__init__.py
@@ -174,19 +174,40 @@ def render_gaussians(
     )
 
 def get_extended_compute_locally(cuda_args, image_height, image_width):
-    mp_rank = int(cuda_args["mp_rank"])
-    dist_global_strategy = [int(x) for x in cuda_args["dist_global_strategy"].split(",")]
+    """Return a boolean tile mask covering this rank's tiles plus a margin.
+
+    The image is split into 16x16-pixel tiles (the tile size is hard-coded here).
+    The result is a bool tensor of shape (num_tile_y, num_tile_x) on the "cuda"
+    device; True marks a tile inside the rank's extended region.
+
+    Two formats of cuda_args["dist_global_strategy"] are handled:
+      * str: comma-separated integers. Entries [mp_rank] and [mp_rank+1] are used
+        as the begin/end of this rank's range in the row-major flattened tile
+        index. The range is widened by (num_tile_x + 1) tiles on both ends and
+        clamped to [0, num_tile_y*num_tile_x].
+      * otherwise: a pair (division_pos_xs, division_pos_ys). division_pos_xs holds
+        the x split positions and division_pos_ys[x_rank] holds the y split
+        positions of column x_rank. The rank's rectangle is widened by one tile
+        on every side and clamped to the tile grid.
+
+    cuda_args["mp_rank"] selects the rank; it is converted with int().
+    """
+    # 1D strategy: contiguous range of flattened tile indices per rank.
+    if isinstance(cuda_args["dist_global_strategy"], str):
+        mp_rank = int(cuda_args["mp_rank"])
+        dist_global_strategy = [int(x) for x in cuda_args["dist_global_strategy"].split(",")]
+
+        num_tile_y = (image_height + 16 - 1) // 16 #TODO: this is dangerous because 16 may change.
+        num_tile_x = (image_width + 16 - 1) // 16
+        tile_l = max(dist_global_strategy[mp_rank]-num_tile_x-1, 0)
+        tile_r = min(dist_global_strategy[mp_rank+1]+num_tile_x+1, num_tile_y*num_tile_x)
+
+        extended_compute_locally = torch.zeros(num_tile_y*num_tile_x, dtype=torch.bool, device="cuda")
+        extended_compute_locally[tile_l:tile_r] = True
+        extended_compute_locally = extended_compute_locally.view(num_tile_y, num_tile_x)
 
-    num_tile_y = (image_height + 16 - 1) // 16 #TODO: this is dangerous because 16 may change.
-    num_tile_x = (image_width + 16 - 1) // 16
-    tile_l = max(dist_global_strategy[mp_rank]-num_tile_x-1, 0)
-    tile_r = min(dist_global_strategy[mp_rank+1]+num_tile_x+1, num_tile_y*num_tile_x)
+        return extended_compute_locally
+    else:
+        # 2D strategy: each rank owns a rectangle of tiles.
+        division_pos = cuda_args["dist_global_strategy"]
+        division_pos_xs, division_pos_ys = division_pos
+        mp_rank = int(cuda_args["mp_rank"])
+        grid_size_x = len(division_pos_xs) - 1
+        # NOTE(review): grid_size_y is computed but never used below.
+        grid_size_y = len(division_pos_ys[0]) - 1
+        # Ranks are laid out row-major over the (y_rank, x_rank) grid.
+        y_rank = mp_rank // grid_size_x
+        x_rank = mp_rank % grid_size_x
+
+        local_tile_x_l, local_tile_x_r = division_pos_xs[x_rank], division_pos_xs[x_rank+1]
+        local_tile_y_l, local_tile_y_r = division_pos_ys[x_rank][y_rank], division_pos_ys[x_rank][y_rank+1]
 
-    extended_compute_locally = torch.zeros(num_tile_y*num_tile_x, dtype=torch.bool, device="cuda")
-    extended_compute_locally[tile_l:tile_r] = True
-    extended_compute_locally = extended_compute_locally.view(num_tile_y, num_tile_x)
+        num_tile_y = (image_height + 16 - 1) // 16
+        num_tile_x = (image_width + 16 - 1) // 16
 
-    return extended_compute_locally
+        extended_compute_locally = torch.zeros((num_tile_y, num_tile_x), dtype=torch.bool, device="cuda")
+        # Grow the owned rectangle by one tile per side, clamped to the tile grid.
+        extended_compute_locally[max(local_tile_y_l-1,0):min(local_tile_y_r+1,num_tile_y),
+                                 max(local_tile_x_l-1,0):min(local_tile_x_r+1,num_tile_x)] = True
+
+        return extended_compute_locally
 
 class _RenderGaussians(torch.autograd.Function):
     @staticmethod
@@ -367,35 +388,66 @@ def render_gaussians(self, means2D, conic_opacity, rgb, depths, radii, compute_l
 
     def get_local2j_ids(self, means2D, radii, cuda_args):
 
-        raster_settings = self.raster_settings
-        mp_world_size = int(cuda_args["mp_world_size"])
-        mp_rank = int(cuda_args["mp_rank"])
+        """Compute, for every rank, which local Gaussians touch that rank's tiles.
+
+        The work is delegated to a CUDA extension function chosen by the format of
+        cuda_args["dist_global_strategy"]:
+          * str: comma-separated integers with mp_world_size+1 entries, the first
+            being 0 (both asserted); passed to _C.get_local2j_ids_bool.
+          * otherwise: a pair (division_pos_xs, division_pos_ys) that is turned
+            into one [y_l, y_r, x_l, x_r] tile rectangle per rank and passed to
+            _C.get_local2j_ids_bool_adjust_mode6.
+
+        Returns:
+            (local2j_ids, local2j_ids_bool): local2j_ids_bool is the
+            (P, world_size) bool tensor returned by the extension, and
+            local2j_ids is a list with one entry per rank holding
+            local2j_ids_bool[:, rk].nonzero().
+        """
+        if isinstance(cuda_args["dist_global_strategy"], str):
+            raster_settings = self.raster_settings
+            mp_world_size = int(cuda_args["mp_world_size"])
+            mp_rank = int(cuda_args["mp_rank"])
 
-        # TODO: make it more general.
-        dist_global_strategy = [int(x) for x in cuda_args["dist_global_strategy"].split(",")]
-        assert len(dist_global_strategy) == mp_world_size+1, "dist_global_strategy should have length WORLD_SIZE+1"
-        assert dist_global_strategy[0] == 0, "dist_global_strategy[0] should be 0"
-        dist_global_strategy = torch.tensor(dist_global_strategy, dtype=torch.int, device=means2D.device)
+            # TODO: make it more general.
+            dist_global_strategy = [int(x) for x in cuda_args["dist_global_strategy"].split(",")]
+            assert len(dist_global_strategy) == mp_world_size+1, "dist_global_strategy should have length WORLD_SIZE+1"
+            assert dist_global_strategy[0] == 0, "dist_global_strategy[0] should be 0"
+            dist_global_strategy = torch.tensor(dist_global_strategy, dtype=torch.int, device=means2D.device)
 
-        args = (
-            raster_settings.image_height,
-            raster_settings.image_width,
-            mp_rank,
-            mp_world_size,
-            means2D,
-            radii,
-            dist_global_strategy,
-            cuda_args
-        )
+            args = (
+                raster_settings.image_height,
+                raster_settings.image_width,
+                mp_rank,
+                mp_world_size,
+                means2D,
+                radii,
+                dist_global_strategy,
+                cuda_args
+            )
+
+            local2j_ids_bool = _C.get_local2j_ids_bool(*args) # local2j_ids_bool is (P, world_size) bool tensor
+
+        else:
+            raster_settings = self.raster_settings
+            mp_world_size = int(cuda_args["mp_world_size"])
+            mp_rank = int(cuda_args["mp_rank"])
 
-        local2j_ids_bool = _C.get_local2j_ids_bool(*args) # local2j_ids_bool is (P, world_size) bool tensor
+            division_pos = cuda_args["dist_global_strategy"]
+            division_pos_xs, division_pos_ys = division_pos
+
+            # One rectangle per rank, appended with y_rank as the outer loop and
+            # x_rank as the inner loop, so row index = y_rank * num_columns + x_rank.
+            # NOTE(review): the number of rectangles is not checked against
+            # mp_world_size, although the extension is told there are
+            # mp_world_size ranks -- confirm they always match.
+            rectangles = []
+            for y_rank in range(len(division_pos_ys[0])-1):
+                for x_rank in range(len(division_pos_ys)):
+                    local_tile_x_l, local_tile_x_r = division_pos_xs[x_rank], division_pos_xs[x_rank+1]
+                    local_tile_y_l, local_tile_y_r = division_pos_ys[x_rank][y_rank], division_pos_ys[x_rank][y_rank+1]
+                    rectangles.append([local_tile_y_l, local_tile_y_r, local_tile_x_l, local_tile_x_r])
+            rectangles = torch.tensor(rectangles, dtype=torch.int, device=means2D.device)# (mp_world_size, 4)
+
+            args = (
+                raster_settings.image_height,
+                raster_settings.image_width,
+                mp_rank,
+                mp_world_size,
+                means2D,
+                radii,
+                rectangles,
+                cuda_args
+            )
+
+            local2j_ids_bool = _C.get_local2j_ids_bool_adjust_mode6(*args) # local2j_ids_bool is (P, world_size) bool tensor
 
+        # Convert each rank's boolean column into the indices of its True entries.
         local2j_ids = []
         for rk in range(mp_world_size):
             local2j_ids.append(local2j_ids_bool[:, rk].nonzero())
 
         return local2j_ids, local2j_ids_bool
 
+
     def get_distribution_strategy(self, means2D, radii, cuda_args):
 
         assert False, "This function is not used in the current version."
diff --git a/ext.cpp b/ext.cpp
@@ -21,6 +21,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("render_gaussians", &RenderGaussiansCUDA);
   m.def("render_gaussians_backward", &RenderGaussiansBackwardCUDA);
   m.def("get_local2j_ids_bool", &GetLocal2jIdsBoolCUDA);
+  m.def("get_local2j_ids_bool_adjust_mode6", &GetLocal2jIdsBoolAdjustMode6CUDA);
 
   // Image Distribution Utilities
   m.def("get_touched_locally", &GetTouchedLocally);
diff --git a/rasterize_points.cu b/rasterize_points.cu
@@ -483,7 +483,83 @@ torch::Tensor GetLocal2jIdsBoolCUDA(
 }
 
 
+// Kernel: marks which ranks each Gaussian touches when every rank owns a rectangle of tiles.
+//
+// The thread whose grid-wide rank is i (< P) handles Gaussian i. It obtains the Gaussian's
+// tile rectangle from getRect() and, for every rank rk, sets
+// touchedIdsBool[i * world_size + rk] = true when that rectangle overlaps rank rk's rectangle.
+// Entries are only ever set to true, so the caller must pass a buffer pre-filled with false.
+//
+// P:                   number of Gaussians.
+// height, width:       image size in pixels; used to derive the tile grid.
+// world_size:          number of ranks; rectangles must hold world_size * 4 ints.
+// means2D, radii:      per-Gaussian inputs forwarded to getRect().
+// rectangles:          per rank, 4 consecutive ints: y_l, y_r, x_l, x_r (tile units). The overlap
+//                      test below treats both rectangles as half-open ranges [l, r).
+// touchedIdsBool:      output, indexed as [i * world_size + rk].
+// avoid_pixel_all2all: when true, each rank's rectangle is grown by one tile per side before testing.
+__global__ void getTouchedIdsBoolAdjustMode6(
+	int P,
+	int height,
+	int width,
+	int world_size,
+	const float2* means2D,
+	const int* radii,// NOTE: radii is not const in getRect()
+	const int* rectangles,
+	bool* touchedIdsBool,
+	bool avoid_pixel_all2all)
+{
+	auto i = cg::this_grid().thread_rank();
+	if (i < P)
+	{
+		uint2 rect_min, rect_max;
+		// Number of tiles along x and y, rounded up.
+		dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1);
+
+		// getRect() is defined elsewhere; presumably it fills rect_min/rect_max with the
+		// Gaussian's tile bounds clamped to tile_grid -- confirm against its definition.
+		getRect(means2D[i], radii[i], rect_min, rect_max, tile_grid);
+
+		for (int rk = 0; rk < world_size; rk++)
+		{
+			// local_tile_y_l, local_tile_y_r, local_tile_x_l, local_tile_x_r
+			const int* rectangles_offset = rectangles+(rk*4);
+			int local_tile_y_l = *(rectangles_offset);
+			int local_tile_y_r = *(rectangles_offset+1);
+			int local_tile_x_l = *(rectangles_offset+2);
+			int local_tile_x_r = *(rectangles_offset+3);
+
+			// NOTE(review): rect_min/rect_max are uint2 (unsigned members) while the bounds here
+			// are int. In the comparisons below the int operand is converted to unsigned, so a
+			// lower bound of -1 would become UINT_MAX and `rect_max.x <= local_tile_x_l` would be
+			// true for every Gaussian, skipping the rank entirely. That is the most likely cause of
+			// the "weird behavior" seen with -1, and why lower bounds are only decremented when > 0.
+			if (avoid_pixel_all2all) {
+				if (local_tile_y_l>0) local_tile_y_l-=1;
+				if (local_tile_x_l>0) local_tile_x_l-=1;// WEIRD: letting this reach -1 misbehaves; see the note above.
+				// Upper bounds are not clamped to the tile grid here.
+				local_tile_y_r+=1;
+				local_tile_x_r+=1;
+			}
+			// No overlap if either rectangle lies entirely before the other along y or x.
+			if (rect_max.y <= local_tile_y_l || 
+				local_tile_y_r <= rect_min.y || 
+				rect_max.x <= local_tile_x_l || 
+				local_tile_x_r <= rect_min.x) continue;
+
+			touchedIdsBool[i * world_size + rk] = true;
+		}
+	}
+}
+
+// Host entry point (bound to Python as get_local2j_ids_bool_adjust_mode6).
+//
+// Allocates a (P, mp_world_size) bool tensor filled with false, using means2D's tensor
+// options (so it lives on the same device), and launches getTouchedIdsBoolAdjustMode6 to
+// set entry [i, rk] when Gaussian i overlaps rank rk's tile rectangle.
+//
+// image_height, image_width: image size in pixels.
+// mp_rank:        accepted for signature parity with GetLocal2jIdsBoolCUDA; not used here.
+// mp_world_size:  number of ranks; the kernel reads mp_world_size * 4 ints from `rectangles`.
+// means2D:        P = means2D.size(0); its data is reinterpreted as float2 per Gaussian.
+// radii:          read as int, one per Gaussian.
+// rectangles:     read as int, rows of [y_l, y_r, x_l, x_r] in tile units, one row per rank.
+//                 NOTE(review): neither dtype nor row count is validated here.
+// args:           must contain "avoid_pixel_all2all" (cast to bool).
+//
+// The launch is not followed by an error check or synchronization.
+torch::Tensor GetLocal2jIdsBoolAdjustMode6CUDA(
+	int image_height,
+	int image_width,
+	int mp_rank,
+	int mp_world_size,
+	const torch::Tensor& means2D,
+	const torch::Tensor& radii,
+	const torch::Tensor& rectangles,
+	const pybind11::dict &args)
+{
+	const int P = means2D.size(0);
+	const int H = image_height;
+	const int W = image_width;
+	bool avoid_pixel_all2all = args["avoid_pixel_all2all"].cast<bool>();
 
+	torch::Tensor local2jIdsBool = torch::full({P, mp_world_size}, false, means2D.options().dtype(torch::kBool));
+
+	// With no Gaussians the grid size below would be 0, which is an invalid launch
+	// configuration; the (empty) result is already complete, so return it directly.
+	if (P == 0)
+		return local2jIdsBool;
+
+	// One thread per Gaussian, rounded up to whole blocks.
+	getTouchedIdsBoolAdjustMode6 << <(P + ONE_DIM_BLOCK_SIZE - 1) / ONE_DIM_BLOCK_SIZE, ONE_DIM_BLOCK_SIZE >> >(
+		P,
+		H,
+		W,
+		mp_world_size,
+		reinterpret_cast<float2*>(means2D.contiguous().data<float>()),
+		radii.contiguous().data<int>(),
+		rectangles.contiguous().data<int>(),
+		local2jIdsBool.contiguous().data<bool>(),
+		avoid_pixel_all2all
+	);
+
+	return local2jIdsBool;
+}
 
 
 
diff --git a/rasterize_points.h b/rasterize_points.h
@@ -177,4 +177,15 @@ torch::Tensor GetLocal2jIdsBoolCUDA(
 	const torch::Tensor& dist_global_strategy,
 	const pybind11::dict &args);
 
+// Returns a (P, mp_world_size) bool tensor, P = means2D.size(0), whose entry [i, rk] is true
+// when Gaussian i's tile rectangle overlaps rank rk's rectangle.
+// `rectangles` holds one row of 4 ints per rank: y_l, y_r, x_l, x_r (tile units).
+// `args` must contain the key "avoid_pixel_all2all".
+torch::Tensor GetLocal2jIdsBoolAdjustMode6CUDA(
+	int image_height,
+	int image_width,
+	int mp_rank,
+	int mp_world_size,
+	const torch::Tensor& means2D,
+	const torch::Tensor& radii,
+	const torch::Tensor& rectangles,
+	const pybind11::dict &args);
+
+
 std::tuple<int, int, int> GetBlockXY();