@@ -263,77 +263,6 @@ __global__ void reduce_data_per_block(
     }
 }
 
-__global__ void getComputeLocally( // TODO: this function is not heavy enough to be parallelized.
-    const int tile_num,
-    uint32_t* gs_on_tiles_offsets,
-    bool* compute_locally,
-    int last_local_num_rendered_end,
-    int local_num_rendered_end
-) {
-    auto idx = cg::this_grid().thread_rank();
-    if (idx >= tile_num)
-        return;
-
-    int x = (int)gs_on_tiles_offsets[idx];
-    if (x > last_local_num_rendered_end && x <= local_num_rendered_end)
-        compute_locally[idx] = true;
-    else
-        compute_locally[idx] = false;
-}
-
-__global__ void getComputeLocallyByTileNum( // TODO: this function is not heavy enough to be parallelized.
-    const int tile_num,
-    bool* compute_locally,
-    int last_local_num_rendered_end,
-    int local_num_rendered_end
-) {
-    auto idx = cg::this_grid().thread_rank();
-    if (idx >= tile_num)
-        return;
-
-    if (idx >= last_local_num_rendered_end && idx < local_num_rendered_end)
-        compute_locally[idx] = true;
-    else
-        compute_locally[idx] = false;
-}
-
-__global__ void getComputeLocallyByTileId(
-    const int tile_num,
-    bool* compute_locally,
-    int tile_id_start,
-    int tile_id_end
-) {
-    auto idx = cg::this_grid().thread_rank();
-    if (idx >= tile_num)
-        return;
-
-    if (idx >= tile_id_start && idx < tile_id_end)
-        compute_locally[idx] = true;
-    else
-        compute_locally[idx] = false;
-}
-
-__global__ void getComputeLocallyByRowId(
-    const int tile_num,
-    bool* compute_locally,
-    int tile_grid_x,
-    int tile_grid_y,
-    int row_id_start,
-    int row_id_end
-) {
-    auto idx = cg::this_grid().thread_rank();
-    if (idx >= tile_num)
-        return;
-
-    int tile_x = idx % tile_grid_x;
-    int tile_y = idx / tile_grid_x;
-    if (tile_y >= row_id_start && tile_y < row_id_end)
-        compute_locally[idx] = true;
-    else
-        compute_locally[idx] = false;
-}
-
-
 __global__ void updateTileTouched(
     const int P,
     const dim3 tile_grid,
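
The four kernels removed above all follow the same one-thread-per-tile pattern and differ only in the interval test that sets `compute_locally[idx]`. The least obvious is `getComputeLocally`, which partitions tiles by cumulative Gaussian count rather than by tile index. Below is a minimal host-side sketch of that partition rule, written only to clarify the half-open interval `(last_local_num_rendered_end, local_num_rendered_end]`; the function name and vector types are illustrative and not taken from the file.

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Hedged sketch (not from the commit): the balanced-workload rule behind
// getComputeLocally, written on the host for clarity. offsets[i] plays the
// role of gs_on_tiles_offsets[i], i.e. the inclusive prefix sum of the
// number of Gaussians touching tiles 0..i.
std::vector<bool> assign_tiles_by_rendered_num(const std::vector<uint32_t>& offsets,
                                               int local_rank, int world_size)
{
    const int num_rendered = offsets.empty() ? 0 : static_cast<int>(offsets.back());
    const int per_device   = num_rendered / world_size + 1;
    const int lo = per_device * local_rank;                               // last_local_num_rendered_end
    const int hi = std::min(per_device * (local_rank + 1), num_rendered); // local_num_rendered_end

    std::vector<bool> compute_locally(offsets.size());
    for (std::size_t i = 0; i < offsets.size(); ++i) {
        const int x = static_cast<int>(offsets[i]);
        // A tile is local iff its cumulative count falls in (lo, hi]; each rank
        // therefore owns one contiguous run of tiles with roughly equal work.
        compute_locally[i] = (x > lo && x <= hi);
    }
    return compute_locally;
}
```

Because `offsets` is an inclusive prefix sum, an empty tile carries the same cumulative value as its predecessor and lands on the same rank, so every rank receives one contiguous run of tiles.
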
@@ -360,156 +289,6 @@ __global__ void updateTileTouched(
     tiles_touched[idx] = cnt;
 }
 
-__global__ void getGlobalGaussianOnTiles( // TODO: maybe this could take a significant amount of time.
-    const int P,
-    const float2* means2D,
-    int* radii,
-    const dim3 tile_grid,
-    uint32_t* gs_on_tiles
-) {
-    auto idx = cg::this_grid().thread_rank();
-    if (idx >= P)
-        return;
-
-    if (radii[idx] > 0)
-    {
-        uint2 rect_min, rect_max;
-        getRect(means2D[idx], radii[idx], rect_min, rect_max, tile_grid);
-        for (int y = rect_min.y; y < rect_max.y; y++)
-            for (int x = rect_min.x; x < rect_max.x; x++)
-            {
-                atomicAdd(&gs_on_tiles[y * tile_grid.x + x], 1);
-                // TODO: Do I have to use atomicAdd? This is slow, honestly.
-            }
-    }
-}
-
-// NOTE: This method should also deal with world_size == 1 safely.
-void updateDistributedStatLocally( // TODO: optimize the implementations of all these kernels.
-    const int P,
-    const int width,
-    const int height,
-    const dim3 tile_grid,
-    int* radii,
-    float2* means2D,
-    CudaRasterizer::DistributedState& distState,
-    const int local_rank,
-    const int world_size,
-    const char* dist_division_mode,
-    MyTimerOnGPU& timer
-) {
-    int tile_num = tile_grid.x * tile_grid.y;
-    timer.start("21 updateDistributedStatLocally.getGlobalGaussianOnTiles");
-    cudaMemset(distState.gs_on_tiles, 0, tile_num * sizeof(uint32_t));
-    getGlobalGaussianOnTiles<<<(P + ONE_DIM_BLOCK_SIZE - 1) / ONE_DIM_BLOCK_SIZE, ONE_DIM_BLOCK_SIZE>>>(
-        P,
-        means2D,
-        radii,
-        tile_grid,
-        distState.gs_on_tiles
-    );
-    timer.stop("21 updateDistributedStatLocally.getGlobalGaussianOnTiles");
-
-    // getComputeLocally
-    if (world_size >= 1) {
-        timer.start("22 updateDistributedStatLocally.InclusiveSum");
-        cub::DeviceScan::InclusiveSum(distState.scanning_space, distState.scan_size, distState.gs_on_tiles, distState.gs_on_tiles_offsets, tile_num);
-        timer.stop("22 updateDistributedStatLocally.InclusiveSum");
-
-        int num_rendered;
-        cudaMemcpy(&num_rendered, distState.gs_on_tiles_offsets + tile_num - 1, sizeof(int), cudaMemcpyDeviceToHost);
-
-        timer.start("23 updateDistributedStatLocally.getComputeLocally");
-        // Find the position by binary search or a customized kernel function?
-        // printf("dist_division_mode: %s, length: %d\n", dist_division_mode, strlen(dist_division_mode));
-        if (strcmp(dist_division_mode, "rendered_num") == 0) {
-            int num_rendered_per_device = num_rendered / world_size + 1;
-            int last_local_num_rendered_end = num_rendered_per_device * local_rank;
-            int local_num_rendered_end = min(num_rendered_per_device * (local_rank + 1), num_rendered);
-            getComputeLocally<<<(tile_num + ONE_DIM_BLOCK_SIZE - 1) / ONE_DIM_BLOCK_SIZE, ONE_DIM_BLOCK_SIZE>>>(
-                tile_num,
-                distState.gs_on_tiles_offsets,
-                distState.compute_locally,
-                last_local_num_rendered_end,
-                local_num_rendered_end
-            );
-            distState.last_local_num_rendered_end = last_local_num_rendered_end;
-            distState.local_num_rendered_end = local_num_rendered_end;
-        } else if (strcmp(dist_division_mode, "tile_num") == 0) {
-            int num_tiles_per_device = tile_num / world_size + 1;
-            int last_local_num_rendered_end = num_tiles_per_device * local_rank;
-            int local_num_rendered_end = min(num_tiles_per_device * (local_rank + 1), tile_num);
-            // TODO: optimize this; in some cases the tiles are not divided evenly, e.g. 2170 is split into 1086 and 1084.
-            getComputeLocallyByTileNum<<<(tile_num + ONE_DIM_BLOCK_SIZE - 1) / ONE_DIM_BLOCK_SIZE, ONE_DIM_BLOCK_SIZE>>>(
-                tile_num,
-                distState.compute_locally,
-                last_local_num_rendered_end,
-                local_num_rendered_end
-            );
-            distState.last_local_num_rendered_end = last_local_num_rendered_end;
-            distState.local_num_rendered_end = local_num_rendered_end;
-        } else if (dist_division_mode[0] == 'T') {
-            // dist_division_mode example: "T:0,1" or "T:10,20"
-            char* dist_division_mode_left = new char[strlen(dist_division_mode) + 1];
-            char* dist_division_mode_right = new char[strlen(dist_division_mode) + 1];
-            strcpy(dist_division_mode_left, dist_division_mode);
-            strcpy(dist_division_mode_right, dist_division_mode);
-
-            char* pch = strtok(dist_division_mode_left, ":");
-            pch = strtok(NULL, ":");
-            pch = strtok(pch, ",");
-            int tile_id_start = atoi(pch);
-            pch = strtok(NULL, ",");
-            int tile_id_end = atoi(pch);
-            delete[] dist_division_mode_left;
-            delete[] dist_division_mode_right;
-            // printf("dist_division_mode is %s, tile_id_start is %d, tile_id_end is %d\n", dist_division_mode, tile_id_start, tile_id_end);
-
-            getComputeLocallyByTileId<<<(tile_num + ONE_DIM_BLOCK_SIZE - 1) / ONE_DIM_BLOCK_SIZE, ONE_DIM_BLOCK_SIZE>>>(
-                tile_num,
-                distState.compute_locally,
-                tile_id_start,
-                tile_id_end
-            );
-            distState.last_local_num_rendered_end = tile_id_start;
-            distState.local_num_rendered_end = tile_id_end;
-
-        } else {
-            // dist_division_mode example: "0,1" or "10,20"
-            // TODO: refactor: change this format to "R:0,1" or "R:10,20" later.
-
-            char* dist_division_mode_left = new char[strlen(dist_division_mode) + 1];
-            char* dist_division_mode_right = new char[strlen(dist_division_mode) + 1];
-            strcpy(dist_division_mode_left, dist_division_mode);
-            strcpy(dist_division_mode_right, dist_division_mode);
-            char* pch = strtok(dist_division_mode_left, ",");
-            int row_id_start = atoi(pch);
-            pch = strtok(NULL, ",");
-            int row_id_end = atoi(pch);
-            delete[] dist_division_mode_left;
-            delete[] dist_division_mode_right;
-            // printf("dist_division_mode is %s, row_id_start is %d, row_id_end is %d\n", dist_division_mode, row_id_start, row_id_end);
-
-            getComputeLocallyByRowId<<<(tile_num + ONE_DIM_BLOCK_SIZE - 1) / ONE_DIM_BLOCK_SIZE, ONE_DIM_BLOCK_SIZE>>>(
-                tile_num,
-                distState.compute_locally,
-                tile_grid.x,
-                tile_grid.y,
-                row_id_start,
-                row_id_end
-            );
-            distState.last_local_num_rendered_end = row_id_start * tile_grid.x;
-            distState.local_num_rendered_end = row_id_end * tile_grid.x;
-
-            // printf("division_mode: %s is not supported.\n", dist_division_mode);
-        }
-        timer.stop("23 updateDistributedStatLocally.getComputeLocally");
-    }
-    else {
-        cudaMemset(distState.compute_locally, true, tile_num * sizeof(bool));
-    }
-}
-
 void save_log_in_file(int iteration, int local_rank, int world_size, std::string log_folder, const char* filename_prefix, const char* log_content) {
     char* filename = new char[128];
     sprintf(filename, "%s/%s_ws=%d_rk=%d.log", log_folder.c_str(), filename_prefix, world_size, local_rank);
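
The removed `updateDistributedStatLocally` above relies on `cub::DeviceScan::InclusiveSum` to turn per-tile Gaussian counts into cumulative offsets, passing a scratch buffer (`scanning_space` / `scan_size`) that appears to be sized elsewhere in `DistributedState`. For readers unfamiliar with CUB's usual two-phase calling convention, here is a self-contained sketch; all names in it are local to the example rather than taken from the file, and it must be compiled with nvcc.

```cpp
// Hedged sketch (not from the commit) of CUB's two-phase calling convention
// for DeviceScan::InclusiveSum.
#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <cstdint>

void inclusive_sum_example(const uint32_t* d_in, uint32_t* d_out, int num_items)
{
    void*  d_temp_storage     = nullptr;
    size_t temp_storage_bytes = 0;

    // Phase 1: with d_temp_storage == nullptr, CUB only reports how many
    // scratch bytes it needs and writes nothing to d_out.
    cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Phase 2: run the scan; d_out[i] = d_in[0] + ... + d_in[i].
    cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    cudaDeviceSynchronize();

    cudaFree(d_temp_storage);
}
```

The last output element, `d_out[num_items - 1]`, is the grand total, which is exactly the value the removed code copied back to the host as `num_rendered`.
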
@@ -535,7 +314,8 @@ std::tuple<int, int, int, int, int, bool, bool, std::string, std::string, std::s
     std::string log_folder_str = args["log_folder"].cast<std::string>();
     std::string zhx_debug_str = args["zhx_debug"].cast<std::string>();
     std::string zhx_time_str = args["zhx_time"].cast<std::string>();
-    std::string dist_division_mode_str = args["dist_division_mode"].cast<std::string>();
+    // std::string dist_division_mode_str = args["dist_division_mode"].cast<std::string>();
+    std::string dist_division_mode_str = "";
 
     int local_rank = std::stoi(local_rank_str);
     int world_size = std::stoi(world_size_str);
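
The added lines above hard-code `dist_division_mode_str` to an empty string and keep the old dict lookup only as a comment. If the `dist_division_mode` key might still be supplied from the Python side, a slightly more defensive variant (an assumption on my part, not what this commit does) is to read it only when the key is present:

```cpp
// Hedged alternative (not what the commit does): read dist_division_mode from
// the pybind11 dict only if the Python caller still passes it, otherwise fall
// back to the same empty-string default the commit now uses unconditionally.
#include <pybind11/pybind11.h>
#include <string>

std::string read_dist_division_mode(const pybind11::dict& args)
{
    if (args.contains("dist_division_mode"))
        return args["dist_division_mode"].cast<std::string>();
    return "";
}
```
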
@@ -728,80 +508,8 @@ void CudaRasterizer::Rasterizer::getDistributionStrategy(
     bool debug,
     const pybind11::dict &args)
 {
-    auto [local_rank, world_size, iteration, log_interval, device, zhx_debug, zhx_time, mode, dist_division_mode, log_folder] = prepareArgs(args);
-    char* log_tmp = new char[500];
-
-    MyTimerOnGPU timer;
-
-    dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1);
-    int tile_num = tile_grid.x * tile_grid.y;
-
-    size_t dist_chunk_size = required<DistributedState>(tile_grid.x * tile_grid.y);
-    char* dist_chunkptr = distBuffer(dist_chunk_size);
-    DistributedState distState = DistributedState::fromChunk(dist_chunkptr, tile_grid.x * tile_grid.y, true);
-
-    distState.compute_locally = compute_locally;
-    // NOTE: do not allocate memory for distState.compute_locally in fromChunk for sep_rendering mode,
-    // but use the compute_locally from python.
-
-    // Use means2D and radii to decide how to evenly distribute the workloads.
-    timer.start("20 updateDistributedStatLocally");
-    updateDistributedStatLocally( // FIXME: in memory_distribution mode, this function's calculation is not correct.
-        P,
-        width,
-        height,
-        tile_grid,
-        radii,
-        means2D,
-        distState,
-        local_rank,
-        world_size,
-        dist_division_mode.c_str(),
-        timer
-    );
-    timer.stop("20 updateDistributedStatLocally");
-
-    // DEBUG: print out compute_locally information
-    if (mode == "train" && zhx_debug && iteration % log_interval == 1) {
-        int last_local_num_rendered_end = distState.last_local_num_rendered_end;
-        int local_num_rendered_end = distState.local_num_rendered_end;
-        uint32_t* gs_on_tiles_cpu = new uint32_t[tile_grid.x * tile_grid.y];
-        CHECK_CUDA(cudaMemcpy(gs_on_tiles_cpu, distState.gs_on_tiles, tile_grid.x * tile_grid.y * sizeof(uint32_t), cudaMemcpyDeviceToHost), debug);
-
-        // distState.compute_locally to cpu
-        bool* compute_locally_cpu = new bool[tile_grid.x * tile_grid.y];
-        CHECK_CUDA(cudaMemcpy(compute_locally_cpu, distState.compute_locally, tile_grid.x * tile_grid.y * sizeof(bool), cudaMemcpyDeviceToHost), debug);
-
-        int num_local_tiles = 0;
-        int local_tiles_left_idx = 999999999;
-        int local_tiles_right_idx = 0;
-        int num_rendered_from_distState = 0;
-        for (int i = 0; i < tile_grid.x * tile_grid.y; i++)
-        {
-            if (compute_locally_cpu[i])
-            {
-                if (local_tiles_left_idx == 999999999)
-                    local_tiles_left_idx = i;
-                local_tiles_right_idx = i;
-                num_local_tiles++;
-                num_rendered_from_distState += (int)gs_on_tiles_cpu[i];
-            }
-        }
-
-        sprintf(log_tmp, "num_local_tiles: %d, local_tiles_left_idx: %d, local_tiles_right_idx: %d, last_local_num_rendered_end: %d, local_num_rendered_end: %d, num_rendered_from_distState: %d",
-            (int)num_local_tiles, (int)local_tiles_left_idx, (int)local_tiles_right_idx, (int)last_local_num_rendered_end, (int)local_num_rendered_end, (int)num_rendered_from_distState);
-        save_log_in_file(iteration, local_rank, world_size, log_folder, "num_rendered", log_tmp);
-
-        delete[] compute_locally_cpu;
-        delete[] gs_on_tiles_cpu;
-    }
-
-    // Print out timing information
-    if (zhx_time && iteration % log_interval == 1) {
-        timer.printAllTimes(iteration, world_size, local_rank, log_folder, false);
-    }
-
-    delete[] log_tmp;
+    // This function is deprecated for now, but I keep the structure of the code here for potential future use.
+    throw std::runtime_error("getDistributionStrategy is deprecated.");
 }
 
 /////////////////////////////// Render ///////////////////////////////
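
The replacement body of `getDistributionStrategy` keeps the exported signature but throws unconditionally. If the function must stay callable from the bindings while C++ callers are steered away, one optional addition (a suggestion, not part of this commit) is to pair the runtime error with the standard `[[deprecated]]` attribute so call sites also get a compile-time warning:

```cpp
// Hypothetical stub (not part of the commit): combine the runtime error with
// the [[deprecated]] attribute. The name getDistributionStrategy_stub is
// illustrative only.
#include <stdexcept>

[[deprecated("getDistributionStrategy is deprecated")]]
inline void getDistributionStrategy_stub()
{
    throw std::runtime_error("getDistributionStrategy is deprecated.");
}
```
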