@@ -926,6 +926,78 @@ def fun(float(N, K) I, float(N) O0) -> (O) {
926926)TC" );
927927}
928928
929+ /*
930+ * Check that a 2D mean with these parameters does not produce a library call.
931+ * The call is not produced because the band is tiled by 32 and 512 threads are
932+ * mapped to the band.
933+ * In practice, check that the library call does not appear in the code.
934+ */
935+ TEST_F (PolyhedralMapperTest, Mean2DNonParametric_512threads) {
936+ string tc = R"TC(
937+ def fun(float(36864, 1024) I) -> (O) {
938+ O(n) +=! I(n, r_n)
939+ O(n) = O(n) / (1024)
940+ }
941+ )TC" ;
942+ auto mappingOptions =
943+ DefaultOptions ()
944+ .outerScheduleFusionStrategy (tc::FusionStrategy::Preserve3Coincident)
945+ .outerScheduleAllowSkewing (false )
946+ .outerSchedulePositiveOrthant (true )
947+ .intraTileScheduleFusionStrategy (tc::FusionStrategy::Min)
948+ .intraTileScheduleAllowSkewing (false )
949+ .intraTileSchedulePositiveOrthant (true )
950+ .fixParametersBeforeScheduling (false )
951+ .tile (18 , 32 )
952+ .unroll (16 )
953+ .tileImperfectlyNested (false )
954+ .matchLibraryCalls (true )
955+ .mapToThreads ({512 })
956+ .mapToBlocks ({16384 })
957+ .useSharedMemory (true )
958+ .usePrivateMemory (false )
959+ .unrollCopyShared (true );
960+
961+ auto code = codegenMapped (tc, mappingOptions);
962+ using tc::code::cuda::kCUBReductionName ;
963+ EXPECT_TRUE (code.find (kCUBReductionName ) == std::string::npos);
964+ }
965+
966+ /*
967+ * Check that a 2D mean with these parameters produce a reduction library call.
968+ * In practice, check that the library call appears in the code.
969+ */
970+ TEST_F (PolyhedralMapperTest, Mean2DNonParametric_32threads) {
971+ string tc = R"TC(
972+ def fun(float(36864, 1024) I) -> (O) {
973+ O(n) +=! I(n, r_n)
974+ O(n) = O(n) / (1024)
975+ }
976+ )TC" ;
977+ auto mappingOptions =
978+ DefaultOptions ()
979+ .outerScheduleFusionStrategy (tc::FusionStrategy::Preserve3Coincident)
980+ .outerScheduleAllowSkewing (false )
981+ .outerSchedulePositiveOrthant (true )
982+ .intraTileScheduleFusionStrategy (tc::FusionStrategy::Min)
983+ .intraTileScheduleAllowSkewing (false )
984+ .intraTileSchedulePositiveOrthant (true )
985+ .fixParametersBeforeScheduling (false )
986+ .tile (18 , 32 )
987+ .unroll (16 )
988+ .tileImperfectlyNested (false )
989+ .matchLibraryCalls (true )
990+ .mapToThreads ({32 })
991+ .mapToBlocks ({16384 })
992+ .useSharedMemory (true )
993+ .usePrivateMemory (false )
994+ .unrollCopyShared (true );
995+
996+ auto code = codegenMapped (tc, mappingOptions);
997+ using tc::code::cuda::kCUBReductionName ;
998+ EXPECT_TRUE (code.find (kCUBReductionName ) != std::string::npos);
999+ }
1000+
9291001static const string kTcMM = R"TC(
9301002def fun(float(M, K) A, float(K, N) B) -> (C) {
9311003 C(m, n) +=! A(m, r_k) * B(r_k, n)
0 commit comments