 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
+#include "batchmatmul.h"
+
 #include <iostream>
 #include <string>
 #include <vector>
@@ -43,23 +45,22 @@ DEFINE_uint32(M, 72, "M dimension in Z(b, n, m) += X(b, n, kk) * Y(b, kk, m)");
 DEFINE_uint32(K, 26, "K dimension in Z(b, n, m) += X(b, n, kk) * Y(b, kk, m)");

 class BatchMatMul : public Benchmark {
+ protected:
+  uint32_t B, N, M, K;
+
  public:
-  void runBatchMatMul(
-      uint32_t B,
-      uint32_t N,
-      uint32_t M,
-      uint32_t K,
-      const tc::CudaMappingOptions& options,
-      bool use_flags = false);
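+  // Problem sizes, set once per test via Init and read by the run* methods.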
+  void Init(uint32_t b, uint32_t n, uint32_t m, uint32_t k) {
+    B = b;
+    N = n;
+    M = m;
+    K = k;
+  }
+  void runBatchMatMul(const tc::CudaMappingOptions& options);
+  void runCaffe2BatchMatMul();
+  void runATenBatchMatMul();
 };

-void BatchMatMul::runBatchMatMul(
-    uint32_t B,
-    uint32_t N,
-    uint32_t M,
-    uint32_t K,
-    const tc::CudaMappingOptions& options,
-    bool use_flags) {
+void BatchMatMul::runBatchMatMul(const tc::CudaMappingOptions& options) {
   at::Tensor X = at::CUDA(at::kFloat).rand({B, N, M});
   at::Tensor Y = at::CUDA(at::kFloat).rand({B, M, K});

@@ -85,96 +86,83 @@ def batch_matmul(float(B, N, M) X, float(B, M, K) Y) -> (Z) {
   std::string suffix = std::string("_B_") + std::to_string(FLAGS_B) +
       std::string("_K_") + std::to_string(FLAGS_K) + std::string("_M_") +
       std::to_string(FLAGS_M) + std::string("_N_") + std::to_string(FLAGS_N);
-  if (use_flags && FLAGS_validate_proto) {
-    validateProto(
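+  // Start from the caller-provided options; the autotuner overwrites them
+  // below when enabled.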
+  std::vector<tc::CudaMappingOptions> bestOptions{options};
+  if (FLAGS_autotune) {
+    bestOptions = autotune(
         FLAGS_save_tuner_proto_prefix + std::string("/batchmatmul_cache") +
             suffix,
+        FLAGS_save_tuner_proto_prefix + std::string("/batchmatmul_best") +
+            suffix,
         tc,
         "batch_matmul",
         inputs,
+        options,
         check_fun);
-  } else {
-    Check(tc, "batch_matmul", options, inputs, check_fun);
-    if (use_flags) {
-      autotune(
-          FLAGS_save_tuner_proto_prefix + std::string("/batchmatmul_cache") +
-              suffix,
-          FLAGS_save_tuner_proto_prefix + std::string("/batchmatmul_best") +
-              suffix,
-          tc,
-          "batch_matmul",
-          inputs,
-          options,
-          check_fun);
-    }
   }
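+  // Check correctness and benchmark with the best options found (or the
+  // caller-provided options when autotuning is off).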
+  Check(tc, "batch_matmul", bestOptions[0], inputs, check_fun);
 }

-TEST_F(BatchMatMul, TransposedBatchMatMul) {
-  auto B = FLAGS_B;
-  auto N = FLAGS_N;
-  auto M = FLAGS_M;
-  auto K = FLAGS_K;
-  auto options = tc::CudaMappingOptions::makeNaiveMappingOptions()
-                     .tile(1)
-                     .mapToThreads({128})
-                     .mapToBlocks({B})
-                     .useSharedMemory(true)
-                     .usePrivateMemory(true)
-                     .unroll(256);
-  runBatchMatMul(B, N, M, K, options, true);
-}
-
-TEST_F(BatchMatMul, TransposedBatchMatMul_P100_autotuned_B_500_K_26_M_72_N_26) {
-  uint32_t B = 500;
-  uint32_t K = 26;
-  uint32_t M = 72;
-  uint32_t N = 26;
-  auto options = tc::CudaMappingOptions::makeNaiveMappingOptions()
-                     .outerScheduleFusionStrategy(tc::FusionStrategy::Max)
-                     .outerScheduleAllowSkewing(false)
-                     .outerSchedulePositiveOrthant(true)
-                     .intraTileScheduleFusionStrategy(tc::FusionStrategy::Min)
-                     .intraTileScheduleAllowSkewing(false)
-                     .intraTileSchedulePositiveOrthant(true)
-                     .tile(3)
-                     .mapToThreads(4, 36, 3)
-                     .mapToBlocks(512)
-                     .unroll(64)
-                     .tileImperfectlyNested(false)
-                     .useSharedMemory(true)
-                     .usePrivateMemory(false)
-                     .unrollCopyShared(true)
-                     .matchLibraryCalls(true);
-  runBatchMatMul(B, N, M, K, options);
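+// Caffe2 reference: benchmark the CUDA BatchMatMul operator on the same
+// shapes.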
+void BatchMatMul::runCaffe2BatchMatMul() {
+  Workspace w_ref;
+  auto AddInput = AddDeterministicallyRandomInput<caffe2::CUDABackend, float>;
+  AddInput(w_ref, {B, N, M}, "X");
+  AddInput(w_ref, {B, M, K}, "Y");
+  OperatorDef ref_def =
+      MakeOperatorDef<caffe2::CUDABackend>("BatchMatMul", {"X", "Y"}, {"Z"});
+  std::unique_ptr<OperatorBase> net(CreateOperator(ref_def, &w_ref));
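+  // Time the operator itself; the init lambda has nothing to prepare.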
+  Reference([&]() { return true; }, [&](bool flag) { net->Run(); });
 }

-TEST_F(BatchMatMul, ATenTransposedBatchMatMulReference) {
-  auto B = FLAGS_B;
-  auto N = FLAGS_N;
-  auto M = FLAGS_M;
-  auto K = FLAGS_K;
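+// ATen reference: benchmark at::bmm on the same shapes.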
+void BatchMatMul::runATenBatchMatMul() {
   at::Tensor X = at::CUDA(at::kFloat).rand({B, N, M});
   at::Tensor Y = at::CUDA(at::kFloat).rand({B, M, K});
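+  // bmm allocates the result once; bmm_out reuses that tensor on the
+  // subsequent timed runs.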
   Reference(
       [&]() { return bmm(X, Y); },
       [&](at::Tensor& res) { bmm_out(res, X, Y); });
 }

-TEST_F(BatchMatMul, C2TransposedBatchMatMulReference) {
-  int B = FLAGS_B;
-  int N = FLAGS_N;
-  int M = FLAGS_M;
-  int K = FLAGS_K;
+// Generic
+TEST_F(BatchMatMul, TransposedBatchMatMul) {
+  Init(FLAGS_B, FLAGS_N, FLAGS_M, FLAGS_K);
+  runBatchMatMul(tc::CudaMappingOptions::makeNaiveMappingOptions());
+}

-  Workspace w_ref;
-  auto AddInput = AddDeterministicallyRandomInput<caffe2::CUDABackend, float>;
-  AddInput(w_ref, {B, N, M}, "X");
-  AddInput(w_ref, {B, M, K}, "Y");
-  OperatorDef ref_def =
-      MakeOperatorDef<caffe2::CUDABackend>("BatchMatMul", {"X", "Y"}, {"Z"});
-  std::unique_ptr<OperatorBase> net(CreateOperator(ref_def, &w_ref));
-  Reference([&]() { return true; }, [&](bool flag) { net->Run(); });
+// P100 TC
+TEST_F(BatchMatMul, TransposedBatchMatMul_P100_autotuned_B_500_K_26_M_72_N_26) {
+  Init(500, 26, 72, 26);
+  runBatchMatMul(
+      tc::options_TransposedBatchMatMul_P100_autotuned_B_500_K_26_M_72_N_26);
+}
+
+// P100 ATen
+TEST_F(BatchMatMul, TransposedBatchMatMul_ATen_P100_B_500_K_26_M_72_N_26) {
+  Init(500, 26, 72, 26);
+  runATenBatchMatMul();
+}
+
+// P100 Caffe2
+TEST_F(BatchMatMul, TransposedBatchMatMul_Caffe2_P100_B_500_K_26_M_72_N_26) {
+  Init(500, 26, 72, 26);
+  runCaffe2BatchMatMul();
+}
+
+// V100 TC
+TEST_F(BatchMatMul, TransposedBatchMatMul_V100_autotuned_B_500_K_26_M_72_N_26) {
+  Init(500, 26, 72, 26);
+  runBatchMatMul(
+      tc::options_TransposedBatchMatMul_V100_autotuned_B_500_K_26_M_72_N_26);
+}
+
+// V100 ATen
+TEST_F(BatchMatMul, TransposedBatchMatMul_ATen_V100_B_500_K_26_M_72_N_26) {
+  Init(500, 26, 72, 26);
+  runATenBatchMatMul();
+}
+
+// V100 Caffe2
+TEST_F(BatchMatMul, TransposedBatchMatMul_Caffe2_V100_B_500_K_26_M_72_N_26) {
+  Init(500, 26, 72, 26);
+  runCaffe2BatchMatMul();
 }

 int main(int argc, char** argv) {