Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit c55e338

Browse files
Add a benchmark for first and second moments and sum / sumSquares
In particular, this benchmark used to exhibit a performance bug where we left a lot of performance on the table for mean / var because of the normalization update, which used to disable reduction detection. This is fixed in 6fdced3. The performance ratio between (sum, mean) or (sumSquares, var) used to be about 4x on a P100 card. Note that Max outer fuse followed by Min intra fuse works nicely and allows reductions to occur. Lastly, fusing sum and sumSquares only gets an extra 10% over the sum of the kernels, which shows the benefit of fusion. This has been driven by performance analysis of GroupNormalization, which will be added in a follow-up commit.
1 parent 6a055ee commit c55e338

File tree

3 files changed

+900
-0
lines changed

3 files changed

+900
-0
lines changed

tc/benchmarks/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ set(BENCHMARKS
1919
batchmatmul
2020
group_convolution
2121
kronecker
22+
moments
2223
tmm
2324
MLP_model
2425
)

tc/benchmarks/moments.cc

Lines changed: 349 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,349 @@
1+
/**
2+
* Copyright (c) 2017-present, Facebook, Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#include "moments.h"
17+
18+
#include <iostream>
19+
#include <string>
20+
#include <vector>
21+
22+
#include <gflags/gflags.h>
23+
#include <glog/logging.h>
24+
#include <gtest/gtest.h>
25+
26+
#include "tc/aten/aten.h"
27+
28+
#include "tc/aten/aten_compiler.h"
29+
#include "tc/core/cuda/cuda_mapping_options.h"
30+
31+
#include "../test/caffe2/cuda/test_harness.h"
32+
#include "../test/caffe2/test_harness.h"
33+
#include "../test/test_harness_aten_cuda.h"
34+
#include "benchmark_fixture.h"
35+
36+
#include "tc/c2/context.h"
37+
#include "tc/core/cuda/cuda.h"
38+
#include "tc/core/flags.h"
39+
40+
using namespace caffe2;
41+
42+
// Command-line flag --N (default 1024). The generic tests pass FLAGS_N to
// Init() as the first dimension of the {N, K} input tensor.
DEFINE_uint32(N, 1024, "N batch size (32 * 32 from group_norm equivalent)");
43+
// Command-line flag --K (default 36864). The generic tests pass FLAGS_K to
// Init() as the second (reduced) dimension of the {N, K} input tensor.
DEFINE_uint32(K, 36864, "K row size (16 * 48 * 48 from group_norm equivalent)");
44+
45+
class Moments2_2D_1D : public Benchmark {
46+
protected:
47+
uint32_t N, K;
48+
at::Tensor I, sum, mean, sumSquares, var;
49+
50+
public:
51+
void Init(uint32_t n, uint32_t k) {
52+
N = n;
53+
K = k;
54+
I = at::CUDA(at::kFloat).rand({N, K}).uniform_(0.0f, 1.0f);
55+
at::Tensor v = I.view({N, -1});
56+
sum = v.sum(1);
57+
mean = v.mean(-1, true).view({N});
58+
sumSquares = v.pow(2.0f).sum(1);
59+
var = v.var(-1, true).view({N});
60+
}
61+
void runSum_2D_1D(const tc::CudaMappingOptions& options);
62+
void runMean_2D_1D(const tc::CudaMappingOptions& options);
63+
void runSumSquares_2D_1D(const tc::CudaMappingOptions& options);
64+
void runVar_2D_1D(const tc::CudaMappingOptions& options);
65+
void runSumAndSquares_2D_1D(const tc::CudaMappingOptions& options);
66+
void runMoments2_2D_1D(const tc::CudaMappingOptions& options);
67+
68+
private:
69+
void autotuneAndCheck(
70+
const std::string& entryPoint,
71+
const std::vector<at::Tensor>& inputs,
72+
const tc::CudaMappingOptions& options,
73+
std::function<bool(
74+
const std::vector<at::Tensor>& inputs,
75+
const std::vector<at::Tensor>& outputs)> checkFun);
76+
};
77+
78+
void Moments2_2D_1D::autotuneAndCheck(
79+
const std::string& entryPoint,
80+
const std::vector<at::Tensor>& inputs,
81+
const tc::CudaMappingOptions& options,
82+
std::function<bool(
83+
const std::vector<at::Tensor>& inputs,
84+
const std::vector<at::Tensor>& outputs)> checkFun) {
85+
std::string suffix = std::string("_N_") + std::to_string(N) +
86+
std::string("_K_") + std::to_string(K);
87+
std::vector<tc::CudaMappingOptions> bestOptions{options};
88+
if (FLAGS_autotune) {
89+
bestOptions = autotune(
90+
FLAGS_save_tuner_proto_prefix + std::string("/moments_cache") + suffix,
91+
FLAGS_save_tuner_proto_prefix + std::string("/moments_best") + suffix,
92+
tc::TC_Moments,
93+
entryPoint,
94+
inputs,
95+
options);
96+
CHECK_GE(bestOptions.size(), 1u);
97+
}
98+
Check(tc::TC_Moments, entryPoint, bestOptions[0], inputs, checkFun);
99+
}
100+
101+
/// Runs the TC_Sum_2D_1D_NAME entry point on I and compares output 0 with
/// the ATen reference `sum`.
void Moments2_2D_1D::runSum_2D_1D(const tc::CudaMappingOptions& options) {
  // Validator handed to Check(); it always returns true and relies on
  // checkRtol to report a mismatch.
  // NOTE(review): the 3rd/4th checkRtol arguments are presumably an
  // operation count and a relative precision; confirm against the harness.
  auto matchesReference = [this](
                              const std::vector<at::Tensor>& ins,
                              const std::vector<at::Tensor>& outs) {
    checkRtol(outs[0] - sum, ins, K, 1e-5);
    return true;
  };

  std::vector<at::Tensor> tcInputs{I};
  autotuneAndCheck(tc::TC_Sum_2D_1D_NAME, tcInputs, options, matchesReference);
}
110+
111+
/// Runs the TC_Mean_2D_1D_NAME entry point on I and compares output 0 with
/// the ATen reference `mean`.
void Moments2_2D_1D::runMean_2D_1D(const tc::CudaMappingOptions& options) {
  // Validator handed to Check(); it always returns true and relies on
  // checkRtol to report a mismatch.
  auto matchesReference = [this](
                              const std::vector<at::Tensor>& ins,
                              const std::vector<at::Tensor>& outs) {
    checkRtol(outs[0] - mean, ins, K, 1e-5);
    return true;
  };

  std::vector<at::Tensor> tcInputs{I};
  autotuneAndCheck(tc::TC_Mean_2D_1D_NAME, tcInputs, options, matchesReference);
}
120+
121+
void Moments2_2D_1D::runSumSquares_2D_1D(
122+
const tc::CudaMappingOptions& options) {
123+
std::vector<at::Tensor> inputs{I};
124+
auto check_fun = [&](const std::vector<at::Tensor>& inputs,
125+
const std::vector<at::Tensor>& outputs) {
126+
checkRtol(outputs[0] - sumSquares, inputs, 2 * K, 1e5);
127+
return true;
128+
};
129+
autotuneAndCheck(tc::TC_Sum_Squares_2D_1D_NAME, inputs, options, check_fun);
130+
}
131+
132+
/// Runs the TC_Var_2D_1D_NAME entry point and compares output 0 with the
/// ATen reference `var`. Unlike the other runners, this entry point receives
/// two inputs: I and the precomputed reference `mean`.
void Moments2_2D_1D::runVar_2D_1D(const tc::CudaMappingOptions& options) {
  // Validator handed to Check(); it always returns true and relies on
  // checkRtol to report a mismatch.
  auto matchesReference = [this](
                              const std::vector<at::Tensor>& ins,
                              const std::vector<at::Tensor>& outs) {
    checkRtol(outs[0] - var, ins, K, 1e-5);
    return true;
  };

  std::vector<at::Tensor> tcInputs{I, mean};
  autotuneAndCheck(tc::TC_Var_2D_1D_NAME, tcInputs, options, matchesReference);
}
141+
142+
void Moments2_2D_1D::runSumAndSquares_2D_1D(
143+
const tc::CudaMappingOptions& options) {
144+
std::vector<at::Tensor> inputs{I};
145+
auto check_fun = [&](const std::vector<at::Tensor>& inputs,
146+
const std::vector<at::Tensor>& outputs) {
147+
checkRtol(outputs[0] - sum, inputs, 2 * K, 1e-5);
148+
checkRtol(outputs[1] - sumSquares, inputs, 2 * K, 1e-5);
149+
return true;
150+
};
151+
autotuneAndCheck(
152+
tc::TC_Sum_And_Squares_2D_1D_NAME, inputs, options, check_fun);
153+
}
154+
155+
/// Runs the TC_Moments2_2D_1D_NAME entry point on I. Output 0 is compared
/// with the reference `mean` (count K), output 1 with the reference `var`
/// (count 2 * K).
void Moments2_2D_1D::runMoments2_2D_1D(const tc::CudaMappingOptions& options) {
  // Validator handed to Check(); it always returns true and relies on
  // checkRtol to report a mismatch.
  auto matchesReference = [this](
                              const std::vector<at::Tensor>& ins,
                              const std::vector<at::Tensor>& outs) {
    checkRtol(outs[0] - mean, ins, K, 1e-5);
    checkRtol(outs[1] - var, ins, 2 * K, 1e-5);
    return true;
  };

  std::vector<at::Tensor> tcInputs{I};
  autotuneAndCheck(
      tc::TC_Moments2_2D_1D_NAME, tcInputs, options, matchesReference);
}
165+
166+
/// Sum
167+
// Generic
168+
// Sum with the problem size taken from --N / --K, starting from the naive
// mapping options.
TEST_F(Moments2_2D_1D, Sum_2D_1D) {
  Init(FLAGS_N, FLAGS_K);
  const auto naiveOptions = tc::CudaMappingOptions::makeNaiveMappingOptions();
  runSum_2D_1D(naiveOptions);
}
172+
173+
// P100
174+
// Sum at N=128, K=2304 with the stored options named for P100.
TEST_F(Moments2_2D_1D, Sum_2D_1D_P100_autotuned_N_128_K_2304) {
  const uint32_t rows = 128;
  const uint32_t cols = 2304;
  Init(rows, cols);
  const auto& tunedOptions = tc::options_Sum_2D_1D_P100_autotuned_N_128_K_2304;
  runSum_2D_1D(tunedOptions);
}
178+
179+
// Sum at N=1024, K=36864 with the stored options named for P100.
TEST_F(Moments2_2D_1D, Sum_2D_1D_P100_autotuned_N_1024_K_36864) {
  const uint32_t rows = 1024;
  const uint32_t cols = 36864;
  Init(rows, cols);
  const auto& tunedOptions =
      tc::options_Sum_2D_1D_P100_autotuned_N_1024_K_36864;
  runSum_2D_1D(tunedOptions);
}
183+
184+
// V100
185+
// Sum at N=128, K=2304 with the stored options named for V100.
TEST_F(Moments2_2D_1D, Sum_2D_1D_V100_autotuned_N_128_K_2304) {
  const uint32_t rows = 128;
  const uint32_t cols = 2304;
  Init(rows, cols);
  const auto& tunedOptions = tc::options_Sum_2D_1D_V100_autotuned_N_128_K_2304;
  runSum_2D_1D(tunedOptions);
}
189+
190+
// Sum at N=1024, K=36864 with the stored options named for V100.
TEST_F(Moments2_2D_1D, Sum_2D_1D_V100_autotuned_N_1024_K_36864) {
  const uint32_t rows = 1024;
  const uint32_t cols = 36864;
  Init(rows, cols);
  const auto& tunedOptions =
      tc::options_Sum_2D_1D_V100_autotuned_N_1024_K_36864;
  runSum_2D_1D(tunedOptions);
}
194+
195+
// Autotunes and benchmarks mean
196+
// Mean with the problem size taken from --N / --K, starting from the naive
// mapping options.
TEST_F(Moments2_2D_1D, Mean_2D_1D) {
  Init(FLAGS_N, FLAGS_K);
  const auto naiveOptions = tc::CudaMappingOptions::makeNaiveMappingOptions();
  runMean_2D_1D(naiveOptions);
}
200+
201+
// P100
202+
// Mean at N=128, K=2304 with the stored options named for P100.
TEST_F(Moments2_2D_1D, Mean_2D_1D_P100_autotuned_N_128_K_2304) {
  const uint32_t rows = 128;
  const uint32_t cols = 2304;
  Init(rows, cols);
  const auto& tunedOptions =
      tc::options_Mean_2D_1D_P100_autotuned_N_128_K_2304;
  runMean_2D_1D(tunedOptions);
}
206+
207+
// Mean at N=1024, K=36864 with the stored options named for P100.
TEST_F(Moments2_2D_1D, Mean_2D_1D_P100_autotuned_N_1024_K_36864) {
  const uint32_t rows = 1024;
  const uint32_t cols = 36864;
  Init(rows, cols);
  const auto& tunedOptions =
      tc::options_Mean_2D_1D_P100_autotuned_N_1024_K_36864;
  runMean_2D_1D(tunedOptions);
}
211+
212+
// V100
213+
// Mean at N=128, K=2304 with the stored options named for V100.
TEST_F(Moments2_2D_1D, Mean_2D_1D_V100_autotuned_N_128_K_2304) {
  const uint32_t rows = 128;
  const uint32_t cols = 2304;
  Init(rows, cols);
  const auto& tunedOptions =
      tc::options_Mean_2D_1D_V100_autotuned_N_128_K_2304;
  runMean_2D_1D(tunedOptions);
}
217+
218+
// Mean at N=1024, K=36864 with the stored options named for V100.
TEST_F(Moments2_2D_1D, Mean_2D_1D_V100_autotuned_N_1024_K_36864) {
  const uint32_t rows = 1024;
  const uint32_t cols = 36864;
  Init(rows, cols);
  const auto& tunedOptions =
      tc::options_Mean_2D_1D_V100_autotuned_N_1024_K_36864;
  runMean_2D_1D(tunedOptions);
}
222+
223+
// Autotunes and benchmarks sum_squares
224+
// Sum of squares with the problem size taken from --N / --K, starting from
// the naive mapping options.
TEST_F(Moments2_2D_1D, Sum_Squares_2D_1D) {
  Init(FLAGS_N, FLAGS_K);
  const auto naiveOptions = tc::CudaMappingOptions::makeNaiveMappingOptions();
  runSumSquares_2D_1D(naiveOptions);
}
228+
229+
// P100
230+
// Sum of squares at N=128, K=2304 with the stored options named for P100.
TEST_F(Moments2_2D_1D, Sum_Squares_2D_1D_P100_autotuned_N_128_K_2304) {
  const uint32_t rows = 128;
  const uint32_t cols = 2304;
  Init(rows, cols);
  const auto& tunedOptions =
      tc::options_Sum_Squares_2D_1D_P100_autotuned_N_128_K_2304;
  runSumSquares_2D_1D(tunedOptions);
}
235+
236+
// Sum of squares at N=1024, K=36864 with the stored options named for P100.
TEST_F(Moments2_2D_1D, Sum_Squares_2D_1D_P100_autotuned_N_1024_K_36864) {
  const uint32_t rows = 1024;
  const uint32_t cols = 36864;
  Init(rows, cols);
  const auto& tunedOptions =
      tc::options_Sum_Squares_2D_1D_P100_autotuned_N_1024_K_36864;
  runSumSquares_2D_1D(tunedOptions);
}
241+
242+
// V100
243+
// Sum of squares at N=128, K=2304 with the stored options named for V100.
TEST_F(Moments2_2D_1D, Sum_Squares_2D_1D_V100_autotuned_N_128_K_2304) {
  const uint32_t rows = 128;
  const uint32_t cols = 2304;
  Init(rows, cols);
  const auto& tunedOptions =
      tc::options_Sum_Squares_2D_1D_V100_autotuned_N_128_K_2304;
  runSumSquares_2D_1D(tunedOptions);
}
248+
249+
// Sum of squares at N=1024, K=36864 with the stored options named for V100.
TEST_F(Moments2_2D_1D, Sum_Squares_2D_1D_V100_autotuned_N_1024_K_36864) {
  const uint32_t rows = 1024;
  const uint32_t cols = 36864;
  Init(rows, cols);
  const auto& tunedOptions =
      tc::options_Sum_Squares_2D_1D_V100_autotuned_N_1024_K_36864;
  runSumSquares_2D_1D(tunedOptions);
}
254+
255+
// Autotunes and benchmarks var
256+
// Variance with the problem size taken from --N / --K, starting from the
// naive mapping options.
TEST_F(Moments2_2D_1D, Var_2D_1D) {
  Init(FLAGS_N, FLAGS_K);
  const auto naiveOptions = tc::CudaMappingOptions::makeNaiveMappingOptions();
  runVar_2D_1D(naiveOptions);
}
260+
261+
// P100
262+
// Variance at N=128, K=2304 with the stored options named for P100.
TEST_F(Moments2_2D_1D, Var_2D_1D_P100_autotuned_N_128_K_2304) {
  const uint32_t rows = 128;
  const uint32_t cols = 2304;
  Init(rows, cols);
  const auto& tunedOptions = tc::options_Var_2D_1D_P100_autotuned_N_128_K_2304;
  runVar_2D_1D(tunedOptions);
}
266+
267+
// Variance at N=1024, K=36864 with the stored options named for P100.
TEST_F(Moments2_2D_1D, Var_2D_1D_P100_autotuned_N_1024_K_36864) {
  const uint32_t rows = 1024;
  const uint32_t cols = 36864;
  Init(rows, cols);
  const auto& tunedOptions =
      tc::options_Var_2D_1D_P100_autotuned_N_1024_K_36864;
  runVar_2D_1D(tunedOptions);
}
271+
272+
// V100
273+
// Variance at N=128, K=2304 with the stored options named for V100.
TEST_F(Moments2_2D_1D, Var_2D_1D_V100_autotuned_N_128_K_2304) {
  const uint32_t rows = 128;
  const uint32_t cols = 2304;
  Init(rows, cols);
  const auto& tunedOptions = tc::options_Var_2D_1D_V100_autotuned_N_128_K_2304;
  runVar_2D_1D(tunedOptions);
}
277+
278+
// Variance at N=1024, K=36864 with the stored options named for V100.
TEST_F(Moments2_2D_1D, Var_2D_1D_V100_autotuned_N_1024_K_36864) {
  const uint32_t rows = 1024;
  const uint32_t cols = 36864;
  Init(rows, cols);
  const auto& tunedOptions =
      tc::options_Var_2D_1D_V100_autotuned_N_1024_K_36864;
  runVar_2D_1D(tunedOptions);
}
282+
283+
// Autotunes and benchmarks sum_and_squares
284+
// Fused sum and sum of squares with the problem size taken from --N / --K,
// starting from the naive mapping options.
TEST_F(Moments2_2D_1D, Sum_And_Squares_2D_1D) {
  Init(FLAGS_N, FLAGS_K);
  const auto naiveOptions = tc::CudaMappingOptions::makeNaiveMappingOptions();
  runSumAndSquares_2D_1D(naiveOptions);
}
288+
289+
// P100
290+
// Fused sum and sum of squares at N=128, K=2304 with the stored options
// named for P100.
TEST_F(Moments2_2D_1D, Sum_And_Squares_2D_1D_P100_autotuned_N_128_K_2304) {
  const uint32_t rows = 128;
  const uint32_t cols = 2304;
  Init(rows, cols);
  const auto& tunedOptions =
      tc::options_Sum_And_Squares_2D_1D_P100_autotuned_N_128_K_2304;
  runSumAndSquares_2D_1D(tunedOptions);
}
295+
296+
// Fused sum and sum of squares at N=1024, K=36864 with the stored options
// named for P100.
TEST_F(Moments2_2D_1D, Sum_And_Squares_2D_1D_P100_autotuned_N_1024_K_36864) {
  const uint32_t rows = 1024;
  const uint32_t cols = 36864;
  Init(rows, cols);
  const auto& tunedOptions =
      tc::options_Sum_And_Squares_2D_1D_P100_autotuned_N_1024_K_36864;
  runSumAndSquares_2D_1D(tunedOptions);
}
301+
302+
// V100
303+
// Fused sum and sum of squares at N=128, K=2304 with the stored options
// named for V100.
TEST_F(Moments2_2D_1D, Sum_And_Squares_2D_1D_V100_autotuned_N_128_K_2304) {
  const uint32_t rows = 128;
  const uint32_t cols = 2304;
  Init(rows, cols);
  const auto& tunedOptions =
      tc::options_Sum_And_Squares_2D_1D_V100_autotuned_N_128_K_2304;
  runSumAndSquares_2D_1D(tunedOptions);
}
308+
309+
// Fused sum and sum of squares at N=1024, K=36864 with the stored options
// named for V100.
TEST_F(Moments2_2D_1D, Sum_And_Squares_2D_1D_V100_autotuned_N_1024_K_36864) {
  const uint32_t rows = 1024;
  const uint32_t cols = 36864;
  Init(rows, cols);
  const auto& tunedOptions =
      tc::options_Sum_And_Squares_2D_1D_V100_autotuned_N_1024_K_36864;
  runSumAndSquares_2D_1D(tunedOptions);
}
314+
315+
// Benchmarks 2 moments (mean and var)
316+
// Mean and variance together, with the problem size taken from --N / --K,
// starting from the naive mapping options.
TEST_F(Moments2_2D_1D, Moments2_2D_1D) {
  Init(FLAGS_N, FLAGS_K);
  const auto naiveOptions = tc::CudaMappingOptions::makeNaiveMappingOptions();
  runMoments2_2D_1D(naiveOptions);
}
320+
321+
// P100
322+
// Mean and variance at N=128, K=2304 with the stored options named for P100.
TEST_F(Moments2_2D_1D, Moments2_2D_1D_P100_autotuned_N_128_K_2304) {
  const uint32_t rows = 128;
  const uint32_t cols = 2304;
  Init(rows, cols);
  const auto& tunedOptions =
      tc::options_Moments2_2D_1D_P100_autotuned_N_128_K_2304;
  runMoments2_2D_1D(tunedOptions);
}
326+
327+
// Mean and variance at N=1024, K=36864 with the stored options named for
// P100.
TEST_F(Moments2_2D_1D, Moments2_2D_1D_P100_autotuned_N_1024_K_36864) {
  const uint32_t rows = 1024;
  const uint32_t cols = 36864;
  Init(rows, cols);
  const auto& tunedOptions =
      tc::options_Moments2_2D_1D_P100_autotuned_N_1024_K_36864;
  runMoments2_2D_1D(tunedOptions);
}
331+
332+
// V100
333+
// Mean and variance at N=128, K=2304 with the stored options named for V100.
TEST_F(Moments2_2D_1D, Moments2_2D_1D_V100_autotuned_N_128_K_2304) {
  const uint32_t rows = 128;
  const uint32_t cols = 2304;
  Init(rows, cols);
  const auto& tunedOptions =
      tc::options_Moments2_2D_1D_V100_autotuned_N_128_K_2304;
  runMoments2_2D_1D(tunedOptions);
}
337+
338+
// Mean and variance at N=1024, K=36864 with the stored options named for
// V100.
TEST_F(Moments2_2D_1D, Moments2_2D_1D_V100_autotuned_N_1024_K_36864) {
  const uint32_t rows = 1024;
  const uint32_t cols = 36864;
  Init(rows, cols);
  const auto& tunedOptions =
      tc::options_Moments2_2D_1D_V100_autotuned_N_1024_K_36864;
  runMoments2_2D_1D(tunedOptions);
}
342+
343+
int main(int argc, char** argv) {
344+
::testing::InitGoogleTest(&argc, argv);
345+
::gflags::ParseCommandLineFlags(&argc, &argv, true);
346+
::google::InitGoogleLogging(argv[0]);
347+
tc::aten::setAtenSeed(tc::initRandomSeed(), at::Backend::CUDA);
348+
return RUN_ALL_TESTS();
349+
}

0 commit comments

Comments
 (0)