|
| 1 | +/** |
| 2 | + * Copyright (c) 2017-present, Facebook, Inc. |
| 3 | + * |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | + * you may not use this file except in compliance with the License. |
| 6 | + * You may obtain a copy of the License at |
| 7 | + * |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | + * |
| 10 | + * Unless required by applicable law or agreed to in writing, software |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | + * See the License for the specific language governing permissions and |
| 14 | + * limitations under the License. |
| 15 | + */ |
| 16 | +#include "moments.h" |
| 17 | + |
| 18 | +#include <iostream> |
| 19 | +#include <string> |
| 20 | +#include <vector> |
| 21 | + |
| 22 | +#include <gflags/gflags.h> |
| 23 | +#include <glog/logging.h> |
| 24 | +#include <gtest/gtest.h> |
| 25 | + |
| 26 | +#include "tc/aten/aten.h" |
| 27 | + |
| 28 | +#include "tc/aten/aten_compiler.h" |
| 29 | +#include "tc/core/cuda/cuda_mapping_options.h" |
| 30 | + |
| 31 | +#include "../test/caffe2/cuda/test_harness.h" |
| 32 | +#include "../test/caffe2/test_harness.h" |
| 33 | +#include "../test/test_harness_aten_cuda.h" |
| 34 | +#include "benchmark_fixture.h" |
| 35 | + |
| 36 | +#include "tc/c2/context.h" |
| 37 | +#include "tc/core/cuda/cuda.h" |
| 38 | +#include "tc/core/flags.h" |
| 39 | + |
| 40 | +using namespace caffe2; |
| 41 | + |
| 42 | +DEFINE_uint32(N, 1024, "N batch size (32 * 32 from group_norm equivalent)"); |
| 43 | +DEFINE_uint32(K, 36864, "K row size (16 * 48 * 48 from group_norm equivalent)"); |
| 44 | + |
| 45 | +class Moments2_2D_1D : public Benchmark { |
| 46 | + protected: |
| 47 | + uint32_t N, K; |
| 48 | + at::Tensor I, sum, mean, sumSquares, var; |
| 49 | + |
| 50 | + public: |
| 51 | + void Init(uint32_t n, uint32_t k) { |
| 52 | + N = n; |
| 53 | + K = k; |
| 54 | + I = at::CUDA(at::kFloat).rand({N, K}).uniform_(0.0f, 1.0f); |
| 55 | + at::Tensor v = I.view({N, -1}); |
| 56 | + sum = v.sum(1); |
| 57 | + mean = v.mean(-1, true).view({N}); |
| 58 | + sumSquares = v.pow(2.0f).sum(1); |
| 59 | + var = v.var(-1, true).view({N}); |
| 60 | + } |
| 61 | + void runSum_2D_1D(const tc::CudaMappingOptions& options); |
| 62 | + void runMean_2D_1D(const tc::CudaMappingOptions& options); |
| 63 | + void runSumSquares_2D_1D(const tc::CudaMappingOptions& options); |
| 64 | + void runVar_2D_1D(const tc::CudaMappingOptions& options); |
| 65 | + void runSumAndSquares_2D_1D(const tc::CudaMappingOptions& options); |
| 66 | + void runMoments2_2D_1D(const tc::CudaMappingOptions& options); |
| 67 | + |
| 68 | + private: |
| 69 | + void autotuneAndCheck( |
| 70 | + const std::string& entryPoint, |
| 71 | + const std::vector<at::Tensor>& inputs, |
| 72 | + const tc::CudaMappingOptions& options, |
| 73 | + std::function<bool( |
| 74 | + const std::vector<at::Tensor>& inputs, |
| 75 | + const std::vector<at::Tensor>& outputs)> checkFun); |
| 76 | +}; |
| 77 | + |
| 78 | +void Moments2_2D_1D::autotuneAndCheck( |
| 79 | + const std::string& entryPoint, |
| 80 | + const std::vector<at::Tensor>& inputs, |
| 81 | + const tc::CudaMappingOptions& options, |
| 82 | + std::function<bool( |
| 83 | + const std::vector<at::Tensor>& inputs, |
| 84 | + const std::vector<at::Tensor>& outputs)> checkFun) { |
| 85 | + std::string suffix = std::string("_N_") + std::to_string(N) + |
| 86 | + std::string("_K_") + std::to_string(K); |
| 87 | + std::vector<tc::CudaMappingOptions> bestOptions{options}; |
| 88 | + if (FLAGS_autotune) { |
| 89 | + bestOptions = autotune( |
| 90 | + FLAGS_save_tuner_proto_prefix + std::string("/moments_cache") + suffix, |
| 91 | + FLAGS_save_tuner_proto_prefix + std::string("/moments_best") + suffix, |
| 92 | + tc::TC_Moments, |
| 93 | + entryPoint, |
| 94 | + inputs, |
| 95 | + options); |
| 96 | + CHECK_GE(bestOptions.size(), 1u); |
| 97 | + } |
| 98 | + Check(tc::TC_Moments, entryPoint, bestOptions[0], inputs, checkFun); |
| 99 | +} |
| 100 | + |
| 101 | +void Moments2_2D_1D::runSum_2D_1D(const tc::CudaMappingOptions& options) { |
| 102 | + std::vector<at::Tensor> inputs{I}; |
| 103 | + auto check_fun = [&](const std::vector<at::Tensor>& inputs, |
| 104 | + const std::vector<at::Tensor>& outputs) { |
| 105 | + checkRtol(outputs[0] - sum, inputs, K, 1e-5); |
| 106 | + return true; |
| 107 | + }; |
| 108 | + autotuneAndCheck(tc::TC_Sum_2D_1D_NAME, inputs, options, check_fun); |
| 109 | +} |
| 110 | + |
| 111 | +void Moments2_2D_1D::runMean_2D_1D(const tc::CudaMappingOptions& options) { |
| 112 | + std::vector<at::Tensor> inputs{I}; |
| 113 | + auto check_fun = [&](const std::vector<at::Tensor>& inputs, |
| 114 | + const std::vector<at::Tensor>& outputs) { |
| 115 | + checkRtol(outputs[0] - mean, inputs, K, 1e-5); |
| 116 | + return true; |
| 117 | + }; |
| 118 | + autotuneAndCheck(tc::TC_Mean_2D_1D_NAME, inputs, options, check_fun); |
| 119 | +} |
| 120 | + |
| 121 | +void Moments2_2D_1D::runSumSquares_2D_1D( |
| 122 | + const tc::CudaMappingOptions& options) { |
| 123 | + std::vector<at::Tensor> inputs{I}; |
| 124 | + auto check_fun = [&](const std::vector<at::Tensor>& inputs, |
| 125 | + const std::vector<at::Tensor>& outputs) { |
| 126 | + checkRtol(outputs[0] - sumSquares, inputs, 2 * K, 1e5); |
| 127 | + return true; |
| 128 | + }; |
| 129 | + autotuneAndCheck(tc::TC_Sum_Squares_2D_1D_NAME, inputs, options, check_fun); |
| 130 | +} |
| 131 | + |
| 132 | +void Moments2_2D_1D::runVar_2D_1D(const tc::CudaMappingOptions& options) { |
| 133 | + std::vector<at::Tensor> inputs{I, mean}; |
| 134 | + auto check_fun = [&](const std::vector<at::Tensor>& inputs, |
| 135 | + const std::vector<at::Tensor>& outputs) { |
| 136 | + checkRtol(outputs[0] - var, inputs, K, 1e-5); |
| 137 | + return true; |
| 138 | + }; |
| 139 | + autotuneAndCheck(tc::TC_Var_2D_1D_NAME, inputs, options, check_fun); |
| 140 | +} |
| 141 | + |
| 142 | +void Moments2_2D_1D::runSumAndSquares_2D_1D( |
| 143 | + const tc::CudaMappingOptions& options) { |
| 144 | + std::vector<at::Tensor> inputs{I}; |
| 145 | + auto check_fun = [&](const std::vector<at::Tensor>& inputs, |
| 146 | + const std::vector<at::Tensor>& outputs) { |
| 147 | + checkRtol(outputs[0] - sum, inputs, 2 * K, 1e-5); |
| 148 | + checkRtol(outputs[1] - sumSquares, inputs, 2 * K, 1e-5); |
| 149 | + return true; |
| 150 | + }; |
| 151 | + autotuneAndCheck( |
| 152 | + tc::TC_Sum_And_Squares_2D_1D_NAME, inputs, options, check_fun); |
| 153 | +} |
| 154 | + |
| 155 | +void Moments2_2D_1D::runMoments2_2D_1D(const tc::CudaMappingOptions& options) { |
| 156 | + std::vector<at::Tensor> inputs{I}; |
| 157 | + auto check_fun = [&](const std::vector<at::Tensor>& inputs, |
| 158 | + const std::vector<at::Tensor>& outputs) { |
| 159 | + checkRtol(outputs[0] - mean, inputs, K, 1e-5); |
| 160 | + checkRtol(outputs[1] - var, inputs, 2 * K, 1e-5); |
| 161 | + return true; |
| 162 | + }; |
| 163 | + autotuneAndCheck(tc::TC_Moments2_2D_1D_NAME, inputs, options, check_fun); |
| 164 | +} |
| 165 | + |
| 166 | +/// Sum |
| 167 | +// Generic |
| 168 | +TEST_F(Moments2_2D_1D, Sum_2D_1D) { |
| 169 | + Init(FLAGS_N, FLAGS_K); |
| 170 | + runSum_2D_1D(tc::CudaMappingOptions::makeNaiveMappingOptions()); |
| 171 | +} |
| 172 | + |
| 173 | +// P100 |
| 174 | +TEST_F(Moments2_2D_1D, Sum_2D_1D_P100_autotuned_N_128_K_2304) { |
| 175 | + Init(128, 2304); |
| 176 | + runSum_2D_1D(tc::options_Sum_2D_1D_P100_autotuned_N_128_K_2304); |
| 177 | +} |
| 178 | + |
| 179 | +TEST_F(Moments2_2D_1D, Sum_2D_1D_P100_autotuned_N_1024_K_36864) { |
| 180 | + Init(1024, 36864); |
| 181 | + runSum_2D_1D(tc::options_Sum_2D_1D_P100_autotuned_N_1024_K_36864); |
| 182 | +} |
| 183 | + |
| 184 | +// V100 |
| 185 | +TEST_F(Moments2_2D_1D, Sum_2D_1D_V100_autotuned_N_128_K_2304) { |
| 186 | + Init(128, 2304); |
| 187 | + runSum_2D_1D(tc::options_Sum_2D_1D_V100_autotuned_N_128_K_2304); |
| 188 | +} |
| 189 | + |
| 190 | +TEST_F(Moments2_2D_1D, Sum_2D_1D_V100_autotuned_N_1024_K_36864) { |
| 191 | + Init(1024, 36864); |
| 192 | + runSum_2D_1D(tc::options_Sum_2D_1D_V100_autotuned_N_1024_K_36864); |
| 193 | +} |
| 194 | + |
| 195 | +// Autotunes and benchmarks mean |
| 196 | +TEST_F(Moments2_2D_1D, Mean_2D_1D) { |
| 197 | + Init(FLAGS_N, FLAGS_K); |
| 198 | + runMean_2D_1D(tc::CudaMappingOptions::makeNaiveMappingOptions()); |
| 199 | +} |
| 200 | + |
| 201 | +// P100 |
| 202 | +TEST_F(Moments2_2D_1D, Mean_2D_1D_P100_autotuned_N_128_K_2304) { |
| 203 | + Init(128, 2304); |
| 204 | + runMean_2D_1D(tc::options_Mean_2D_1D_P100_autotuned_N_128_K_2304); |
| 205 | +} |
| 206 | + |
| 207 | +TEST_F(Moments2_2D_1D, Mean_2D_1D_P100_autotuned_N_1024_K_36864) { |
| 208 | + Init(1024, 36864); |
| 209 | + runMean_2D_1D(tc::options_Mean_2D_1D_P100_autotuned_N_1024_K_36864); |
| 210 | +} |
| 211 | + |
| 212 | +// V100 |
| 213 | +TEST_F(Moments2_2D_1D, Mean_2D_1D_V100_autotuned_N_128_K_2304) { |
| 214 | + Init(128, 2304); |
| 215 | + runMean_2D_1D(tc::options_Mean_2D_1D_V100_autotuned_N_128_K_2304); |
| 216 | +} |
| 217 | + |
| 218 | +TEST_F(Moments2_2D_1D, Mean_2D_1D_V100_autotuned_N_1024_K_36864) { |
| 219 | + Init(1024, 36864); |
| 220 | + runMean_2D_1D(tc::options_Mean_2D_1D_V100_autotuned_N_1024_K_36864); |
| 221 | +} |
| 222 | + |
| 223 | +// Autotunes and benchmarks sum_squares |
| 224 | +TEST_F(Moments2_2D_1D, Sum_Squares_2D_1D) { |
| 225 | + Init(FLAGS_N, FLAGS_K); |
| 226 | + runSumSquares_2D_1D(tc::CudaMappingOptions::makeNaiveMappingOptions()); |
| 227 | +} |
| 228 | + |
| 229 | +// P100 |
| 230 | +TEST_F(Moments2_2D_1D, Sum_Squares_2D_1D_P100_autotuned_N_128_K_2304) { |
| 231 | + Init(128, 2304); |
| 232 | + runSumSquares_2D_1D( |
| 233 | + tc::options_Sum_Squares_2D_1D_P100_autotuned_N_128_K_2304); |
| 234 | +} |
| 235 | + |
| 236 | +TEST_F(Moments2_2D_1D, Sum_Squares_2D_1D_P100_autotuned_N_1024_K_36864) { |
| 237 | + Init(1024, 36864); |
| 238 | + runSumSquares_2D_1D( |
| 239 | + tc::options_Sum_Squares_2D_1D_P100_autotuned_N_1024_K_36864); |
| 240 | +} |
| 241 | + |
| 242 | +// V100 |
| 243 | +TEST_F(Moments2_2D_1D, Sum_Squares_2D_1D_V100_autotuned_N_128_K_2304) { |
| 244 | + Init(128, 2304); |
| 245 | + runSumSquares_2D_1D( |
| 246 | + tc::options_Sum_Squares_2D_1D_V100_autotuned_N_128_K_2304); |
| 247 | +} |
| 248 | + |
| 249 | +TEST_F(Moments2_2D_1D, Sum_Squares_2D_1D_V100_autotuned_N_1024_K_36864) { |
| 250 | + Init(1024, 36864); |
| 251 | + runSumSquares_2D_1D( |
| 252 | + tc::options_Sum_Squares_2D_1D_V100_autotuned_N_1024_K_36864); |
| 253 | +} |
| 254 | + |
| 255 | +// Autotunes and benchmarks var |
| 256 | +TEST_F(Moments2_2D_1D, Var_2D_1D) { |
| 257 | + Init(FLAGS_N, FLAGS_K); |
| 258 | + runVar_2D_1D(tc::CudaMappingOptions::makeNaiveMappingOptions()); |
| 259 | +} |
| 260 | + |
| 261 | +// P100 |
| 262 | +TEST_F(Moments2_2D_1D, Var_2D_1D_P100_autotuned_N_128_K_2304) { |
| 263 | + Init(128, 2304); |
| 264 | + runVar_2D_1D(tc::options_Var_2D_1D_P100_autotuned_N_128_K_2304); |
| 265 | +} |
| 266 | + |
| 267 | +TEST_F(Moments2_2D_1D, Var_2D_1D_P100_autotuned_N_1024_K_36864) { |
| 268 | + Init(1024, 36864); |
| 269 | + runVar_2D_1D(tc::options_Var_2D_1D_P100_autotuned_N_1024_K_36864); |
| 270 | +} |
| 271 | + |
| 272 | +// V100 |
| 273 | +TEST_F(Moments2_2D_1D, Var_2D_1D_V100_autotuned_N_128_K_2304) { |
| 274 | + Init(128, 2304); |
| 275 | + runVar_2D_1D(tc::options_Var_2D_1D_V100_autotuned_N_128_K_2304); |
| 276 | +} |
| 277 | + |
| 278 | +TEST_F(Moments2_2D_1D, Var_2D_1D_V100_autotuned_N_1024_K_36864) { |
| 279 | + Init(1024, 36864); |
| 280 | + runVar_2D_1D(tc::options_Var_2D_1D_V100_autotuned_N_1024_K_36864); |
| 281 | +} |
| 282 | + |
| 283 | +// Autotunes and benchmarks sum_and_squares |
| 284 | +TEST_F(Moments2_2D_1D, Sum_And_Squares_2D_1D) { |
| 285 | + Init(FLAGS_N, FLAGS_K); |
| 286 | + runSumAndSquares_2D_1D(tc::CudaMappingOptions::makeNaiveMappingOptions()); |
| 287 | +} |
| 288 | + |
| 289 | +// P100 |
| 290 | +TEST_F(Moments2_2D_1D, Sum_And_Squares_2D_1D_P100_autotuned_N_128_K_2304) { |
| 291 | + Init(128, 2304); |
| 292 | + runSumAndSquares_2D_1D( |
| 293 | + tc::options_Sum_And_Squares_2D_1D_P100_autotuned_N_128_K_2304); |
| 294 | +} |
| 295 | + |
| 296 | +TEST_F(Moments2_2D_1D, Sum_And_Squares_2D_1D_P100_autotuned_N_1024_K_36864) { |
| 297 | + Init(1024, 36864); |
| 298 | + runSumAndSquares_2D_1D( |
| 299 | + tc::options_Sum_And_Squares_2D_1D_P100_autotuned_N_1024_K_36864); |
| 300 | +} |
| 301 | + |
| 302 | +// V100 |
| 303 | +TEST_F(Moments2_2D_1D, Sum_And_Squares_2D_1D_V100_autotuned_N_128_K_2304) { |
| 304 | + Init(128, 2304); |
| 305 | + runSumAndSquares_2D_1D( |
| 306 | + tc::options_Sum_And_Squares_2D_1D_V100_autotuned_N_128_K_2304); |
| 307 | +} |
| 308 | + |
| 309 | +TEST_F(Moments2_2D_1D, Sum_And_Squares_2D_1D_V100_autotuned_N_1024_K_36864) { |
| 310 | + Init(1024, 36864); |
| 311 | + runSumAndSquares_2D_1D( |
| 312 | + tc::options_Sum_And_Squares_2D_1D_V100_autotuned_N_1024_K_36864); |
| 313 | +} |
| 314 | + |
| 315 | +// Benchmarks 2 moments (mean and var) |
| 316 | +TEST_F(Moments2_2D_1D, Moments2_2D_1D) { |
| 317 | + Init(FLAGS_N, FLAGS_K); |
| 318 | + runMoments2_2D_1D(tc::CudaMappingOptions::makeNaiveMappingOptions()); |
| 319 | +} |
| 320 | + |
| 321 | +// P100 |
| 322 | +TEST_F(Moments2_2D_1D, Moments2_2D_1D_P100_autotuned_N_128_K_2304) { |
| 323 | + Init(128, 2304); |
| 324 | + runMoments2_2D_1D(tc::options_Moments2_2D_1D_P100_autotuned_N_128_K_2304); |
| 325 | +} |
| 326 | + |
| 327 | +TEST_F(Moments2_2D_1D, Moments2_2D_1D_P100_autotuned_N_1024_K_36864) { |
| 328 | + Init(1024, 36864); |
| 329 | + runMoments2_2D_1D(tc::options_Moments2_2D_1D_P100_autotuned_N_1024_K_36864); |
| 330 | +} |
| 331 | + |
| 332 | +// V100 |
| 333 | +TEST_F(Moments2_2D_1D, Moments2_2D_1D_V100_autotuned_N_128_K_2304) { |
| 334 | + Init(128, 2304); |
| 335 | + runMoments2_2D_1D(tc::options_Moments2_2D_1D_V100_autotuned_N_128_K_2304); |
| 336 | +} |
| 337 | + |
| 338 | +TEST_F(Moments2_2D_1D, Moments2_2D_1D_V100_autotuned_N_1024_K_36864) { |
| 339 | + Init(1024, 36864); |
| 340 | + runMoments2_2D_1D(tc::options_Moments2_2D_1D_V100_autotuned_N_1024_K_36864); |
| 341 | +} |
| 342 | + |
| 343 | +int main(int argc, char** argv) { |
| 344 | + ::testing::InitGoogleTest(&argc, argv); |
| 345 | + ::gflags::ParseCommandLineFlags(&argc, &argv, true); |
| 346 | + ::google::InitGoogleLogging(argv[0]); |
| 347 | + tc::aten::setAtenSeed(tc::initRandomSeed(), at::Backend::CUDA); |
| 348 | + return RUN_ALL_TESTS(); |
| 349 | +} |
0 commit comments