From 08779437194f025ab2d2b06931aec530bf69c9cb Mon Sep 17 00:00:00 2001 From: frsama <108132842+frsama@users.noreply.github.com> Date: Fri, 29 Jul 2022 21:12:41 +0800 Subject: [PATCH 1/2] Update main.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1.首先立即尝试将AOS修改为SOA 使用array替换了vector,效果拔群,立即就将1300ms左右的耗时降低到了130ms左右 2.尝试抠各种细节 (1)把内层循环中,乘除法为常量的部分拉到外层循环,只需要一次计算即可。 (2)把循环拆开,主要是指计算energy那里,energy自增和自减两部分完全不相关啊,我就拆成了两个循环。因为循环越简单,编译器越容易优化。 但是抠这些细节似乎没什么效果,好像之前光把AOS改为SOA就已经成功矢量化优化了。 我还到处添加了#pragma unroll,压根没用,去网上搜,结果人家说开-O1 -O2 -O3,这条编译指示就会失效,好像是因为开了优化本身就会尝试unroll。 3.对初始化下手了 我看汇编代码,发现轮流给px[i] py[i] pz[i] vx[i] vy[i] vz[i] mass[i]赋值的方法并没有发生矢量化优化, 结果特地去写了先全部赋值px,再全部赋值py,再全部赋值pz的方法。为了处理mass和其它数组不同的情况,还特地写了模板。 结果发现成功地实现了矢量化,而且还发现模板函数以内联的方式插进去了,并没有想象中的jump或者是call。 优化是真的优化了,但是没用也是真的一点用也没有。 初始化时间复杂度O(n),完全比不上step和calc的O(n^2)时间复杂度。 结果我一顿操作猛如虎,耗时压根没变多少,就从原来的130ms出头,挤进130ms以内而已,甚至可以说只是误差....... --- main.cpp | 112 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 77 insertions(+), 35 deletions(-) diff --git a/main.cpp b/main.cpp index cf6369b..e4cc091 100644 --- a/main.cpp +++ b/main.cpp @@ -3,64 +3,106 @@ #include #include #include +#include + +#define N 48 float frand() { return (float)rand() / RAND_MAX * 2 - 1; } -struct Star { - float px, py, pz; - float vx, vy, vz; - float mass; +struct alignas(16) Star { + std::array px, py, pz; + std::array vx, vy, vz; + std::array mass; }; -std::vector stars; +Star stars; -void init() { - for (int i = 0; i < 48; i++) { - stars.push_back({ - frand(), frand(), frand(), - frand(), frand(), frand(), - frand() + 1, - }); +// 原计划的初始化,但是完全没有矢量化 +// void init() { +// for (size_t i = 0; i < N; ++i) { +// stars.px[i] = frand(); +// stars.py[i] = frand(); +// stars.pz[i] = frand(); +// stars.vx[i] = frand(); +// stars.vy[i] = frand(); +// stars.vz[i] = frand(); +// stars.mass[i] = frand() + 1; +// } +// } + +template +void sub_init(std::array &arr) { + #pragma GCC unroll 4 + for (size_t i = 0; i 
< N; ++i) { + arr[i] = frand() + M; } } +void init() { + sub_init<0>(stars.px); + sub_init<0>(stars.py); + sub_init<0>(stars.pz); + sub_init<0>(stars.vx); + sub_init<0>(stars.vy); + sub_init<0>(stars.vz); + sub_init<1>(stars.mass); +} + + float G = 0.001; float eps = 0.001; float dt = 0.01; +float eps2 = eps * eps; +float Gdt = G * dt; + void step() { - for (auto &star: stars) { - for (auto &other: stars) { - float dx = other.px - star.px; - float dy = other.py - star.py; - float dz = other.pz - star.pz; - float d2 = dx * dx + dy * dy + dz * dz + eps * eps; - d2 *= sqrt(d2); - star.vx += dx * other.mass * G * dt / d2; - star.vy += dy * other.mass * G * dt / d2; - star.vz += dz * other.mass * G * dt / d2; + for (size_t i = 0; i < N; ++i) { + float dvx = 0, dvy = 0, dvz = 0; + #pragma GCC unroll 4 + for (size_t j = 0; j < N; ++j) { + float dx = stars.px[j] - stars.px[i]; + float dy = stars.py[j] - stars.py[i]; + float dz = stars.pz[j] - stars.pz[i]; + float d2 = dx * dx + dy * dy + dz * dz + eps2; + d2 *= std::sqrt(d2); + + float tmp = stars.mass[j] * Gdt / d2; + dvx += dx * tmp; + dvy += dy * tmp; + dvz += dz * tmp; } + stars.vx[i] += dvx; + stars.vy[i] += dvy; + stars.vz[i] += dvz; } - for (auto &star: stars) { - star.px += star.vx * dt; - star.py += star.vy * dt; - star.pz += star.vz * dt; + #pragma GCC unroll 4 + for (size_t i = 0; i < N; ++i) { + stars.px[i] += stars.vx[i] * dt; + stars.py[i] += stars.vy[i] * dt; + stars.pz[i] += stars.vz[i] * dt; } } float calc() { float energy = 0; - for (auto &star: stars) { - float v2 = star.vx * star.vx + star.vy * star.vy + star.vz * star.vz; - energy += star.mass * v2 / 2; - for (auto &other: stars) { - float dx = other.px - star.px; - float dy = other.py - star.py; - float dz = other.pz - star.pz; - float d2 = dx * dx + dy * dy + dz * dz + eps * eps; - energy -= other.mass * star.mass * G / sqrt(d2) / 2; + #pragma GCC unroll 4 + for (size_t i = 0; i < N; ++i) { + float v2 = stars.vx[i] * stars.vx[i] + stars.vy[i] * 
stars.vy[i] + stars.vz[i] * stars.vz[i]; + energy += stars.mass[i] * v2 / 2; + } + for (size_t i = 0; i < N; ++i) { + float tmp = stars.mass[i] * G / 2; + #pragma GCC unroll 4 + for (size_t j = 0; j < N; ++j) { + float dx = stars.px[j] - stars.px[i]; + float dy = stars.py[j] - stars.py[i]; + float dz = stars.pz[j] - stars.pz[i]; + float d2 = dx * dx + dy * dy + dz * dz + eps2; + float reverse_sqrt_d2 = 1 / std::sqrt(d2); + energy -= stars.mass[j] * tmp *reverse_sqrt_d2; } } return energy; From 5da59354d5e3131c45a98f3ac9b4f648d9a56de7 Mon Sep 17 00:00:00 2001 From: frsama <108132842+frsama@users.noreply.github.com> Date: Fri, 29 Jul 2022 21:13:54 +0800 Subject: [PATCH 2/2] Update CMakeLists.txt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 添加了-ffast-math -march=native优化选项 --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 29b152c..b4eb6e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,3 +7,5 @@ if (NOT CMAKE_BUILD_TYPE) endif() add_executable(main main.cpp) + +target_compile_options(main PUBLIC -ffast-math -march=native)