diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H index 81a87a56423..7ccd3ec30c2 100644 --- a/Src/Base/AMReX_GpuLaunch.H +++ b/Src/Base/AMReX_GpuLaunch.H @@ -298,9 +298,14 @@ void ParallelForOMP (Box const& box, L const& f) noexcept #pragma omp parallel for collapse(2) for (int k = lo.z; k <= hi.z; ++k) { for (int j = lo.y; j <= hi.y; ++j) { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - f(i,j,k); + constexpr int WIDTH = amrex::simd::native_simd_size_real; + int i = lo.x; + for (; i + WIDTH <= hi.x; i+=WIDTH) { + f(SIMDindex{i}, j, k); + } + for (; i <= hi.x; ++i) { + // TODO: template, etc. + f(SIMDindex<1, int>{i}, j, k); } } } diff --git a/Src/Base/AMReX_GpuLaunchFunctsC.H b/Src/Base/AMReX_GpuLaunchFunctsC.H index 38ea1db0243..ca8df366146 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsC.H +++ b/Src/Base/AMReX_GpuLaunchFunctsC.H @@ -2,6 +2,8 @@ #define AMREX_GPU_LAUNCH_FUNCTS_C_H_ #include +#include + namespace amrex { /** Helper type to store/access the SIMD width in ParallelForSIMD lambdas @@ -12,7 +14,7 @@ namespace amrex { * @tparam WIDTH SIMD width in elements * @tparam N index type (integer) */ -template +template struct SIMDindex { /** SIMD width in elements */ diff --git a/Src/Base/AMReX_SIMD.H b/Src/Base/AMReX_SIMD.H index 13676b1ba7a..cec70b75615 100644 --- a/Src/Base/AMReX_SIMD.H +++ b/Src/Base/AMReX_SIMD.H @@ -10,6 +10,7 @@ # include // includes SIMD TS2 header # if __cplusplus >= 202002L # include +# include # endif #endif @@ -26,6 +27,7 @@ namespace amrex::simd using namespace vir::stdx; # if __cplusplus >= 202002L using vir::cvt; + using vir::iota_v; # endif #else // fallback implementations for functions that are commonly used in portable code paths diff --git a/Src/FFT/AMReX_FFT_OpenBCSolver.H b/Src/FFT/AMReX_FFT_OpenBCSolver.H index e54aedb0a8e..ade9a2aaf4c 100644 --- a/Src/FFT/AMReX_FFT_OpenBCSolver.H +++ b/Src/FFT/AMReX_FFT_OpenBCSolver.H @@ -97,17 +97,23 @@ void OpenBCSolver::setGreensFunction (F const& greens_function) } AMREX_ASSERT(nimages[0] == 2); box.shift(-lo); - amrex::ParallelForOMP(box, [=] AMREX_GPU_DEVICE (int i, int j, int k) + amrex::ParallelForOMP(box, [=] AMREX_GPU_DEVICE (amrex::SIMDindex i, int j, int k) { - T G; - if (i == len[0] || j == len[1] || k == len[2]) { - G = 0; - } else { - auto ii = i; + using SIMD_T = simd::stdx::fixed_size_simd; + using SIMD_int = simd::stdx::fixed_size_simd; // simd::stdx::rebind_simd_t; + + SIMD_T G = 0; + if (j != len[1] && k != len[2]) + { + SIMD_int ii = simd::stdx::iota_v + i.index; auto jj = (j > len[1]) ? 2*len[1]-j : j; auto kk = (k > len[2]) ? 2*len[2]-k : k; - G = greens_function(ii+lo3.x,jj+lo3.y,kk+lo3.z); + + auto const i_bound = ii == len[0]; + simd::stdx::where(simd::stdx::cvt(i_bound), G) = 0.0; + simd::stdx::where(simd::stdx::cvt(!i_bound), G) = greens_function.template operator()(ii+lo3.x,jj+lo3.y,kk+lo3.z); } + for (int koff = 0; koff < nimages[2]; ++koff) { int k2 = (koff == 0) ? k : 2*len[2]-k; if ((k2 == 2*len[2]) || (koff == 1 && k == len[2])) { @@ -119,11 +125,14 @@ void OpenBCSolver::setGreensFunction (F const& greens_function) continue; } for (int ioff = 0; ioff < nimages[0]; ++ioff) { - int i2 = (ioff == 0) ? i : 2*len[0]-i; - if ((i2 == 2*len[0]) || (ioff == 1 && i == len[0])) { - continue; + for (int iw = i.index; iw < i.index+i.width; ++iw) { + int i2 = (ioff == 0) ? iw : 2*len[0]-iw; + if ((i2 == 2*len[0]) || (ioff == 1 && iw == len[0])) { + continue; + } + // TODO: SIMD-assign N values + a(i2+lo3.x,j2+lo3.y,k2+lo3.z) = G[iw]; } - a(i2+lo3.x,j2+lo3.y,k2+lo3.z) = G; } } }