Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
226 changes: 226 additions & 0 deletions nbody/nbody_unsafe_simd.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
# Based on https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/nbody-rust-7.html
#
# The basic strategy matches Rust #7, based on gcc #4: use vectorized rsqrt
# to compute pairwise distances.
#
# We deviate by also skipping a single Newton step

module NBody

using StaticArrays, SIMD, Printf
using Base: llvmcall

const solar_mass = 4π^2
const days_per_year = 365.24
const NBODIES = 5
const NPAIRS = Int(NBODIES * (NBODIES - 1) / 2)
const PAIRS = Tuple((i,j) for i = 1:5, j = 1:5 if j > i)

struct Bodies
x::MMatrix{NBODIES, 3, Float64}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is cheating imo. Are other languages really encoding the number of bodies as a compile time constant?

Copy link
Author

@smallnamespace smallnamespace Sep 6, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Very much so, e.g.:

I don't have a strong opinion about this, although it seems inevitable that benchmark implementations slowly slide towards more compile-time computation in the absence of clear, explicit guidelines. It's also quite arguable whether hand-vectorizing everything shows much, other than how easily a language lets you call intrinsics.

Julia's well-equipped to play this game though, especially through the macro facilities. Up to you whether you think it's a net benefit to the community to showcase this.

v::MMatrix{NBODIES, 3, Float64}
m::NTuple{NBODIES, Float64}
end

function init_bodies!(bodies)
x, v = bodies.x, bodies.v
# Sun
x[1, :] = [0, 0, 0]
v[1, :] = [0, 0, 0]

# Jupiter
x[2, :] = [
4.84143144246472090e+00,
-1.16032004402742839e+00,
-1.03622044471123109e-01,
]
v[2, :] = [
1.66007664274403694e-03,
7.69901118419740425e-03,
-6.90460016972063023e-05,
] .* days_per_year

# Saturn
x[3, :] = [
8.34336671824457987e+00,
4.12479856412430479e+00,
-4.03523417114321381e-01,
]
v[3, :] = [
-2.76742510726862411e-03,
4.99852801234917238e-03,
2.30417297573763929e-05,
] .* days_per_year

# Uranus
x[4, :] = [
1.28943695621391310e+01,
-1.51111514016986312e+01,
-2.23307578892655734e-01,
]
v[4, :] = [
2.96460137564761618e-03,
2.37847173959480950e-03,
-2.96589568540237556e-05,
] .* days_per_year

# Neptune
x[5, :] = [
1.53796971148509165e+01,
-2.59193146099879641e+01,
1.79258772950371181e-01,
]
v[5, :] = [
2.68067772490389322e-03,
1.62824170038242295e-03,
-9.51592254519715870e-05,
] * days_per_year
end

const __m128 = NTuple{4, VecElement{Float32}}
const __m128d = NTuple{2, VecElement{Float64}}
const v2d = Vec{2, Float64}

@inline function rsqrt_pd(v2::v2d)
v2d(rsqrt_ccall(v2.elts))
end

@inline function rsqrt_pd_newton(v2::v2d)
guess = rsqrt_pd(v2)
# We only need one Newton step to achieve desired accuracy
guess = guess * 1.5 - ((0.5 * v2) * guess) * (guess * guess)
guess
end

rsqrt(f::__m128) = ccall("llvm.x86.sse.rsqrt.ps", llvmcall, __m128, (__m128, ), f);
_mm_cvtpd_ps(f::__m128d) = ccall("llvm.x86.sse2.cvtpd2ps", llvmcall, __m128, (__m128d, ), f);
_mm_cvtps_pd(f::__m128) = llvmcall(("", "
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How difficult would it be to contribute these to SIMD.jl instead of including them inline here? It seems like rsqrt at least should be relatively trivial.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@non-Jedi I think it's doable, but probably beyond my current Julia abilities.

I did find that LLVM will only emit the rsqrt assembly instructions for single and 4x floats (under -ffast-math and SSE3, I believe), see https://godbolt.org/z/v5DF_s . It does a single Newton-Raphson step with FMA.

The tricky bit here is that we would need to explicitly llvmcall the double to float conversions and word packing (for 2x case), as LLVM doesn't help us out here.

Also, I think SIMD.jl covers a subset of Julia math functions - would it be a bit strange to add a SIMD primitive that isn't in Base?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess rsqrt is only simple if you define it for only Vec{4,Float32}, which the current architecture of SIMD.jl doesn't really allow. To me the functionality still belongs in there even if it's not a Base Julia function. I'll open an issue on the repo and see what the maintainer thinks.

%2 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 0, i32 1>
%3 = fpext <2 x float> %2 to <2 x double>
ret <2 x double> %3"),
__m128d,
Tuple{__m128}, f)
@inline rsqrt_ccall(f::__m128d) = _mm_cvtps_pd(rsqrt(_mm_cvtpd_ps(f)))

@inline function advance(#x, v, m, dt, dx, dmag)
x::MMatrix{NBODIES, 3, Float64, NBODIES * 3},
v::MMatrix{NBODIES, 3, Float64, NBODIES * 3},
m::NTuple{NBODIES, Float64},
dt::Float64,
dx::MMatrix{NPAIRS, 3, Float64, NPAIRS * 3},
dmag::MVector{NPAIRS, Float64})

dmag_v2d_ptr = Base.unsafe_convert(Ptr{v2d}, pointer_from_objref(dmag))
dx_v2d_ptr = Base.unsafe_convert(Ptr{v2d}, pointer_from_objref(dx))

# Unroll loop to calculate distances + store two at a time
@inbounds for k1 = 1:2:length(PAIRS)
k2 = k1 + 1
k_v2d = k2 ÷ 2

i1, j1 = PAIRS[k1]
i2, j2 = PAIRS[k2]

dx1 = v2d((x[i1, 1], x[i2, 1])) - v2d((x[j1, 1], x[j2, 1]))
dx2 = v2d((x[i1, 2], x[i2, 2])) - v2d((x[j1, 2], x[j2, 2]))
dx3 = v2d((x[i1, 3], x[i2, 3])) - v2d((x[j1, 3], x[j2, 3]))
unsafe_store!(dx_v2d_ptr, dx1, k_v2d)
unsafe_store!(dx_v2d_ptr, dx2, k_v2d + NPAIRS ÷ 2)
unsafe_store!(dx_v2d_ptr, dx3, k_v2d + NPAIRS)

dsq = dx1^2 + dx2^2 + dx3^2
drsqrt = rsqrt_pd_newton(dsq)
Copy link
Contributor

@non-Jedi non-Jedi Sep 10, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have you tried a version of this that uses all 4 available Float32 and only has 3 calls to _mm_rsqrt_ps instead of 5? It may be slower since it would require caching the dsq values and drsqrt values so you could break this single loop into 3.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't yet, will give it a try.

The double-single conversions and shuffling may end up being a bottleneck, at least for the competition's core2 target since they are only 2-wide; in particular cvtps_pd only expands a contiguous pair of floats into a contiguous pairs of doubles.

If we allow AVX, a more optimal implementation may end up using Vec4 coordinates + 4-wide rsqrt, e.g. switching between broadcasting across coordinates vs. across planets.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A 4-wide _mm_rsqrt_ps is slightly slower due to the extra caching involved (~4.3s vs. 4.15s on my machine).

Would you like me to clean this up this PR?

mag = dt * drsqrt / dsq
unsafe_store!(dmag_v2d_ptr, mag, k_v2d)
end

k = 1
@inbounds for (i, j) = PAIRS
dmag_i = dmag[k] * m[i]
dmag_j = dmag[k] * m[j]
for d = 1:3
dx_k = dx[k, d]
v[i, d] -= dx_k * dmag_j
v[j, d] += dx_k * dmag_i
end
k += 1
end

@inbounds for i = 1:NBODIES
for d = 1:3
x[i, d] += dt * v[i, d]
end
end
end

function energy(bodies)
x, v, m = bodies.x, bodies.v, bodies.m
e = 0.0
for i = 1:NBODIES
e += 0.5 * m[i] * sum(v[i, :].^2)
for j = i + 1:NBODIES
dx = x[i, :] - x[j, :]
distance = sqrt(sum(dx .* dx))
e -= (m[i] * m[j]) / distance
end
end
return e
end

function init_sun!(bodies)
px = [0.0, 0.0, 0.0]
for i = 1:NBODIES
px += bodies.v[i, :] * bodies.m[i]
end
bodies.v[1, :] = -px ./ solar_mass
end

function main(iterations::Int64)
n = iterations

x = zeros(MMatrix{NBODIES, 3, Float64, 15})
v = zeros(MMatrix{NBODIES, 3, Float64, 15})
m = NTuple{NBODIES, Float64}((
1.0,
9.54791938424326609e-04,
2.85885980666130812e-04,
4.36624404335156298e-05,
5.15138902046611451e-05,
) .* solar_mass)
bodies = Bodies(x, v, m)

init_bodies!(bodies)
init_sun!(bodies)
@printf("%.9f\n", energy(bodies))

# Buffers
dx = zeros(MMatrix{NPAIRS, 3, Float64, 30})
dmag = zeros(MVector{NPAIRS, Float64})
for _ = 1:n
advance(x, v, m, 0.01, dx, dmag)
end

@printf("%.9f\n", energy(bodies))
end

end

@time NBody.main(parse(Int64, ARGS[1]))
@time NBody.main(parse(Int64, ARGS[1]))

# > julia -O3 -C core2 -- nbody_unsafe_simd.jl 50000000
# -0.169075164
# -0.169060076
# 7.603160 seconds (7.88 M allocations: 397.438 MiB, 1.82% gc time)
# -0.169075164
# -0.169060076
# 4.172609 seconds (470 allocations: 11.891 KiB)

# using StaticArrays, InteractiveUtils
# code_native(nb.advance,
# (MMatrix{nb.NBODIES, 3, Float64, nb.NBODIES * 3},
# MMatrix{nb.NBODIES, 3, Float64, nb.NBODIES * 3},
# NTuple{nb.NBODIES, Float64},
# Float64,
# MMatrix{nb.NPAIRS, 3, Float64, nb.NPAIRS * 3},
# MVector{nb.NPAIRS, Float64}))
Loading