Skip to content

Commit f6ddb3f

Browse files
WIP: Hacky(!) but faster nbody implementations
1 parent 175c335 commit f6ddb3f

File tree

2 files changed

+484
-0
lines changed

2 files changed

+484
-0
lines changed

nbody/nbody_unsafe_simd.jl

Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,226 @@
1+
# Based on https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/nbody-rust-7.html
2+
#
3+
# The basic strategy matches Rust #7, based on gcc #4: use vectorized rsqrt
4+
# to compute pairwise distances.
5+
#
6+
# We deviate by skipping one Newton step: only a single refinement is applied to the rsqrt estimate.
7+
8+
module NBody
9+
10+
using StaticArrays, SIMD, Printf
11+
using Base: llvmcall
12+
13+
# Mass of the Sun and the factor applied to the tabulated planet velocities.
const solar_mass = 4π^2
const days_per_year = 365.24

# Number of bodies and number of unordered body pairs (i, j) with i < j.
const NBODIES = 5
const NPAIRS = NBODIES * (NBODIES - 1) ÷ 2

# Every index pair (i, j) with j > i. `i` is the fastest-varying index of the
# generator, so the order is (1,2), (1,3), (2,3), (1,4), ...
const PAIRS = Tuple((i, j) for i = 1:NBODIES, j = 1:NBODIES if j > i)
18+
19+
# State of the simulated system.
#
# - `x`: positions, one row per body and one column per coordinate.
# - `v`: velocities, same layout as `x`.
# - `m`: one mass per body.
#
# The matrix fields carry the full `MMatrix{S1, S2, T, L}` parameter list
# (including the element count `L`) so that they are concretely typed.
struct Bodies
    x::MMatrix{NBODIES, 3, Float64, NBODIES * 3}
    v::MMatrix{NBODIES, 3, Float64, NBODIES * 3}
    m::NTuple{NBODIES, Float64}
end
24+
25+
# Write the initial positions and velocities of the five bodies into
# `bodies.x` and `bodies.v` (row 1: Sun, rows 2-5: Jupiter, Saturn, Uranus,
# Neptune). The tabulated planet velocities are scaled by `days_per_year`.
function init_bodies!(bodies)
    pos, vel = bodies.x, bodies.v

    # Sun: at rest at the origin.
    pos[1, :] = zeros(3)
    vel[1, :] = zeros(3)

    # Jupiter
    pos[2, :] = [
        4.84143144246472090e+00, -1.16032004402742839e+00, -1.03622044471123109e-01,
    ]
    vel[2, :] = days_per_year .* [
        1.66007664274403694e-03, 7.69901118419740425e-03, -6.90460016972063023e-05,
    ]

    # Saturn
    pos[3, :] = [
        8.34336671824457987e+00, 4.12479856412430479e+00, -4.03523417114321381e-01,
    ]
    vel[3, :] = days_per_year .* [
        -2.76742510726862411e-03, 4.99852801234917238e-03, 2.30417297573763929e-05,
    ]

    # Uranus
    pos[4, :] = [
        1.28943695621391310e+01, -1.51111514016986312e+01, -2.23307578892655734e-01,
    ]
    vel[4, :] = days_per_year .* [
        2.96460137564761618e-03, 2.37847173959480950e-03, -2.96589568540237556e-05,
    ]

    # Neptune
    pos[5, :] = [
        1.53796971148509165e+01, -2.59193146099879641e+01, 1.79258772950371181e-01,
    ]
    vel[5, :] = days_per_year .* [
        2.68067772490389322e-03, 1.62824170038242295e-03, -9.51592254519715870e-05,
    ]
end
79+
80+
# 128-bit vector layouts expressed as tuples of `VecElement`s: four Float32
# lanes and two Float64 lanes respectively.
const __m128 = NTuple{4,VecElement{Float32}}
const __m128d = NTuple{2,VecElement{Float64}}

# Two-lane Float64 `Vec` (presumably the SIMD.jl vector type; confirm against
# the `using` line) used for the pairwise distance math.
const v2d = Vec{2,Float64}
83+
84+
# Reciprocal square root estimate of both lanes of `v2`: the raw lane tuple is
# passed through `rsqrt_ccall` and re-wrapped as a `v2d`.
@inline rsqrt_pd(v2::v2d) = v2d(rsqrt_ccall(v2.elts))
87+
88+
# Reciprocal square root of both lanes of `v2`: start from the `rsqrt_pd`
# estimate `y` and refine it with one Newton step,
#     y * 1.5 - (0.5 * v2 * y) * (y * y)
@inline function rsqrt_pd_newton(v2::v2d)
    y = rsqrt_pd(v2)
    # A single Newton step is enough for the accuracy needed here.
    half_v2_y = (0.5 * v2) * y
    y_squared = y * y
    return y * 1.5 - half_v2_y * y_squared
end
94+
95+
# Four-lane Float32 reciprocal square root via the LLVM intrinsic
# `llvm.x86.sse.rsqrt.ps`.
function rsqrt(f::__m128)
    return ccall("llvm.x86.sse.rsqrt.ps", llvmcall, __m128, (__m128,), f)
end
96+
# Convert a two-lane Float64 vector to a four-lane Float32 vector via the LLVM
# intrinsic `llvm.x86.sse2.cvtpd2ps`.
function _mm_cvtpd_ps(f::__m128d)
    return ccall("llvm.x86.sse2.cvtpd2ps", llvmcall, __m128, (__m128d,), f)
end
97+
# Widen the two low Float32 lanes of `f` (lanes 0 and 1) to a two-lane Float64
# vector.
#
# The IR is passed as a single string. A 2-tuple first argument used to mean
# (declarations, IR), but newer Julia versions read it as
# (module IR, entry-point name), so the tuple form with an empty first element
# is not portable. No declarations are needed here, so the plain string is
# equivalent.
_mm_cvtps_pd(f::__m128) = llvmcall(
    """
    %2 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 0, i32 1>
    %3 = fpext <2 x float> %2 to <2 x double>
    ret <2 x double> %3
    """,
    __m128d,
    Tuple{__m128},
    f,
)
103+
# Reciprocal square root estimate for two Float64 lanes: narrow to Float32,
# apply the Float32 `rsqrt`, then widen the result back to Float64.
@inline function rsqrt_ccall(f::__m128d)
    singles = _mm_cvtpd_ps(f)
    estimate = rsqrt(singles)
    return _mm_cvtps_pd(estimate)
end
104+
105+
# Advance the system by one step of size `dt`, updating `x` (positions) and
# `v` (velocities) in place.
#
# `dx` and `dmag` are caller-provided scratch buffers that are overwritten on
# every call: `dx[k, d]` receives the coordinate-`d` separation of pair
# `PAIRS[k]`, and `dmag[k]` receives `dt / distance^3` for that pair.
@inline function advance(
    x::MMatrix{NBODIES, 3, Float64, NBODIES * 3},
    v::MMatrix{NBODIES, 3, Float64, NBODIES * 3},
    m::NTuple{NBODIES, Float64},
    dt::Float64,
    dx::MMatrix{NPAIRS, 3, Float64, NPAIRS * 3},
    dmag::MVector{NPAIRS, Float64},
)
    # The loop below handles pairs two at a time and addresses the buffers in
    # units of two Float64s, which is only valid for an even pair count.
    iseven(NPAIRS) || error("advance requires an even number of body pairs")

    # Raw pointers into the scratch buffers are only valid while the objects
    # are kept alive, so keep them rooted for as long as the pointers are used.
    GC.@preserve dx dmag begin
        dmag_v2d_ptr = Base.unsafe_convert(Ptr{v2d}, pointer_from_objref(dmag))
        dx_v2d_ptr = Base.unsafe_convert(Ptr{v2d}, pointer_from_objref(dx))

        # Compute separations and magnitudes for two pairs per iteration and
        # store each result as one two-lane vector.
        @inbounds for k1 = 1:2:length(PAIRS)
            k2 = k1 + 1
            k_v2d = k2 ÷ 2

            i1, j1 = PAIRS[k1]
            i2, j2 = PAIRS[k2]

            dx1 = v2d((x[i1, 1], x[i2, 1])) - v2d((x[j1, 1], x[j2, 1]))
            dx2 = v2d((x[i1, 2], x[i2, 2])) - v2d((x[j1, 2], x[j2, 2]))
            dx3 = v2d((x[i1, 3], x[i2, 3])) - v2d((x[j1, 3], x[j2, 3]))

            # `dx` is column-major with NPAIRS rows, so in two-element units
            # column 2 starts NPAIRS ÷ 2 slots in and column 3 NPAIRS slots in.
            unsafe_store!(dx_v2d_ptr, dx1, k_v2d)
            unsafe_store!(dx_v2d_ptr, dx2, k_v2d + NPAIRS ÷ 2)
            unsafe_store!(dx_v2d_ptr, dx3, k_v2d + NPAIRS)

            dsq = dx1^2 + dx2^2 + dx3^2
            drsqrt = rsqrt_pd_newton(dsq)
            # dt * (1 / sqrt(dsq)) / dsq == dt / distance^3
            mag = dt * drsqrt / dsq
            unsafe_store!(dmag_v2d_ptr, mag, k_v2d)
        end
    end

    # Apply each pair's contribution to both bodies' velocities.
    @inbounds for k = 1:NPAIRS
        i, j = PAIRS[k]
        dmag_i = dmag[k] * m[i]
        dmag_j = dmag[k] * m[j]
        for d = 1:3
            dx_k = dx[k, d]
            v[i, d] -= dx_k * dmag_j
            v[j, d] += dx_k * dmag_i
        end
    end

    # Move every body along its updated velocity.
    @inbounds for i = 1:NBODIES
        for d = 1:3
            x[i, d] += dt * v[i, d]
        end
    end
end
155+
156+
# Total energy of the system: the sum over bodies of 0.5 * m * |v|^2 minus the
# sum over pairs i < j of m[i] * m[j] / distance(i, j).
function energy(bodies)
    pos, vel, mass = bodies.x, bodies.v, bodies.m
    total = 0.0
    for i = 1:NBODIES
        # Kinetic term of body i.
        kinetic = 0.5 * mass[i] * sum(abs2, vel[i, :])
        total += kinetic

        # Potential term of body i with every later body.
        for j = (i + 1):NBODIES
            separation = pos[i, :] - pos[j, :]
            distance = sqrt(sum(abs2, separation))
            total -= (mass[i] * mass[j]) / distance
        end
    end
    return total
end
169+
170+
# Set the velocity of body 1 to minus the summed momentum (mass * velocity) of
# all bodies, divided by `solar_mass`.
function init_sun!(bodies)
    vel, mass = bodies.v, bodies.m
    momentum = [0.0, 0.0, 0.0]
    for body = 1:NBODIES
        momentum = momentum + vel[body, :] * mass[body]
    end
    vel[1, :] = -momentum ./ solar_mass
end
177+
178+
# Run the simulation for `iterations` steps of size 0.01, printing the total
# energy (nine decimals) before the first step and after the last one.
function main(iterations::Int64)
    # Buffer lengths are derived from NBODIES / NPAIRS so they always match the
    # argument types expected by `advance`.
    x = zeros(MMatrix{NBODIES, 3, Float64, NBODIES * 3})
    v = zeros(MMatrix{NBODIES, 3, Float64, NBODIES * 3})
    # Masses relative to the first body, scaled by `solar_mass`.
    m = NTuple{NBODIES, Float64}((
        1.0,
        9.54791938424326609e-04,
        2.85885980666130812e-04,
        4.36624404335156298e-05,
        5.15138902046611451e-05,
    ) .* solar_mass)
    bodies = Bodies(x, v, m)

    init_bodies!(bodies)
    init_sun!(bodies)
    @printf("%.9f\n", energy(bodies))

    # Scratch buffers reused by every call to `advance`.
    dx = zeros(MMatrix{NPAIRS, 3, Float64, NPAIRS * 3})
    dmag = zeros(MVector{NPAIRS, Float64})
    for _ = 1:iterations
        advance(x, v, m, 0.01, dx, dmag)
    end

    @printf("%.9f\n", energy(bodies))
end
205+
206+
end
207+
208+
# Time the simulation twice with the step count given as the first
# command-line argument (presumably so that the second timing is free of
# compilation overhead).
for _ in 1:2
    @time NBody.main(parse(Int64, ARGS[1]))
end
210+
211+
# > julia -O3 -C core2 -- nbody_unsafe_simd.jl 50000000
212+
# -0.169075164
213+
# -0.169060076
214+
# 7.603160 seconds (7.88 M allocations: 397.438 MiB, 1.82% gc time)
215+
# -0.169075164
216+
# -0.169060076
217+
# 4.172609 seconds (470 allocations: 11.891 KiB)
218+
219+
# using StaticArrays, InteractiveUtils
220+
# code_native(nb.advance,
221+
# (MMatrix{nb.NBODIES, 3, Float64, nb.NBODIES * 3},
222+
# MMatrix{nb.NBODIES, 3, Float64, nb.NBODIES * 3},
223+
# NTuple{nb.NBODIES, Float64},
224+
# Float64,
225+
# MMatrix{nb.NPAIRS, 3, Float64, nb.NPAIRS * 3},
226+
# MVector{nb.NPAIRS, Float64}))

0 commit comments

Comments
 (0)