Skip to content

Commit 442031e

Browse files
committed
Tweak mandelbrot-fast.jl for performance.
Multiple versions with different threading strategies are included because which one is fastest depends on the machine. Depending on the machine, gains can exceed 20% compared to the original mandelbrot-fast.jl.

NOTE: running mandelbrot-fast.v3.jl requires installing https://github.com/mohamed82008/KissThreading.jl

Changes included in every version:
- Remove threading from filling xvals and yvals; the threading overhead is too high for such a simple operation.
- Remove the @simd annotation from mandel_inner; SIMD already occurs at the level of mand8, so @simd doesn't hurt runtime but does increase compilation time.
- Only run mandelbrot when !isinteractive(), to make development and debugging easier.
- Various tweaks and minor stylistic updates for succinctness and perhaps a marginal increase in performance.
1 parent 0329d60 commit 442031e

File tree

3 files changed

+157
-38
lines changed

3 files changed

+157
-38
lines changed

mandelbrot/mandelbrot-fast.jl

Lines changed: 28 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,8 @@ The Computer Language Benchmarks Game
66
modified for Julia 1.0 by Simon Danisch
77
=#
88
const zerov8 = ntuple(x-> 0f0, 8)
9-
10-
@inline function step_mandel(Zr,Zi,Tr,Ti,cr,ci)
11-
Zi = 2f0 .* Zr .* Zi .+ ci
12-
Zr = Tr .- Ti .+ cr
13-
Tr = Zr .* Zr
14-
Ti = Zi .* Zi
15-
return Zr,Zi,Tr,Ti
16-
end
9+
const masks = (0b01111111, 0b10111111, 0b11011111, 0b11101111, 0b11110111,
10+
0b11111011, 0b11111101, 0b11111110)
1711

1812
# Calculate mandelbrot set for one Vec8 into one byte
1913
Base.@propagate_inbounds function mand8(cr, ci)
@@ -24,51 +18,47 @@ Base.@propagate_inbounds function mand8(cr, ci)
2418
t = zerov8
2519
i = 0
2620

27-
while i<50
28-
for _ in 1:5
29-
Zr,Zi,Tr,Ti = step_mandel(Zr,Zi,Tr,Ti,cr,ci)
30-
i += 1
21+
for _=1:10
22+
for _=1:5
23+
Zi = 2f0 .* Zr .* Zi .+ ci
24+
Zr = Tr .- Ti .+ cr
25+
Tr = Zr .* Zr
26+
Ti = Zi .* Zi
3127
end
3228
t = Tr .+ Ti
3329
all(x-> x > 4f0, t) && (return 0x00)
3430
end
31+
3532
byte = 0xff
36-
t[1] <= 4.0 || (byte &= 0b01111111)
37-
t[2] <= 4.0 || (byte &= 0b10111111)
38-
t[3] <= 4.0 || (byte &= 0b11011111)
39-
t[4] <= 4.0 || (byte &= 0b11101111)
40-
t[5] <= 4.0 || (byte &= 0b11110111)
41-
t[6] <= 4.0 || (byte &= 0b11111011)
42-
t[7] <= 4.0 || (byte &= 0b11111101)
43-
t[8] <= 4.0 || (byte &= 0b11111110)
33+
for i=1:8
34+
t[i] <= 4.0 || (byte &= masks[i])
35+
end
4436
return byte
4537
end
4638

4739
function mandel_inner(rows, ci, y, N, xvals)
48-
@simd for x in 1:8:N
49-
@inbounds begin
50-
cr = ntuple(i-> xvals[x + (i - 1)], 8)
51-
rows[((y-1)*N÷8+(x-1)÷8) + 1] = mand8(cr, ci)
52-
end
40+
@inbounds for x=1:8:N
41+
cr = ntuple(i-> xvals[x + i - 1], 8)
42+
rows[((y-1)*N÷8+(x-1)÷8) + 1] = mand8(cr, ci)
5343
end
5444
end
5545

56-
function mandelbrot(n = 200)
46+
function mandelbrot(io, n = 200)
5747
inv_ = 2.0 / n
58-
N = n
59-
xvals = zeros(Float32, n)
60-
yvals = zeros(Float32, n)
61-
Threads.@threads for i in 0:(N-1)
62-
@inbounds xvals[i + 1] = i * inv_ - 1.5
63-
@inbounds yvals[i + 1] = i * inv_ - 1.0
48+
xvals = Vector{Float32}(undef, n)
49+
yvals = Vector{Float32}(undef, n)
50+
@inbounds for i in 0:(n-1)
51+
xvals[i + 1] = i * inv_ - 1.5
52+
yvals[i + 1] = i * inv_ - 1.0
6453
end
65-
rows = zeros(UInt8, n*N÷8)
66-
Threads.@threads for y in 1:N
54+
55+
rows = Vector{UInt8}(undef, n^2 ÷ 8)
56+
@sync for y=1:n
6757
@inbounds ci = yvals[y]
68-
mandel_inner(rows, ci, y, N, xvals)
58+
Threads.@spawn mandel_inner(rows, ci, y, n, xvals)
6959
end
70-
write(stdout, "P4\n$n $n\n")
71-
write(stdout, rows)
60+
write(io, "P4\n$n $n\n")
61+
write(io, rows)
7262
end
7363

74-
mandelbrot(parse(Int, ARGS[1]))
64+
isinteractive() || mandelbrot(stdout, parse(Int, ARGS[1]))

mandelbrot/mandelbrot-fast.v2.jl

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#=
2+
The Computer Language Benchmarks Game
3+
https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
4+
direct transliteration of the swift#3 program by Ralph Ganszky and Daniel Muellenborn:
5+
https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-swift-3.html
6+
modified for Julia 1.0 by Simon Danisch
7+
=#
8+
# Eight Float32 zeros; the starting value of every 8-lane accumulator in `mand8`.
const zerov8 = ntuple(_ -> 0.0f0, 8)

# Per-lane AND-masks used by `mand8`: entry `k` is 0xff with bit `8 - k` cleared,
# so lane 1 owns the most significant bit and lane 8 the least significant one.
const masks = ntuple(k -> ~(0x80 >> (k - 1)), 8)
11+
12+
"""
    mand8(cr, ci)

Run the Mandelbrot iteration for eight points at once and pack the outcome into
one `UInt8`.

`cr` carries the real parts of the eight points (an 8-tuple, as built by
`mandel_inner`); `ci` is the imaginary part, which the callers in this file pass
as a single value shared by all eight lanes.

Up to 50 iterations are performed in ten batches of five. After each batch the
squared magnitudes are checked, and `0x00` is returned as soon as all eight lanes
exceed 4. Otherwise the result starts as `0xff` and the bit for every lane whose
squared magnitude is not `<= 4` is cleared via `masks` (lane 1 is the most
significant bit).
"""
Base.@propagate_inbounds function mand8(cr, ci)
    Zr = zerov8
    Zi = zerov8
    Tr = zerov8  # Zr squared, lane by lane
    Ti = zerov8  # Zi squared, lane by lane
    t = zerov8   # assigned in the loop; declared here so it is visible afterwards

    for _ in 1:10
        for _ in 1:5
            Zi = 2f0 .* Zr .* Zi .+ ci
            Zr = Tr .- Ti .+ cr
            Tr = Zr .* Zr
            Ti = Zi .* Zi
        end
        t = Tr .+ Ti
        # every lane has escaped: no bit can be set, stop early
        all(x -> x > 4f0, t) && return 0x00
    end

    byte = 0xff
    for lane in 1:8
        # `<=` is false for NaN, so a lane that degenerated to NaN is cleared too
        t[lane] <= 4f0 || (byte &= masks[lane])
    end
    return byte
end
38+
39+
"""
    mandel_inner(rows, ci, y, N, xvals)

Compute the packed bytes of image row `y` and store them in `rows`.

`xvals` is consumed eight entries at a time; each group is passed to `mand8`
together with `ci`, and the resulting byte is written to
`rows[(y - 1) * N ÷ 8 + k]` for the `k`-th group of the row.

Bounds checking is switched off for the whole loop. NOTE(review): when `N` is
not a multiple of 8 the last group indexes past `xvals[N]`, so callers are
expected to pass a multiple of 8.
"""
function mandel_inner(rows, ci, y, N, xvals)
    rowbase = (y - 1) * N ÷ 8
    @inbounds for chunk in 0:(cld(N, 8) - 1)
        first_x = 8 * chunk
        cr = ntuple(lane -> xvals[first_x + lane], 8)
        rows[rowbase + chunk + 1] = mand8(cr, ci)
    end
end
45+
46+
"""
    mandelbrot(io, n = 200)

Render an `n`-by-`n` bitmap of the Mandelbrot set and write it to `io`: first a
`P4` header line block containing the dimensions, then the packed pixel bytes.

The sampled real parts start at -1.5 and the imaginary parts at -1.0, both with
a step of `2 / n`. Rows are computed in parallel with `Threads.@threads`, one
call to `mandel_inner` per row.

Returns the value of the final `write` call on `rows`.

# Throws
- `ArgumentError` if `n` is not a multiple of 8. Pixels are packed eight per byte
  without padding and `mandel_inner` indexes in groups of eight with bounds
  checks disabled, so any other `n` would read and write out of bounds.
"""
function mandelbrot(io, n = 200)
    n % 8 == 0 || throw(ArgumentError("n must be a multiple of 8, got $n"))

    inv_ = 2.0 / n
    xvals = Vector{Float32}(undef, n)
    yvals = Vector{Float32}(undef, n)
    # the coordinates are computed in Float64 and rounded to Float32 on store
    @inbounds for i in 0:(n - 1)
        xvals[i + 1] = i * inv_ - 1.5
        yvals[i + 1] = i * inv_ - 1.0
    end

    # one byte per eight pixels; every element is overwritten by mandel_inner
    rows = Vector{UInt8}(undef, n^2 ÷ 8)
    Threads.@threads for y in 1:n
        @inbounds ci = yvals[y]
        mandel_inner(rows, ci, y, n, xvals)
    end
    write(io, "P4\n$n $n\n")
    return write(io, rows)
end
63+
64+
# Script entry point: render to stdout with the size given as the first
# command-line argument, but only when not running in an interactive session.
if !isinteractive()
    mandelbrot(stdout, parse(Int, ARGS[1]))
end

mandelbrot/mandelbrot-fast.v3.jl

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
#=
2+
The Computer Language Benchmarks Game
3+
https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
4+
direct transliteration of the swift#3 program by Ralph Ganszky and Daniel Muellenborn:
5+
https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-swift-3.html
6+
modified for Julia 1.0 by Simon Danisch
7+
=#
8+
using KissThreading
9+
10+
# Eight Float32 zeros; the starting value of every 8-lane accumulator in `mand8`.
const zerov8 = ntuple(_ -> 0.0f0, 8)

# Per-lane AND-masks used by `mand8`: entry `k` is 0xff with bit `8 - k` cleared,
# so lane 1 owns the most significant bit and lane 8 the least significant one.
const masks = ntuple(k -> ~(0x80 >> (k - 1)), 8)
13+
14+
"""
    mand8(cr, ci)

Run the Mandelbrot iteration for eight points at once and pack the outcome into
one `UInt8`.

`cr` carries the real parts of the eight points (an 8-tuple, as built by
`mandel_inner`); `ci` is the imaginary part, which the callers in this file pass
as a single value shared by all eight lanes.

Up to 50 iterations are performed in ten batches of five. After each batch the
squared magnitudes are checked, and `0x00` is returned as soon as all eight lanes
exceed 4. Otherwise the result starts as `0xff` and the bit for every lane whose
squared magnitude is not `<= 4` is cleared via `masks` (lane 1 is the most
significant bit).
"""
Base.@propagate_inbounds function mand8(cr, ci)
    Zr = zerov8
    Zi = zerov8
    Tr = zerov8  # Zr squared, lane by lane
    Ti = zerov8  # Zi squared, lane by lane
    t = zerov8   # assigned in the loop; declared here so it is visible afterwards

    for _ in 1:10
        for _ in 1:5
            Zi = 2f0 .* Zr .* Zi .+ ci
            Zr = Tr .- Ti .+ cr
            Tr = Zr .* Zr
            Ti = Zi .* Zi
        end
        t = Tr .+ Ti
        # every lane has escaped: no bit can be set, stop early
        all(x -> x > 4f0, t) && return 0x00
    end

    byte = 0xff
    for lane in 1:8
        # `<=` is false for NaN, so a lane that degenerated to NaN is cleared too
        t[lane] <= 4f0 || (byte &= masks[lane])
    end
    return byte
end
40+
41+
"""
    mandel_inner(rows, ci, y, N, xvals)

Compute the packed bytes of image row `y` and store them in `rows`.

`xvals` is consumed eight entries at a time; each group is passed to `mand8`
together with `ci`, and the resulting byte is written to
`rows[(y - 1) * N ÷ 8 + k]` for the `k`-th group of the row.

Bounds checking is switched off for the whole loop. NOTE(review): when `N` is
not a multiple of 8 the last group indexes past `xvals[N]`, so callers are
expected to pass a multiple of 8.
"""
function mandel_inner(rows, ci, y, N, xvals)
    rowbase = (y - 1) * N ÷ 8
    @inbounds for chunk in 0:(cld(N, 8) - 1)
        first_x = 8 * chunk
        cr = ntuple(lane -> xvals[first_x + lane], 8)
        rows[rowbase + chunk + 1] = mand8(cr, ci)
    end
end
47+
48+
"""
    mandelbrot(io, n = 200)

Render an `n`-by-`n` bitmap of the Mandelbrot set and write it to `io`: first a
`P4` header block containing the dimensions, then the packed pixel bytes.

The sampled real parts start at -1.5 and the imaginary parts at -1.0, both with
a step of `2 / n`. Each row is produced by one call to `mandel_inner`, and the
rows are dispatched through KissThreading's `tmap!` with `batch_size=8`.

Returns the value of the final `write` call on `rows`.

# Throws
- `ArgumentError` if `n` is not a multiple of 8. Pixels are packed eight per byte
  without padding and `mandel_inner` indexes in groups of eight with bounds
  checks disabled, so any other `n` would read and write out of bounds.
"""
function mandelbrot(io, n = 200)
    n % 8 == 0 || throw(ArgumentError("n must be a multiple of 8, got $n"))

    inv_ = 2.0 / n
    xvals = Vector{Float32}(undef, n)
    yvals = Vector{Float32}(undef, n)
    # the coordinates are computed in Float64 and rounded to Float32 on store
    @inbounds for i in 0:(n - 1)
        xvals[i + 1] = i * inv_ - 1.5
        yvals[i + 1] = i * inv_ - 1.0
    end

    # one byte per eight pixels; every element is overwritten by mandel_inner
    rows = Vector{UInt8}(undef, n^2 ÷ 8)
    f(y) = @inbounds mandel_inner(rows, yvals[y], y, n, xvals)
    # mandel_inner returns nothing, so the destination is a throwaway
    # Vector{Nothing}; the useful output is the side effect on `rows`.
    # NOTE(review): presumably tmap! applies `f` to each row index in batches of
    # eight across threads; confirm against the KissThreading documentation.
    tmap!(f, Vector{Nothing}(undef, n), collect(1:n); batch_size=8)

    write(io, "P4\n$n $n\n")
    return write(io, rows)
end
64+
65+
# Script entry point: render to stdout with the size given as the first
# command-line argument, but only when not running in an interactive session.
if !isinteractive()
    mandelbrot(stdout, parse(Int, ARGS[1]))
end

0 commit comments

Comments
 (0)