Optimize pow5_12(::Float32) (#493)

kimikage · web-flow · commit bf61512febc0 · 2021-06-24T00:21:07.000+09:00
diff --git a/src/conversions.jl b/src/conversions.jl
@@ -81,8 +81,9 @@ correct_gamut(c::CV) where {CV<:TransparentRGB} =
 @inline function srgb_compand(v)
     F = typeof(0.5f0v) === Float32 ? Float32 : promote_type(Float64, typeof(v))
     vf = F(v)
+    vc = @fastmath max(vf, F(0.0031308))
     # `pow5_12` is an optimized function to get `v^(1/2.4)`
-    vf > F(0.0031308) ? muladd(F(1.055), F(pow5_12(vf)), F(-0.055)) : F(12.92) * vf
+    vf > F(0.0031308) ? muladd(F(1.055), F(pow5_12(vc)), F(-0.055)) : F(12.92) * vf
 end
 
 function _hsx_to_rgb(im::UInt8, v, n, m)
diff --git a/src/utilities.jl b/src/utilities.jl
@@ -92,8 +92,8 @@ pow3_4(x) = (y = @fastmath(sqrt(x)); y*@fastmath(sqrt(y))) # x^(3/4)
 
 # `pow5_12` is called from `srgb_compand`.
 pow5_12(x) = pow3_4(x) / cbrt(x) # 5/12 == 1/2 + 1/4 - 1/3 == 3/4 - 1/3
-pow5_12(x::Float32) = Float32(pow5_12(Float64(x)))
 @inline function pow5_12(x::Float64)
+    @noinline _cbrt(x) = cbrt01(x)
     p3_4 = pow3_4(x)
     # x^(-1/6)
     if x < 0.02
@@ -106,7 +106,7 @@ pow5_12(x::Float32) = Float32(pow5_12(Float64(x)))
         t0 = @evalpoly(x, 1.7047813285940905, -3.1261253501167308,
             7.498744828350077, -10.100319516746419, 6.820601476522508, -1.7978894213531524)
     else
-        return p3_4 / cbrt(x)
+        return p3_4 / _cbrt(x)
     end
     # x^(-1/3)
     t1 = t0 * t0
@@ -117,6 +117,20 @@ pow5_12(x::Float32) = Float32(pow5_12(Float64(x)))
     # x^(3/4) * x^(-1/3)
     muladd(p3_4, t2, p3_4 * t2h)
 end
+@inline function pow5_12(x::Float32)
+    # x^(-1/3)
+    rc = rcbrt(x)
+    rcx = -rc * x
+    rch = muladd(muladd(rc, x, rcx), -rc^2, muladd(rc^2, rcx, 1.0f0)) # 1 - x * rc^3
+    rce = muladd(2/9f0, rch, 1/3f0) * rch * rc
+    # x^(3/4)
+    p3_4_f64 = pow3_4(Float64(x))
+    p3_4r = reinterpret(Float64, reinterpret(UInt64, p3_4_f64) & 0xffffffff_e0000000)
+    p3_4 = Float32(p3_4r)
+    p3_4e = Float32(p3_4_f64 - p3_4r)
+    # x^(3/4) * x^(-1/3)
+    muladd(p3_4, rc, muladd(p3_4, rce, p3_4e * rc))
+end
 
 # `pow12_5` is called from `invert_srgb_compand`.
 pow12_5(x) = pow12_5(Float64(x))