774774function imfilter! (r:: AbstractCPU{FIRTiled{N}} , out:: AbstractArray{S,N} , A:: AbstractArray{T,N} , kernel:: Tuple{Any,Any,Vararg{Any}} , border:: NoPad , inds:: Indices = axes (out)) where {S,T,N}
775775 kern = kernel[1 ]
776776 iscopy (kern) && return imfilter! (r, out, A, tail (kernel), border, inds)
777- tmp = tile_allocate (filter_type (A, kernel), r. settings. tilesize, kernel)
777+ TTile, f = native_eltype (filter_type (A, kernel))
778+ @assert f == 1
779+ # @show r.settings.tilesize
780+ tmp = tile_allocate (TTile, r. settings. tilesize, kernel)
778781 _imfilter_tiled! (r, out, A, kernel, border, tmp, inds)
779782 out
780783end
834837
835838# Single-threaded, pair of kernels (with only one temporary buffer required)
836839function _imfilter_tiled! (r:: CPU1 , out, A, kernel:: Tuple{Any,Any} , border:: NoPad , tiles:: Vector{AA} , indsout) where AA<: AbstractArray
840+ out, A, kernel = maybe_reinterpret (out, A, kernel)
837841 k1, k2 = kernel
838842 tile = tiles[1 ]
839843 indsk2, indstile = axes (k2), axes (tile)
850854
851855# Multithreaded, pair of kernels
852856function _imfilter_tiled! (r:: CPUThreads , out, A, kernel:: Tuple{Any,Any} , border:: NoPad , tiles:: Vector{AA} , indsout) where AA<: AbstractArray
857+ out, A, kernel = maybe_reinterpret (out, A, kernel)
853858 k1, k2 = kernel
854859 tile = tiles[1 ]
855860 indsk2, indstile = axes (k2), axes (tile)
908913 out
909914end
910915
911- # The first of the pair in `tmp` has the current data. We also make
912- # the second a plain array so there's no doubt about who's holding the
913- # proper indices.
914- function _imfilter_tiled_swap! (r, out, kernel:: Tuple{Any,Any,Vararg{Any}} , border, tmp:: Tuple{TileBuffer,Array} )
916+ # The first of the pair in `tmp` has the current data.
917+ function _imfilter_tiled_swap! (r, out, kernel:: Tuple{Any,Any,Vararg{Any}} , border, tmp:: Tuple{TileBuffer,AbstractArray} )
915918 tileb1, tile2 = tmp
916919 k1, kt = kernel[1 ], tail (kernel)
917920 parentinds = axes (tileb1)
@@ -922,7 +925,7 @@ function _imfilter_tiled_swap!(r, out, kernel::Tuple{Any,Any,Vararg{Any}}, borde
922925end
923926
924927# on the last call we write to `out` instead of one of the buffers
925- function _imfilter_tiled_swap! (r, out, kernel:: Tuple{Any} , border, tmp:: Tuple{TileBuffer,Array } )
928+ function _imfilter_tiled_swap! (r, out, kernel:: Tuple{Any} , border, tmp:: Tuple{TileBuffer,AbstractArray } )
926929 tileb1 = tmp[1 ]
927930 k1 = kernel[1 ]
928931 parentinds = axes (tileb1)
@@ -1014,26 +1017,26 @@ end
10141017# This is unfortunate, but specializing this saves an add in the inner
10151018# loop and results in a modest performance improvement. It would be
10161019# nice if LLVM did this automatically. (@polly?)
1017- function __imfilter_inbounds! (r, out, A:: OffsetArray , kern:: OffsetArray , border, R, z)
1018- off, k = CartesianIndex (kern. offsets), parent (kern)
1019- o, O = safehead (off), safetail (off)
1020- Rnew = CartesianIndices (map ((x,y)-> x.+ y, R. indices, Tuple (off)))
1021- Rk = CartesianIndices (axes (k))
1022- offA, pA = CartesianIndex (A. offsets), parent (A)
1023- oA, OA = safehead (offA), safetail (offA)
1024- for I in safetail (Rnew)
1025- IA = I- OA
1026- for i in safehead (Rnew)
1027- tmp = z
1028- iA = i- oA
1029- @inbounds for J in safetail (Rk), j in safehead (Rk)
1030- tmp += safe_for_prod (pA[iA+ j,IA+ J], tmp)* k[j,J]
1031- end
1032- @inbounds out[i- o,I- O] = tmp
1033- end
1034- end
1035- out
1036- end
1020+ # function __imfilter_inbounds!(r, out, A::OffsetArray, kern::OffsetArray, border, R, z)
1021+ # off, k = CartesianIndex(kern.offsets), parent(kern)
1022+ # o, O = safehead(off), safetail(off)
1023+ # Rnew = CartesianIndices(map((x,y)->x.+y, R.indices, Tuple(off)))
1024+ # Rk = CartesianIndices(axes(k))
1025+ # offA, pA = CartesianIndex(A.offsets), parent(A)
1026+ # oA, OA = safehead(offA), safetail(offA)
1027+ # for I in safetail(Rnew)
1028+ # IA = I-OA
1029+ # for i in safehead(Rnew)
1030+ # tmp = z
1031+ # iA = i-oA
1032+ # @inbounds for J in safetail(Rk), j in safehead(Rk)
1033+ # tmp += safe_for_prod(pA[iA+j,IA+J], tmp)*k[j,J]
1034+ # end
1035+ # @inbounds out[i-o,I-O] = tmp
1036+ # end
1037+ # end
1038+ # out
1039+ # end
10371040
10381041function _imfilter_inbounds! (r:: AbstractResource , out, A:: AbstractArray , kern:: ReshapedOneD , border:: NoPad , inds)
10391042 Rpre, ind, Rpost = iterdims (inds, kern)
@@ -1043,68 +1046,110 @@ function _imfilter_inbounds!(r::AbstractResource, out, A::AbstractArray, kern::R
10431046 return out
10441047 end
10451048 p = accumfilter (A[first (R)+ first (Rk)], first (k))
1046- z = zero (typeof (p+ p))
1049+ z = float (zero (eltype (A)))
1050+ # z = zero(typeof(p+p))
10471051 _imfilter_inbounds! (r, z, out, A, k, Rpre, ind, Rpost)
10481052end
10491053
1050- # Many of the following are unfortunate specializations
1051- function _imfilter_inbounds! (r:: AbstractResource , z, out, A:: AbstractArray , k:: OffsetVector , Rpre:: CartesianIndices , ind, Rpost:: CartesianIndices )
1052- _imfilter_inbounds! (r, z, out, A, parent (k), Rpre, ind, Rpost, k. offsets[1 ])
1053- end
1054+ # # Many of the following are unfortunate specializations
1055+ # function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::OffsetVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices)
1056+ # _imfilter_inbounds!(r, z, out, A, parent(k), Rpre, ind, Rpost, k.offsets[1])
1057+ # end
10541058
1055- function _imfilter_inbounds! (r:: AbstractResource , z, out, A:: AbstractArray , k:: AbstractVector , Rpre:: CartesianIndices , ind, Rpost:: CartesianIndices , koffset= 0 )
1059+ # LoopVectorization.check_type(::Type{T}) where T<:ColorVectorSpace.MathTypes = true
1060+ # @generated function VectorizationBase.zero_vecunroll(::StaticInt{N}, ::StaticInt{W}, ::Type{Gray{T}}, ::StaticInt{RS}) where {N,W,T,RS}
1061+ # Expr(:block, Expr(:meta, :inline), :(VectorizationBase._vzero(VecUnroll{$(N-1),$W,$T,Vec{$W,$T}}, StaticInt{$RS}())))
1062+ # end
1063+ # function VectorizationBase._vload_unroll(
1064+ # sptr::AbstractStridedPointer{Gray{T},N,C,B}, u::Unroll{AU,F,UN,AV,W,M,UX,I}, ::A, ::StaticInt{RS}, ::StaticInt{X}
1065+ # ) where {T<:NativeTypes,N,C,B,AU,F,UN,AV,W,M,UX,I<:IndexNoUnroll,A<:StaticBool,RS,X}
1066+ # VectorizationBase.VecUnroll{N,1,T,T}(x::S) where {N,T<:VectorizationBase.NativeTypes,S<:FixedPoint{T}} =
1067+ # VectorizationBase.VecUnroll{N,1,T,T}(reinterpret(x))
1068+ # VectorizationBase.VecUnroll(x::FixedPoint) = VectorizationBase.VecUnroll(reinterpret(x))
1069+ # VectorizationBase.VecUnroll(x::AbstractGray) = VectorizationBase.VecUnroll(gray(x))
1070+ # VectorizationBase.VecUnroll(x::Gray) where {N,T<:VectorizationBase.NativeTypes} =
1071+ # VectorizationBase.VecUnroll{N,1,T,T}(reinterpret(x))
1072+
1073+ const LVTypes = Union{VectorizationBase. NativeTypes, SVector{N,<: VectorizationBase.NativeTypes } where N}
1074+
1075+ const args = Ref {Any} ()
1076+ function _imfilter_inbounds! (r:: AbstractResource , z, out:: AbstractArray{<:LVTypes} , A:: AbstractArray{<:LVTypes} , k:: AbstractVector , Rpre:: CartesianIndices , ind, Rpost:: CartesianIndices )
1077+ if ! LoopVectorization. check_args (out, A)
1078+ @show summary (out) summary (A)
1079+ args[] = (deepcopy (out), deepcopy (A))
1080+ error (" this should have worked" )
1081+ end
10561082 indsk = axes (k, 1 )
1083+ zout = convert (eltype (out), z)
10571084 for Ipost in Rpost
10581085 for i in ind
1059- ik = i+ koffset
1060- for Ipre in Rpre
1061- tmp = z
1086+ @turbo for Ipre in Rpre
1087+ tmp = zout
10621088 for j in indsk
1063- @inbounds tmp += safe_for_prod (A[Ipre,ik + j,Ipost], tmp )* k[j]
1089+ tmp += safe_for_prod (A[Ipre,i + j,Ipost], z )* k[j]
10641090 end
1065- @inbounds out[Ipre,i,Ipost] = tmp
1091+ out[Ipre,i,Ipost] = tmp
10661092 end
10671093 end
10681094 end
10691095 out
10701096end
10711097
1072- function _imfilter_inbounds! (r:: AbstractResource , out, A:: OffsetArray , kern:: ReshapedVector , border:: NoPad , inds)
1073- Rpre, ind, Rpost = iterdims (inds, kern)
1074- k = kern. data
1075- R, Rk = CartesianIndices (inds), CartesianIndices (axes (kern))
1076- if isempty (R) || isempty (Rk)
1077- return out
1078- end
1079- p = accumfilter (A[first (R)+ first (Rk)], first (k))
1080- z = zero (typeof (p+ p))
1081- Opre, o, Opost = KernelFactors. indexsplit (CartesianIndex (A. offsets), kern)
1082- _imfilter_inbounds! (r, z, out, parent (A), k, Rpre, ind, Rpost, Opre, o, Opost)
1083- end
1084-
1085- function _imfilter_inbounds! (r:: AbstractResource , z, out, A:: AbstractArray , k:: OffsetVector , Rpre:: CartesianIndices , ind, Rpost:: CartesianIndices , Opre, o, Opost)
1086- _imfilter_inbounds! (r, z, out, A, parent (k), Rpre, ind, Rpost, Opre, o, Opost, k. offsets[1 ])
1087- end
1088-
1089- function _imfilter_inbounds! (r:: AbstractResource , z, out, A:: AbstractArray , k:: AbstractVector , Rpre:: CartesianIndices , ind, Rpost:: CartesianIndices , Opre, o, Opost, koffset= 0 )
1098+ # No @turbo version
1099+ function _imfilter_inbounds! (r:: AbstractResource , z, out:: AbstractArray{<:LVTypes} , A:: AbstractArray{<:LVTypes} , k:: AbstractVector , Rpre:: CartesianIndices , ind, Rpost:: CartesianIndices )
10901100 indsk = axes (k, 1 )
1101+ zout = convert (eltype (out), z)
10911102 for Ipost in Rpost
1092- IOpost = Ipost - Opost
10931103 for i in ind
1094- io = i- o+ koffset
1095- for Ipre in Rpre
1096- IOpre = Ipre - Opre
1097- tmp = z
1104+ @inbounds for Ipre in Rpre
1105+ tmp = zout
10981106 for j in indsk
1099- @inbounds tmp += safe_for_prod (A[IOpre,io + j,IOpost ], tmp )* k[j]
1107+ tmp += safe_for_prod (A[Ipre,i + j,Ipost ], z )* k[j]
11001108 end
1101- @inbounds out[Ipre,i,Ipost] = tmp
1109+ out[Ipre,i,Ipost] = tmp
11021110 end
11031111 end
11041112 end
11051113 out
11061114end
1107- # end unfortunate specializations
1115+
1116+ # function _imfilter_inbounds!(r::AbstractResource, out, A::OffsetArray, kern::ReshapedVector, border::NoPad, inds)
1117+ # Rpre, ind, Rpost = iterdims(inds, kern)
1118+ # k = kern.data
1119+ # R, Rk = CartesianIndices(inds), CartesianIndices(axes(kern))
1120+ # if isempty(R) || isempty(Rk)
1121+ # return out
1122+ # end
1123+ # p = accumfilter(A[first(R)+first(Rk)], first(k))
1124+ # z = zero(typeof(p+p))
1125+ # Opre, o, Opost = KernelFactors.indexsplit(CartesianIndex(A.offsets), kern)
1126+ # _imfilter_inbounds!(r, z, out, parent(A), k, Rpre, ind, Rpost, Opre, o, Opost)
1127+ # end
1128+
1129+ # function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::OffsetVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, Opre, o, Opost)
1130+ # _imfilter_inbounds!(r, z, out, A, parent(k), Rpre, ind, Rpost, Opre, o, Opost, k.offsets[1])
1131+ # end
1132+
1133+ # function _imfilter_inbounds!(r::AbstractResource, z, out, A::AbstractArray, k::AbstractVector, Rpre::CartesianIndices, ind, Rpost::CartesianIndices, Opre, o, Opost)
1134+ # indsk = axes(k, 1)
1135+ # zout = convert(eltype(out), z)
1136+ # for Ipost in Rpost
1137+ # IOpost = Ipost - Opost
1138+ # for i in ind
1139+ # io = i-o+koffset
1140+ # @turbo for Ipre in Rpre
1141+ # IOpre = Ipre - Opre
1142+ # tmp = zout
1143+ # for j in indsk
1144+ # tmp += safe_for_prod(A[IOpre,io+j,IOpost], z)*k[j]
1145+ # end
1146+ # @inbounds out[Ipre,i,Ipost] = tmp
1147+ # end
1148+ # end
1149+ # end
1150+ # out
1151+ # end
1152+ # # end unfortunate specializations
11081153
11091154# # commented out because "virtual padding" is commented out
11101155# function _imfilter_iter!(r::AbstractResource, out, padded, kernel::AbstractArray, iter)
0 commit comments