Skip to content

Commit c045fbe

Browse files
committed
swizzle_dyn: 64 byte swizzle_dyn for AVX2
1 parent 936d58b commit c045fbe

File tree

1 file changed

+55
-0
lines changed

1 file changed

+55
-0
lines changed

crates/core_simd/src/swizzle_dyn.rs

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ where
8080
};
8181
transize(swizzler, self, idxs)
8282
}
83+
#[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
84+
64 => transize(avx2_pshufb512, self, idxs),
8385
// Notable absence: avx512bw pshufb shuffle
8486
#[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
8587
64 => {
@@ -171,6 +173,59 @@ unsafe fn avx2_pshufb(bytes: Simd<u8, 32>, idxs: Simd<u8, 32>) -> Simd<u8, 32> {
171173
}
172174
}
173175

176+
/// The above function but for 64 bytes
177+
///
178+
/// # Safety
179+
/// This requires AVX2 to work
180+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
181+
#[target_feature(enable = "avx2")]
182+
#[allow(unused)]
183+
#[inline]
184+
#[allow(clippy::let_and_return)]
185+
unsafe fn avx2_pshufb512(bytes: Simd<u8, 64>, idxs: Simd<u8, 64>) -> Simd<u8, 64> {
186+
use crate::simd::cmp::SimdPartialOrd;
187+
#[cfg(target_arch = "x86")]
188+
use core::arch::x86;
189+
#[cfg(target_arch = "x86_64")]
190+
use core::arch::x86_64 as x86;
191+
use x86::_mm256_blendv_epi8 as avx2_blend;
192+
use x86::_mm256_permute2x128_si256 as avx2_cross_shuffle;
193+
use x86::_mm256_shuffle_epi8 as avx2_half_pshufb;
194+
let high = Simd::splat(64u8);
195+
// SAFETY: Caller promised AVX2
196+
unsafe {
197+
let half_swizzler = |bytes0: Simd<u8, 32>, bytes1: Simd<u8, 32>, idxs: Simd<u8, 32>| {
198+
let mask0 = idxs << 2;
199+
let mask1 = idxs << 3;
200+
201+
let lolo0 = avx2_cross_shuffle::<0x00>(bytes0.into(), bytes0.into());
202+
let hihi0 = avx2_cross_shuffle::<0x11>(bytes0.into(), bytes0.into());
203+
let lolo0 = avx2_half_pshufb(lolo0, idxs.into());
204+
let hihi0 = avx2_half_pshufb(hihi0, idxs.into());
205+
let x = avx2_blend(lolo0, hihi0, mask1.into());
206+
207+
let lolo1 = avx2_cross_shuffle::<0x00>(bytes1.into(), bytes1.into());
208+
let hihi1 = avx2_cross_shuffle::<0x11>(bytes1.into(), bytes1.into());
209+
let lolo1 = avx2_half_pshufb(lolo1, idxs.into());
210+
let hihi1 = avx2_half_pshufb(hihi1, idxs.into());
211+
let y = avx2_blend(lolo1, hihi1, mask1.into());
212+
213+
avx2_blend(x, y, mask0.into())
214+
};
215+
216+
let bytes0 = bytes.extract::<0, 32>();
217+
let bytes1 = bytes.extract::<32, 32>();
218+
let idxs0 = idxs.extract::<0, 32>();
219+
let idxs1 = idxs.extract::<32, 32>();
220+
221+
let z0 = half_swizzler(bytes0, bytes1, idxs0);
222+
let z1 = half_swizzler(bytes0, bytes1, idxs1);
223+
let z = mem::transmute::<[Simd<u8, 32>; 2], Simd<u8, 64>>([z0.into(), z1.into()]);
224+
225+
idxs.simd_lt(high).select(z, Simd::splat(0u8))
226+
}
227+
}
228+
174229
/// This sets up a call to an architecture-specific function, and in doing so
175230
/// it persuades rustc that everything is the correct size. Which it is.
176231
/// This would not be needed if one could convince Rust that, by matching on N,

0 commit comments

Comments
 (0)