|
80 | 80 | };
|
81 | 81 | transize(swizzler, self, idxs)
|
82 | 82 | }
|
| 83 | + #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))] |
| 84 | + 64 => transize(avx2_pshufb512, self, idxs), |
83 | 85 | // Notable absence: avx512bw pshufb shuffle
|
84 | 86 | #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
|
85 | 87 | 64 => {
|
@@ -171,6 +173,59 @@ unsafe fn avx2_pshufb(bytes: Simd<u8, 32>, idxs: Simd<u8, 32>) -> Simd<u8, 32> {
|
171 | 173 | }
|
172 | 174 | }
|
173 | 175 |
|
| 176 | +/// The above function but for 64 bytes |
| 177 | +/// |
| 178 | +/// # Safety |
| 179 | +/// This requires AVX2 to work |
| 180 | +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] |
| 181 | +#[target_feature(enable = "avx2")] |
| 182 | +#[allow(unused)] |
| 183 | +#[inline] |
| 184 | +#[allow(clippy::let_and_return)] |
| 185 | +unsafe fn avx2_pshufb512(bytes: Simd<u8, 64>, idxs: Simd<u8, 64>) -> Simd<u8, 64> { |
| 186 | + use crate::simd::cmp::SimdPartialOrd; |
| 187 | + #[cfg(target_arch = "x86")] |
| 188 | + use core::arch::x86; |
| 189 | + #[cfg(target_arch = "x86_64")] |
| 190 | + use core::arch::x86_64 as x86; |
| 191 | + use x86::_mm256_permute2x128_si256 as avx2_cross_shuffle; |
| 192 | + use x86::_mm256_shuffle_epi8 as avx2_half_pshufb; |
| 193 | + use x86::_mm256_blendv_epi8 as avx2_blend; |
| 194 | + let high = Simd::splat(64u8); |
| 195 | + // SAFETY: Caller promised AVX2 |
| 196 | + unsafe { |
| 197 | + let half_swizzler = |bytes0: Simd<u8, 32>, bytes1: Simd<u8, 32>, idxs: Simd<u8, 32>| { |
| 198 | + let mask0 = idxs << 2; |
| 199 | + let mask1 = idxs << 3; |
| 200 | + |
| 201 | + let lolo0 = avx2_cross_shuffle::<0x00>(bytes0.into(), bytes0.into()); |
| 202 | + let hihi0 = avx2_cross_shuffle::<0x11>(bytes0.into(), bytes0.into()); |
| 203 | + let lolo0 = avx2_half_pshufb(lolo0, idxs.into()); |
| 204 | + let hihi0 = avx2_half_pshufb(hihi0, idxs.into()); |
| 205 | + let x = avx2_blend(lolo0, hihi0, mask1.into()); |
| 206 | + |
| 207 | + let lolo1 = avx2_cross_shuffle::<0x00>(bytes1.into(), bytes1.into()); |
| 208 | + let hihi1 = avx2_cross_shuffle::<0x11>(bytes1.into(), bytes1.into()); |
| 209 | + let lolo1 = avx2_half_pshufb(lolo1, idxs.into()); |
| 210 | + let hihi1 = avx2_half_pshufb(hihi1, idxs.into()); |
| 211 | + let y = avx2_blend(lolo1, hihi1, mask1.into()); |
| 212 | + |
| 213 | + avx2_blend(x, y, mask0.into()) |
| 214 | + }; |
| 215 | + |
| 216 | + let bytes0 = bytes.extract::<0, 32>(); |
| 217 | + let bytes1 = bytes.extract::<32, 32>(); |
| 218 | + let idxs0 = idxs.extract::<0, 32>(); |
| 219 | + let idxs1 = idxs.extract::<32, 32>(); |
| 220 | + |
| 221 | + let z0 = half_swizzler(bytes0, bytes1, idxs0); |
| 222 | + let z1 = half_swizzler(bytes0, bytes1, idxs1); |
| 223 | + let z = mem::transmute::<[Simd<u8, 32>; 2], Simd<u8, 64>>([z0.into(), z1.into()]); |
| 224 | + |
| 225 | + idxs.simd_lt(high).select(z, Simd::splat(0u8)) |
| 226 | + } |
| 227 | +} |
| 228 | + |
174 | 229 | /// This sets up a call to an architecture-specific function, and in doing so
|
175 | 230 | /// it persuades rustc that everything is the correct size. Which it is.
|
176 | 231 | /// This would not be needed if one could convince Rust that, by matching on N,
|
|
0 commit comments