Merge pull request #1928 from sayantn/use-intrinsics
Use SIMD intrinsics wherever possible
diff --git a/crates/core_arch/src/x86/adx.rs b/crates/core_arch/src/x86/adx.rs
index 5ba7664..9ce65b7 100644
--- a/crates/core_arch/src/x86/adx.rs
+++ b/crates/core_arch/src/x86/adx.rs
@@ -5,8 +5,6 @@
unsafe extern "unadjusted" {
#[link_name = "llvm.x86.addcarry.32"]
fn llvm_addcarry_u32(a: u8, b: u32, c: u32) -> (u8, u32);
- #[link_name = "llvm.x86.addcarryx.u32"]
- fn llvm_addcarryx_u32(a: u8, b: u32, c: u32, d: *mut u32) -> u8;
#[link_name = "llvm.x86.subborrow.32"]
fn llvm_subborrow_u32(a: u8, b: u32, c: u32) -> (u8, u32);
}
@@ -35,7 +33,7 @@
#[cfg_attr(test, assert_instr(adc))]
#[stable(feature = "simd_x86_adx", since = "1.33.0")]
pub unsafe fn _addcarryx_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 {
- llvm_addcarryx_u32(c_in, a, b, out as *mut _)
+ _addcarry_u32(c_in, a, b, out)
}
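// Illustrative scalar model of the add-with-carry semantics that `_addcarry_u32`
// (and therefore `_addcarryx_u32`) provides; `addcarry_u32_model` is a hypothetical
// helper for exposition, not a crate item.
fn addcarry_u32_model(c_in: u8, a: u32, b: u32) -> (u8, u32) {
    // Widen to 64 bits so the carry out of bit 31 becomes visible in bit 32;
    // the carry-in is treated as a boolean.
    let wide = a as u64 + b as u64 + (c_in != 0) as u64;
    ((wide >> 32) as u8, wide as u32)
}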
/// Adds unsigned 32-bit integers `a` and `b` with unsigned 8-bit carry-in `c_in`
diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs
index 24e0cf6..c1bb897 100644
--- a/crates/core_arch/src/x86/avx.rs
+++ b/crates/core_arch/src/x86/avx.rs
@@ -587,7 +587,11 @@
#[cfg_attr(test, assert_instr(vhaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
- unsafe { vhaddpd(a, b) }
+ unsafe {
+ let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
+ let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
+ simd_add(even, odd)
+ }
}
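// Illustrative-only scalar model of the shuffle+add decomposition above: `even`
// gathers a[0], b[0], a[2], b[2] and `odd` gathers a[1], b[1], a[3], b[3], so the
// result is [a0+a1, b0+b1, a2+a3, b2+b3]. `hadd_pd_model` is a hypothetical helper.
fn hadd_pd_model(a: [f64; 4], b: [f64; 4]) -> [f64; 4] {
    let even = [a[0], b[0], a[2], b[2]];
    let odd = [a[1], b[1], a[3], b[3]];
    [
        even[0] + odd[0],
        even[1] + odd[1],
        even[2] + odd[2],
        even[3] + odd[3],
    ]
}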
/// Horizontal addition of adjacent pairs in the two packed vectors
@@ -602,7 +606,11 @@
#[cfg_attr(test, assert_instr(vhaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
- unsafe { vhaddps(a, b) }
+ unsafe {
+ let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
+ let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
+ simd_add(even, odd)
+ }
}
/// Horizontal subtraction of adjacent pairs in the two packed vectors
@@ -616,7 +624,11 @@
#[cfg_attr(test, assert_instr(vhsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
- unsafe { vhsubpd(a, b) }
+ unsafe {
+ let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
+ let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
+ simd_sub(even, odd)
+ }
}
/// Horizontal subtraction of adjacent pairs in the two packed vectors
@@ -631,7 +643,11 @@
#[cfg_attr(test, assert_instr(vhsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
- unsafe { vhsubps(a, b) }
+ unsafe {
+ let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
+ let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
+ simd_sub(even, odd)
+ }
}
/// Computes the bitwise XOR of packed double-precision (64-bit) floating-point
@@ -1218,7 +1234,10 @@
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
static_assert_uimm_bits!(IMM8, 8);
- unsafe { vperm2f128ps256(a, b, IMM8 as i8) }
+ _mm256_castsi256_ps(_mm256_permute2f128_si256::<IMM8>(
+ _mm256_castps_si256(a),
+ _mm256_castps_si256(b),
+ ))
}
/// Shuffles 256 bits (composed of 4 packed double-precision (64-bit)
@@ -1232,7 +1251,10 @@
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
static_assert_uimm_bits!(IMM8, 8);
- unsafe { vperm2f128pd256(a, b, IMM8 as i8) }
+ _mm256_castsi256_pd(_mm256_permute2f128_si256::<IMM8>(
+ _mm256_castpd_si256(a),
+ _mm256_castpd_si256(b),
+ ))
}
/// Shuffles 128 bits (composed of integer data) selected by `imm8`
@@ -1246,7 +1268,35 @@
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2f128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
static_assert_uimm_bits!(IMM8, 8);
- unsafe { transmute(vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8)) }
+ const fn idx(imm8: i32, pos: u32) -> u32 {
+ let part = if pos < 2 {
+ imm8 & 0xf
+ } else {
+ (imm8 & 0xf0) >> 4
+ };
+ 2 * (part as u32 & 0b11) + (pos & 1)
+ }
+ const fn idx0(imm8: i32, pos: u32) -> u32 {
+ let part = if pos < 2 {
+ imm8 & 0xf
+ } else {
+ (imm8 & 0xf0) >> 4
+ };
+ if part & 0b1000 != 0 { 4 } else { pos }
+ }
+ unsafe {
+ let r = simd_shuffle!(
+ a.as_i64x4(),
+ b.as_i64x4(),
+ [idx(IMM8, 0), idx(IMM8, 1), idx(IMM8, 2), idx(IMM8, 3)]
+ );
+ let r: i64x4 = simd_shuffle!(
+ r,
+ i64x4::ZERO,
+ [idx0(IMM8, 0), idx0(IMM8, 1), idx0(IMM8, 2), idx0(IMM8, 3)]
+ );
+ r.as_m256i()
+ }
}
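// Illustrative scalar model of the IMM8 decoding performed by `idx`/`idx0` above,
// working directly on 128-bit halves; `permute2f128_model` is a hypothetical helper.
fn permute2f128_model(a: [u128; 2], b: [u128; 2], imm8: u8) -> [u128; 2] {
    let pick = |ctrl: u8| -> u128 {
        if ctrl & 0b1000 != 0 {
            0 // bit 3 of the control nibble zeroes the destination half
        } else {
            // bits [1:0] select a.lo, a.hi, b.lo or b.hi
            [a[0], a[1], b[0], b[1]][(ctrl & 0b11) as usize]
        }
    };
    [pick(imm8 & 0x0f), pick(imm8 >> 4)]
}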
/// Broadcasts a single-precision (32-bit) floating-point element from memory
@@ -1933,7 +1983,10 @@
#[cfg_attr(test, assert_instr(vptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
- unsafe { ptestz256(a.as_i64x4(), b.as_i64x4()) }
+ unsafe {
+ let r = simd_and(a.as_i64x4(), b.as_i64x4());
+ (0i64 == simd_reduce_or(r)) as i32
+ }
}
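// Illustrative scalar model: `_mm256_testz_si256` returns 1 exactly when `a & b`
// is all-zero across the full 256 bits, which is what the `simd_and` plus
// `simd_reduce_or` above computes. `testz_si256_model` is a hypothetical helper.
fn testz_si256_model(a: [u64; 4], b: [u64; 4]) -> i32 {
    let mut any = 0u64;
    for i in 0..4 {
        any |= a[i] & b[i];
    }
    (any == 0) as i32
}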
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
@@ -1947,7 +2000,10 @@
#[cfg_attr(test, assert_instr(vptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 {
- unsafe { ptestc256(a.as_i64x4(), b.as_i64x4()) }
+ unsafe {
+ let r = simd_and(simd_xor(a.as_i64x4(), i64x4::splat(!0)), b.as_i64x4());
+ (0i64 == simd_reduce_or(r)) as i32
+ }
}
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
@@ -2031,7 +2087,10 @@
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 {
- unsafe { vtestzpd(a, b) }
+ unsafe {
+ let r: i64x2 = simd_lt(transmute(_mm_and_pd(a, b)), i64x2::ZERO);
+ (0i64 == simd_reduce_or(r)) as i32
+ }
}
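// Illustrative scalar model: unlike the integer test, `vtestpd` inspects only the
// sign bit (bit 63) of each 64-bit lane of `a & b`, which is what the `simd_lt`
// against zero above extracts. `testz_pd_model` is a hypothetical helper.
fn testz_pd_model(a: [u64; 2], b: [u64; 2]) -> i32 {
    let signs = ((a[0] & b[0]) >> 63) | ((a[1] & b[1]) >> 63);
    (signs == 0) as i32
}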
/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
@@ -2048,7 +2107,10 @@
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 {
- unsafe { vtestcpd(a, b) }
+ unsafe {
+ let r: i64x2 = simd_lt(transmute(_mm_andnot_pd(a, b)), i64x2::ZERO);
+ (0i64 == simd_reduce_or(r)) as i32
+ }
}
/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
@@ -2135,7 +2197,10 @@
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testz_ps(a: __m128, b: __m128) -> i32 {
- unsafe { vtestzps(a, b) }
+ unsafe {
+ let r: i32x4 = simd_lt(transmute(_mm_and_ps(a, b)), i32x4::ZERO);
+ (0i32 == simd_reduce_or(r)) as i32
+ }
}
/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
@@ -2152,7 +2217,10 @@
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testc_ps(a: __m128, b: __m128) -> i32 {
- unsafe { vtestcps(a, b) }
+ unsafe {
+ let r: i32x4 = simd_lt(transmute(_mm_andnot_ps(a, b)), i32x4::ZERO);
+ (0i32 == simd_reduce_or(r)) as i32
+ }
}
/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
@@ -3044,14 +3112,6 @@
fn roundps256(a: __m256, b: i32) -> __m256;
#[link_name = "llvm.x86.avx.dp.ps.256"]
fn vdpps(a: __m256, b: __m256, imm8: i8) -> __m256;
- #[link_name = "llvm.x86.avx.hadd.pd.256"]
- fn vhaddpd(a: __m256d, b: __m256d) -> __m256d;
- #[link_name = "llvm.x86.avx.hadd.ps.256"]
- fn vhaddps(a: __m256, b: __m256) -> __m256;
- #[link_name = "llvm.x86.avx.hsub.pd.256"]
- fn vhsubpd(a: __m256d, b: __m256d) -> __m256d;
- #[link_name = "llvm.x86.avx.hsub.ps.256"]
- fn vhsubps(a: __m256, b: __m256) -> __m256;
#[link_name = "llvm.x86.sse2.cmp.pd"]
fn vcmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
#[link_name = "llvm.x86.avx.cmp.pd.256"]
@@ -3084,12 +3144,6 @@
fn vpermilpd256(a: __m256d, b: i64x4) -> __m256d;
#[link_name = "llvm.x86.avx.vpermilvar.pd"]
fn vpermilpd(a: __m128d, b: i64x2) -> __m128d;
- #[link_name = "llvm.x86.avx.vperm2f128.ps.256"]
- fn vperm2f128ps256(a: __m256, b: __m256, imm8: i8) -> __m256;
- #[link_name = "llvm.x86.avx.vperm2f128.pd.256"]
- fn vperm2f128pd256(a: __m256d, b: __m256d, imm8: i8) -> __m256d;
- #[link_name = "llvm.x86.avx.vperm2f128.si.256"]
- fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8;
#[link_name = "llvm.x86.avx.maskload.pd.256"]
fn maskloadpd256(mem_addr: *const i8, mask: i64x4) -> __m256d;
#[link_name = "llvm.x86.avx.maskstore.pd.256"]
@@ -3112,10 +3166,6 @@
fn vrcpps(a: __m256) -> __m256;
#[link_name = "llvm.x86.avx.rsqrt.ps.256"]
fn vrsqrtps(a: __m256) -> __m256;
- #[link_name = "llvm.x86.avx.ptestz.256"]
- fn ptestz256(a: i64x4, b: i64x4) -> i32;
- #[link_name = "llvm.x86.avx.ptestc.256"]
- fn ptestc256(a: i64x4, b: i64x4) -> i32;
#[link_name = "llvm.x86.avx.ptestnzc.256"]
fn ptestnzc256(a: i64x4, b: i64x4) -> i32;
#[link_name = "llvm.x86.avx.vtestz.pd.256"]
@@ -3124,10 +3174,6 @@
fn vtestcpd256(a: __m256d, b: __m256d) -> i32;
#[link_name = "llvm.x86.avx.vtestnzc.pd.256"]
fn vtestnzcpd256(a: __m256d, b: __m256d) -> i32;
- #[link_name = "llvm.x86.avx.vtestz.pd"]
- fn vtestzpd(a: __m128d, b: __m128d) -> i32;
- #[link_name = "llvm.x86.avx.vtestc.pd"]
- fn vtestcpd(a: __m128d, b: __m128d) -> i32;
#[link_name = "llvm.x86.avx.vtestnzc.pd"]
fn vtestnzcpd(a: __m128d, b: __m128d) -> i32;
#[link_name = "llvm.x86.avx.vtestz.ps.256"]
@@ -3136,10 +3182,6 @@
fn vtestcps256(a: __m256, b: __m256) -> i32;
#[link_name = "llvm.x86.avx.vtestnzc.ps.256"]
fn vtestnzcps256(a: __m256, b: __m256) -> i32;
- #[link_name = "llvm.x86.avx.vtestz.ps"]
- fn vtestzps(a: __m128, b: __m128) -> i32;
- #[link_name = "llvm.x86.avx.vtestc.ps"]
- fn vtestcps(a: __m128, b: __m128) -> i32;
#[link_name = "llvm.x86.avx.vtestnzc.ps"]
fn vtestnzcps(a: __m128, b: __m128) -> i32;
#[link_name = "llvm.x86.avx.min.ps.256"]
diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs
index 739de2b..8be6629 100644
--- a/crates/core_arch/src/x86/avx2.rs
+++ b/crates/core_arch/src/x86/avx2.rs
@@ -891,7 +891,21 @@
#[cfg_attr(test, assert_instr(vphaddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
- unsafe { transmute(phaddw(a.as_i16x16(), b.as_i16x16())) }
+ let a = a.as_i16x16();
+ let b = b.as_i16x16();
+ unsafe {
+ let even: i16x16 = simd_shuffle!(
+ a,
+ b,
+ [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
+ );
+ let odd: i16x16 = simd_shuffle!(
+ a,
+ b,
+ [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
+ );
+ simd_add(even, odd).as_m256i()
+ }
}
/// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
@@ -902,7 +916,13 @@
#[cfg_attr(test, assert_instr(vphaddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
- unsafe { transmute(phaddd(a.as_i32x8(), b.as_i32x8())) }
+ let a = a.as_i32x8();
+ let b = b.as_i32x8();
+ unsafe {
+ let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
+ let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
+ simd_add(even, odd).as_m256i()
+ }
}
/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
@@ -925,7 +945,21 @@
#[cfg_attr(test, assert_instr(vphsubw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
- unsafe { transmute(phsubw(a.as_i16x16(), b.as_i16x16())) }
+ let a = a.as_i16x16();
+ let b = b.as_i16x16();
+ unsafe {
+ let even: i16x16 = simd_shuffle!(
+ a,
+ b,
+ [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
+ );
+ let odd: i16x16 = simd_shuffle!(
+ a,
+ b,
+ [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
+ );
+ simd_sub(even, odd).as_m256i()
+ }
}
/// Horizontally subtracts adjacent pairs of 32-bit integers in `a` and `b`.
@@ -936,7 +970,13 @@
#[cfg_attr(test, assert_instr(vphsubd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
- unsafe { transmute(phsubd(a.as_i32x8(), b.as_i32x8())) }
+ let a = a.as_i32x8();
+ let b = b.as_i32x8();
+ unsafe {
+ let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
+ let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
+ simd_sub(even, odd).as_m256i()
+ }
}
/// Horizontally subtracts adjacent pairs of 16-bit integers in `a` and `b`
@@ -1714,7 +1754,12 @@
#[cfg_attr(test, assert_instr(vpmaddwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
- unsafe { transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) }
+ unsafe {
+ let r: i32x16 = simd_mul(simd_cast(a.as_i16x16()), simd_cast(b.as_i16x16()));
+ let even: i32x8 = simd_shuffle!(r, r, [0, 2, 4, 6, 8, 10, 12, 14]);
+ let odd: i32x8 = simd_shuffle!(r, r, [1, 3, 5, 7, 9, 11, 13, 15]);
+ simd_add(even, odd).as_m256i()
+ }
}
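// Illustrative scalar model of one output lane of `vpmaddwd`: widen both 16-bit
// inputs to 32 bits, multiply, then add the adjacent products. The sum wraps only
// in the single case where both pairs are (-32768, -32768). `madd_epi16_pair` is a
// hypothetical helper.
fn madd_epi16_pair(a0: i16, a1: i16, b0: i16, b1: i16) -> i32 {
    (a0 as i32 * b0 as i32).wrapping_add(a1 as i32 * b1 as i32)
}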
/// Vertically multiplies each unsigned 8-bit integer from `a` with the
@@ -2285,7 +2330,7 @@
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2x128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
static_assert_uimm_bits!(IMM8, 8);
- unsafe { transmute(vperm2i128(a.as_i64x4(), b.as_i64x4(), IMM8 as i8)) }
+ _mm256_permute2f128_si256::<IMM8>(a, b)
}
/// Shuffles 64-bit floating-point elements in `a` across lanes using the
@@ -2733,7 +2778,7 @@
#[cfg_attr(test, assert_instr(vpsllvd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
- unsafe { transmute(psllvd(a.as_i32x4(), count.as_i32x4())) }
+ unsafe { transmute(simd_shl(a.as_u32x4(), count.as_u32x4())) }
}
/// Shifts packed 32-bit integers in `a` left by the amount
@@ -2746,7 +2791,7 @@
#[cfg_attr(test, assert_instr(vpsllvd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
- unsafe { transmute(psllvd256(a.as_i32x8(), count.as_i32x8())) }
+ unsafe { transmute(simd_shl(a.as_u32x8(), count.as_u32x8())) }
}
/// Shifts packed 64-bit integers in `a` left by the amount
@@ -2759,7 +2804,7 @@
#[cfg_attr(test, assert_instr(vpsllvq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
- unsafe { transmute(psllvq(a.as_i64x2(), count.as_i64x2())) }
+ unsafe { transmute(simd_shl(a.as_u64x2(), count.as_u64x2())) }
}
/// Shifts packed 64-bit integers in `a` left by the amount
@@ -2772,7 +2817,7 @@
#[cfg_attr(test, assert_instr(vpsllvq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
- unsafe { transmute(psllvq256(a.as_i64x4(), count.as_i64x4())) }
+ unsafe { transmute(simd_shl(a.as_u64x4(), count.as_u64x4())) }
}
/// Shifts packed 16-bit integers in `a` right by `count` while
@@ -2836,7 +2881,7 @@
#[cfg_attr(test, assert_instr(vpsravd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
- unsafe { transmute(psravd(a.as_i32x4(), count.as_i32x4())) }
+ unsafe { transmute(simd_shr(a.as_i32x4(), count.as_i32x4())) }
}
/// Shifts packed 32-bit integers in `a` right by the amount specified by the
@@ -2848,7 +2893,7 @@
#[cfg_attr(test, assert_instr(vpsravd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
- unsafe { transmute(psravd256(a.as_i32x8(), count.as_i32x8())) }
+ unsafe { transmute(simd_shr(a.as_i32x8(), count.as_i32x8())) }
}
/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
@@ -3031,7 +3076,7 @@
#[cfg_attr(test, assert_instr(vpsrlvd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
- unsafe { transmute(psrlvd(a.as_i32x4(), count.as_i32x4())) }
+ unsafe { transmute(simd_shr(a.as_u32x4(), count.as_u32x4())) }
}
/// Shifts packed 32-bit integers in `a` right by the amount specified by
@@ -3043,7 +3088,7 @@
#[cfg_attr(test, assert_instr(vpsrlvd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
- unsafe { transmute(psrlvd256(a.as_i32x8(), count.as_i32x8())) }
+ unsafe { transmute(simd_shr(a.as_u32x8(), count.as_u32x8())) }
}
/// Shifts packed 64-bit integers in `a` right by the amount specified by
@@ -3055,7 +3100,7 @@
#[cfg_attr(test, assert_instr(vpsrlvq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
- unsafe { transmute(psrlvq(a.as_i64x2(), count.as_i64x2())) }
+ unsafe { transmute(simd_shr(a.as_u64x2(), count.as_u64x2())) }
}
/// Shifts packed 64-bit integers in `a` right by the amount specified by
@@ -3067,7 +3112,7 @@
#[cfg_attr(test, assert_instr(vpsrlvq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
- unsafe { transmute(psrlvq256(a.as_i64x4(), count.as_i64x4())) }
+ unsafe { transmute(simd_shr(a.as_u64x4(), count.as_u64x4())) }
}
/// Loads 256 bits of integer data from memory into dst using a non-temporal memory hint. mem_addr
@@ -3594,20 +3639,10 @@
#[allow(improper_ctypes)]
unsafe extern "C" {
- #[link_name = "llvm.x86.avx2.phadd.w"]
- fn phaddw(a: i16x16, b: i16x16) -> i16x16;
- #[link_name = "llvm.x86.avx2.phadd.d"]
- fn phaddd(a: i32x8, b: i32x8) -> i32x8;
#[link_name = "llvm.x86.avx2.phadd.sw"]
fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
- #[link_name = "llvm.x86.avx2.phsub.w"]
- fn phsubw(a: i16x16, b: i16x16) -> i16x16;
- #[link_name = "llvm.x86.avx2.phsub.d"]
- fn phsubd(a: i32x8, b: i32x8) -> i32x8;
#[link_name = "llvm.x86.avx2.phsub.sw"]
fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
- #[link_name = "llvm.x86.avx2.pmadd.wd"]
- fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
#[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16;
#[link_name = "llvm.x86.avx2.maskload.d"]
@@ -3652,44 +3687,22 @@
fn pslld(a: i32x8, count: i32x4) -> i32x8;
#[link_name = "llvm.x86.avx2.psll.q"]
fn psllq(a: i64x4, count: i64x2) -> i64x4;
- #[link_name = "llvm.x86.avx2.psllv.d"]
- fn psllvd(a: i32x4, count: i32x4) -> i32x4;
- #[link_name = "llvm.x86.avx2.psllv.d.256"]
- fn psllvd256(a: i32x8, count: i32x8) -> i32x8;
- #[link_name = "llvm.x86.avx2.psllv.q"]
- fn psllvq(a: i64x2, count: i64x2) -> i64x2;
- #[link_name = "llvm.x86.avx2.psllv.q.256"]
- fn psllvq256(a: i64x4, count: i64x4) -> i64x4;
#[link_name = "llvm.x86.avx2.psra.w"]
fn psraw(a: i16x16, count: i16x8) -> i16x16;
#[link_name = "llvm.x86.avx2.psra.d"]
fn psrad(a: i32x8, count: i32x4) -> i32x8;
- #[link_name = "llvm.x86.avx2.psrav.d"]
- fn psravd(a: i32x4, count: i32x4) -> i32x4;
- #[link_name = "llvm.x86.avx2.psrav.d.256"]
- fn psravd256(a: i32x8, count: i32x8) -> i32x8;
#[link_name = "llvm.x86.avx2.psrl.w"]
fn psrlw(a: i16x16, count: i16x8) -> i16x16;
#[link_name = "llvm.x86.avx2.psrl.d"]
fn psrld(a: i32x8, count: i32x4) -> i32x8;
#[link_name = "llvm.x86.avx2.psrl.q"]
fn psrlq(a: i64x4, count: i64x2) -> i64x4;
- #[link_name = "llvm.x86.avx2.psrlv.d"]
- fn psrlvd(a: i32x4, count: i32x4) -> i32x4;
- #[link_name = "llvm.x86.avx2.psrlv.d.256"]
- fn psrlvd256(a: i32x8, count: i32x8) -> i32x8;
- #[link_name = "llvm.x86.avx2.psrlv.q"]
- fn psrlvq(a: i64x2, count: i64x2) -> i64x2;
- #[link_name = "llvm.x86.avx2.psrlv.q.256"]
- fn psrlvq256(a: i64x4, count: i64x4) -> i64x4;
#[link_name = "llvm.x86.avx2.pshuf.b"]
fn pshufb(a: u8x32, b: u8x32) -> u8x32;
#[link_name = "llvm.x86.avx2.permd"]
fn permd(a: u32x8, b: u32x8) -> u32x8;
#[link_name = "llvm.x86.avx2.permps"]
fn permps(a: __m256, b: i32x8) -> __m256;
- #[link_name = "llvm.x86.avx2.vperm2i128"]
- fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4;
#[link_name = "llvm.x86.avx2.gather.d.d"]
fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4;
#[link_name = "llvm.x86.avx2.gather.d.d.256"]
diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs
index 8139b8c..1771f19 100644
--- a/crates/core_arch/src/x86/avx512bw.rs
+++ b/crates/core_arch/src/x86/avx512bw.rs
@@ -5835,7 +5835,20 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmaddwd))]
pub fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i {
- unsafe { transmute(vpmaddwd(a.as_i16x32(), b.as_i16x32())) }
+ unsafe {
+ let r: i32x32 = simd_mul(simd_cast(a.as_i16x32()), simd_cast(b.as_i16x32()));
+ let even: i32x16 = simd_shuffle!(
+ r,
+ r,
+ [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
+ );
+ let odd: i32x16 = simd_shuffle!(
+ r,
+ r,
+ [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]
+ );
+ simd_add(even, odd).as_m512i()
+ }
}
/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -6839,7 +6852,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpsllvw))]
pub fn _mm512_sllv_epi16(a: __m512i, count: __m512i) -> __m512i {
- unsafe { transmute(vpsllvw(a.as_i16x32(), count.as_i16x32())) }
+ unsafe { transmute(simd_shl(a.as_u16x32(), count.as_u16x32())) }
}
/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -6878,7 +6891,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpsllvw))]
pub fn _mm256_sllv_epi16(a: __m256i, count: __m256i) -> __m256i {
- unsafe { transmute(vpsllvw256(a.as_i16x16(), count.as_i16x16())) }
+ unsafe { transmute(simd_shl(a.as_u16x16(), count.as_u16x16())) }
}
/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -6917,7 +6930,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpsllvw))]
pub fn _mm_sllv_epi16(a: __m128i, count: __m128i) -> __m128i {
- unsafe { transmute(vpsllvw128(a.as_i16x8(), count.as_i16x8())) }
+ unsafe { transmute(simd_shl(a.as_u16x8(), count.as_u16x8())) }
}
/// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -7175,7 +7188,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpsrlvw))]
pub fn _mm512_srlv_epi16(a: __m512i, count: __m512i) -> __m512i {
- unsafe { transmute(vpsrlvw(a.as_i16x32(), count.as_i16x32())) }
+ unsafe { transmute(simd_shr(a.as_u16x32(), count.as_u16x32())) }
}
/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -7214,7 +7227,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpsrlvw))]
pub fn _mm256_srlv_epi16(a: __m256i, count: __m256i) -> __m256i {
- unsafe { transmute(vpsrlvw256(a.as_i16x16(), count.as_i16x16())) }
+ unsafe { transmute(simd_shr(a.as_u16x16(), count.as_u16x16())) }
}
/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -7253,7 +7266,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpsrlvw))]
pub fn _mm_srlv_epi16(a: __m128i, count: __m128i) -> __m128i {
- unsafe { transmute(vpsrlvw128(a.as_i16x8(), count.as_i16x8())) }
+ unsafe { transmute(simd_shr(a.as_u16x8(), count.as_u16x8())) }
}
/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -7498,7 +7511,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpsravw))]
pub fn _mm512_srav_epi16(a: __m512i, count: __m512i) -> __m512i {
- unsafe { transmute(vpsravw(a.as_i16x32(), count.as_i16x32())) }
+ unsafe { transmute(simd_shr(a.as_i16x32(), count.as_i16x32())) }
}
/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -7537,7 +7550,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpsravw))]
pub fn _mm256_srav_epi16(a: __m256i, count: __m256i) -> __m256i {
- unsafe { transmute(vpsravw256(a.as_i16x16(), count.as_i16x16())) }
+ unsafe { transmute(simd_shr(a.as_i16x16(), count.as_i16x16())) }
}
/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -7576,7 +7589,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpsravw))]
pub fn _mm_srav_epi16(a: __m128i, count: __m128i) -> __m128i {
- unsafe { transmute(vpsravw128(a.as_i16x8(), count.as_i16x8())) }
+ unsafe { transmute(simd_shr(a.as_i16x8(), count.as_i16x8())) }
}
/// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -11617,8 +11630,6 @@
#[link_name = "llvm.x86.avx512.pmul.hr.sw.512"]
fn vpmulhrsw(a: i16x32, b: i16x32) -> i16x32;
- #[link_name = "llvm.x86.avx512.pmaddw.d.512"]
- fn vpmaddwd(a: i16x32, b: i16x32) -> i32x16;
#[link_name = "llvm.x86.avx512.pmaddubs.w.512"]
fn vpmaddubsw(a: i8x64, b: i8x64) -> i16x32;
@@ -11634,33 +11645,12 @@
#[link_name = "llvm.x86.avx512.psll.w.512"]
fn vpsllw(a: i16x32, count: i16x8) -> i16x32;
- #[link_name = "llvm.x86.avx512.psllv.w.512"]
- fn vpsllvw(a: i16x32, b: i16x32) -> i16x32;
- #[link_name = "llvm.x86.avx512.psllv.w.256"]
- fn vpsllvw256(a: i16x16, b: i16x16) -> i16x16;
- #[link_name = "llvm.x86.avx512.psllv.w.128"]
- fn vpsllvw128(a: i16x8, b: i16x8) -> i16x8;
-
#[link_name = "llvm.x86.avx512.psrl.w.512"]
fn vpsrlw(a: i16x32, count: i16x8) -> i16x32;
- #[link_name = "llvm.x86.avx512.psrlv.w.512"]
- fn vpsrlvw(a: i16x32, b: i16x32) -> i16x32;
- #[link_name = "llvm.x86.avx512.psrlv.w.256"]
- fn vpsrlvw256(a: i16x16, b: i16x16) -> i16x16;
- #[link_name = "llvm.x86.avx512.psrlv.w.128"]
- fn vpsrlvw128(a: i16x8, b: i16x8) -> i16x8;
-
#[link_name = "llvm.x86.avx512.psra.w.512"]
fn vpsraw(a: i16x32, count: i16x8) -> i16x32;
- #[link_name = "llvm.x86.avx512.psrav.w.512"]
- fn vpsravw(a: i16x32, count: i16x32) -> i16x32;
- #[link_name = "llvm.x86.avx512.psrav.w.256"]
- fn vpsravw256(a: i16x16, count: i16x16) -> i16x16;
- #[link_name = "llvm.x86.avx512.psrav.w.128"]
- fn vpsravw128(a: i16x8, count: i16x8) -> i16x8;
-
#[link_name = "llvm.x86.avx512.vpermi2var.hi.512"]
fn vpermi2w(a: i16x32, idx: i16x32, b: i16x32) -> i16x32;
#[link_name = "llvm.x86.avx512.vpermi2var.hi.256"]
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 52c6a11..002534a 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -19077,12 +19077,8 @@
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm512_rol_epi32<const IMM8: i32>(a: __m512i) -> __m512i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i32x16();
- let r = vprold(a, IMM8);
- transmute(r)
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm512_rolv_epi32(a, _mm512_set1_epi32(IMM8))
}
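// Illustrative scalar model of one 32-bit lane: the immediate rotate is simply the
// variable rotate with every lane count set to IMM8, with the count taken modulo 32
// as for vprold. `rol32_model` is a hypothetical helper.
fn rol32_model(x: u32, imm8: u32) -> u32 {
    x.rotate_left(imm8 % 32)
}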
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -19094,12 +19090,8 @@
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub fn _mm512_mask_rol_epi32<const IMM8: i32>(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i32x16();
- let r = vprold(a, IMM8);
- transmute(simd_select_bitmask(k, r, src.as_i32x16()))
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm512_mask_rolv_epi32(src, k, a, _mm512_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19111,12 +19103,8 @@
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub fn _mm512_maskz_rol_epi32<const IMM8: i32>(k: __mmask16, a: __m512i) -> __m512i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i32x16();
- let r = vprold(a, IMM8);
- transmute(simd_select_bitmask(k, r, i32x16::ZERO))
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm512_maskz_rolv_epi32(k, a, _mm512_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
@@ -19128,12 +19116,8 @@
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm256_rol_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i32x8();
- let r = vprold256(a, IMM8);
- transmute(r)
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm256_rolv_epi32(a, _mm256_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -19145,12 +19129,8 @@
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub fn _mm256_mask_rol_epi32<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i32x8();
- let r = vprold256(a, IMM8);
- transmute(simd_select_bitmask(k, r, src.as_i32x8()))
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm256_mask_rolv_epi32(src, k, a, _mm256_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19162,12 +19142,8 @@
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub fn _mm256_maskz_rol_epi32<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i32x8();
- let r = vprold256(a, IMM8);
- transmute(simd_select_bitmask(k, r, i32x8::ZERO))
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm256_maskz_rolv_epi32(k, a, _mm256_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
@@ -19179,12 +19155,8 @@
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm_rol_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i32x4();
- let r = vprold128(a, IMM8);
- transmute(r)
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm_rolv_epi32(a, _mm_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -19196,12 +19168,8 @@
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub fn _mm_mask_rol_epi32<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i32x4();
- let r = vprold128(a, IMM8);
- transmute(simd_select_bitmask(k, r, src.as_i32x4()))
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm_mask_rolv_epi32(src, k, a, _mm_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19213,12 +19181,8 @@
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub fn _mm_maskz_rol_epi32<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i32x4();
- let r = vprold128(a, IMM8);
- transmute(simd_select_bitmask(k, r, i32x4::ZERO))
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm_maskz_rolv_epi32(k, a, _mm_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
@@ -19230,12 +19194,8 @@
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm512_ror_epi32<const IMM8: i32>(a: __m512i) -> __m512i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i32x16();
- let r = vprord(a, IMM8);
- transmute(r)
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm512_rorv_epi32(a, _mm512_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -19247,12 +19207,8 @@
#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
#[rustc_legacy_const_generics(3)]
pub fn _mm512_mask_ror_epi32<const IMM8: i32>(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i32x16();
- let r = vprord(a, IMM8);
- transmute(simd_select_bitmask(k, r, src.as_i32x16()))
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm512_mask_rorv_epi32(src, k, a, _mm512_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19264,12 +19220,8 @@
#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
#[rustc_legacy_const_generics(2)]
pub fn _mm512_maskz_ror_epi32<const IMM8: i32>(k: __mmask16, a: __m512i) -> __m512i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i32x16();
- let r = vprord(a, IMM8);
- transmute(simd_select_bitmask(k, r, i32x16::ZERO))
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm512_maskz_rorv_epi32(k, a, _mm512_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
@@ -19281,12 +19233,8 @@
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm256_ror_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i32x8();
- let r = vprord256(a, IMM8);
- transmute(r)
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm256_rorv_epi32(a, _mm256_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -19298,12 +19246,8 @@
#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
#[rustc_legacy_const_generics(3)]
pub fn _mm256_mask_ror_epi32<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i32x8();
- let r = vprord256(a, IMM8);
- transmute(simd_select_bitmask(k, r, src.as_i32x8()))
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm256_mask_rorv_epi32(src, k, a, _mm256_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19315,12 +19259,8 @@
#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
#[rustc_legacy_const_generics(2)]
pub fn _mm256_maskz_ror_epi32<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i32x8();
- let r = vprord256(a, IMM8);
- transmute(simd_select_bitmask(k, r, i32x8::ZERO))
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm256_maskz_rorv_epi32(k, a, _mm256_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
@@ -19332,12 +19272,8 @@
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm_ror_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i32x4();
- let r = vprord128(a, IMM8);
- transmute(r)
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm_rorv_epi32(a, _mm_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -19349,12 +19285,8 @@
#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
#[rustc_legacy_const_generics(3)]
pub fn _mm_mask_ror_epi32<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i32x4();
- let r = vprord128(a, IMM8);
- transmute(simd_select_bitmask(k, r, src.as_i32x4()))
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm_mask_rorv_epi32(src, k, a, _mm_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19366,12 +19298,8 @@
#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
#[rustc_legacy_const_generics(2)]
pub fn _mm_maskz_ror_epi32<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i32x4();
- let r = vprord128(a, IMM8);
- transmute(simd_select_bitmask(k, r, i32x4::ZERO))
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm_maskz_rorv_epi32(k, a, _mm_set1_epi32(IMM8))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
@@ -19383,12 +19311,8 @@
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm512_rol_epi64<const IMM8: i32>(a: __m512i) -> __m512i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i64x8();
- let r = vprolq(a, IMM8);
- transmute(r)
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm512_rolv_epi64(a, _mm512_set1_epi64(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -19400,12 +19324,8 @@
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub fn _mm512_mask_rol_epi64<const IMM8: i32>(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i64x8();
- let r = vprolq(a, IMM8);
- transmute(simd_select_bitmask(k, r, src.as_i64x8()))
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm512_mask_rolv_epi64(src, k, a, _mm512_set1_epi64(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19417,12 +19337,8 @@
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub fn _mm512_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m512i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i64x8();
- let r = vprolq(a, IMM8);
- transmute(simd_select_bitmask(k, r, i64x8::ZERO))
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm512_maskz_rolv_epi64(k, a, _mm512_set1_epi64(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
@@ -19434,12 +19350,8 @@
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm256_rol_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i64x4();
- let r = vprolq256(a, IMM8);
- transmute(r)
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm256_rolv_epi64(a, _mm256_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -19451,12 +19363,8 @@
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub fn _mm256_mask_rol_epi64<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i64x4();
- let r = vprolq256(a, IMM8);
- transmute(simd_select_bitmask(k, r, src.as_i64x4()))
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm256_mask_rolv_epi64(src, k, a, _mm256_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19468,12 +19376,8 @@
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub fn _mm256_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i64x4();
- let r = vprolq256(a, IMM8);
- transmute(simd_select_bitmask(k, r, i64x4::ZERO))
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm256_maskz_rolv_epi64(k, a, _mm256_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
@@ -19485,12 +19389,8 @@
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub fn _mm_rol_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i64x2();
- let r = vprolq128(a, IMM8);
- transmute(r)
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm_rolv_epi64(a, _mm_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -19502,12 +19402,8 @@
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub fn _mm_mask_rol_epi64<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i64x2();
- let r = vprolq128(a, IMM8);
- transmute(simd_select_bitmask(k, r, src.as_i64x2()))
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm_mask_rolv_epi64(src, k, a, _mm_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19519,12 +19415,8 @@
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub fn _mm_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i64x2();
- let r = vprolq128(a, IMM8);
- transmute(simd_select_bitmask(k, r, i64x2::ZERO))
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm_maskz_rolv_epi64(k, a, _mm_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
@@ -19536,12 +19428,8 @@
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(1)]
pub fn _mm512_ror_epi64<const IMM8: i32>(a: __m512i) -> __m512i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i64x8();
- let r = vprorq(a, IMM8);
- transmute(r)
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm512_rorv_epi64(a, _mm512_set1_epi64(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -19553,12 +19441,8 @@
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(3)]
pub fn _mm512_mask_ror_epi64<const IMM8: i32>(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i64x8();
- let r = vprorq(a, IMM8);
- transmute(simd_select_bitmask(k, r, src.as_i64x8()))
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm512_mask_rorv_epi64(src, k, a, _mm512_set1_epi64(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19570,12 +19454,8 @@
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(2)]
pub fn _mm512_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m512i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i64x8();
- let r = vprorq(a, IMM8);
- transmute(simd_select_bitmask(k, r, i64x8::ZERO))
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm512_maskz_rorv_epi64(k, a, _mm512_set1_epi64(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
@@ -19587,12 +19467,8 @@
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(1)]
pub fn _mm256_ror_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i64x4();
- let r = vprorq256(a, IMM8);
- transmute(r)
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm256_rorv_epi64(a, _mm256_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -19604,12 +19480,8 @@
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(3)]
pub fn _mm256_mask_ror_epi64<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i64x4();
- let r = vprorq256(a, IMM8);
- transmute(simd_select_bitmask(k, r, src.as_i64x4()))
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm256_mask_rorv_epi64(src, k, a, _mm256_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19621,12 +19493,8 @@
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(2)]
pub fn _mm256_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i64x4();
- let r = vprorq256(a, IMM8);
- transmute(simd_select_bitmask(k, r, i64x4::ZERO))
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm256_maskz_rorv_epi64(k, a, _mm256_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
@@ -19638,12 +19506,8 @@
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(1)]
pub fn _mm_ror_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i64x2();
- let r = vprorq128(a, IMM8);
- transmute(r)
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm_rorv_epi64(a, _mm_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -19655,12 +19519,8 @@
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(3)]
pub fn _mm_mask_ror_epi64<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i64x2();
- let r = vprorq128(a, IMM8);
- transmute(simd_select_bitmask(k, r, src.as_i64x2()))
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm_mask_rorv_epi64(src, k, a, _mm_set1_epi64x(IMM8 as i64))
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19672,12 +19532,8 @@
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(2)]
pub fn _mm_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
- unsafe {
- static_assert_uimm_bits!(IMM8, 8);
- let a = a.as_i64x2();
- let r = vprorq128(a, IMM8);
- transmute(simd_select_bitmask(k, r, i64x2::ZERO))
- }
+ static_assert_uimm_bits!(IMM8, 8);
+ _mm_maskz_rorv_epi64(k, a, _mm_set1_epi64x(IMM8 as i64))
}
/// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.
@@ -21084,7 +20940,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpsravd))]
pub fn _mm512_srav_epi32(a: __m512i, count: __m512i) -> __m512i {
- unsafe { transmute(vpsravd(a.as_i32x16(), count.as_i32x16())) }
+ unsafe { transmute(simd_shr(a.as_i32x16(), count.as_i32x16())) }
}
/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21179,7 +21035,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpsravq))]
pub fn _mm512_srav_epi64(a: __m512i, count: __m512i) -> __m512i {
- unsafe { transmute(vpsravq(a.as_i64x8(), count.as_i64x8())) }
+ unsafe { transmute(simd_shr(a.as_i64x8(), count.as_i64x8())) }
}
/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21218,7 +21074,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpsravq))]
pub fn _mm256_srav_epi64(a: __m256i, count: __m256i) -> __m256i {
- unsafe { transmute(vpsravq256(a.as_i64x4(), count.as_i64x4())) }
+ unsafe { transmute(simd_shr(a.as_i64x4(), count.as_i64x4())) }
}
/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21257,7 +21113,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpsravq))]
pub fn _mm_srav_epi64(a: __m128i, count: __m128i) -> __m128i {
- unsafe { transmute(vpsravq128(a.as_i64x2(), count.as_i64x2())) }
+ unsafe { transmute(simd_shr(a.as_i64x2(), count.as_i64x2())) }
}
/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21296,7 +21152,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprolvd))]
pub fn _mm512_rolv_epi32(a: __m512i, b: __m512i) -> __m512i {
- unsafe { transmute(vprolvd(a.as_i32x16(), b.as_i32x16())) }
+ unsafe { transmute(simd_funnel_shl(a.as_u32x16(), a.as_u32x16(), b.as_u32x16())) }
}
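// Illustrative check that a funnel shift left of (x, x) by n is a rotate left: the
// bits shifted out at the top are refilled from the identical low operand, matching
// the `simd_funnel_shl(a, a, b)` form above. `funnel_shl_is_rotl` is a hypothetical
// helper for exposition only.
fn funnel_shl_is_rotl(x: u32, n: u32) -> bool {
    let n = n % 32;
    let funneled = if n == 0 { x } else { (x << n) | (x >> (32 - n)) };
    funneled == x.rotate_left(n)
}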
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21335,7 +21191,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprolvd))]
pub fn _mm256_rolv_epi32(a: __m256i, b: __m256i) -> __m256i {
- unsafe { transmute(vprolvd256(a.as_i32x8(), b.as_i32x8())) }
+ unsafe { transmute(simd_funnel_shl(a.as_u32x8(), a.as_u32x8(), b.as_u32x8())) }
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21374,7 +21230,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprolvd))]
pub fn _mm_rolv_epi32(a: __m128i, b: __m128i) -> __m128i {
- unsafe { transmute(vprolvd128(a.as_i32x4(), b.as_i32x4())) }
+ unsafe { transmute(simd_funnel_shl(a.as_u32x4(), a.as_u32x4(), b.as_u32x4())) }
}
/// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21413,7 +21269,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprorvd))]
pub fn _mm512_rorv_epi32(a: __m512i, b: __m512i) -> __m512i {
- unsafe { transmute(vprorvd(a.as_i32x16(), b.as_i32x16())) }
+ unsafe { transmute(simd_funnel_shr(a.as_u32x16(), a.as_u32x16(), b.as_u32x16())) }
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21452,7 +21308,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprorvd))]
pub fn _mm256_rorv_epi32(a: __m256i, b: __m256i) -> __m256i {
- unsafe { transmute(vprorvd256(a.as_i32x8(), b.as_i32x8())) }
+ unsafe { transmute(simd_funnel_shr(a.as_u32x8(), a.as_u32x8(), b.as_u32x8())) }
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21491,7 +21347,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprorvd))]
pub fn _mm_rorv_epi32(a: __m128i, b: __m128i) -> __m128i {
- unsafe { transmute(vprorvd128(a.as_i32x4(), b.as_i32x4())) }
+ unsafe { transmute(simd_funnel_shr(a.as_u32x4(), a.as_u32x4(), b.as_u32x4())) }
}
/// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21530,7 +21386,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprolvq))]
pub fn _mm512_rolv_epi64(a: __m512i, b: __m512i) -> __m512i {
- unsafe { transmute(vprolvq(a.as_i64x8(), b.as_i64x8())) }
+ unsafe { transmute(simd_funnel_shl(a.as_u64x8(), a.as_u64x8(), b.as_u64x8())) }
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21569,7 +21425,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprolvq))]
pub fn _mm256_rolv_epi64(a: __m256i, b: __m256i) -> __m256i {
- unsafe { transmute(vprolvq256(a.as_i64x4(), b.as_i64x4())) }
+ unsafe { transmute(simd_funnel_shl(a.as_u64x4(), a.as_u64x4(), b.as_u64x4())) }
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21608,7 +21464,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprolvq))]
pub fn _mm_rolv_epi64(a: __m128i, b: __m128i) -> __m128i {
- unsafe { transmute(vprolvq128(a.as_i64x2(), b.as_i64x2())) }
+ unsafe { transmute(simd_funnel_shl(a.as_u64x2(), a.as_u64x2(), b.as_u64x2())) }
}
/// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21647,7 +21503,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprorvq))]
pub fn _mm512_rorv_epi64(a: __m512i, b: __m512i) -> __m512i {
- unsafe { transmute(vprorvq(a.as_i64x8(), b.as_i64x8())) }
+ unsafe { transmute(simd_funnel_shr(a.as_u64x8(), a.as_u64x8(), b.as_u64x8())) }
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21686,7 +21542,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprorvq))]
pub fn _mm256_rorv_epi64(a: __m256i, b: __m256i) -> __m256i {
- unsafe { transmute(vprorvq256(a.as_i64x4(), b.as_i64x4())) }
+ unsafe { transmute(simd_funnel_shr(a.as_u64x4(), a.as_u64x4(), b.as_u64x4())) }
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21725,7 +21581,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vprorvq))]
pub fn _mm_rorv_epi64(a: __m128i, b: __m128i) -> __m128i {
- unsafe { transmute(vprorvq128(a.as_i64x2(), b.as_i64x2())) }
+ unsafe { transmute(simd_funnel_shr(a.as_u64x2(), a.as_u64x2(), b.as_u64x2())) }
}
/// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21764,7 +21620,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpsllvd))]
pub fn _mm512_sllv_epi32(a: __m512i, count: __m512i) -> __m512i {
- unsafe { transmute(vpsllvd(a.as_i32x16(), count.as_i32x16())) }
+ unsafe { transmute(simd_shl(a.as_u32x16(), count.as_u32x16())) }
}
/// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21859,7 +21715,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpsrlvd))]
pub fn _mm512_srlv_epi32(a: __m512i, count: __m512i) -> __m512i {
- unsafe { transmute(vpsrlvd(a.as_i32x16(), count.as_i32x16())) }
+ unsafe { transmute(simd_shr(a.as_u32x16(), count.as_u32x16())) }
}
/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21954,7 +21810,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpsllvq))]
pub fn _mm512_sllv_epi64(a: __m512i, count: __m512i) -> __m512i {
- unsafe { transmute(vpsllvq(a.as_i64x8(), count.as_i64x8())) }
+ unsafe { transmute(simd_shl(a.as_u64x8(), count.as_u64x8())) }
}
/// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -22049,7 +21905,7 @@
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpsrlvq))]
pub fn _mm512_srlv_epi64(a: __m512i, count: __m512i) -> __m512i {
- unsafe { transmute(vpsrlvq(a.as_i64x8(), count.as_i64x8())) }
+ unsafe { transmute(simd_shr(a.as_u64x8(), count.as_u64x8())) }
}
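
Editor's note: the documented behaviour of the variable shifts replaced above (vpsllvd/vpsrlvd and their 64-bit counterparts) is that each lane is shifted by its own count and any lane whose count is at least the element width becomes zero. The sketch below models one 32-bit lane of the instruction semantics; it is not the `simd_shl`/`simd_shr` code path itself, and the names are illustrative:

// Per-lane model of vpsllvd / vpsrlvd: out-of-range counts zero the lane.
fn sllv_lane(a: u32, count: u32) -> u32 {
    if count < 32 { a << count } else { 0 }
}
fn srlv_lane(a: u32, count: u32) -> u32 {
    if count < 32 { a >> count } else { 0 }
}

fn main() {
    assert_eq!(sllv_lane(1, 31), 1 << 31);
    assert_eq!(sllv_lane(1, 32), 0); // count == width zeroes the lane
    assert_eq!(srlv_lane(u32::MAX, 40), 0);
}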
/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -42902,71 +42758,6 @@
#[link_name = "llvm.x86.avx512.mask.cmp.pd.128"]
fn vcmppd128(a: f64x2, b: f64x2, op: i32, m: i8) -> i8;
- #[link_name = "llvm.x86.avx512.mask.prol.d.512"]
- fn vprold(a: i32x16, i8: i32) -> i32x16;
- #[link_name = "llvm.x86.avx512.mask.prol.d.256"]
- fn vprold256(a: i32x8, i8: i32) -> i32x8;
- #[link_name = "llvm.x86.avx512.mask.prol.d.128"]
- fn vprold128(a: i32x4, i8: i32) -> i32x4;
-
- #[link_name = "llvm.x86.avx512.mask.pror.d.512"]
- fn vprord(a: i32x16, i8: i32) -> i32x16;
- #[link_name = "llvm.x86.avx512.mask.pror.d.256"]
- fn vprord256(a: i32x8, i8: i32) -> i32x8;
- #[link_name = "llvm.x86.avx512.mask.pror.d.128"]
- fn vprord128(a: i32x4, i8: i32) -> i32x4;
-
- #[link_name = "llvm.x86.avx512.mask.prol.q.512"]
- fn vprolq(a: i64x8, i8: i32) -> i64x8;
- #[link_name = "llvm.x86.avx512.mask.prol.q.256"]
- fn vprolq256(a: i64x4, i8: i32) -> i64x4;
- #[link_name = "llvm.x86.avx512.mask.prol.q.128"]
- fn vprolq128(a: i64x2, i8: i32) -> i64x2;
-
- #[link_name = "llvm.x86.avx512.mask.pror.q.512"]
- fn vprorq(a: i64x8, i8: i32) -> i64x8;
- #[link_name = "llvm.x86.avx512.mask.pror.q.256"]
- fn vprorq256(a: i64x4, i8: i32) -> i64x4;
- #[link_name = "llvm.x86.avx512.mask.pror.q.128"]
- fn vprorq128(a: i64x2, i8: i32) -> i64x2;
-
- #[link_name = "llvm.x86.avx512.mask.prolv.d.512"]
- fn vprolvd(a: i32x16, b: i32x16) -> i32x16;
- #[link_name = "llvm.x86.avx512.mask.prolv.d.256"]
- fn vprolvd256(a: i32x8, b: i32x8) -> i32x8;
- #[link_name = "llvm.x86.avx512.mask.prolv.d.128"]
- fn vprolvd128(a: i32x4, b: i32x4) -> i32x4;
-
- #[link_name = "llvm.x86.avx512.mask.prorv.d.512"]
- fn vprorvd(a: i32x16, b: i32x16) -> i32x16;
- #[link_name = "llvm.x86.avx512.mask.prorv.d.256"]
- fn vprorvd256(a: i32x8, b: i32x8) -> i32x8;
- #[link_name = "llvm.x86.avx512.mask.prorv.d.128"]
- fn vprorvd128(a: i32x4, b: i32x4) -> i32x4;
-
- #[link_name = "llvm.x86.avx512.mask.prolv.q.512"]
- fn vprolvq(a: i64x8, b: i64x8) -> i64x8;
- #[link_name = "llvm.x86.avx512.mask.prolv.q.256"]
- fn vprolvq256(a: i64x4, b: i64x4) -> i64x4;
- #[link_name = "llvm.x86.avx512.mask.prolv.q.128"]
- fn vprolvq128(a: i64x2, b: i64x2) -> i64x2;
-
- #[link_name = "llvm.x86.avx512.mask.prorv.q.512"]
- fn vprorvq(a: i64x8, b: i64x8) -> i64x8;
- #[link_name = "llvm.x86.avx512.mask.prorv.q.256"]
- fn vprorvq256(a: i64x4, b: i64x4) -> i64x4;
- #[link_name = "llvm.x86.avx512.mask.prorv.q.128"]
- fn vprorvq128(a: i64x2, b: i64x2) -> i64x2;
-
- #[link_name = "llvm.x86.avx512.psllv.d.512"]
- fn vpsllvd(a: i32x16, b: i32x16) -> i32x16;
- #[link_name = "llvm.x86.avx512.psrlv.d.512"]
- fn vpsrlvd(a: i32x16, b: i32x16) -> i32x16;
- #[link_name = "llvm.x86.avx512.psllv.q.512"]
- fn vpsllvq(a: i64x8, b: i64x8) -> i64x8;
- #[link_name = "llvm.x86.avx512.psrlv.q.512"]
- fn vpsrlvq(a: i64x8, b: i64x8) -> i64x8;
-
#[link_name = "llvm.x86.avx512.psll.d.512"]
fn vpslld(a: i32x16, count: i32x4) -> i32x16;
#[link_name = "llvm.x86.avx512.psrl.d.512"]
@@ -42986,16 +42777,6 @@
#[link_name = "llvm.x86.avx512.psra.q.128"]
fn vpsraq128(a: i64x2, count: i64x2) -> i64x2;
- #[link_name = "llvm.x86.avx512.psrav.d.512"]
- fn vpsravd(a: i32x16, count: i32x16) -> i32x16;
-
- #[link_name = "llvm.x86.avx512.psrav.q.512"]
- fn vpsravq(a: i64x8, count: i64x8) -> i64x8;
- #[link_name = "llvm.x86.avx512.psrav.q.256"]
- fn vpsravq256(a: i64x4, count: i64x4) -> i64x4;
- #[link_name = "llvm.x86.avx512.psrav.q.128"]
- fn vpsravq128(a: i64x2, count: i64x2) -> i64x2;
-
#[link_name = "llvm.x86.avx512.vpermilvar.ps.512"]
fn vpermilps(a: f32x16, b: i32x16) -> f32x16;
#[link_name = "llvm.x86.avx512.vpermilvar.pd.512"]
diff --git a/crates/core_arch/src/x86/avx512fp16.rs b/crates/core_arch/src/x86/avx512fp16.rs
index a86fc71..a8cf1f2 100644
--- a/crates/core_arch/src/x86/avx512fp16.rs
+++ b/crates/core_arch/src/x86/avx512fp16.rs
@@ -1615,7 +1615,7 @@
#[cfg_attr(test, assert_instr(vaddsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h {
- _mm_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
+ unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) + _mm_cvtsh_h(b)) }
}
/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
@@ -1628,7 +1628,16 @@
#[cfg_attr(test, assert_instr(vaddsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
- _mm_mask_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+ unsafe {
+ let extractsrc: f16 = simd_extract!(src, 0);
+ let mut add: f16 = extractsrc;
+ if (k & 0b00000001) != 0 {
+ let extracta: f16 = simd_extract!(a, 0);
+ let extractb: f16 = simd_extract!(b, 0);
+ add = extracta + extractb;
+ }
+ simd_insert!(a, 0, add)
+ }
}
/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
@@ -1641,7 +1650,15 @@
#[cfg_attr(test, assert_instr(vaddsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_add_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
- _mm_maskz_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
+ unsafe {
+ let mut add: f16 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f16 = simd_extract!(a, 0);
+ let extractb: f16 = simd_extract!(b, 0);
+ add = extracta + extractb;
+ }
+ simd_insert!(a, 0, add)
+ }
}
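
Editor's note: each scalar fp16 intrinsic above follows the same mask pattern. Bit 0 of `k` decides whether lane 0 of the result holds the arithmetic result, the corresponding lane of `src` (mask form), or zero (maskz form); the upper lanes are always copied from `a`. A sketch of that selection logic, written with `f32` because the `f16` primitive is not yet stable, with illustrative names:

// Selection performed on lane 0 by the mask/maskz scalar forms.
fn mask_op_lane0(src: f32, k: u8, a: f32, b: f32, op: impl Fn(f32, f32) -> f32) -> f32 {
    if k & 1 != 0 { op(a, b) } else { src } // keep src when mask bit 0 is clear
}
fn maskz_op_lane0(k: u8, a: f32, b: f32, op: impl Fn(f32, f32) -> f32) -> f32 {
    if k & 1 != 0 { op(a, b) } else { 0.0 } // zero when mask bit 0 is clear
}

fn main() {
    assert_eq!(mask_op_lane0(9.0, 0, 1.5, 2.5, |x, y| x + y), 9.0);
    assert_eq!(maskz_op_lane0(1, 1.5, 2.5, |x, y| x + y), 4.0);
}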
/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
@@ -1927,7 +1944,7 @@
#[cfg_attr(test, assert_instr(vsubsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h {
- _mm_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
+ unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) - _mm_cvtsh_h(b)) }
}
/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
@@ -1940,7 +1957,16 @@
#[cfg_attr(test, assert_instr(vsubsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
- _mm_mask_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+ unsafe {
+ let extractsrc: f16 = simd_extract!(src, 0);
+ let mut add: f16 = extractsrc;
+ if (k & 0b00000001) != 0 {
+ let extracta: f16 = simd_extract!(a, 0);
+ let extractb: f16 = simd_extract!(b, 0);
+ add = extracta - extractb;
+ }
+ simd_insert!(a, 0, add)
+ }
}
/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
@@ -1953,7 +1979,15 @@
#[cfg_attr(test, assert_instr(vsubsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_sub_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
- _mm_maskz_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
+ unsafe {
+ let mut add: f16 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f16 = simd_extract!(a, 0);
+ let extractb: f16 = simd_extract!(b, 0);
+ add = extracta - extractb;
+ }
+ simd_insert!(a, 0, add)
+ }
}
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
@@ -2239,7 +2273,7 @@
#[cfg_attr(test, assert_instr(vmulsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h {
- _mm_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
+ unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) * _mm_cvtsh_h(b)) }
}
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
@@ -2252,7 +2286,16 @@
#[cfg_attr(test, assert_instr(vmulsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
- _mm_mask_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+ unsafe {
+ let extractsrc: f16 = simd_extract!(src, 0);
+ let mut add: f16 = extractsrc;
+ if (k & 0b00000001) != 0 {
+ let extracta: f16 = simd_extract!(a, 0);
+ let extractb: f16 = simd_extract!(b, 0);
+ add = extracta * extractb;
+ }
+ simd_insert!(a, 0, add)
+ }
}
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
@@ -2265,7 +2308,15 @@
#[cfg_attr(test, assert_instr(vmulsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_mul_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
- _mm_maskz_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
+ unsafe {
+ let mut add: f16 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f16 = simd_extract!(a, 0);
+ let extractb: f16 = simd_extract!(b, 0);
+ add = extracta * extractb;
+ }
+ simd_insert!(a, 0, add)
+ }
}
/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
@@ -2551,7 +2602,7 @@
#[cfg_attr(test, assert_instr(vdivsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h {
- _mm_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
+ unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) / _mm_cvtsh_h(b)) }
}
/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
@@ -2564,7 +2615,16 @@
#[cfg_attr(test, assert_instr(vdivsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
- _mm_mask_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+ unsafe {
+ let extractsrc: f16 = simd_extract!(src, 0);
+ let mut add: f16 = extractsrc;
+ if (k & 0b00000001) != 0 {
+ let extracta: f16 = simd_extract!(a, 0);
+ let extractb: f16 = simd_extract!(b, 0);
+ add = extracta / extractb;
+ }
+ simd_insert!(a, 0, add)
+ }
}
/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
@@ -2577,7 +2637,15 @@
#[cfg_attr(test, assert_instr(vdivsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_div_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
- _mm_maskz_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
+ unsafe {
+ let mut add: f16 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f16 = simd_extract!(a, 0);
+ let extractb: f16 = simd_extract!(b, 0);
+ add = extracta / extractb;
+ }
+ simd_insert!(a, 0, add)
+ }
}
/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
diff --git a/crates/core_arch/src/x86/f16c.rs b/crates/core_arch/src/x86/f16c.rs
index 7686b31..519cc38 100644
--- a/crates/core_arch/src/x86/f16c.rs
+++ b/crates/core_arch/src/x86/f16c.rs
@@ -3,16 +3,13 @@
//! [F16C intrinsics]: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=fp16&expand=1769
use crate::core_arch::{simd::*, x86::*};
+use crate::intrinsics::simd::*;
#[cfg(test)]
use stdarch_test::assert_instr;
#[allow(improper_ctypes)]
unsafe extern "unadjusted" {
- #[link_name = "llvm.x86.vcvtph2ps.128"]
- fn llvm_vcvtph2ps_128(a: i16x8) -> f32x4;
- #[link_name = "llvm.x86.vcvtph2ps.256"]
- fn llvm_vcvtph2ps_256(a: i16x8) -> f32x8;
#[link_name = "llvm.x86.vcvtps2ph.128"]
fn llvm_vcvtps2ph_128(a: f32x4, rounding: i32) -> i16x8;
#[link_name = "llvm.x86.vcvtps2ph.256"]
@@ -29,7 +26,11 @@
#[cfg_attr(test, assert_instr("vcvtph2ps"))]
#[stable(feature = "x86_f16c_intrinsics", since = "1.68.0")]
pub fn _mm_cvtph_ps(a: __m128i) -> __m128 {
- unsafe { transmute(llvm_vcvtph2ps_128(transmute(a))) }
+ unsafe {
+ let a: f16x8 = transmute(a);
+ let a: f16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
+ simd_cast(a)
+ }
}
/// Converts the 8 x 16-bit half-precision float values in the 128-bit vector
@@ -41,7 +42,10 @@
#[cfg_attr(test, assert_instr("vcvtph2ps"))]
#[stable(feature = "x86_f16c_intrinsics", since = "1.68.0")]
pub fn _mm256_cvtph_ps(a: __m128i) -> __m256 {
- unsafe { transmute(llvm_vcvtph2ps_256(transmute(a))) }
+ unsafe {
+ let a: f16x8 = transmute(a);
+ simd_cast(a)
+ }
}
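
Editor's note: vcvtph2ps, and the `simd_cast` that replaces it, widens each IEEE 754 binary16 lane to binary32, and every half-precision value is exactly representable as an f32. The sketch below decodes one lane from its raw 16-bit pattern, since the `f16` primitive is not yet stable; it illustrates the per-lane conversion, not the stdarch code path:

// Decode one binary16 bit pattern into an f32; the conversion is exact.
fn half_bits_to_f32(h: u16) -> f32 {
    let sign = if h & 0x8000 != 0 { -1.0f32 } else { 1.0 };
    let exp = (h >> 10) & 0x1f;
    let frac = (h & 0x3ff) as f32;
    match exp {
        0 => sign * frac * (2.0f32).powi(-24),       // zeros and subnormals
        0x1f if frac == 0.0 => sign * f32::INFINITY, // infinities
        0x1f => f32::NAN,                            // NaNs (payload not modelled here)
        e => sign * (1.0 + frac / 1024.0) * (2.0f32).powi(e as i32 - 15),
    }
}

fn main() {
    assert_eq!(half_bits_to_f32(0x3C00), 1.0);  // 1.0 in binary16
    assert_eq!(half_bits_to_f32(0xC000), -2.0); // -2.0 in binary16
}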
/// Converts the 4 x 32-bit float values in the 128-bit vector `a` into 4 x
diff --git a/crates/core_arch/src/x86/sse.rs b/crates/core_arch/src/x86/sse.rs
index 1eca66a..c5c6dc2 100644
--- a/crates/core_arch/src/x86/sse.rs
+++ b/crates/core_arch/src/x86/sse.rs
@@ -882,7 +882,7 @@
#[cfg_attr(test, assert_instr(cvtsi2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 {
- unsafe { cvtsi2ss(a, b) }
+ unsafe { simd_insert!(a, 0, b as f32) }
}
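
Editor's note: the `b as f32` cast used here matches cvtsi2ss under the default rounding mode: an i32 that needs more than 24 significand bits is rounded to the nearest representable float, ties to even. A small illustration:

fn main() {
    // 2^24 + 1 is not representable as f32, so the conversion rounds.
    let exact: i32 = 1 << 24;
    assert_eq!(exact as f32, 16_777_216.0);
    assert_eq!((exact + 1) as f32, 16_777_216.0); // rounds to the even neighbour
    assert_eq!((exact + 2) as f32, 16_777_218.0); // representable again
}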
/// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html).
@@ -1989,8 +1989,6 @@
fn cvtss2si(a: __m128) -> i32;
#[link_name = "llvm.x86.sse.cvttss2si"]
fn cvttss2si(a: __m128) -> i32;
- #[link_name = "llvm.x86.sse.cvtsi2ss"]
- fn cvtsi2ss(a: __m128, b: i32) -> __m128;
#[link_name = "llvm.x86.sse.sfence"]
fn sfence();
#[link_name = "llvm.x86.sse.stmxcsr"]
diff --git a/crates/core_arch/src/x86/sse2.rs b/crates/core_arch/src/x86/sse2.rs
index 1eaa896..c9530a2 100644
--- a/crates/core_arch/src/x86/sse2.rs
+++ b/crates/core_arch/src/x86/sse2.rs
@@ -201,7 +201,12 @@
#[cfg_attr(test, assert_instr(pmaddwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
- unsafe { transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) }
+ unsafe {
+ let r: i32x8 = simd_mul(simd_cast(a.as_i16x8()), simd_cast(b.as_i16x8()));
+ let even: i32x4 = simd_shuffle!(r, r, [0, 2, 4, 6]);
+ let odd: i32x4 = simd_shuffle!(r, r, [1, 3, 5, 7]);
+ simd_add(even, odd).as_m128i()
+ }
}
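
Editor's note: the widening multiply plus even/odd add above is pmaddwd's definition. The only input combination whose pair sum exceeds the i32 range is all four operands equal to i16::MIN, and the instruction wraps in that case, so the scalar sketch below uses wrapping_add (names illustrative, not the stdarch code):

// Per-lane model of pmaddwd: widen, multiply, add adjacent pairs.
fn madd_epi16(a: [i16; 8], b: [i16; 8]) -> [i32; 4] {
    let mut out = [0i32; 4];
    for i in 0..4 {
        let lo = a[2 * i] as i32 * b[2 * i] as i32;
        let hi = a[2 * i + 1] as i32 * b[2 * i + 1] as i32;
        out[i] = lo.wrapping_add(hi); // wraps only for the all-i16::MIN case
    }
    out
}

fn main() {
    assert_eq!(madd_epi16([1; 8], [2; 8]), [4; 4]); // each lane: 1*2 + 1*2
}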
/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
@@ -2417,7 +2422,10 @@
#[cfg_attr(test, assert_instr(cvtss2sd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d {
- unsafe { cvtss2sd(a, b) }
+ unsafe {
+ let elt: f32 = simd_extract!(b, 0);
+ simd_insert!(a, 0, elt as f64)
+ }
}
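
Editor's note: widening f32 to f64 is always exact, which is why extracting the low f32, casting with `as f64`, and reinserting reproduces cvtss2sd. A short check of that exactness:

fn main() {
    let x: f32 = 0.1;      // already carries f32 rounding error
    let wide = x as f64;
    assert_eq!(wide as f32, x); // round-tripping back recovers x exactly
    assert_ne!(wide, 0.1f64);   // wide keeps the f32 error; it is not the f64 0.1
}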
/// Converts packed double-precision (64-bit) floating-point elements in `a` to
@@ -3043,8 +3051,6 @@
fn lfence();
#[link_name = "llvm.x86.sse2.mfence"]
fn mfence();
- #[link_name = "llvm.x86.sse2.pmadd.wd"]
- fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
#[link_name = "llvm.x86.sse2.psad.bw"]
fn psadbw(a: u8x16, b: u8x16) -> u64x2;
#[link_name = "llvm.x86.sse2.psll.w"]
@@ -3115,8 +3121,6 @@
fn cvtsd2si(a: __m128d) -> i32;
#[link_name = "llvm.x86.sse2.cvtsd2ss"]
fn cvtsd2ss(a: __m128, b: __m128d) -> __m128;
- #[link_name = "llvm.x86.sse2.cvtss2sd"]
- fn cvtss2sd(a: __m128d, b: __m128) -> __m128d;
#[link_name = "llvm.x86.sse2.cvttpd2dq"]
fn cvttpd2dq(a: __m128d) -> i32x4;
#[link_name = "llvm.x86.sse2.cvttsd2si"]
diff --git a/crates/core_arch/src/x86/sse3.rs b/crates/core_arch/src/x86/sse3.rs
index 7a32cfe..79be7a7 100644
--- a/crates/core_arch/src/x86/sse3.rs
+++ b/crates/core_arch/src/x86/sse3.rs
@@ -51,7 +51,11 @@
#[cfg_attr(test, assert_instr(haddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hadd_pd(a: __m128d, b: __m128d) -> __m128d {
- unsafe { haddpd(a, b) }
+ unsafe {
+ let even = simd_shuffle!(a, b, [0, 2]);
+ let odd = simd_shuffle!(a, b, [1, 3]);
+ simd_add(even, odd)
+ }
}
/// Horizontally adds adjacent pairs of single-precision (32-bit)
@@ -63,7 +67,11 @@
#[cfg_attr(test, assert_instr(haddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hadd_ps(a: __m128, b: __m128) -> __m128 {
- unsafe { haddps(a, b) }
+ unsafe {
+ let even = simd_shuffle!(a, b, [0, 2, 4, 6]);
+ let odd = simd_shuffle!(a, b, [1, 3, 5, 7]);
+ simd_add(even, odd)
+ }
}
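
Editor's note: deinterleaving into even and odd lanes and then adding them is the whole trick behind these horizontal operations: lane i of the result is element 2i plus element 2i+1 of the concatenated inputs, with `a` supplying the low half of the result and `b` the high half in the 128-bit forms. A plain-array sketch of `_mm_hadd_ps` (illustrative only):

// _mm_hadd_ps expressed directly on arrays.
fn hadd_ps(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    [a[0] + a[1], a[2] + a[3], b[0] + b[1], b[2] + b[3]]
}

fn main() {
    assert_eq!(
        hadd_ps([1.0, 2.0, 3.0, 4.0], [10.0, 20.0, 30.0, 40.0]),
        [3.0, 7.0, 30.0, 70.0]
    );
}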
/// Horizontally subtract adjacent pairs of double-precision (64-bit)
@@ -75,7 +83,11 @@
#[cfg_attr(test, assert_instr(hsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hsub_pd(a: __m128d, b: __m128d) -> __m128d {
- unsafe { hsubpd(a, b) }
+ unsafe {
+ let even = simd_shuffle!(a, b, [0, 2]);
+ let odd = simd_shuffle!(a, b, [1, 3]);
+ simd_sub(even, odd)
+ }
}
/// Horizontally subtracts adjacent pairs of single-precision (32-bit)
@@ -87,7 +99,11 @@
#[cfg_attr(test, assert_instr(hsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hsub_ps(a: __m128, b: __m128) -> __m128 {
- unsafe { hsubps(a, b) }
+ unsafe {
+ let even = simd_shuffle!(a, b, [0, 2, 4, 6]);
+ let odd = simd_shuffle!(a, b, [1, 3, 5, 7]);
+ simd_sub(even, odd)
+ }
}
/// Loads 128-bits of integer data from unaligned memory.
@@ -153,14 +169,6 @@
#[allow(improper_ctypes)]
unsafe extern "C" {
- #[link_name = "llvm.x86.sse3.hadd.pd"]
- fn haddpd(a: __m128d, b: __m128d) -> __m128d;
- #[link_name = "llvm.x86.sse3.hadd.ps"]
- fn haddps(a: __m128, b: __m128) -> __m128;
- #[link_name = "llvm.x86.sse3.hsub.pd"]
- fn hsubpd(a: __m128d, b: __m128d) -> __m128d;
- #[link_name = "llvm.x86.sse3.hsub.ps"]
- fn hsubps(a: __m128, b: __m128) -> __m128;
#[link_name = "llvm.x86.sse3.ldu.dq"]
fn lddqu(mem_addr: *const i8) -> i8x16;
}
diff --git a/crates/core_arch/src/x86/sse41.rs b/crates/core_arch/src/x86/sse41.rs
index 9aa200d..f457c74 100644
--- a/crates/core_arch/src/x86/sse41.rs
+++ b/crates/core_arch/src/x86/sse41.rs
@@ -1006,7 +1006,10 @@
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
- unsafe { ptestz(a.as_i64x2(), mask.as_i64x2()) }
+ unsafe {
+ let r = simd_reduce_or(simd_and(a.as_i64x2(), mask.as_i64x2()));
+ (0i64 == r) as i32
+ }
}
/// Tests whether the specified bits in a 128-bit integer vector are all
@@ -1029,7 +1032,13 @@
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
- unsafe { ptestc(a.as_i64x2(), mask.as_i64x2()) }
+ unsafe {
+ let r = simd_reduce_or(simd_and(
+ simd_xor(a.as_i64x2(), i64x2::splat(!0)),
+ mask.as_i64x2(),
+ ));
+ (0i64 == r) as i32
+ }
}
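
Editor's note: ptest sets ZF when `a AND mask` is all zero and CF when `(NOT a) AND mask` is all zero; `_mm_testz_si128` returns ZF and `_mm_testc_si128` returns CF, which is what the `simd_reduce_or` rewrites compute. A sketch over a 128-bit value modelled as two u64 halves (names illustrative):

// ZF and CF of ptest, modelled on [u64; 2].
fn testz(a: [u64; 2], mask: [u64; 2]) -> i32 {
    (((a[0] & mask[0]) | (a[1] & mask[1])) == 0) as i32
}
fn testc(a: [u64; 2], mask: [u64; 2]) -> i32 {
    (((!a[0] & mask[0]) | (!a[1] & mask[1])) == 0) as i32
}

fn main() {
    assert_eq!(testz([0b0101, 0], [0b1010, 0]), 1); // no common bits set
    assert_eq!(testc([!0, !0], [0b1010, 7]), 1);    // a covers every mask bit
}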
/// Tests whether the specified bits in a 128-bit integer vector are
@@ -1165,10 +1174,6 @@
fn phminposuw(a: u16x8) -> u16x8;
#[link_name = "llvm.x86.sse41.mpsadbw"]
fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
- #[link_name = "llvm.x86.sse41.ptestz"]
- fn ptestz(a: i64x2, mask: i64x2) -> i32;
- #[link_name = "llvm.x86.sse41.ptestc"]
- fn ptestc(a: i64x2, mask: i64x2) -> i32;
#[link_name = "llvm.x86.sse41.ptestnzc"]
fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
}
diff --git a/crates/core_arch/src/x86/ssse3.rs b/crates/core_arch/src/x86/ssse3.rs
index 2be182e..ac067bd 100644
--- a/crates/core_arch/src/x86/ssse3.rs
+++ b/crates/core_arch/src/x86/ssse3.rs
@@ -164,7 +164,13 @@
#[cfg_attr(test, assert_instr(phaddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i {
- unsafe { transmute(phaddw128(a.as_i16x8(), b.as_i16x8())) }
+ let a = a.as_i16x8();
+ let b = b.as_i16x8();
+ unsafe {
+ let even: i16x8 = simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]);
+ let odd: i16x8 = simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]);
+ simd_add(even, odd).as_m128i()
+ }
}
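
Editor's note: the same even/odd-shuffle-plus-add shape works for the integer horizontal adds, with additions wrapping on overflow; the saturating phadds/phsubs variants keep their LLVM intrinsics later in this diff. A plain-array sketch of `_mm_hadd_epi16` (illustrative only):

// _mm_hadd_epi16 expressed directly on arrays; additions wrap like the instruction.
fn hadd_epi16(a: [i16; 8], b: [i16; 8]) -> [i16; 8] {
    [
        a[0].wrapping_add(a[1]), a[2].wrapping_add(a[3]),
        a[4].wrapping_add(a[5]), a[6].wrapping_add(a[7]),
        b[0].wrapping_add(b[1]), b[2].wrapping_add(b[3]),
        b[4].wrapping_add(b[5]), b[6].wrapping_add(b[7]),
    ]
}

fn main() {
    let r = hadd_epi16([1, 2, 3, 4, 5, 6, 7, 8], [0; 8]);
    assert_eq!(r, [3, 7, 11, 15, 0, 0, 0, 0]);
}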
/// Horizontally adds the adjacent pairs of values contained in 2 packed
@@ -189,7 +195,13 @@
#[cfg_attr(test, assert_instr(phaddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i {
- unsafe { transmute(phaddd128(a.as_i32x4(), b.as_i32x4())) }
+ let a = a.as_i32x4();
+ let b = b.as_i32x4();
+ unsafe {
+ let even: i32x4 = simd_shuffle!(a, b, [0, 2, 4, 6]);
+ let odd: i32x4 = simd_shuffle!(a, b, [1, 3, 5, 7]);
+ simd_add(even, odd).as_m128i()
+ }
}
/// Horizontally subtract the adjacent pairs of values contained in 2

@@ -201,7 +213,13 @@
#[cfg_attr(test, assert_instr(phsubw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i {
- unsafe { transmute(phsubw128(a.as_i16x8(), b.as_i16x8())) }
+ let a = a.as_i16x8();
+ let b = b.as_i16x8();
+ unsafe {
+ let even: i16x8 = simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]);
+ let odd: i16x8 = simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]);
+ simd_sub(even, odd).as_m128i()
+ }
}
/// Horizontally subtract the adjacent pairs of values contained in 2
@@ -227,7 +245,13 @@
#[cfg_attr(test, assert_instr(phsubd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i {
- unsafe { transmute(phsubd128(a.as_i32x4(), b.as_i32x4())) }
+ let a = a.as_i32x4();
+ let b = b.as_i32x4();
+ unsafe {
+ let even: i32x4 = simd_shuffle!(a, b, [0, 2, 4, 6]);
+ let odd: i32x4 = simd_shuffle!(a, b, [1, 3, 5, 7]);
+ simd_sub(even, odd).as_m128i()
+ }
}
/// Multiplies corresponding pairs of packed 8-bit unsigned integer
@@ -305,24 +329,12 @@
#[link_name = "llvm.x86.ssse3.pshuf.b.128"]
fn pshufb128(a: u8x16, b: u8x16) -> u8x16;
- #[link_name = "llvm.x86.ssse3.phadd.w.128"]
- fn phaddw128(a: i16x8, b: i16x8) -> i16x8;
-
#[link_name = "llvm.x86.ssse3.phadd.sw.128"]
fn phaddsw128(a: i16x8, b: i16x8) -> i16x8;
- #[link_name = "llvm.x86.ssse3.phadd.d.128"]
- fn phaddd128(a: i32x4, b: i32x4) -> i32x4;
-
- #[link_name = "llvm.x86.ssse3.phsub.w.128"]
- fn phsubw128(a: i16x8, b: i16x8) -> i16x8;
-
#[link_name = "llvm.x86.ssse3.phsub.sw.128"]
fn phsubsw128(a: i16x8, b: i16x8) -> i16x8;
- #[link_name = "llvm.x86.ssse3.phsub.d.128"]
- fn phsubd128(a: i32x4, b: i32x4) -> i32x4;
-
#[link_name = "llvm.x86.ssse3.pmadd.ub.sw.128"]
fn pmaddubsw128(a: u8x16, b: i8x16) -> i16x8;
diff --git a/crates/core_arch/src/x86_64/adx.rs b/crates/core_arch/src/x86_64/adx.rs
index bdc534b..cf378cc 100644
--- a/crates/core_arch/src/x86_64/adx.rs
+++ b/crates/core_arch/src/x86_64/adx.rs
@@ -5,8 +5,6 @@
unsafe extern "unadjusted" {
#[link_name = "llvm.x86.addcarry.64"]
fn llvm_addcarry_u64(a: u8, b: u64, c: u64) -> (u8, u64);
- #[link_name = "llvm.x86.addcarryx.u64"]
- fn llvm_addcarryx_u64(a: u8, b: u64, c: u64, d: *mut u64) -> u8;
#[link_name = "llvm.x86.subborrow.64"]
fn llvm_subborrow_u64(a: u8, b: u64, c: u64) -> (u8, u64);
}
@@ -35,7 +33,7 @@
#[cfg_attr(test, assert_instr(adc))]
#[stable(feature = "simd_x86_adx", since = "1.33.0")]
pub unsafe fn _addcarryx_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
- llvm_addcarryx_u64(c_in, a, b, out as *mut _)
+ _addcarry_u64(c_in, a, b, out)
}
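
Editor's note: adcx/adox compute the same sum and carry-out as adc; they differ mainly in which flags they read and write, so forwarding `_addcarryx_u64` to `_addcarry_u64` preserves the value semantics. A scalar model of that add-with-carry using u128 arithmetic (a sketch with an invented name, not the intrinsic's implementation):

// out = low 64 bits of a + b + carry-in; returns the carry-out bit.
fn addcarry_u64_model(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
    // any non-zero carry-in counts as 1, matching the documented pseudocode
    let sum = a as u128 + b as u128 + (c_in != 0) as u128;
    *out = sum as u64;
    (sum >> 64) as u8
}

fn main() {
    let mut out = 0u64;
    assert_eq!(addcarry_u64_model(1, u64::MAX, 0, &mut out), 1);
    assert_eq!(out, 0); // matches the first case in the test above
}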
/// Adds unsigned 64-bit integers `a` and `b` with unsigned 8-bit carry-in `c_in`.
@@ -95,27 +93,27 @@
let a = u64::MAX;
let mut out = 0;
- let r = _addcarry_u64(0, a, 1, &mut out);
+ let r = _addcarryx_u64(0, a, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 0);
- let r = _addcarry_u64(0, a, 0, &mut out);
+ let r = _addcarryx_u64(0, a, 0, &mut out);
assert_eq!(r, 0);
assert_eq!(out, a);
- let r = _addcarry_u64(1, a, 1, &mut out);
+ let r = _addcarryx_u64(1, a, 1, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 1);
- let r = _addcarry_u64(1, a, 0, &mut out);
+ let r = _addcarryx_u64(1, a, 0, &mut out);
assert_eq!(r, 1);
assert_eq!(out, 0);
- let r = _addcarry_u64(0, 3, 4, &mut out);
+ let r = _addcarryx_u64(0, 3, 4, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 7);
- let r = _addcarry_u64(1, 3, 4, &mut out);
+ let r = _addcarryx_u64(1, 3, 4, &mut out);
assert_eq!(r, 0);
assert_eq!(out, 8);
}
diff --git a/crates/core_arch/src/x86_64/sse.rs b/crates/core_arch/src/x86_64/sse.rs
index 863c3cd..6bd7ec8 100644
--- a/crates/core_arch/src/x86_64/sse.rs
+++ b/crates/core_arch/src/x86_64/sse.rs
@@ -11,8 +11,6 @@
fn cvtss2si64(a: __m128) -> i64;
#[link_name = "llvm.x86.sse.cvttss2si64"]
fn cvttss2si64(a: __m128) -> i64;
- #[link_name = "llvm.x86.sse.cvtsi642ss"]
- fn cvtsi642ss(a: __m128, b: i64) -> __m128;
}
/// Converts the lowest 32 bit float in the input vector to a 64 bit integer.
@@ -65,7 +63,7 @@
#[cfg_attr(test, assert_instr(cvtsi2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsi64_ss(a: __m128, b: i64) -> __m128 {
- unsafe { cvtsi642ss(a, b) }
+ unsafe { simd_insert!(a, 0, b as f32) }
}
#[cfg(test)]