Merge pull request #1928 from sayantn/use-intrinsics

Use SIMD intrinsics wherever possible
diff --git a/crates/core_arch/src/x86/adx.rs b/crates/core_arch/src/x86/adx.rs
index 5ba7664..9ce65b7 100644
--- a/crates/core_arch/src/x86/adx.rs
+++ b/crates/core_arch/src/x86/adx.rs
@@ -5,8 +5,6 @@
 unsafe extern "unadjusted" {
     #[link_name = "llvm.x86.addcarry.32"]
     fn llvm_addcarry_u32(a: u8, b: u32, c: u32) -> (u8, u32);
-    #[link_name = "llvm.x86.addcarryx.u32"]
-    fn llvm_addcarryx_u32(a: u8, b: u32, c: u32, d: *mut u32) -> u8;
     #[link_name = "llvm.x86.subborrow.32"]
     fn llvm_subborrow_u32(a: u8, b: u32, c: u32) -> (u8, u32);
 }
@@ -35,7 +33,7 @@
 #[cfg_attr(test, assert_instr(adc))]
 #[stable(feature = "simd_x86_adx", since = "1.33.0")]
 pub unsafe fn _addcarryx_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 {
-    llvm_addcarryx_u32(c_in, a, b, out as *mut _)
+    _addcarry_u32(c_in, a, b, out)
 }
 
 /// Adds unsigned 32-bit integers `a` and `b` with unsigned 8-bit carry-in `c_in`
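A small usage sketch, not part of the patch, illustrating why the forwarding above is behavior-preserving: both public intrinsics compute `a + b + c_in`, write the low 32 bits through `out`, and return the carry-out (the `adx`-gated variant is only exercised when the CPU reports the feature).

fn main() {
    #[cfg(target_arch = "x86_64")]
    {
        use std::arch::x86_64::{_addcarry_u32, _addcarryx_u32};
        unsafe {
            let mut out = 0u32;
            // 0xFFFF_FFFF + 5 + carry-in 1 = 0x1_0000_0005: low word 5, carry-out 1
            let carry = _addcarry_u32(1, u32::MAX, 5, &mut out);
            assert_eq!((carry, out), (1, 5));
            if is_x86_feature_detected!("adx") {
                let mut out2 = 0u32;
                let carry2 = _addcarryx_u32(1, u32::MAX, 5, &mut out2);
                assert_eq!((carry2, out2), (carry, out));
            }
        }
    }
}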
diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs
index 24e0cf6..c1bb897 100644
--- a/crates/core_arch/src/x86/avx.rs
+++ b/crates/core_arch/src/x86/avx.rs
@@ -587,7 +587,11 @@
 #[cfg_attr(test, assert_instr(vhaddpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
-    unsafe { vhaddpd(a, b) }
+    unsafe {
+        let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
+        let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
+        simd_add(even, odd)
+    }
 }
 
 /// Horizontal addition of adjacent pairs in the two packed vectors
@@ -602,7 +606,11 @@
 #[cfg_attr(test, assert_instr(vhaddps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
-    unsafe { vhaddps(a, b) }
+    unsafe {
+        let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
+        let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
+        simd_add(even, odd)
+    }
 }
 
 /// Horizontal subtraction of adjacent pairs in the two packed vectors
@@ -616,7 +624,11 @@
 #[cfg_attr(test, assert_instr(vhsubpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
-    unsafe { vhsubpd(a, b) }
+    unsafe {
+        let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
+        let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
+        simd_sub(even, odd)
+    }
 }
 
 /// Horizontal subtraction of adjacent pairs in the two packed vectors
@@ -631,7 +643,11 @@
 #[cfg_attr(test, assert_instr(vhsubps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
-    unsafe { vhsubps(a, b) }
+    unsafe {
+        let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
+        let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
+        simd_sub(even, odd)
+    }
 }
 
 /// Computes the bitwise XOR of packed double-precision (64-bit) floating-point
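A usage sketch (plain std::arch, assuming an AVX-capable x86_64 host) of the documented `vhaddpd`/`vhsubpd` semantics that the shuffle-plus-add rewrites above reproduce: results from adjacent pairs of `a` land in the even lanes and results from `b` in the odd lanes.

fn main() {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx") {
            unsafe {
                use std::arch::x86_64::*;
                // _mm256_set_pd lists elements from the highest lane down to lane 0.
                let a = _mm256_set_pd(4.0, 3.0, 2.0, 1.0); // a = [1, 2, 3, 4]
                let b = _mm256_set_pd(8.0, 7.0, 6.0, 5.0); // b = [5, 6, 7, 8]
                let mut out = [0.0f64; 4];
                _mm256_storeu_pd(out.as_mut_ptr(), _mm256_hadd_pd(a, b));
                assert_eq!(out, [1.0 + 2.0, 5.0 + 6.0, 3.0 + 4.0, 7.0 + 8.0]);
                _mm256_storeu_pd(out.as_mut_ptr(), _mm256_hsub_pd(a, b));
                assert_eq!(out, [1.0 - 2.0, 5.0 - 6.0, 3.0 - 4.0, 7.0 - 8.0]);
            }
        }
    }
}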
@@ -1218,7 +1234,10 @@
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
     static_assert_uimm_bits!(IMM8, 8);
-    unsafe { vperm2f128ps256(a, b, IMM8 as i8) }
+    _mm256_castsi256_ps(_mm256_permute2f128_si256::<IMM8>(
+        _mm256_castps_si256(a),
+        _mm256_castps_si256(b),
+    ))
 }
 
 /// Shuffles 256 bits (composed of 4 packed double-precision (64-bit)
@@ -1232,7 +1251,10 @@
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
     static_assert_uimm_bits!(IMM8, 8);
-    unsafe { vperm2f128pd256(a, b, IMM8 as i8) }
+    _mm256_castsi256_pd(_mm256_permute2f128_si256::<IMM8>(
+        _mm256_castpd_si256(a),
+        _mm256_castpd_si256(b),
+    ))
 }
 
 /// Shuffles 128-bits (composed of integer data) selected by `imm8`
@@ -1246,7 +1268,35 @@
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_permute2f128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
     static_assert_uimm_bits!(IMM8, 8);
-    unsafe { transmute(vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8)) }
+    const fn idx(imm8: i32, pos: u32) -> u32 {
+        let part = if pos < 2 {
+            imm8 & 0xf
+        } else {
+            (imm8 & 0xf0) >> 4
+        };
+        2 * (part as u32 & 0b11) + (pos & 1)
+    }
+    const fn idx0(imm8: i32, pos: u32) -> u32 {
+        let part = if pos < 2 {
+            imm8 & 0xf
+        } else {
+            (imm8 & 0xf0) >> 4
+        };
+        if part & 0b1000 != 0 { 4 } else { pos }
+    }
+    unsafe {
+        let r = simd_shuffle!(
+            a.as_i64x4(),
+            b.as_i64x4(),
+            [idx(IMM8, 0), idx(IMM8, 1), idx(IMM8, 2), idx(IMM8, 3)]
+        );
+        let r: i64x4 = simd_shuffle!(
+            r,
+            i64x4::ZERO,
+            [idx0(IMM8, 0), idx0(IMM8, 1), idx0(IMM8, 2), idx0(IMM8, 3)]
+        );
+        r.as_m256i()
+    }
 }
 
 /// Broadcasts a single-precision (32-bit) floating-point element from memory
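A usage sketch (plain std::arch, AVX assumed) of the selector encoding that the `idx`/`idx0` helpers above implement: each IMM8 nibble picks a 128-bit half (0 = low of `a`, 1 = high of `a`, 2 = low of `b`, 3 = high of `b`), and bit 3 of the nibble zeroes that lane instead.

fn main() {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx") {
            unsafe {
                use std::arch::x86_64::*;
                let a = _mm256_set_epi64x(3, 2, 1, 0); // halves: [0, 1] and [2, 3]
                let b = _mm256_set_epi64x(7, 6, 5, 4); // halves: [4, 5] and [6, 7]
                let mut out = [0i64; 4];
                // 0x31: low half <- selector 1 (high of a), high half <- selector 3 (high of b)
                let r = _mm256_permute2f128_si256::<0x31>(a, b);
                _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
                assert_eq!(out, [2, 3, 6, 7]);
                // 0x08: bit 3 in the low nibble zeroes the low half; high half <- selector 0 (low of a)
                let z = _mm256_permute2f128_si256::<0x08>(a, b);
                _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, z);
                assert_eq!(out, [0, 0, 0, 1]);
            }
        }
    }
}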
@@ -1933,7 +1983,10 @@
 #[cfg_attr(test, assert_instr(vptest))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
-    unsafe { ptestz256(a.as_i64x4(), b.as_i64x4()) }
+    unsafe {
+        let r = simd_and(a.as_i64x4(), b.as_i64x4());
+        (0i64 == simd_reduce_or(r)) as i32
+    }
 }
 
 /// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
@@ -1947,7 +2000,10 @@
 #[cfg_attr(test, assert_instr(vptest))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 {
-    unsafe { ptestc256(a.as_i64x4(), b.as_i64x4()) }
+    unsafe {
+        let r = simd_and(simd_xor(a.as_i64x4(), i64x4::splat(!0)), b.as_i64x4());
+        (0i64 == simd_reduce_or(r)) as i32
+    }
 }
 
 /// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
@@ -2031,7 +2087,10 @@
 #[cfg_attr(test, assert_instr(vtestpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 {
-    unsafe { vtestzpd(a, b) }
+    unsafe {
+        let r: i64x2 = simd_lt(transmute(_mm_and_pd(a, b)), i64x2::ZERO);
+        (0i64 == simd_reduce_or(r)) as i32
+    }
 }
 
 /// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
@@ -2048,7 +2107,10 @@
 #[cfg_attr(test, assert_instr(vtestpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 {
-    unsafe { vtestcpd(a, b) }
+    unsafe {
+        let r: i64x2 = simd_lt(transmute(_mm_andnot_pd(a, b)), i64x2::ZERO);
+        (0i64 == simd_reduce_or(r)) as i32
+    }
 }
 
 /// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
@@ -2135,7 +2197,10 @@
 #[cfg_attr(test, assert_instr(vtestps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_testz_ps(a: __m128, b: __m128) -> i32 {
-    unsafe { vtestzps(a, b) }
+    unsafe {
+        let r: i32x4 = simd_lt(transmute(_mm_and_ps(a, b)), i32x4::ZERO);
+        (0i32 == simd_reduce_or(r)) as i32
+    }
 }
 
 /// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
@@ -2152,7 +2217,10 @@
 #[cfg_attr(test, assert_instr(vtestps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_testc_ps(a: __m128, b: __m128) -> i32 {
-    unsafe { vtestcps(a, b) }
+    unsafe {
+        let r: i32x4 = simd_lt(transmute(_mm_andnot_ps(a, b)), i32x4::ZERO);
+        (0i32 == simd_reduce_or(r)) as i32
+    }
 }
 
 /// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
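A usage sketch (AVX assumed) of the flag semantics the reductions above compute: `testz` is 1 exactly when `a & b` has no set bits (ZF), `testc` is 1 exactly when `b` has no bits outside `a` (CF), and the `_pd`/`_ps` forms look only at the sign bits.

fn main() {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx") {
            unsafe {
                use std::arch::x86_64::*;
                let a = _mm256_set1_epi8(0b0101_0101);
                let b = _mm256_set1_epi8(0b1010_1010u8 as i8);
                assert_eq!(_mm256_testz_si256(a, b), 1); // disjoint bit patterns
                assert_eq!(_mm256_testc_si256(a, b), 0); // b has bits not in a
                assert_eq!(_mm256_testc_si256(b, b), 1); // !b & b == 0
                // Sign-bit-only variant
                let x = _mm_set_pd(-1.0, 1.0); // sign bits [0, 1]
                let y = _mm_set_pd(1.0, -1.0); // sign bits [1, 0]
                assert_eq!(_mm_testz_pd(x, y), 1); // no common sign bit
            }
        }
    }
}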
@@ -3044,14 +3112,6 @@
     fn roundps256(a: __m256, b: i32) -> __m256;
     #[link_name = "llvm.x86.avx.dp.ps.256"]
     fn vdpps(a: __m256, b: __m256, imm8: i8) -> __m256;
-    #[link_name = "llvm.x86.avx.hadd.pd.256"]
-    fn vhaddpd(a: __m256d, b: __m256d) -> __m256d;
-    #[link_name = "llvm.x86.avx.hadd.ps.256"]
-    fn vhaddps(a: __m256, b: __m256) -> __m256;
-    #[link_name = "llvm.x86.avx.hsub.pd.256"]
-    fn vhsubpd(a: __m256d, b: __m256d) -> __m256d;
-    #[link_name = "llvm.x86.avx.hsub.ps.256"]
-    fn vhsubps(a: __m256, b: __m256) -> __m256;
     #[link_name = "llvm.x86.sse2.cmp.pd"]
     fn vcmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
     #[link_name = "llvm.x86.avx.cmp.pd.256"]
@@ -3084,12 +3144,6 @@
     fn vpermilpd256(a: __m256d, b: i64x4) -> __m256d;
     #[link_name = "llvm.x86.avx.vpermilvar.pd"]
     fn vpermilpd(a: __m128d, b: i64x2) -> __m128d;
-    #[link_name = "llvm.x86.avx.vperm2f128.ps.256"]
-    fn vperm2f128ps256(a: __m256, b: __m256, imm8: i8) -> __m256;
-    #[link_name = "llvm.x86.avx.vperm2f128.pd.256"]
-    fn vperm2f128pd256(a: __m256d, b: __m256d, imm8: i8) -> __m256d;
-    #[link_name = "llvm.x86.avx.vperm2f128.si.256"]
-    fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8;
     #[link_name = "llvm.x86.avx.maskload.pd.256"]
     fn maskloadpd256(mem_addr: *const i8, mask: i64x4) -> __m256d;
     #[link_name = "llvm.x86.avx.maskstore.pd.256"]
@@ -3112,10 +3166,6 @@
     fn vrcpps(a: __m256) -> __m256;
     #[link_name = "llvm.x86.avx.rsqrt.ps.256"]
     fn vrsqrtps(a: __m256) -> __m256;
-    #[link_name = "llvm.x86.avx.ptestz.256"]
-    fn ptestz256(a: i64x4, b: i64x4) -> i32;
-    #[link_name = "llvm.x86.avx.ptestc.256"]
-    fn ptestc256(a: i64x4, b: i64x4) -> i32;
     #[link_name = "llvm.x86.avx.ptestnzc.256"]
     fn ptestnzc256(a: i64x4, b: i64x4) -> i32;
     #[link_name = "llvm.x86.avx.vtestz.pd.256"]
@@ -3124,10 +3174,6 @@
     fn vtestcpd256(a: __m256d, b: __m256d) -> i32;
     #[link_name = "llvm.x86.avx.vtestnzc.pd.256"]
     fn vtestnzcpd256(a: __m256d, b: __m256d) -> i32;
-    #[link_name = "llvm.x86.avx.vtestz.pd"]
-    fn vtestzpd(a: __m128d, b: __m128d) -> i32;
-    #[link_name = "llvm.x86.avx.vtestc.pd"]
-    fn vtestcpd(a: __m128d, b: __m128d) -> i32;
     #[link_name = "llvm.x86.avx.vtestnzc.pd"]
     fn vtestnzcpd(a: __m128d, b: __m128d) -> i32;
     #[link_name = "llvm.x86.avx.vtestz.ps.256"]
@@ -3136,10 +3182,6 @@
     fn vtestcps256(a: __m256, b: __m256) -> i32;
     #[link_name = "llvm.x86.avx.vtestnzc.ps.256"]
     fn vtestnzcps256(a: __m256, b: __m256) -> i32;
-    #[link_name = "llvm.x86.avx.vtestz.ps"]
-    fn vtestzps(a: __m128, b: __m128) -> i32;
-    #[link_name = "llvm.x86.avx.vtestc.ps"]
-    fn vtestcps(a: __m128, b: __m128) -> i32;
     #[link_name = "llvm.x86.avx.vtestnzc.ps"]
     fn vtestnzcps(a: __m128, b: __m128) -> i32;
     #[link_name = "llvm.x86.avx.min.ps.256"]
diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs
index 739de2b..8be6629 100644
--- a/crates/core_arch/src/x86/avx2.rs
+++ b/crates/core_arch/src/x86/avx2.rs
@@ -891,7 +891,21 @@
 #[cfg_attr(test, assert_instr(vphaddw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(phaddw(a.as_i16x16(), b.as_i16x16())) }
+    let a = a.as_i16x16();
+    let b = b.as_i16x16();
+    unsafe {
+        let even: i16x16 = simd_shuffle!(
+            a,
+            b,
+            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
+        );
+        let odd: i16x16 = simd_shuffle!(
+            a,
+            b,
+            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
+        );
+        simd_add(even, odd).as_m256i()
+    }
 }
 
 /// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
@@ -902,7 +916,13 @@
 #[cfg_attr(test, assert_instr(vphaddd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(phaddd(a.as_i32x8(), b.as_i32x8())) }
+    let a = a.as_i32x8();
+    let b = b.as_i32x8();
+    unsafe {
+        let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
+        let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
+        simd_add(even, odd).as_m256i()
+    }
 }
 
 /// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
@@ -925,7 +945,21 @@
 #[cfg_attr(test, assert_instr(vphsubw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(phsubw(a.as_i16x16(), b.as_i16x16())) }
+    let a = a.as_i16x16();
+    let b = b.as_i16x16();
+    unsafe {
+        let even: i16x16 = simd_shuffle!(
+            a,
+            b,
+            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
+        );
+        let odd: i16x16 = simd_shuffle!(
+            a,
+            b,
+            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
+        );
+        simd_sub(even, odd).as_m256i()
+    }
 }
 
 /// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`.
@@ -936,7 +970,13 @@
 #[cfg_attr(test, assert_instr(vphsubd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(phsubd(a.as_i32x8(), b.as_i32x8())) }
+    let a = a.as_i32x8();
+    let b = b.as_i32x8();
+    unsafe {
+        let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
+        let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
+        simd_sub(even, odd).as_m256i()
+    }
 }
 
 /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`
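A usage sketch (AVX2 assumed) of the integer horizontal adds and subtracts above: within each 128-bit half, results from adjacent `a` pairs come first, then results from adjacent `b` pairs.

fn main() {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            unsafe {
                use std::arch::x86_64::*;
                let a = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); // [0..=7] low to high
                let b = _mm256_set_epi32(70, 60, 50, 40, 30, 20, 10, 0);
                let mut out = [0i32; 8];
                _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, _mm256_hadd_epi32(a, b));
                assert_eq!(out, [1, 5, 10, 50, 9, 13, 90, 130]);
                _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, _mm256_hsub_epi32(a, b));
                assert_eq!(out, [-1, -1, -10, -10, -1, -1, -10, -10]);
            }
        }
    }
}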
@@ -1714,7 +1754,12 @@
 #[cfg_attr(test, assert_instr(vpmaddwd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) }
+    unsafe {
+        let r: i32x16 = simd_mul(simd_cast(a.as_i16x16()), simd_cast(b.as_i16x16()));
+        let even: i32x8 = simd_shuffle!(r, r, [0, 2, 4, 6, 8, 10, 12, 14]);
+        let odd: i32x8 = simd_shuffle!(r, r, [1, 3, 5, 7, 9, 11, 13, 15]);
+        simd_add(even, odd).as_m256i()
+    }
 }
 
 /// Vertically multiplies each unsigned 8-bit integer from `a` with the
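A usage sketch (AVX2 assumed) of `vpmaddwd`: each 32-bit result is `a[2i]*b[2i] + a[2i+1]*b[2i+1]`, which is what the widening multiply plus even/odd add above spells out.

fn main() {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            unsafe {
                use std::arch::x86_64::*;
                // Every 16-bit lane of a is 300 and of b is -200: each pair contributes
                // 300 * -200 twice, so every 32-bit lane is -120_000.
                let r = _mm256_madd_epi16(_mm256_set1_epi16(300), _mm256_set1_epi16(-200));
                let mut out = [0i32; 8];
                _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
                assert_eq!(out, [-120_000; 8]);
            }
        }
    }
}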
@@ -2285,7 +2330,7 @@
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_permute2x128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
     static_assert_uimm_bits!(IMM8, 8);
-    unsafe { transmute(vperm2i128(a.as_i64x4(), b.as_i64x4(), IMM8 as i8)) }
+    _mm256_permute2f128_si256::<IMM8>(a, b)
 }
 
 /// Shuffles 64-bit floating-point elements in `a` across lanes using the
@@ -2733,7 +2778,7 @@
 #[cfg_attr(test, assert_instr(vpsllvd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
-    unsafe { transmute(psllvd(a.as_i32x4(), count.as_i32x4())) }
+    unsafe { transmute(simd_shl(a.as_u32x4(), count.as_u32x4())) }
 }
 
 /// Shifts packed 32-bit integers in `a` left by the amount
@@ -2746,7 +2791,7 @@
 #[cfg_attr(test, assert_instr(vpsllvd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
-    unsafe { transmute(psllvd256(a.as_i32x8(), count.as_i32x8())) }
+    unsafe { transmute(simd_shl(a.as_u32x8(), count.as_u32x8())) }
 }
 
 /// Shifts packed 64-bit integers in `a` left by the amount
@@ -2759,7 +2804,7 @@
 #[cfg_attr(test, assert_instr(vpsllvq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
-    unsafe { transmute(psllvq(a.as_i64x2(), count.as_i64x2())) }
+    unsafe { transmute(simd_shl(a.as_u64x2(), count.as_u64x2())) }
 }
 
 /// Shifts packed 64-bit integers in `a` left by the amount
@@ -2772,7 +2817,7 @@
 #[cfg_attr(test, assert_instr(vpsllvq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
-    unsafe { transmute(psllvq256(a.as_i64x4(), count.as_i64x4())) }
+    unsafe { transmute(simd_shl(a.as_u64x4(), count.as_u64x4())) }
 }
 
 /// Shifts packed 16-bit integers in `a` right by `count` while
@@ -2836,7 +2881,7 @@
 #[cfg_attr(test, assert_instr(vpsravd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
-    unsafe { transmute(psravd(a.as_i32x4(), count.as_i32x4())) }
+    unsafe { transmute(simd_shr(a.as_i32x4(), count.as_i32x4())) }
 }
 
 /// Shifts packed 32-bit integers in `a` right by the amount specified by the
@@ -2848,7 +2893,7 @@
 #[cfg_attr(test, assert_instr(vpsravd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
-    unsafe { transmute(psravd256(a.as_i32x8(), count.as_i32x8())) }
+    unsafe { transmute(simd_shr(a.as_i32x8(), count.as_i32x8())) }
 }
 
 /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
@@ -3031,7 +3076,7 @@
 #[cfg_attr(test, assert_instr(vpsrlvd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
-    unsafe { transmute(psrlvd(a.as_i32x4(), count.as_i32x4())) }
+    unsafe { transmute(simd_shr(a.as_u32x4(), count.as_u32x4())) }
 }
 
 /// Shifts packed 32-bit integers in `a` right by the amount specified by
@@ -3043,7 +3088,7 @@
 #[cfg_attr(test, assert_instr(vpsrlvd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
-    unsafe { transmute(psrlvd256(a.as_i32x8(), count.as_i32x8())) }
+    unsafe { transmute(simd_shr(a.as_u32x8(), count.as_u32x8())) }
 }
 
 /// Shifts packed 64-bit integers in `a` right by the amount specified by
@@ -3055,7 +3100,7 @@
 #[cfg_attr(test, assert_instr(vpsrlvq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
-    unsafe { transmute(psrlvq(a.as_i64x2(), count.as_i64x2())) }
+    unsafe { transmute(simd_shr(a.as_u64x2(), count.as_u64x2())) }
 }
 
 /// Shifts packed 64-bit integers in `a` right by the amount specified by
@@ -3067,7 +3112,7 @@
 #[cfg_attr(test, assert_instr(vpsrlvq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
-    unsafe { transmute(psrlvq256(a.as_i64x4(), count.as_i64x4())) }
+    unsafe { transmute(simd_shr(a.as_u64x4(), count.as_u64x4())) }
 }
 
 /// Load 256-bits of integer data from memory into dst using a non-temporal memory hint. mem_addr
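A usage sketch (AVX2 assumed) of the per-lane variable shifts rewritten above; the logical and arithmetic right shifts differ only in what fills the vacated high bits.

fn main() {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            unsafe {
                use std::arch::x86_64::*;
                let a = _mm_set_epi32(-16, -16, 8, 8); // lanes [8, 8, -16, -16]
                let n = _mm_set_epi32(2, 1, 3, 0);     // shift counts [0, 3, 1, 2]
                let mut out = [0i32; 4];
                _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, _mm_srav_epi32(a, n));
                assert_eq!(out, [8, 1, -8, -4]); // arithmetic: sign bit is replicated
                _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, _mm_srlv_epi32(a, n));
                assert_eq!(out, [8, 1, 0x7FFF_FFF8u32 as i32, 0x3FFF_FFFCu32 as i32]);
                _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, _mm_sllv_epi32(a, n));
                assert_eq!(out, [8, 64, -32, -64]);
            }
        }
    }
}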
@@ -3594,20 +3639,10 @@
 
 #[allow(improper_ctypes)]
 unsafe extern "C" {
-    #[link_name = "llvm.x86.avx2.phadd.w"]
-    fn phaddw(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx2.phadd.d"]
-    fn phaddd(a: i32x8, b: i32x8) -> i32x8;
     #[link_name = "llvm.x86.avx2.phadd.sw"]
     fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx2.phsub.w"]
-    fn phsubw(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx2.phsub.d"]
-    fn phsubd(a: i32x8, b: i32x8) -> i32x8;
     #[link_name = "llvm.x86.avx2.phsub.sw"]
     fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx2.pmadd.wd"]
-    fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
     #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
     fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16;
     #[link_name = "llvm.x86.avx2.maskload.d"]
@@ -3652,44 +3687,22 @@
     fn pslld(a: i32x8, count: i32x4) -> i32x8;
     #[link_name = "llvm.x86.avx2.psll.q"]
     fn psllq(a: i64x4, count: i64x2) -> i64x4;
-    #[link_name = "llvm.x86.avx2.psllv.d"]
-    fn psllvd(a: i32x4, count: i32x4) -> i32x4;
-    #[link_name = "llvm.x86.avx2.psllv.d.256"]
-    fn psllvd256(a: i32x8, count: i32x8) -> i32x8;
-    #[link_name = "llvm.x86.avx2.psllv.q"]
-    fn psllvq(a: i64x2, count: i64x2) -> i64x2;
-    #[link_name = "llvm.x86.avx2.psllv.q.256"]
-    fn psllvq256(a: i64x4, count: i64x4) -> i64x4;
     #[link_name = "llvm.x86.avx2.psra.w"]
     fn psraw(a: i16x16, count: i16x8) -> i16x16;
     #[link_name = "llvm.x86.avx2.psra.d"]
     fn psrad(a: i32x8, count: i32x4) -> i32x8;
-    #[link_name = "llvm.x86.avx2.psrav.d"]
-    fn psravd(a: i32x4, count: i32x4) -> i32x4;
-    #[link_name = "llvm.x86.avx2.psrav.d.256"]
-    fn psravd256(a: i32x8, count: i32x8) -> i32x8;
     #[link_name = "llvm.x86.avx2.psrl.w"]
     fn psrlw(a: i16x16, count: i16x8) -> i16x16;
     #[link_name = "llvm.x86.avx2.psrl.d"]
     fn psrld(a: i32x8, count: i32x4) -> i32x8;
     #[link_name = "llvm.x86.avx2.psrl.q"]
     fn psrlq(a: i64x4, count: i64x2) -> i64x4;
-    #[link_name = "llvm.x86.avx2.psrlv.d"]
-    fn psrlvd(a: i32x4, count: i32x4) -> i32x4;
-    #[link_name = "llvm.x86.avx2.psrlv.d.256"]
-    fn psrlvd256(a: i32x8, count: i32x8) -> i32x8;
-    #[link_name = "llvm.x86.avx2.psrlv.q"]
-    fn psrlvq(a: i64x2, count: i64x2) -> i64x2;
-    #[link_name = "llvm.x86.avx2.psrlv.q.256"]
-    fn psrlvq256(a: i64x4, count: i64x4) -> i64x4;
     #[link_name = "llvm.x86.avx2.pshuf.b"]
     fn pshufb(a: u8x32, b: u8x32) -> u8x32;
     #[link_name = "llvm.x86.avx2.permd"]
     fn permd(a: u32x8, b: u32x8) -> u32x8;
     #[link_name = "llvm.x86.avx2.permps"]
     fn permps(a: __m256, b: i32x8) -> __m256;
-    #[link_name = "llvm.x86.avx2.vperm2i128"]
-    fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4;
     #[link_name = "llvm.x86.avx2.gather.d.d"]
     fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4;
     #[link_name = "llvm.x86.avx2.gather.d.d.256"]
diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs
index 8139b8c..1771f19 100644
--- a/crates/core_arch/src/x86/avx512bw.rs
+++ b/crates/core_arch/src/x86/avx512bw.rs
@@ -5835,7 +5835,20 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpmaddwd))]
 pub fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i {
-    unsafe { transmute(vpmaddwd(a.as_i16x32(), b.as_i16x32())) }
+    unsafe {
+        let r: i32x32 = simd_mul(simd_cast(a.as_i16x32()), simd_cast(b.as_i16x32()));
+        let even: i32x16 = simd_shuffle!(
+            r,
+            r,
+            [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
+        );
+        let odd: i32x16 = simd_shuffle!(
+            r,
+            r,
+            [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]
+        );
+        simd_add(even, odd).as_m512i()
+    }
 }
 
 /// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -6839,7 +6852,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsllvw))]
 pub fn _mm512_sllv_epi16(a: __m512i, count: __m512i) -> __m512i {
-    unsafe { transmute(vpsllvw(a.as_i16x32(), count.as_i16x32())) }
+    unsafe { transmute(simd_shl(a.as_u16x32(), count.as_u16x32())) }
 }
 
 /// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -6878,7 +6891,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsllvw))]
 pub fn _mm256_sllv_epi16(a: __m256i, count: __m256i) -> __m256i {
-    unsafe { transmute(vpsllvw256(a.as_i16x16(), count.as_i16x16())) }
+    unsafe { transmute(simd_shl(a.as_u16x16(), count.as_u16x16())) }
 }
 
 /// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -6917,7 +6930,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsllvw))]
 pub fn _mm_sllv_epi16(a: __m128i, count: __m128i) -> __m128i {
-    unsafe { transmute(vpsllvw128(a.as_i16x8(), count.as_i16x8())) }
+    unsafe { transmute(simd_shl(a.as_u16x8(), count.as_u16x8())) }
 }
 
 /// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -7175,7 +7188,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsrlvw))]
 pub fn _mm512_srlv_epi16(a: __m512i, count: __m512i) -> __m512i {
-    unsafe { transmute(vpsrlvw(a.as_i16x32(), count.as_i16x32())) }
+    unsafe { transmute(simd_shr(a.as_u16x32(), count.as_u16x32())) }
 }
 
 /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -7214,7 +7227,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsrlvw))]
 pub fn _mm256_srlv_epi16(a: __m256i, count: __m256i) -> __m256i {
-    unsafe { transmute(vpsrlvw256(a.as_i16x16(), count.as_i16x16())) }
+    unsafe { transmute(simd_shr(a.as_u16x16(), count.as_u16x16())) }
 }
 
 /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -7253,7 +7266,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsrlvw))]
 pub fn _mm_srlv_epi16(a: __m128i, count: __m128i) -> __m128i {
-    unsafe { transmute(vpsrlvw128(a.as_i16x8(), count.as_i16x8())) }
+    unsafe { transmute(simd_shr(a.as_u16x8(), count.as_u16x8())) }
 }
 
 /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -7498,7 +7511,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsravw))]
 pub fn _mm512_srav_epi16(a: __m512i, count: __m512i) -> __m512i {
-    unsafe { transmute(vpsravw(a.as_i16x32(), count.as_i16x32())) }
+    unsafe { transmute(simd_shr(a.as_i16x32(), count.as_i16x32())) }
 }
 
 /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -7537,7 +7550,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsravw))]
 pub fn _mm256_srav_epi16(a: __m256i, count: __m256i) -> __m256i {
-    unsafe { transmute(vpsravw256(a.as_i16x16(), count.as_i16x16())) }
+    unsafe { transmute(simd_shr(a.as_i16x16(), count.as_i16x16())) }
 }
 
 /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -7576,7 +7589,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsravw))]
 pub fn _mm_srav_epi16(a: __m128i, count: __m128i) -> __m128i {
-    unsafe { transmute(vpsravw128(a.as_i16x8(), count.as_i16x8())) }
+    unsafe { transmute(simd_shr(a.as_i16x8(), count.as_i16x8())) }
 }
 
 /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
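A usage sketch of the 16-bit variable shifts (assuming a 1.89+ toolchain where these AVX-512BW/VL intrinsics are stable and a CPU reporting both features): one shift count per 16-bit lane, exactly like the 32/64-bit forms.

fn main() {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx512bw") && is_x86_feature_detected!("avx512vl") {
            unsafe {
                use std::arch::x86_64::*;
                let n = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
                let mut out = [0i16; 8];
                _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, _mm_sllv_epi16(_mm_set1_epi16(1), n));
                assert_eq!(out, [1, 2, 4, 8, 16, 32, 64, 128]); // 1 << lane index
                _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, _mm_srav_epi16(_mm_set1_epi16(-128), n));
                assert_eq!(out, [-128, -64, -32, -16, -8, -4, -2, -1]); // sign-extending
            }
        }
    }
}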
@@ -11617,8 +11630,6 @@
     #[link_name = "llvm.x86.avx512.pmul.hr.sw.512"]
     fn vpmulhrsw(a: i16x32, b: i16x32) -> i16x32;
 
-    #[link_name = "llvm.x86.avx512.pmaddw.d.512"]
-    fn vpmaddwd(a: i16x32, b: i16x32) -> i32x16;
     #[link_name = "llvm.x86.avx512.pmaddubs.w.512"]
     fn vpmaddubsw(a: i8x64, b: i8x64) -> i16x32;
 
@@ -11634,33 +11645,12 @@
     #[link_name = "llvm.x86.avx512.psll.w.512"]
     fn vpsllw(a: i16x32, count: i16x8) -> i16x32;
 
-    #[link_name = "llvm.x86.avx512.psllv.w.512"]
-    fn vpsllvw(a: i16x32, b: i16x32) -> i16x32;
-    #[link_name = "llvm.x86.avx512.psllv.w.256"]
-    fn vpsllvw256(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx512.psllv.w.128"]
-    fn vpsllvw128(a: i16x8, b: i16x8) -> i16x8;
-
     #[link_name = "llvm.x86.avx512.psrl.w.512"]
     fn vpsrlw(a: i16x32, count: i16x8) -> i16x32;
 
-    #[link_name = "llvm.x86.avx512.psrlv.w.512"]
-    fn vpsrlvw(a: i16x32, b: i16x32) -> i16x32;
-    #[link_name = "llvm.x86.avx512.psrlv.w.256"]
-    fn vpsrlvw256(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx512.psrlv.w.128"]
-    fn vpsrlvw128(a: i16x8, b: i16x8) -> i16x8;
-
     #[link_name = "llvm.x86.avx512.psra.w.512"]
     fn vpsraw(a: i16x32, count: i16x8) -> i16x32;
 
-    #[link_name = "llvm.x86.avx512.psrav.w.512"]
-    fn vpsravw(a: i16x32, count: i16x32) -> i16x32;
-    #[link_name = "llvm.x86.avx512.psrav.w.256"]
-    fn vpsravw256(a: i16x16, count: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx512.psrav.w.128"]
-    fn vpsravw128(a: i16x8, count: i16x8) -> i16x8;
-
     #[link_name = "llvm.x86.avx512.vpermi2var.hi.512"]
     fn vpermi2w(a: i16x32, idx: i16x32, b: i16x32) -> i16x32;
     #[link_name = "llvm.x86.avx512.vpermi2var.hi.256"]
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 52c6a11..002534a 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -19077,12 +19077,8 @@
 #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
 #[rustc_legacy_const_generics(1)]
 pub fn _mm512_rol_epi32<const IMM8: i32>(a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i32x16();
-        let r = vprold(a, IMM8);
-        transmute(r)
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_rolv_epi32(a, _mm512_set1_epi32(IMM8))
 }
 
 /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -19094,12 +19090,8 @@
 #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
 #[rustc_legacy_const_generics(3)]
 pub fn _mm512_mask_rol_epi32<const IMM8: i32>(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i32x16();
-        let r = vprold(a, IMM8);
-        transmute(simd_select_bitmask(k, r, src.as_i32x16()))
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_mask_rolv_epi32(src, k, a, _mm512_set1_epi32(IMM8))
 }
 
 /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19111,12 +19103,8 @@
 #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
 #[rustc_legacy_const_generics(2)]
 pub fn _mm512_maskz_rol_epi32<const IMM8: i32>(k: __mmask16, a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i32x16();
-        let r = vprold(a, IMM8);
-        transmute(simd_select_bitmask(k, r, i32x16::ZERO))
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_maskz_rolv_epi32(k, a, _mm512_set1_epi32(IMM8))
 }
 
 /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
@@ -19128,12 +19116,8 @@
 #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
 #[rustc_legacy_const_generics(1)]
 pub fn _mm256_rol_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i32x8();
-        let r = vprold256(a, IMM8);
-        transmute(r)
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_rolv_epi32(a, _mm256_set1_epi32(IMM8))
 }
 
 /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -19145,12 +19129,8 @@
 #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
 #[rustc_legacy_const_generics(3)]
 pub fn _mm256_mask_rol_epi32<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i32x8();
-        let r = vprold256(a, IMM8);
-        transmute(simd_select_bitmask(k, r, src.as_i32x8()))
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_mask_rolv_epi32(src, k, a, _mm256_set1_epi32(IMM8))
 }
 
 /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19162,12 +19142,8 @@
 #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
 #[rustc_legacy_const_generics(2)]
 pub fn _mm256_maskz_rol_epi32<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i32x8();
-        let r = vprold256(a, IMM8);
-        transmute(simd_select_bitmask(k, r, i32x8::ZERO))
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_maskz_rolv_epi32(k, a, _mm256_set1_epi32(IMM8))
 }
 
 /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
@@ -19179,12 +19155,8 @@
 #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
 #[rustc_legacy_const_generics(1)]
 pub fn _mm_rol_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i32x4();
-        let r = vprold128(a, IMM8);
-        transmute(r)
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_rolv_epi32(a, _mm_set1_epi32(IMM8))
 }
 
 /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -19196,12 +19168,8 @@
 #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
 #[rustc_legacy_const_generics(3)]
 pub fn _mm_mask_rol_epi32<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i32x4();
-        let r = vprold128(a, IMM8);
-        transmute(simd_select_bitmask(k, r, src.as_i32x4()))
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_mask_rolv_epi32(src, k, a, _mm_set1_epi32(IMM8))
 }
 
 /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19213,12 +19181,8 @@
 #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
 #[rustc_legacy_const_generics(2)]
 pub fn _mm_maskz_rol_epi32<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i32x4();
-        let r = vprold128(a, IMM8);
-        transmute(simd_select_bitmask(k, r, i32x4::ZERO))
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_maskz_rolv_epi32(k, a, _mm_set1_epi32(IMM8))
 }
 
 /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
@@ -19230,12 +19194,8 @@
 #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
 #[rustc_legacy_const_generics(1)]
 pub fn _mm512_ror_epi32<const IMM8: i32>(a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i32x16();
-        let r = vprord(a, IMM8);
-        transmute(r)
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_rorv_epi32(a, _mm512_set1_epi32(IMM8))
 }
 
 /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -19247,12 +19207,8 @@
 #[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
 #[rustc_legacy_const_generics(3)]
 pub fn _mm512_mask_ror_epi32<const IMM8: i32>(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i32x16();
-        let r = vprord(a, IMM8);
-        transmute(simd_select_bitmask(k, r, src.as_i32x16()))
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_mask_rorv_epi32(src, k, a, _mm512_set1_epi32(IMM8))
 }
 
 /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19264,12 +19220,8 @@
 #[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
 #[rustc_legacy_const_generics(2)]
 pub fn _mm512_maskz_ror_epi32<const IMM8: i32>(k: __mmask16, a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i32x16();
-        let r = vprord(a, IMM8);
-        transmute(simd_select_bitmask(k, r, i32x16::ZERO))
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_maskz_rorv_epi32(k, a, _mm512_set1_epi32(IMM8))
 }
 
 /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
@@ -19281,12 +19233,8 @@
 #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
 #[rustc_legacy_const_generics(1)]
 pub fn _mm256_ror_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i32x8();
-        let r = vprord256(a, IMM8);
-        transmute(r)
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_rorv_epi32(a, _mm256_set1_epi32(IMM8))
 }
 
 /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -19298,12 +19246,8 @@
 #[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
 #[rustc_legacy_const_generics(3)]
 pub fn _mm256_mask_ror_epi32<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i32x8();
-        let r = vprord256(a, IMM8);
-        transmute(simd_select_bitmask(k, r, src.as_i32x8()))
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_mask_rorv_epi32(src, k, a, _mm256_set1_epi32(IMM8))
 }
 
 /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19315,12 +19259,8 @@
 #[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
 #[rustc_legacy_const_generics(2)]
 pub fn _mm256_maskz_ror_epi32<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i32x8();
-        let r = vprord256(a, IMM8);
-        transmute(simd_select_bitmask(k, r, i32x8::ZERO))
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_maskz_rorv_epi32(k, a, _mm256_set1_epi32(IMM8))
 }
 
 /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
@@ -19332,12 +19272,8 @@
 #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
 #[rustc_legacy_const_generics(1)]
 pub fn _mm_ror_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i32x4();
-        let r = vprord128(a, IMM8);
-        transmute(r)
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_rorv_epi32(a, _mm_set1_epi32(IMM8))
 }
 
 /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -19349,12 +19285,8 @@
 #[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
 #[rustc_legacy_const_generics(3)]
 pub fn _mm_mask_ror_epi32<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i32x4();
-        let r = vprord128(a, IMM8);
-        transmute(simd_select_bitmask(k, r, src.as_i32x4()))
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_mask_rorv_epi32(src, k, a, _mm_set1_epi32(IMM8))
 }
 
 /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19366,12 +19298,8 @@
 #[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
 #[rustc_legacy_const_generics(2)]
 pub fn _mm_maskz_ror_epi32<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i32x4();
-        let r = vprord128(a, IMM8);
-        transmute(simd_select_bitmask(k, r, i32x4::ZERO))
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_maskz_rorv_epi32(k, a, _mm_set1_epi32(IMM8))
 }
 
 /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
@@ -19383,12 +19311,8 @@
 #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
 #[rustc_legacy_const_generics(1)]
 pub fn _mm512_rol_epi64<const IMM8: i32>(a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i64x8();
-        let r = vprolq(a, IMM8);
-        transmute(r)
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_rolv_epi64(a, _mm512_set1_epi64(IMM8 as i64))
 }
 
 /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -19400,12 +19324,8 @@
 #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
 #[rustc_legacy_const_generics(3)]
 pub fn _mm512_mask_rol_epi64<const IMM8: i32>(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i64x8();
-        let r = vprolq(a, IMM8);
-        transmute(simd_select_bitmask(k, r, src.as_i64x8()))
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_mask_rolv_epi64(src, k, a, _mm512_set1_epi64(IMM8 as i64))
 }
 
 /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19417,12 +19337,8 @@
 #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
 #[rustc_legacy_const_generics(2)]
 pub fn _mm512_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i64x8();
-        let r = vprolq(a, IMM8);
-        transmute(simd_select_bitmask(k, r, i64x8::ZERO))
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_maskz_rolv_epi64(k, a, _mm512_set1_epi64(IMM8 as i64))
 }
 
 /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
@@ -19434,12 +19350,8 @@
 #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
 #[rustc_legacy_const_generics(1)]
 pub fn _mm256_rol_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i64x4();
-        let r = vprolq256(a, IMM8);
-        transmute(r)
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_rolv_epi64(a, _mm256_set1_epi64x(IMM8 as i64))
 }
 
 /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -19451,12 +19363,8 @@
 #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
 #[rustc_legacy_const_generics(3)]
 pub fn _mm256_mask_rol_epi64<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i64x4();
-        let r = vprolq256(a, IMM8);
-        transmute(simd_select_bitmask(k, r, src.as_i64x4()))
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_mask_rolv_epi64(src, k, a, _mm256_set1_epi64x(IMM8 as i64))
 }
 
 /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19468,12 +19376,8 @@
 #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
 #[rustc_legacy_const_generics(2)]
 pub fn _mm256_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i64x4();
-        let r = vprolq256(a, IMM8);
-        transmute(simd_select_bitmask(k, r, i64x4::ZERO))
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_maskz_rolv_epi64(k, a, _mm256_set1_epi64x(IMM8 as i64))
 }
 
 /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
@@ -19485,12 +19389,8 @@
 #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
 #[rustc_legacy_const_generics(1)]
 pub fn _mm_rol_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i64x2();
-        let r = vprolq128(a, IMM8);
-        transmute(r)
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_rolv_epi64(a, _mm_set1_epi64x(IMM8 as i64))
 }
 
 /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -19502,12 +19402,8 @@
 #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
 #[rustc_legacy_const_generics(3)]
 pub fn _mm_mask_rol_epi64<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i64x2();
-        let r = vprolq128(a, IMM8);
-        transmute(simd_select_bitmask(k, r, src.as_i64x2()))
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_mask_rolv_epi64(src, k, a, _mm_set1_epi64x(IMM8 as i64))
 }
 
 /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19519,12 +19415,8 @@
 #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
 #[rustc_legacy_const_generics(2)]
 pub fn _mm_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i64x2();
-        let r = vprolq128(a, IMM8);
-        transmute(simd_select_bitmask(k, r, i64x2::ZERO))
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_maskz_rolv_epi64(k, a, _mm_set1_epi64x(IMM8 as i64))
 }
 
 /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
@@ -19536,12 +19428,8 @@
 #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
 #[rustc_legacy_const_generics(1)]
 pub fn _mm512_ror_epi64<const IMM8: i32>(a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i64x8();
-        let r = vprorq(a, IMM8);
-        transmute(r)
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_rorv_epi64(a, _mm512_set1_epi64(IMM8 as i64))
 }
 
 /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -19553,12 +19441,8 @@
 #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
 #[rustc_legacy_const_generics(3)]
 pub fn _mm512_mask_ror_epi64<const IMM8: i32>(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i64x8();
-        let r = vprorq(a, IMM8);
-        transmute(simd_select_bitmask(k, r, src.as_i64x8()))
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_mask_rorv_epi64(src, k, a, _mm512_set1_epi64(IMM8 as i64))
 }
 
 /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19570,12 +19454,8 @@
 #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
 #[rustc_legacy_const_generics(2)]
 pub fn _mm512_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m512i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i64x8();
-        let r = vprorq(a, IMM8);
-        transmute(simd_select_bitmask(k, r, i64x8::ZERO))
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm512_maskz_rorv_epi64(k, a, _mm512_set1_epi64(IMM8 as i64))
 }
 
 /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
@@ -19587,12 +19467,8 @@
 #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
 #[rustc_legacy_const_generics(1)]
 pub fn _mm256_ror_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i64x4();
-        let r = vprorq256(a, IMM8);
-        transmute(r)
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_rorv_epi64(a, _mm256_set1_epi64x(IMM8 as i64))
 }
 
 /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -19604,12 +19480,8 @@
 #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
 #[rustc_legacy_const_generics(3)]
 pub fn _mm256_mask_ror_epi64<const IMM8: i32>(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i64x4();
-        let r = vprorq256(a, IMM8);
-        transmute(simd_select_bitmask(k, r, src.as_i64x4()))
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_mask_rorv_epi64(src, k, a, _mm256_set1_epi64x(IMM8 as i64))
 }
 
 /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19621,12 +19493,8 @@
 #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
 #[rustc_legacy_const_generics(2)]
 pub fn _mm256_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i64x4();
-        let r = vprorq256(a, IMM8);
-        transmute(simd_select_bitmask(k, r, i64x4::ZERO))
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm256_maskz_rorv_epi64(k, a, _mm256_set1_epi64x(IMM8 as i64))
 }
 
 /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
@@ -19638,12 +19506,8 @@
 #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
 #[rustc_legacy_const_generics(1)]
 pub fn _mm_ror_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i64x2();
-        let r = vprorq128(a, IMM8);
-        transmute(r)
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_rorv_epi64(a, _mm_set1_epi64x(IMM8 as i64))
 }
 
 /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -19655,12 +19519,8 @@
 #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
 #[rustc_legacy_const_generics(3)]
 pub fn _mm_mask_ror_epi64<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i64x2();
-        let r = vprorq128(a, IMM8);
-        transmute(simd_select_bitmask(k, r, src.as_i64x2()))
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_mask_rorv_epi64(src, k, a, _mm_set1_epi64x(IMM8 as i64))
 }
 
 /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19672,12 +19532,8 @@
 #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
 #[rustc_legacy_const_generics(2)]
 pub fn _mm_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
-    unsafe {
-        static_assert_uimm_bits!(IMM8, 8);
-        let a = a.as_i64x2();
-        let r = vprorq128(a, IMM8);
-        transmute(simd_select_bitmask(k, r, i64x2::ZERO))
-    }
+    static_assert_uimm_bits!(IMM8, 8);
+    _mm_maskz_rorv_epi64(k, a, _mm_set1_epi64x(IMM8 as i64))
 }
 
 /// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.
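A usage sketch (AVX-512F + VL and a 1.89+ toolchain assumed) of the identity behind the forwarding above: an immediate rotate is the variable rotate applied to a splatted count.

fn main() {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
            unsafe {
                use std::arch::x86_64::*;
                let a = _mm_set1_epi32(0x8000_0001u32 as i32);
                let (mut x, mut y) = ([0u32; 4], [0u32; 4]);
                _mm_storeu_si128(x.as_mut_ptr() as *mut __m128i, _mm_rol_epi32::<4>(a));
                _mm_storeu_si128(y.as_mut_ptr() as *mut __m128i, _mm_rolv_epi32(a, _mm_set1_epi32(4)));
                assert_eq!(x, y);
                assert_eq!(x[0], 0x0000_0018); // 0x8000_0001 rotated left by 4
            }
        }
    }
}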
@@ -21084,7 +20940,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsravd))]
 pub fn _mm512_srav_epi32(a: __m512i, count: __m512i) -> __m512i {
-    unsafe { transmute(vpsravd(a.as_i32x16(), count.as_i32x16())) }
+    unsafe { transmute(simd_shr(a.as_i32x16(), count.as_i32x16())) }
 }
 
 /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21179,7 +21035,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsravq))]
 pub fn _mm512_srav_epi64(a: __m512i, count: __m512i) -> __m512i {
-    unsafe { transmute(vpsravq(a.as_i64x8(), count.as_i64x8())) }
+    unsafe { transmute(simd_shr(a.as_i64x8(), count.as_i64x8())) }
 }
 
 /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21218,7 +21074,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsravq))]
 pub fn _mm256_srav_epi64(a: __m256i, count: __m256i) -> __m256i {
-    unsafe { transmute(vpsravq256(a.as_i64x4(), count.as_i64x4())) }
+    unsafe { transmute(simd_shr(a.as_i64x4(), count.as_i64x4())) }
 }
 
 /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21257,7 +21113,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsravq))]
 pub fn _mm_srav_epi64(a: __m128i, count: __m128i) -> __m128i {
-    unsafe { transmute(vpsravq128(a.as_i64x2(), count.as_i64x2())) }
+    unsafe { transmute(simd_shr(a.as_i64x2(), count.as_i64x2())) }
 }
 
 /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21296,7 +21152,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vprolvd))]
 pub fn _mm512_rolv_epi32(a: __m512i, b: __m512i) -> __m512i {
-    unsafe { transmute(vprolvd(a.as_i32x16(), b.as_i32x16())) }
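+    // A funnel shift of a vector with itself is a rotate: each lane of a is rotated left by the corresponding lane of b.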
+    unsafe { transmute(simd_funnel_shl(a.as_u32x16(), a.as_u32x16(), b.as_u32x16())) }
 }
 
 /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21335,7 +21191,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vprolvd))]
 pub fn _mm256_rolv_epi32(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(vprolvd256(a.as_i32x8(), b.as_i32x8())) }
+    unsafe { transmute(simd_funnel_shl(a.as_u32x8(), a.as_u32x8(), b.as_u32x8())) }
 }
 
 /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21374,7 +21230,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vprolvd))]
 pub fn _mm_rolv_epi32(a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(vprolvd128(a.as_i32x4(), b.as_i32x4())) }
+    unsafe { transmute(simd_funnel_shl(a.as_u32x4(), a.as_u32x4(), b.as_u32x4())) }
 }
 
 /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21413,7 +21269,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vprorvd))]
 pub fn _mm512_rorv_epi32(a: __m512i, b: __m512i) -> __m512i {
-    unsafe { transmute(vprorvd(a.as_i32x16(), b.as_i32x16())) }
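+    // As with the left rotate, funnel-shifting a with itself right by b rotates each lane right.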
+    unsafe { transmute(simd_funnel_shr(a.as_u32x16(), a.as_u32x16(), b.as_u32x16())) }
 }
 
 /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21452,7 +21308,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vprorvd))]
 pub fn _mm256_rorv_epi32(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(vprorvd256(a.as_i32x8(), b.as_i32x8())) }
+    unsafe { transmute(simd_funnel_shr(a.as_u32x8(), a.as_u32x8(), b.as_u32x8())) }
 }
 
 /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21491,7 +21347,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vprorvd))]
 pub fn _mm_rorv_epi32(a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(vprorvd128(a.as_i32x4(), b.as_i32x4())) }
+    unsafe { transmute(simd_funnel_shr(a.as_u32x4(), a.as_u32x4(), b.as_u32x4())) }
 }
 
 /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21530,7 +21386,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vprolvq))]
 pub fn _mm512_rolv_epi64(a: __m512i, b: __m512i) -> __m512i {
-    unsafe { transmute(vprolvq(a.as_i64x8(), b.as_i64x8())) }
+    unsafe { transmute(simd_funnel_shl(a.as_u64x8(), a.as_u64x8(), b.as_u64x8())) }
 }
 
 /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21569,7 +21425,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vprolvq))]
 pub fn _mm256_rolv_epi64(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(vprolvq256(a.as_i64x4(), b.as_i64x4())) }
+    unsafe { transmute(simd_funnel_shl(a.as_u64x4(), a.as_u64x4(), b.as_u64x4())) }
 }
 
 /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21608,7 +21464,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vprolvq))]
 pub fn _mm_rolv_epi64(a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(vprolvq128(a.as_i64x2(), b.as_i64x2())) }
+    unsafe { transmute(simd_funnel_shl(a.as_u64x2(), a.as_u64x2(), b.as_u64x2())) }
 }
 
 /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21647,7 +21503,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vprorvq))]
 pub fn _mm512_rorv_epi64(a: __m512i, b: __m512i) -> __m512i {
-    unsafe { transmute(vprorvq(a.as_i64x8(), b.as_i64x8())) }
+    unsafe { transmute(simd_funnel_shr(a.as_u64x8(), a.as_u64x8(), b.as_u64x8())) }
 }
 
 /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21686,7 +21542,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vprorvq))]
 pub fn _mm256_rorv_epi64(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(vprorvq256(a.as_i64x4(), b.as_i64x4())) }
+    unsafe { transmute(simd_funnel_shr(a.as_u64x4(), a.as_u64x4(), b.as_u64x4())) }
 }
 
 /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21725,7 +21581,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vprorvq))]
 pub fn _mm_rorv_epi64(a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(vprorvq128(a.as_i64x2(), b.as_i64x2())) }
+    unsafe { transmute(simd_funnel_shr(a.as_u64x2(), a.as_u64x2(), b.as_u64x2())) }
 }
 
 /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21764,7 +21620,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsllvd))]
 pub fn _mm512_sllv_epi32(a: __m512i, count: __m512i) -> __m512i {
-    unsafe { transmute(vpsllvd(a.as_i32x16(), count.as_i32x16())) }
+    unsafe { transmute(simd_shl(a.as_u32x16(), count.as_u32x16())) }
 }
 
 /// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21859,7 +21715,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsrlvd))]
 pub fn _mm512_srlv_epi32(a: __m512i, count: __m512i) -> __m512i {
-    unsafe { transmute(vpsrlvd(a.as_i32x16(), count.as_i32x16())) }
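+    // With unsigned (u32) lanes, simd_shr performs a logical (zero-filling) right shift.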
+    unsafe { transmute(simd_shr(a.as_u32x16(), count.as_u32x16())) }
 }
 
 /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21954,7 +21810,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsllvq))]
 pub fn _mm512_sllv_epi64(a: __m512i, count: __m512i) -> __m512i {
-    unsafe { transmute(vpsllvq(a.as_i64x8(), count.as_i64x8())) }
+    unsafe { transmute(simd_shl(a.as_u64x8(), count.as_u64x8())) }
 }
 
 /// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -22049,7 +21905,7 @@
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsrlvq))]
 pub fn _mm512_srlv_epi64(a: __m512i, count: __m512i) -> __m512i {
-    unsafe { transmute(vpsrlvq(a.as_i64x8(), count.as_i64x8())) }
+    unsafe { transmute(simd_shr(a.as_u64x8(), count.as_u64x8())) }
 }
 
 /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -42902,71 +42758,6 @@
     #[link_name = "llvm.x86.avx512.mask.cmp.pd.128"]
     fn vcmppd128(a: f64x2, b: f64x2, op: i32, m: i8) -> i8;
 
-    #[link_name = "llvm.x86.avx512.mask.prol.d.512"]
-    fn vprold(a: i32x16, i8: i32) -> i32x16;
-    #[link_name = "llvm.x86.avx512.mask.prol.d.256"]
-    fn vprold256(a: i32x8, i8: i32) -> i32x8;
-    #[link_name = "llvm.x86.avx512.mask.prol.d.128"]
-    fn vprold128(a: i32x4, i8: i32) -> i32x4;
-
-    #[link_name = "llvm.x86.avx512.mask.pror.d.512"]
-    fn vprord(a: i32x16, i8: i32) -> i32x16;
-    #[link_name = "llvm.x86.avx512.mask.pror.d.256"]
-    fn vprord256(a: i32x8, i8: i32) -> i32x8;
-    #[link_name = "llvm.x86.avx512.mask.pror.d.128"]
-    fn vprord128(a: i32x4, i8: i32) -> i32x4;
-
-    #[link_name = "llvm.x86.avx512.mask.prol.q.512"]
-    fn vprolq(a: i64x8, i8: i32) -> i64x8;
-    #[link_name = "llvm.x86.avx512.mask.prol.q.256"]
-    fn vprolq256(a: i64x4, i8: i32) -> i64x4;
-    #[link_name = "llvm.x86.avx512.mask.prol.q.128"]
-    fn vprolq128(a: i64x2, i8: i32) -> i64x2;
-
-    #[link_name = "llvm.x86.avx512.mask.pror.q.512"]
-    fn vprorq(a: i64x8, i8: i32) -> i64x8;
-    #[link_name = "llvm.x86.avx512.mask.pror.q.256"]
-    fn vprorq256(a: i64x4, i8: i32) -> i64x4;
-    #[link_name = "llvm.x86.avx512.mask.pror.q.128"]
-    fn vprorq128(a: i64x2, i8: i32) -> i64x2;
-
-    #[link_name = "llvm.x86.avx512.mask.prolv.d.512"]
-    fn vprolvd(a: i32x16, b: i32x16) -> i32x16;
-    #[link_name = "llvm.x86.avx512.mask.prolv.d.256"]
-    fn vprolvd256(a: i32x8, b: i32x8) -> i32x8;
-    #[link_name = "llvm.x86.avx512.mask.prolv.d.128"]
-    fn vprolvd128(a: i32x4, b: i32x4) -> i32x4;
-
-    #[link_name = "llvm.x86.avx512.mask.prorv.d.512"]
-    fn vprorvd(a: i32x16, b: i32x16) -> i32x16;
-    #[link_name = "llvm.x86.avx512.mask.prorv.d.256"]
-    fn vprorvd256(a: i32x8, b: i32x8) -> i32x8;
-    #[link_name = "llvm.x86.avx512.mask.prorv.d.128"]
-    fn vprorvd128(a: i32x4, b: i32x4) -> i32x4;
-
-    #[link_name = "llvm.x86.avx512.mask.prolv.q.512"]
-    fn vprolvq(a: i64x8, b: i64x8) -> i64x8;
-    #[link_name = "llvm.x86.avx512.mask.prolv.q.256"]
-    fn vprolvq256(a: i64x4, b: i64x4) -> i64x4;
-    #[link_name = "llvm.x86.avx512.mask.prolv.q.128"]
-    fn vprolvq128(a: i64x2, b: i64x2) -> i64x2;
-
-    #[link_name = "llvm.x86.avx512.mask.prorv.q.512"]
-    fn vprorvq(a: i64x8, b: i64x8) -> i64x8;
-    #[link_name = "llvm.x86.avx512.mask.prorv.q.256"]
-    fn vprorvq256(a: i64x4, b: i64x4) -> i64x4;
-    #[link_name = "llvm.x86.avx512.mask.prorv.q.128"]
-    fn vprorvq128(a: i64x2, b: i64x2) -> i64x2;
-
-    #[link_name = "llvm.x86.avx512.psllv.d.512"]
-    fn vpsllvd(a: i32x16, b: i32x16) -> i32x16;
-    #[link_name = "llvm.x86.avx512.psrlv.d.512"]
-    fn vpsrlvd(a: i32x16, b: i32x16) -> i32x16;
-    #[link_name = "llvm.x86.avx512.psllv.q.512"]
-    fn vpsllvq(a: i64x8, b: i64x8) -> i64x8;
-    #[link_name = "llvm.x86.avx512.psrlv.q.512"]
-    fn vpsrlvq(a: i64x8, b: i64x8) -> i64x8;
-
     #[link_name = "llvm.x86.avx512.psll.d.512"]
     fn vpslld(a: i32x16, count: i32x4) -> i32x16;
     #[link_name = "llvm.x86.avx512.psrl.d.512"]
@@ -42986,16 +42777,6 @@
     #[link_name = "llvm.x86.avx512.psra.q.128"]
     fn vpsraq128(a: i64x2, count: i64x2) -> i64x2;
 
-    #[link_name = "llvm.x86.avx512.psrav.d.512"]
-    fn vpsravd(a: i32x16, count: i32x16) -> i32x16;
-
-    #[link_name = "llvm.x86.avx512.psrav.q.512"]
-    fn vpsravq(a: i64x8, count: i64x8) -> i64x8;
-    #[link_name = "llvm.x86.avx512.psrav.q.256"]
-    fn vpsravq256(a: i64x4, count: i64x4) -> i64x4;
-    #[link_name = "llvm.x86.avx512.psrav.q.128"]
-    fn vpsravq128(a: i64x2, count: i64x2) -> i64x2;
-
     #[link_name = "llvm.x86.avx512.vpermilvar.ps.512"]
     fn vpermilps(a: f32x16, b: i32x16) -> f32x16;
     #[link_name = "llvm.x86.avx512.vpermilvar.pd.512"]
diff --git a/crates/core_arch/src/x86/avx512fp16.rs b/crates/core_arch/src/x86/avx512fp16.rs
index a86fc71..a8cf1f2 100644
--- a/crates/core_arch/src/x86/avx512fp16.rs
+++ b/crates/core_arch/src/x86/avx512fp16.rs
@@ -1615,7 +1615,7 @@
 #[cfg_attr(test, assert_instr(vaddsh))]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
 pub fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h {
-    _mm_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
+    unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) + _mm_cvtsh_h(b)) }
 }
 
 /// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
@@ -1628,7 +1628,16 @@
 #[cfg_attr(test, assert_instr(vaddsh))]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
 pub fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
-    _mm_mask_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+    unsafe {
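+        // Scalar operation: only bit 0 of the mask selects between the computed sum and the passthrough element from src.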
+        let extractsrc: f16 = simd_extract!(src, 0);
+        let mut add: f16 = extractsrc;
+        if (k & 0b00000001) != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            add = extracta + extractb;
+        }
+        simd_insert!(a, 0, add)
+    }
 }
 
 /// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
@@ -1641,7 +1650,15 @@
 #[cfg_attr(test, assert_instr(vaddsh))]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
 pub fn _mm_maskz_add_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
-    _mm_maskz_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
+    unsafe {
+        let mut add: f16 = 0.;
+        if (k & 0b00000001) != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            add = extracta + extractb;
+        }
+        simd_insert!(a, 0, add)
+    }
 }
 
 /// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
@@ -1927,7 +1944,7 @@
 #[cfg_attr(test, assert_instr(vsubsh))]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
 pub fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h {
-    _mm_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
+    unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) - _mm_cvtsh_h(b)) }
 }
 
 /// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
@@ -1940,7 +1957,16 @@
 #[cfg_attr(test, assert_instr(vsubsh))]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
 pub fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
-    _mm_mask_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+    unsafe {
+        let extractsrc: f16 = simd_extract!(src, 0);
+        let mut sub: f16 = extractsrc;
+        if (k & 0b00000001) != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            sub = extracta - extractb;
+        }
+        simd_insert!(a, 0, sub)
+    }
 }
 
 /// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
@@ -1953,7 +1979,15 @@
 #[cfg_attr(test, assert_instr(vsubsh))]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
 pub fn _mm_maskz_sub_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
-    _mm_maskz_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
+    unsafe {
+        let mut sub: f16 = 0.;
+        if (k & 0b00000001) != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            sub = extracta - extractb;
+        }
+        simd_insert!(a, 0, sub)
+    }
 }
 
 /// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
@@ -2239,7 +2273,7 @@
 #[cfg_attr(test, assert_instr(vmulsh))]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
 pub fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h {
-    _mm_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
+    unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) * _mm_cvtsh_h(b)) }
 }
 
 /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
@@ -2252,7 +2286,16 @@
 #[cfg_attr(test, assert_instr(vmulsh))]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
 pub fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
-    _mm_mask_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+    unsafe {
+        let extractsrc: f16 = simd_extract!(src, 0);
+        let mut mul: f16 = extractsrc;
+        if (k & 0b00000001) != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            mul = extracta * extractb;
+        }
+        simd_insert!(a, 0, mul)
+    }
 }
 
 /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
@@ -2265,7 +2308,15 @@
 #[cfg_attr(test, assert_instr(vmulsh))]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
 pub fn _mm_maskz_mul_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
-    _mm_maskz_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
+    unsafe {
+        let mut mul: f16 = 0.;
+        if (k & 0b00000001) != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            mul = extracta * extractb;
+        }
+        simd_insert!(a, 0, mul)
+    }
 }
 
 /// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
@@ -2551,7 +2602,7 @@
 #[cfg_attr(test, assert_instr(vdivsh))]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
 pub fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h {
-    _mm_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
+    unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) / _mm_cvtsh_h(b)) }
 }
 
 /// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
@@ -2564,7 +2615,16 @@
 #[cfg_attr(test, assert_instr(vdivsh))]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
 pub fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
-    _mm_mask_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+    unsafe {
+        let extractsrc: f16 = simd_extract!(src, 0);
+        let mut div: f16 = extractsrc;
+        if (k & 0b00000001) != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            div = extracta / extractb;
+        }
+        simd_insert!(a, 0, div)
+    }
 }
 
 /// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
@@ -2577,7 +2637,15 @@
 #[cfg_attr(test, assert_instr(vdivsh))]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
 pub fn _mm_maskz_div_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
-    _mm_maskz_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
+    unsafe {
+        let mut div: f16 = 0.;
+        if (k & 0b00000001) != 0 {
+            let extracta: f16 = simd_extract!(a, 0);
+            let extractb: f16 = simd_extract!(b, 0);
+            div = extracta / extractb;
+        }
+        simd_insert!(a, 0, div)
+    }
 }
 
 /// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
diff --git a/crates/core_arch/src/x86/f16c.rs b/crates/core_arch/src/x86/f16c.rs
index 7686b31..519cc38 100644
--- a/crates/core_arch/src/x86/f16c.rs
+++ b/crates/core_arch/src/x86/f16c.rs
@@ -3,16 +3,13 @@
 //! [F16C intrinsics]: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=fp16&expand=1769
 
 use crate::core_arch::{simd::*, x86::*};
+use crate::intrinsics::simd::*;
 
 #[cfg(test)]
 use stdarch_test::assert_instr;
 
 #[allow(improper_ctypes)]
 unsafe extern "unadjusted" {
-    #[link_name = "llvm.x86.vcvtph2ps.128"]
-    fn llvm_vcvtph2ps_128(a: i16x8) -> f32x4;
-    #[link_name = "llvm.x86.vcvtph2ps.256"]
-    fn llvm_vcvtph2ps_256(a: i16x8) -> f32x8;
     #[link_name = "llvm.x86.vcvtps2ph.128"]
     fn llvm_vcvtps2ph_128(a: f32x4, rounding: i32) -> i16x8;
     #[link_name = "llvm.x86.vcvtps2ph.256"]
@@ -29,7 +26,11 @@
 #[cfg_attr(test, assert_instr("vcvtph2ps"))]
 #[stable(feature = "x86_f16c_intrinsics", since = "1.68.0")]
 pub fn _mm_cvtph_ps(a: __m128i) -> __m128 {
-    unsafe { transmute(llvm_vcvtph2ps_128(transmute(a))) }
+    unsafe {
+        let a: f16x8 = transmute(a);
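+        // Keep only the low four half-precision lanes, then widen each to f32.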
+        let a: f16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
+        simd_cast(a)
+    }
 }
 
 /// Converts the 8 x 16-bit half-precision float values in the 128-bit vector
@@ -41,7 +42,10 @@
 #[cfg_attr(test, assert_instr("vcvtph2ps"))]
 #[stable(feature = "x86_f16c_intrinsics", since = "1.68.0")]
 pub fn _mm256_cvtph_ps(a: __m128i) -> __m256 {
-    unsafe { transmute(llvm_vcvtph2ps_256(transmute(a))) }
+    unsafe {
+        let a: f16x8 = transmute(a);
+        simd_cast(a)
+    }
 }
 
 /// Converts the 4 x 32-bit float values in the 128-bit vector `a` into 4 x
diff --git a/crates/core_arch/src/x86/sse.rs b/crates/core_arch/src/x86/sse.rs
index 1eca66a..c5c6dc2 100644
--- a/crates/core_arch/src/x86/sse.rs
+++ b/crates/core_arch/src/x86/sse.rs
@@ -882,7 +882,7 @@
 #[cfg_attr(test, assert_instr(cvtsi2ss))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 {
-    unsafe { cvtsi2ss(a, b) }
+    unsafe { simd_insert!(a, 0, b as f32) }
 }
 
 /// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html).
@@ -1989,8 +1989,6 @@
     fn cvtss2si(a: __m128) -> i32;
     #[link_name = "llvm.x86.sse.cvttss2si"]
     fn cvttss2si(a: __m128) -> i32;
-    #[link_name = "llvm.x86.sse.cvtsi2ss"]
-    fn cvtsi2ss(a: __m128, b: i32) -> __m128;
     #[link_name = "llvm.x86.sse.sfence"]
     fn sfence();
     #[link_name = "llvm.x86.sse.stmxcsr"]
diff --git a/crates/core_arch/src/x86/sse2.rs b/crates/core_arch/src/x86/sse2.rs
index 1eaa896..c9530a2 100644
--- a/crates/core_arch/src/x86/sse2.rs
+++ b/crates/core_arch/src/x86/sse2.rs
@@ -201,7 +201,12 @@
 #[cfg_attr(test, assert_instr(pmaddwd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) }
+    unsafe {
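+        // Widen both inputs to i32, multiply, then add each adjacent (even, odd) pair of products.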
+        let r: i32x8 = simd_mul(simd_cast(a.as_i16x8()), simd_cast(b.as_i16x8()));
+        let even: i32x4 = simd_shuffle!(r, r, [0, 2, 4, 6]);
+        let odd: i32x4 = simd_shuffle!(r, r, [1, 3, 5, 7]);
+        simd_add(even, odd).as_m128i()
+    }
 }
 
 /// Compares packed 16-bit integers in `a` and `b`, and returns the packed
@@ -2417,7 +2422,10 @@
 #[cfg_attr(test, assert_instr(cvtss2sd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d {
-    unsafe { cvtss2sd(a, b) }
+    unsafe {
+        let elt: f32 = simd_extract!(b, 0);
+        simd_insert!(a, 0, elt as f64)
+    }
 }
 
 /// Converts packed double-precision (64-bit) floating-point elements in `a` to
@@ -3043,8 +3051,6 @@
     fn lfence();
     #[link_name = "llvm.x86.sse2.mfence"]
     fn mfence();
-    #[link_name = "llvm.x86.sse2.pmadd.wd"]
-    fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
     #[link_name = "llvm.x86.sse2.psad.bw"]
     fn psadbw(a: u8x16, b: u8x16) -> u64x2;
     #[link_name = "llvm.x86.sse2.psll.w"]
@@ -3115,8 +3121,6 @@
     fn cvtsd2si(a: __m128d) -> i32;
     #[link_name = "llvm.x86.sse2.cvtsd2ss"]
     fn cvtsd2ss(a: __m128, b: __m128d) -> __m128;
-    #[link_name = "llvm.x86.sse2.cvtss2sd"]
-    fn cvtss2sd(a: __m128d, b: __m128) -> __m128d;
     #[link_name = "llvm.x86.sse2.cvttpd2dq"]
     fn cvttpd2dq(a: __m128d) -> i32x4;
     #[link_name = "llvm.x86.sse2.cvttsd2si"]
diff --git a/crates/core_arch/src/x86/sse3.rs b/crates/core_arch/src/x86/sse3.rs
index 7a32cfe..79be7a7 100644
--- a/crates/core_arch/src/x86/sse3.rs
+++ b/crates/core_arch/src/x86/sse3.rs
@@ -51,7 +51,11 @@
 #[cfg_attr(test, assert_instr(haddpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_hadd_pd(a: __m128d, b: __m128d) -> __m128d {
-    unsafe { haddpd(a, b) }
+    unsafe {
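+        // even = [a0, b0] and odd = [a1, b1], so their sum is [a0 + a1, b0 + b1].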
+        let even = simd_shuffle!(a, b, [0, 2]);
+        let odd = simd_shuffle!(a, b, [1, 3]);
+        simd_add(even, odd)
+    }
 }
 
 /// Horizontally adds adjacent pairs of single-precision (32-bit)
@@ -63,7 +67,11 @@
 #[cfg_attr(test, assert_instr(haddps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_hadd_ps(a: __m128, b: __m128) -> __m128 {
-    unsafe { haddps(a, b) }
+    unsafe {
+        let even = simd_shuffle!(a, b, [0, 2, 4, 6]);
+        let odd = simd_shuffle!(a, b, [1, 3, 5, 7]);
+        simd_add(even, odd)
+    }
 }
 
 /// Horizontally subtracts adjacent pairs of double-precision (64-bit)
@@ -75,7 +83,11 @@
 #[cfg_attr(test, assert_instr(hsubpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_hsub_pd(a: __m128d, b: __m128d) -> __m128d {
-    unsafe { hsubpd(a, b) }
+    unsafe {
+        let even = simd_shuffle!(a, b, [0, 2]);
+        let odd = simd_shuffle!(a, b, [1, 3]);
+        simd_sub(even, odd)
+    }
 }
 
 /// Horizontally subtracts adjacent pairs of single-precision (32-bit)
@@ -87,7 +99,11 @@
 #[cfg_attr(test, assert_instr(hsubps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_hsub_ps(a: __m128, b: __m128) -> __m128 {
-    unsafe { hsubps(a, b) }
+    unsafe {
+        let even = simd_shuffle!(a, b, [0, 2, 4, 6]);
+        let odd = simd_shuffle!(a, b, [1, 3, 5, 7]);
+        simd_sub(even, odd)
+    }
 }
 
 /// Loads 128 bits of integer data from unaligned memory.
@@ -153,14 +169,6 @@
 
 #[allow(improper_ctypes)]
 unsafe extern "C" {
-    #[link_name = "llvm.x86.sse3.hadd.pd"]
-    fn haddpd(a: __m128d, b: __m128d) -> __m128d;
-    #[link_name = "llvm.x86.sse3.hadd.ps"]
-    fn haddps(a: __m128, b: __m128) -> __m128;
-    #[link_name = "llvm.x86.sse3.hsub.pd"]
-    fn hsubpd(a: __m128d, b: __m128d) -> __m128d;
-    #[link_name = "llvm.x86.sse3.hsub.ps"]
-    fn hsubps(a: __m128, b: __m128) -> __m128;
     #[link_name = "llvm.x86.sse3.ldu.dq"]
     fn lddqu(mem_addr: *const i8) -> i8x16;
 }
diff --git a/crates/core_arch/src/x86/sse41.rs b/crates/core_arch/src/x86/sse41.rs
index 9aa200d..f457c74 100644
--- a/crates/core_arch/src/x86/sse41.rs
+++ b/crates/core_arch/src/x86/sse41.rs
@@ -1006,7 +1006,10 @@
 #[cfg_attr(test, assert_instr(ptest))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
-    unsafe { ptestz(a.as_i64x2(), mask.as_i64x2()) }
+    unsafe {
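+        // ZF semantics: the result is 1 exactly when a AND mask is all zeros.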
+        let r = simd_reduce_or(simd_and(a.as_i64x2(), mask.as_i64x2()));
+        (0i64 == r) as i32
+    }
 }
 
 /// Tests whether the specified bits in a 128-bit integer vector are all
@@ -1029,7 +1032,13 @@
 #[cfg_attr(test, assert_instr(ptest))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
-    unsafe { ptestc(a.as_i64x2(), mask.as_i64x2()) }
+    unsafe {
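+        // CF semantics: the result is 1 exactly when (NOT a) AND mask is all zeros.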
+        let r = simd_reduce_or(simd_and(
+            simd_xor(a.as_i64x2(), i64x2::splat(!0)),
+            mask.as_i64x2(),
+        ));
+        (0i64 == r) as i32
+    }
 }
 
 /// Tests whether the specified bits in a 128-bit integer vector are
@@ -1165,10 +1174,6 @@
     fn phminposuw(a: u16x8) -> u16x8;
     #[link_name = "llvm.x86.sse41.mpsadbw"]
     fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
-    #[link_name = "llvm.x86.sse41.ptestz"]
-    fn ptestz(a: i64x2, mask: i64x2) -> i32;
-    #[link_name = "llvm.x86.sse41.ptestc"]
-    fn ptestc(a: i64x2, mask: i64x2) -> i32;
     #[link_name = "llvm.x86.sse41.ptestnzc"]
     fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
 }
diff --git a/crates/core_arch/src/x86/ssse3.rs b/crates/core_arch/src/x86/ssse3.rs
index 2be182e..ac067bd 100644
--- a/crates/core_arch/src/x86/ssse3.rs
+++ b/crates/core_arch/src/x86/ssse3.rs
@@ -164,7 +164,13 @@
 #[cfg_attr(test, assert_instr(phaddw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(phaddw128(a.as_i16x8(), b.as_i16x8())) }
+    let a = a.as_i16x8();
+    let b = b.as_i16x8();
+    unsafe {
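+        // Shuffle indices 0-7 select from a and 8-15 from b: even/odd pick the two members of each adjacent pair before the add.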
+        let even: i16x8 = simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]);
+        let odd: i16x8 = simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]);
+        simd_add(even, odd).as_m128i()
+    }
 }
 
 /// Horizontally adds the adjacent pairs of values contained in 2 packed
@@ -189,7 +195,13 @@
 #[cfg_attr(test, assert_instr(phaddd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(phaddd128(a.as_i32x4(), b.as_i32x4())) }
+    let a = a.as_i32x4();
+    let b = b.as_i32x4();
+    unsafe {
+        let even: i32x4 = simd_shuffle!(a, b, [0, 2, 4, 6]);
+        let odd: i32x4 = simd_shuffle!(a, b, [1, 3, 5, 7]);
+        simd_add(even, odd).as_m128i()
+    }
 }
 
 /// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -201,7 +213,13 @@
 #[cfg_attr(test, assert_instr(phsubw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(phsubw128(a.as_i16x8(), b.as_i16x8())) }
+    let a = a.as_i16x8();
+    let b = b.as_i16x8();
+    unsafe {
+        let even: i16x8 = simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]);
+        let odd: i16x8 = simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]);
+        simd_sub(even, odd).as_m128i()
+    }
 }
 
 /// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -227,7 +245,13 @@
 #[cfg_attr(test, assert_instr(phsubd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i {
-    unsafe { transmute(phsubd128(a.as_i32x4(), b.as_i32x4())) }
+    let a = a.as_i32x4();
+    let b = b.as_i32x4();
+    unsafe {
+        let even: i32x4 = simd_shuffle!(a, b, [0, 2, 4, 6]);
+        let odd: i32x4 = simd_shuffle!(a, b, [1, 3, 5, 7]);
+        simd_sub(even, odd).as_m128i()
+    }
 }
 
 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
@@ -305,24 +329,12 @@
     #[link_name = "llvm.x86.ssse3.pshuf.b.128"]
     fn pshufb128(a: u8x16, b: u8x16) -> u8x16;
 
-    #[link_name = "llvm.x86.ssse3.phadd.w.128"]
-    fn phaddw128(a: i16x8, b: i16x8) -> i16x8;
-
     #[link_name = "llvm.x86.ssse3.phadd.sw.128"]
     fn phaddsw128(a: i16x8, b: i16x8) -> i16x8;
 
-    #[link_name = "llvm.x86.ssse3.phadd.d.128"]
-    fn phaddd128(a: i32x4, b: i32x4) -> i32x4;
-
-    #[link_name = "llvm.x86.ssse3.phsub.w.128"]
-    fn phsubw128(a: i16x8, b: i16x8) -> i16x8;
-
     #[link_name = "llvm.x86.ssse3.phsub.sw.128"]
     fn phsubsw128(a: i16x8, b: i16x8) -> i16x8;
 
-    #[link_name = "llvm.x86.ssse3.phsub.d.128"]
-    fn phsubd128(a: i32x4, b: i32x4) -> i32x4;
-
     #[link_name = "llvm.x86.ssse3.pmadd.ub.sw.128"]
     fn pmaddubsw128(a: u8x16, b: i8x16) -> i16x8;
 
diff --git a/crates/core_arch/src/x86_64/adx.rs b/crates/core_arch/src/x86_64/adx.rs
index bdc534b..cf378cc 100644
--- a/crates/core_arch/src/x86_64/adx.rs
+++ b/crates/core_arch/src/x86_64/adx.rs
@@ -5,8 +5,6 @@
 unsafe extern "unadjusted" {
     #[link_name = "llvm.x86.addcarry.64"]
     fn llvm_addcarry_u64(a: u8, b: u64, c: u64) -> (u8, u64);
-    #[link_name = "llvm.x86.addcarryx.u64"]
-    fn llvm_addcarryx_u64(a: u8, b: u64, c: u64, d: *mut u64) -> u8;
     #[link_name = "llvm.x86.subborrow.64"]
     fn llvm_subborrow_u64(a: u8, b: u64, c: u64) -> (u8, u64);
 }
@@ -35,7 +33,7 @@
 #[cfg_attr(test, assert_instr(adc))]
 #[stable(feature = "simd_x86_adx", since = "1.33.0")]
 pub unsafe fn _addcarryx_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
-    llvm_addcarryx_u64(c_in, a, b, out as *mut _)
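+    // _addcarryx_u64 has the same add-with-carry semantics as _addcarry_u64; the adcx/adox encodings differ only in which flag they use, so forwarding is sound.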
+    _addcarry_u64(c_in, a, b, out)
 }
 
 /// Adds unsigned 64-bit integers `a` and `b` with unsigned 8-bit carry-in `c_in`.
@@ -95,27 +93,27 @@
         let a = u64::MAX;
         let mut out = 0;
 
-        let r = _addcarry_u64(0, a, 1, &mut out);
+        let r = _addcarryx_u64(0, a, 1, &mut out);
         assert_eq!(r, 1);
         assert_eq!(out, 0);
 
-        let r = _addcarry_u64(0, a, 0, &mut out);
+        let r = _addcarryx_u64(0, a, 0, &mut out);
         assert_eq!(r, 0);
         assert_eq!(out, a);
 
-        let r = _addcarry_u64(1, a, 1, &mut out);
+        let r = _addcarryx_u64(1, a, 1, &mut out);
         assert_eq!(r, 1);
         assert_eq!(out, 1);
 
-        let r = _addcarry_u64(1, a, 0, &mut out);
+        let r = _addcarryx_u64(1, a, 0, &mut out);
         assert_eq!(r, 1);
         assert_eq!(out, 0);
 
-        let r = _addcarry_u64(0, 3, 4, &mut out);
+        let r = _addcarryx_u64(0, 3, 4, &mut out);
         assert_eq!(r, 0);
         assert_eq!(out, 7);
 
-        let r = _addcarry_u64(1, 3, 4, &mut out);
+        let r = _addcarryx_u64(1, 3, 4, &mut out);
         assert_eq!(r, 0);
         assert_eq!(out, 8);
     }
diff --git a/crates/core_arch/src/x86_64/sse.rs b/crates/core_arch/src/x86_64/sse.rs
index 863c3cd..6bd7ec8 100644
--- a/crates/core_arch/src/x86_64/sse.rs
+++ b/crates/core_arch/src/x86_64/sse.rs
@@ -11,8 +11,6 @@
     fn cvtss2si64(a: __m128) -> i64;
     #[link_name = "llvm.x86.sse.cvttss2si64"]
     fn cvttss2si64(a: __m128) -> i64;
-    #[link_name = "llvm.x86.sse.cvtsi642ss"]
-    fn cvtsi642ss(a: __m128, b: i64) -> __m128;
 }
 
 /// Converts the lowest 32-bit float in the input vector to a 64-bit integer.
@@ -65,7 +63,7 @@
 #[cfg_attr(test, assert_instr(cvtsi2ss))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_cvtsi64_ss(a: __m128, b: i64) -> __m128 {
-    unsafe { cvtsi642ss(a, b) }
+    unsafe { simd_insert!(a, 0, b as f32) }
 }
 
 #[cfg(test)]