| //! [F16C intrinsics]. |
| //! |
| //! [F16C intrinsics]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=fp16&expand=1769 |
| |
| use crate::{ |
| core_arch::{simd::*, x86::*}, |
| hint::unreachable_unchecked, |
| mem::transmute, |
| }; |
| |
| #[cfg(test)] |
| use stdarch_test::assert_instr; |
| |
| #[allow(improper_ctypes)] |
| extern "unadjusted" { |
| #[link_name = "llvm.x86.vcvtph2ps.128"] |
| fn llvm_vcvtph2ps_128(a: i16x8) -> f32x4; |
| #[link_name = "llvm.x86.vcvtph2ps.256"] |
| fn llvm_vcvtph2ps_256(a: i16x8) -> f32x8; |
| #[link_name = "llvm.x86.vcvtps2ph.128"] |
| fn llvm_vcvtps2ph_128(a: f32x4, rounding: i32) -> i16x8; |
| #[link_name = "llvm.x86.vcvtps2ph.256"] |
| fn llvm_vcvtps2ph_256(a: f32x8, rounding: i32) -> i16x8; |
| } |
| |
| /// Converts the 4 x 16-bit half-precision float values in the lowest 64-bit of |
| /// the 128-bit vector `a` into 4 x 32-bit float values stored in a 128-bit wide |
| /// vector. |
| #[inline] |
| #[target_feature(enable = "f16c")] |
| #[cfg_attr(test, assert_instr("vcvtph2ps"))] |
| pub unsafe fn _mm_cvtph_ps(a: __m128i) -> __m128 { |
| transmute(llvm_vcvtph2ps_128(transmute(a))) |
| } |
| |
| /// Converts the 8 x 16-bit half-precision float values in the 128-bit vector |
| /// `a` into 8 x 32-bit float values stored in a 256-bit wide vector. |
| #[inline] |
| #[target_feature(enable = "f16c")] |
| #[cfg_attr(test, assert_instr("vcvtph2ps"))] |
| pub unsafe fn _mm256_cvtph_ps(a: __m128i) -> __m256 { |
| transmute(llvm_vcvtph2ps_256(transmute(a))) |
| } |
| |
| macro_rules! dispatch_rounding { |
| ($rounding:ident, $call:ident) => {{ |
| match $rounding { |
| 0 => call!(0), |
| 1 => call!(1), |
| 2 => call!(2), |
| 3 => call!(3), |
| 4 => call!(4), |
| 5 => call!(5), |
| 6 => call!(6), |
| 7 => call!(7), |
| _ => unreachable_unchecked(), |
| } |
| }}; |
| } |
| |
| /// Converts the 4 x 32-bit float values in the 128-bit vector `a` into 4 x |
| /// 16-bit half-precision float values stored in the lowest 64-bit of a 128-bit |
| /// vector. |
| /// |
| /// Rounding is done according to the `imm_rounding` parameter, which can be one of: |
| /// |
| /// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest and suppress exceptions, |
| /// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down and suppress exceptions, |
| /// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up and suppress exceptions, |
| /// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate and suppress exceptions, |
| /// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]. |
| #[inline] |
| #[target_feature(enable = "f16c")] |
| #[rustc_args_required_const(1)] |
| #[cfg_attr(test, assert_instr("vcvtps2ph", imm_rounding = 0))] |
| pub unsafe fn _mm_cvtps_ph(a: __m128, imm_rounding: i32) -> __m128i { |
| let a = transmute(a); |
| macro_rules! call { |
| ($rounding:expr) => { |
| llvm_vcvtps2ph_128(a, $rounding) |
| }; |
| } |
| transmute(dispatch_rounding!(imm_rounding, call)) |
| } |
| |
| /// Converts the 8 x 32-bit float values in the 256-bit vector `a` into 8 x |
| /// 16-bit half-precision float values stored in a 128-bit wide vector. |
| /// |
| /// Rounding is done according to the `imm_rounding` parameter, which can be one of: |
| /// |
| /// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest and suppress exceptions, |
| /// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down and suppress exceptions, |
| /// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up and suppress exceptions, |
| /// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate and suppress exceptions, |
| /// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]. |
| #[inline] |
| #[target_feature(enable = "f16c")] |
| #[rustc_args_required_const(1)] |
| #[cfg_attr(test, assert_instr("vcvtps2ph", imm_rounding = 0))] |
| pub unsafe fn _mm256_cvtps_ph(a: __m256, imm_rounding: i32) -> __m128i { |
| let a = transmute(a); |
| macro_rules! call { |
| ($rounding:expr) => { |
| llvm_vcvtps2ph_256(a, $rounding) |
| }; |
| } |
| transmute(dispatch_rounding!(imm_rounding, call)) |
| } |
| |
| #[cfg(test)] |
| mod tests { |
| use crate::{core_arch::x86::*, mem::transmute}; |
| use stdarch_test::simd_test; |
| |
| #[simd_test(enable = "f16c")] |
| unsafe fn test_mm_cvtph_ps() { |
| let array = [1_f32, 2_f32, 3_f32, 4_f32]; |
| let float_vec: __m128 = transmute(array); |
| let halfs: __m128i = _mm_cvtps_ph(float_vec, 0); |
| let floats: __m128 = _mm_cvtph_ps(halfs); |
| let result: [f32; 4] = transmute(floats); |
| assert_eq!(result, array); |
| } |
| |
| #[simd_test(enable = "f16c")] |
| unsafe fn test_mm256_cvtph_ps() { |
| let array = [1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32]; |
| let float_vec: __m256 = transmute(array); |
| let halfs: __m128i = _mm256_cvtps_ph(float_vec, 0); |
| let floats: __m256 = _mm256_cvtph_ps(halfs); |
| let result: [f32; 8] = transmute(floats); |
| assert_eq!(result, array); |
| } |
| } |