| // ARM Neon intrinsic specification. |
| // |
| // This file contains the specification for a number of
| // intrinsics that allow us to generate them along with
| // their test cases.
| // |
| // A note on the syntax of this file - it is not very intelligently parsed!
| // |
| // # Comments
| // start with EXACTLY two, or four or more slashes, so // is a
| // comment and /////// is too.
| // |
| // # Sections |
| // Sections start with EXACTLY three slashes followed |
| // by AT LEAST one space. Sections are used for two things: |
| // |
| // 1) they serve as the doc comment for the given intrinsics.
| // 2) they reset all variables (name, fn, etc.) |
| // |
| // # Variables |
| // |
| // name - The prefix of the function, suffixes are auto |
| // generated by the type they get passed. |
| // |
| // fn - The function to call in rust-land. |
| // |
| // aarch64 - The intrinsic to check on aarch64 architecture. |
| // If this is given but no arm intrinsic is provided, |
| // the function will exclusively be generated for |
| // aarch64. |
| //           This is used to generate both aarch64 specific and
| //           shared intrinsics by first specifying only the aarch64
| //           variant and then the arm variant.
| // |
| // arm - The ARMv7 intrinsic used to check arm code
| //       generation. All neon functions available in arm are
| //       also available in aarch64. If no aarch64 intrinsic was
| //       set they are assumed to be the same.
| //       Intrinsics ending with a `.` will have a size suffix
| //       added (such as `i8` or `i64`) that is not sign specific.
| //       Intrinsics ending with a `.s` will have a size suffix
| //       added (such as `s8` or `u64`) that is sign specific.
| // |
| // a - First input for tests, it gets scaled to the size of |
| // the type. |
| // |
| // b - Second input for tests, it gets scaled to the size of |
| // the type. |
| // |
| // # special values |
| // |
| // TRUE - 'true' all bits are set to 1 |
| // FALSE - 'false' all bits are set to 0 |
| // FF - same as 'true' |
| // MIN - minimal value (either 0 or the lowest negative number) |
| // MAX - maximal value proper to overflow
| // |
| // # validate <values>
| // Validates a and b against the expected result of the test.
| // The special values 'TRUE' and 'FALSE' can be used to
| // represent the correct NEON representation of true or
| // false values. It too gets scaled to the type.
| // |
| // Validate needs to be called before generate as it sets |
| // up the rules for validation that get generated for each |
| // type. |
| // # generate <types> |
| // The generate command generates the intrinsics, it uses the |
| // Variables set and can be called multiple times while overwriting |
| // some of the variables. |
| |
| /// Vector bitwise and |
| name = vand |
| fn = simd_and |
| arm = vand |
| aarch64 = and |
| a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00 |
| b = 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F |
| validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00 |
| b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 |
| validate 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 |
| generate int*_t, uint*_t, int64x*_t, uint64x*_t |
| |
| /// Vector bitwise or (immediate, inclusive) |
| name = vorr |
| fn = simd_or |
| arm = vorr |
| aarch64 = orr |
| a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F |
| b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 |
| validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F |
| generate int*_t, uint*_t, int64x*_t, uint64x*_t |
| |
| |
| /// Vector bitwise exclusive or (vector) |
| name = veor |
| fn = simd_xor |
| arm = veor |
| aarch64 = eor |
| a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F |
| b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 |
| validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F |
| generate int*_t, uint*_t, int64x*_t, uint64x*_t |
| |
| //////////////////// |
| // equality |
| //////////////////// |
| |
| /// Compare bitwise Equal (vector) |
| name = vceq |
| fn = simd_eq |
| a = MIN, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, MAX |
| b = MIN, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, MAX |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| a = MIN, MIN, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, MAX |
| b = MIN, MAX, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, MIN |
| validate TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE |
| |
| aarch64 = cmeq |
| generate uint64x*_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t |
| |
| arm = vceq. |
| generate uint*_t, int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t |
| |
| /// Floating-point compare equal |
| name = vceq |
| fn = simd_eq |
| a = 1.2, 3.4, 5.6, 7.8 |
| b = 1.2, 3.4, 5.6, 7.8 |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| |
| aarch64 = fcmeq |
| generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t |
| |
| arm = vceq. |
| // we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t |
| generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t |
| |
| //////////////////// |
| // greater than
| //////////////////// |
| |
| /// Compare signed greater than |
| name = vcgt |
| fn = simd_gt |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| aarch64 = cmgt |
| generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t |
| |
| arm = vcgt.s |
| generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t |
| |
| /// Compare unsigned higher
| name = vcgt |
| fn = simd_gt |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| |
| aarch64 = cmhi |
| generate uint64x*_t |
| |
| arm = vcgt.s |
| generate uint*_t |
| |
| /// Floating-point compare greater than |
| name = vcgt |
| fn = simd_gt |
| a = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9 |
| b = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8 |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| |
| aarch64 = fcmgt |
| generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t |
| |
| arm = vcgt.s |
| // we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t |
| generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t |
| |
| //////////////////// |
| // less than
| //////////////////// |
| |
| /// Compare signed less than |
| name = vclt |
| fn = simd_lt |
| a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| aarch64 = cmgt |
| generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t |
| |
| arm = vcgt.s |
| generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t |
| |
| /// Compare unsigned less than |
| name = vclt |
| fn = simd_lt |
| a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| |
| aarch64 = cmhi |
| generate uint64x*_t |
| |
| arm = vcgt.s |
| generate uint*_t |
| |
| /// Floating-point compare less than |
| name = vclt |
| fn = simd_lt |
| a = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8 |
| b = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9 |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| |
| aarch64 = fcmgt |
| generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t |
| |
| arm = vcgt.s |
| // we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t |
| generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t |
| |
| //////////////////// |
| // less than or equal
| //////////////////// |
| |
| /// Compare signed less than or equal |
| name = vcle |
| fn = simd_le |
| a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| |
| aarch64 = cmge |
| generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t |
| |
| arm = vcge.s |
| generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t |
| |
| /// Compare unsigned less than or equal |
| name = vcle |
| fn = simd_le |
| a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| |
| aarch64 = cmhs |
| generate uint64x*_t |
| |
| arm = vcge.s |
| generate uint*_t |
| |
| /// Floating-point compare less than or equal |
| name = vcle |
| fn = simd_le |
| a = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8 |
| b = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9 |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| aarch64 = fcmge |
| generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t |
| |
| // we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t |
| arm = vcge.s |
| generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t |
| |
| //////////////////// |
| // greater than or equal
| //////////////////// |
| |
| /// Compare signed greater than or equal |
| name = vcge |
| fn = simd_ge |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| |
| aarch64 = cmge |
| generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t |
| |
| arm = vcge.s |
| generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t |
| |
| /// Compare unsigned greater than or equal |
| name = vcge |
| fn = simd_ge |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| |
| aarch64 = cmhs |
| generate uint64x*_t |
| |
| arm = vcge.s |
| generate uint*_t |
| |
| /// Floating-point compare greater than or equal |
| name = vcge |
| fn = simd_ge |
| a = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9 |
| b = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8 |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| |
| aarch64 = fcmge |
| generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t |
| |
| arm = vcge.s |
| // we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t |
| generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t |
| |
| /// Saturating subtract |
| name = vqsub |
| a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 |
| b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| validate 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26 |
| |
| arm = vqsub.s |
| aarch64 = uqsub |
| link-arm = vqsubu._EXT_ |
| link-aarch64 = uqsub._EXT_ |
| generate uint*_t |
| |
| arm = vqsub.s |
| aarch64 = sqsub |
| link-arm = vqsubs._EXT_ |
| link-aarch64 = sqsub._EXT_ |
| generate int*_t |
| |
| /// Halving add |
| name = vhadd |
| a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 |
| b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| validate 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29 |
| |
| |
| arm = vhadd.s |
| aarch64 = uhadd |
| link-aarch64 = uhadd._EXT_ |
| link-arm = vhaddu._EXT_ |
| generate uint*_t |
| |
| |
| arm = vhadd.s |
| aarch64 = shadd |
| link-aarch64 = shadd._EXT_ |
| link-arm = vhadds._EXT_ |
| generate int*_t |
| |
| /// Rounding halving add |
| name = vrhadd |
| a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 |
| b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| validate 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29 |
| |
| arm = vrhadd.s |
| aarch64 = urhadd |
| link-arm = vrhaddu._EXT_ |
| link-aarch64 = urhadd._EXT_ |
| generate uint*_t |
| |
| arm = vrhadd.s |
| aarch64 = srhadd |
| link-arm = vrhadds._EXT_ |
| link-aarch64 = srhadd._EXT_ |
| generate int*_t |
| |
| /// Saturating add |
| name = vqadd |
| a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 |
| b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| validate 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58 |
| |
| arm = vqadd.s |
| aarch64 = uqadd |
| link-arm = vqaddu._EXT_ |
| link-aarch64 = uqadd._EXT_ |
| generate uint*_t |
| |
| arm = vqadd.s |
| aarch64 = sqadd |
| link-arm = vqadds._EXT_ |
| link-aarch64 = sqadd._EXT_ |
| generate int*_t |
| |
| // requires the 1st and 2nd argument to be different; this is not implemented yet
| // /// Signed saturating accumulate of unsigned value |
| // |
| // name = vuqadd |
| // a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 |
| // b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| // e = 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58 |
| |
| // it seems like we don't have those in rustland :( |
| // aarch64 = suqadd |
| // link-aarch64 = usqadd._EXT_ |
| // generate int64x*_t |
| |
| // arm = suqadd
| // link-arm = vuqadds._EXT_ |
| // link-aarch64 = suqadd._EXT_ |
| // generate int*_t |
| |
| |
| /// Multiply |
| name = vmul |
| a = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 |
| b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| validate 1, 4, 3, 8, 5, 12, 7, 16, 9, 20, 11, 24, 13, 28, 15, 32 |
| arm = vmul. |
| aarch64 = mul |
| fn = simd_mul |
| generate int*_t, uint*_t |
| |
| /// Multiply |
| name = vmul |
| fn = simd_mul |
| a = 1.0, 2.0, 1.0, 2.0 |
| b = 2.0, 3.0, 4.0, 5.0 |
| validate 2.0, 6.0, 4.0, 10.0 |
| |
| aarch64 = fmul |
| generate float64x*_t |
| |
| arm = vmul. |
| generate float*_t |
| |
| |
| /// Subtract |
| name = vsub |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 |
| validate 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 |
| arm = vsub. |
| aarch64 = sub |
| fn = simd_sub |
| generate int*_t, uint*_t, int64x*_t, uint64x*_t |
| |
| /// Subtract |
| name = vsub |
| fn = simd_sub |
| a = 1.0, 4.0, 3.0, 8.0 |
| b = 1.0, 2.0, 3.0, 4.0 |
| validate 0.0, 2.0, 0.0, 4.0 |
| |
| aarch64 = fsub |
| generate float64x*_t |
| |
| arm = vsub. |
| generate float*_t |
| |
| |
| /// Halving subtract
| name = vhsub |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 |
| validate 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 |
| |
| arm = vhsub.s |
| aarch64 = uhsub |
| link-arm = vhsubu._EXT_ |
| link-aarch64 = uhsub._EXT_ |
| generate uint*_t |
| |
| arm = vhsub.s |
| aarch64 = shsub |
| link-arm = vhsubs._EXT_ |
| link-aarch64 = shsub._EXT_ |
| generate int*_t |