diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs
index 11c3a52870..3241583cf0 100644
--- a/crates/core_arch/src/aarch64/neon/generated.rs
+++ b/crates/core_arch/src/aarch64/neon/generated.rs
@@ -3001,19 +3001,10 @@ pub fn vcmla_lane_f16<const LANE: i32>(
     c: float16x4_t,
 ) -> float16x4_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float16x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmla_f16(a, b, c)
-    }
+    let c = vreinterpret_u32_f16(c);
+    let c = vdup_lane_u32::<LANE>(c);
+    let c = vreinterpret_f16_u32(c);
+    vcmla_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_lane_f16)"]
@@ -3030,23 +3021,10 @@ pub fn vcmlaq_lane_f16<const LANE: i32>(
     c: float16x4_t,
 ) -> float16x8_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float16x8_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_f16(a, b, c)
-    }
+    let c = vreinterpret_u32_f16(c);
+    let c = vdupq_lane_u32::<LANE>(c);
+    let c = vreinterpretq_f16_u32(c);
+    vcmlaq_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_lane_f32)"]
@@ -3061,10 +3039,10 @@ pub fn vcmla_lane_f32<const LANE: i32>(
     c: float32x2_t,
 ) -> float32x2_t {
     static_assert!(LANE == 0);
-    unsafe {
-        let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]);
-        vcmla_f32(a, b, c)
-    }
+    let c = vreinterpret_u64_f32(c);
+    let c = vdup_lane_u64::<LANE>(c);
+    let c = vreinterpret_f32_u64(c);
+    vcmla_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_lane_f32)"]
@@ -3079,19 +3057,10 @@ pub fn vcmlaq_lane_f32<const LANE: i32>(
     c: float32x2_t,
 ) -> float32x4_t {
     static_assert!(LANE == 0);
-    unsafe {
-        let c: float32x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_f32(a, b, c)
-    }
+    let c = vreinterpret_u64_f32(c);
+    let c = vdupq_lane_u64::<LANE>(c);
+    let c = vreinterpretq_f32_u64(c);
+    vcmlaq_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_laneq_f16)"]
@@ -3108,19 +3077,10 @@ pub fn vcmla_laneq_f16<const LANE: i32>(
     c: float16x8_t,
 ) -> float16x4_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        let c: float16x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmla_f16(a, b, c)
-    }
+    let c = vreinterpretq_u32_f16(c);
+    let c = vdup_laneq_u32::<LANE>(c);
+    let c = vreinterpret_f16_u32(c);
+    vcmla_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_laneq_f16)"]
@@ -3137,23 +3097,10 @@ pub fn vcmlaq_laneq_f16<const LANE: i32>(
     c: float16x8_t,
 ) -> float16x8_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        let c: float16x8_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_f16(a, b, c)
-    }
+    let c = vreinterpretq_u32_f16(c);
+    let c = vdupq_laneq_u32::<LANE>(c);
+    let c = vreinterpretq_f16_u32(c);
+    vcmlaq_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_laneq_f32)"]
@@ -3168,10 +3115,10 @@ pub fn vcmla_laneq_f32<const LANE: i32>(
     c: float32x4_t,
 ) -> float32x2_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]);
-        vcmla_f32(a, b, c)
-    }
+    let c = vreinterpretq_u64_f32(c);
+    let c = vdup_laneq_u64::<LANE>(c);
+    let c = vreinterpret_f32_u64(c);
+    vcmla_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_laneq_f32)"]
@@ -3186,19 +3133,10 @@ pub fn vcmlaq_laneq_f32<const LANE: i32>(
     c: float32x4_t,
 ) -> float32x4_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float32x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_f32(a, b, c)
-    }
+    let c = vreinterpretq_u64_f32(c);
+    let c = vdupq_laneq_u64::<LANE>(c);
+    let c = vreinterpretq_f32_u64(c);
+    vcmlaq_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot180_f16)"]
@@ -3299,19 +3237,10 @@ pub fn vcmla_rot180_lane_f16<const LANE: i32>(
     c: float16x4_t,
 ) -> float16x4_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float16x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmla_rot180_f16(a, b, c)
-    }
+    let c = vreinterpret_u32_f16(c);
+    let c = vdup_lane_u32::<LANE>(c);
+    let c = vreinterpret_f16_u32(c);
+    vcmla_rot180_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot180_lane_f16)"]
@@ -3328,23 +3257,10 @@ pub fn vcmlaq_rot180_lane_f16<const LANE: i32>(
     c: float16x4_t,
 ) -> float16x8_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float16x8_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_rot180_f16(a, b, c)
-    }
+    let c = vreinterpret_u32_f16(c);
+    let c = vdupq_lane_u32::<LANE>(c);
+    let c = vreinterpretq_f16_u32(c);
+    vcmlaq_rot180_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot180_lane_f32)"]
@@ -3359,10 +3275,10 @@ pub fn vcmla_rot180_lane_f32<const LANE: i32>(
     c: float32x2_t,
 ) -> float32x2_t {
     static_assert!(LANE == 0);
-    unsafe {
-        let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]);
-        vcmla_rot180_f32(a, b, c)
-    }
+    let c = vreinterpret_u64_f32(c);
+    let c = vdup_lane_u64::<LANE>(c);
+    let c = vreinterpret_f32_u64(c);
+    vcmla_rot180_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot180_lane_f32)"]
@@ -3377,19 +3293,10 @@ pub fn vcmlaq_rot180_lane_f32<const LANE: i32>(
     c: float32x2_t,
 ) -> float32x4_t {
     static_assert!(LANE == 0);
-    unsafe {
-        let c: float32x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_rot180_f32(a, b, c)
-    }
+    let c = vreinterpret_u64_f32(c);
+    let c = vdupq_lane_u64::<LANE>(c);
+    let c = vreinterpretq_f32_u64(c);
+    vcmlaq_rot180_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot180_laneq_f16)"]
@@ -3406,19 +3313,10 @@ pub fn vcmla_rot180_laneq_f16<const LANE: i32>(
     c: float16x8_t,
 ) -> float16x4_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        let c: float16x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmla_rot180_f16(a, b, c)
-    }
+    let c = vreinterpretq_u32_f16(c);
+    let c = vdup_laneq_u32::<LANE>(c);
+    let c = vreinterpret_f16_u32(c);
+    vcmla_rot180_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot180_laneq_f16)"]
@@ -3435,23 +3333,10 @@ pub fn vcmlaq_rot180_laneq_f16<const LANE: i32>(
     c: float16x8_t,
 ) -> float16x8_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        let c: float16x8_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_rot180_f16(a, b, c)
-    }
+    let c = vreinterpretq_u32_f16(c);
+    let c = vdupq_laneq_u32::<LANE>(c);
+    let c = vreinterpretq_f16_u32(c);
+    vcmlaq_rot180_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot180_laneq_f32)"]
@@ -3466,10 +3351,10 @@ pub fn vcmla_rot180_laneq_f32<const LANE: i32>(
     c: float32x4_t,
 ) -> float32x2_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]);
-        vcmla_rot180_f32(a, b, c)
-    }
+    let c = vreinterpretq_u64_f32(c);
+    let c = vdup_laneq_u64::<LANE>(c);
+    let c = vreinterpret_f32_u64(c);
+    vcmla_rot180_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot180_laneq_f32)"]
@@ -3484,19 +3369,10 @@ pub fn vcmlaq_rot180_laneq_f32<const LANE: i32>(
     c: float32x4_t,
 ) -> float32x4_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float32x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_rot180_f32(a, b, c)
-    }
+    let c = vreinterpretq_u64_f32(c);
+    let c = vdupq_laneq_u64::<LANE>(c);
+    let c = vreinterpretq_f32_u64(c);
+    vcmlaq_rot180_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot270_f16)"]
@@ -3597,19 +3473,10 @@ pub fn vcmla_rot270_lane_f16<const LANE: i32>(
     c: float16x4_t,
 ) -> float16x4_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float16x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmla_rot270_f16(a, b, c)
-    }
+    let c = vreinterpret_u32_f16(c);
+    let c = vdup_lane_u32::<LANE>(c);
+    let c = vreinterpret_f16_u32(c);
+    vcmla_rot270_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot270_lane_f16)"]
@@ -3626,23 +3493,10 @@ pub fn vcmlaq_rot270_lane_f16<const LANE: i32>(
     c: float16x4_t,
 ) -> float16x8_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float16x8_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_rot270_f16(a, b, c)
-    }
+    let c = vreinterpret_u32_f16(c);
+    let c = vdupq_lane_u32::<LANE>(c);
+    let c = vreinterpretq_f16_u32(c);
+    vcmlaq_rot270_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot270_lane_f32)"]
@@ -3657,10 +3511,10 @@ pub fn vcmla_rot270_lane_f32<const LANE: i32>(
     c: float32x2_t,
 ) -> float32x2_t {
     static_assert!(LANE == 0);
-    unsafe {
-        let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]);
-        vcmla_rot270_f32(a, b, c)
-    }
+    let c = vreinterpret_u64_f32(c);
+    let c = vdup_lane_u64::<LANE>(c);
+    let c = vreinterpret_f32_u64(c);
+    vcmla_rot270_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot270_lane_f32)"]
@@ -3675,19 +3529,10 @@ pub fn vcmlaq_rot270_lane_f32<const LANE: i32>(
     c: float32x2_t,
 ) -> float32x4_t {
     static_assert!(LANE == 0);
-    unsafe {
-        let c: float32x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_rot270_f32(a, b, c)
-    }
+    let c = vreinterpret_u64_f32(c);
+    let c = vdupq_lane_u64::<LANE>(c);
+    let c = vreinterpretq_f32_u64(c);
+    vcmlaq_rot270_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot270_laneq_f16)"]
@@ -3704,19 +3549,10 @@ pub fn vcmla_rot270_laneq_f16<const LANE: i32>(
     c: float16x8_t,
 ) -> float16x4_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        let c: float16x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmla_rot270_f16(a, b, c)
-    }
+    let c = vreinterpretq_u32_f16(c);
+    let c = vdup_laneq_u32::<LANE>(c);
+    let c = vreinterpret_f16_u32(c);
+    vcmla_rot270_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot270_laneq_f16)"]
@@ -3733,23 +3569,10 @@ pub fn vcmlaq_rot270_laneq_f16<const LANE: i32>(
     c: float16x8_t,
 ) -> float16x8_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        let c: float16x8_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_rot270_f16(a, b, c)
-    }
+    let c = vreinterpretq_u32_f16(c);
+    let c = vdupq_laneq_u32::<LANE>(c);
+    let c = vreinterpretq_f16_u32(c);
+    vcmlaq_rot270_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot270_laneq_f32)"]
@@ -3764,10 +3587,10 @@ pub fn vcmla_rot270_laneq_f32<const LANE: i32>(
     c: float32x4_t,
 ) -> float32x2_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]);
-        vcmla_rot270_f32(a, b, c)
-    }
+    let c = vreinterpretq_u64_f32(c);
+    let c = vdup_laneq_u64::<LANE>(c);
+    let c = vreinterpret_f32_u64(c);
+    vcmla_rot270_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot270_laneq_f32)"]
@@ -3782,19 +3605,10 @@ pub fn vcmlaq_rot270_laneq_f32<const LANE: i32>(
     c: float32x4_t,
 ) -> float32x4_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float32x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_rot270_f32(a, b, c)
-    }
+    let c = vreinterpretq_u64_f32(c);
+    let c = vdupq_laneq_u64::<LANE>(c);
+    let c = vreinterpretq_f32_u64(c);
+    vcmlaq_rot270_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot90_f16)"]
@@ -3895,19 +3709,10 @@ pub fn vcmla_rot90_lane_f16<const LANE: i32>(
     c: float16x4_t,
 ) -> float16x4_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float16x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmla_rot90_f16(a, b, c)
-    }
+    let c = vreinterpret_u32_f16(c);
+    let c = vdup_lane_u32::<LANE>(c);
+    let c = vreinterpret_f16_u32(c);
+    vcmla_rot90_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot90_lane_f16)"]
@@ -3924,23 +3729,10 @@ pub fn vcmlaq_rot90_lane_f16<const LANE: i32>(
     c: float16x4_t,
 ) -> float16x8_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float16x8_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_rot90_f16(a, b, c)
-    }
+    let c = vreinterpret_u32_f16(c);
+    let c = vdupq_lane_u32::<LANE>(c);
+    let c = vreinterpretq_f16_u32(c);
+    vcmlaq_rot90_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot90_lane_f32)"]
@@ -3955,10 +3747,10 @@ pub fn vcmla_rot90_lane_f32<const LANE: i32>(
     c: float32x2_t,
 ) -> float32x2_t {
     static_assert!(LANE == 0);
-    unsafe {
-        let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]);
-        vcmla_rot90_f32(a, b, c)
-    }
+    let c = vreinterpret_u64_f32(c);
+    let c = vdup_lane_u64::<LANE>(c);
+    let c = vreinterpret_f32_u64(c);
+    vcmla_rot90_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot90_lane_f32)"]
@@ -3973,19 +3765,10 @@ pub fn vcmlaq_rot90_lane_f32<const LANE: i32>(
     c: float32x2_t,
 ) -> float32x4_t {
     static_assert!(LANE == 0);
-    unsafe {
-        let c: float32x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_rot90_f32(a, b, c)
-    }
+    let c = vreinterpret_u64_f32(c);
+    let c = vdupq_lane_u64::<LANE>(c);
+    let c = vreinterpretq_f32_u64(c);
+    vcmlaq_rot90_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot90_laneq_f16)"]
@@ -4002,19 +3785,10 @@ pub fn vcmla_rot90_laneq_f16<const LANE: i32>(
     c: float16x8_t,
 ) -> float16x4_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        let c: float16x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmla_rot90_f16(a, b, c)
-    }
+    let c = vreinterpretq_u32_f16(c);
+    let c = vdup_laneq_u32::<LANE>(c);
+    let c = vreinterpret_f16_u32(c);
+    vcmla_rot90_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot90_laneq_f16)"]
@@ -4031,23 +3805,10 @@ pub fn vcmlaq_rot90_laneq_f16<const LANE: i32>(
     c: float16x8_t,
 ) -> float16x8_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        let c: float16x8_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_rot90_f16(a, b, c)
-    }
+    let c = vreinterpretq_u32_f16(c);
+    let c = vdupq_laneq_u32::<LANE>(c);
+    let c = vreinterpretq_f16_u32(c);
+    vcmlaq_rot90_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot90_laneq_f32)"]
@@ -4062,10 +3823,10 @@ pub fn vcmla_rot90_laneq_f32<const LANE: i32>(
     c: float32x4_t,
 ) -> float32x2_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]);
-        vcmla_rot90_f32(a, b, c)
-    }
+    let c = vreinterpretq_u64_f32(c);
+    let c = vdup_laneq_u64::<LANE>(c);
+    let c = vreinterpret_f32_u64(c);
+    vcmla_rot90_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot90_laneq_f32)"]
@@ -4080,19 +3841,10 @@ pub fn vcmlaq_rot90_laneq_f32<const LANE: i32>(
     c: float32x4_t,
 ) -> float32x4_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float32x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_rot90_f32(a, b, c)
-    }
+    let c = vreinterpretq_u64_f32(c);
+    let c = vdupq_laneq_u64::<LANE>(c);
+    let c = vreinterpretq_f32_u64(c);
+    vcmlaq_rot90_f32(a, b, c)
 }
 #[doc = "Join two smaller vectors into a single larger vector"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcombine_f64)"]
diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
index cfd44332ec..2f7f2fc2b0 100644
--- a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
+++ b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
@@ -5914,14 +5914,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float32x2_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float32x4_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float32x2_t, float32x4_t, '']
+      - [float32x4_t, float32x4_t, 'q']
     compose:
       - FnCall: [static_assert_uimm_bits!, [LANE, 1]]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpretq_u64_f32, [c]]}]
+      - Let: [c, {FnCall: ['vdup{type[2]}_laneq_u64', [c], [LANE]]}]
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f32_u64', [c]]}]
       - FnCall: ["vcmla{neon_type[0].no}", [a, b, c]]
 
   - name: "vcmla{neon_type[0].laneq_nox}"
@@ -5938,14 +5937,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
    types:
-      - [float16x4_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float16x8_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float16x4_t, float16x8_t, '']
+      - [float16x8_t, float16x8_t, 'q']
     compose:
      - FnCall: [static_assert_uimm_bits!, [LANE, 2]]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpretq_u32_f16, [c]]}]
+      - Let: [c, {FnCall: ['vdup{type[2]}_laneq_u32', [c], [LANE]]}]
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f16_u32', [c]]}]
       - FnCall: ["vcmla{neon_type[0].no}", [a, b, c]]
 
   - name: "vcmla{neon_type[0].rot90_laneq}"
@@ -5960,14 +5958,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float32x2_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float32x4_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float32x2_t, float32x4_t, '']
+      - [float32x4_t, float32x4_t, 'q']
     compose:
      - FnCall: [static_assert_uimm_bits!, [LANE, 1]]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpretq_u64_f32, [c]]}]
+      - Let: [c, {FnCall: ['vdup{type[2]}_laneq_u64', [c], [LANE]]}]
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f32_u64', [c]]}]
       - FnCall: ["vcmla{neon_type[0].rot90}", [a, b, c]]
 
   - name: "vcmla{neon_type[0].rot90_laneq}"
@@ -5984,14 +5981,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float16x4_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float16x8_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float16x4_t, float16x8_t, '']
+      - [float16x8_t, float16x8_t, 'q']
     compose:
      - FnCall: [static_assert_uimm_bits!, [LANE, 2]]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpretq_u32_f16, [c]]}]
+      - Let: [c, {FnCall: ['vdup{type[2]}_laneq_u32', [c], [LANE]]}]
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f16_u32', [c]]}]
       - FnCall: ["vcmla{neon_type[0].rot90}", [a, b, c]]
 
  - name: "vcmla{neon_type[0].rot90_lane}"
@@ -6006,14 +6002,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float32x2_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float32x4_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float32x2_t, float32x2_t, '']
+      - [float32x4_t, float32x2_t, 'q']
     compose:
      - FnCall: [static_assert!, ['LANE == 0']]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpret_u64_f32, [c]]}]
+      - Let: [c, {FnCall: ['vdup{type[2]}_lane_u64', [c], [LANE]]}]
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f32_u64', [c]]}]
       - FnCall: ["vcmla{neon_type[0].rot90}", [a, b, c]]
 
  - name: "vcmla{neon_type[0].rot90_lane}"
@@ -6030,14 +6025,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float16x4_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float16x8_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float16x4_t, float16x4_t, '']
+      - [float16x8_t, float16x4_t, 'q']
     compose:
      - FnCall: [static_assert_uimm_bits!, [LANE, 1]]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpret_u32_f16, [c]]}]
+      - Let: [c, {FnCall: ['vdup{type[2]}_lane_u32', [c], [LANE]]}]
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f16_u32', [c]]}]
       - FnCall: ["vcmla{neon_type[0].rot90}", [a, b, c]]
 
  - name: "vcmla{neon_type.rot180}"
@@ -6095,14 +6089,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float32x2_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float32x4_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float32x2_t, float32x4_t, '']
+      - [float32x4_t, float32x4_t, 'q']
     compose:
      - FnCall: [static_assert_uimm_bits!, [LANE, 1]]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpretq_u64_f32, [c]]}]
+      - Let: [c, {FnCall: ['vdup{type[2]}_laneq_u64', [c], [LANE]]}]
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f32_u64', [c]]}]
       - FnCall: ["vcmla{neon_type[0].rot180}", [a, b, c]]
 
  - name: "vcmla{neon_type[0].rot180_laneq}"
@@ -6119,19 +6112,16 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float16x4_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float16x8_t, float16x8_t,
-          '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'
-        ]
+      - [float16x4_t, float16x8_t, '']
+      - [float16x8_t, float16x8_t, 'q']
     compose:
      - FnCall: [static_assert_uimm_bits!, [LANE, 2]]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpretq_u32_f16, [c]]}]
+      - Let: [c, {FnCall: ['vdup{type[2]}_laneq_u32', [c], [LANE]]}]
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f16_u32', [c]]}]
       - FnCall: ["vcmla{neon_type[0].rot180}", [a, b, c]]
 
-  - name: "vcmla{type[3]}"
+  - name: "vcmla{neon_type[0].rot180_lane}"
     doc: Floating-point complex multiply accumulate
     arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"]
     return_type: "{neon_type[0]}"
@@ -6143,17 +6133,16 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float32x2_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]', '_rot180_lane_f32']
-      - [float32x4_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]', 'q_rot180_lane_f32']
+      - [float32x2_t, float32x2_t, '']
+      - [float32x4_t, float32x2_t, 'q']
     compose:
      - FnCall: [static_assert!, ['LANE == 0']]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpret_u64_f32, [c]]}]
+      - Let: [c, {FnCall: ['vdup{type[2]}_lane_u64', [c], [LANE]]}]
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f32_u64', [c]]}]
       - FnCall: ["vcmla{neon_type[0].rot180}", [a, b, c]]
 
-  - name: "vcmla{type[3]}"
+  - name: "vcmla{neon_type[0].rot180_lane}"
     doc: Floating-point complex multiply accumulate
     arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"]
     return_type: "{neon_type[0]}"
@@ -6167,16 +6156,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float16x4_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]', '_rot180_lane_f16']
-      - [float16x8_t, float16x4_t,
-          '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]', 'q_rot180_lane_f16'
-        ]
+      - [float16x4_t, float16x4_t, '']
+      - [float16x8_t, float16x4_t, 'q']
     compose:
      - FnCall: [static_assert_uimm_bits!, [LANE, 1]]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpret_u32_f16, [c]]}]
+      - Let: [c, {FnCall: ['vdup{type[2]}_lane_u32', [c], [LANE]]}]
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f16_u32', [c]]}]
       - FnCall: ["vcmla{neon_type[0].rot180}", [a, b, c]]
 
  - name: "vcmla{neon_type[0].rot270_laneq}"
@@ -6191,14 +6177,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float32x2_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float32x4_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float32x2_t, float32x4_t, '']
+      - [float32x4_t, float32x4_t, 'q']
     compose:
      - FnCall: [static_assert_uimm_bits!, [LANE, 1]]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpretq_u64_f32, [c]]}]
+      - Let: [c, {FnCall: ['vdup{type[2]}_laneq_u64', [c], [LANE]]}]
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f32_u64', [c]]}]
       - FnCall: ["vcmla{neon_type[0].rot270}", [a, b, c]]
 
  - name: "vcmla{neon_type[0].rot270_laneq}"
@@ -6215,14 +6200,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float16x4_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float16x8_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float16x4_t, float16x8_t, '']
+      - [float16x8_t, float16x8_t, 'q']
     compose:
      - FnCall: [static_assert_uimm_bits!, [LANE, 2]]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpretq_u32_f16, [c]]}]
+      - Let: [c, {FnCall: ['vdup{type[2]}_laneq_u32', [c], [LANE]]}]
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f16_u32', [c]]}]
       - FnCall: ["vcmla{neon_type[0].rot270}", [a, b, c]]
 
  - name: "vcmla{neon_type[0].lane_nox}"
@@ -6237,14 +6221,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float32x2_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float32x4_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float32x2_t, float32x2_t, '']
+      - [float32x4_t, float32x2_t, 'q']
     compose:
      - FnCall: [static_assert!, ['LANE == 0']]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpret_u64_f32, [c]]}]
+      - Let: [c, {FnCall: ['vdup{type[2]}_lane_u64', [c], [LANE]]}]
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f32_u64', [c]]}]
       - FnCall: ["vcmla{neon_type[0].no}", [a, b, c]]
 
 
@@ -6262,14 +6245,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float16x4_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float16x8_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float16x4_t, float16x4_t, '']
+      - [float16x8_t, float16x4_t, 'q']
     compose:
      - FnCall: [static_assert_uimm_bits!, [LANE, 1]]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpret_u32_f16, [c]]}]
+      - Let: [c, {FnCall: ['vdup{type[2]}_lane_u32', [c], [LANE]]}]
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f16_u32', [c]]}]
       - FnCall: ["vcmla{neon_type[0].no}", [a, b, c]]
 
  - name: "vcmla{neon_type[0].rot270_lane}"
@@ -6284,11 +6266,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float32x2_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float32x4_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float32x2_t, float32x2_t, '']
+      - [float32x4_t, float32x2_t, 'q']
     compose:
      - FnCall: [static_assert!, ['LANE == 0']]
-      - Let: [c, "{neon_type[0]}", {FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]}]
+      - Let: [c, {FnCall: [vreinterpret_u64_f32, [c]]}]
+      - Let: [c, {FnCall: ['vdup{type[2]}_lane_u64', [c], [LANE]]}]
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f32_u64', [c]]}]
       - FnCall: ["vcmla{neon_type[0].rot270}", [a, b, c]]
 
  - name: "vcmla{neon_type[0].rot270_lane}"
@@ -6305,11 +6289,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float16x4_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float16x8_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float16x4_t, float16x4_t, '']
+      - [float16x8_t, float16x4_t, 'q']
     compose:
      - FnCall: [static_assert_uimm_bits!, [LANE, 1]]
-      - Let: [c, "{neon_type[0]}", {FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]}]
+      - Let: [c, {FnCall: [vreinterpret_u32_f16, [c]]}]
+      - Let: [c, {FnCall: ['vdup{type[2]}_lane_u32', [c], [LANE]]}]
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f16_u32', [c]]}]
       - FnCall: ["vcmla{neon_type[0].rot270}", [a, b, c]]
 
  - name: "vmax{neon_type.no}"
diff --git a/crates/stdarch-gen-arm/src/fn_suffix.rs b/crates/stdarch-gen-arm/src/fn_suffix.rs
index 26c156ae17..6fba3dc744 100644
--- a/crates/stdarch-gen-arm/src/fn_suffix.rs
+++ b/crates/stdarch-gen-arm/src/fn_suffix.rs
@@ -188,7 +188,7 @@ impl FromStr for SuffixKind {
             "rot90_lane" => Ok(SuffixKind::Rot90Lane),
             "rot90_laneq" => Ok(SuffixKind::Rot90LaneQ),
             "rot180" => Ok(SuffixKind::Rot180),
-            "rot180_lane" => Ok(SuffixKind::Rot180LaneQ),
+            "rot180_lane" => Ok(SuffixKind::Rot180Lane),
             "rot180_laneq" => Ok(SuffixKind::Rot180LaneQ),
             "u" => Ok(SuffixKind::Unsigned),
             "nox" => Ok(SuffixKind::NoX),