From 1316fe8b9241ee4f5d9be5ae1cccced038ab6314 Mon Sep 17 00:00:00 2001 From: sayantn Date: Mon, 11 May 2026 21:28:47 +0530 Subject: [PATCH 1/2] Correct small typo in gen-arm --- crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml | 14 ++++++-------- crates/stdarch-gen-arm/src/fn_suffix.rs | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml index cfd44332ec..102447eae8 100644 --- a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml @@ -6131,7 +6131,7 @@ intrinsics: - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] - FnCall: ["vcmla{neon_type[0].rot180}", [a, b, c]] - - name: "vcmla{type[3]}" + - name: "vcmla{neon_type[0].rot180_lane}" doc: Floating-point complex multiply accumulate arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] return_type: "{neon_type[0]}" @@ -6143,8 +6143,8 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - [float32x2_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]', '_rot180_lane_f32'] - - [float32x4_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]', 'q_rot180_lane_f32'] + - [float32x2_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float32x4_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] compose: - FnCall: [static_assert!, ['LANE == 0']] - Let: @@ -6153,7 +6153,7 @@ intrinsics: - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] - FnCall: ["vcmla{neon_type[0].rot180}", [a, b, c]] - - name: "vcmla{type[3]}" + - name: "vcmla{neon_type[0].rot180_lane}" doc: Floating-point complex multiply accumulate arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"] return_type: "{neon_type[0]}" @@ -6167,10 +6167,8 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - [float16x4_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]', '_rot180_lane_f16'] - - [float16x8_t, float16x4_t, - '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]', 'q_rot180_lane_f16' - ] + - [float16x4_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float16x8_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, 1]] - Let: diff --git a/crates/stdarch-gen-arm/src/fn_suffix.rs b/crates/stdarch-gen-arm/src/fn_suffix.rs index 26c156ae17..6fba3dc744 100644 --- a/crates/stdarch-gen-arm/src/fn_suffix.rs +++ b/crates/stdarch-gen-arm/src/fn_suffix.rs @@ -188,7 +188,7 @@ impl FromStr for SuffixKind { "rot90_lane" => Ok(SuffixKind::Rot90Lane), "rot90_laneq" => Ok(SuffixKind::Rot90LaneQ), "rot180" => Ok(SuffixKind::Rot180), - "rot180_lane" => Ok(SuffixKind::Rot180LaneQ), + "rot180_lane" => Ok(SuffixKind::Rot180Lane), "rot180_laneq" => Ok(SuffixKind::Rot180LaneQ), "u" => Ok(SuffixKind::Unsigned), "nox" => Ok(SuffixKind::NoX), From 8a370d2091d8c61701b05c5df70efa8bc9aee101 Mon Sep 17 00:00:00 2001 From: sayantn Date: Mon, 11 May 2026 21:29:36 +0530 Subject: [PATCH 2/2] Implement `vcmla_lane` with ARM intrinsics --- .../core_arch/src/aarch64/neon/generated.rs | 504 +++++------------- .../spec/neon/aarch64.spec.yml | 172 +++--- 2 files changed, 208 insertions(+), 468 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 11c3a52870..3241583cf0 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -3001,19 +3001,10 @@ pub fn vcmla_lane_f16( c: float16x4_t, ) -> float16x4_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - let c: float16x4_t = simd_shuffle!( - c, - c, - [ - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1 - ] - ); - vcmla_f16(a, b, c) - } + let c = vreinterpret_u32_f16(c); + let c = vdup_lane_u32::(c); + let c = vreinterpret_f16_u32(c); + vcmla_f16(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_lane_f16)"] @@ -3030,23 +3021,10 @@ pub fn vcmlaq_lane_f16( c: float16x4_t, ) -> float16x8_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - let c: float16x8_t = simd_shuffle!( - c, - c, - [ - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1 - ] - ); - vcmlaq_f16(a, b, c) - } + let c = vreinterpret_u32_f16(c); + let c = vdupq_lane_u32::(c); + let c = vreinterpretq_f16_u32(c); + vcmlaq_f16(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_lane_f32)"] @@ -3061,10 +3039,10 @@ pub fn vcmla_lane_f32( c: float32x2_t, ) -> float32x2_t { static_assert!(LANE == 0); - unsafe { - let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]); - vcmla_f32(a, b, c) - } + let c = vreinterpret_u64_f32(c); + let c = vdup_lane_u64::(c); + let c = vreinterpret_f32_u64(c); + vcmla_f32(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_lane_f32)"] @@ -3079,19 +3057,10 @@ pub fn vcmlaq_lane_f32( c: float32x2_t, ) -> float32x4_t { static_assert!(LANE == 0); - unsafe { - let c: float32x4_t = simd_shuffle!( - c, - c, - [ - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1 - ] - ); - vcmlaq_f32(a, b, c) - } + let c = vreinterpret_u64_f32(c); + let c = vdupq_lane_u64::(c); + let c = vreinterpretq_f32_u64(c); + vcmlaq_f32(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_laneq_f16)"] @@ -3108,19 +3077,10 @@ pub fn vcmla_laneq_f16( c: float16x8_t, ) -> float16x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - let c: float16x4_t = simd_shuffle!( - c, - c, - [ - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1 - ] - ); - vcmla_f16(a, b, c) - } + let c = vreinterpretq_u32_f16(c); + let c = vdup_laneq_u32::(c); + let c = vreinterpret_f16_u32(c); + vcmla_f16(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_laneq_f16)"] @@ -3137,23 +3097,10 @@ pub fn vcmlaq_laneq_f16( c: float16x8_t, ) -> float16x8_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - let c: float16x8_t = simd_shuffle!( - c, - c, - [ - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1 - ] - ); - vcmlaq_f16(a, b, c) - } + let c = vreinterpretq_u32_f16(c); + let c = vdupq_laneq_u32::(c); + let c = vreinterpretq_f16_u32(c); + vcmlaq_f16(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_laneq_f32)"] @@ -3168,10 +3115,10 @@ pub fn vcmla_laneq_f32( c: float32x4_t, ) -> float32x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]); - vcmla_f32(a, b, c) - } + let c = vreinterpretq_u64_f32(c); + let c = vdup_laneq_u64::(c); + let c = vreinterpret_f32_u64(c); + vcmla_f32(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_laneq_f32)"] @@ -3186,19 +3133,10 @@ pub fn vcmlaq_laneq_f32( c: float32x4_t, ) -> float32x4_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - let c: float32x4_t = simd_shuffle!( - c, - c, - [ - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1 - ] - ); - vcmlaq_f32(a, b, c) - } + let c = vreinterpretq_u64_f32(c); + let c = vdupq_laneq_u64::(c); + let c = vreinterpretq_f32_u64(c); + vcmlaq_f32(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot180_f16)"] @@ -3299,19 +3237,10 @@ pub fn vcmla_rot180_lane_f16( c: float16x4_t, ) -> float16x4_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - let c: float16x4_t = simd_shuffle!( - c, - c, - [ - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1 - ] - ); - vcmla_rot180_f16(a, b, c) - } + let c = vreinterpret_u32_f16(c); + let c = vdup_lane_u32::(c); + let c = vreinterpret_f16_u32(c); + vcmla_rot180_f16(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot180_lane_f16)"] @@ -3328,23 +3257,10 @@ pub fn vcmlaq_rot180_lane_f16( c: float16x4_t, ) -> float16x8_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - let c: float16x8_t = simd_shuffle!( - c, - c, - [ - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1 - ] - ); - vcmlaq_rot180_f16(a, b, c) - } + let c = vreinterpret_u32_f16(c); + let c = vdupq_lane_u32::(c); + let c = vreinterpretq_f16_u32(c); + vcmlaq_rot180_f16(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot180_lane_f32)"] @@ -3359,10 +3275,10 @@ pub fn vcmla_rot180_lane_f32( c: float32x2_t, ) -> float32x2_t { static_assert!(LANE == 0); - unsafe { - let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]); - vcmla_rot180_f32(a, b, c) - } + let c = vreinterpret_u64_f32(c); + let c = vdup_lane_u64::(c); + let c = vreinterpret_f32_u64(c); + vcmla_rot180_f32(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot180_lane_f32)"] @@ -3377,19 +3293,10 @@ pub fn vcmlaq_rot180_lane_f32( c: float32x2_t, ) -> float32x4_t { static_assert!(LANE == 0); - unsafe { - let c: float32x4_t = simd_shuffle!( - c, - c, - [ - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1 - ] - ); - vcmlaq_rot180_f32(a, b, c) - } + let c = vreinterpret_u64_f32(c); + let c = vdupq_lane_u64::(c); + let c = vreinterpretq_f32_u64(c); + vcmlaq_rot180_f32(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot180_laneq_f16)"] @@ -3406,19 +3313,10 @@ pub fn vcmla_rot180_laneq_f16( c: float16x8_t, ) -> float16x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - let c: float16x4_t = simd_shuffle!( - c, - c, - [ - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1 - ] - ); - vcmla_rot180_f16(a, b, c) - } + let c = vreinterpretq_u32_f16(c); + let c = vdup_laneq_u32::(c); + let c = vreinterpret_f16_u32(c); + vcmla_rot180_f16(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot180_laneq_f16)"] @@ -3435,23 +3333,10 @@ pub fn vcmlaq_rot180_laneq_f16( c: float16x8_t, ) -> float16x8_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - let c: float16x8_t = simd_shuffle!( - c, - c, - [ - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1 - ] - ); - vcmlaq_rot180_f16(a, b, c) - } + let c = vreinterpretq_u32_f16(c); + let c = vdupq_laneq_u32::(c); + let c = vreinterpretq_f16_u32(c); + vcmlaq_rot180_f16(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot180_laneq_f32)"] @@ -3466,10 +3351,10 @@ pub fn vcmla_rot180_laneq_f32( c: float32x4_t, ) -> float32x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]); - vcmla_rot180_f32(a, b, c) - } + let c = vreinterpretq_u64_f32(c); + let c = vdup_laneq_u64::(c); + let c = vreinterpret_f32_u64(c); + vcmla_rot180_f32(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot180_laneq_f32)"] @@ -3484,19 +3369,10 @@ pub fn vcmlaq_rot180_laneq_f32( c: float32x4_t, ) -> float32x4_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - let c: float32x4_t = simd_shuffle!( - c, - c, - [ - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1 - ] - ); - vcmlaq_rot180_f32(a, b, c) - } + let c = vreinterpretq_u64_f32(c); + let c = vdupq_laneq_u64::(c); + let c = vreinterpretq_f32_u64(c); + vcmlaq_rot180_f32(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot270_f16)"] @@ -3597,19 +3473,10 @@ pub fn vcmla_rot270_lane_f16( c: float16x4_t, ) -> float16x4_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - let c: float16x4_t = simd_shuffle!( - c, - c, - [ - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1 - ] - ); - vcmla_rot270_f16(a, b, c) - } + let c = vreinterpret_u32_f16(c); + let c = vdup_lane_u32::(c); + let c = vreinterpret_f16_u32(c); + vcmla_rot270_f16(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot270_lane_f16)"] @@ -3626,23 +3493,10 @@ pub fn vcmlaq_rot270_lane_f16( c: float16x4_t, ) -> float16x8_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - let c: float16x8_t = simd_shuffle!( - c, - c, - [ - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1 - ] - ); - vcmlaq_rot270_f16(a, b, c) - } + let c = vreinterpret_u32_f16(c); + let c = vdupq_lane_u32::(c); + let c = vreinterpretq_f16_u32(c); + vcmlaq_rot270_f16(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot270_lane_f32)"] @@ -3657,10 +3511,10 @@ pub fn vcmla_rot270_lane_f32( c: float32x2_t, ) -> float32x2_t { static_assert!(LANE == 0); - unsafe { - let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]); - vcmla_rot270_f32(a, b, c) - } + let c = vreinterpret_u64_f32(c); + let c = vdup_lane_u64::(c); + let c = vreinterpret_f32_u64(c); + vcmla_rot270_f32(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot270_lane_f32)"] @@ -3675,19 +3529,10 @@ pub fn vcmlaq_rot270_lane_f32( c: float32x2_t, ) -> float32x4_t { static_assert!(LANE == 0); - unsafe { - let c: float32x4_t = simd_shuffle!( - c, - c, - [ - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1 - ] - ); - vcmlaq_rot270_f32(a, b, c) - } + let c = vreinterpret_u64_f32(c); + let c = vdupq_lane_u64::(c); + let c = vreinterpretq_f32_u64(c); + vcmlaq_rot270_f32(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot270_laneq_f16)"] @@ -3704,19 +3549,10 @@ pub fn vcmla_rot270_laneq_f16( c: float16x8_t, ) -> float16x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - let c: float16x4_t = simd_shuffle!( - c, - c, - [ - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1 - ] - ); - vcmla_rot270_f16(a, b, c) - } + let c = vreinterpretq_u32_f16(c); + let c = vdup_laneq_u32::(c); + let c = vreinterpret_f16_u32(c); + vcmla_rot270_f16(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot270_laneq_f16)"] @@ -3733,23 +3569,10 @@ pub fn vcmlaq_rot270_laneq_f16( c: float16x8_t, ) -> float16x8_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - let c: float16x8_t = simd_shuffle!( - c, - c, - [ - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1 - ] - ); - vcmlaq_rot270_f16(a, b, c) - } + let c = vreinterpretq_u32_f16(c); + let c = vdupq_laneq_u32::(c); + let c = vreinterpretq_f16_u32(c); + vcmlaq_rot270_f16(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot270_laneq_f32)"] @@ -3764,10 +3587,10 @@ pub fn vcmla_rot270_laneq_f32( c: float32x4_t, ) -> float32x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]); - vcmla_rot270_f32(a, b, c) - } + let c = vreinterpretq_u64_f32(c); + let c = vdup_laneq_u64::(c); + let c = vreinterpret_f32_u64(c); + vcmla_rot270_f32(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot270_laneq_f32)"] @@ -3782,19 +3605,10 @@ pub fn vcmlaq_rot270_laneq_f32( c: float32x4_t, ) -> float32x4_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - let c: float32x4_t = simd_shuffle!( - c, - c, - [ - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1 - ] - ); - vcmlaq_rot270_f32(a, b, c) - } + let c = vreinterpretq_u64_f32(c); + let c = vdupq_laneq_u64::(c); + let c = vreinterpretq_f32_u64(c); + vcmlaq_rot270_f32(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot90_f16)"] @@ -3895,19 +3709,10 @@ pub fn vcmla_rot90_lane_f16( c: float16x4_t, ) -> float16x4_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - let c: float16x4_t = simd_shuffle!( - c, - c, - [ - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1 - ] - ); - vcmla_rot90_f16(a, b, c) - } + let c = vreinterpret_u32_f16(c); + let c = vdup_lane_u32::(c); + let c = vreinterpret_f16_u32(c); + vcmla_rot90_f16(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot90_lane_f16)"] @@ -3924,23 +3729,10 @@ pub fn vcmlaq_rot90_lane_f16( c: float16x4_t, ) -> float16x8_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - let c: float16x8_t = simd_shuffle!( - c, - c, - [ - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1 - ] - ); - vcmlaq_rot90_f16(a, b, c) - } + let c = vreinterpret_u32_f16(c); + let c = vdupq_lane_u32::(c); + let c = vreinterpretq_f16_u32(c); + vcmlaq_rot90_f16(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot90_lane_f32)"] @@ -3955,10 +3747,10 @@ pub fn vcmla_rot90_lane_f32( c: float32x2_t, ) -> float32x2_t { static_assert!(LANE == 0); - unsafe { - let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]); - vcmla_rot90_f32(a, b, c) - } + let c = vreinterpret_u64_f32(c); + let c = vdup_lane_u64::(c); + let c = vreinterpret_f32_u64(c); + vcmla_rot90_f32(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot90_lane_f32)"] @@ -3973,19 +3765,10 @@ pub fn vcmlaq_rot90_lane_f32( c: float32x2_t, ) -> float32x4_t { static_assert!(LANE == 0); - unsafe { - let c: float32x4_t = simd_shuffle!( - c, - c, - [ - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1 - ] - ); - vcmlaq_rot90_f32(a, b, c) - } + let c = vreinterpret_u64_f32(c); + let c = vdupq_lane_u64::(c); + let c = vreinterpretq_f32_u64(c); + vcmlaq_rot90_f32(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot90_laneq_f16)"] @@ -4002,19 +3785,10 @@ pub fn vcmla_rot90_laneq_f16( c: float16x8_t, ) -> float16x4_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - let c: float16x4_t = simd_shuffle!( - c, - c, - [ - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1 - ] - ); - vcmla_rot90_f16(a, b, c) - } + let c = vreinterpretq_u32_f16(c); + let c = vdup_laneq_u32::(c); + let c = vreinterpret_f16_u32(c); + vcmla_rot90_f16(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot90_laneq_f16)"] @@ -4031,23 +3805,10 @@ pub fn vcmlaq_rot90_laneq_f16( c: float16x8_t, ) -> float16x8_t { static_assert_uimm_bits!(LANE, 2); - unsafe { - let c: float16x8_t = simd_shuffle!( - c, - c, - [ - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1 - ] - ); - vcmlaq_rot90_f16(a, b, c) - } + let c = vreinterpretq_u32_f16(c); + let c = vdupq_laneq_u32::(c); + let c = vreinterpretq_f16_u32(c); + vcmlaq_rot90_f16(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot90_laneq_f32)"] @@ -4062,10 +3823,10 @@ pub fn vcmla_rot90_laneq_f32( c: float32x4_t, ) -> float32x2_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]); - vcmla_rot90_f32(a, b, c) - } + let c = vreinterpretq_u64_f32(c); + let c = vdup_laneq_u64::(c); + let c = vreinterpret_f32_u64(c); + vcmla_rot90_f32(a, b, c) } #[doc = "Floating-point complex multiply accumulate"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot90_laneq_f32)"] @@ -4080,19 +3841,10 @@ pub fn vcmlaq_rot90_laneq_f32( c: float32x4_t, ) -> float32x4_t { static_assert_uimm_bits!(LANE, 1); - unsafe { - let c: float32x4_t = simd_shuffle!( - c, - c, - [ - 2 * LANE as u32, - 2 * LANE as u32 + 1, - 2 * LANE as u32, - 2 * LANE as u32 + 1 - ] - ); - vcmlaq_rot90_f32(a, b, c) - } + let c = vreinterpretq_u64_f32(c); + let c = vdupq_laneq_u64::(c); + let c = vreinterpretq_f32_u64(c); + vcmlaq_rot90_f32(a, b, c) } #[doc = "Join two smaller vectors into a single larger vector"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcombine_f64)"] diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml index 102447eae8..2f7f2fc2b0 100644 --- a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml @@ -5914,14 +5914,13 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - [float32x2_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]'] - - [float32x4_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float32x2_t, float32x4_t, ''] + - [float32x4_t, float32x4_t, 'q'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, 1]] - - Let: - - c - - "{neon_type[0]}" - - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - Let: [c, {FnCall: [vreinterpretq_u64_f32, [c]]}] + - Let: [c, {FnCall: ['vdup{type[2]}_laneq_u64', [c], [LANE]]}] + - Let: [c, {FnCall: ['vreinterpret{type[2]}_f32_u64', [c]]}] - FnCall: ["vcmla{neon_type[0].no}", [a, b, c]] - name: "vcmla{neon_type[0].laneq_nox}" @@ -5938,14 +5937,13 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - [float16x4_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] - - [float16x8_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float16x4_t, float16x8_t, ''] + - [float16x8_t, float16x8_t, 'q'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, 2]] - - Let: - - c - - "{neon_type[0]}" - - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - Let: [c, {FnCall: [vreinterpretq_u32_f16, [c]]}] + - Let: [c, {FnCall: ['vdup{type[2]}_laneq_u32', [c], [LANE]]}] + - Let: [c, {FnCall: ['vreinterpret{type[2]}_f16_u32', [c]]}] - FnCall: ["vcmla{neon_type[0].no}", [a, b, c]] - name: "vcmla{neon_type[0].rot90_laneq}" @@ -5960,14 +5958,13 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - [float32x2_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]'] - - [float32x4_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float32x2_t, float32x4_t, ''] + - [float32x4_t, float32x4_t, 'q'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, 1]] - - Let: - - c - - "{neon_type[0]}" - - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - Let: [c, {FnCall: [vreinterpretq_u64_f32, [c]]}] + - Let: [c, {FnCall: ['vdup{type[2]}_laneq_u64', [c], [LANE]]}] + - Let: [c, {FnCall: ['vreinterpret{type[2]}_f32_u64', [c]]}] - FnCall: ["vcmla{neon_type[0].rot90}", [a, b, c]] - name: "vcmla{neon_type[0].rot90_laneq}" @@ -5984,14 +5981,13 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - [float16x4_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] - - [float16x8_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float16x4_t, float16x8_t, ''] + - [float16x8_t, float16x8_t, 'q'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, 2]] - - Let: - - c - - "{neon_type[0]}" - - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - Let: [c, {FnCall: [vreinterpretq_u32_f16, [c]]}] + - Let: [c, {FnCall: ['vdup{type[2]}_laneq_u32', [c], [LANE]]}] + - Let: [c, {FnCall: ['vreinterpret{type[2]}_f16_u32', [c]]}] - FnCall: ["vcmla{neon_type[0].rot90}", [a, b, c]] - name: "vcmla{neon_type[0].rot90_lane}" @@ -6006,14 +6002,13 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - [float32x2_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]'] - - [float32x4_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float32x2_t, float32x2_t, ''] + - [float32x4_t, float32x2_t, 'q'] compose: - FnCall: [static_assert!, ['LANE == 0']] - - Let: - - c - - "{neon_type[0]}" - - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - Let: [c, {FnCall: [vreinterpret_u64_f32, [c]]}] + - Let: [c, {FnCall: ['vdup{type[2]}_lane_u64', [c], [LANE]]}] + - Let: [c, {FnCall: ['vreinterpret{type[2]}_f32_u64', [c]]}] - FnCall: ["vcmla{neon_type[0].rot90}", [a, b, c]] - name: "vcmla{neon_type[0].rot90_lane}" @@ -6030,14 +6025,13 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - [float16x4_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] - - [float16x8_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float16x4_t, float16x4_t, ''] + - [float16x8_t, float16x4_t, 'q'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, 1]] - - Let: - - c - - "{neon_type[0]}" - - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - Let: [c, {FnCall: [vreinterpret_u32_f16, [c]]}] + - Let: [c, {FnCall: ['vdup{type[2]}_lane_u32', [c], [LANE]]}] + - Let: [c, {FnCall: ['vreinterpret{type[2]}_f16_u32', [c]]}] - FnCall: ["vcmla{neon_type[0].rot90}", [a, b, c]] - name: "vcmla{neon_type.rot180}" @@ -6095,14 +6089,13 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - [float32x2_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]'] - - [float32x4_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float32x2_t, float32x4_t, ''] + - [float32x4_t, float32x4_t, 'q'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, 1]] - - Let: - - c - - "{neon_type[0]}" - - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - Let: [c, {FnCall: [vreinterpretq_u64_f32, [c]]}] + - Let: [c, {FnCall: ['vdup{type[2]}_laneq_u64', [c], [LANE]]}] + - Let: [c, {FnCall: ['vreinterpret{type[2]}_f32_u64', [c]]}] - FnCall: ["vcmla{neon_type[0].rot180}", [a, b, c]] - name: "vcmla{neon_type[0].rot180_laneq}" @@ -6119,16 +6112,13 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - [float16x4_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] - - [float16x8_t, float16x8_t, - '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]' - ] + - [float16x4_t, float16x8_t, ''] + - [float16x8_t, float16x8_t, 'q'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, 2]] - - Let: - - c - - "{neon_type[0]}" - - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - Let: [c, {FnCall: [vreinterpretq_u32_f16, [c]]}] + - Let: [c, {FnCall: ['vdup{type[2]}_laneq_u32', [c], [LANE]]}] + - Let: [c, {FnCall: ['vreinterpret{type[2]}_f16_u32', [c]]}] - FnCall: ["vcmla{neon_type[0].rot180}", [a, b, c]] - name: "vcmla{neon_type[0].rot180_lane}" @@ -6143,14 +6133,13 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - [float32x2_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]'] - - [float32x4_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float32x2_t, float32x2_t, ''] + - [float32x4_t, float32x2_t, 'q'] compose: - FnCall: [static_assert!, ['LANE == 0']] - - Let: - - c - - "{neon_type[0]}" - - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - Let: [c, {FnCall: [vreinterpret_u64_f32, [c]]}] + - Let: [c, {FnCall: ['vdup{type[2]}_lane_u64', [c], [LANE]]}] + - Let: [c, {FnCall: ['vreinterpret{type[2]}_f32_u64', [c]]}] - FnCall: ["vcmla{neon_type[0].rot180}", [a, b, c]] - name: "vcmla{neon_type[0].rot180_lane}" @@ -6167,14 +6156,13 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - [float16x4_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] - - [float16x8_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float16x4_t, float16x4_t, ''] + - [float16x8_t, float16x4_t, 'q'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, 1]] - - Let: - - c - - "{neon_type[0]}" - - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - Let: [c, {FnCall: [vreinterpret_u32_f16, [c]]}] + - Let: [c, {FnCall: ['vdup{type[2]}_lane_u32', [c], [LANE]]}] + - Let: [c, {FnCall: ['vreinterpret{type[2]}_f16_u32', [c]]}] - FnCall: ["vcmla{neon_type[0].rot180}", [a, b, c]] - name: "vcmla{neon_type[0].rot270_laneq}" @@ -6189,14 +6177,13 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - [float32x2_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]'] - - [float32x4_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float32x2_t, float32x4_t, ''] + - [float32x4_t, float32x4_t, 'q'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, 1]] - - Let: - - c - - "{neon_type[0]}" - - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - Let: [c, {FnCall: [vreinterpretq_u64_f32, [c]]}] + - Let: [c, {FnCall: ['vdup{type[2]}_laneq_u64', [c], [LANE]]}] + - Let: [c, {FnCall: ['vreinterpret{type[2]}_f32_u64', [c]]}] - FnCall: ["vcmla{neon_type[0].rot270}", [a, b, c]] - name: "vcmla{neon_type[0].rot270_laneq}" @@ -6213,14 +6200,13 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - [float16x4_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] - - [float16x8_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float16x4_t, float16x8_t, ''] + - [float16x8_t, float16x8_t, 'q'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, 2]] - - Let: - - c - - "{neon_type[0]}" - - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - Let: [c, {FnCall: [vreinterpretq_u32_f16, [c]]}] + - Let: [c, {FnCall: ['vdup{type[2]}_laneq_u32', [c], [LANE]]}] + - Let: [c, {FnCall: ['vreinterpret{type[2]}_f16_u32', [c]]}] - FnCall: ["vcmla{neon_type[0].rot270}", [a, b, c]] - name: "vcmla{neon_type[0].lane_nox}" @@ -6235,14 +6221,13 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - [float32x2_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]'] - - [float32x4_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float32x2_t, float32x2_t, ''] + - [float32x4_t, float32x2_t, 'q'] compose: - FnCall: [static_assert!, ['LANE == 0']] - - Let: - - c - - "{neon_type[0]}" - - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - Let: [c, {FnCall: [vreinterpret_u64_f32, [c]]}] + - Let: [c, {FnCall: ['vdup{type[2]}_lane_u64', [c], [LANE]]}] + - Let: [c, {FnCall: ['vreinterpret{type[2]}_f32_u64', [c]]}] - FnCall: ["vcmla{neon_type[0].no}", [a, b, c]] @@ -6260,14 +6245,13 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - [float16x4_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] - - [float16x8_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float16x4_t, float16x4_t, ''] + - [float16x8_t, float16x4_t, 'q'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, 1]] - - Let: - - c - - "{neon_type[0]}" - - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]] + - Let: [c, {FnCall: [vreinterpret_u32_f16, [c]]}] + - Let: [c, {FnCall: ['vdup{type[2]}_lane_u32', [c], [LANE]]}] + - Let: [c, {FnCall: ['vreinterpret{type[2]}_f16_u32', [c]]}] - FnCall: ["vcmla{neon_type[0].no}", [a, b, c]] - name: "vcmla{neon_type[0].rot270_lane}" @@ -6282,11 +6266,13 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - [float32x2_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]'] - - [float32x4_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float32x2_t, float32x2_t, ''] + - [float32x4_t, float32x2_t, 'q'] compose: - FnCall: [static_assert!, ['LANE == 0']] - - Let: [c, "{neon_type[0]}", {FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]}] + - Let: [c, {FnCall: [vreinterpret_u64_f32, [c]]}] + - Let: [c, {FnCall: ['vdup{type[2]}_lane_u64', [c], [LANE]]}] + - Let: [c, {FnCall: ['vreinterpret{type[2]}_f32_u64', [c]]}] - FnCall: ["vcmla{neon_type[0].rot270}", [a, b, c]] - name: "vcmla{neon_type[0].rot270_lane}" @@ -6303,11 +6289,13 @@ intrinsics: static_defs: ["const LANE: i32"] safety: safe types: - - [float16x4_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] - - [float16x8_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'] + - [float16x4_t, float16x4_t, ''] + - [float16x8_t, float16x4_t, 'q'] compose: - FnCall: [static_assert_uimm_bits!, [LANE, 1]] - - Let: [c, "{neon_type[0]}", {FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]}] + - Let: [c, {FnCall: [vreinterpret_u32_f16, [c]]}] + - Let: [c, {FnCall: ['vdup{type[2]}_lane_u32', [c], [LANE]]}] + - Let: [c, {FnCall: ['vreinterpret{type[2]}_f16_u32', [c]]}] - FnCall: ["vcmla{neon_type[0].rot270}", [a, b, c]] - name: "vmax{neon_type.no}"