From 1316fe8b9241ee4f5d9be5ae1cccced038ab6314 Mon Sep 17 00:00:00 2001
From: sayantn <sayantn05@gmail.com>
Date: Mon, 11 May 2026 21:28:47 +0530
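Subject: [PATCH 1/2] Fix `rot180_lane` suffix mapping typo in gen-arm

The `"rot180_lane"` suffix was parsed as `SuffixKind::Rot180LaneQ` instead of
`SuffixKind::Rot180Lane`. Fix the mapping and switch the `vcmla` rot180 lane
spec entries to `{neon_type[0].rot180_lane}` in place of the hard-coded
name-suffix strings previously carried in `type[3]`.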

---
 crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml | 14 ++++++--------
 crates/stdarch-gen-arm/src/fn_suffix.rs           |  2 +-
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
index cfd44332ec..102447eae8 100644
--- a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
+++ b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
@@ -6131,7 +6131,7 @@ intrinsics:
           - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
       - FnCall: ["vcmla{neon_type[0].rot180}", [a, b, c]]
 
-  - name: "vcmla{type[3]}"
+  - name: "vcmla{neon_type[0].rot180_lane}"
     doc: Floating-point complex multiply accumulate
     arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"]
     return_type: "{neon_type[0]}"
@@ -6143,8 +6143,8 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float32x2_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]', '_rot180_lane_f32']
-      - [float32x4_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]', 'q_rot180_lane_f32']
+      - [float32x2_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float32x4_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
     compose:
       - FnCall: [static_assert!, ['LANE == 0']]
       - Let:
@@ -6153,7 +6153,7 @@ intrinsics:
           - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
       - FnCall: ["vcmla{neon_type[0].rot180}", [a, b, c]]
 
-  - name: "vcmla{type[3]}"
+  - name: "vcmla{neon_type[0].rot180_lane}"
     doc: Floating-point complex multiply accumulate
     arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}", "c: {neon_type[1]}"]
     return_type: "{neon_type[0]}"
@@ -6167,10 +6167,8 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float16x4_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]', '_rot180_lane_f16']
-      - [float16x8_t, float16x4_t,
-          '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]', 'q_rot180_lane_f16'
-        ]
+      - [float16x4_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float16x8_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
     compose:
       - FnCall: [static_assert_uimm_bits!, [LANE, 1]]
       - Let:
diff --git a/crates/stdarch-gen-arm/src/fn_suffix.rs b/crates/stdarch-gen-arm/src/fn_suffix.rs
index 26c156ae17..6fba3dc744 100644
--- a/crates/stdarch-gen-arm/src/fn_suffix.rs
+++ b/crates/stdarch-gen-arm/src/fn_suffix.rs
@@ -188,7 +188,7 @@ impl FromStr for SuffixKind {
             "rot90_lane" => Ok(SuffixKind::Rot90Lane),
             "rot90_laneq" => Ok(SuffixKind::Rot90LaneQ),
             "rot180" => Ok(SuffixKind::Rot180),
-            "rot180_lane" => Ok(SuffixKind::Rot180LaneQ),
+            "rot180_lane" => Ok(SuffixKind::Rot180Lane),
             "rot180_laneq" => Ok(SuffixKind::Rot180LaneQ),
             "u" => Ok(SuffixKind::Unsigned),
             "nox" => Ok(SuffixKind::NoX),

From 8a370d2091d8c61701b05c5df70efa8bc9aee101 Mon Sep 17 00:00:00 2001
From: sayantn <sayantn05@gmail.com>
Date: Mon, 11 May 2026 21:29:36 +0530
Subject: [PATCH 2/2] Implement `vcmla_lane` with ARM intrinsics

---
 .../core_arch/src/aarch64/neon/generated.rs   | 504 +++++-------------
 .../spec/neon/aarch64.spec.yml                | 172 +++---
 2 files changed, 208 insertions(+), 468 deletions(-)

diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs
index 11c3a52870..3241583cf0 100644
--- a/crates/core_arch/src/aarch64/neon/generated.rs
+++ b/crates/core_arch/src/aarch64/neon/generated.rs
@@ -3001,19 +3001,10 @@ pub fn vcmla_lane_f16<const LANE: i32>(
     c: float16x4_t,
 ) -> float16x4_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float16x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmla_f16(a, b, c)
-    }
+    let c = vreinterpret_u32_f16(c);
+    let c = vdup_lane_u32::<LANE>(c);
+    let c = vreinterpret_f16_u32(c);
+    vcmla_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_lane_f16)"]
@@ -3030,23 +3021,10 @@ pub fn vcmlaq_lane_f16<const LANE: i32>(
     c: float16x4_t,
 ) -> float16x8_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float16x8_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_f16(a, b, c)
-    }
+    let c = vreinterpret_u32_f16(c);
+    let c = vdupq_lane_u32::<LANE>(c);
+    let c = vreinterpretq_f16_u32(c);
+    vcmlaq_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_lane_f32)"]
@@ -3061,10 +3039,10 @@ pub fn vcmla_lane_f32<const LANE: i32>(
     c: float32x2_t,
 ) -> float32x2_t {
     static_assert!(LANE == 0);
-    unsafe {
-        let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]);
-        vcmla_f32(a, b, c)
-    }
+    let c = vreinterpret_u64_f32(c);
+    let c = vdup_lane_u64::<LANE>(c);
+    let c = vreinterpret_f32_u64(c);
+    vcmla_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_lane_f32)"]
@@ -3079,19 +3057,10 @@ pub fn vcmlaq_lane_f32<const LANE: i32>(
     c: float32x2_t,
 ) -> float32x4_t {
     static_assert!(LANE == 0);
-    unsafe {
-        let c: float32x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_f32(a, b, c)
-    }
+    let c = vreinterpret_u64_f32(c);
+    let c = vdupq_lane_u64::<LANE>(c);
+    let c = vreinterpretq_f32_u64(c);
+    vcmlaq_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_laneq_f16)"]
@@ -3108,19 +3077,10 @@ pub fn vcmla_laneq_f16<const LANE: i32>(
     c: float16x8_t,
 ) -> float16x4_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        let c: float16x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmla_f16(a, b, c)
-    }
+    let c = vreinterpretq_u32_f16(c);
+    let c = vdup_laneq_u32::<LANE>(c);
+    let c = vreinterpret_f16_u32(c);
+    vcmla_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_laneq_f16)"]
@@ -3137,23 +3097,10 @@ pub fn vcmlaq_laneq_f16<const LANE: i32>(
     c: float16x8_t,
 ) -> float16x8_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        let c: float16x8_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_f16(a, b, c)
-    }
+    let c = vreinterpretq_u32_f16(c);
+    let c = vdupq_laneq_u32::<LANE>(c);
+    let c = vreinterpretq_f16_u32(c);
+    vcmlaq_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_laneq_f32)"]
@@ -3168,10 +3115,10 @@ pub fn vcmla_laneq_f32<const LANE: i32>(
     c: float32x4_t,
 ) -> float32x2_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]);
-        vcmla_f32(a, b, c)
-    }
+    let c = vreinterpretq_u64_f32(c);
+    let c = vdup_laneq_u64::<LANE>(c);
+    let c = vreinterpret_f32_u64(c);
+    vcmla_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_laneq_f32)"]
@@ -3186,19 +3133,10 @@ pub fn vcmlaq_laneq_f32<const LANE: i32>(
     c: float32x4_t,
 ) -> float32x4_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float32x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_f32(a, b, c)
-    }
+    let c = vreinterpretq_u64_f32(c);
+    let c = vdupq_laneq_u64::<LANE>(c);
+    let c = vreinterpretq_f32_u64(c);
+    vcmlaq_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot180_f16)"]
@@ -3299,19 +3237,10 @@ pub fn vcmla_rot180_lane_f16<const LANE: i32>(
     c: float16x4_t,
 ) -> float16x4_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float16x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmla_rot180_f16(a, b, c)
-    }
+    let c = vreinterpret_u32_f16(c);
+    let c = vdup_lane_u32::<LANE>(c);
+    let c = vreinterpret_f16_u32(c);
+    vcmla_rot180_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot180_lane_f16)"]
@@ -3328,23 +3257,10 @@ pub fn vcmlaq_rot180_lane_f16<const LANE: i32>(
     c: float16x4_t,
 ) -> float16x8_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float16x8_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_rot180_f16(a, b, c)
-    }
+    let c = vreinterpret_u32_f16(c);
+    let c = vdupq_lane_u32::<LANE>(c);
+    let c = vreinterpretq_f16_u32(c);
+    vcmlaq_rot180_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot180_lane_f32)"]
@@ -3359,10 +3275,10 @@ pub fn vcmla_rot180_lane_f32<const LANE: i32>(
     c: float32x2_t,
 ) -> float32x2_t {
     static_assert!(LANE == 0);
-    unsafe {
-        let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]);
-        vcmla_rot180_f32(a, b, c)
-    }
+    let c = vreinterpret_u64_f32(c);
+    let c = vdup_lane_u64::<LANE>(c);
+    let c = vreinterpret_f32_u64(c);
+    vcmla_rot180_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot180_lane_f32)"]
@@ -3377,19 +3293,10 @@ pub fn vcmlaq_rot180_lane_f32<const LANE: i32>(
     c: float32x2_t,
 ) -> float32x4_t {
     static_assert!(LANE == 0);
-    unsafe {
-        let c: float32x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_rot180_f32(a, b, c)
-    }
+    let c = vreinterpret_u64_f32(c);
+    let c = vdupq_lane_u64::<LANE>(c);
+    let c = vreinterpretq_f32_u64(c);
+    vcmlaq_rot180_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot180_laneq_f16)"]
@@ -3406,19 +3313,10 @@ pub fn vcmla_rot180_laneq_f16<const LANE: i32>(
     c: float16x8_t,
 ) -> float16x4_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        let c: float16x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmla_rot180_f16(a, b, c)
-    }
+    let c = vreinterpretq_u32_f16(c);
+    let c = vdup_laneq_u32::<LANE>(c);
+    let c = vreinterpret_f16_u32(c);
+    vcmla_rot180_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot180_laneq_f16)"]
@@ -3435,23 +3333,10 @@ pub fn vcmlaq_rot180_laneq_f16<const LANE: i32>(
     c: float16x8_t,
 ) -> float16x8_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        let c: float16x8_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_rot180_f16(a, b, c)
-    }
+    let c = vreinterpretq_u32_f16(c);
+    let c = vdupq_laneq_u32::<LANE>(c);
+    let c = vreinterpretq_f16_u32(c);
+    vcmlaq_rot180_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot180_laneq_f32)"]
@@ -3466,10 +3351,10 @@ pub fn vcmla_rot180_laneq_f32<const LANE: i32>(
     c: float32x4_t,
 ) -> float32x2_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]);
-        vcmla_rot180_f32(a, b, c)
-    }
+    let c = vreinterpretq_u64_f32(c);
+    let c = vdup_laneq_u64::<LANE>(c);
+    let c = vreinterpret_f32_u64(c);
+    vcmla_rot180_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot180_laneq_f32)"]
@@ -3484,19 +3369,10 @@ pub fn vcmlaq_rot180_laneq_f32<const LANE: i32>(
     c: float32x4_t,
 ) -> float32x4_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float32x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_rot180_f32(a, b, c)
-    }
+    let c = vreinterpretq_u64_f32(c);
+    let c = vdupq_laneq_u64::<LANE>(c);
+    let c = vreinterpretq_f32_u64(c);
+    vcmlaq_rot180_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot270_f16)"]
@@ -3597,19 +3473,10 @@ pub fn vcmla_rot270_lane_f16<const LANE: i32>(
     c: float16x4_t,
 ) -> float16x4_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float16x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmla_rot270_f16(a, b, c)
-    }
+    let c = vreinterpret_u32_f16(c);
+    let c = vdup_lane_u32::<LANE>(c);
+    let c = vreinterpret_f16_u32(c);
+    vcmla_rot270_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot270_lane_f16)"]
@@ -3626,23 +3493,10 @@ pub fn vcmlaq_rot270_lane_f16<const LANE: i32>(
     c: float16x4_t,
 ) -> float16x8_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float16x8_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_rot270_f16(a, b, c)
-    }
+    let c = vreinterpret_u32_f16(c);
+    let c = vdupq_lane_u32::<LANE>(c);
+    let c = vreinterpretq_f16_u32(c);
+    vcmlaq_rot270_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot270_lane_f32)"]
@@ -3657,10 +3511,10 @@ pub fn vcmla_rot270_lane_f32<const LANE: i32>(
     c: float32x2_t,
 ) -> float32x2_t {
     static_assert!(LANE == 0);
-    unsafe {
-        let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]);
-        vcmla_rot270_f32(a, b, c)
-    }
+    let c = vreinterpret_u64_f32(c);
+    let c = vdup_lane_u64::<LANE>(c);
+    let c = vreinterpret_f32_u64(c);
+    vcmla_rot270_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot270_lane_f32)"]
@@ -3675,19 +3529,10 @@ pub fn vcmlaq_rot270_lane_f32<const LANE: i32>(
     c: float32x2_t,
 ) -> float32x4_t {
     static_assert!(LANE == 0);
-    unsafe {
-        let c: float32x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_rot270_f32(a, b, c)
-    }
+    let c = vreinterpret_u64_f32(c);
+    let c = vdupq_lane_u64::<LANE>(c);
+    let c = vreinterpretq_f32_u64(c);
+    vcmlaq_rot270_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot270_laneq_f16)"]
@@ -3704,19 +3549,10 @@ pub fn vcmla_rot270_laneq_f16<const LANE: i32>(
     c: float16x8_t,
 ) -> float16x4_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        let c: float16x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmla_rot270_f16(a, b, c)
-    }
+    let c = vreinterpretq_u32_f16(c);
+    let c = vdup_laneq_u32::<LANE>(c);
+    let c = vreinterpret_f16_u32(c);
+    vcmla_rot270_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot270_laneq_f16)"]
@@ -3733,23 +3569,10 @@ pub fn vcmlaq_rot270_laneq_f16<const LANE: i32>(
     c: float16x8_t,
 ) -> float16x8_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        let c: float16x8_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_rot270_f16(a, b, c)
-    }
+    let c = vreinterpretq_u32_f16(c);
+    let c = vdupq_laneq_u32::<LANE>(c);
+    let c = vreinterpretq_f16_u32(c);
+    vcmlaq_rot270_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot270_laneq_f32)"]
@@ -3764,10 +3587,10 @@ pub fn vcmla_rot270_laneq_f32<const LANE: i32>(
     c: float32x4_t,
 ) -> float32x2_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]);
-        vcmla_rot270_f32(a, b, c)
-    }
+    let c = vreinterpretq_u64_f32(c);
+    let c = vdup_laneq_u64::<LANE>(c);
+    let c = vreinterpret_f32_u64(c);
+    vcmla_rot270_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot270_laneq_f32)"]
@@ -3782,19 +3605,10 @@ pub fn vcmlaq_rot270_laneq_f32<const LANE: i32>(
     c: float32x4_t,
 ) -> float32x4_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float32x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_rot270_f32(a, b, c)
-    }
+    let c = vreinterpretq_u64_f32(c);
+    let c = vdupq_laneq_u64::<LANE>(c);
+    let c = vreinterpretq_f32_u64(c);
+    vcmlaq_rot270_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot90_f16)"]
@@ -3895,19 +3709,10 @@ pub fn vcmla_rot90_lane_f16<const LANE: i32>(
     c: float16x4_t,
 ) -> float16x4_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float16x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmla_rot90_f16(a, b, c)
-    }
+    let c = vreinterpret_u32_f16(c);
+    let c = vdup_lane_u32::<LANE>(c);
+    let c = vreinterpret_f16_u32(c);
+    vcmla_rot90_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot90_lane_f16)"]
@@ -3924,23 +3729,10 @@ pub fn vcmlaq_rot90_lane_f16<const LANE: i32>(
     c: float16x4_t,
 ) -> float16x8_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float16x8_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_rot90_f16(a, b, c)
-    }
+    let c = vreinterpret_u32_f16(c);
+    let c = vdupq_lane_u32::<LANE>(c);
+    let c = vreinterpretq_f16_u32(c);
+    vcmlaq_rot90_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot90_lane_f32)"]
@@ -3955,10 +3747,10 @@ pub fn vcmla_rot90_lane_f32<const LANE: i32>(
     c: float32x2_t,
 ) -> float32x2_t {
     static_assert!(LANE == 0);
-    unsafe {
-        let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]);
-        vcmla_rot90_f32(a, b, c)
-    }
+    let c = vreinterpret_u64_f32(c);
+    let c = vdup_lane_u64::<LANE>(c);
+    let c = vreinterpret_f32_u64(c);
+    vcmla_rot90_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot90_lane_f32)"]
@@ -3973,19 +3765,10 @@ pub fn vcmlaq_rot90_lane_f32<const LANE: i32>(
     c: float32x2_t,
 ) -> float32x4_t {
     static_assert!(LANE == 0);
-    unsafe {
-        let c: float32x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_rot90_f32(a, b, c)
-    }
+    let c = vreinterpret_u64_f32(c);
+    let c = vdupq_lane_u64::<LANE>(c);
+    let c = vreinterpretq_f32_u64(c);
+    vcmlaq_rot90_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot90_laneq_f16)"]
@@ -4002,19 +3785,10 @@ pub fn vcmla_rot90_laneq_f16<const LANE: i32>(
     c: float16x8_t,
 ) -> float16x4_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        let c: float16x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmla_rot90_f16(a, b, c)
-    }
+    let c = vreinterpretq_u32_f16(c);
+    let c = vdup_laneq_u32::<LANE>(c);
+    let c = vreinterpret_f16_u32(c);
+    vcmla_rot90_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot90_laneq_f16)"]
@@ -4031,23 +3805,10 @@ pub fn vcmlaq_rot90_laneq_f16<const LANE: i32>(
     c: float16x8_t,
 ) -> float16x8_t {
     static_assert_uimm_bits!(LANE, 2);
-    unsafe {
-        let c: float16x8_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_rot90_f16(a, b, c)
-    }
+    let c = vreinterpretq_u32_f16(c);
+    let c = vdupq_laneq_u32::<LANE>(c);
+    let c = vreinterpretq_f16_u32(c);
+    vcmlaq_rot90_f16(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmla_rot90_laneq_f32)"]
@@ -4062,10 +3823,10 @@ pub fn vcmla_rot90_laneq_f32<const LANE: i32>(
     c: float32x4_t,
 ) -> float32x2_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float32x2_t = simd_shuffle!(c, c, [2 * LANE as u32, 2 * LANE as u32 + 1]);
-        vcmla_rot90_f32(a, b, c)
-    }
+    let c = vreinterpretq_u64_f32(c);
+    let c = vdup_laneq_u64::<LANE>(c);
+    let c = vreinterpret_f32_u64(c);
+    vcmla_rot90_f32(a, b, c)
 }
 #[doc = "Floating-point complex multiply accumulate"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcmlaq_rot90_laneq_f32)"]
@@ -4080,19 +3841,10 @@ pub fn vcmlaq_rot90_laneq_f32<const LANE: i32>(
     c: float32x4_t,
 ) -> float32x4_t {
     static_assert_uimm_bits!(LANE, 1);
-    unsafe {
-        let c: float32x4_t = simd_shuffle!(
-            c,
-            c,
-            [
-                2 * LANE as u32,
-                2 * LANE as u32 + 1,
-                2 * LANE as u32,
-                2 * LANE as u32 + 1
-            ]
-        );
-        vcmlaq_rot90_f32(a, b, c)
-    }
+    let c = vreinterpretq_u64_f32(c);
+    let c = vdupq_laneq_u64::<LANE>(c);
+    let c = vreinterpretq_f32_u64(c);
+    vcmlaq_rot90_f32(a, b, c)
 }
 #[doc = "Join two smaller vectors into a single larger vector"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcombine_f64)"]
diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
index 102447eae8..2f7f2fc2b0 100644
--- a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
+++ b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
@@ -5914,14 +5914,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float32x2_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float32x4_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float32x2_t, float32x4_t, '']
+      - [float32x4_t, float32x4_t, 'q']
     compose:
       - FnCall: [static_assert_uimm_bits!, [LANE, 1]]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpretq_u64_f32, [c]]}]
+      - Let: [c, {FnCall: ['vdup{type[2]}_laneq_u64', [c], [LANE]]}]
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f32_u64', [c]]}]
       - FnCall: ["vcmla{neon_type[0].no}", [a, b, c]]
 
   - name: "vcmla{neon_type[0].laneq_nox}"
@@ -5938,14 +5937,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float16x4_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float16x8_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float16x4_t, float16x8_t, '']
+      - [float16x8_t, float16x8_t, 'q']
     compose:
       - FnCall: [static_assert_uimm_bits!, [LANE, 2]]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpretq_u32_f16, [c]]}]
+      - Let: [c, {FnCall: ['vdup{type[2]}_laneq_u32', [c], [LANE]]}]
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f16_u32', [c]]}]
       - FnCall: ["vcmla{neon_type[0].no}", [a, b, c]]
 
   - name: "vcmla{neon_type[0].rot90_laneq}"
@@ -5960,14 +5958,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float32x2_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float32x4_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float32x2_t, float32x4_t, '']
+      - [float32x4_t, float32x4_t, 'q']
     compose:
       - FnCall: [static_assert_uimm_bits!, [LANE, 1]]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpretq_u64_f32, [c]]}]
+      - Let: [c, {FnCall: ['vdup{type[2]}_laneq_u64', [c], [LANE]]}]
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f32_u64', [c]]}]
       - FnCall: ["vcmla{neon_type[0].rot90}", [a, b, c]]
 
   - name: "vcmla{neon_type[0].rot90_laneq}"
@@ -5984,14 +5981,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float16x4_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float16x8_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float16x4_t, float16x8_t, '']  # type[2] ('' or 'q') is spliced into the vdup/vreinterpret names in compose
+      - [float16x8_t, float16x8_t, 'q']
     compose:
       - FnCall: [static_assert_uimm_bits!, [LANE, 2]]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpretq_u32_f16, [c]]}]  # view c as u32 lanes so each adjacent f16 pair is one lane
+      - Let: [c, {FnCall: ['vdup{type[2]}_laneq_u32', [c], [LANE]]}]  # broadcast pair LANE, i.e. original elements 2*LANE and 2*LANE+1
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f16_u32', [c]]}]  # reinterpret back to f16 for the non-lane call below
       - FnCall: ["vcmla{neon_type[0].rot90}", [a, b, c]]
 
   - name: "vcmla{neon_type[0].rot90_lane}"
@@ -6006,14 +6002,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float32x2_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float32x4_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float32x2_t, float32x2_t, '']  # type[2] ('' or 'q') is spliced into the vdup/vreinterpret names in compose
+      - [float32x4_t, float32x2_t, 'q']
     compose:
       - FnCall: [static_assert!, ['LANE == 0']]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpret_u64_f32, [c]]}]  # view the f32 pair held in c as a u64 lane
+      - Let: [c, {FnCall: ['vdup{type[2]}_lane_u64', [c], [LANE]]}]  # broadcast lane LANE (asserted to be 0 above)
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f32_u64', [c]]}]  # reinterpret back to f32 for the non-lane call below
       - FnCall: ["vcmla{neon_type[0].rot90}", [a, b, c]]
 
   - name: "vcmla{neon_type[0].rot90_lane}"
@@ -6030,14 +6025,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float16x4_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float16x8_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float16x4_t, float16x4_t, '']  # type[2] ('' or 'q') is spliced into the vdup/vreinterpret names in compose
+      - [float16x8_t, float16x4_t, 'q']
     compose:
       - FnCall: [static_assert_uimm_bits!, [LANE, 1]]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpret_u32_f16, [c]]}]  # view c as u32 lanes so each adjacent f16 pair is one lane
+      - Let: [c, {FnCall: ['vdup{type[2]}_lane_u32', [c], [LANE]]}]  # broadcast pair LANE, i.e. original elements 2*LANE and 2*LANE+1
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f16_u32', [c]]}]  # reinterpret back to f16 for the non-lane call below
       - FnCall: ["vcmla{neon_type[0].rot90}", [a, b, c]]
 
   - name: "vcmla{neon_type.rot180}"
@@ -6095,14 +6089,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float32x2_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float32x4_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float32x2_t, float32x4_t, '']  # type[2] ('' or 'q') is spliced into the vdup/vreinterpret names in compose
+      - [float32x4_t, float32x4_t, 'q']
     compose:
       - FnCall: [static_assert_uimm_bits!, [LANE, 1]]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpretq_u64_f32, [c]]}]  # view c as u64 lanes so each adjacent f32 pair is one lane
+      - Let: [c, {FnCall: ['vdup{type[2]}_laneq_u64', [c], [LANE]]}]  # broadcast pair LANE, i.e. original elements 2*LANE and 2*LANE+1
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f32_u64', [c]]}]  # reinterpret back to f32 for the non-lane call below
       - FnCall: ["vcmla{neon_type[0].rot180}", [a, b, c]]
 
   - name: "vcmla{neon_type[0].rot180_laneq}"
@@ -6119,16 +6112,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float16x4_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float16x8_t, float16x8_t,
-        '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]'
-        ]
+      - [float16x4_t, float16x8_t, '']  # type[2] ('' or 'q') is spliced into the vdup/vreinterpret names in compose
+      - [float16x8_t, float16x8_t, 'q']
     compose:
       - FnCall: [static_assert_uimm_bits!, [LANE, 2]]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpretq_u32_f16, [c]]}]  # view c as u32 lanes so each adjacent f16 pair is one lane
+      - Let: [c, {FnCall: ['vdup{type[2]}_laneq_u32', [c], [LANE]]}]  # broadcast pair LANE, i.e. original elements 2*LANE and 2*LANE+1
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f16_u32', [c]]}]  # reinterpret back to f16 for the non-lane call below
       - FnCall: ["vcmla{neon_type[0].rot180}", [a, b, c]]
 
   - name: "vcmla{neon_type[0].rot180_lane}"
@@ -6143,14 +6133,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float32x2_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float32x4_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float32x2_t, float32x2_t, '']  # type[2] ('' or 'q') is spliced into the vdup/vreinterpret names in compose
+      - [float32x4_t, float32x2_t, 'q']
     compose:
       - FnCall: [static_assert!, ['LANE == 0']]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpret_u64_f32, [c]]}]  # view the f32 pair held in c as a u64 lane
+      - Let: [c, {FnCall: ['vdup{type[2]}_lane_u64', [c], [LANE]]}]  # broadcast lane LANE (asserted to be 0 above)
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f32_u64', [c]]}]  # reinterpret back to f32 for the non-lane call below
       - FnCall: ["vcmla{neon_type[0].rot180}", [a, b, c]]
 
   - name: "vcmla{neon_type[0].rot180_lane}"
@@ -6167,14 +6156,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float16x4_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float16x8_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float16x4_t, float16x4_t, '']  # type[2] ('' or 'q') is spliced into the vdup/vreinterpret names in compose
+      - [float16x8_t, float16x4_t, 'q']
     compose:
       - FnCall: [static_assert_uimm_bits!, [LANE, 1]]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpret_u32_f16, [c]]}]  # view c as u32 lanes so each adjacent f16 pair is one lane
+      - Let: [c, {FnCall: ['vdup{type[2]}_lane_u32', [c], [LANE]]}]  # broadcast pair LANE, i.e. original elements 2*LANE and 2*LANE+1
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f16_u32', [c]]}]  # reinterpret back to f16 for the non-lane call below
       - FnCall: ["vcmla{neon_type[0].rot180}", [a, b, c]]
 
   - name: "vcmla{neon_type[0].rot270_laneq}"
@@ -6189,14 +6177,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float32x2_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float32x4_t, float32x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float32x2_t, float32x4_t, '']  # type[2] ('' or 'q') is spliced into the vdup/vreinterpret names in compose
+      - [float32x4_t, float32x4_t, 'q']
     compose:
       - FnCall: [static_assert_uimm_bits!, [LANE, 1]]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpretq_u64_f32, [c]]}]  # view c as u64 lanes so each adjacent f32 pair is one lane
+      - Let: [c, {FnCall: ['vdup{type[2]}_laneq_u64', [c], [LANE]]}]  # broadcast pair LANE, i.e. original elements 2*LANE and 2*LANE+1
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f32_u64', [c]]}]  # reinterpret back to f32 for the non-lane call below
       - FnCall: ["vcmla{neon_type[0].rot270}", [a, b, c]]
 
   - name: "vcmla{neon_type[0].rot270_laneq}"
@@ -6213,14 +6200,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float16x4_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float16x8_t, float16x8_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float16x4_t, float16x8_t, '']  # type[2] ('' or 'q') is spliced into the vdup/vreinterpret names in compose
+      - [float16x8_t, float16x8_t, 'q']
     compose:
       - FnCall: [static_assert_uimm_bits!, [LANE, 2]]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpretq_u32_f16, [c]]}]  # view c as u32 lanes so each adjacent f16 pair is one lane
+      - Let: [c, {FnCall: ['vdup{type[2]}_laneq_u32', [c], [LANE]]}]  # broadcast pair LANE, i.e. original elements 2*LANE and 2*LANE+1
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f16_u32', [c]]}]  # reinterpret back to f16 for the non-lane call below
       - FnCall: ["vcmla{neon_type[0].rot270}", [a, b, c]]
 
   - name: "vcmla{neon_type[0].lane_nox}"
@@ -6235,14 +6221,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float32x2_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float32x4_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float32x2_t, float32x2_t, '']  # type[2] ('' or 'q') is spliced into the vdup/vreinterpret names in compose
+      - [float32x4_t, float32x2_t, 'q']
     compose:
       - FnCall: [static_assert!, ['LANE == 0']]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpret_u64_f32, [c]]}]  # view the f32 pair held in c as a u64 lane
+      - Let: [c, {FnCall: ['vdup{type[2]}_lane_u64', [c], [LANE]]}]  # broadcast lane LANE (asserted to be 0 above)
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f32_u64', [c]]}]  # reinterpret back to f32 for the non-lane call below
       - FnCall: ["vcmla{neon_type[0].no}", [a, b, c]]
 
 
@@ -6260,14 +6245,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float16x4_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float16x8_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float16x4_t, float16x4_t, '']  # type[2] ('' or 'q') is spliced into the vdup/vreinterpret names in compose
+      - [float16x8_t, float16x4_t, 'q']
     compose:
       - FnCall: [static_assert_uimm_bits!, [LANE, 1]]
-      - Let:
-          - c
-          - "{neon_type[0]}"
-          - FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]
+      - Let: [c, {FnCall: [vreinterpret_u32_f16, [c]]}]  # view c as u32 lanes so each adjacent f16 pair is one lane
+      - Let: [c, {FnCall: ['vdup{type[2]}_lane_u32', [c], [LANE]]}]  # broadcast pair LANE, i.e. original elements 2*LANE and 2*LANE+1
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f16_u32', [c]]}]  # reinterpret back to f16 for the non-lane call below
       - FnCall: ["vcmla{neon_type[0].no}", [a, b, c]]
 
   - name: "vcmla{neon_type[0].rot270_lane}"
@@ -6282,11 +6266,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float32x2_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float32x4_t, float32x2_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float32x2_t, float32x2_t, '']  # type[2] ('' or 'q') is spliced into the vdup/vreinterpret names in compose
+      - [float32x4_t, float32x2_t, 'q']
     compose:
       - FnCall: [static_assert!, ['LANE == 0']]
-      - Let: [c, "{neon_type[0]}", {FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]}]
+      - Let: [c, {FnCall: [vreinterpret_u64_f32, [c]]}]  # view the f32 pair held in c as a u64 lane
+      - Let: [c, {FnCall: ['vdup{type[2]}_lane_u64', [c], [LANE]]}]  # broadcast lane LANE (asserted to be 0 above)
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f32_u64', [c]]}]  # reinterpret back to f32 for the non-lane call below
       - FnCall: ["vcmla{neon_type[0].rot270}", [a, b, c]]
 
   - name: "vcmla{neon_type[0].rot270_lane}"
@@ -6303,11 +6289,13 @@ intrinsics:
     static_defs: ["const LANE: i32"]
     safety: safe
     types:
-      - [float16x4_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
-      - [float16x8_t, float16x4_t, '[2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1, 2 * LANE as u32, 2 * LANE as u32 + 1]']
+      - [float16x4_t, float16x4_t, '']  # type[2] ('' or 'q') is spliced into the vdup/vreinterpret names in compose
+      - [float16x8_t, float16x4_t, 'q']
     compose:
       - FnCall: [static_assert_uimm_bits!, [LANE, 1]]
-      - Let: [c, "{neon_type[0]}", {FnCall: [simd_shuffle!, [c, c, "{type[2]}"]]}]
+      - Let: [c, {FnCall: [vreinterpret_u32_f16, [c]]}]  # view c as u32 lanes so each adjacent f16 pair is one lane
+      - Let: [c, {FnCall: ['vdup{type[2]}_lane_u32', [c], [LANE]]}]  # broadcast pair LANE, i.e. original elements 2*LANE and 2*LANE+1
+      - Let: [c, {FnCall: ['vreinterpret{type[2]}_f16_u32', [c]]}]  # reinterpret back to f16 for the non-lane call below
       - FnCall: ["vcmla{neon_type[0].rot270}", [a, b, c]]
 
   - name: "vmax{neon_type.no}"