| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | 
 | ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s | 
 |  | 
 | ; vaddq: tail-predicated loop computing y[i] = x[i] + z on i32 elements, four | 
 | ; lanes per iteration. The add is a plain IR `add` against a splat of %z; the | 
 | ; autogenerated assertions expect the vector-by-scalar form | 
 | ; (vadd.i32 q0, q0, r3) inside a dlstp.32/letp loop. | 
 | define void @vaddq(ptr %x, ptr %y, i32 %n, i32 %z) { | 
 | ; CHECK-LABEL: vaddq: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB0_1: @ %for.body.preheader | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB0_2: @ %for.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    vadd.i32 q0, q0, r3 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r1], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB0_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Broadcast %z into all four lanes (lane-0 insert + all-zero shuffle mask). | 
 |   %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0 | 
 |   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer | 
 |   ; Skip the loop entirely when %n <= 0. | 
 |   %cmp11 = icmp sgt i32 %n, 0 | 
 |   br i1 %cmp11, label %for.body, label %for.cond.cleanup | 
 |  | 
 | for.cond.cleanup:                                 ; preds = %for.body, %entry | 
 |   ret void | 
 |  | 
 | for.body:                                         ; preds = %entry, %for.body | 
 |   %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] | 
 |   %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] | 
 |   %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] | 
 |   ; %0 is the vctp32 predicate built from the remaining count %i.012; it masks | 
 |   ; both the load and the store below. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) | 
 |   %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4 | 
 |   %2 = add <4 x i32> %1, %.splat | 
 |   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %y.addr.013, i32 4, <4 x i1> %0) | 
 |   %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4 | 
 |   ; Count down by 4 per iteration; loop again only if more than 4 remained. | 
 |   %sub = add nsw i32 %i.012, -4 | 
 |   %cmp = icmp sgt i32 %i.012, 4 | 
 |   br i1 %cmp, label %for.body, label %for.cond.cleanup | 
 | } | 
 |  | 
 | ; vadd: tail-predicated in-place loop over %s1 that adds a splat of %c0 using | 
 | ; the predicated MVE add intrinsic, with the same vctp32 predicate as the masked | 
 | ; load and store. The autogenerated assertions expect the vector-by-scalar form | 
 | ; (vadd.i32 q0, q0, r1) inside a dlstp.32/letp loop. | 
 | define void @vadd(ptr %s1, i32 %c0, i32 %N) { | 
 | ; CHECK-LABEL: vadd: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB1_1: @ %while.body.lr.ph | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB1_2: @ %while.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0] | 
 | ; CHECK-NEXT:    vadd.i32 q0, q0, r1 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB1_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %while.end | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Skip the loop entirely when %N <= 0. | 
 |   %cmp11 = icmp sgt i32 %N, 0 | 
 |   br i1 %cmp11, label %while.body.lr.ph, label %while.end | 
 |  | 
 | while.body.lr.ph:                                 ; preds = %entry | 
 |   ; Broadcast %c0 into all four lanes. The placeholder lanes are poison rather | 
 |   ; than undef (undef is deprecated in LLVM IR); only lane 0 is ever read. | 
 |   %.splatinsert = insertelement <4 x i32> poison, i32 %c0, i32 0 | 
 |   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer | 
 |   br label %while.body | 
 |  | 
 | while.body:                                       ; preds = %while.body.lr.ph, %while.body | 
 |   %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] | 
 |   %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] | 
 |   ; %0 is the vctp32 predicate built from the remaining count %N.addr.012. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) | 
 |   %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %s1.addr.013, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   ; Predicated add; the trailing %1 operand is presumably the value kept in | 
 |   ; inactive lanes (confirm against the intrinsic definition). | 
 |   %2 = tail call <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32> %1, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %1) | 
 |   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %s1.addr.013, i32 4, <4 x i1> %0) | 
 |   %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4 | 
 |   ; Count down by 4 per iteration; loop again only if more than 4 remained. | 
 |   %sub = add nsw i32 %N.addr.012, -4 | 
 |   %cmp = icmp sgt i32 %N.addr.012, 4 | 
 |   br i1 %cmp, label %while.body, label %while.end | 
 |  | 
 | while.end:                                        ; preds = %while.body, %entry | 
 |   ret void | 
 | } | 
 |  | 
 | ; vsubq: tail-predicated loop computing y[i] = x[i] - z on i32 elements, four | 
 | ; lanes per iteration. The subtract is a plain IR `sub` against a splat of %z; | 
 | ; the autogenerated assertions expect the vector-by-scalar form | 
 | ; (vsub.i32 q0, q0, r3) inside a dlstp.32/letp loop. | 
 | define void @vsubq(ptr %x, ptr %y, i32 %n, i32 %z) { | 
 | ; CHECK-LABEL: vsubq: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB2_1: @ %for.body.preheader | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB2_2: @ %for.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    vsub.i32 q0, q0, r3 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r1], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB2_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Broadcast %z into all four lanes (lane-0 insert + all-zero shuffle mask). | 
 |   %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0 | 
 |   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer | 
 |   ; Skip the loop entirely when %n <= 0. | 
 |   %cmp11 = icmp sgt i32 %n, 0 | 
 |   br i1 %cmp11, label %for.body, label %for.cond.cleanup | 
 |  | 
 | for.cond.cleanup:                                 ; preds = %for.body, %entry | 
 |   ret void | 
 |  | 
 | for.body:                                         ; preds = %entry, %for.body | 
 |   %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] | 
 |   %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] | 
 |   %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] | 
 |   ; %0 is the vctp32 predicate built from the remaining count %i.012; it masks | 
 |   ; both the load and the store below. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) | 
 |   %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4 | 
 |   %2 = sub <4 x i32> %1, %.splat | 
 |   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %y.addr.013, i32 4, <4 x i1> %0) | 
 |   %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4 | 
 |   ; Count down by 4 per iteration; loop again only if more than 4 remained. | 
 |   %sub = add nsw i32 %i.012, -4 | 
 |   %cmp = icmp sgt i32 %i.012, 4 | 
 |   br i1 %cmp, label %for.body, label %for.cond.cleanup | 
 | } | 
 |  | 
 | ; vsub: tail-predicated in-place loop over %s1 that subtracts a splat of %c0 | 
 | ; using the predicated MVE sub intrinsic, with the same vctp32 predicate as the | 
 | ; masked load and store. The autogenerated assertions expect the | 
 | ; vector-by-scalar form (vsub.i32 q0, q0, r1) inside a dlstp.32/letp loop. | 
 | define void @vsub(ptr %s1, i32 %c0, i32 %N) { | 
 | ; CHECK-LABEL: vsub: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB3_1: @ %while.body.lr.ph | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB3_2: @ %while.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0] | 
 | ; CHECK-NEXT:    vsub.i32 q0, q0, r1 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB3_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %while.end | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Skip the loop entirely when %N <= 0. | 
 |   %cmp11 = icmp sgt i32 %N, 0 | 
 |   br i1 %cmp11, label %while.body.lr.ph, label %while.end | 
 |  | 
 | while.body.lr.ph:                                 ; preds = %entry | 
 |   ; Broadcast %c0 into all four lanes. The placeholder lanes are poison rather | 
 |   ; than undef (undef is deprecated in LLVM IR); only lane 0 is ever read. | 
 |   %.splatinsert = insertelement <4 x i32> poison, i32 %c0, i32 0 | 
 |   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer | 
 |   br label %while.body | 
 |  | 
 | while.body:                                       ; preds = %while.body.lr.ph, %while.body | 
 |   %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] | 
 |   %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] | 
 |   ; %0 is the vctp32 predicate built from the remaining count %N.addr.012. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) | 
 |   %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %s1.addr.013, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   ; Predicated subtract; the trailing %1 operand is presumably the value kept | 
 |   ; in inactive lanes (confirm against the intrinsic definition). | 
 |   %2 = tail call <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32> %1, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %1) | 
 |   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %s1.addr.013, i32 4, <4 x i1> %0) | 
 |   %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4 | 
 |   ; Count down by 4 per iteration; loop again only if more than 4 remained. | 
 |   %sub = add nsw i32 %N.addr.012, -4 | 
 |   %cmp = icmp sgt i32 %N.addr.012, 4 | 
 |   br i1 %cmp, label %while.body, label %while.end | 
 |  | 
 | while.end:                                        ; preds = %while.body, %entry | 
 |   ret void | 
 | } | 
 |  | 
 | ; vmulq: tail-predicated loop computing y[i] = x[i] * z on i32 elements, four | 
 | ; lanes per iteration. The multiply is a plain IR `mul` against a splat of %z; | 
 | ; the autogenerated assertions expect the vector-by-scalar form | 
 | ; (vmul.i32 q0, q0, r3) inside a dlstp.32/letp loop. | 
 | define void @vmulq(ptr %x, ptr %y, i32 %n, i32 %z) { | 
 | ; CHECK-LABEL: vmulq: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB4_1: @ %for.body.preheader | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB4_2: @ %for.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    vmul.i32 q0, q0, r3 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r1], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB4_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Broadcast %z into all four lanes (lane-0 insert + all-zero shuffle mask). | 
 |   %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0 | 
 |   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer | 
 |   ; Skip the loop entirely when %n <= 0. | 
 |   %cmp11 = icmp sgt i32 %n, 0 | 
 |   br i1 %cmp11, label %for.body, label %for.cond.cleanup | 
 |  | 
 | for.cond.cleanup:                                 ; preds = %for.body, %entry | 
 |   ret void | 
 |  | 
 | for.body:                                         ; preds = %entry, %for.body | 
 |   %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] | 
 |   %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] | 
 |   %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] | 
 |   ; %0 is the vctp32 predicate built from the remaining count %i.012; it masks | 
 |   ; both the load and the store below. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) | 
 |   %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4 | 
 |   %2 = mul <4 x i32> %1, %.splat | 
 |   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %y.addr.013, i32 4, <4 x i1> %0) | 
 |   %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4 | 
 |   ; Count down by 4 per iteration; loop again only if more than 4 remained. | 
 |   %sub = add nsw i32 %i.012, -4 | 
 |   %cmp = icmp sgt i32 %i.012, 4 | 
 |   br i1 %cmp, label %for.body, label %for.cond.cleanup | 
 | } | 
 |  | 
 | ; vmul: tail-predicated in-place loop over %s1 that multiplies by a splat of | 
 | ; %c0 using the predicated MVE mul intrinsic, with the same vctp32 predicate as | 
 | ; the masked load and store. The autogenerated assertions expect the | 
 | ; vector-by-scalar form (vmul.i32 q0, q0, r1) inside a dlstp.32/letp loop. | 
 | define void @vmul(ptr %s1, i32 %c0, i32 %N) { | 
 | ; CHECK-LABEL: vmul: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB5_1: @ %while.body.lr.ph | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB5_2: @ %while.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0] | 
 | ; CHECK-NEXT:    vmul.i32 q0, q0, r1 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB5_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %while.end | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Skip the loop entirely when %N <= 0. | 
 |   %cmp11 = icmp sgt i32 %N, 0 | 
 |   br i1 %cmp11, label %while.body.lr.ph, label %while.end | 
 |  | 
 | while.body.lr.ph:                                 ; preds = %entry | 
 |   ; Broadcast %c0 into all four lanes. The placeholder lanes are poison rather | 
 |   ; than undef (undef is deprecated in LLVM IR); only lane 0 is ever read. | 
 |   %.splatinsert = insertelement <4 x i32> poison, i32 %c0, i32 0 | 
 |   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer | 
 |   br label %while.body | 
 |  | 
 | while.body:                                       ; preds = %while.body.lr.ph, %while.body | 
 |   %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] | 
 |   %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] | 
 |   ; %0 is the vctp32 predicate built from the remaining count %N.addr.012. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) | 
 |   %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %s1.addr.013, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   ; Predicated multiply; the trailing %1 operand is presumably the value kept | 
 |   ; in inactive lanes (confirm against the intrinsic definition). | 
 |   %2 = tail call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %1, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %1) | 
 |   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %s1.addr.013, i32 4, <4 x i1> %0) | 
 |   %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4 | 
 |   ; Count down by 4 per iteration; loop again only if more than 4 remained. | 
 |   %sub = add nsw i32 %N.addr.012, -4 | 
 |   %cmp = icmp sgt i32 %N.addr.012, 4 | 
 |   br i1 %cmp, label %while.body, label %while.end | 
 |  | 
 | while.end:                                        ; preds = %while.body, %entry | 
 |   ret void | 
 | } | 
 |  | 
 | ; vqaddq: tail-predicated loop storing the signed saturating sum | 
 | ; (llvm.sadd.sat) of x[i] and a splat of %z into y[i], four i32 lanes per | 
 | ; iteration. The autogenerated assertions expect the vector-by-scalar form | 
 | ; (vqadd.s32 q0, q0, r3) inside a dlstp.32/letp loop. | 
 | define void @vqaddq(ptr %x, ptr %y, i32 %n, i32 %z) { | 
 | ; CHECK-LABEL: vqaddq: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB6_1: @ %for.body.preheader | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB6_2: @ %for.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    vqadd.s32 q0, q0, r3 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r1], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB6_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Broadcast %z into all four lanes (lane-0 insert + all-zero shuffle mask). | 
 |   %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0 | 
 |   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer | 
 |   ; Skip the loop entirely when %n <= 0. | 
 |   %cmp11 = icmp sgt i32 %n, 0 | 
 |   br i1 %cmp11, label %for.body, label %for.cond.cleanup | 
 |  | 
 | for.cond.cleanup:                                 ; preds = %for.body, %entry | 
 |   ret void | 
 |  | 
 | for.body:                                         ; preds = %entry, %for.body | 
 |   %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] | 
 |   %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] | 
 |   %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] | 
 |   ; %0 is the vctp32 predicate built from the remaining count %i.012; it masks | 
 |   ; both the load and the store below. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) | 
 |   %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4 | 
 |   %2 = tail call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %1, <4 x i32> %.splat) | 
 |   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %y.addr.013, i32 4, <4 x i1> %0) | 
 |   %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4 | 
 |   ; Count down by 4 per iteration; loop again only if more than 4 remained. | 
 |   %sub = add nsw i32 %i.012, -4 | 
 |   %cmp = icmp sgt i32 %i.012, 4 | 
 |   br i1 %cmp, label %for.body, label %for.cond.cleanup | 
 | } | 
 |  | 
 | ; vqaddqu: unsigned counterpart of the saturating-add loop. Stores | 
 | ; llvm.uadd.sat of x[i] and a splat of %z into y[i], four i32 lanes per | 
 | ; iteration. The autogenerated assertions expect the vector-by-scalar form | 
 | ; (vqadd.u32 q0, q0, r3) inside a dlstp.32/letp loop. | 
 | define void @vqaddqu(ptr %x, ptr %y, i32 %n, i32 %z) { | 
 | ; CHECK-LABEL: vqaddqu: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB7_1: @ %for.body.preheader | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB7_2: @ %for.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    vqadd.u32 q0, q0, r3 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r1], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB7_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Broadcast %z into all four lanes (lane-0 insert + all-zero shuffle mask). | 
 |   %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0 | 
 |   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer | 
 |   ; Skip the loop entirely when %n <= 0. | 
 |   %cmp11 = icmp sgt i32 %n, 0 | 
 |   br i1 %cmp11, label %for.body, label %for.cond.cleanup | 
 |  | 
 | for.cond.cleanup:                                 ; preds = %for.body, %entry | 
 |   ret void | 
 |  | 
 | for.body:                                         ; preds = %entry, %for.body | 
 |   %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] | 
 |   %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] | 
 |   %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] | 
 |   ; %0 is the vctp32 predicate built from the remaining count %i.012; it masks | 
 |   ; both the load and the store below. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) | 
 |   %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4 | 
 |   %2 = tail call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %1, <4 x i32> %.splat) | 
 |   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %y.addr.013, i32 4, <4 x i1> %0) | 
 |   %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4 | 
 |   ; Count down by 4 per iteration; loop again only if more than 4 remained. | 
 |   %sub = add nsw i32 %i.012, -4 | 
 |   %cmp = icmp sgt i32 %i.012, 4 | 
 |   br i1 %cmp, label %for.body, label %for.cond.cleanup | 
 | } | 
 |  | 
 | ; vqadd: tail-predicated in-place loop over %s1 that saturating-adds a splat of | 
 | ; %c0 using the predicated MVE qadd intrinsic, with the same vctp32 predicate as | 
 | ; the masked load and store. The autogenerated assertions expect the | 
 | ; vector-by-scalar form (vqadd.s32 q0, q0, r1) inside a dlstp.32/letp loop. | 
 | define void @vqadd(ptr %s1, i32 %c0, i32 %N) { | 
 | ; CHECK-LABEL: vqadd: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB8_1: @ %while.body.lr.ph | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB8_2: @ %while.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0] | 
 | ; CHECK-NEXT:    vqadd.s32 q0, q0, r1 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB8_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %while.end | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Skip the loop entirely when %N <= 0. | 
 |   %cmp11 = icmp sgt i32 %N, 0 | 
 |   br i1 %cmp11, label %while.body.lr.ph, label %while.end | 
 |  | 
 | while.body.lr.ph:                                 ; preds = %entry | 
 |   ; Broadcast %c0 into all four lanes. The placeholder lanes are poison rather | 
 |   ; than undef (undef is deprecated in LLVM IR); only lane 0 is ever read. | 
 |   %.splatinsert = insertelement <4 x i32> poison, i32 %c0, i32 0 | 
 |   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer | 
 |   br label %while.body | 
 |  | 
 | while.body:                                       ; preds = %while.body.lr.ph, %while.body | 
 |   %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] | 
 |   %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] | 
 |   ; %0 is the vctp32 predicate built from the remaining count %N.addr.012. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) | 
 |   %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %s1.addr.013, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   ; Predicated saturating add. The i32 0 operand pairs with the signed (.s32) | 
 |   ; form in the expected output; the trailing %1 is presumably the value kept | 
 |   ; in inactive lanes (confirm against the intrinsic definition). | 
 |   %2 = tail call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> %1, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %1) | 
 |   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %s1.addr.013, i32 4, <4 x i1> %0) | 
 |   %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4 | 
 |   ; Count down by 4 per iteration; loop again only if more than 4 remained. | 
 |   %sub = add nsw i32 %N.addr.012, -4 | 
 |   %cmp = icmp sgt i32 %N.addr.012, 4 | 
 |   br i1 %cmp, label %while.body, label %while.end | 
 |  | 
 | while.end:                                        ; preds = %while.body, %entry | 
 |   ret void | 
 | } | 
 |  | 
 | ; vqsubq: tail-predicated loop storing the signed saturating difference | 
 | ; (llvm.ssub.sat) of x[i] and a splat of %z into y[i], four i32 lanes per | 
 | ; iteration. The autogenerated assertions expect the vector-by-scalar form | 
 | ; (vqsub.s32 q0, q0, r3) inside a dlstp.32/letp loop. | 
 | define void @vqsubq(ptr %x, ptr %y, i32 %n, i32 %z) { | 
 | ; CHECK-LABEL: vqsubq: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB9_1: @ %for.body.preheader | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB9_2: @ %for.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    vqsub.s32 q0, q0, r3 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r1], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB9_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Broadcast %z into all four lanes (lane-0 insert + all-zero shuffle mask). | 
 |   %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0 | 
 |   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer | 
 |   ; Skip the loop entirely when %n <= 0. | 
 |   %cmp11 = icmp sgt i32 %n, 0 | 
 |   br i1 %cmp11, label %for.body, label %for.cond.cleanup | 
 |  | 
 | for.cond.cleanup:                                 ; preds = %for.body, %entry | 
 |   ret void | 
 |  | 
 | for.body:                                         ; preds = %entry, %for.body | 
 |   %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] | 
 |   %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] | 
 |   %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] | 
 |   ; %0 is the vctp32 predicate built from the remaining count %i.012; it masks | 
 |   ; both the load and the store below. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) | 
 |   %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4 | 
 |   %2 = tail call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %1, <4 x i32> %.splat) | 
 |   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %y.addr.013, i32 4, <4 x i1> %0) | 
 |   %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4 | 
 |   ; Count down by 4 per iteration; loop again only if more than 4 remained. | 
 |   %sub = add nsw i32 %i.012, -4 | 
 |   %cmp = icmp sgt i32 %i.012, 4 | 
 |   br i1 %cmp, label %for.body, label %for.cond.cleanup | 
 | } | 
 |  | 
 | ; vqsubqu: unsigned counterpart of the saturating-subtract loop. Stores | 
 | ; llvm.usub.sat of x[i] and a splat of %z into y[i], four i32 lanes per | 
 | ; iteration. The autogenerated assertions expect the vector-by-scalar form | 
 | ; (vqsub.u32 q0, q0, r3) inside a dlstp.32/letp loop. | 
 | define void @vqsubqu(ptr %x, ptr %y, i32 %n, i32 %z) { | 
 | ; CHECK-LABEL: vqsubqu: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB10_1: @ %for.body.preheader | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB10_2: @ %for.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    vqsub.u32 q0, q0, r3 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r1], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB10_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Broadcast %z into all four lanes (lane-0 insert + all-zero shuffle mask). | 
 |   %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0 | 
 |   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer | 
 |   ; Skip the loop entirely when %n <= 0. | 
 |   %cmp11 = icmp sgt i32 %n, 0 | 
 |   br i1 %cmp11, label %for.body, label %for.cond.cleanup | 
 |  | 
 | for.cond.cleanup:                                 ; preds = %for.body, %entry | 
 |   ret void | 
 |  | 
 | for.body:                                         ; preds = %entry, %for.body | 
 |   %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] | 
 |   %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] | 
 |   %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] | 
 |   ; %0 is the vctp32 predicate built from the remaining count %i.012; it masks | 
 |   ; both the load and the store below. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) | 
 |   %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4 | 
 |   %2 = tail call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %1, <4 x i32> %.splat) | 
 |   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %y.addr.013, i32 4, <4 x i1> %0) | 
 |   %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4 | 
 |   ; Count down by 4 per iteration; loop again only if more than 4 remained. | 
 |   %sub = add nsw i32 %i.012, -4 | 
 |   %cmp = icmp sgt i32 %i.012, 4 | 
 |   br i1 %cmp, label %for.body, label %for.cond.cleanup | 
 | } | 
 |  | 
 | ; vqsub: tail-predicated in-place loop over %s1 that saturating-subtracts a | 
 | ; splat of %c0 using the predicated MVE qsub intrinsic, with the same vctp32 | 
 | ; predicate as the masked load and store. The autogenerated assertions expect | 
 | ; the vector-by-scalar form (vqsub.s32 q0, q0, r1) inside a dlstp.32/letp loop. | 
 | define void @vqsub(ptr %s1, i32 %c0, i32 %N) { | 
 | ; CHECK-LABEL: vqsub: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB11_1: @ %while.body.lr.ph | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB11_2: @ %while.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0] | 
 | ; CHECK-NEXT:    vqsub.s32 q0, q0, r1 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB11_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %while.end | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Skip the loop entirely when %N <= 0. | 
 |   %cmp11 = icmp sgt i32 %N, 0 | 
 |   br i1 %cmp11, label %while.body.lr.ph, label %while.end | 
 |  | 
 | while.body.lr.ph:                                 ; preds = %entry | 
 |   ; Broadcast %c0 into all four lanes. The placeholder lanes are poison rather | 
 |   ; than undef (undef is deprecated in LLVM IR); only lane 0 is ever read. | 
 |   %.splatinsert = insertelement <4 x i32> poison, i32 %c0, i32 0 | 
 |   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer | 
 |   br label %while.body | 
 |  | 
 | while.body:                                       ; preds = %while.body.lr.ph, %while.body | 
 |   %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] | 
 |   %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] | 
 |   ; %0 is the vctp32 predicate built from the remaining count %N.addr.012. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) | 
 |   %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %s1.addr.013, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   ; Predicated saturating subtract. The i32 0 operand pairs with the signed | 
 |   ; (.s32) form in the expected output; the trailing %1 is presumably the value | 
 |   ; kept in inactive lanes (confirm against the intrinsic definition). | 
 |   %2 = tail call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> %1, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %1) | 
 |   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %s1.addr.013, i32 4, <4 x i1> %0) | 
 |   %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4 | 
 |   ; Count down by 4 per iteration; loop again only if more than 4 remained. | 
 |   %sub = add nsw i32 %N.addr.012, -4 | 
 |   %cmp = icmp sgt i32 %N.addr.012, 4 | 
 |   br i1 %cmp, label %while.body, label %while.end | 
 |  | 
 | while.end:                                        ; preds = %while.body, %entry | 
 |   ret void | 
 | } | 
 |  | 
 | ; vhaddq: tail-predicated loop that applies the unpredicated MVE vhadd | 
 | ; intrinsic to x[i] and a splat of %z and stores the result to y[i], four i32 | 
 | ; lanes per iteration. The autogenerated assertions expect the | 
 | ; vector-by-scalar form (vhadd.s32 q0, q0, r3) inside a dlstp.32/letp loop. | 
 | define void @vhaddq(ptr %x, ptr %y, i32 %n, i32 %z) { | 
 | ; CHECK-LABEL: vhaddq: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB12_1: @ %for.body.preheader | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB12_2: @ %for.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    vhadd.s32 q0, q0, r3 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r1], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB12_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Broadcast %z into all four lanes (lane-0 insert + all-zero shuffle mask). | 
 |   %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0 | 
 |   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer | 
 |   ; Skip the loop entirely when %n <= 0. | 
 |   %cmp11 = icmp sgt i32 %n, 0 | 
 |   br i1 %cmp11, label %for.body, label %for.cond.cleanup | 
 |  | 
 | for.cond.cleanup:                                 ; preds = %for.body, %entry | 
 |   ret void | 
 |  | 
 | for.body:                                         ; preds = %entry, %for.body | 
 |   %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] | 
 |   %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] | 
 |   %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] | 
 |   ; %0 is the vctp32 predicate built from the remaining count %i.012; it masks | 
 |   ; both the load and the store below. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) | 
 |   %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4 | 
 |   ; The i32 0 operand pairs with the signed (.s32) form in the expected output. | 
 |   %2 = tail call <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32> %1, <4 x i32> %.splat, i32 0) | 
 |   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %y.addr.013, i32 4, <4 x i1> %0) | 
 |   %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4 | 
 |   ; Count down by 4 per iteration; loop again only if more than 4 remained. | 
 |   %sub = add nsw i32 %i.012, -4 | 
 |   %cmp = icmp sgt i32 %i.012, 4 | 
 |   br i1 %cmp, label %for.body, label %for.cond.cleanup | 
 | } | 
 |  | 
 | ; vhadd: tail-predicated in-place loop over %s1 that applies the predicated MVE | 
 | ; hadd intrinsic with a splat of %c0, using the same vctp32 predicate as the | 
 | ; masked load and store. The autogenerated assertions expect the | 
 | ; vector-by-scalar form (vhadd.s32 q0, q0, r1) inside a dlstp.32/letp loop. | 
 | define void @vhadd(ptr %s1, i32 %c0, i32 %N) { | 
 | ; CHECK-LABEL: vhadd: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB13_1: @ %while.body.lr.ph | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB13_2: @ %while.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0] | 
 | ; CHECK-NEXT:    vhadd.s32 q0, q0, r1 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB13_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %while.end | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Skip the loop entirely when %N <= 0. | 
 |   %cmp11 = icmp sgt i32 %N, 0 | 
 |   br i1 %cmp11, label %while.body.lr.ph, label %while.end | 
 |  | 
 | while.body.lr.ph:                                 ; preds = %entry | 
 |   ; Broadcast %c0 into all four lanes. The placeholder lanes are poison rather | 
 |   ; than undef (undef is deprecated in LLVM IR); only lane 0 is ever read. | 
 |   %.splatinsert = insertelement <4 x i32> poison, i32 %c0, i32 0 | 
 |   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer | 
 |   br label %while.body | 
 |  | 
 | while.body:                                       ; preds = %while.body.lr.ph, %while.body | 
 |   %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] | 
 |   %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] | 
 |   ; %0 is the vctp32 predicate built from the remaining count %N.addr.012. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) | 
 |   %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %s1.addr.013, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   ; Predicated hadd. The i32 0 operand pairs with the signed (.s32) form in | 
 |   ; the expected output; the trailing %1 is presumably the value kept in | 
 |   ; inactive lanes (confirm against the intrinsic definition). | 
 |   %2 = tail call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %1, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %1) | 
 |   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %s1.addr.013, i32 4, <4 x i1> %0) | 
 |   %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4 | 
 |   ; Count down by 4 per iteration; loop again only if more than 4 remained. | 
 |   %sub = add nsw i32 %N.addr.012, -4 | 
 |   %cmp = icmp sgt i32 %N.addr.012, 4 | 
 |   br i1 %cmp, label %while.body, label %while.end | 
 |  | 
 | while.end:                                        ; preds = %while.body, %entry | 
 |   ret void | 
 | } | 
 |  | 
 | ; vhsubq: tail-predicated loop that applies the unpredicated MVE vhsub | 
 | ; intrinsic to x[i] and a splat of %z and stores the result to y[i], four i32 | 
 | ; lanes per iteration. The autogenerated assertions expect the | 
 | ; vector-by-scalar form (vhsub.s32 q0, q0, r3) inside a dlstp.32/letp loop. | 
 | define void @vhsubq(ptr %x, ptr %y, i32 %n, i32 %z) { | 
 | ; CHECK-LABEL: vhsubq: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB14_1: @ %for.body.preheader | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB14_2: @ %for.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    vhsub.s32 q0, q0, r3 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r1], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB14_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Broadcast %z into all four lanes (lane-0 insert + all-zero shuffle mask). | 
 |   %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0 | 
 |   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer | 
 |   ; Skip the loop entirely when %n <= 0. | 
 |   %cmp11 = icmp sgt i32 %n, 0 | 
 |   br i1 %cmp11, label %for.body, label %for.cond.cleanup | 
 |  | 
 | for.cond.cleanup:                                 ; preds = %for.body, %entry | 
 |   ret void | 
 |  | 
 | for.body:                                         ; preds = %entry, %for.body | 
 |   %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] | 
 |   %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] | 
 |   %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] | 
 |   ; %0 is the vctp32 predicate built from the remaining count %i.012; it masks | 
 |   ; both the load and the store below. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) | 
 |   %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4 | 
 |   ; The i32 0 operand pairs with the signed (.s32) form in the expected output. | 
 |   %2 = tail call <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32> %1, <4 x i32> %.splat, i32 0) | 
 |   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %y.addr.013, i32 4, <4 x i1> %0) | 
 |   %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4 | 
 |   ; Count down by 4 per iteration; loop again only if more than 4 remained. | 
 |   %sub = add nsw i32 %i.012, -4 | 
 |   %cmp = icmp sgt i32 %i.012, 4 | 
 |   br i1 %cmp, label %for.body, label %for.cond.cleanup | 
 | } | 
 |  | 
 | ; vhsub: in-place form of the vhsub test. Each iteration loads four i32 lanes | 
 | ; from %s1, calls the predicated hsub intrinsic with a splat of %c0 (the | 
 | ; loaded vector %1 is also passed as the final operand), and stores the result | 
 | ; back to %s1 under the same vctp32 predicate. The expected assembly is a | 
 | ; dlstp/letp loop using vhsub.s32 with the scalar taken from r1. | 
 | define void @vhsub(ptr %s1, i32 %c0, i32 %N) { | 
 | ; CHECK-LABEL: vhsub: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB15_1: @ %while.body.lr.ph | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB15_2: @ %while.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0] | 
 | ; CHECK-NEXT:    vhsub.s32 q0, q0, r1 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB15_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %while.end | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   %cmp11 = icmp sgt i32 %N, 0 | 
 |   br i1 %cmp11, label %while.body.lr.ph, label %while.end | 
 |  | 
 | while.body.lr.ph:                                 ; preds = %entry | 
 |   ; Broadcast %c0 to all four lanes, using poison placeholders like the | 
 |   ; sibling tests in this file. | 
 |   %.splatinsert = insertelement <4 x i32> poison, i32 %c0, i32 0 | 
 |   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer | 
 |   br label %while.body | 
 |  | 
 | while.body:                                       ; preds = %while.body.lr.ph, %while.body | 
 |   %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] | 
 |   %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] | 
 |   ; %0 is the lane predicate derived from the number of elements still to do. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) | 
 |   %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %s1.addr.013, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   %2 = tail call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> %1, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %1) | 
 |   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %s1.addr.013, i32 4, <4 x i1> %0) | 
 |   %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4 | 
 |   ; Four elements were consumed; loop again only if more than four remained. | 
 |   %sub = add nsw i32 %N.addr.012, -4 | 
 |   %cmp = icmp sgt i32 %N.addr.012, 4 | 
 |   br i1 %cmp, label %while.body, label %while.end | 
 |  | 
 | while.end:                                        ; preds = %while.body, %entry | 
 |   ret void | 
 | } | 
 |  | 
 | ; vqdmullbq: each iteration loads four i32 lanes from %x, applies the MVE | 
 | ; vqdmull intrinsic (producing <2 x i64>) against a splat of %z, bitcasts the | 
 | ; result back to <4 x i32>, and stores it to %y under the vctp32 predicate. | 
 | ; The expected assembly is a dlstp/letp loop using vqdmullb.s32 with the | 
 | ; scalar taken from r3 and a destination register distinct from the source. | 
 | define void @vqdmullbq(ptr %x, ptr %y, i32 %n, i32 %z) { | 
 | ; CHECK-LABEL: vqdmullbq: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB16_1: @ %for.body.preheader | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB16_2: @ %for.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    vqdmullb.s32 q1, q0, r3 | 
 | ; CHECK-NEXT:    vstrw.32 q1, [r1], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB16_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Broadcast %z to all four lanes; the loop is skipped entirely when %n <= 0. | 
 |   %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0 | 
 |   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer | 
 |   %cmp11 = icmp sgt i32 %n, 0 | 
 |   br i1 %cmp11, label %for.body, label %for.cond.cleanup | 
 |  | 
 | for.cond.cleanup:                                 ; preds = %for.body, %entry | 
 |   ret void | 
 |  | 
 | for.body:                                         ; preds = %entry, %for.body | 
 |   %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] | 
 |   %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] | 
 |   %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] | 
 |   ; %0 is the lane predicate derived from the number of elements still to do. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) | 
 |   %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4 | 
 |   %2 = tail call <2 x i64> @llvm.arm.mve.vqdmull.v2i64.v4i32(<4 x i32> %1, <4 x i32> %.splat, i32 0) | 
 |   ; Reinterpret the two 64-bit products as four i32 lanes for the masked store. | 
 |   %3 = bitcast <2 x i64> %2 to <4 x i32> | 
 |   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %y.addr.013, i32 4, <4 x i1> %0) | 
 |   %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4 | 
 |   ; Four elements were consumed; loop again only if more than four remained. | 
 |   %sub = add nsw i32 %i.012, -4 | 
 |   %cmp = icmp sgt i32 %i.012, 4 | 
 |   br i1 %cmp, label %for.body, label %for.cond.cleanup | 
 | } | 
 |  | 
 |  | 
 | ; vqdmull: in-place predicated vqdmull test. Each iteration loads four i16 | 
 | ; values from %s1, sign-extends them to i32, reinterprets the vector as | 
 | ; <8 x i16>, and calls the predicated vqdmull intrinsic against a splat of | 
 | ; %c0 truncated to i16 (the extended vector %2 is also passed as the final | 
 | ; operand). The <4 x i32> result is stored back to %s1 under the same vctp32 | 
 | ; predicate. The expected assembly is a dlstp/letp loop using vldrh.s32 and | 
 | ; vqdmullb.s16 with the scalar taken from r1. | 
 | define void @vqdmull(ptr %s1, i32 %c0, i32 %N) { | 
 | ; CHECK-LABEL: vqdmull: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB17_1: @ %while.body.lr.ph | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB17_2: @ %while.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrh.s32 q0, [r0] | 
 | ; CHECK-NEXT:    vqdmullb.s16 q0, q0, r1 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB17_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %while.end | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   %cmp11 = icmp sgt i32 %N, 0 | 
 |   br i1 %cmp11, label %while.body.lr.ph, label %while.end | 
 |  | 
 | while.body.lr.ph:                                 ; preds = %entry | 
 |   ; Narrow %c0 to i16 and broadcast it to all eight lanes, using poison | 
 |   ; placeholders like the sibling tests in this file. | 
 |   %conv = trunc i32 %c0 to i16 | 
 |   %.splatinsert = insertelement <8 x i16> poison, i16 %conv, i32 0 | 
 |   %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> poison, <8 x i32> zeroinitializer | 
 |   br label %while.body | 
 |  | 
 | while.body:                                       ; preds = %while.body.lr.ph, %while.body | 
 |   %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] | 
 |   %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] | 
 |   ; %0 is the lane predicate derived from the number of elements still to do. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) | 
 |   %1 = tail call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %s1.addr.013, i32 2, <4 x i1> %0, <4 x i16> zeroinitializer) | 
 |   %2 = sext <4 x i16> %1 to <4 x i32> | 
 |   %3 = bitcast <4 x i32> %2 to <8 x i16> | 
 |   %4 = tail call <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16> %3, <8 x i16> %.splat, i32 0, <4 x i1> %0, <4 x i32> %2) | 
 |   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %4, ptr %s1.addr.013, i32 4, <4 x i1> %0) | 
 |   %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4 | 
 |   ; Four elements were consumed; loop again only if more than four remained. | 
 |   %sub = add nsw i32 %N.addr.012, -4 | 
 |   %cmp = icmp sgt i32 %N.addr.012, 4 | 
 |   br i1 %cmp, label %while.body, label %while.end | 
 |  | 
 | while.end:                                        ; preds = %while.body, %entry | 
 |   ret void | 
 | } | 
 |  | 
 | ; vqdmulhq: each iteration loads four i32 lanes from %x, applies the MVE | 
 | ; vqdmulh intrinsic against a splat of %z, and stores the result to %y under | 
 | ; the vctp32 predicate. The expected assembly is a dlstp/letp loop using | 
 | ; vqdmulh.s32 with the scalar taken from r3. | 
 | define void @vqdmulhq(ptr %x, ptr %y, i32 %n, i32 %z) { | 
 | ; CHECK-LABEL: vqdmulhq: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB18_1: @ %for.body.preheader | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB18_2: @ %for.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    vqdmulh.s32 q0, q0, r3 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r1], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB18_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Broadcast %z to all four lanes; the loop is skipped entirely when %n <= 0. | 
 |   %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0 | 
 |   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer | 
 |   %cmp11 = icmp sgt i32 %n, 0 | 
 |   br i1 %cmp11, label %for.body, label %for.cond.cleanup | 
 |  | 
 | for.cond.cleanup:                                 ; preds = %for.body, %entry | 
 |   ret void | 
 |  | 
 | for.body:                                         ; preds = %entry, %for.body | 
 |   %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] | 
 |   %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] | 
 |   %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] | 
 |   ; %0 is the lane predicate derived from the number of elements still to do. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) | 
 |   %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4 | 
 |   %2 = tail call <4 x i32> @llvm.arm.mve.vqdmulh.v4i32(<4 x i32> %1, <4 x i32> %.splat) | 
 |   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %y.addr.013, i32 4, <4 x i1> %0) | 
 |   %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4 | 
 |   ; Four elements were consumed; loop again only if more than four remained. | 
 |   %sub = add nsw i32 %i.012, -4 | 
 |   %cmp = icmp sgt i32 %i.012, 4 | 
 |   br i1 %cmp, label %for.body, label %for.cond.cleanup | 
 | } | 
 |  | 
 | ; vqdmulh: in-place predicated form of the vqdmulh test. Each iteration loads | 
 | ; four i32 lanes from %s1, calls the predicated qdmulh intrinsic with a splat | 
 | ; of %c0 (the loaded vector %1 is also passed as the final operand), and | 
 | ; stores the result back to %s1 under the same vctp32 predicate. The expected | 
 | ; assembly is a dlstp/letp loop using vqdmulh.s32 with the scalar from r1. | 
 | define void @vqdmulh(ptr %s1, i32 %c0, i32 %N) { | 
 | ; CHECK-LABEL: vqdmulh: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB19_1: @ %while.body.lr.ph | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB19_2: @ %while.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0] | 
 | ; CHECK-NEXT:    vqdmulh.s32 q0, q0, r1 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB19_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %while.end | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   %cmp11 = icmp sgt i32 %N, 0 | 
 |   br i1 %cmp11, label %while.body.lr.ph, label %while.end | 
 |  | 
 | while.body.lr.ph:                                 ; preds = %entry | 
 |   ; Broadcast %c0 to all four lanes, using poison placeholders like the | 
 |   ; sibling tests in this file. | 
 |   %.splatinsert = insertelement <4 x i32> poison, i32 %c0, i32 0 | 
 |   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer | 
 |   br label %while.body | 
 |  | 
 | while.body:                                       ; preds = %while.body.lr.ph, %while.body | 
 |   %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] | 
 |   %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] | 
 |   ; %0 is the lane predicate derived from the number of elements still to do. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) | 
 |   %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %s1.addr.013, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   %2 = tail call <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32> %1, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %1) | 
 |   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %s1.addr.013, i32 4, <4 x i1> %0) | 
 |   %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4 | 
 |   ; Four elements were consumed; loop again only if more than four remained. | 
 |   %sub = add nsw i32 %N.addr.012, -4 | 
 |   %cmp = icmp sgt i32 %N.addr.012, 4 | 
 |   br i1 %cmp, label %while.body, label %while.end | 
 |  | 
 | while.end:                                        ; preds = %while.body, %entry | 
 |   ret void | 
 | } | 
 |  | 
 | ; vqrdmulhq: each iteration loads four i32 lanes from %x, applies the MVE | 
 | ; vqrdmulh intrinsic against a splat of %z, and stores the result to %y under | 
 | ; the vctp32 predicate. The expected assembly is a dlstp/letp loop using | 
 | ; vqrdmulh.s32 with the scalar taken from r3. | 
 | define void @vqrdmulhq(ptr %x, ptr %y, i32 %n, i32 %z) { | 
 | ; CHECK-LABEL: vqrdmulhq: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB20_1: @ %for.body.preheader | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB20_2: @ %for.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    vqrdmulh.s32 q0, q0, r3 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r1], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB20_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Broadcast %z to all four lanes; the loop is skipped entirely when %n <= 0. | 
 |   %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0 | 
 |   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer | 
 |   %cmp11 = icmp sgt i32 %n, 0 | 
 |   br i1 %cmp11, label %for.body, label %for.cond.cleanup | 
 |  | 
 | for.cond.cleanup:                                 ; preds = %for.body, %entry | 
 |   ret void | 
 |  | 
 | for.body:                                         ; preds = %entry, %for.body | 
 |   %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] | 
 |   %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] | 
 |   %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] | 
 |   ; %0 is the lane predicate derived from the number of elements still to do. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) | 
 |   %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4 | 
 |   %2 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %1, <4 x i32> %.splat) | 
 |   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %y.addr.013, i32 4, <4 x i1> %0) | 
 |   %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4 | 
 |   ; Four elements were consumed; loop again only if more than four remained. | 
 |   %sub = add nsw i32 %i.012, -4 | 
 |   %cmp = icmp sgt i32 %i.012, 4 | 
 |   br i1 %cmp, label %for.body, label %for.cond.cleanup | 
 | } | 
 |  | 
 | ; vqrdmulh: in-place predicated form of the vqrdmulh test. Each iteration | 
 | ; loads four i32 lanes from %s1, calls the predicated qrdmulh intrinsic with | 
 | ; a splat of %c0 (the loaded vector %1 is also passed as the final operand), | 
 | ; and stores the result back to %s1 under the same vctp32 predicate. The | 
 | ; expected assembly is a dlstp/letp loop using vqrdmulh.s32 with the scalar | 
 | ; taken from r1. | 
 | define void @vqrdmulh(ptr %s1, i32 %c0, i32 %N) { | 
 | ; CHECK-LABEL: vqrdmulh: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB21_1: @ %while.body.lr.ph | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB21_2: @ %while.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0] | 
 | ; CHECK-NEXT:    vqrdmulh.s32 q0, q0, r1 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB21_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %while.end | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   %cmp11 = icmp sgt i32 %N, 0 | 
 |   br i1 %cmp11, label %while.body.lr.ph, label %while.end | 
 |  | 
 | while.body.lr.ph:                                 ; preds = %entry | 
 |   ; Broadcast %c0 to all four lanes, using poison placeholders like the | 
 |   ; sibling tests in this file. | 
 |   %.splatinsert = insertelement <4 x i32> poison, i32 %c0, i32 0 | 
 |   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer | 
 |   br label %while.body | 
 |  | 
 | while.body:                                       ; preds = %while.body.lr.ph, %while.body | 
 |   %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] | 
 |   %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] | 
 |   ; %0 is the lane predicate derived from the number of elements still to do. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) | 
 |   %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %s1.addr.013, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   %2 = tail call <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32> %1, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %1) | 
 |   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %s1.addr.013, i32 4, <4 x i1> %0) | 
 |   %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4 | 
 |   ; Four elements were consumed; loop again only if more than four remained. | 
 |   %sub = add nsw i32 %N.addr.012, -4 | 
 |   %cmp = icmp sgt i32 %N.addr.012, 4 | 
 |   br i1 %cmp, label %while.body, label %while.end | 
 |  | 
 | while.end:                                        ; preds = %while.body, %entry | 
 |   ret void | 
 | } | 
 |  | 
 | ; vmlaq: each iteration loads four i32 lanes from both %x and %y, computes | 
 | ; y * splat(%z) + x with plain mul/add IR, and stores the result back to %y | 
 | ; under the vctp32 predicate. The expected assembly is a dlstp/letp loop in | 
 | ; which the mul+add pair becomes a single vmla.i32 with the scalar from r3. | 
 | define void @vmlaq(ptr %x, ptr %y, i32 %n, i32 %z) { | 
 | ; CHECK-LABEL: vmlaq: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB22_1: @ %for.body.preheader | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB22_2: @ %for.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r1] | 
 | ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16 | 
 | ; CHECK-NEXT:    vmla.i32 q1, q0, r3 | 
 | ; CHECK-NEXT:    vstrw.32 q1, [r1], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB22_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Broadcast %z to all four lanes; the loop is skipped entirely when %n <= 0. | 
 |   %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0 | 
 |   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer | 
 |   %cmp14 = icmp sgt i32 %n, 0 | 
 |   br i1 %cmp14, label %for.body, label %for.cond.cleanup | 
 |  | 
 | for.cond.cleanup:                                 ; preds = %for.body, %entry | 
 |   ret void | 
 |  | 
 | for.body:                                         ; preds = %entry, %for.body | 
 |   %x.addr.017 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] | 
 |   %y.addr.016 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] | 
 |   %i.015 = phi i32 [ %sub, %for.body ], [ %n, %entry ] | 
 |   ; %0 is the lane predicate derived from the number of elements still to do. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.015) | 
 |   %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.017, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   %add.ptr = getelementptr inbounds i32, ptr %x.addr.017, i32 4 | 
 |   %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %y.addr.016, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   ; Multiply-accumulate: (y * z) + x. | 
 |   %3 = mul <4 x i32> %2, %.splat | 
 |   %4 = add <4 x i32> %3, %1 | 
 |   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %4, ptr %y.addr.016, i32 4, <4 x i1> %0) | 
 |   %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.016, i32 4 | 
 |   ; Four elements were consumed; loop again only if more than four remained. | 
 |   %sub = add nsw i32 %i.015, -4 | 
 |   %cmp = icmp sgt i32 %i.015, 4 | 
 |   br i1 %cmp, label %for.body, label %for.cond.cleanup | 
 | } | 
 |  | 
 | ; vmlaqp: intrinsic-based counterpart of vmlaq. Each iteration loads four i32 | 
 | ; lanes from both %x and %y and calls the predicated vmla.n intrinsic with | 
 | ; the scalar %z passed directly (no splat), storing the result back to %y | 
 | ; under the vctp32 predicate. The expected assembly is a dlstp/letp loop | 
 | ; using vmla.i32 with the scalar taken from r3. | 
 | define void @vmlaqp(ptr %x, ptr %y, i32 %n, i32 %z) { | 
 | ; CHECK-LABEL: vmlaqp: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB23_1: @ %for.body.preheader | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB23_2: @ %for.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r1] | 
 | ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16 | 
 | ; CHECK-NEXT:    vmla.i32 q1, q0, r3 | 
 | ; CHECK-NEXT:    vstrw.32 q1, [r1], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB23_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; The loop is skipped entirely when %n <= 0. | 
 |   %cmp15 = icmp sgt i32 %n, 0 | 
 |   br i1 %cmp15, label %for.body, label %for.cond.cleanup | 
 |  | 
 | for.cond.cleanup:                                 ; preds = %for.body, %entry | 
 |   ret void | 
 |  | 
 | for.body:                                         ; preds = %entry, %for.body | 
 |   %x.addr.018 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] | 
 |   %y.addr.017 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] | 
 |   %i.016 = phi i32 [ %sub, %for.body ], [ %n, %entry ] | 
 |   ; %0 is the lane predicate derived from the number of elements still to do. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.016) | 
 |   %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.018, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   %add.ptr = getelementptr inbounds i32, ptr %x.addr.018, i32 4 | 
 |   %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %y.addr.017, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   %3 = tail call <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32> %1, <4 x i32> %2, i32 %z, <4 x i1> %0) | 
 |   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %y.addr.017, i32 4, <4 x i1> %0) | 
 |   %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.017, i32 4 | 
 |   ; Four elements were consumed; loop again only if more than four remained. | 
 |   %sub = add nsw i32 %i.016, -4 | 
 |   %cmp = icmp sgt i32 %i.016, 4 | 
 |   br i1 %cmp, label %for.body, label %for.cond.cleanup | 
 | } | 
 |  | 
 | ; vmlasq: each iteration loads four i32 lanes from both %x and %y, computes | 
 | ; y * x + splat(%z) with plain mul/add IR, and stores the result back to %y | 
 | ; under the vctp32 predicate. The expected assembly is a dlstp/letp loop in | 
 | ; which the mul+add pair becomes a single vmlas.i32 with the scalar from r3. | 
 | define void @vmlasq(ptr %x, ptr %y, i32 %n, i32 %z) { | 
 | ; CHECK-LABEL: vmlasq: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB24_1: @ %for.body.preheader | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB24_2: @ %for.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    vldrw.u32 q1, [r1] | 
 | ; CHECK-NEXT:    vmlas.i32 q1, q0, r3 | 
 | ; CHECK-NEXT:    vstrw.32 q1, [r1], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB24_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Broadcast %z to all four lanes; the loop is skipped entirely when %n <= 0. | 
 |   %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0 | 
 |   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer | 
 |   %cmp14 = icmp sgt i32 %n, 0 | 
 |   br i1 %cmp14, label %for.body, label %for.cond.cleanup | 
 |  | 
 | for.cond.cleanup:                                 ; preds = %for.body, %entry | 
 |   ret void | 
 |  | 
 | for.body:                                         ; preds = %entry, %for.body | 
 |   %x.addr.017 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] | 
 |   %y.addr.016 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] | 
 |   %i.015 = phi i32 [ %sub, %for.body ], [ %n, %entry ] | 
 |   ; %0 is the lane predicate derived from the number of elements still to do. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.015) | 
 |   %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.017, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   %add.ptr = getelementptr inbounds i32, ptr %x.addr.017, i32 4 | 
 |   %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %y.addr.016, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   ; Multiply then add the broadcast scalar: (y * x) + z. | 
 |   %3 = mul <4 x i32> %2, %1 | 
 |   %4 = add <4 x i32> %3, %.splat | 
 |   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %4, ptr %y.addr.016, i32 4, <4 x i1> %0) | 
 |   %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.016, i32 4 | 
 |   ; Four elements were consumed; loop again only if more than four remained. | 
 |   %sub = add nsw i32 %i.015, -4 | 
 |   %cmp = icmp sgt i32 %i.015, 4 | 
 |   br i1 %cmp, label %for.body, label %for.cond.cleanup | 
 | } | 
 |  | 
 | ; vmlasqp: intrinsic-based counterpart of vmlasq. Each iteration loads four | 
 | ; i32 lanes from both %x and %y and calls the predicated vmlas.n intrinsic | 
 | ; with the scalar %z passed directly (no splat), storing the result back to | 
 | ; %y under the vctp32 predicate. The expected assembly is a dlstp/letp loop | 
 | ; using vmlas.i32 with the scalar taken from r3. | 
 | define void @vmlasqp(ptr %x, ptr %y, i32 %n, i32 %z) { | 
 | ; CHECK-LABEL: vmlasqp: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB25_1: @ %for.body.preheader | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB25_2: @ %for.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r1] | 
 | ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16 | 
 | ; CHECK-NEXT:    vmlas.i32 q1, q0, r3 | 
 | ; CHECK-NEXT:    vstrw.32 q1, [r1], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB25_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; The loop is skipped entirely when %n <= 0. | 
 |   %cmp15 = icmp sgt i32 %n, 0 | 
 |   br i1 %cmp15, label %for.body, label %for.cond.cleanup | 
 |  | 
 | for.cond.cleanup:                                 ; preds = %for.body, %entry | 
 |   ret void | 
 |  | 
 | for.body:                                         ; preds = %entry, %for.body | 
 |   %x.addr.018 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] | 
 |   %y.addr.017 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] | 
 |   %i.016 = phi i32 [ %sub, %for.body ], [ %n, %entry ] | 
 |   ; %0 is the lane predicate derived from the number of elements still to do. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.016) | 
 |   %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.018, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   %add.ptr = getelementptr inbounds i32, ptr %x.addr.018, i32 4 | 
 |   %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %y.addr.017, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) | 
 |   %3 = tail call <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32> %1, <4 x i32> %2, i32 %z, <4 x i1> %0) | 
 |   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %y.addr.017, i32 4, <4 x i1> %0) | 
 |   %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.017, i32 4 | 
 |   ; Four elements were consumed; loop again only if more than four remained. | 
 |   %sub = add nsw i32 %i.016, -4 | 
 |   %cmp = icmp sgt i32 %i.016, 4 | 
 |   br i1 %cmp, label %for.body, label %for.cond.cleanup | 
 | } | 
 |  | 
 | ; vaddqf: floating-point counterpart of vaddq. Each iteration loads four | 
 | ; float lanes from %x, adds a splat of %z with a fast-math fadd, and stores | 
 | ; the result to %y under the vctp32 predicate. The expected assembly is a | 
 | ; dlstp/letp loop using vadd.f32 with the scalar taken from r3. | 
 | define void @vaddqf(ptr %x, ptr %y, i32 %n, float %z) { | 
 | ; CHECK-LABEL: vaddqf: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB26_1: @ %for.body.preheader | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB26_2: @ %for.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    vadd.f32 q0, q0, r3 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r1], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB26_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Broadcast %z to all four lanes; the loop is skipped entirely when %n <= 0. | 
 |   %.splatinsert = insertelement <4 x float> poison, float %z, i32 0 | 
 |   %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer | 
 |   %cmp11 = icmp sgt i32 %n, 0 | 
 |   br i1 %cmp11, label %for.body, label %for.cond.cleanup | 
 |  | 
 | for.cond.cleanup:                                 ; preds = %for.body, %entry | 
 |   ret void | 
 |  | 
 | for.body:                                         ; preds = %entry, %for.body | 
 |   %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] | 
 |   %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] | 
 |   %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] | 
 |   ; %0 is the lane predicate derived from the number of elements still to do. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) | 
 |   %1 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) | 
 |   %add.ptr = getelementptr inbounds float, ptr %x.addr.014, i32 4 | 
 |   %2 = fadd fast <4 x float> %1, %.splat | 
 |   tail call void @llvm.masked.store.v4f32.p0(<4 x float> %2, ptr %y.addr.013, i32 4, <4 x i1> %0) | 
 |   %add.ptr1 = getelementptr inbounds float, ptr %y.addr.013, i32 4 | 
 |   ; Four elements were consumed; loop again only if more than four remained. | 
 |   %sub = add nsw i32 %i.012, -4 | 
 |   %cmp = icmp sgt i32 %i.012, 4 | 
 |   br i1 %cmp, label %for.body, label %for.cond.cleanup | 
 | } | 
 |  | 
 | ; vaddf: in-place predicated floating-point add. Each iteration loads four | 
 | ; float lanes from %s1, calls the predicated add intrinsic with a splat of | 
 | ; %c0 (the loaded vector %1 is also passed as the final operand), and stores | 
 | ; the result back to %s1 under the same vctp32 predicate. The expected | 
 | ; assembly is a dlstp/letp loop using vadd.f32 with the scalar from r1. | 
 | define void @vaddf(ptr %s1, float %c0, i32 %N) { | 
 | ; CHECK-LABEL: vaddf: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB27_1: @ %while.body.lr.ph | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB27_2: @ %while.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0] | 
 | ; CHECK-NEXT:    vadd.f32 q0, q0, r1 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB27_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %while.end | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   %cmp11 = icmp sgt i32 %N, 0 | 
 |   br i1 %cmp11, label %while.body.lr.ph, label %while.end | 
 |  | 
 | while.body.lr.ph:                                 ; preds = %entry | 
 |   ; Broadcast %c0 to all four lanes, using poison placeholders like the | 
 |   ; sibling tests in this file. | 
 |   %.splatinsert = insertelement <4 x float> poison, float %c0, i32 0 | 
 |   %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer | 
 |   br label %while.body | 
 |  | 
 | while.body:                                       ; preds = %while.body.lr.ph, %while.body | 
 |   %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] | 
 |   %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] | 
 |   ; %0 is the lane predicate derived from the number of elements still to do. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) | 
 |   %1 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %s1.addr.013, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) | 
 |   %2 = tail call fast <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %1, <4 x float> %.splat, <4 x i1> %0, <4 x float> %1) | 
 |   tail call void @llvm.masked.store.v4f32.p0(<4 x float> %2, ptr %s1.addr.013, i32 4, <4 x i1> %0) | 
 |   %add.ptr = getelementptr inbounds float, ptr %s1.addr.013, i32 4 | 
 |   ; Four elements were consumed; loop again only if more than four remained. | 
 |   %sub = add nsw i32 %N.addr.012, -4 | 
 |   %cmp = icmp sgt i32 %N.addr.012, 4 | 
 |   br i1 %cmp, label %while.body, label %while.end | 
 |  | 
 | while.end:                                        ; preds = %while.body, %entry | 
 |   ret void | 
 | } | 
 |  | 
 | ; vsubqf: each iteration loads four float lanes from %x, subtracts a splat of | 
 | ; %z with a fast-math fsub, and stores the result to %y under the vctp32 | 
 | ; predicate. The expected assembly is a dlstp/letp loop using vsub.f32 with | 
 | ; the scalar taken from r3. | 
 | define void @vsubqf(ptr %x, ptr %y, i32 %n, float %z) { | 
 | ; CHECK-LABEL: vsubqf: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB28_1: @ %for.body.preheader | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB28_2: @ %for.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    vsub.f32 q0, q0, r3 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r1], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB28_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Broadcast %z to all four lanes; the loop is skipped entirely when %n <= 0. | 
 |   %.splatinsert = insertelement <4 x float> poison, float %z, i32 0 | 
 |   %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer | 
 |   %cmp11 = icmp sgt i32 %n, 0 | 
 |   br i1 %cmp11, label %for.body, label %for.cond.cleanup | 
 |  | 
 | for.cond.cleanup:                                 ; preds = %for.body, %entry | 
 |   ret void | 
 |  | 
 | for.body:                                         ; preds = %entry, %for.body | 
 |   %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] | 
 |   %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] | 
 |   %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] | 
 |   ; %0 is the lane predicate derived from the number of elements still to do. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) | 
 |   %1 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) | 
 |   %add.ptr = getelementptr inbounds float, ptr %x.addr.014, i32 4 | 
 |   %2 = fsub fast <4 x float> %1, %.splat | 
 |   tail call void @llvm.masked.store.v4f32.p0(<4 x float> %2, ptr %y.addr.013, i32 4, <4 x i1> %0) | 
 |   %add.ptr1 = getelementptr inbounds float, ptr %y.addr.013, i32 4 | 
 |   ; Four elements were consumed; loop again only if more than four remained. | 
 |   %sub = add nsw i32 %i.012, -4 | 
 |   %cmp = icmp sgt i32 %i.012, 4 | 
 |   br i1 %cmp, label %for.body, label %for.cond.cleanup | 
 | } | 
 |  | 
 | ; vsubf: in-place predicated floating-point subtract. Each iteration loads | 
 | ; four float lanes from %s1, calls the predicated sub intrinsic with a splat | 
 | ; of %c0 (the loaded vector %1 is also passed as the final operand), and | 
 | ; stores the result back to %s1 under the same vctp32 predicate. The expected | 
 | ; assembly is a dlstp/letp loop using vsub.f32 with the scalar from r1. | 
 | define void @vsubf(ptr %s1, float %c0, i32 %N) { | 
 | ; CHECK-LABEL: vsubf: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB29_1: @ %while.body.lr.ph | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB29_2: @ %while.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0] | 
 | ; CHECK-NEXT:    vsub.f32 q0, q0, r1 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB29_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %while.end | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   %cmp11 = icmp sgt i32 %N, 0 | 
 |   br i1 %cmp11, label %while.body.lr.ph, label %while.end | 
 |  | 
 | while.body.lr.ph:                                 ; preds = %entry | 
 |   ; Broadcast %c0 to all four lanes, using poison placeholders like the | 
 |   ; sibling tests in this file. | 
 |   %.splatinsert = insertelement <4 x float> poison, float %c0, i32 0 | 
 |   %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer | 
 |   br label %while.body | 
 |  | 
 | while.body:                                       ; preds = %while.body.lr.ph, %while.body | 
 |   %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] | 
 |   %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] | 
 |   ; %0 is the lane predicate derived from the number of elements still to do. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) | 
 |   %1 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %s1.addr.013, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) | 
 |   %2 = tail call fast <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> %1, <4 x float> %.splat, <4 x i1> %0, <4 x float> %1) | 
 |   tail call void @llvm.masked.store.v4f32.p0(<4 x float> %2, ptr %s1.addr.013, i32 4, <4 x i1> %0) | 
 |   %add.ptr = getelementptr inbounds float, ptr %s1.addr.013, i32 4 | 
 |   ; Four elements were consumed; loop again only if more than four remained. | 
 |   %sub = add nsw i32 %N.addr.012, -4 | 
 |   %cmp = icmp sgt i32 %N.addr.012, 4 | 
 |   br i1 %cmp, label %while.body, label %while.end | 
 |  | 
 | while.end:                                        ; preds = %while.body, %entry | 
 |   ret void | 
 | } | 
 |  | 
 | ; Loop computing y[i] = x[i] * z four floats at a time under a vctp32 mask. | 
 | ; The multiply is a plain fmul against a splat of %z built in the entry block; | 
 | ; the expected assembly is a dlstp/letp loop using vmul.f32 with a scalar | 
 | ; register operand. | 
 | define void @vmulqf(ptr %x, ptr %y, i32 %n, float %z) { | 
 | ; CHECK-LABEL: vmulqf: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB30_1: @ %for.body.preheader | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB30_2: @ %for.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    vmul.f32 q0, q0, r3 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r1], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB30_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Broadcast %z to every lane, then guard against a non-positive trip count. | 
 |   %z.ins = insertelement <4 x float> poison, float %z, i32 0 | 
 |   %z.splat = shufflevector <4 x float> %z.ins, <4 x float> poison, <4 x i32> zeroinitializer | 
 |   %nonempty = icmp sgt i32 %n, 0 | 
 |   br i1 %nonempty, label %for.body, label %for.cond.cleanup | 
 |  | 
 | for.cond.cleanup:                                 ; preds = %for.body, %entry | 
 |   ret void | 
 |  | 
 | for.body:                                         ; preds = %entry, %for.body | 
 |   %src = phi ptr [ %src.next, %for.body ], [ %x, %entry ] | 
 |   %dst = phi ptr [ %dst.next, %for.body ], [ %y, %entry ] | 
 |   %remaining = phi i32 [ %remaining.next, %for.body ], [ %n, %entry ] | 
 |   ; Lane mask derived from the number of elements still to process. | 
 |   %mask = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %remaining) | 
 |   %vals = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %src, i32 4, <4 x i1> %mask, <4 x float> zeroinitializer) | 
 |   %src.next = getelementptr inbounds float, ptr %src, i32 4 | 
 |   %scaled = fmul fast <4 x float> %vals, %z.splat | 
 |   tail call void @llvm.masked.store.v4f32.p0(<4 x float> %scaled, ptr %dst, i32 4, <4 x i1> %mask) | 
 |   %dst.next = getelementptr inbounds float, ptr %dst, i32 4 | 
 |   ; Keep looping while more than four elements were pending this iteration. | 
 |   %remaining.next = add nsw i32 %remaining, -4 | 
 |   %more = icmp sgt i32 %remaining, 4 | 
 |   br i1 %more, label %for.body, label %for.cond.cleanup | 
 | } | 
 |  | 
 | ; In-place loop over %s1: each iteration loads four floats under a vctp32 mask, | 
 | ; multiplies them by the splat of %c0 with the MVE predicated mul intrinsic and | 
 | ; stores the result back to the same address. The expected assembly is a | 
 | ; dlstp/letp loop using the vector-by-scalar form of vmul.f32. | 
 | define void @vmulf(ptr %s1, float %c0, i32 %N) { | 
 | ; CHECK-LABEL: vmulf: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB31_1: @ %while.body.lr.ph | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB31_2: @ %while.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r0] | 
 | ; CHECK-NEXT:    vmul.f32 q0, q0, r1 | 
 | ; CHECK-NEXT:    vstrw.32 q0, [r0], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB31_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %while.end | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Skip the loop entirely when %N is not positive. | 
 |   %cmp11 = icmp sgt i32 %N, 0 | 
 |   br i1 %cmp11, label %while.body.lr.ph, label %while.end | 
 |  | 
 | while.body.lr.ph:                                 ; preds = %entry | 
 |   ; Splat %c0 across all four lanes. The placeholder operands are poison rather | 
 |   ; than undef; the all-zero shuffle mask only ever reads lane 0 of the insert. | 
 |   %.splatinsert = insertelement <4 x float> poison, float %c0, i32 0 | 
 |   %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer | 
 |   br label %while.body | 
 |  | 
 | while.body:                                       ; preds = %while.body.lr.ph, %while.body | 
 |   %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] | 
 |   %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] | 
 |   ; The mask is derived from the remaining element count. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) | 
 |   %1 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %s1.addr.013, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) | 
 |   %2 = tail call fast <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> %1, <4 x float> %.splat, <4 x i1> %0, <4 x float> %1) | 
 |   tail call void @llvm.masked.store.v4f32.p0(<4 x float> %2, ptr %s1.addr.013, i32 4, <4 x i1> %0) | 
 |   ; Advance by four floats and loop while more than four elements were pending. | 
 |   %add.ptr = getelementptr inbounds float, ptr %s1.addr.013, i32 4 | 
 |   %sub = add nsw i32 %N.addr.012, -4 | 
 |   %cmp = icmp sgt i32 %N.addr.012, 4 | 
 |   br i1 %cmp, label %while.body, label %while.end | 
 |  | 
 | while.end:                                        ; preds = %while.body, %entry | 
 |   ret void | 
 | } | 
 |  | 
 | ; Loop computing y[i] = y[i] * z + x[i] four floats at a time under a vctp32 | 
 | ; mask, using the generic llvm.fma intrinsic with a splat of %z as the second | 
 | ; multiplicand. The expected assembly is a dlstp/letp loop using vfma.f32 with | 
 | ; a scalar register operand. | 
 | define void @vfmaq(ptr %x, ptr %y, i32 %n, float %z) { | 
 | ; CHECK-LABEL: vfmaq: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB32_1: @ %for.body.preheader | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB32_2: @ %for.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r1] | 
 | ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16 | 
 | ; CHECK-NEXT:    vfma.f32 q1, q0, r3 | 
 | ; CHECK-NEXT:    vstrw.32 q1, [r1], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB32_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Broadcast %z to every lane, then guard against a non-positive trip count. | 
 |   %z.ins = insertelement <4 x float> poison, float %z, i32 0 | 
 |   %z.splat = shufflevector <4 x float> %z.ins, <4 x float> poison, <4 x i32> zeroinitializer | 
 |   %nonempty = icmp sgt i32 %n, 0 | 
 |   br i1 %nonempty, label %for.body, label %for.cond.cleanup | 
 |  | 
 | for.cond.cleanup:                                 ; preds = %for.body, %entry | 
 |   ret void | 
 |  | 
 | for.body:                                         ; preds = %entry, %for.body | 
 |   %src = phi ptr [ %src.next, %for.body ], [ %x, %entry ] | 
 |   %acc.ptr = phi ptr [ %acc.next, %for.body ], [ %y, %entry ] | 
 |   %remaining = phi i32 [ %remaining.next, %for.body ], [ %n, %entry ] | 
 |   ; Lane mask derived from the number of elements still to process. | 
 |   %mask = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %remaining) | 
 |   %addend = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %src, i32 4, <4 x i1> %mask, <4 x float> zeroinitializer) | 
 |   %src.next = getelementptr inbounds float, ptr %src, i32 4 | 
 |   %factor = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %acc.ptr, i32 4, <4 x i1> %mask, <4 x float> zeroinitializer) | 
 |   ; fused = factor * splat(z) + addend, written back over the y data. | 
 |   %fused = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %factor, <4 x float> %z.splat, <4 x float> %addend) | 
 |   tail call void @llvm.masked.store.v4f32.p0(<4 x float> %fused, ptr %acc.ptr, i32 4, <4 x i1> %mask) | 
 |   %acc.next = getelementptr inbounds float, ptr %acc.ptr, i32 4 | 
 |   ; Keep looping while more than four elements were pending this iteration. | 
 |   %remaining.next = add nsw i32 %remaining, -4 | 
 |   %more = icmp sgt i32 %remaining, 4 | 
 |   br i1 %more, label %for.body, label %for.cond.cleanup | 
 | } | 
 |  | 
 | ; In-place loop over %s1 using the MVE predicated fma intrinsic: each iteration | 
 | ; computes s2_vec * splat(%c0) + s1_vec under a vctp32 mask and stores the result | 
 | ; back to %s1. Note that %s2 is never advanced inside the loop, so every | 
 | ; iteration reloads from the same address; the expected assembly accordingly | 
 | ; loads from [r1] without writeback. The expected loop is dlstp/letp with the | 
 | ; vector-by-scalar form of vfma.f32. | 
 | define void @vfma(ptr %s1, ptr %s2, float %c0, i32 %N) { | 
 | ; CHECK-LABEL: vfma: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r3, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB33_1: @ %while.body.lr.ph | 
 | ; CHECK-NEXT:    dlstp.32 lr, r3 | 
 | ; CHECK-NEXT:  .LBB33_2: @ %while.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r1] | 
 | ; CHECK-NEXT:    vldrw.u32 q1, [r0] | 
 | ; CHECK-NEXT:    vfma.f32 q1, q0, r2 | 
 | ; CHECK-NEXT:    vstrw.32 q1, [r0], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB33_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %while.end | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Skip the loop entirely when %N is not positive. | 
 |   %cmp12 = icmp sgt i32 %N, 0 | 
 |   br i1 %cmp12, label %while.body.lr.ph, label %while.end | 
 |  | 
 | while.body.lr.ph:                                 ; preds = %entry | 
 |   ; Splat %c0 across all four lanes. The placeholder operands are poison rather | 
 |   ; than undef; the all-zero shuffle mask only ever reads lane 0 of the insert. | 
 |   %.splatinsert = insertelement <4 x float> poison, float %c0, i32 0 | 
 |   %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer | 
 |   br label %while.body | 
 |  | 
 | while.body:                                       ; preds = %while.body.lr.ph, %while.body | 
 |   %s1.addr.014 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] | 
 |   %N.addr.013 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] | 
 |   ; The mask is derived from the remaining element count. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.013) | 
 |   %1 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %s1.addr.014, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) | 
 |   %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %s2, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) | 
 |   %3 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> %.splat, <4 x float> %1, <4 x i1> %0) | 
 |   tail call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %s1.addr.014, i32 4, <4 x i1> %0) | 
 |   ; Advance %s1 by four floats and loop while more than four elements were pending. | 
 |   %add.ptr = getelementptr inbounds float, ptr %s1.addr.014, i32 4 | 
 |   %sub = add nsw i32 %N.addr.013, -4 | 
 |   %cmp = icmp sgt i32 %N.addr.013, 4 | 
 |   br i1 %cmp, label %while.body, label %while.end | 
 |  | 
 | while.end:                                        ; preds = %while.body, %entry | 
 |   ret void | 
 | } | 
 |  | 
 | ; Loop computing y[i] = x[i] * y[i] + z four floats at a time under a vctp32 | 
 | ; mask, using the generic llvm.fma intrinsic with a splat of %z as the addend. | 
 | ; The expected assembly is a dlstp/letp loop using vfmas.f32 with a scalar | 
 | ; register operand. | 
 | define void @vfmasq(ptr %x, ptr %y, i32 %n, float %z) { | 
 | ; CHECK-LABEL: vfmasq: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r2, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB34_1: @ %for.body.preheader | 
 | ; CHECK-NEXT:    dlstp.32 lr, r2 | 
 | ; CHECK-NEXT:  .LBB34_2: @ %for.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r1] | 
 | ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16 | 
 | ; CHECK-NEXT:    vfmas.f32 q1, q0, r3 | 
 | ; CHECK-NEXT:    vstrw.32 q1, [r1], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB34_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Broadcast %z to every lane, then guard against a non-positive trip count. | 
 |   %z.ins = insertelement <4 x float> poison, float %z, i32 0 | 
 |   %z.splat = shufflevector <4 x float> %z.ins, <4 x float> poison, <4 x i32> zeroinitializer | 
 |   %nonempty = icmp sgt i32 %n, 0 | 
 |   br i1 %nonempty, label %for.body, label %for.cond.cleanup | 
 |  | 
 | for.cond.cleanup:                                 ; preds = %for.body, %entry | 
 |   ret void | 
 |  | 
 | for.body:                                         ; preds = %entry, %for.body | 
 |   %src = phi ptr [ %src.next, %for.body ], [ %x, %entry ] | 
 |   %acc.ptr = phi ptr [ %acc.next, %for.body ], [ %y, %entry ] | 
 |   %remaining = phi i32 [ %remaining.next, %for.body ], [ %n, %entry ] | 
 |   ; Lane mask derived from the number of elements still to process. | 
 |   %mask = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %remaining) | 
 |   %lhs = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %src, i32 4, <4 x i1> %mask, <4 x float> zeroinitializer) | 
 |   %src.next = getelementptr inbounds float, ptr %src, i32 4 | 
 |   %rhs = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %acc.ptr, i32 4, <4 x i1> %mask, <4 x float> zeroinitializer) | 
 |   ; fused = lhs * rhs + splat(z), written back over the y data. | 
 |   %fused = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %rhs, <4 x float> %z.splat) | 
 |   tail call void @llvm.masked.store.v4f32.p0(<4 x float> %fused, ptr %acc.ptr, i32 4, <4 x i1> %mask) | 
 |   %acc.next = getelementptr inbounds float, ptr %acc.ptr, i32 4 | 
 |   ; Keep looping while more than four elements were pending this iteration. | 
 |   %remaining.next = add nsw i32 %remaining, -4 | 
 |   %more = icmp sgt i32 %remaining, 4 | 
 |   br i1 %more, label %for.body, label %for.cond.cleanup | 
 | } | 
 |  | 
 | ; In-place loop over %s1 using the MVE predicated fma intrinsic: each iteration | 
 | ; computes s1_vec * s2_vec + splat(%c0) under a vctp32 mask and stores the result | 
 | ; back to %s1. Note that %s2 is never advanced inside the loop, so every | 
 | ; iteration reloads from the same address; the expected assembly accordingly | 
 | ; loads from [r1] without writeback. The expected loop is dlstp/letp with the | 
 | ; vector-by-scalar form of vfmas.f32. | 
 | define void @vfmas(ptr %s1, ptr %s2, float %c0, i32 %N) { | 
 | ; CHECK-LABEL: vfmas: | 
 | ; CHECK:       @ %bb.0: @ %entry | 
 | ; CHECK-NEXT:    .save {r7, lr} | 
 | ; CHECK-NEXT:    push {r7, lr} | 
 | ; CHECK-NEXT:    cmp r3, #1 | 
 | ; CHECK-NEXT:    it lt | 
 | ; CHECK-NEXT:    poplt {r7, pc} | 
 | ; CHECK-NEXT:  .LBB35_1: @ %while.body.lr.ph | 
 | ; CHECK-NEXT:    dlstp.32 lr, r3 | 
 | ; CHECK-NEXT:  .LBB35_2: @ %while.body | 
 | ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1 | 
 | ; CHECK-NEXT:    vldrw.u32 q0, [r1] | 
 | ; CHECK-NEXT:    vldrw.u32 q1, [r0] | 
 | ; CHECK-NEXT:    vfmas.f32 q1, q0, r2 | 
 | ; CHECK-NEXT:    vstrw.32 q1, [r0], #16 | 
 | ; CHECK-NEXT:    letp lr, .LBB35_2 | 
 | ; CHECK-NEXT:  @ %bb.3: @ %while.end | 
 | ; CHECK-NEXT:    pop {r7, pc} | 
 | entry: | 
 |   ; Skip the loop entirely when %N is not positive. | 
 |   %cmp12 = icmp sgt i32 %N, 0 | 
 |   br i1 %cmp12, label %while.body.lr.ph, label %while.end | 
 |  | 
 | while.body.lr.ph:                                 ; preds = %entry | 
 |   ; Splat %c0 across all four lanes. The placeholder operands are poison rather | 
 |   ; than undef; the all-zero shuffle mask only ever reads lane 0 of the insert. | 
 |   %.splatinsert = insertelement <4 x float> poison, float %c0, i32 0 | 
 |   %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer | 
 |   br label %while.body | 
 |  | 
 | while.body:                                       ; preds = %while.body.lr.ph, %while.body | 
 |   %s1.addr.014 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] | 
 |   %N.addr.013 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] | 
 |   ; The mask is derived from the remaining element count. | 
 |   %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.013) | 
 |   %1 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %s1.addr.014, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) | 
 |   %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %s2, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) | 
 |   %3 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %1, <4 x float> %2, <4 x float> %.splat, <4 x i1> %0) | 
 |   tail call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %s1.addr.014, i32 4, <4 x i1> %0) | 
 |   ; Advance %s1 by four floats and loop while more than four elements were pending. | 
 |   %add.ptr = getelementptr inbounds float, ptr %s1.addr.014, i32 4 | 
 |   %sub = add nsw i32 %N.addr.013, -4 | 
 |   %cmp = icmp sgt i32 %N.addr.013, 4 | 
 |   br i1 %cmp, label %while.body, label %while.end | 
 |  | 
 | while.end:                                        ; preds = %while.body, %entry | 
 |   ret void | 
 | } | 
 |  | 
 | ; Intrinsic declarations used by the test functions in this file. | 
 | ; MVE vctp32 predicate intrinsic (i32 -> <4 x i1>) and the generic masked | 
 | ; load/store intrinsics. | 
 | declare <4 x i1> @llvm.arm.mve.vctp32(i32) | 
 | declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32 immarg, <4 x i1>, <4 x i16>) | 
 | declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>) | 
 | declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32 immarg, <4 x i1>, <4 x float>) | 
 | declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>) | 
 | declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32 immarg, <4 x i1>) | 
 |  | 
 | ; Integer MVE intrinsics (predicated and unpredicated) together with the | 
 | ; generic saturating add/sub intrinsics. | 
 | declare <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) | 
 | declare <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) | 
 | declare <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) | 
 | declare <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) | 
 | declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>) | 
 | declare <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32>, <4 x i32>) | 
 | declare <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) | 
 | declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>) | 
 | declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) | 
 | declare <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) | 
 | declare <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32>, <4 x i32>, i32) | 
 | declare <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) | 
 | declare <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32>, <4 x i32>, i32) | 
 | ; NOTE(review): the next declaration references attribute group #1, whose | 
 | ; definition is not visible in this part of the file - confirm it is intended. | 
 | declare <2 x i64> @llvm.arm.mve.vqdmull.v2i64.v4i32(<4 x i32>, <4 x i32>, i32) #1 | 
 | declare <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16>, <8 x i16>, i32, <4 x i1>, <4 x i32>) | 
 | declare <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) | 
 | declare <4 x i32> @llvm.arm.mve.vqdmulh.v4i32(<4 x i32>, <4 x i32>) | 
 | declare <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) | 
 | declare <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) | 
 | declare <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>) | 
 | declare <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>) | 
 | ; Floating-point (<4 x float>) MVE predicated intrinsics and the generic fma. | 
 | declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) | 
 | declare <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) | 
 | declare <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) | 
 | declare <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x float>, <4 x i1>) | 
 | declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) |