| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s |
| ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} |
| |
| ; TODO: add i1 and <6 x i8> vector tests. |
| |
| ; TODO: add test for vectors that exceed 128-bit length |
| ; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors |
| ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. |
| |
| ; TODO: generate PTX that preserves Concurrent Forward Progress |
| ; for atomic operations to local statespace |
| ; by generating atomic or volatile operations. |
| |
| ; TODO: design exposure for atomic operations on vector types. |
| |
| ; TODO: add weak, atomic, volatile, and atomic volatile tests |
| ; for the .const and .param statespaces. |
| |
| ;; generic statespace |
| |
| ; generic |
| |
| ; TODO: make the lowering of these weak vector ops consistent with |
| ; the one of the next test. This test lowers to a weak PTX |
| ; vector op, but the next test lowers to a weak PTX scalar op. |
| ; Loads a <2 x i8> through a generic pointer, adds 1 to every lane and |
| ; stores the result back. The expected PTX uses one ld.v2.u8 / st.v2.u8 |
| ; pair, with the per-lane adds performed in 16-bit registers. |
| define void @generic_2xi8(ptr %a) { |
| ; CHECK-LABEL: generic_2xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi8_param_0]; |
| ; CHECK-NEXT: ld.v2.u8 {%rs1, %rs2}, [%rd1]; |
| ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; |
| ; CHECK-NEXT: st.v2.u8 [%rd1], {%rs4, %rs3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <2 x i8>, ptr %a |
| %a.add = add <2 x i8> %a.load, <i8 1, i8 1> |
| store <2 x i8> %a.add, ptr %a |
| ret void |
| } |
| |
| ; TODO: make the lowering of these weak vector ops consistent with |
| ; the one of the previous test. This test lowers to a weak |
| ; PTX scalar op, but the prior test lowers to a vector PTX op. |
| ; Loads a <4 x i8> through a generic pointer, adds 1 to every lane and |
| ; stores the result back. The expected PTX moves the whole vector with a |
| ; single scalar ld.u32 / st.u32; each byte is extracted with bfe.u32, |
| ; incremented in a 16-bit register and repacked with prmt.b32. |
| define void @generic_4xi8(ptr %a) { |
| ; CHECK-LABEL: generic_4xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<9>; |
| ; CHECK-NEXT: .reg .b32 %r<13>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xi8_param_0]; |
| ; CHECK-NEXT: ld.u32 %r1, [%rd1]; |
| ; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; |
| ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; |
| ; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; |
| ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; |
| ; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; |
| ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; |
| ; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; |
| ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; |
| ; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; |
| ; CHECK-NEXT: st.u32 [%rd1], %r12; |
| ; CHECK-NEXT: ret; |
| %a.load = load <4 x i8>, ptr %a |
| %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> |
| store <4 x i8> %a.add, ptr %a |
| ret void |
| } |
| |
| ; Loads an <8 x i8> through a generic pointer, adds 1 to every lane and |
| ; stores the result back. The expected PTX uses one ld.v2.b32 / st.v2.b32 |
| ; pair; the bytes of each 32-bit element are extracted with bfe.u32, |
| ; incremented in 16-bit registers and repacked with prmt.b32. |
| define void @generic_8xi8(ptr %a) { |
| ; CHECK-LABEL: generic_8xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<17>; |
| ; CHECK-NEXT: .reg .b32 %r<25>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [generic_8xi8_param_0]; |
| ; CHECK-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1]; |
| ; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; |
| ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r4, %rs2; |
| ; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r5; |
| ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r6, %rs4; |
| ; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs5, %r8; |
| ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r9, %rs6; |
| ; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; |
| ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; |
| ; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs9, %r14; |
| ; CHECK-NEXT: add.s16 %rs10, %rs9, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r15, %rs10; |
| ; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs11, %r16; |
| ; CHECK-NEXT: add.s16 %rs12, %rs11, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r17, %rs12; |
| ; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs13, %r19; |
| ; CHECK-NEXT: add.s16 %rs14, %rs13, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r20, %rs14; |
| ; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs15, %r21; |
| ; CHECK-NEXT: add.s16 %rs16, %rs15, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r22, %rs16; |
| ; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U; |
| ; CHECK-NEXT: st.v2.b32 [%rd1], {%r24, %r13}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <8 x i8>, ptr %a |
| %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> |
| store <8 x i8> %a.add, ptr %a |
| ret void |
| } |
| |
| ; Loads a <16 x i8> through a generic pointer, adds 1 to every lane and |
| ; stores the result back. The expected PTX uses one ld.v4.b32 / st.v4.b32 |
| ; pair; the bytes of each 32-bit element are extracted with bfe.u32, |
| ; incremented in 16-bit registers and repacked with prmt.b32. |
| define void @generic_16xi8(ptr %a) { |
| ; CHECK-LABEL: generic_16xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<33>; |
| ; CHECK-NEXT: .reg .b32 %r<49>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [generic_16xi8_param_0]; |
| ; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; |
| ; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; |
| ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r6, %rs2; |
| ; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r7; |
| ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r8, %rs4; |
| ; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs5, %r10; |
| ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r11, %rs6; |
| ; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs7, %r12; |
| ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r13, %rs8; |
| ; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs9, %r16; |
| ; CHECK-NEXT: add.s16 %rs10, %rs9, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r17, %rs10; |
| ; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs11, %r18; |
| ; CHECK-NEXT: add.s16 %rs12, %rs11, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r19, %rs12; |
| ; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs13, %r21; |
| ; CHECK-NEXT: add.s16 %rs14, %rs13, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r22, %rs14; |
| ; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs15, %r23; |
| ; CHECK-NEXT: add.s16 %rs16, %rs15, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r24, %rs16; |
| ; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs17, %r27; |
| ; CHECK-NEXT: add.s16 %rs18, %rs17, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r28, %rs18; |
| ; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs19, %r29; |
| ; CHECK-NEXT: add.s16 %rs20, %rs19, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r30, %rs20; |
| ; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs21, %r32; |
| ; CHECK-NEXT: add.s16 %rs22, %rs21, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r33, %rs22; |
| ; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs23, %r34; |
| ; CHECK-NEXT: add.s16 %rs24, %rs23, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r35, %rs24; |
| ; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs25, %r38; |
| ; CHECK-NEXT: add.s16 %rs26, %rs25, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r39, %rs26; |
| ; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs27, %r40; |
| ; CHECK-NEXT: add.s16 %rs28, %rs27, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r41, %rs28; |
| ; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs29, %r43; |
| ; CHECK-NEXT: add.s16 %rs30, %rs29, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r44, %rs30; |
| ; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs31, %r45; |
| ; CHECK-NEXT: add.s16 %rs32, %rs31, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r46, %rs32; |
| ; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U; |
| ; CHECK-NEXT: st.v4.b32 [%rd1], {%r48, %r37, %r26, %r15}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <16 x i8>, ptr %a |
| %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> |
| store <16 x i8> %a.add, ptr %a |
| ret void |
| } |
| |
| ; Loads a <2 x i16> through a generic pointer, adds 1 to every lane and |
| ; stores the result back. The expected PTX moves the whole vector with a |
| ; single scalar ld.u32 / st.u32 and uses mov.b32 to split it into, and |
| ; rebuild it from, two 16-bit registers. |
| define void @generic_2xi16(ptr %a) { |
| ; CHECK-LABEL: generic_2xi16( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<5>; |
| ; CHECK-NEXT: .reg .b32 %r<3>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi16_param_0]; |
| ; CHECK-NEXT: ld.u32 %r1, [%rd1]; |
| ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; |
| ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; |
| ; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; |
| ; CHECK-NEXT: st.u32 [%rd1], %r2; |
| ; CHECK-NEXT: ret; |
| %a.load = load <2 x i16>, ptr %a |
| %a.add = add <2 x i16> %a.load, <i16 1, i16 1> |
| store <2 x i16> %a.add, ptr %a |
| ret void |
| } |
| |
| ; Loads a <4 x i16> through a generic pointer, adds 1 to every lane and |
| ; stores the result back. The expected PTX uses one ld.v4.u16 / st.v4.u16 |
| ; pair with one add.s16 per lane. |
| define void @generic_4xi16(ptr %a) { |
| ; CHECK-LABEL: generic_4xi16( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xi16_param_0]; |
| ; CHECK-NEXT: ld.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; |
| ; CHECK-NEXT: add.s16 %rs5, %rs4, 1; |
| ; CHECK-NEXT: add.s16 %rs6, %rs3, 1; |
| ; CHECK-NEXT: add.s16 %rs7, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs8, %rs1, 1; |
| ; CHECK-NEXT: st.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <4 x i16>, ptr %a |
| %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> |
| store <4 x i16> %a.add, ptr %a |
| ret void |
| } |
| |
| ; Loads an <8 x i16> through a generic pointer, adds 1 to every lane and |
| ; stores the result back. The expected PTX uses one ld.v4.b32 / st.v4.b32 |
| ; pair; each 32-bit element is split into two 16-bit registers with |
| ; mov.b32, incremented, and rebuilt with mov.b32. |
| define void @generic_8xi16(ptr %a) { |
| ; CHECK-LABEL: generic_8xi16( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<17>; |
| ; CHECK-NEXT: .reg .b32 %r<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [generic_8xi16_param_0]; |
| ; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; |
| ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; |
| ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; |
| ; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; |
| ; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3; |
| ; CHECK-NEXT: add.s16 %rs7, %rs6, 1; |
| ; CHECK-NEXT: add.s16 %rs8, %rs5, 1; |
| ; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7}; |
| ; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2; |
| ; CHECK-NEXT: add.s16 %rs11, %rs10, 1; |
| ; CHECK-NEXT: add.s16 %rs12, %rs9, 1; |
| ; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11}; |
| ; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1; |
| ; CHECK-NEXT: add.s16 %rs15, %rs14, 1; |
| ; CHECK-NEXT: add.s16 %rs16, %rs13, 1; |
| ; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15}; |
| ; CHECK-NEXT: st.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <8 x i16>, ptr %a |
| %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> |
| store <8 x i16> %a.add, ptr %a |
| ret void |
| } |
| |
| ; Loads a <2 x i32> through a generic pointer, adds 1 to every lane and |
| ; stores the result back. The expected PTX uses one ld.v2.u32 / st.v2.u32 |
| ; pair with one add.s32 per lane. |
| define void @generic_2xi32(ptr %a) { |
| ; CHECK-LABEL: generic_2xi32( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b32 %r<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi32_param_0]; |
| ; CHECK-NEXT: ld.v2.u32 {%r1, %r2}, [%rd1]; |
| ; CHECK-NEXT: add.s32 %r3, %r2, 1; |
| ; CHECK-NEXT: add.s32 %r4, %r1, 1; |
| ; CHECK-NEXT: st.v2.u32 [%rd1], {%r4, %r3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <2 x i32>, ptr %a |
| %a.add = add <2 x i32> %a.load, <i32 1, i32 1> |
| store <2 x i32> %a.add, ptr %a |
| ret void |
| } |
| |
| ; Loads a <4 x i32> through a generic pointer, adds 1 to every lane and |
| ; stores the result back. The expected PTX uses one ld.v4.u32 / st.v4.u32 |
| ; pair with one add.s32 per lane. |
| define void @generic_4xi32(ptr %a) { |
| ; CHECK-LABEL: generic_4xi32( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b32 %r<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xi32_param_0]; |
| ; CHECK-NEXT: ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; |
| ; CHECK-NEXT: add.s32 %r5, %r4, 1; |
| ; CHECK-NEXT: add.s32 %r6, %r3, 1; |
| ; CHECK-NEXT: add.s32 %r7, %r2, 1; |
| ; CHECK-NEXT: add.s32 %r8, %r1, 1; |
| ; CHECK-NEXT: st.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <4 x i32>, ptr %a |
| %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> |
| store <4 x i32> %a.add, ptr %a |
| ret void |
| } |
| |
| ; Loads a <2 x i64> through a generic pointer, adds 1 to every lane and |
| ; stores the result back. The expected PTX uses one ld.v2.u64 / st.v2.u64 |
| ; pair with one add.s64 per lane. |
| define void @generic_2xi64(ptr %a) { |
| ; CHECK-LABEL: generic_2xi64( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b64 %rd<6>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi64_param_0]; |
| ; CHECK-NEXT: ld.v2.u64 {%rd2, %rd3}, [%rd1]; |
| ; CHECK-NEXT: add.s64 %rd4, %rd3, 1; |
| ; CHECK-NEXT: add.s64 %rd5, %rd2, 1; |
| ; CHECK-NEXT: st.v2.u64 [%rd1], {%rd5, %rd4}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <2 x i64>, ptr %a |
| %a.add = add <2 x i64> %a.load, <i64 1, i64 1> |
| store <2 x i64> %a.add, ptr %a |
| ret void |
| } |
| |
| ; Loads a <2 x float> through a generic pointer, adds 1.0 to every lane |
| ; and stores the result back. The expected PTX uses one ld.v2.f32 / |
| ; st.v2.f32 pair with one add.rn.f32 per lane. |
| define void @generic_2xfloat(ptr %a) { |
| ; CHECK-LABEL: generic_2xfloat( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .f32 %f<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xfloat_param_0]; |
| ; CHECK-NEXT: ld.v2.f32 {%f1, %f2}, [%rd1]; |
| ; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; |
| ; CHECK-NEXT: st.v2.f32 [%rd1], {%f4, %f3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <2 x float>, ptr %a |
| %a.add = fadd <2 x float> %a.load, <float 1., float 1.> |
| store <2 x float> %a.add, ptr %a |
| ret void |
| } |
| |
| ; Loads a <4 x float> through a generic pointer, adds 1.0 to every lane |
| ; and stores the result back. The expected PTX uses one ld.v4.f32 / |
| ; st.v4.f32 pair with one add.rn.f32 per lane. |
| define void @generic_4xfloat(ptr %a) { |
| ; CHECK-LABEL: generic_4xfloat( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .f32 %f<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xfloat_param_0]; |
| ; CHECK-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; |
| ; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; |
| ; CHECK-NEXT: st.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <4 x float>, ptr %a |
| %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> |
| store <4 x float> %a.add, ptr %a |
| ret void |
| } |
| |
| ; Loads a <2 x double> through a generic pointer, adds 1.0 to every lane |
| ; and stores the result back. The expected PTX uses one ld.v2.f64 / |
| ; st.v2.f64 pair with one add.rn.f64 per lane. |
| define void @generic_2xdouble(ptr %a) { |
| ; CHECK-LABEL: generic_2xdouble( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-NEXT: .reg .f64 %fd<5>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xdouble_param_0]; |
| ; CHECK-NEXT: ld.v2.f64 {%fd1, %fd2}, [%rd1]; |
| ; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; |
| ; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; |
| ; CHECK-NEXT: st.v2.f64 [%rd1], {%fd4, %fd3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <2 x double>, ptr %a |
| %a.add = fadd <2 x double> %a.load, <double 1., double 1.> |
| store <2 x double> %a.add, ptr %a |
| ret void |
| } |
| |
| ; generic_volatile |
| |
| ; TODO: volatile, atomic, and volatile atomic memory operations on vector types. |
| ; Currently, LLVM: |
| ; - does not allow atomic operations on vectors. |
| ; - allows volatile operations on vectors, but it is not clear what that means. |
| ; Both of the following semantics make sense in general, and PTX supports both: |
| ; - volatile/atomic/volatile atomic applies to the whole vector |
| ; - volatile/atomic/volatile atomic applies elementwise |
| ; Actions required: |
| ; - clarify LLVM semantics for volatile on vectors and align the NVPTX backend with those. |
| ; The tests below show that the current implementation picks the semantics inconsistently: |
| ; * volatile <2 x i8> lowers to "elementwise volatile" |
| ; * volatile <4 x i8> lowers to "full vector volatile" |
| ; - provide support for vector atomics, e.g., by extending LLVM IR or via intrinsics |
| ; - update tests in load-store-sm70.ll as well. |
| |
| ; TODO: make this operation consistent with the one for <4 x i8>. |
| ; This operation lowers to an "elementwise volatile PTX operation". |
| ; Volatile load of a <2 x i8> through a generic pointer, add 1 to every |
| ; lane, volatile store of the result. The expected PTX uses one |
| ; ld.volatile.v2.u8 / st.volatile.v2.u8 pair. |
| define void @generic_volatile_2xi8(ptr %a) { |
| ; CHECK-LABEL: generic_volatile_2xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xi8_param_0]; |
| ; CHECK-NEXT: ld.volatile.v2.u8 {%rs1, %rs2}, [%rd1]; |
| ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; |
| ; CHECK-NEXT: st.volatile.v2.u8 [%rd1], {%rs4, %rs3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <2 x i8>, ptr %a |
| %a.add = add <2 x i8> %a.load, <i8 1, i8 1> |
| store volatile <2 x i8> %a.add, ptr %a |
| ret void |
| } |
| |
| ; TODO: make this operation consistent with the one for <2 x i8>. |
| ; This operation lowers to a "full vector volatile PTX operation". |
| ; Volatile load of a <4 x i8> through a generic pointer, add 1 to every |
| ; lane, volatile store of the result. The expected PTX moves the whole |
| ; vector with a single scalar ld.volatile.u32 / st.volatile.u32; each byte |
| ; is extracted with bfe.u32, incremented in a 16-bit register and repacked |
| ; with prmt.b32. |
| define void @generic_volatile_4xi8(ptr %a) { |
| ; CHECK-LABEL: generic_volatile_4xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<9>; |
| ; CHECK-NEXT: .reg .b32 %r<13>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xi8_param_0]; |
| ; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1]; |
| ; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; |
| ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; |
| ; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; |
| ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; |
| ; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; |
| ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; |
| ; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; |
| ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; |
| ; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; |
| ; CHECK-NEXT: st.volatile.u32 [%rd1], %r12; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <4 x i8>, ptr %a |
| %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> |
| store volatile <4 x i8> %a.add, ptr %a |
| ret void |
| } |
| |
| ; Volatile load of an <8 x i8> through a generic pointer, add 1 to every |
| ; lane, volatile store of the result. The expected PTX uses one |
| ; ld.volatile.v2.b32 / st.volatile.v2.b32 pair; the bytes of each 32-bit |
| ; element are extracted with bfe.u32, incremented in 16-bit registers and |
| ; repacked with prmt.b32. |
| define void @generic_volatile_8xi8(ptr %a) { |
| ; CHECK-LABEL: generic_volatile_8xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<17>; |
| ; CHECK-NEXT: .reg .b32 %r<25>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_8xi8_param_0]; |
| ; CHECK-NEXT: ld.volatile.v2.b32 {%r1, %r2}, [%rd1]; |
| ; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; |
| ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r4, %rs2; |
| ; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r5; |
| ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r6, %rs4; |
| ; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs5, %r8; |
| ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r9, %rs6; |
| ; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; |
| ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; |
| ; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs9, %r14; |
| ; CHECK-NEXT: add.s16 %rs10, %rs9, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r15, %rs10; |
| ; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs11, %r16; |
| ; CHECK-NEXT: add.s16 %rs12, %rs11, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r17, %rs12; |
| ; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs13, %r19; |
| ; CHECK-NEXT: add.s16 %rs14, %rs13, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r20, %rs14; |
| ; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs15, %r21; |
| ; CHECK-NEXT: add.s16 %rs16, %rs15, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r22, %rs16; |
| ; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U; |
| ; CHECK-NEXT: st.volatile.v2.b32 [%rd1], {%r24, %r13}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <8 x i8>, ptr %a |
| %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> |
| store volatile <8 x i8> %a.add, ptr %a |
| ret void |
| } |
| |
| ; Volatile load of a <16 x i8> through a generic pointer, add 1 to every |
| ; lane, volatile store of the result. The expected PTX uses one |
| ; ld.volatile.v4.b32 / st.volatile.v4.b32 pair; the bytes of each 32-bit |
| ; element are extracted with bfe.u32, incremented in 16-bit registers and |
| ; repacked with prmt.b32. |
| define void @generic_volatile_16xi8(ptr %a) { |
| ; CHECK-LABEL: generic_volatile_16xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<33>; |
| ; CHECK-NEXT: .reg .b32 %r<49>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_16xi8_param_0]; |
| ; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; |
| ; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; |
| ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r6, %rs2; |
| ; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r7; |
| ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r8, %rs4; |
| ; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs5, %r10; |
| ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r11, %rs6; |
| ; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs7, %r12; |
| ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r13, %rs8; |
| ; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs9, %r16; |
| ; CHECK-NEXT: add.s16 %rs10, %rs9, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r17, %rs10; |
| ; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs11, %r18; |
| ; CHECK-NEXT: add.s16 %rs12, %rs11, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r19, %rs12; |
| ; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs13, %r21; |
| ; CHECK-NEXT: add.s16 %rs14, %rs13, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r22, %rs14; |
| ; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs15, %r23; |
| ; CHECK-NEXT: add.s16 %rs16, %rs15, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r24, %rs16; |
| ; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs17, %r27; |
| ; CHECK-NEXT: add.s16 %rs18, %rs17, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r28, %rs18; |
| ; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs19, %r29; |
| ; CHECK-NEXT: add.s16 %rs20, %rs19, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r30, %rs20; |
| ; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs21, %r32; |
| ; CHECK-NEXT: add.s16 %rs22, %rs21, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r33, %rs22; |
| ; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs23, %r34; |
| ; CHECK-NEXT: add.s16 %rs24, %rs23, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r35, %rs24; |
| ; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs25, %r38; |
| ; CHECK-NEXT: add.s16 %rs26, %rs25, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r39, %rs26; |
| ; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs27, %r40; |
| ; CHECK-NEXT: add.s16 %rs28, %rs27, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r41, %rs28; |
| ; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs29, %r43; |
| ; CHECK-NEXT: add.s16 %rs30, %rs29, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r44, %rs30; |
| ; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs31, %r45; |
| ; CHECK-NEXT: add.s16 %rs32, %rs31, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r46, %rs32; |
| ; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U; |
| ; CHECK-NEXT: st.volatile.v4.b32 [%rd1], {%r48, %r37, %r26, %r15}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <16 x i8>, ptr %a |
| %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> |
| store volatile <16 x i8> %a.add, ptr %a |
| ret void |
| } |
| |
| ; Volatile load of a <2 x i16> through a generic pointer, add 1 to every |
| ; lane, volatile store of the result. The expected PTX moves the whole |
| ; vector with a single scalar ld.volatile.u32 / st.volatile.u32 and uses |
| ; mov.b32 to split it into, and rebuild it from, two 16-bit registers. |
| define void @generic_volatile_2xi16(ptr %a) { |
| ; CHECK-LABEL: generic_volatile_2xi16( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<5>; |
| ; CHECK-NEXT: .reg .b32 %r<3>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xi16_param_0]; |
| ; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1]; |
| ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; |
| ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; |
| ; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; |
| ; CHECK-NEXT: st.volatile.u32 [%rd1], %r2; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <2 x i16>, ptr %a |
| %a.add = add <2 x i16> %a.load, <i16 1, i16 1> |
| store volatile <2 x i16> %a.add, ptr %a |
| ret void |
| } |
| |
| ; Volatile load of a <4 x i16> through a generic pointer, add 1 to every |
| ; lane, volatile store of the result. The expected PTX uses one |
| ; ld.volatile.v4.u16 / st.volatile.v4.u16 pair with one add.s16 per lane. |
| define void @generic_volatile_4xi16(ptr %a) { |
| ; CHECK-LABEL: generic_volatile_4xi16( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xi16_param_0]; |
| ; CHECK-NEXT: ld.volatile.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; |
| ; CHECK-NEXT: add.s16 %rs5, %rs4, 1; |
| ; CHECK-NEXT: add.s16 %rs6, %rs3, 1; |
| ; CHECK-NEXT: add.s16 %rs7, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs8, %rs1, 1; |
| ; CHECK-NEXT: st.volatile.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <4 x i16>, ptr %a |
| %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> |
| store volatile <4 x i16> %a.add, ptr %a |
| ret void |
| } |
| |
| ; Volatile load of an <8 x i16> through a generic pointer, add 1 to every |
| ; lane, volatile store of the result. The expected PTX uses one |
| ; ld.volatile.v4.b32 / st.volatile.v4.b32 pair; each 32-bit element is |
| ; split into two 16-bit registers with mov.b32, incremented, and rebuilt |
| ; with mov.b32. |
| define void @generic_volatile_8xi16(ptr %a) { |
| ; CHECK-LABEL: generic_volatile_8xi16( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<17>; |
| ; CHECK-NEXT: .reg .b32 %r<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_8xi16_param_0]; |
| ; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; |
| ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; |
| ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; |
| ; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; |
| ; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3; |
| ; CHECK-NEXT: add.s16 %rs7, %rs6, 1; |
| ; CHECK-NEXT: add.s16 %rs8, %rs5, 1; |
| ; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7}; |
| ; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2; |
| ; CHECK-NEXT: add.s16 %rs11, %rs10, 1; |
| ; CHECK-NEXT: add.s16 %rs12, %rs9, 1; |
| ; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11}; |
| ; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1; |
| ; CHECK-NEXT: add.s16 %rs15, %rs14, 1; |
| ; CHECK-NEXT: add.s16 %rs16, %rs13, 1; |
| ; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15}; |
| ; CHECK-NEXT: st.volatile.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <8 x i16>, ptr %a |
| %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> |
| store volatile <8 x i16> %a.add, ptr %a |
| ret void |
| } |
| |
| ; Volatile load of a <2 x i32> through a generic pointer, add 1 to every |
| ; lane, volatile store of the result. The expected PTX uses one |
| ; ld.volatile.v2.u32 / st.volatile.v2.u32 pair with one add.s32 per lane. |
| define void @generic_volatile_2xi32(ptr %a) { |
| ; CHECK-LABEL: generic_volatile_2xi32( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b32 %r<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xi32_param_0]; |
| ; CHECK-NEXT: ld.volatile.v2.u32 {%r1, %r2}, [%rd1]; |
| ; CHECK-NEXT: add.s32 %r3, %r2, 1; |
| ; CHECK-NEXT: add.s32 %r4, %r1, 1; |
| ; CHECK-NEXT: st.volatile.v2.u32 [%rd1], {%r4, %r3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <2 x i32>, ptr %a |
| %a.add = add <2 x i32> %a.load, <i32 1, i32 1> |
| store volatile <2 x i32> %a.add, ptr %a |
| ret void |
| } |
| |
| ; Volatile load of a <4 x i32> through a generic pointer, add 1 to every |
| ; lane, volatile store of the result. The expected PTX uses one |
| ; ld.volatile.v4.u32 / st.volatile.v4.u32 pair with one add.s32 per lane. |
| define void @generic_volatile_4xi32(ptr %a) { |
| ; CHECK-LABEL: generic_volatile_4xi32( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b32 %r<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xi32_param_0]; |
| ; CHECK-NEXT: ld.volatile.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; |
| ; CHECK-NEXT: add.s32 %r5, %r4, 1; |
| ; CHECK-NEXT: add.s32 %r6, %r3, 1; |
| ; CHECK-NEXT: add.s32 %r7, %r2, 1; |
| ; CHECK-NEXT: add.s32 %r8, %r1, 1; |
| ; CHECK-NEXT: st.volatile.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <4 x i32>, ptr %a |
| %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> |
| store volatile <4 x i32> %a.add, ptr %a |
| ret void |
| } |
| |
| ; Volatile load of a <2 x i64> through a generic pointer, add 1 to every |
| ; lane, volatile store of the result. The expected PTX uses one |
| ; ld.volatile.v2.u64 / st.volatile.v2.u64 pair with one add.s64 per lane. |
| define void @generic_volatile_2xi64(ptr %a) { |
| ; CHECK-LABEL: generic_volatile_2xi64( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b64 %rd<6>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xi64_param_0]; |
| ; CHECK-NEXT: ld.volatile.v2.u64 {%rd2, %rd3}, [%rd1]; |
| ; CHECK-NEXT: add.s64 %rd4, %rd3, 1; |
| ; CHECK-NEXT: add.s64 %rd5, %rd2, 1; |
| ; CHECK-NEXT: st.volatile.v2.u64 [%rd1], {%rd5, %rd4}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <2 x i64>, ptr %a |
| %a.add = add <2 x i64> %a.load, <i64 1, i64 1> |
| store volatile <2 x i64> %a.add, ptr %a |
| ret void |
| } |
| |
| ; Volatile load of a <2 x float> through a generic pointer, add 1.0 to |
| ; every lane, volatile store of the result. The expected PTX uses one |
| ; ld.volatile.v2.f32 / st.volatile.v2.f32 pair with one add.rn.f32 per lane. |
| define void @generic_volatile_2xfloat(ptr %a) { |
| ; CHECK-LABEL: generic_volatile_2xfloat( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .f32 %f<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xfloat_param_0]; |
| ; CHECK-NEXT: ld.volatile.v2.f32 {%f1, %f2}, [%rd1]; |
| ; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; |
| ; CHECK-NEXT: st.volatile.v2.f32 [%rd1], {%f4, %f3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <2 x float>, ptr %a |
| %a.add = fadd <2 x float> %a.load, <float 1., float 1.> |
| store volatile <2 x float> %a.add, ptr %a |
| ret void |
| } |
| |
| ; Volatile load, element-wise fadd of 1.0, and volatile store of <4 x float> |
| ; through a plain `ptr` argument. Expected PTX: ld.volatile.v4.f32, four |
| ; add.rn.f32 with the immediate 0f3F800000, and st.volatile.v4.f32. |
| define void @generic_volatile_4xfloat(ptr %a) { |
| ; CHECK-LABEL: generic_volatile_4xfloat( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .f32 %f<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xfloat_param_0]; |
| ; CHECK-NEXT: ld.volatile.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; |
| ; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; |
| ; CHECK-NEXT: st.volatile.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <4 x float>, ptr %a |
| %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> |
| store volatile <4 x float> %a.add, ptr %a |
| ret void |
| } |
| |
| ; Volatile load, element-wise fadd of 1.0, and volatile store of <2 x double> |
| ; through a plain `ptr` argument. Expected PTX: ld.volatile.v2.f64, two |
| ; add.rn.f64 with the immediate 0d3FF0000000000000, and st.volatile.v2.f64. |
| define void @generic_volatile_2xdouble(ptr %a) { |
| ; CHECK-LABEL: generic_volatile_2xdouble( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-NEXT: .reg .f64 %fd<5>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xdouble_param_0]; |
| ; CHECK-NEXT: ld.volatile.v2.f64 {%fd1, %fd2}, [%rd1]; |
| ; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; |
| ; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; |
| ; CHECK-NEXT: st.volatile.v2.f64 [%rd1], {%fd4, %fd3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <2 x double>, ptr %a |
| %a.add = fadd <2 x double> %a.load, <double 1., double 1.> |
| store volatile <2 x double> %a.add, ptr %a |
| ret void |
| } |
| |
| ;; global statespace |
| |
| ; global |
| |
| ; Non-volatile load, element-wise +1, and store of <2 x i8> in addrspace(1). |
| ; Expected PTX: ld.global.v2.u8 into two 16-bit registers, two add.s16, and |
| ; st.global.v2.u8. |
| define void @global_2xi8(ptr addrspace(1) %a) { |
| ; CHECK-LABEL: global_2xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [global_2xi8_param_0]; |
| ; CHECK-NEXT: ld.global.v2.u8 {%rs1, %rs2}, [%rd1]; |
| ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; |
| ; CHECK-NEXT: st.global.v2.u8 [%rd1], {%rs4, %rs3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <2 x i8>, ptr addrspace(1) %a |
| %a.add = add <2 x i8> %a.load, <i8 1, i8 1> |
| store <2 x i8> %a.add, ptr addrspace(1) %a |
| ret void |
| } |
| |
| ; Non-volatile load, element-wise +1, and store of <4 x i8> in addrspace(1). |
| ; Expected PTX: the vector is accessed as one 32-bit word (ld.global.u32 / |
| ; st.global.u32); each byte is extracted with bfe.u32, incremented with |
| ; add.s16, and the four results are repacked with prmt.b32. |
| define void @global_4xi8(ptr addrspace(1) %a) { |
| ; CHECK-LABEL: global_4xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<9>; |
| ; CHECK-NEXT: .reg .b32 %r<13>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [global_4xi8_param_0]; |
| ; CHECK-NEXT: ld.global.u32 %r1, [%rd1]; |
| ; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; |
| ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; |
| ; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; |
| ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; |
| ; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; |
| ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; |
| ; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; |
| ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; |
| ; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; |
| ; CHECK-NEXT: st.global.u32 [%rd1], %r12; |
| ; CHECK-NEXT: ret; |
| %a.load = load <4 x i8>, ptr addrspace(1) %a |
| %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> |
| store <4 x i8> %a.add, ptr addrspace(1) %a |
| ret void |
| } |
| |
| ; Non-volatile load, element-wise +1, and store of <8 x i8> in addrspace(1). |
| ; Expected PTX: the vector is accessed as two 32-bit words (ld.global.v2.b32 / |
| ; st.global.v2.b32); within each word every byte is extracted with bfe.u32, |
| ; incremented with add.s16, and repacked with prmt.b32. |
| define void @global_8xi8(ptr addrspace(1) %a) { |
| ; CHECK-LABEL: global_8xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<17>; |
| ; CHECK-NEXT: .reg .b32 %r<25>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [global_8xi8_param_0]; |
| ; CHECK-NEXT: ld.global.v2.b32 {%r1, %r2}, [%rd1]; |
| ; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; |
| ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r4, %rs2; |
| ; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r5; |
| ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r6, %rs4; |
| ; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs5, %r8; |
| ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r9, %rs6; |
| ; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; |
| ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; |
| ; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs9, %r14; |
| ; CHECK-NEXT: add.s16 %rs10, %rs9, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r15, %rs10; |
| ; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs11, %r16; |
| ; CHECK-NEXT: add.s16 %rs12, %rs11, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r17, %rs12; |
| ; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs13, %r19; |
| ; CHECK-NEXT: add.s16 %rs14, %rs13, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r20, %rs14; |
| ; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs15, %r21; |
| ; CHECK-NEXT: add.s16 %rs16, %rs15, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r22, %rs16; |
| ; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U; |
| ; CHECK-NEXT: st.global.v2.b32 [%rd1], {%r24, %r13}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <8 x i8>, ptr addrspace(1) %a |
| %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> |
| store <8 x i8> %a.add, ptr addrspace(1) %a |
| ret void |
| } |
| |
| ; Non-volatile load, element-wise +1, and store of <16 x i8> in addrspace(1). |
| ; Expected PTX: the vector is accessed as four 32-bit words (ld.global.v4.b32 / |
| ; st.global.v4.b32); within each word every byte is extracted with bfe.u32, |
| ; incremented with add.s16, and repacked with prmt.b32. |
| define void @global_16xi8(ptr addrspace(1) %a) { |
| ; CHECK-LABEL: global_16xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<33>; |
| ; CHECK-NEXT: .reg .b32 %r<49>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [global_16xi8_param_0]; |
| ; CHECK-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; |
| ; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; |
| ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r6, %rs2; |
| ; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r7; |
| ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r8, %rs4; |
| ; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs5, %r10; |
| ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r11, %rs6; |
| ; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs7, %r12; |
| ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r13, %rs8; |
| ; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs9, %r16; |
| ; CHECK-NEXT: add.s16 %rs10, %rs9, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r17, %rs10; |
| ; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs11, %r18; |
| ; CHECK-NEXT: add.s16 %rs12, %rs11, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r19, %rs12; |
| ; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs13, %r21; |
| ; CHECK-NEXT: add.s16 %rs14, %rs13, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r22, %rs14; |
| ; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs15, %r23; |
| ; CHECK-NEXT: add.s16 %rs16, %rs15, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r24, %rs16; |
| ; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs17, %r27; |
| ; CHECK-NEXT: add.s16 %rs18, %rs17, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r28, %rs18; |
| ; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs19, %r29; |
| ; CHECK-NEXT: add.s16 %rs20, %rs19, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r30, %rs20; |
| ; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs21, %r32; |
| ; CHECK-NEXT: add.s16 %rs22, %rs21, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r33, %rs22; |
| ; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs23, %r34; |
| ; CHECK-NEXT: add.s16 %rs24, %rs23, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r35, %rs24; |
| ; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs25, %r38; |
| ; CHECK-NEXT: add.s16 %rs26, %rs25, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r39, %rs26; |
| ; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs27, %r40; |
| ; CHECK-NEXT: add.s16 %rs28, %rs27, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r41, %rs28; |
| ; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs29, %r43; |
| ; CHECK-NEXT: add.s16 %rs30, %rs29, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r44, %rs30; |
| ; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs31, %r45; |
| ; CHECK-NEXT: add.s16 %rs32, %rs31, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r46, %rs32; |
| ; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U; |
| ; CHECK-NEXT: st.global.v4.b32 [%rd1], {%r48, %r37, %r26, %r15}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <16 x i8>, ptr addrspace(1) %a |
| %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> |
| store <16 x i8> %a.add, ptr addrspace(1) %a |
| ret void |
| } |
| |
| ; Non-volatile load, element-wise +1, and store of <2 x i16> in addrspace(1). |
| ; Expected PTX: the vector is accessed as one 32-bit word (ld.global.u32 / |
| ; st.global.u32); mov.b32 splits it into two 16-bit registers for the |
| ; add.s16 ops and packs the results back. |
| define void @global_2xi16(ptr addrspace(1) %a) { |
| ; CHECK-LABEL: global_2xi16( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<5>; |
| ; CHECK-NEXT: .reg .b32 %r<3>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [global_2xi16_param_0]; |
| ; CHECK-NEXT: ld.global.u32 %r1, [%rd1]; |
| ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; |
| ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; |
| ; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; |
| ; CHECK-NEXT: st.global.u32 [%rd1], %r2; |
| ; CHECK-NEXT: ret; |
| %a.load = load <2 x i16>, ptr addrspace(1) %a |
| %a.add = add <2 x i16> %a.load, <i16 1, i16 1> |
| store <2 x i16> %a.add, ptr addrspace(1) %a |
| ret void |
| } |
| |
| ; Non-volatile load, element-wise +1, and store of <4 x i16> in addrspace(1). |
| ; Expected PTX: ld.global.v4.u16 directly into four 16-bit registers, four |
| ; add.s16, and st.global.v4.u16 (no 32-bit packing, unlike the 2 x i16 and |
| ; 8 x i16 variants of this test). |
| define void @global_4xi16(ptr addrspace(1) %a) { |
| ; CHECK-LABEL: global_4xi16( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [global_4xi16_param_0]; |
| ; CHECK-NEXT: ld.global.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; |
| ; CHECK-NEXT: add.s16 %rs5, %rs4, 1; |
| ; CHECK-NEXT: add.s16 %rs6, %rs3, 1; |
| ; CHECK-NEXT: add.s16 %rs7, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs8, %rs1, 1; |
| ; CHECK-NEXT: st.global.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <4 x i16>, ptr addrspace(1) %a |
| %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> |
| store <4 x i16> %a.add, ptr addrspace(1) %a |
| ret void |
| } |
| |
| ; Non-volatile load, element-wise +1, and store of <8 x i16> in addrspace(1). |
| ; Expected PTX: the vector is accessed as four 32-bit words (ld.global.v4.b32 / |
| ; st.global.v4.b32); each word is split with mov.b32 into two 16-bit |
| ; registers, incremented with add.s16, and packed back with mov.b32. |
| define void @global_8xi16(ptr addrspace(1) %a) { |
| ; CHECK-LABEL: global_8xi16( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<17>; |
| ; CHECK-NEXT: .reg .b32 %r<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [global_8xi16_param_0]; |
| ; CHECK-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; |
| ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; |
| ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; |
| ; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; |
| ; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3; |
| ; CHECK-NEXT: add.s16 %rs7, %rs6, 1; |
| ; CHECK-NEXT: add.s16 %rs8, %rs5, 1; |
| ; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7}; |
| ; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2; |
| ; CHECK-NEXT: add.s16 %rs11, %rs10, 1; |
| ; CHECK-NEXT: add.s16 %rs12, %rs9, 1; |
| ; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11}; |
| ; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1; |
| ; CHECK-NEXT: add.s16 %rs15, %rs14, 1; |
| ; CHECK-NEXT: add.s16 %rs16, %rs13, 1; |
| ; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15}; |
| ; CHECK-NEXT: st.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <8 x i16>, ptr addrspace(1) %a |
| %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> |
| store <8 x i16> %a.add, ptr addrspace(1) %a |
| ret void |
| } |
| |
| ; Non-volatile load, element-wise +1, and store of <2 x i32> in addrspace(1). |
| ; Expected PTX: ld.global.v2.u32, two add.s32, and st.global.v2.u32. |
| define void @global_2xi32(ptr addrspace(1) %a) { |
| ; CHECK-LABEL: global_2xi32( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b32 %r<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [global_2xi32_param_0]; |
| ; CHECK-NEXT: ld.global.v2.u32 {%r1, %r2}, [%rd1]; |
| ; CHECK-NEXT: add.s32 %r3, %r2, 1; |
| ; CHECK-NEXT: add.s32 %r4, %r1, 1; |
| ; CHECK-NEXT: st.global.v2.u32 [%rd1], {%r4, %r3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <2 x i32>, ptr addrspace(1) %a |
| %a.add = add <2 x i32> %a.load, <i32 1, i32 1> |
| store <2 x i32> %a.add, ptr addrspace(1) %a |
| ret void |
| } |
| |
| ; Non-volatile load, element-wise +1, and store of <4 x i32> in addrspace(1). |
| ; Expected PTX: ld.global.v4.u32, four add.s32, and st.global.v4.u32. |
| define void @global_4xi32(ptr addrspace(1) %a) { |
| ; CHECK-LABEL: global_4xi32( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b32 %r<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [global_4xi32_param_0]; |
| ; CHECK-NEXT: ld.global.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; |
| ; CHECK-NEXT: add.s32 %r5, %r4, 1; |
| ; CHECK-NEXT: add.s32 %r6, %r3, 1; |
| ; CHECK-NEXT: add.s32 %r7, %r2, 1; |
| ; CHECK-NEXT: add.s32 %r8, %r1, 1; |
| ; CHECK-NEXT: st.global.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <4 x i32>, ptr addrspace(1) %a |
| %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> |
| store <4 x i32> %a.add, ptr addrspace(1) %a |
| ret void |
| } |
| |
| ; Non-volatile load, element-wise +1, and store of <2 x i64> in addrspace(1). |
| ; Expected PTX: ld.global.v2.u64, two add.s64, and st.global.v2.u64. |
| define void @global_2xi64(ptr addrspace(1) %a) { |
| ; CHECK-LABEL: global_2xi64( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b64 %rd<6>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [global_2xi64_param_0]; |
| ; CHECK-NEXT: ld.global.v2.u64 {%rd2, %rd3}, [%rd1]; |
| ; CHECK-NEXT: add.s64 %rd4, %rd3, 1; |
| ; CHECK-NEXT: add.s64 %rd5, %rd2, 1; |
| ; CHECK-NEXT: st.global.v2.u64 [%rd1], {%rd5, %rd4}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <2 x i64>, ptr addrspace(1) %a |
| %a.add = add <2 x i64> %a.load, <i64 1, i64 1> |
| store <2 x i64> %a.add, ptr addrspace(1) %a |
| ret void |
| } |
| |
| ; Non-volatile load, element-wise fadd of 1.0, and store of <2 x float> in |
| ; addrspace(1). Expected PTX: ld.global.v2.f32, two add.rn.f32 with the |
| ; immediate 0f3F800000, and st.global.v2.f32. |
| define void @global_2xfloat(ptr addrspace(1) %a) { |
| ; CHECK-LABEL: global_2xfloat( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .f32 %f<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [global_2xfloat_param_0]; |
| ; CHECK-NEXT: ld.global.v2.f32 {%f1, %f2}, [%rd1]; |
| ; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; |
| ; CHECK-NEXT: st.global.v2.f32 [%rd1], {%f4, %f3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <2 x float>, ptr addrspace(1) %a |
| %a.add = fadd <2 x float> %a.load, <float 1., float 1.> |
| store <2 x float> %a.add, ptr addrspace(1) %a |
| ret void |
| } |
| |
| ; Non-volatile load, element-wise fadd of 1.0, and store of <4 x float> in |
| ; addrspace(1). Expected PTX: ld.global.v4.f32, four add.rn.f32 with the |
| ; immediate 0f3F800000, and st.global.v4.f32. |
| define void @global_4xfloat(ptr addrspace(1) %a) { |
| ; CHECK-LABEL: global_4xfloat( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .f32 %f<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [global_4xfloat_param_0]; |
| ; CHECK-NEXT: ld.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; |
| ; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; |
| ; CHECK-NEXT: st.global.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <4 x float>, ptr addrspace(1) %a |
| %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> |
| store <4 x float> %a.add, ptr addrspace(1) %a |
| ret void |
| } |
| |
| ; Non-volatile load, element-wise fadd of 1.0, and store of <2 x double> in |
| ; addrspace(1). Expected PTX: ld.global.v2.f64, two add.rn.f64 with the |
| ; immediate 0d3FF0000000000000, and st.global.v2.f64. |
| define void @global_2xdouble(ptr addrspace(1) %a) { |
| ; CHECK-LABEL: global_2xdouble( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-NEXT: .reg .f64 %fd<5>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [global_2xdouble_param_0]; |
| ; CHECK-NEXT: ld.global.v2.f64 {%fd1, %fd2}, [%rd1]; |
| ; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; |
| ; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; |
| ; CHECK-NEXT: st.global.v2.f64 [%rd1], {%fd4, %fd3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <2 x double>, ptr addrspace(1) %a |
| %a.add = fadd <2 x double> %a.load, <double 1., double 1.> |
| store <2 x double> %a.add, ptr addrspace(1) %a |
| ret void |
| } |
| |
| ; global_volatile |
| |
| ; Volatile load, element-wise +1, and volatile store of <2 x i8> in |
| ; addrspace(1). Expected PTX: ld.volatile.global.v2.u8 into two 16-bit |
| ; registers, two add.s16, and st.volatile.global.v2.u8. |
| define void @global_volatile_2xi8(ptr addrspace(1) %a) { |
| ; CHECK-LABEL: global_volatile_2xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi8_param_0]; |
| ; CHECK-NEXT: ld.volatile.global.v2.u8 {%rs1, %rs2}, [%rd1]; |
| ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; |
| ; CHECK-NEXT: st.volatile.global.v2.u8 [%rd1], {%rs4, %rs3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <2 x i8>, ptr addrspace(1) %a |
| %a.add = add <2 x i8> %a.load, <i8 1, i8 1> |
| store volatile <2 x i8> %a.add, ptr addrspace(1) %a |
| ret void |
| } |
| |
| ; Volatile load, element-wise +1, and volatile store of <4 x i8> in |
| ; addrspace(1). Expected PTX: the vector is accessed as one 32-bit word |
| ; (ld.volatile.global.u32 / st.volatile.global.u32); each byte is extracted |
| ; with bfe.u32, incremented with add.s16, and repacked with prmt.b32. |
| define void @global_volatile_4xi8(ptr addrspace(1) %a) { |
| ; CHECK-LABEL: global_volatile_4xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<9>; |
| ; CHECK-NEXT: .reg .b32 %r<13>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xi8_param_0]; |
| ; CHECK-NEXT: ld.volatile.global.u32 %r1, [%rd1]; |
| ; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; |
| ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; |
| ; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; |
| ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; |
| ; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; |
| ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; |
| ; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; |
| ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; |
| ; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; |
| ; CHECK-NEXT: st.volatile.global.u32 [%rd1], %r12; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <4 x i8>, ptr addrspace(1) %a |
| %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> |
| store volatile <4 x i8> %a.add, ptr addrspace(1) %a |
| ret void |
| } |
| |
| ; Volatile load, element-wise +1, and volatile store of <8 x i8> in |
| ; addrspace(1). Expected PTX: the vector is accessed as two 32-bit words |
| ; (ld.volatile.global.v2.b32 / st.volatile.global.v2.b32); within each word |
| ; every byte is extracted with bfe.u32, incremented with add.s16, and |
| ; repacked with prmt.b32. |
| define void @global_volatile_8xi8(ptr addrspace(1) %a) { |
| ; CHECK-LABEL: global_volatile_8xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<17>; |
| ; CHECK-NEXT: .reg .b32 %r<25>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_8xi8_param_0]; |
| ; CHECK-NEXT: ld.volatile.global.v2.b32 {%r1, %r2}, [%rd1]; |
| ; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; |
| ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r4, %rs2; |
| ; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r5; |
| ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r6, %rs4; |
| ; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs5, %r8; |
| ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r9, %rs6; |
| ; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; |
| ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; |
| ; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs9, %r14; |
| ; CHECK-NEXT: add.s16 %rs10, %rs9, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r15, %rs10; |
| ; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs11, %r16; |
| ; CHECK-NEXT: add.s16 %rs12, %rs11, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r17, %rs12; |
| ; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs13, %r19; |
| ; CHECK-NEXT: add.s16 %rs14, %rs13, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r20, %rs14; |
| ; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs15, %r21; |
| ; CHECK-NEXT: add.s16 %rs16, %rs15, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r22, %rs16; |
| ; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U; |
| ; CHECK-NEXT: st.volatile.global.v2.b32 [%rd1], {%r24, %r13}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <8 x i8>, ptr addrspace(1) %a |
| %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> |
| store volatile <8 x i8> %a.add, ptr addrspace(1) %a |
| ret void |
| } |
| |
| ; Volatile load, element-wise +1, and volatile store of <16 x i8> in |
| ; addrspace(1). Expected PTX: the vector is accessed as four 32-bit words |
| ; (ld.volatile.global.v4.b32 / st.volatile.global.v4.b32); within each word |
| ; every byte is extracted with bfe.u32, incremented with add.s16, and |
| ; repacked with prmt.b32. |
| define void @global_volatile_16xi8(ptr addrspace(1) %a) { |
| ; CHECK-LABEL: global_volatile_16xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<33>; |
| ; CHECK-NEXT: .reg .b32 %r<49>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_16xi8_param_0]; |
| ; CHECK-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; |
| ; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; |
| ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r6, %rs2; |
| ; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r7; |
| ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r8, %rs4; |
| ; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs5, %r10; |
| ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r11, %rs6; |
| ; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs7, %r12; |
| ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r13, %rs8; |
| ; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs9, %r16; |
| ; CHECK-NEXT: add.s16 %rs10, %rs9, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r17, %rs10; |
| ; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs11, %r18; |
| ; CHECK-NEXT: add.s16 %rs12, %rs11, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r19, %rs12; |
| ; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs13, %r21; |
| ; CHECK-NEXT: add.s16 %rs14, %rs13, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r22, %rs14; |
| ; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs15, %r23; |
| ; CHECK-NEXT: add.s16 %rs16, %rs15, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r24, %rs16; |
| ; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs17, %r27; |
| ; CHECK-NEXT: add.s16 %rs18, %rs17, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r28, %rs18; |
| ; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs19, %r29; |
| ; CHECK-NEXT: add.s16 %rs20, %rs19, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r30, %rs20; |
| ; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs21, %r32; |
| ; CHECK-NEXT: add.s16 %rs22, %rs21, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r33, %rs22; |
| ; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs23, %r34; |
| ; CHECK-NEXT: add.s16 %rs24, %rs23, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r35, %rs24; |
| ; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs25, %r38; |
| ; CHECK-NEXT: add.s16 %rs26, %rs25, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r39, %rs26; |
| ; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs27, %r40; |
| ; CHECK-NEXT: add.s16 %rs28, %rs27, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r41, %rs28; |
| ; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs29, %r43; |
| ; CHECK-NEXT: add.s16 %rs30, %rs29, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r44, %rs30; |
| ; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs31, %r45; |
| ; CHECK-NEXT: add.s16 %rs32, %rs31, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r46, %rs32; |
| ; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U; |
| ; CHECK-NEXT: st.volatile.global.v4.b32 [%rd1], {%r48, %r37, %r26, %r15}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <16 x i8>, ptr addrspace(1) %a |
| %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> |
| store volatile <16 x i8> %a.add, ptr addrspace(1) %a |
| ret void |
| } |
| |
| ; Volatile load, element-wise +1, and volatile store of <2 x i16> in |
| ; addrspace(1). Expected PTX: the vector is accessed as one 32-bit word |
| ; (ld.volatile.global.u32 / st.volatile.global.u32); mov.b32 splits it into |
| ; two 16-bit registers for the add.s16 ops and packs the results back. |
| define void @global_volatile_2xi16(ptr addrspace(1) %a) { |
| ; CHECK-LABEL: global_volatile_2xi16( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<5>; |
| ; CHECK-NEXT: .reg .b32 %r<3>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi16_param_0]; |
| ; CHECK-NEXT: ld.volatile.global.u32 %r1, [%rd1]; |
| ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; |
| ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; |
| ; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; |
| ; CHECK-NEXT: st.volatile.global.u32 [%rd1], %r2; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <2 x i16>, ptr addrspace(1) %a |
| %a.add = add <2 x i16> %a.load, <i16 1, i16 1> |
| store volatile <2 x i16> %a.add, ptr addrspace(1) %a |
| ret void |
| } |
| |
| ; Volatile load, element-wise +1, and volatile store of <4 x i16> in |
| ; addrspace(1). Expected PTX: ld.volatile.global.v4.u16 directly into four |
| ; 16-bit registers, four add.s16, and st.volatile.global.v4.u16. |
| define void @global_volatile_4xi16(ptr addrspace(1) %a) { |
| ; CHECK-LABEL: global_volatile_4xi16( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xi16_param_0]; |
| ; CHECK-NEXT: ld.volatile.global.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; |
| ; CHECK-NEXT: add.s16 %rs5, %rs4, 1; |
| ; CHECK-NEXT: add.s16 %rs6, %rs3, 1; |
| ; CHECK-NEXT: add.s16 %rs7, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs8, %rs1, 1; |
| ; CHECK-NEXT: st.volatile.global.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <4 x i16>, ptr addrspace(1) %a |
| %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> |
| store volatile <4 x i16> %a.add, ptr addrspace(1) %a |
| ret void |
| } |
| |
| ; Volatile load, element-wise +1, and volatile store of <8 x i16> in |
| ; addrspace(1). Expected PTX: the vector is accessed as four 32-bit words |
| ; (ld.volatile.global.v4.b32 / st.volatile.global.v4.b32); each word is split |
| ; with mov.b32 into two 16-bit registers, incremented with add.s16, and |
| ; packed back with mov.b32. |
| define void @global_volatile_8xi16(ptr addrspace(1) %a) { |
| ; CHECK-LABEL: global_volatile_8xi16( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<17>; |
| ; CHECK-NEXT: .reg .b32 %r<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_8xi16_param_0]; |
| ; CHECK-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; |
| ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; |
| ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; |
| ; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; |
| ; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3; |
| ; CHECK-NEXT: add.s16 %rs7, %rs6, 1; |
| ; CHECK-NEXT: add.s16 %rs8, %rs5, 1; |
| ; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7}; |
| ; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2; |
| ; CHECK-NEXT: add.s16 %rs11, %rs10, 1; |
| ; CHECK-NEXT: add.s16 %rs12, %rs9, 1; |
| ; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11}; |
| ; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1; |
| ; CHECK-NEXT: add.s16 %rs15, %rs14, 1; |
| ; CHECK-NEXT: add.s16 %rs16, %rs13, 1; |
| ; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15}; |
| ; CHECK-NEXT: st.volatile.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <8 x i16>, ptr addrspace(1) %a |
| %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> |
| store volatile <8 x i16> %a.add, ptr addrspace(1) %a |
| ret void |
| } |
| |
| ; Volatile load, element-wise +1, and volatile store of <2 x i32> in |
| ; addrspace(1). Expected PTX: ld.volatile.global.v2.u32, two add.s32, and |
| ; st.volatile.global.v2.u32. |
| define void @global_volatile_2xi32(ptr addrspace(1) %a) { |
| ; CHECK-LABEL: global_volatile_2xi32( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b32 %r<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi32_param_0]; |
| ; CHECK-NEXT: ld.volatile.global.v2.u32 {%r1, %r2}, [%rd1]; |
| ; CHECK-NEXT: add.s32 %r3, %r2, 1; |
| ; CHECK-NEXT: add.s32 %r4, %r1, 1; |
| ; CHECK-NEXT: st.volatile.global.v2.u32 [%rd1], {%r4, %r3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <2 x i32>, ptr addrspace(1) %a |
| %a.add = add <2 x i32> %a.load, <i32 1, i32 1> |
| store volatile <2 x i32> %a.add, ptr addrspace(1) %a |
| ret void |
| } |
| |
| ; Volatile load, element-wise +1, and volatile store of <4 x i32> in |
| ; addrspace(1). Expected PTX: ld.volatile.global.v4.u32, four add.s32, and |
| ; st.volatile.global.v4.u32. |
| define void @global_volatile_4xi32(ptr addrspace(1) %a) { |
| ; CHECK-LABEL: global_volatile_4xi32( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b32 %r<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xi32_param_0]; |
| ; CHECK-NEXT: ld.volatile.global.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; |
| ; CHECK-NEXT: add.s32 %r5, %r4, 1; |
| ; CHECK-NEXT: add.s32 %r6, %r3, 1; |
| ; CHECK-NEXT: add.s32 %r7, %r2, 1; |
| ; CHECK-NEXT: add.s32 %r8, %r1, 1; |
| ; CHECK-NEXT: st.volatile.global.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <4 x i32>, ptr addrspace(1) %a |
| %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> |
| store volatile <4 x i32> %a.add, ptr addrspace(1) %a |
| ret void |
| } |
| |
| ; Volatile load, element-wise +1, and volatile store of <2 x i64> in |
| ; addrspace(1). Expected PTX: ld.volatile.global.v2.u64, two add.s64, and |
| ; st.volatile.global.v2.u64. |
| define void @global_volatile_2xi64(ptr addrspace(1) %a) { |
| ; CHECK-LABEL: global_volatile_2xi64( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b64 %rd<6>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi64_param_0]; |
| ; CHECK-NEXT: ld.volatile.global.v2.u64 {%rd2, %rd3}, [%rd1]; |
| ; CHECK-NEXT: add.s64 %rd4, %rd3, 1; |
| ; CHECK-NEXT: add.s64 %rd5, %rd2, 1; |
| ; CHECK-NEXT: st.volatile.global.v2.u64 [%rd1], {%rd5, %rd4}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <2 x i64>, ptr addrspace(1) %a |
| %a.add = add <2 x i64> %a.load, <i64 1, i64 1> |
| store volatile <2 x i64> %a.add, ptr addrspace(1) %a |
| ret void |
| } |
| |
| ; Volatile load, element-wise fadd of 1.0, and volatile store of <2 x float> |
| ; in addrspace(1). Expected PTX: ld.volatile.global.v2.f32, two add.rn.f32 |
| ; with the immediate 0f3F800000, and st.volatile.global.v2.f32. |
| define void @global_volatile_2xfloat(ptr addrspace(1) %a) { |
| ; CHECK-LABEL: global_volatile_2xfloat( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .f32 %f<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xfloat_param_0]; |
| ; CHECK-NEXT: ld.volatile.global.v2.f32 {%f1, %f2}, [%rd1]; |
| ; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; |
| ; CHECK-NEXT: st.volatile.global.v2.f32 [%rd1], {%f4, %f3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <2 x float>, ptr addrspace(1) %a |
| %a.add = fadd <2 x float> %a.load, <float 1., float 1.> |
| store volatile <2 x float> %a.add, ptr addrspace(1) %a |
| ret void |
| } |
| |
| ; Volatile <4 x float> load, fadd 1.0 on every lane, volatile store back via an addrspace(1) pointer. |
| ; The checks expect one ld.volatile.global.v4.f32 and one st.volatile.global.v4.f32. |
| define void @global_volatile_4xfloat(ptr addrspace(1) %a) { |
| ; CHECK-LABEL: global_volatile_4xfloat( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .f32 %f<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xfloat_param_0]; |
| ; CHECK-NEXT: ld.volatile.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; |
| ; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; |
| ; CHECK-NEXT: st.volatile.global.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <4 x float>, ptr addrspace(1) %a |
| %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> |
| store volatile <4 x float> %a.add, ptr addrspace(1) %a |
| ret void |
| } |
| |
| ; Volatile <2 x double> load, fadd 1.0 on every lane, volatile store back via an addrspace(1) pointer. |
| ; The checks expect one ld.volatile.global.v2.f64 and one st.volatile.global.v2.f64; |
| ; the constant 1.0 appears as the immediate 0d3FF0000000000000. |
| define void @global_volatile_2xdouble(ptr addrspace(1) %a) { |
| ; CHECK-LABEL: global_volatile_2xdouble( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-NEXT: .reg .f64 %fd<5>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xdouble_param_0]; |
| ; CHECK-NEXT: ld.volatile.global.v2.f64 {%fd1, %fd2}, [%rd1]; |
| ; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; |
| ; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; |
| ; CHECK-NEXT: st.volatile.global.v2.f64 [%rd1], {%fd4, %fd3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <2 x double>, ptr addrspace(1) %a |
| %a.add = fadd <2 x double> %a.load, <double 1., double 1.> |
| store volatile <2 x double> %a.add, ptr addrspace(1) %a |
| ret void |
| } |
| |
| ;; shared statespace |
| |
| ; shared |
| |
| ; Plain (non-volatile, non-atomic) <2 x i8> load, +1 on every lane, store back via an addrspace(3) pointer. |
| ; The checks expect a vector ld.shared.v2.u8 / st.shared.v2.u8 pair, with the adds done in 16-bit registers. |
| define void @shared_2xi8(ptr addrspace(3) %a) { |
| ; CHECK-LABEL: shared_2xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi8_param_0]; |
| ; CHECK-NEXT: ld.shared.v2.u8 {%rs1, %rs2}, [%rd1]; |
| ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; |
| ; CHECK-NEXT: st.shared.v2.u8 [%rd1], {%rs4, %rs3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <2 x i8>, ptr addrspace(3) %a |
| %a.add = add <2 x i8> %a.load, <i8 1, i8 1> |
| store <2 x i8> %a.add, ptr addrspace(3) %a |
| ret void |
| } |
| |
| ; Plain <4 x i8> load, +1 on every lane, store back via an addrspace(3) pointer. |
| ; The checks expect a scalar ld.shared.u32 / st.shared.u32 pair rather than a vector op: |
| ; each byte is extracted with bfe.u32, incremented with a 16-bit add, and the four |
| ; results are repacked into one 32-bit word with prmt.b32. |
| define void @shared_4xi8(ptr addrspace(3) %a) { |
| ; CHECK-LABEL: shared_4xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<9>; |
| ; CHECK-NEXT: .reg .b32 %r<13>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xi8_param_0]; |
| ; CHECK-NEXT: ld.shared.u32 %r1, [%rd1]; |
| ; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; |
| ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; |
| ; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; |
| ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; |
| ; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; |
| ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; |
| ; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; |
| ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; |
| ; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; |
| ; CHECK-NEXT: st.shared.u32 [%rd1], %r12; |
| ; CHECK-NEXT: ret; |
| %a.load = load <4 x i8>, ptr addrspace(3) %a |
| %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> |
| store <4 x i8> %a.add, ptr addrspace(3) %a |
| ret void |
| } |
| |
| ; Plain <8 x i8> load, +1 on every lane, store back via an addrspace(3) pointer. |
| ; The checks expect the data to move as two 32-bit words (ld.shared.v2.b32 / st.shared.v2.b32); |
| ; within each word the bytes are extracted with bfe.u32, incremented with a 16-bit add, |
| ; and repacked with prmt.b32. |
| define void @shared_8xi8(ptr addrspace(3) %a) { |
| ; CHECK-LABEL: shared_8xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<17>; |
| ; CHECK-NEXT: .reg .b32 %r<25>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [shared_8xi8_param_0]; |
| ; CHECK-NEXT: ld.shared.v2.b32 {%r1, %r2}, [%rd1]; |
| ; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; |
| ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r4, %rs2; |
| ; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r5; |
| ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r6, %rs4; |
| ; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs5, %r8; |
| ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r9, %rs6; |
| ; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; |
| ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; |
| ; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs9, %r14; |
| ; CHECK-NEXT: add.s16 %rs10, %rs9, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r15, %rs10; |
| ; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs11, %r16; |
| ; CHECK-NEXT: add.s16 %rs12, %rs11, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r17, %rs12; |
| ; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs13, %r19; |
| ; CHECK-NEXT: add.s16 %rs14, %rs13, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r20, %rs14; |
| ; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs15, %r21; |
| ; CHECK-NEXT: add.s16 %rs16, %rs15, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r22, %rs16; |
| ; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U; |
| ; CHECK-NEXT: st.shared.v2.b32 [%rd1], {%r24, %r13}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <8 x i8>, ptr addrspace(3) %a |
| %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> |
| store <8 x i8> %a.add, ptr addrspace(3) %a |
| ret void |
| } |
| |
| ; Plain <16 x i8> load, +1 on every lane, store back via an addrspace(3) pointer. |
| ; The checks expect the data to move as four 32-bit words (ld.shared.v4.b32 / st.shared.v4.b32); |
| ; within each word the bytes are extracted with bfe.u32, incremented with a 16-bit add, |
| ; and repacked with prmt.b32. |
| define void @shared_16xi8(ptr addrspace(3) %a) { |
| ; CHECK-LABEL: shared_16xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<33>; |
| ; CHECK-NEXT: .reg .b32 %r<49>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [shared_16xi8_param_0]; |
| ; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; |
| ; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; |
| ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r6, %rs2; |
| ; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r7; |
| ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r8, %rs4; |
| ; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs5, %r10; |
| ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r11, %rs6; |
| ; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs7, %r12; |
| ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r13, %rs8; |
| ; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs9, %r16; |
| ; CHECK-NEXT: add.s16 %rs10, %rs9, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r17, %rs10; |
| ; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs11, %r18; |
| ; CHECK-NEXT: add.s16 %rs12, %rs11, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r19, %rs12; |
| ; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs13, %r21; |
| ; CHECK-NEXT: add.s16 %rs14, %rs13, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r22, %rs14; |
| ; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs15, %r23; |
| ; CHECK-NEXT: add.s16 %rs16, %rs15, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r24, %rs16; |
| ; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs17, %r27; |
| ; CHECK-NEXT: add.s16 %rs18, %rs17, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r28, %rs18; |
| ; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs19, %r29; |
| ; CHECK-NEXT: add.s16 %rs20, %rs19, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r30, %rs20; |
| ; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs21, %r32; |
| ; CHECK-NEXT: add.s16 %rs22, %rs21, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r33, %rs22; |
| ; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs23, %r34; |
| ; CHECK-NEXT: add.s16 %rs24, %rs23, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r35, %rs24; |
| ; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs25, %r38; |
| ; CHECK-NEXT: add.s16 %rs26, %rs25, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r39, %rs26; |
| ; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs27, %r40; |
| ; CHECK-NEXT: add.s16 %rs28, %rs27, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r41, %rs28; |
| ; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs29, %r43; |
| ; CHECK-NEXT: add.s16 %rs30, %rs29, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r44, %rs30; |
| ; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs31, %r45; |
| ; CHECK-NEXT: add.s16 %rs32, %rs31, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r46, %rs32; |
| ; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U; |
| ; CHECK-NEXT: st.shared.v4.b32 [%rd1], {%r48, %r37, %r26, %r15}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <16 x i8>, ptr addrspace(3) %a |
| %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> |
| store <16 x i8> %a.add, ptr addrspace(3) %a |
| ret void |
| } |
| |
| ; Plain <2 x i16> load, +1 on every lane, store back via an addrspace(3) pointer. |
| ; The checks expect a scalar ld.shared.u32 / st.shared.u32 pair: the 32-bit word is split |
| ; into two 16-bit registers with mov.b32, each half is incremented, and the halves are |
| ; re-packed with mov.b32 before the store. |
| define void @shared_2xi16(ptr addrspace(3) %a) { |
| ; CHECK-LABEL: shared_2xi16( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<5>; |
| ; CHECK-NEXT: .reg .b32 %r<3>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi16_param_0]; |
| ; CHECK-NEXT: ld.shared.u32 %r1, [%rd1]; |
| ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; |
| ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; |
| ; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; |
| ; CHECK-NEXT: st.shared.u32 [%rd1], %r2; |
| ; CHECK-NEXT: ret; |
| %a.load = load <2 x i16>, ptr addrspace(3) %a |
| %a.add = add <2 x i16> %a.load, <i16 1, i16 1> |
| store <2 x i16> %a.add, ptr addrspace(3) %a |
| ret void |
| } |
| |
| ; Plain <4 x i16> load, +1 on every lane, store back via an addrspace(3) pointer. |
| ; The checks expect one ld.shared.v4.u16 and one st.shared.v4.u16 operating directly on 16-bit registers. |
| define void @shared_4xi16(ptr addrspace(3) %a) { |
| ; CHECK-LABEL: shared_4xi16( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xi16_param_0]; |
| ; CHECK-NEXT: ld.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; |
| ; CHECK-NEXT: add.s16 %rs5, %rs4, 1; |
| ; CHECK-NEXT: add.s16 %rs6, %rs3, 1; |
| ; CHECK-NEXT: add.s16 %rs7, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs8, %rs1, 1; |
| ; CHECK-NEXT: st.shared.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <4 x i16>, ptr addrspace(3) %a |
| %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> |
| store <4 x i16> %a.add, ptr addrspace(3) %a |
| ret void |
| } |
| |
| ; Plain <8 x i16> load, +1 on every lane, store back via an addrspace(3) pointer. |
| ; The checks expect the data to move as four 32-bit words (ld.shared.v4.b32 / st.shared.v4.b32); |
| ; each word is split into two 16-bit halves with mov.b32, incremented, and re-packed. |
| define void @shared_8xi16(ptr addrspace(3) %a) { |
| ; CHECK-LABEL: shared_8xi16( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<17>; |
| ; CHECK-NEXT: .reg .b32 %r<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [shared_8xi16_param_0]; |
| ; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; |
| ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; |
| ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; |
| ; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; |
| ; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3; |
| ; CHECK-NEXT: add.s16 %rs7, %rs6, 1; |
| ; CHECK-NEXT: add.s16 %rs8, %rs5, 1; |
| ; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7}; |
| ; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2; |
| ; CHECK-NEXT: add.s16 %rs11, %rs10, 1; |
| ; CHECK-NEXT: add.s16 %rs12, %rs9, 1; |
| ; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11}; |
| ; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1; |
| ; CHECK-NEXT: add.s16 %rs15, %rs14, 1; |
| ; CHECK-NEXT: add.s16 %rs16, %rs13, 1; |
| ; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15}; |
| ; CHECK-NEXT: st.shared.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <8 x i16>, ptr addrspace(3) %a |
| %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> |
| store <8 x i16> %a.add, ptr addrspace(3) %a |
| ret void |
| } |
| |
| ; Plain <2 x i32> load, +1 on every lane, store back via an addrspace(3) pointer. |
| ; The checks expect one ld.shared.v2.u32 and one st.shared.v2.u32. |
| define void @shared_2xi32(ptr addrspace(3) %a) { |
| ; CHECK-LABEL: shared_2xi32( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b32 %r<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi32_param_0]; |
| ; CHECK-NEXT: ld.shared.v2.u32 {%r1, %r2}, [%rd1]; |
| ; CHECK-NEXT: add.s32 %r3, %r2, 1; |
| ; CHECK-NEXT: add.s32 %r4, %r1, 1; |
| ; CHECK-NEXT: st.shared.v2.u32 [%rd1], {%r4, %r3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <2 x i32>, ptr addrspace(3) %a |
| %a.add = add <2 x i32> %a.load, <i32 1, i32 1> |
| store <2 x i32> %a.add, ptr addrspace(3) %a |
| ret void |
| } |
| |
| ; Plain <4 x i32> load, +1 on every lane, store back via an addrspace(3) pointer. |
| ; The checks expect one ld.shared.v4.u32 and one st.shared.v4.u32. |
| define void @shared_4xi32(ptr addrspace(3) %a) { |
| ; CHECK-LABEL: shared_4xi32( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b32 %r<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xi32_param_0]; |
| ; CHECK-NEXT: ld.shared.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; |
| ; CHECK-NEXT: add.s32 %r5, %r4, 1; |
| ; CHECK-NEXT: add.s32 %r6, %r3, 1; |
| ; CHECK-NEXT: add.s32 %r7, %r2, 1; |
| ; CHECK-NEXT: add.s32 %r8, %r1, 1; |
| ; CHECK-NEXT: st.shared.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <4 x i32>, ptr addrspace(3) %a |
| %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> |
| store <4 x i32> %a.add, ptr addrspace(3) %a |
| ret void |
| } |
| |
| ; Plain <2 x i64> load, +1 on every lane, store back via an addrspace(3) pointer. |
| ; The checks expect one ld.shared.v2.u64 and one st.shared.v2.u64; |
| ; the address and the data lanes all live in %rd registers. |
| define void @shared_2xi64(ptr addrspace(3) %a) { |
| ; CHECK-LABEL: shared_2xi64( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b64 %rd<6>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi64_param_0]; |
| ; CHECK-NEXT: ld.shared.v2.u64 {%rd2, %rd3}, [%rd1]; |
| ; CHECK-NEXT: add.s64 %rd4, %rd3, 1; |
| ; CHECK-NEXT: add.s64 %rd5, %rd2, 1; |
| ; CHECK-NEXT: st.shared.v2.u64 [%rd1], {%rd5, %rd4}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <2 x i64>, ptr addrspace(3) %a |
| %a.add = add <2 x i64> %a.load, <i64 1, i64 1> |
| store <2 x i64> %a.add, ptr addrspace(3) %a |
| ret void |
| } |
| |
| ; Plain <2 x float> load, fadd 1.0 on every lane, store back via an addrspace(3) pointer. |
| ; The checks expect one ld.shared.v2.f32 and one st.shared.v2.f32; |
| ; the constant 1.0 appears as the immediate 0f3F800000. |
| define void @shared_2xfloat(ptr addrspace(3) %a) { |
| ; CHECK-LABEL: shared_2xfloat( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .f32 %f<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xfloat_param_0]; |
| ; CHECK-NEXT: ld.shared.v2.f32 {%f1, %f2}, [%rd1]; |
| ; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; |
| ; CHECK-NEXT: st.shared.v2.f32 [%rd1], {%f4, %f3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <2 x float>, ptr addrspace(3) %a |
| %a.add = fadd <2 x float> %a.load, <float 1., float 1.> |
| store <2 x float> %a.add, ptr addrspace(3) %a |
| ret void |
| } |
| |
| ; Plain <4 x float> load, fadd 1.0 on every lane, store back via an addrspace(3) pointer. |
| ; The checks expect one ld.shared.v4.f32 and one st.shared.v4.f32. |
| define void @shared_4xfloat(ptr addrspace(3) %a) { |
| ; CHECK-LABEL: shared_4xfloat( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .f32 %f<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xfloat_param_0]; |
| ; CHECK-NEXT: ld.shared.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; |
| ; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; |
| ; CHECK-NEXT: st.shared.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <4 x float>, ptr addrspace(3) %a |
| %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> |
| store <4 x float> %a.add, ptr addrspace(3) %a |
| ret void |
| } |
| |
| ; Plain <2 x double> load, fadd 1.0 on every lane, store back via an addrspace(3) pointer. |
| ; The checks expect one ld.shared.v2.f64 and one st.shared.v2.f64; |
| ; the constant 1.0 appears as the immediate 0d3FF0000000000000. |
| define void @shared_2xdouble(ptr addrspace(3) %a) { |
| ; CHECK-LABEL: shared_2xdouble( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-NEXT: .reg .f64 %fd<5>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xdouble_param_0]; |
| ; CHECK-NEXT: ld.shared.v2.f64 {%fd1, %fd2}, [%rd1]; |
| ; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; |
| ; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; |
| ; CHECK-NEXT: st.shared.v2.f64 [%rd1], {%fd4, %fd3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <2 x double>, ptr addrspace(3) %a |
| %a.add = fadd <2 x double> %a.load, <double 1., double 1.> |
| store <2 x double> %a.add, ptr addrspace(3) %a |
| ret void |
| } |
| |
| ; shared_volatile |
| |
| ; Volatile <2 x i8> load, +1 on every lane, volatile store back via an addrspace(3) pointer. |
| ; The checks expect a vector ld.volatile.shared.v2.u8 / st.volatile.shared.v2.u8 pair, |
| ; with the adds done in 16-bit registers. |
| define void @shared_volatile_2xi8(ptr addrspace(3) %a) { |
| ; CHECK-LABEL: shared_volatile_2xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi8_param_0]; |
| ; CHECK-NEXT: ld.volatile.shared.v2.u8 {%rs1, %rs2}, [%rd1]; |
| ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; |
| ; CHECK-NEXT: st.volatile.shared.v2.u8 [%rd1], {%rs4, %rs3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <2 x i8>, ptr addrspace(3) %a |
| %a.add = add <2 x i8> %a.load, <i8 1, i8 1> |
| store volatile <2 x i8> %a.add, ptr addrspace(3) %a |
| ret void |
| } |
| |
| ; Volatile <4 x i8> load, +1 on every lane, volatile store back via an addrspace(3) pointer. |
| ; The checks expect a scalar ld.volatile.shared.u32 / st.volatile.shared.u32 pair rather than |
| ; a vector op: each byte is extracted with bfe.u32, incremented with a 16-bit add, and the |
| ; four results are repacked into one 32-bit word with prmt.b32. |
| define void @shared_volatile_4xi8(ptr addrspace(3) %a) { |
| ; CHECK-LABEL: shared_volatile_4xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<9>; |
| ; CHECK-NEXT: .reg .b32 %r<13>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi8_param_0]; |
| ; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; |
| ; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; |
| ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; |
| ; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; |
| ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; |
| ; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; |
| ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; |
| ; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; |
| ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; |
| ; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; |
| ; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r12; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <4 x i8>, ptr addrspace(3) %a |
| %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> |
| store volatile <4 x i8> %a.add, ptr addrspace(3) %a |
| ret void |
| } |
| |
| ; Volatile <8 x i8> load, +1 on every lane, volatile store back via an addrspace(3) pointer. |
| ; The checks expect the data to move as two 32-bit words |
| ; (ld.volatile.shared.v2.b32 / st.volatile.shared.v2.b32); within each word the bytes are |
| ; extracted with bfe.u32, incremented with a 16-bit add, and repacked with prmt.b32. |
| define void @shared_volatile_8xi8(ptr addrspace(3) %a) { |
| ; CHECK-LABEL: shared_volatile_8xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<17>; |
| ; CHECK-NEXT: .reg .b32 %r<25>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_8xi8_param_0]; |
| ; CHECK-NEXT: ld.volatile.shared.v2.b32 {%r1, %r2}, [%rd1]; |
| ; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; |
| ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r4, %rs2; |
| ; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r5; |
| ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r6, %rs4; |
| ; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs5, %r8; |
| ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r9, %rs6; |
| ; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; |
| ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; |
| ; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs9, %r14; |
| ; CHECK-NEXT: add.s16 %rs10, %rs9, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r15, %rs10; |
| ; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs11, %r16; |
| ; CHECK-NEXT: add.s16 %rs12, %rs11, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r17, %rs12; |
| ; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs13, %r19; |
| ; CHECK-NEXT: add.s16 %rs14, %rs13, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r20, %rs14; |
| ; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs15, %r21; |
| ; CHECK-NEXT: add.s16 %rs16, %rs15, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r22, %rs16; |
| ; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U; |
| ; CHECK-NEXT: st.volatile.shared.v2.b32 [%rd1], {%r24, %r13}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <8 x i8>, ptr addrspace(3) %a |
| %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> |
| store volatile <8 x i8> %a.add, ptr addrspace(3) %a |
| ret void |
| } |
| |
| ; Volatile <16 x i8> load, +1 on every lane, volatile store back via an addrspace(3) pointer. |
| ; The checks expect the data to move as four 32-bit words |
| ; (ld.volatile.shared.v4.b32 / st.volatile.shared.v4.b32); within each word the bytes are |
| ; extracted with bfe.u32, incremented with a 16-bit add, and repacked with prmt.b32. |
| define void @shared_volatile_16xi8(ptr addrspace(3) %a) { |
| ; CHECK-LABEL: shared_volatile_16xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<33>; |
| ; CHECK-NEXT: .reg .b32 %r<49>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_16xi8_param_0]; |
| ; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; |
| ; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; |
| ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r6, %rs2; |
| ; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r7; |
| ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r8, %rs4; |
| ; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs5, %r10; |
| ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r11, %rs6; |
| ; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs7, %r12; |
| ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r13, %rs8; |
| ; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs9, %r16; |
| ; CHECK-NEXT: add.s16 %rs10, %rs9, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r17, %rs10; |
| ; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs11, %r18; |
| ; CHECK-NEXT: add.s16 %rs12, %rs11, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r19, %rs12; |
| ; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs13, %r21; |
| ; CHECK-NEXT: add.s16 %rs14, %rs13, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r22, %rs14; |
| ; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs15, %r23; |
| ; CHECK-NEXT: add.s16 %rs16, %rs15, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r24, %rs16; |
| ; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs17, %r27; |
| ; CHECK-NEXT: add.s16 %rs18, %rs17, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r28, %rs18; |
| ; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs19, %r29; |
| ; CHECK-NEXT: add.s16 %rs20, %rs19, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r30, %rs20; |
| ; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs21, %r32; |
| ; CHECK-NEXT: add.s16 %rs22, %rs21, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r33, %rs22; |
| ; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs23, %r34; |
| ; CHECK-NEXT: add.s16 %rs24, %rs23, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r35, %rs24; |
| ; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs25, %r38; |
| ; CHECK-NEXT: add.s16 %rs26, %rs25, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r39, %rs26; |
| ; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs27, %r40; |
| ; CHECK-NEXT: add.s16 %rs28, %rs27, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r41, %rs28; |
| ; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs29, %r43; |
| ; CHECK-NEXT: add.s16 %rs30, %rs29, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r44, %rs30; |
| ; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs31, %r45; |
| ; CHECK-NEXT: add.s16 %rs32, %rs31, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r46, %rs32; |
| ; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U; |
| ; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd1], {%r48, %r37, %r26, %r15}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <16 x i8>, ptr addrspace(3) %a |
| %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> |
| store volatile <16 x i8> %a.add, ptr addrspace(3) %a |
| ret void |
| } |
| |
| ; Volatile <2 x i16> load, +1 on every lane, volatile store back via an addrspace(3) pointer. |
| ; The checks expect a scalar ld.volatile.shared.u32 / st.volatile.shared.u32 pair: the 32-bit |
| ; word is split into two 16-bit registers with mov.b32, each half is incremented, and the |
| ; halves are re-packed with mov.b32 before the store. |
| define void @shared_volatile_2xi16(ptr addrspace(3) %a) { |
| ; CHECK-LABEL: shared_volatile_2xi16( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<5>; |
| ; CHECK-NEXT: .reg .b32 %r<3>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi16_param_0]; |
| ; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; |
| ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; |
| ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; |
| ; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; |
| ; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <2 x i16>, ptr addrspace(3) %a |
| %a.add = add <2 x i16> %a.load, <i16 1, i16 1> |
| store volatile <2 x i16> %a.add, ptr addrspace(3) %a |
| ret void |
| } |
| |
| ; Volatile <4 x i16> load, +1 on every lane, volatile store back via an addrspace(3) pointer. |
| ; The checks expect one ld.volatile.shared.v4.u16 and one st.volatile.shared.v4.u16 |
| ; operating directly on 16-bit registers. |
| define void @shared_volatile_4xi16(ptr addrspace(3) %a) { |
| ; CHECK-LABEL: shared_volatile_4xi16( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi16_param_0]; |
| ; CHECK-NEXT: ld.volatile.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; |
| ; CHECK-NEXT: add.s16 %rs5, %rs4, 1; |
| ; CHECK-NEXT: add.s16 %rs6, %rs3, 1; |
| ; CHECK-NEXT: add.s16 %rs7, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs8, %rs1, 1; |
| ; CHECK-NEXT: st.volatile.shared.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <4 x i16>, ptr addrspace(3) %a |
| %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> |
| store volatile <4 x i16> %a.add, ptr addrspace(3) %a |
| ret void |
| } |
| |
| ; Volatile <8 x i16> load, +1 on every lane, volatile store back via an addrspace(3) pointer. |
| ; The checks expect the data to move as four 32-bit words |
| ; (ld.volatile.shared.v4.b32 / st.volatile.shared.v4.b32); each word is split into two |
| ; 16-bit halves with mov.b32, incremented, and re-packed. |
| define void @shared_volatile_8xi16(ptr addrspace(3) %a) { |
| ; CHECK-LABEL: shared_volatile_8xi16( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<17>; |
| ; CHECK-NEXT: .reg .b32 %r<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_8xi16_param_0]; |
| ; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; |
| ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; |
| ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; |
| ; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; |
| ; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3; |
| ; CHECK-NEXT: add.s16 %rs7, %rs6, 1; |
| ; CHECK-NEXT: add.s16 %rs8, %rs5, 1; |
| ; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7}; |
| ; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2; |
| ; CHECK-NEXT: add.s16 %rs11, %rs10, 1; |
| ; CHECK-NEXT: add.s16 %rs12, %rs9, 1; |
| ; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11}; |
| ; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1; |
| ; CHECK-NEXT: add.s16 %rs15, %rs14, 1; |
| ; CHECK-NEXT: add.s16 %rs16, %rs13, 1; |
| ; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15}; |
| ; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <8 x i16>, ptr addrspace(3) %a |
| %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> |
| store volatile <8 x i16> %a.add, ptr addrspace(3) %a |
| ret void |
| } |
| |
| ; Volatile <2 x i32> load, +1 on every lane, volatile store back via an addrspace(3) pointer. |
| ; The checks expect one ld.volatile.shared.v2.u32 and one st.volatile.shared.v2.u32. |
| define void @shared_volatile_2xi32(ptr addrspace(3) %a) { |
| ; CHECK-LABEL: shared_volatile_2xi32( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b32 %r<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi32_param_0]; |
| ; CHECK-NEXT: ld.volatile.shared.v2.u32 {%r1, %r2}, [%rd1]; |
| ; CHECK-NEXT: add.s32 %r3, %r2, 1; |
| ; CHECK-NEXT: add.s32 %r4, %r1, 1; |
| ; CHECK-NEXT: st.volatile.shared.v2.u32 [%rd1], {%r4, %r3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <2 x i32>, ptr addrspace(3) %a |
| %a.add = add <2 x i32> %a.load, <i32 1, i32 1> |
| store volatile <2 x i32> %a.add, ptr addrspace(3) %a |
| ret void |
| } |
| |
| ; Volatile <4 x i32> load, +1 on every lane, volatile store back via an addrspace(3) pointer. |
| ; The checks expect one ld.volatile.shared.v4.u32 and one st.volatile.shared.v4.u32. |
| define void @shared_volatile_4xi32(ptr addrspace(3) %a) { |
| ; CHECK-LABEL: shared_volatile_4xi32( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b32 %r<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi32_param_0]; |
| ; CHECK-NEXT: ld.volatile.shared.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; |
| ; CHECK-NEXT: add.s32 %r5, %r4, 1; |
| ; CHECK-NEXT: add.s32 %r6, %r3, 1; |
| ; CHECK-NEXT: add.s32 %r7, %r2, 1; |
| ; CHECK-NEXT: add.s32 %r8, %r1, 1; |
| ; CHECK-NEXT: st.volatile.shared.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <4 x i32>, ptr addrspace(3) %a |
| %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> |
| store volatile <4 x i32> %a.add, ptr addrspace(3) %a |
| ret void |
| } |
| |
| ; Volatile <2 x i64> load, +1 on every lane, volatile store back via an addrspace(3) pointer. |
| ; The checks expect one ld.volatile.shared.v2.u64 and one st.volatile.shared.v2.u64; |
| ; the address and the data lanes all live in %rd registers. |
| define void @shared_volatile_2xi64(ptr addrspace(3) %a) { |
| ; CHECK-LABEL: shared_volatile_2xi64( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b64 %rd<6>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi64_param_0]; |
| ; CHECK-NEXT: ld.volatile.shared.v2.u64 {%rd2, %rd3}, [%rd1]; |
| ; CHECK-NEXT: add.s64 %rd4, %rd3, 1; |
| ; CHECK-NEXT: add.s64 %rd5, %rd2, 1; |
| ; CHECK-NEXT: st.volatile.shared.v2.u64 [%rd1], {%rd5, %rd4}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <2 x i64>, ptr addrspace(3) %a |
| %a.add = add <2 x i64> %a.load, <i64 1, i64 1> |
| store volatile <2 x i64> %a.add, ptr addrspace(3) %a |
| ret void |
| } |
| |
| ; Volatile <2 x float> load, fadd 1.0 on each lane, volatile store through an addrspace(3) pointer. |
| ; The expected PTX keeps both accesses as single ld/st.volatile.shared.v2.f32 operations. |
| define void @shared_volatile_2xfloat(ptr addrspace(3) %a) { |
| ; CHECK-LABEL: shared_volatile_2xfloat( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .f32 %f<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xfloat_param_0]; |
| ; CHECK-NEXT: ld.volatile.shared.v2.f32 {%f1, %f2}, [%rd1]; |
| ; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; |
| ; CHECK-NEXT: st.volatile.shared.v2.f32 [%rd1], {%f4, %f3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <2 x float>, ptr addrspace(3) %a |
| %a.add = fadd <2 x float> %a.load, <float 1., float 1.> |
| store volatile <2 x float> %a.add, ptr addrspace(3) %a |
| ret void |
| } |
| |
| ; Volatile <4 x float> load, fadd 1.0 on each lane, volatile store through an addrspace(3) pointer. |
| ; The expected PTX keeps both accesses as single ld/st.volatile.shared.v4.f32 operations. |
| define void @shared_volatile_4xfloat(ptr addrspace(3) %a) { |
| ; CHECK-LABEL: shared_volatile_4xfloat( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .f32 %f<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xfloat_param_0]; |
| ; CHECK-NEXT: ld.volatile.shared.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; |
| ; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; |
| ; CHECK-NEXT: st.volatile.shared.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <4 x float>, ptr addrspace(3) %a |
| %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> |
| store volatile <4 x float> %a.add, ptr addrspace(3) %a |
| ret void |
| } |
| |
| ; Volatile <2 x double> load, fadd 1.0 on each lane, volatile store through an addrspace(3) pointer. |
| ; The expected PTX keeps both accesses as single ld/st.volatile.shared.v2.f64 operations. |
| define void @shared_volatile_2xdouble(ptr addrspace(3) %a) { |
| ; CHECK-LABEL: shared_volatile_2xdouble( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-NEXT: .reg .f64 %fd<5>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xdouble_param_0]; |
| ; CHECK-NEXT: ld.volatile.shared.v2.f64 {%fd1, %fd2}, [%rd1]; |
| ; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; |
| ; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; |
| ; CHECK-NEXT: st.volatile.shared.v2.f64 [%rd1], {%fd4, %fd3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <2 x double>, ptr addrspace(3) %a |
| %a.add = fadd <2 x double> %a.load, <double 1., double 1.> |
| store volatile <2 x double> %a.add, ptr addrspace(3) %a |
| ret void |
| } |
| |
| ;; local statespace |
| |
| ; local |
| |
| ; Plain (non-atomic, non-volatile) <2 x i8> load, +1 on each lane, store through an |
| ; addrspace(5) pointer. The expected PTX uses ld/st.local.v2.u8 with the lanes held |
| ; in 16-bit registers and incremented with add.s16. |
| define void @local_2xi8(ptr addrspace(5) %a) { |
| ; CHECK-LABEL: local_2xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi8_param_0]; |
| ; CHECK-NEXT: ld.local.v2.u8 {%rs1, %rs2}, [%rd1]; |
| ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; |
| ; CHECK-NEXT: st.local.v2.u8 [%rd1], {%rs4, %rs3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <2 x i8>, ptr addrspace(5) %a |
| %a.add = add <2 x i8> %a.load, <i8 1, i8 1> |
| store <2 x i8> %a.add, ptr addrspace(5) %a |
| ret void |
| } |
| |
| ; Plain <4 x i8> load, +1 on each lane, store through an addrspace(5) pointer. |
| ; The expected PTX moves the whole vector as one 32-bit word (ld/st.local.u32): |
| ; each byte is extracted with bfe.u32, incremented as a 16-bit value, and the |
| ; four results are repacked with prmt.b32 before the single store. |
| define void @local_4xi8(ptr addrspace(5) %a) { |
| ; CHECK-LABEL: local_4xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<9>; |
| ; CHECK-NEXT: .reg .b32 %r<13>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi8_param_0]; |
| ; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; |
| ; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; |
| ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; |
| ; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; |
| ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; |
| ; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; |
| ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; |
| ; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; |
| ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; |
| ; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; |
| ; CHECK-NEXT: st.local.u32 [%rd1], %r12; |
| ; CHECK-NEXT: ret; |
| %a.load = load <4 x i8>, ptr addrspace(5) %a |
| %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> |
| store <4 x i8> %a.add, ptr addrspace(5) %a |
| ret void |
| } |
| |
| ; Plain <8 x i8> load, +1 on each lane, store through an addrspace(5) pointer. |
| ; The expected PTX moves the vector as two 32-bit words (ld/st.local.v2.b32): |
| ; within each word every byte is extracted with bfe.u32, incremented as a 16-bit |
| ; value, and the results are repacked with prmt.b32 before the single vector store. |
| define void @local_8xi8(ptr addrspace(5) %a) { |
| ; CHECK-LABEL: local_8xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<17>; |
| ; CHECK-NEXT: .reg .b32 %r<25>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [local_8xi8_param_0]; |
| ; CHECK-NEXT: ld.local.v2.b32 {%r1, %r2}, [%rd1]; |
| ; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; |
| ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r4, %rs2; |
| ; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r5; |
| ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r6, %rs4; |
| ; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs5, %r8; |
| ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r9, %rs6; |
| ; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; |
| ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; |
| ; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs9, %r14; |
| ; CHECK-NEXT: add.s16 %rs10, %rs9, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r15, %rs10; |
| ; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs11, %r16; |
| ; CHECK-NEXT: add.s16 %rs12, %rs11, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r17, %rs12; |
| ; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs13, %r19; |
| ; CHECK-NEXT: add.s16 %rs14, %rs13, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r20, %rs14; |
| ; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs15, %r21; |
| ; CHECK-NEXT: add.s16 %rs16, %rs15, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r22, %rs16; |
| ; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U; |
| ; CHECK-NEXT: st.local.v2.b32 [%rd1], {%r24, %r13}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <8 x i8>, ptr addrspace(5) %a |
| %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> |
| store <8 x i8> %a.add, ptr addrspace(5) %a |
| ret void |
| } |
| |
| ; Plain <16 x i8> load, +1 on each lane, store through an addrspace(5) pointer. |
| ; The expected PTX moves the vector as four 32-bit words (ld/st.local.v4.b32): |
| ; within each word every byte is extracted with bfe.u32, incremented as a 16-bit |
| ; value, and the results are repacked with prmt.b32 before the single vector store. |
| define void @local_16xi8(ptr addrspace(5) %a) { |
| ; CHECK-LABEL: local_16xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<33>; |
| ; CHECK-NEXT: .reg .b32 %r<49>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [local_16xi8_param_0]; |
| ; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; |
| ; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; |
| ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r6, %rs2; |
| ; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r7; |
| ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r8, %rs4; |
| ; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs5, %r10; |
| ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r11, %rs6; |
| ; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs7, %r12; |
| ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r13, %rs8; |
| ; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs9, %r16; |
| ; CHECK-NEXT: add.s16 %rs10, %rs9, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r17, %rs10; |
| ; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs11, %r18; |
| ; CHECK-NEXT: add.s16 %rs12, %rs11, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r19, %rs12; |
| ; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs13, %r21; |
| ; CHECK-NEXT: add.s16 %rs14, %rs13, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r22, %rs14; |
| ; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs15, %r23; |
| ; CHECK-NEXT: add.s16 %rs16, %rs15, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r24, %rs16; |
| ; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs17, %r27; |
| ; CHECK-NEXT: add.s16 %rs18, %rs17, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r28, %rs18; |
| ; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs19, %r29; |
| ; CHECK-NEXT: add.s16 %rs20, %rs19, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r30, %rs20; |
| ; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs21, %r32; |
| ; CHECK-NEXT: add.s16 %rs22, %rs21, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r33, %rs22; |
| ; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs23, %r34; |
| ; CHECK-NEXT: add.s16 %rs24, %rs23, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r35, %rs24; |
| ; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs25, %r38; |
| ; CHECK-NEXT: add.s16 %rs26, %rs25, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r39, %rs26; |
| ; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs27, %r40; |
| ; CHECK-NEXT: add.s16 %rs28, %rs27, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r41, %rs28; |
| ; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs29, %r43; |
| ; CHECK-NEXT: add.s16 %rs30, %rs29, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r44, %rs30; |
| ; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs31, %r45; |
| ; CHECK-NEXT: add.s16 %rs32, %rs31, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r46, %rs32; |
| ; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U; |
| ; CHECK-NEXT: st.local.v4.b32 [%rd1], {%r48, %r37, %r26, %r15}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <16 x i8>, ptr addrspace(5) %a |
| %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> |
| store <16 x i8> %a.add, ptr addrspace(5) %a |
| ret void |
| } |
| |
| ; Plain <2 x i16> load, +1 on each lane, store through an addrspace(5) pointer. |
| ; The expected PTX moves the vector as one 32-bit word (ld/st.local.u32) and uses |
| ; mov.b32 to unpack it into, and repack it from, two 16-bit registers. |
| define void @local_2xi16(ptr addrspace(5) %a) { |
| ; CHECK-LABEL: local_2xi16( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<5>; |
| ; CHECK-NEXT: .reg .b32 %r<3>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi16_param_0]; |
| ; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; |
| ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; |
| ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; |
| ; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; |
| ; CHECK-NEXT: st.local.u32 [%rd1], %r2; |
| ; CHECK-NEXT: ret; |
| %a.load = load <2 x i16>, ptr addrspace(5) %a |
| %a.add = add <2 x i16> %a.load, <i16 1, i16 1> |
| store <2 x i16> %a.add, ptr addrspace(5) %a |
| ret void |
| } |
| |
| ; Plain <4 x i16> load, +1 on each lane, store through an addrspace(5) pointer. |
| ; The expected PTX keeps both accesses as single ld/st.local.v4.u16 operations. |
| define void @local_4xi16(ptr addrspace(5) %a) { |
| ; CHECK-LABEL: local_4xi16( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi16_param_0]; |
| ; CHECK-NEXT: ld.local.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; |
| ; CHECK-NEXT: add.s16 %rs5, %rs4, 1; |
| ; CHECK-NEXT: add.s16 %rs6, %rs3, 1; |
| ; CHECK-NEXT: add.s16 %rs7, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs8, %rs1, 1; |
| ; CHECK-NEXT: st.local.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <4 x i16>, ptr addrspace(5) %a |
| %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> |
| store <4 x i16> %a.add, ptr addrspace(5) %a |
| ret void |
| } |
| |
| ; Plain <8 x i16> load, +1 on each lane, store through an addrspace(5) pointer. |
| ; The expected PTX moves the vector as four 32-bit words (ld/st.local.v4.b32); |
| ; each word is unpacked into two 16-bit registers with mov.b32, incremented, |
| ; and repacked with mov.b32 before the single vector store. |
| define void @local_8xi16(ptr addrspace(5) %a) { |
| ; CHECK-LABEL: local_8xi16( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<17>; |
| ; CHECK-NEXT: .reg .b32 %r<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [local_8xi16_param_0]; |
| ; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; |
| ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; |
| ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; |
| ; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; |
| ; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3; |
| ; CHECK-NEXT: add.s16 %rs7, %rs6, 1; |
| ; CHECK-NEXT: add.s16 %rs8, %rs5, 1; |
| ; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7}; |
| ; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2; |
| ; CHECK-NEXT: add.s16 %rs11, %rs10, 1; |
| ; CHECK-NEXT: add.s16 %rs12, %rs9, 1; |
| ; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11}; |
| ; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1; |
| ; CHECK-NEXT: add.s16 %rs15, %rs14, 1; |
| ; CHECK-NEXT: add.s16 %rs16, %rs13, 1; |
| ; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15}; |
| ; CHECK-NEXT: st.local.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <8 x i16>, ptr addrspace(5) %a |
| %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> |
| store <8 x i16> %a.add, ptr addrspace(5) %a |
| ret void |
| } |
| |
| ; Plain <2 x i32> load, +1 on each lane, store through an addrspace(5) pointer. |
| ; The expected PTX keeps both accesses as single ld/st.local.v2.u32 operations. |
| define void @local_2xi32(ptr addrspace(5) %a) { |
| ; CHECK-LABEL: local_2xi32( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b32 %r<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi32_param_0]; |
| ; CHECK-NEXT: ld.local.v2.u32 {%r1, %r2}, [%rd1]; |
| ; CHECK-NEXT: add.s32 %r3, %r2, 1; |
| ; CHECK-NEXT: add.s32 %r4, %r1, 1; |
| ; CHECK-NEXT: st.local.v2.u32 [%rd1], {%r4, %r3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <2 x i32>, ptr addrspace(5) %a |
| %a.add = add <2 x i32> %a.load, <i32 1, i32 1> |
| store <2 x i32> %a.add, ptr addrspace(5) %a |
| ret void |
| } |
| |
| ; Plain <4 x i32> load, +1 on each lane, store through an addrspace(5) pointer. |
| ; The expected PTX keeps both accesses as single ld/st.local.v4.u32 operations. |
| define void @local_4xi32(ptr addrspace(5) %a) { |
| ; CHECK-LABEL: local_4xi32( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b32 %r<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi32_param_0]; |
| ; CHECK-NEXT: ld.local.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; |
| ; CHECK-NEXT: add.s32 %r5, %r4, 1; |
| ; CHECK-NEXT: add.s32 %r6, %r3, 1; |
| ; CHECK-NEXT: add.s32 %r7, %r2, 1; |
| ; CHECK-NEXT: add.s32 %r8, %r1, 1; |
| ; CHECK-NEXT: st.local.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <4 x i32>, ptr addrspace(5) %a |
| %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> |
| store <4 x i32> %a.add, ptr addrspace(5) %a |
| ret void |
| } |
| |
| ; Plain <2 x i64> load, +1 on each lane, store through an addrspace(5) pointer. |
| ; The expected PTX keeps both accesses as single ld/st.local.v2.u64 operations. |
| define void @local_2xi64(ptr addrspace(5) %a) { |
| ; CHECK-LABEL: local_2xi64( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b64 %rd<6>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi64_param_0]; |
| ; CHECK-NEXT: ld.local.v2.u64 {%rd2, %rd3}, [%rd1]; |
| ; CHECK-NEXT: add.s64 %rd4, %rd3, 1; |
| ; CHECK-NEXT: add.s64 %rd5, %rd2, 1; |
| ; CHECK-NEXT: st.local.v2.u64 [%rd1], {%rd5, %rd4}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <2 x i64>, ptr addrspace(5) %a |
| %a.add = add <2 x i64> %a.load, <i64 1, i64 1> |
| store <2 x i64> %a.add, ptr addrspace(5) %a |
| ret void |
| } |
| |
| ; Plain <2 x float> load, fadd 1.0 on each lane, store through an addrspace(5) pointer. |
| ; The expected PTX keeps both accesses as single ld/st.local.v2.f32 operations. |
| define void @local_2xfloat(ptr addrspace(5) %a) { |
| ; CHECK-LABEL: local_2xfloat( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .f32 %f<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [local_2xfloat_param_0]; |
| ; CHECK-NEXT: ld.local.v2.f32 {%f1, %f2}, [%rd1]; |
| ; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; |
| ; CHECK-NEXT: st.local.v2.f32 [%rd1], {%f4, %f3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <2 x float>, ptr addrspace(5) %a |
| %a.add = fadd <2 x float> %a.load, <float 1., float 1.> |
| store <2 x float> %a.add, ptr addrspace(5) %a |
| ret void |
| } |
| |
| ; Plain <4 x float> load, fadd 1.0 on each lane, store through an addrspace(5) pointer. |
| ; The expected PTX keeps both accesses as single ld/st.local.v4.f32 operations. |
| define void @local_4xfloat(ptr addrspace(5) %a) { |
| ; CHECK-LABEL: local_4xfloat( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .f32 %f<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [local_4xfloat_param_0]; |
| ; CHECK-NEXT: ld.local.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; |
| ; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; |
| ; CHECK-NEXT: st.local.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <4 x float>, ptr addrspace(5) %a |
| %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> |
| store <4 x float> %a.add, ptr addrspace(5) %a |
| ret void |
| } |
| |
| ; Plain <2 x double> load, fadd 1.0 on each lane, store through an addrspace(5) pointer. |
| ; The expected PTX keeps both accesses as single ld/st.local.v2.f64 operations. |
| define void @local_2xdouble(ptr addrspace(5) %a) { |
| ; CHECK-LABEL: local_2xdouble( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-NEXT: .reg .f64 %fd<5>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [local_2xdouble_param_0]; |
| ; CHECK-NEXT: ld.local.v2.f64 {%fd1, %fd2}, [%rd1]; |
| ; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; |
| ; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; |
| ; CHECK-NEXT: st.local.v2.f64 [%rd1], {%fd4, %fd3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load <2 x double>, ptr addrspace(5) %a |
| %a.add = fadd <2 x double> %a.load, <double 1., double 1.> |
| store <2 x double> %a.add, ptr addrspace(5) %a |
| ret void |
| } |
| |
| ; local_volatile |
| |
| ; Volatile <2 x i8> load, +1 on each lane, volatile store through an addrspace(5) pointer. |
| ; Note that the expected PTX uses plain ld/st.local.v2.u8: no .volatile qualifier |
| ; appears on the local-statespace accesses even though the IR accesses are volatile. |
| define void @local_volatile_2xi8(ptr addrspace(5) %a) { |
| ; CHECK-LABEL: local_volatile_2xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi8_param_0]; |
| ; CHECK-NEXT: ld.local.v2.u8 {%rs1, %rs2}, [%rd1]; |
| ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; |
| ; CHECK-NEXT: st.local.v2.u8 [%rd1], {%rs4, %rs3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <2 x i8>, ptr addrspace(5) %a |
| %a.add = add <2 x i8> %a.load, <i8 1, i8 1> |
| store volatile <2 x i8> %a.add, ptr addrspace(5) %a |
| ret void |
| } |
| |
| ; Volatile <4 x i8> load, +1 on each lane, volatile store through an addrspace(5) pointer. |
| ; The expected PTX moves the vector as one 32-bit word with plain ld/st.local.u32 |
| ; (no .volatile qualifier appears); bytes are extracted with bfe.u32, incremented |
| ; as 16-bit values, and repacked with prmt.b32. |
| define void @local_volatile_4xi8(ptr addrspace(5) %a) { |
| ; CHECK-LABEL: local_volatile_4xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<9>; |
| ; CHECK-NEXT: .reg .b32 %r<13>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi8_param_0]; |
| ; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; |
| ; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; |
| ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; |
| ; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; |
| ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; |
| ; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; |
| ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; |
| ; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; |
| ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; |
| ; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; |
| ; CHECK-NEXT: st.local.u32 [%rd1], %r12; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <4 x i8>, ptr addrspace(5) %a |
| %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> |
| store volatile <4 x i8> %a.add, ptr addrspace(5) %a |
| ret void |
| } |
| |
| ; Volatile <8 x i8> load, +1 on each lane, volatile store through an addrspace(5) pointer. |
| ; The expected PTX moves the vector as two 32-bit words with plain ld/st.local.v2.b32 |
| ; (no .volatile qualifier appears); bytes are extracted with bfe.u32, incremented |
| ; as 16-bit values, and repacked with prmt.b32. |
| define void @local_volatile_8xi8(ptr addrspace(5) %a) { |
| ; CHECK-LABEL: local_volatile_8xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<17>; |
| ; CHECK-NEXT: .reg .b32 %r<25>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_8xi8_param_0]; |
| ; CHECK-NEXT: ld.local.v2.b32 {%r1, %r2}, [%rd1]; |
| ; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; |
| ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r4, %rs2; |
| ; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r5; |
| ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r6, %rs4; |
| ; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs5, %r8; |
| ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r9, %rs6; |
| ; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; |
| ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; |
| ; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs9, %r14; |
| ; CHECK-NEXT: add.s16 %rs10, %rs9, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r15, %rs10; |
| ; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs11, %r16; |
| ; CHECK-NEXT: add.s16 %rs12, %rs11, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r17, %rs12; |
| ; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs13, %r19; |
| ; CHECK-NEXT: add.s16 %rs14, %rs13, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r20, %rs14; |
| ; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs15, %r21; |
| ; CHECK-NEXT: add.s16 %rs16, %rs15, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r22, %rs16; |
| ; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U; |
| ; CHECK-NEXT: st.local.v2.b32 [%rd1], {%r24, %r13}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <8 x i8>, ptr addrspace(5) %a |
| %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> |
| store volatile <8 x i8> %a.add, ptr addrspace(5) %a |
| ret void |
| } |
| |
| ; Volatile <16 x i8> load, +1 on each lane, volatile store through an addrspace(5) pointer. |
| ; The expected PTX moves the vector as four 32-bit words with plain ld/st.local.v4.b32 |
| ; (no .volatile qualifier appears); bytes are extracted with bfe.u32, incremented |
| ; as 16-bit values, and repacked with prmt.b32. |
| define void @local_volatile_16xi8(ptr addrspace(5) %a) { |
| ; CHECK-LABEL: local_volatile_16xi8( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<33>; |
| ; CHECK-NEXT: .reg .b32 %r<49>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_16xi8_param_0]; |
| ; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; |
| ; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; |
| ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r6, %rs2; |
| ; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r7; |
| ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r8, %rs4; |
| ; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs5, %r10; |
| ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r11, %rs6; |
| ; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs7, %r12; |
| ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r13, %rs8; |
| ; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs9, %r16; |
| ; CHECK-NEXT: add.s16 %rs10, %rs9, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r17, %rs10; |
| ; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs11, %r18; |
| ; CHECK-NEXT: add.s16 %rs12, %rs11, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r19, %rs12; |
| ; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs13, %r21; |
| ; CHECK-NEXT: add.s16 %rs14, %rs13, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r22, %rs14; |
| ; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs15, %r23; |
| ; CHECK-NEXT: add.s16 %rs16, %rs15, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r24, %rs16; |
| ; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs17, %r27; |
| ; CHECK-NEXT: add.s16 %rs18, %rs17, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r28, %rs18; |
| ; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs19, %r29; |
| ; CHECK-NEXT: add.s16 %rs20, %rs19, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r30, %rs20; |
| ; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs21, %r32; |
| ; CHECK-NEXT: add.s16 %rs22, %rs21, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r33, %rs22; |
| ; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs23, %r34; |
| ; CHECK-NEXT: add.s16 %rs24, %rs23, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r35, %rs24; |
| ; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U; |
| ; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs25, %r38; |
| ; CHECK-NEXT: add.s16 %rs26, %rs25, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r39, %rs26; |
| ; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs27, %r40; |
| ; CHECK-NEXT: add.s16 %rs28, %rs27, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r41, %rs28; |
| ; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U; |
| ; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs29, %r43; |
| ; CHECK-NEXT: add.s16 %rs30, %rs29, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r44, %rs30; |
| ; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8; |
| ; CHECK-NEXT: cvt.u16.u32 %rs31, %r45; |
| ; CHECK-NEXT: add.s16 %rs32, %rs31, 1; |
| ; CHECK-NEXT: cvt.u32.u16 %r46, %rs32; |
| ; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U; |
| ; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U; |
| ; CHECK-NEXT: st.local.v4.b32 [%rd1], {%r48, %r37, %r26, %r15}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <16 x i8>, ptr addrspace(5) %a |
| %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> |
| store volatile <16 x i8> %a.add, ptr addrspace(5) %a |
| ret void |
| } |
| |
| ; Volatile <2 x i16> load, +1 on each lane, volatile store through an addrspace(5) pointer. |
| ; The expected PTX moves the vector as one 32-bit word with plain ld/st.local.u32 |
| ; (no .volatile qualifier appears) and uses mov.b32 to unpack/repack the two halves. |
| define void @local_volatile_2xi16(ptr addrspace(5) %a) { |
| ; CHECK-LABEL: local_volatile_2xi16( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<5>; |
| ; CHECK-NEXT: .reg .b32 %r<3>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi16_param_0]; |
| ; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; |
| ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; |
| ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; |
| ; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; |
| ; CHECK-NEXT: st.local.u32 [%rd1], %r2; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <2 x i16>, ptr addrspace(5) %a |
| %a.add = add <2 x i16> %a.load, <i16 1, i16 1> |
| store volatile <2 x i16> %a.add, ptr addrspace(5) %a |
| ret void |
| } |
| |
| ; Volatile <4 x i16> load, +1 on each lane, volatile store through an addrspace(5) pointer. |
| ; The expected PTX uses plain ld/st.local.v4.u16: no .volatile qualifier appears |
| ; on the local-statespace accesses even though the IR accesses are volatile. |
| define void @local_volatile_4xi16(ptr addrspace(5) %a) { |
| ; CHECK-LABEL: local_volatile_4xi16( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi16_param_0]; |
| ; CHECK-NEXT: ld.local.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; |
| ; CHECK-NEXT: add.s16 %rs5, %rs4, 1; |
| ; CHECK-NEXT: add.s16 %rs6, %rs3, 1; |
| ; CHECK-NEXT: add.s16 %rs7, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs8, %rs1, 1; |
| ; CHECK-NEXT: st.local.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <4 x i16>, ptr addrspace(5) %a |
| %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> |
| store volatile <4 x i16> %a.add, ptr addrspace(5) %a |
| ret void |
| } |
| |
| ; Volatile <8 x i16> load, +1 on each lane, volatile store through an addrspace(5) pointer. |
| ; The expected PTX moves the vector as four 32-bit words with plain ld/st.local.v4.b32 |
| ; (no .volatile qualifier appears); each word is unpacked into two 16-bit registers |
| ; with mov.b32, incremented, and repacked with mov.b32. |
| define void @local_volatile_8xi16(ptr addrspace(5) %a) { |
| ; CHECK-LABEL: local_volatile_8xi16( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<17>; |
| ; CHECK-NEXT: .reg .b32 %r<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_8xi16_param_0]; |
| ; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; |
| ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; |
| ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; |
| ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; |
| ; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; |
| ; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3; |
| ; CHECK-NEXT: add.s16 %rs7, %rs6, 1; |
| ; CHECK-NEXT: add.s16 %rs8, %rs5, 1; |
| ; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7}; |
| ; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2; |
| ; CHECK-NEXT: add.s16 %rs11, %rs10, 1; |
| ; CHECK-NEXT: add.s16 %rs12, %rs9, 1; |
| ; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11}; |
| ; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1; |
| ; CHECK-NEXT: add.s16 %rs15, %rs14, 1; |
| ; CHECK-NEXT: add.s16 %rs16, %rs13, 1; |
| ; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15}; |
| ; CHECK-NEXT: st.local.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <8 x i16>, ptr addrspace(5) %a |
| %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> |
| store volatile <8 x i16> %a.add, ptr addrspace(5) %a |
| ret void |
| } |
| |
| ; Volatile <2 x i32> load, +1 on each lane, volatile store through an addrspace(5) pointer. |
| ; The expected PTX uses plain ld/st.local.v2.u32: no .volatile qualifier appears |
| ; on the local-statespace accesses even though the IR accesses are volatile. |
| define void @local_volatile_2xi32(ptr addrspace(5) %a) { |
| ; CHECK-LABEL: local_volatile_2xi32( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b32 %r<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi32_param_0]; |
| ; CHECK-NEXT: ld.local.v2.u32 {%r1, %r2}, [%rd1]; |
| ; CHECK-NEXT: add.s32 %r3, %r2, 1; |
| ; CHECK-NEXT: add.s32 %r4, %r1, 1; |
| ; CHECK-NEXT: st.local.v2.u32 [%rd1], {%r4, %r3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <2 x i32>, ptr addrspace(5) %a |
| %a.add = add <2 x i32> %a.load, <i32 1, i32 1> |
| store volatile <2 x i32> %a.add, ptr addrspace(5) %a |
| ret void |
| } |
| |
| ; Volatile <4 x i32> load, +1 on each lane, volatile store through an addrspace(5) pointer. |
| ; The expected PTX uses plain ld/st.local.v4.u32: no .volatile qualifier appears |
| ; on the local-statespace accesses even though the IR accesses are volatile. |
| define void @local_volatile_4xi32(ptr addrspace(5) %a) { |
| ; CHECK-LABEL: local_volatile_4xi32( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b32 %r<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi32_param_0]; |
| ; CHECK-NEXT: ld.local.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; |
| ; CHECK-NEXT: add.s32 %r5, %r4, 1; |
| ; CHECK-NEXT: add.s32 %r6, %r3, 1; |
| ; CHECK-NEXT: add.s32 %r7, %r2, 1; |
| ; CHECK-NEXT: add.s32 %r8, %r1, 1; |
| ; CHECK-NEXT: st.local.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <4 x i32>, ptr addrspace(5) %a |
| %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> |
| store volatile <4 x i32> %a.add, ptr addrspace(5) %a |
| ret void |
| } |
| |
| ; Volatile <2 x i64> load, +1 on each lane, volatile store through an addrspace(5) pointer. |
| ; The expected PTX uses plain ld/st.local.v2.u64: no .volatile qualifier appears |
| ; on the local-statespace accesses even though the IR accesses are volatile. |
| define void @local_volatile_2xi64(ptr addrspace(5) %a) { |
| ; CHECK-LABEL: local_volatile_2xi64( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b64 %rd<6>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi64_param_0]; |
| ; CHECK-NEXT: ld.local.v2.u64 {%rd2, %rd3}, [%rd1]; |
| ; CHECK-NEXT: add.s64 %rd4, %rd3, 1; |
| ; CHECK-NEXT: add.s64 %rd5, %rd2, 1; |
| ; CHECK-NEXT: st.local.v2.u64 [%rd1], {%rd5, %rd4}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <2 x i64>, ptr addrspace(5) %a |
| %a.add = add <2 x i64> %a.load, <i64 1, i64 1> |
| store volatile <2 x i64> %a.add, ptr addrspace(5) %a |
| ret void |
| } |
| |
| ; Volatile load / fadd 1.0 per lane / volatile store of <2 x float> through an |
| ; addrspace(5) pointer. The expected PTX is a single ld.local.v2.f32 and a |
| ; single st.local.v2.f32, with no .volatile qualifier on either access; the |
| ; constant 1.0 appears as the immediate 0f3F800000. |
| define void @local_volatile_2xfloat(ptr addrspace(5) %a) { |
| ; CHECK-LABEL: local_volatile_2xfloat( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .f32 %f<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xfloat_param_0]; |
| ; CHECK-NEXT: ld.local.v2.f32 {%f1, %f2}, [%rd1]; |
| ; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; |
| ; CHECK-NEXT: st.local.v2.f32 [%rd1], {%f4, %f3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <2 x float>, ptr addrspace(5) %a |
| %a.add = fadd <2 x float> %a.load, <float 1., float 1.> |
| store volatile <2 x float> %a.add, ptr addrspace(5) %a |
| ret void |
| } |
| |
| ; Volatile load / fadd 1.0 per lane / volatile store of <4 x float> through an |
| ; addrspace(5) pointer. The expected PTX is a single ld.local.v4.f32 and a |
| ; single st.local.v4.f32, with no .volatile qualifier on either access; the |
| ; constant 1.0 appears as the immediate 0f3F800000. |
| define void @local_volatile_4xfloat(ptr addrspace(5) %a) { |
| ; CHECK-LABEL: local_volatile_4xfloat( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .f32 %f<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xfloat_param_0]; |
| ; CHECK-NEXT: ld.local.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; |
| ; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; |
| ; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; |
| ; CHECK-NEXT: st.local.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <4 x float>, ptr addrspace(5) %a |
| %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> |
| store volatile <4 x float> %a.add, ptr addrspace(5) %a |
| ret void |
| } |
| |
| ; Volatile load / fadd 1.0 per lane / volatile store of <2 x double> through an |
| ; addrspace(5) pointer. The expected PTX is a single ld.local.v2.f64 and a |
| ; single st.local.v2.f64, with no .volatile qualifier on either access; the |
| ; constant 1.0 appears as the immediate 0d3FF0000000000000. |
| define void @local_volatile_2xdouble(ptr addrspace(5) %a) { |
| ; CHECK-LABEL: local_volatile_2xdouble( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b64 %rd<2>; |
| ; CHECK-NEXT: .reg .f64 %fd<5>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xdouble_param_0]; |
| ; CHECK-NEXT: ld.local.v2.f64 {%fd1, %fd2}, [%rd1]; |
| ; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; |
| ; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; |
| ; CHECK-NEXT: st.local.v2.f64 [%rd1], {%fd4, %fd3}; |
| ; CHECK-NEXT: ret; |
| %a.load = load volatile <2 x double>, ptr addrspace(5) %a |
| %a.add = fadd <2 x double> %a.load, <double 1., double 1.> |
| store volatile <2 x double> %a.add, ptr addrspace(5) %a |
| ret void |
| } |