; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}

; Check that invariant loads from the global addrspace are lowered to
; ld.global.nc.

; Scalar i32 invariant load: must select the non-coherent ld.global.nc.u32.
define i32 @ld_global(ptr addrspace(1) %ptr) {
; CHECK-LABEL: ld_global(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_param_0];
; CHECK-NEXT:    ld.global.nc.u32 %r1, [%rd1];
; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
; CHECK-NEXT:    ret;
  %a = load i32, ptr addrspace(1) %ptr, !invariant.load !0
  ret i32 %a
}

define half @ld_global_v2f16(ptr addrspace(1) %ptr) {
; Load of v2f16 is weird. We consider it to be a legal type, which happens to be
; loaded/stored as a 32-bit scalar.
; CHECK-LABEL: ld_global_v2f16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<4>;
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .f32 %f<4>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v2f16_param_0];
; CHECK-NEXT:    ld.global.nc.u32 %r1, [%rd1];
; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
; CHECK-NEXT:    cvt.f32.f16 %f2, %rs1;
; CHECK-NEXT:    add.rn.f32 %f3, %f2, %f1;
; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %f3;
; CHECK-NEXT:    st.param.b16 [func_retval0], %rs3;
; CHECK-NEXT:    ret;
  %a = load <2 x half>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <2 x half> %a, i32 0
  %v2 = extractelement <2 x half> %a, i32 1
  %sum = fadd half %v1, %v2
  ret half %sum
}

define half @ld_global_v4f16(ptr addrspace(1) %ptr) {
; Larger f16 vectors may be split into individual f16 elements and multiple
; loads/stores may be vectorized using f16 element type. Practically it's
; limited to v4 variant only.
; CHECK-LABEL: ld_global_v4f16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<8>;
; CHECK-NEXT:    .reg .f32 %f<10>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v4f16_param_0];
; CHECK-NEXT:    ld.global.nc.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
; CHECK-NEXT:    cvt.f32.f16 %f2, %rs1;
; CHECK-NEXT:    add.rn.f32 %f3, %f2, %f1;
; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
; CHECK-NEXT:    cvt.f32.f16 %f4, %rs4;
; CHECK-NEXT:    cvt.f32.f16 %f5, %rs3;
; CHECK-NEXT:    add.rn.f32 %f6, %f5, %f4;
; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
; CHECK-NEXT:    cvt.f32.f16 %f7, %rs6;
; CHECK-NEXT:    cvt.f32.f16 %f8, %rs5;
; CHECK-NEXT:    add.rn.f32 %f9, %f8, %f7;
; CHECK-NEXT:    cvt.rn.f16.f32 %rs7, %f9;
; CHECK-NEXT:    st.param.b16 [func_retval0], %rs7;
; CHECK-NEXT:    ret;
  %a = load <4 x half>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <4 x half> %a, i32 0
  %v2 = extractelement <4 x half> %a, i32 1
  %v3 = extractelement <4 x half> %a, i32 2
  %v4 = extractelement <4 x half> %a, i32 3
  %sum1 = fadd half %v1, %v2
  %sum2 = fadd half %v3, %v4
  %sum = fadd half %sum1, %sum2
  ret half %sum
}

define half @ld_global_v8f16(ptr addrspace(1) %ptr) {
; Larger vectors are, again, loaded as v4i32. PTX has no v8 variants of loads/stores,
; so load/store vectorizer has to convert v8f16 -> v4 x v2f16.
; CHECK-LABEL: ld_global_v8f16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<8>;
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-NEXT:    .reg .f32 %f<10>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v8f16_param_0];
; CHECK-NEXT:    ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r3; }
; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r4; }
; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r1; }
; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs4, tmp}, %r2; }
; CHECK-NEXT:    cvt.f32.f16 %f1, %rs4;
; CHECK-NEXT:    cvt.f32.f16 %f2, %rs3;
; CHECK-NEXT:    add.rn.f32 %f3, %f2, %f1;
; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
; CHECK-NEXT:    cvt.f32.f16 %f4, %rs2;
; CHECK-NEXT:    cvt.f32.f16 %f5, %rs1;
; CHECK-NEXT:    add.rn.f32 %f6, %f5, %f4;
; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
; CHECK-NEXT:    cvt.f32.f16 %f7, %rs6;
; CHECK-NEXT:    cvt.f32.f16 %f8, %rs5;
; CHECK-NEXT:    add.rn.f32 %f9, %f8, %f7;
; CHECK-NEXT:    cvt.rn.f16.f32 %rs7, %f9;
; CHECK-NEXT:    st.param.b16 [func_retval0], %rs7;
; CHECK-NEXT:    ret;
  %a = load <8 x half>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <8 x half> %a, i32 0
  %v2 = extractelement <8 x half> %a, i32 2
  %v3 = extractelement <8 x half> %a, i32 4
  %v4 = extractelement <8 x half> %a, i32 6
  %sum1 = fadd half %v1, %v2
  %sum2 = fadd half %v3, %v4
  %sum = fadd half %sum1, %sum2
  ret half %sum
}

; v8i8 invariant load is vectorized as two 32-bit non-coherent loads
; (ld.global.nc.v2.u32); elements are then unpacked with bfe.
define i8 @ld_global_v8i8(ptr addrspace(1) %ptr) {
; CHECK-LABEL: ld_global_v8i8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<8>;
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v8i8_param_0];
; CHECK-NEXT:    ld.global.nc.v2.u32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    bfe.u32 %r3, %r2, 16, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
; CHECK-NEXT:    bfe.u32 %r4, %r2, 0, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs2, %r4;
; CHECK-NEXT:    bfe.u32 %r5, %r1, 16, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
; CHECK-NEXT:    bfe.u32 %r6, %r1, 0, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs4, %r6;
; CHECK-NEXT:    add.s16 %rs5, %rs4, %rs3;
; CHECK-NEXT:    add.s16 %rs6, %rs2, %rs1;
; CHECK-NEXT:    add.s16 %rs7, %rs5, %rs6;
; CHECK-NEXT:    cvt.u32.u16 %r7, %rs7;
; CHECK-NEXT:    and.b32 %r8, %r7, 255;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
; CHECK-NEXT:    ret;
  %a = load <8 x i8>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <8 x i8> %a, i32 0
  %v2 = extractelement <8 x i8> %a, i32 2
  %v3 = extractelement <8 x i8> %a, i32 4
  %v4 = extractelement <8 x i8> %a, i32 6
  %sum1 = add i8 %v1, %v2
  %sum2 = add i8 %v3, %v4
  %sum = add i8 %sum1, %sum2
  ret i8 %sum
}

; v16i8 invariant load is vectorized as one 128-bit non-coherent load
; (ld.global.nc.v4.u32); elements are then unpacked with bfe.
define i8 @ld_global_v16i8(ptr addrspace(1) %ptr) {
; CHECK-LABEL: ld_global_v16i8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<16>;
; CHECK-NEXT:    .reg .b32 %r<15>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v16i8_param_0];
; CHECK-NEXT:    ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    bfe.u32 %r5, %r4, 16, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
; CHECK-NEXT:    bfe.u32 %r6, %r4, 0, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs2, %r6;
; CHECK-NEXT:    bfe.u32 %r7, %r3, 16, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
; CHECK-NEXT:    bfe.u32 %r8, %r3, 0, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs4, %r8;
; CHECK-NEXT:    bfe.u32 %r9, %r2, 16, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r9;
; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs6, %r10;
; CHECK-NEXT:    bfe.u32 %r11, %r1, 16, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r11;
; CHECK-NEXT:    bfe.u32 %r12, %r1, 0, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs8, %r12;
; CHECK-NEXT:    add.s16 %rs9, %rs8, %rs7;
; CHECK-NEXT:    add.s16 %rs10, %rs6, %rs5;
; CHECK-NEXT:    add.s16 %rs11, %rs4, %rs3;
; CHECK-NEXT:    add.s16 %rs12, %rs2, %rs1;
; CHECK-NEXT:    add.s16 %rs13, %rs9, %rs10;
; CHECK-NEXT:    add.s16 %rs14, %rs11, %rs12;
; CHECK-NEXT:    add.s16 %rs15, %rs13, %rs14;
; CHECK-NEXT:    cvt.u32.u16 %r13, %rs15;
; CHECK-NEXT:    and.b32 %r14, %r13, 255;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r14;
; CHECK-NEXT:    ret;
  %a = load <16 x i8>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <16 x i8> %a, i32 0
  %v2 = extractelement <16 x i8> %a, i32 2
  %v3 = extractelement <16 x i8> %a, i32 4
  %v4 = extractelement <16 x i8> %a, i32 6
  %v5 = extractelement <16 x i8> %a, i32 8
  %v6 = extractelement <16 x i8> %a, i32 10
  %v7 = extractelement <16 x i8> %a, i32 12
  %v8 = extractelement <16 x i8> %a, i32 14
  %sum1 = add i8 %v1, %v2
  %sum2 = add i8 %v3, %v4
  %sum3 = add i8 %v5, %v6
  %sum4 = add i8 %v7, %v8
  %sum5 = add i8 %sum1, %sum2
  %sum6 = add i8 %sum3, %sum4
  %sum7 = add i8 %sum5, %sum6
  ret i8 %sum7
}

; v2i32 invariant load selects the vector form ld.global.nc.v2.u32.
define i32 @ld_global_v2i32(ptr addrspace(1) %ptr) {
; CHECK-LABEL: ld_global_v2i32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<4>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v2i32_param_0];
; CHECK-NEXT:    ld.global.nc.v2.u32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    add.s32 %r3, %r1, %r2;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
; CHECK-NEXT:    ret;
  %a = load <2 x i32>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <2 x i32> %a, i32 0
  %v2 = extractelement <2 x i32> %a, i32 1
  %sum = add i32 %v1, %v2
  ret i32 %sum
}

; v4i32 invariant load selects the vector form ld.global.nc.v4.u32.
define i32 @ld_global_v4i32(ptr addrspace(1) %ptr) {
; CHECK-LABEL: ld_global_v4i32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<8>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v4i32_param_0];
; CHECK-NEXT:    ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    add.s32 %r5, %r1, %r2;
; CHECK-NEXT:    add.s32 %r6, %r3, %r4;
; CHECK-NEXT:    add.s32 %r7, %r5, %r6;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
; CHECK-NEXT:    ret;
  %a = load <4 x i32>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <4 x i32> %a, i32 0
  %v2 = extractelement <4 x i32> %a, i32 1
  %v3 = extractelement <4 x i32> %a, i32 2
  %v4 = extractelement <4 x i32> %a, i32 3
  %sum1 = add i32 %v1, %v2
  %sum2 = add i32 %v3, %v4
  %sum3 = add i32 %sum1, %sum2
  ret i32 %sum3
}

; Negative test: without !invariant.load metadata the load must stay a
; plain ld.global.u32 (no .nc).
define i32 @ld_not_invariant(ptr addrspace(1) %ptr) {
; CHECK-LABEL: ld_not_invariant(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_not_invariant_param_0];
; CHECK-NEXT:    ld.global.u32 %r1, [%rd1];
; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
; CHECK-NEXT:    ret;
  %a = load i32, ptr addrspace(1) %ptr
  ret i32 %a
}

; Negative test: a load from the generic addrspace(0) must use the generic
; ld.u32, not the global non-coherent form.
define i32 @ld_not_global_addrspace(ptr addrspace(0) %ptr) {
; CHECK-LABEL: ld_not_global_addrspace(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_not_global_addrspace_param_0];
; CHECK-NEXT:    ld.u32 %r1, [%rd1];
; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
; CHECK-NEXT:    ret;
  %a = load i32, ptr addrspace(0) %ptr
  ret i32 %a
}

; Empty metadata node referenced by the !invariant.load attachments above.
!0 = !{}