| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefixes=GFX11 %s |
| ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX12,GFX12-True16 %s |
| ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX12,GFX12-NoTrue16 %s |
| |
| ; global address space, addrspace(1) |
| |
| ; gfx12 with true16: S16 16-bit load |
| ; gfx12 without true16: S32 16-bit any-extending load |
| define amdgpu_ps void @load_uniform_P1_i16_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { |
| ; GFX11-LABEL: load_uniform_P1_i16_gfx12: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-NEXT: global_load_d16_b16 v2, v2, s[0:1] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-True16-LABEL: load_uniform_P1_i16_gfx12: |
| ; GFX12-True16: ; %bb.0: |
| ; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0 |
| ; GFX12-True16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 |
| ; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX12-True16-NEXT: s_endpgm |
| ; |
| ; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_gfx12: |
| ; GFX12-NoTrue16: ; %bb.0: |
| ; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0 |
| ; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX12-NoTrue16-NEXT: s_endpgm |
| ; Uniform i16 load from addrspace(1): gfx11 selects a VMEM global_load_d16_b16, |
| ; while gfx12 stays scalar with s_load_u16 (true16 copies into a 16-bit VGPR |
| ; half via v_mov_b16; non-true16 any-extends into a full 32-bit VGPR). |
| %a = load i16, ptr addrspace(1) %ptra |
| store i16 %a, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; gfx11 and older, with true16: S16 16-bit load |
| ; gfx11 and older, without true16: S32 16-bit any-extending load |
| ; both cases require align 4 and a uniform mmo to widen the mmo to a 32-bit load |
| define amdgpu_ps void @load_uniform_P1_i16_align4_widen_mmo_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { |
| ; GFX11-LABEL: load_uniform_P1_i16_align4_widen_mmo_gfx11: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b16_e32 v2.l, s0 |
| ; GFX11-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-True16-LABEL: load_uniform_P1_i16_align4_widen_mmo_gfx11: |
| ; GFX12-True16: ; %bb.0: |
| ; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0 |
| ; GFX12-True16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 |
| ; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX12-True16-NEXT: s_endpgm |
| ; |
| ; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_align4_widen_mmo_gfx11: |
| ; GFX12-NoTrue16: ; %bb.0: |
| ; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0 |
| ; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX12-NoTrue16-NEXT: s_endpgm |
| ; With align 4, gfx11 widens the uniform 16-bit MMO to a 32-bit scalar load |
| ; (s_load_b32); gfx12 has a native scalar s_load_u16 and needs no widening. |
| %a = load i16, ptr addrspace(1) %ptra, align 4 |
| store i16 %a, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; gfx12, S32 8-bit any-extending load, no difference regarding true16 |
| define amdgpu_ps void @load_uniform_P1_i8_any_extending_load(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { |
| ; GFX11-LABEL: load_uniform_P1_i8_any_extending_load: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-NEXT: global_load_u8 v2, v2, s[0:1] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: global_store_b8 v[0:1], v2, off |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P1_i8_any_extending_load: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NEXT: global_store_b8 v[0:1], v2, off |
| ; GFX12-NEXT: s_endpgm |
| ; Uniform i8 load: gfx11 selects a VMEM global_load_u8; gfx12 uses the |
| ; scalar s_load_u8, any-extended into a 32-bit VGPR for the byte store |
| ; (identical output with and without true16, hence the shared GFX12 prefix). |
| %a = load i8, ptr addrspace(1) %ptra |
| store i8 %a, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; gfx11 and older, S32 8-bit any-extending load, no difference regarding true16 |
| define amdgpu_ps void @load_uniform_P1_i8_any_extending_load_align4_widen_mmo_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { |
| ; GFX11-LABEL: load_uniform_P1_i8_any_extending_load_align4_widen_mmo_gfx11: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX11-NEXT: global_store_b8 v[0:1], v2, off |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P1_i8_any_extending_load_align4_widen_mmo_gfx11: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NEXT: global_store_b8 v[0:1], v2, off |
| ; GFX12-NEXT: s_endpgm |
| ; With align 4, gfx11 widens the uniform 8-bit MMO to a 32-bit scalar load |
| ; (s_load_b32); gfx12 uses its native s_load_u8 directly. |
| %a = load i8, ptr addrspace(1) %ptra, align 4 |
| store i8 %a, ptr addrspace(1) %out |
| ret void |
| } |
| |
| define amdgpu_ps void @load_uniform_P1_i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { |
| ; GFX11-LABEL: load_uniform_P1_i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX11-NEXT: global_store_b32 v[0:1], v2, off |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P1_i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NEXT: global_store_b32 v[0:1], v2, off |
| ; GFX12-NEXT: s_endpgm |
| ; Naturally aligned uniform i32: both targets select s_load_b32. |
| %a = load i32, ptr addrspace(1) %ptra |
| store i32 %a, ptr addrspace(1) %out |
| ret void |
| } |
| |
| define amdgpu_ps void @load_uniform_P1_v2i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { |
| ; GFX11-LABEL: load_uniform_P1_v2i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P1_v2i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off |
| ; GFX12-NEXT: s_endpgm |
| ; Uniform <2 x i32>: both targets select s_load_b64 and a single b64 store. |
| %a = load <2 x i32>, ptr addrspace(1) %ptra |
| store <2 x i32> %a, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; gfx11, S96 load with align 16 (default) is widened to an S128 load |
| define amdgpu_ps void @load_uniform_P1_v3i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { |
| ; GFX11-LABEL: load_uniform_P1_v3i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 |
| ; GFX11-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P1_v3i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 |
| ; GFX12-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off |
| ; GFX12-NEXT: s_endpgm |
| ; <3 x i32> at the default align 16: gfx11 widens the S96 load to s_load_b128 |
| ; (s3 unused); gfx12 selects the native s_load_b96. |
| %a = load <3 x i32>, ptr addrspace(1) %ptra |
| store <3 x i32> %a, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; gfx11, S96 load with align 4 is split into an S64 load + an S32 load |
| define amdgpu_ps void @load_uniform_P1_v3i32_align4_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { |
| ; GFX11-LABEL: load_uniform_P1_v3i32_align4_gfx11: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 |
| ; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x8 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 |
| ; GFX11-NEXT: v_mov_b32_e32 v4, s6 |
| ; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P1_v3i32_align4_gfx11: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 |
| ; GFX12-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off |
| ; GFX12-NEXT: s_endpgm |
| ; <3 x i32> at align 4: gfx11 cannot widen to an S128 load, so the S96 load |
| ; is split into clause'd s_load_b64 + s_load_b32; gfx12 still uses s_load_b96. |
| %a = load <3 x i32>, ptr addrspace(1) %ptra, align 4 |
| store <3 x i32> %a, ptr addrspace(1) %out |
| ret void |
| } |
| |
| define amdgpu_ps void @load_uniform_P1_v4i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { |
| ; GFX11-LABEL: load_uniform_P1_v4i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 |
| ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P1_v4i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 |
| ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off |
| ; GFX12-NEXT: s_endpgm |
| ; Uniform <4 x i32>: s_load_b128 on both targets. |
| %a = load <4 x i32>, ptr addrspace(1) %ptra |
| store <4 x i32> %a, ptr addrspace(1) %out |
| ret void |
| } |
| |
| define amdgpu_ps void @load_uniform_P1_v8i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { |
| ; GFX11-LABEL: load_uniform_P1_v8i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 |
| ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 |
| ; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P1_v8i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 |
| ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 |
| ; GFX12-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 |
| ; GFX12-NEXT: s_clause 0x1 |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 |
| ; GFX12-NEXT: s_endpgm |
| ; Uniform <8 x i32>: single s_load_b256, stored back as two clause'd b128 stores. |
| %a = load <8 x i32>, ptr addrspace(1) %ptra |
| store <8 x i32> %a, ptr addrspace(1) %out |
| ret void |
| } |
| |
| define amdgpu_ps void @load_uniform_P1_v16i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { |
| ; GFX11-LABEL: load_uniform_P1_v16i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 |
| ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 |
| ; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 |
| ; GFX11-NEXT: v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10 |
| ; GFX11-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8 |
| ; GFX11-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14 |
| ; GFX11-NEXT: v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12 |
| ; GFX11-NEXT: s_clause 0x3 |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P1_v16i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 |
| ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 |
| ; GFX12-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 |
| ; GFX12-NEXT: v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10 |
| ; GFX12-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8 |
| ; GFX12-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14 |
| ; GFX12-NEXT: v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12 |
| ; GFX12-NEXT: s_clause 0x3 |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 |
| ; GFX12-NEXT: s_endpgm |
| ; Uniform <16 x i32>: single s_load_b512, stored back as four clause'd b128 stores. |
| %a = load <16 x i32>, ptr addrspace(1) %ptra |
| store <16 x i32> %a, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; constant address space, addrspace(4) |
| |
| define amdgpu_ps void @load_uniform_P4_i16_gfx12(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) { |
| ; GFX11-LABEL: load_uniform_P4_i16_gfx12: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-NEXT: global_load_d16_b16 v2, v2, s[0:1] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-True16-LABEL: load_uniform_P4_i16_gfx12: |
| ; GFX12-True16: ; %bb.0: |
| ; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0 |
| ; GFX12-True16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 |
| ; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX12-True16-NEXT: s_endpgm |
| ; |
| ; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_gfx12: |
| ; GFX12-NoTrue16: ; %bb.0: |
| ; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0 |
| ; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX12-NoTrue16-NEXT: s_endpgm |
| ; addrspace(4) variant of the uniform i16 load: same lowering as addrspace(1) — |
| ; gfx11 uses VMEM global_load_d16_b16, gfx12 uses scalar s_load_u16. |
| %a = load i16, ptr addrspace(4) %ptra |
| store i16 %a, ptr addrspace(1) %out |
| ret void |
| } |
| |
| define amdgpu_ps void @load_uniform_P4_i16_align4_widen_mmo_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) { |
| ; GFX11-LABEL: load_uniform_P4_i16_align4_widen_mmo_gfx11: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b16_e32 v2.l, s0 |
| ; GFX11-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-True16-LABEL: load_uniform_P4_i16_align4_widen_mmo_gfx11: |
| ; GFX12-True16: ; %bb.0: |
| ; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0 |
| ; GFX12-True16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 |
| ; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX12-True16-NEXT: s_endpgm |
| ; |
| ; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_align4_widen_mmo_gfx11: |
| ; GFX12-NoTrue16: ; %bb.0: |
| ; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0 |
| ; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX12-NoTrue16-NEXT: s_endpgm |
| ; addrspace(4) variant: with align 4, gfx11 widens the uniform 16-bit MMO |
| ; to s_load_b32; gfx12 uses its native s_load_u16. |
| %a = load i16, ptr addrspace(4) %ptra, align 4 |
| store i16 %a, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; gfx12, S32 8-bit any-extending load, no difference regarding true16 |
| define amdgpu_ps void @load_uniform_P4_i8_any_extending_load(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) { |
| ; GFX11-LABEL: load_uniform_P4_i8_any_extending_load: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-NEXT: global_load_u8 v2, v2, s[0:1] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: global_store_b8 v[0:1], v2, off |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P4_i8_any_extending_load: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NEXT: global_store_b8 v[0:1], v2, off |
| ; GFX12-NEXT: s_endpgm |
| ; addrspace(4) variant of the uniform i8 load: gfx11 uses VMEM global_load_u8, |
| ; gfx12 uses scalar s_load_u8 (identical with and without true16). |
| %a = load i8, ptr addrspace(4) %ptra |
| store i8 %a, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; gfx11 and older, S32 8-bit any-extending load, no difference regarding true16 |
| define amdgpu_ps void @load_uniform_P4_i8_any_extending_load_align4_widen_mmo_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) { |
| ; GFX11-LABEL: load_uniform_P4_i8_any_extending_load_align4_widen_mmo_gfx11: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX11-NEXT: global_store_b8 v[0:1], v2, off |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P4_i8_any_extending_load_align4_widen_mmo_gfx11: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NEXT: global_store_b8 v[0:1], v2, off |
| ; GFX12-NEXT: s_endpgm |
| ; addrspace(4) variant: with align 4, gfx11 widens the uniform 8-bit MMO |
| ; to s_load_b32; gfx12 uses its native s_load_u8. |
| %a = load i8, ptr addrspace(4) %ptra, align 4 |
| store i8 %a, ptr addrspace(1) %out |
| ret void |
| } |
| |
| define amdgpu_ps void @load_uniform_P4_i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) { |
| ; GFX11-LABEL: load_uniform_P4_i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX11-NEXT: global_store_b32 v[0:1], v2, off |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P4_i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NEXT: global_store_b32 v[0:1], v2, off |
| ; GFX12-NEXT: s_endpgm |
| ; addrspace(4) variant: naturally aligned uniform i32 selects s_load_b32 on both targets. |
| %a = load i32, ptr addrspace(4) %ptra |
| store i32 %a, ptr addrspace(1) %out |
| ret void |
| } |
| |
| define amdgpu_ps void @load_uniform_P4_v2i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) { |
| ; GFX11-LABEL: load_uniform_P4_v2i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P4_v2i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off |
| ; GFX12-NEXT: s_endpgm |
| ; addrspace(4) variant: uniform <2 x i32> selects s_load_b64 on both targets. |
| %a = load <2 x i32>, ptr addrspace(4) %ptra |
| store <2 x i32> %a, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; gfx11, S96 load with align 16 (default) is widened to an S128 load |
| define amdgpu_ps void @load_uniform_P4_v3i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) { |
| ; GFX11-LABEL: load_uniform_P4_v3i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 |
| ; GFX11-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P4_v3i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 |
| ; GFX12-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off |
| ; GFX12-NEXT: s_endpgm |
| ; addrspace(4) variant: at default align 16, gfx11 widens the S96 load to |
| ; s_load_b128 (s3 unused); gfx12 selects the native s_load_b96. |
| %a = load <3 x i32>, ptr addrspace(4) %ptra |
| store <3 x i32> %a, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; gfx11, S96 load with align 4 is split into an S64 load + an S32 load |
| define amdgpu_ps void @load_uniform_P4_v3i32_align4_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) { |
| ; GFX11-LABEL: load_uniform_P4_v3i32_align4_gfx11: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 |
| ; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x8 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 |
| ; GFX11-NEXT: v_mov_b32_e32 v4, s6 |
| ; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P4_v3i32_align4_gfx11: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 |
| ; GFX12-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off |
| ; GFX12-NEXT: s_endpgm |
| ; addrspace(4) variant: at align 4, gfx11 cannot widen to an S128 load, so the |
| ; S96 load splits into clause'd s_load_b64 + s_load_b32; gfx12 uses s_load_b96. |
| %a = load <3 x i32>, ptr addrspace(4) %ptra, align 4 |
| store <3 x i32> %a, ptr addrspace(1) %out |
| ret void |
| } |
| |
| |
| define amdgpu_ps void @load_uniform_P4_v4i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) { |
| ; GFX11-LABEL: load_uniform_P4_v4i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 |
| ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P4_v4i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 |
| ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off |
| ; GFX12-NEXT: s_endpgm |
| ; addrspace(4) variant: uniform <4 x i32> selects s_load_b128 on both targets. |
| %a = load <4 x i32>, ptr addrspace(4) %ptra |
| store <4 x i32> %a, ptr addrspace(1) %out |
| ret void |
| } |
| |
| define amdgpu_ps void @load_uniform_P4_v8i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) { |
| ; GFX11-LABEL: load_uniform_P4_v8i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 |
| ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 |
| ; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P4_v8i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 |
| ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 |
| ; GFX12-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 |
| ; GFX12-NEXT: s_clause 0x1 |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 |
| ; GFX12-NEXT: s_endpgm |
| ; addrspace(4) variant: uniform <8 x i32> selects a single s_load_b256, |
| ; stored back as two clause'd b128 stores. |
| %a = load <8 x i32>, ptr addrspace(4) %ptra |
| store <8 x i32> %a, ptr addrspace(1) %out |
| ret void |
| } |
| |
| define amdgpu_ps void @load_uniform_P4_v16i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) { |
| ; GFX11-LABEL: load_uniform_P4_v16i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 |
| ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 |
| ; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 |
| ; GFX11-NEXT: v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10 |
| ; GFX11-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8 |
| ; GFX11-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14 |
| ; GFX11-NEXT: v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12 |
| ; GFX11-NEXT: s_clause 0x3 |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P4_v16i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 |
| ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 |
| ; GFX12-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 |
| ; GFX12-NEXT: v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10 |
| ; GFX12-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8 |
| ; GFX12-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14 |
| ; GFX12-NEXT: v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12 |
| ; GFX12-NEXT: s_clause 0x3 |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 |
| ; GFX12-NEXT: s_endpgm |
| ; addrspace(4) variant: uniform <16 x i32> selects a single s_load_b512, |
| ; stored back as four clause'd b128 stores. |
| %a = load <16 x i32>, ptr addrspace(4) %ptra |
| store <16 x i32> %a, ptr addrspace(1) %out |
| ret void |
| } |