; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefixes=GFX11 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX12,GFX12-True16 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX12,GFX12-NoTrue16 %s
; global address space, addrspace(1)
; gfx12, with true16: S16 16-bit load
; gfx12, without true16: S32 16-bit any-extending load
define amdgpu_ps void @load_uniform_P1_i16_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P1_i16_gfx12:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: global_load_d16_b16 v2, v2, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
;
; GFX12-True16-LABEL: load_uniform_P1_i16_gfx12:
; GFX12-True16: ; %bb.0:
; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0
; GFX12-True16-NEXT: s_wait_kmcnt 0x0
; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0
; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-True16-NEXT: s_endpgm
;
; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_gfx12:
; GFX12-NoTrue16: ; %bb.0:
; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0
; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0
; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-NoTrue16-NEXT: s_endpgm
; Uniform i16 load from global (p1): per the checks, GFX11 selects a VMEM
; global_load_d16_b16, while GFX12 selects a scalar s_load_u16 (true16 copies
; via v_mov_b16 to v2.l; non-true16 via a 32-bit v_mov).
%a = load i16, ptr addrspace(1) %ptra
store i16 %a, ptr addrspace(1) %out
ret void
}
; gfx11 and older, with true16: S16 16-bit load
; gfx11 and older, without true16: S32 16-bit any-extending load
; both cases require align 4 and a uniform MMO to widen the MMO to a 32-bit load
define amdgpu_ps void @load_uniform_P1_i16_align4_widen_mmo_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P1_i16_align4_widen_mmo_gfx11:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b16_e32 v2.l, s0
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
;
; GFX12-True16-LABEL: load_uniform_P1_i16_align4_widen_mmo_gfx11:
; GFX12-True16: ; %bb.0:
; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0
; GFX12-True16-NEXT: s_wait_kmcnt 0x0
; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0
; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-True16-NEXT: s_endpgm
;
; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_align4_widen_mmo_gfx11:
; GFX12-NoTrue16: ; %bb.0:
; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0
; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0
; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-NoTrue16-NEXT: s_endpgm
; With align 4, the checks show GFX11 widens the uniform i16 load to a scalar
; s_load_b32; GFX12 keeps its native s_load_u16 (no widening needed).
%a = load i16, ptr addrspace(1) %ptra, align 4
store i16 %a, ptr addrspace(1) %out
ret void
}
; gfx12, S32 8-bit any-extending load, no difference regarding true16
define amdgpu_ps void @load_uniform_P1_i8_any_extending_load(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P1_i8_any_extending_load:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: global_load_u8 v2, v2, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P1_i8_any_extending_load:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_u8 s0, s[0:1], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: global_store_b8 v[0:1], v2, off
; GFX12-NEXT: s_endpgm
; Uniform i8 load from global (p1): GFX11 selects a VMEM global_load_u8,
; while GFX12 selects a scalar s_load_u8 (same output for both true16 modes).
%a = load i8, ptr addrspace(1) %ptra
store i8 %a, ptr addrspace(1) %out
ret void
}
; gfx11 and older, S32 8-bit any-extending load, no difference regarding true16
define amdgpu_ps void @load_uniform_P1_i8_any_extending_load_align4_widen_mmo_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P1_i8_any_extending_load_align4_widen_mmo_gfx11:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NEXT: global_store_b8 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P1_i8_any_extending_load_align4_widen_mmo_gfx11:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_u8 s0, s[0:1], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: global_store_b8 v[0:1], v2, off
; GFX12-NEXT: s_endpgm
; With align 4, GFX11 widens the uniform i8 load to a scalar s_load_b32;
; GFX12 keeps its native s_load_u8.
%a = load i8, ptr addrspace(1) %ptra, align 4
store i8 %a, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @load_uniform_P1_i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P1_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P1_i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_endpgm
; Uniform i32 load from global (p1) selects a scalar s_load_b32 on both targets.
%a = load i32, ptr addrspace(1) %ptra
store i32 %a, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @load_uniform_P1_v2i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P1_v2i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P1_v2i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT: s_endpgm
; Uniform <2 x i32> load from global (p1) selects s_load_b64 on both targets.
%a = load <2 x i32>, ptr addrspace(1) %ptra
store <2 x i32> %a, ptr addrspace(1) %out
ret void
}
; gfx11, S96 load with align 16 (default) is widened to a S128 load
define amdgpu_ps void @load_uniform_P1_v3i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P1_v3i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
; GFX11-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P1_v3i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off
; GFX12-NEXT: s_endpgm
; Uniform <3 x i32> load at default (16-byte) alignment: GFX11 widens the
; 96-bit load to s_load_b128; GFX12 uses its native s_load_b96.
%a = load <3 x i32>, ptr addrspace(1) %ptra
store <3 x i32> %a, ptr addrspace(1) %out
ret void
}
; gfx11, S96 load with align 4 is split into S64 + S32 loads
define amdgpu_ps void @load_uniform_P1_v3i32_align4_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P1_v3i32_align4_gfx11:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x8
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX11-NEXT: v_mov_b32_e32 v4, s6
; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P1_v3i32_align4_gfx11:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off
; GFX12-NEXT: s_endpgm
; With align 4, GFX11 cannot widen to a 128-bit load; the checks show the
; 96-bit load split into s_load_b64 + s_load_b32. GFX12 still emits one
; s_load_b96.
%a = load <3 x i32>, ptr addrspace(1) %ptra, align 4
store <3 x i32> %a, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @load_uniform_P1_v4i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P1_v4i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P1_v4i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX12-NEXT: s_endpgm
; Uniform <4 x i32> load from global (p1) selects s_load_b128 on both targets.
%a = load <4 x i32>, ptr addrspace(1) %ptra
store <4 x i32> %a, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @load_uniform_P1_v8i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P1_v8i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P1_v8i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
; GFX12-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
; GFX12-NEXT: s_endpgm
; Uniform <8 x i32> load selects a single s_load_b256 on both targets; the
; divergent store side is split into two 128-bit global stores.
%a = load <8 x i32>, ptr addrspace(1) %ptra
store <8 x i32> %a, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @load_uniform_P1_v16i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P1_v16i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
; GFX11-NEXT: v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10
; GFX11-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8
; GFX11-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14
; GFX11-NEXT: v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P1_v16i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b512 s[0:15], s[0:1], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
; GFX12-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
; GFX12-NEXT: v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10
; GFX12-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8
; GFX12-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14
; GFX12-NEXT: v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12
; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
; GFX12-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
; GFX12-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
; GFX12-NEXT: s_endpgm
; Uniform <16 x i32> load selects a single s_load_b512 on both targets; the
; store side is split into four 128-bit global stores.
%a = load <16 x i32>, ptr addrspace(1) %ptra
store <16 x i32> %a, ptr addrspace(1) %out
ret void
}
; constant address space, addrspace(4)
define amdgpu_ps void @load_uniform_P4_i16_gfx12(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P4_i16_gfx12:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: global_load_d16_b16 v2, v2, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
;
; GFX12-True16-LABEL: load_uniform_P4_i16_gfx12:
; GFX12-True16: ; %bb.0:
; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0
; GFX12-True16-NEXT: s_wait_kmcnt 0x0
; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0
; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-True16-NEXT: s_endpgm
;
; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_gfx12:
; GFX12-NoTrue16: ; %bb.0:
; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0
; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0
; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-NoTrue16-NEXT: s_endpgm
; Same as the p1 i16 case but loading from constant address space (p4):
; GFX11 selects a VMEM d16 load, GFX12 a scalar s_load_u16.
%a = load i16, ptr addrspace(4) %ptra
store i16 %a, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @load_uniform_P4_i16_align4_widen_mmo_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P4_i16_align4_widen_mmo_gfx11:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b16_e32 v2.l, s0
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
;
; GFX12-True16-LABEL: load_uniform_P4_i16_align4_widen_mmo_gfx11:
; GFX12-True16: ; %bb.0:
; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0
; GFX12-True16-NEXT: s_wait_kmcnt 0x0
; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0
; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-True16-NEXT: s_endpgm
;
; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_align4_widen_mmo_gfx11:
; GFX12-NoTrue16: ; %bb.0:
; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0
; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0
; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-NoTrue16-NEXT: s_endpgm
; With align 4, GFX11 widens the i16 load from constant space (p4) to
; s_load_b32; GFX12 keeps its native s_load_u16.
%a = load i16, ptr addrspace(4) %ptra, align 4
store i16 %a, ptr addrspace(1) %out
ret void
}
; gfx12, S32 8-bit any-extending load, no difference regarding true16
define amdgpu_ps void @load_uniform_P4_i8_any_extending_load(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P4_i8_any_extending_load:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: global_load_u8 v2, v2, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P4_i8_any_extending_load:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_u8 s0, s[0:1], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: global_store_b8 v[0:1], v2, off
; GFX12-NEXT: s_endpgm
; Uniform i8 load from constant space (p4): GFX11 selects a VMEM
; global_load_u8, GFX12 a scalar s_load_u8 (same for both true16 modes).
%a = load i8, ptr addrspace(4) %ptra
store i8 %a, ptr addrspace(1) %out
ret void
}
; gfx11 and older, S32 8-bit any-extending load, no difference regarding true16
define amdgpu_ps void @load_uniform_P4_i8_any_extending_load_align4_widen_mmo_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P4_i8_any_extending_load_align4_widen_mmo_gfx11:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NEXT: global_store_b8 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P4_i8_any_extending_load_align4_widen_mmo_gfx11:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_u8 s0, s[0:1], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: global_store_b8 v[0:1], v2, off
; GFX12-NEXT: s_endpgm
; With align 4, GFX11 widens the i8 load from constant space (p4) to a scalar
; s_load_b32; GFX12 keeps its native s_load_u8.
%a = load i8, ptr addrspace(4) %ptra, align 4
store i8 %a, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @load_uniform_P4_i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P4_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P4_i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_endpgm
; Uniform i32 load from constant space (p4) selects s_load_b32 on both targets.
%a = load i32, ptr addrspace(4) %ptra
store i32 %a, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @load_uniform_P4_v2i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P4_v2i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P4_v2i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT: s_endpgm
; Uniform <2 x i32> load from constant space (p4) selects s_load_b64 on both
; targets.
%a = load <2 x i32>, ptr addrspace(4) %ptra
store <2 x i32> %a, ptr addrspace(1) %out
ret void
}
; gfx11, S96 load with align 16 (default) is widened to a S128 load
define amdgpu_ps void @load_uniform_P4_v3i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P4_v3i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
; GFX11-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P4_v3i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off
; GFX12-NEXT: s_endpgm
; Uniform <3 x i32> load from constant space (p4) at default (16-byte)
; alignment: GFX11 widens to s_load_b128; GFX12 uses its native s_load_b96.
%a = load <3 x i32>, ptr addrspace(4) %ptra
store <3 x i32> %a, ptr addrspace(1) %out
ret void
}
; gfx11, S96 load with align 4 is split into S64 + S32 loads
define amdgpu_ps void @load_uniform_P4_v3i32_align4_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P4_v3i32_align4_gfx11:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x8
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX11-NEXT: v_mov_b32_e32 v4, s6
; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P4_v3i32_align4_gfx11:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off
; GFX12-NEXT: s_endpgm
; With align 4, GFX11 splits the 96-bit constant-space load into
; s_load_b64 + s_load_b32; GFX12 still emits a single s_load_b96.
%a = load <3 x i32>, ptr addrspace(4) %ptra, align 4
store <3 x i32> %a, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @load_uniform_P4_v4i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P4_v4i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P4_v4i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX12-NEXT: s_endpgm
; Uniform <4 x i32> load from constant space (p4) selects s_load_b128 on both
; targets.
%a = load <4 x i32>, ptr addrspace(4) %ptra
store <4 x i32> %a, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @load_uniform_P4_v8i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P4_v8i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P4_v8i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
; GFX12-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
; GFX12-NEXT: s_endpgm
; Uniform <8 x i32> load from constant space (p4) selects one s_load_b256 on
; both targets; the store side is split into two 128-bit global stores.
%a = load <8 x i32>, ptr addrspace(4) %ptra
store <8 x i32> %a, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @load_uniform_P4_v16i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P4_v16i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
; GFX11-NEXT: v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10
; GFX11-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8
; GFX11-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14
; GFX11-NEXT: v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P4_v16i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b512 s[0:15], s[0:1], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
; GFX12-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
; GFX12-NEXT: v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10
; GFX12-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8
; GFX12-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14
; GFX12-NEXT: v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12
; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
; GFX12-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
; GFX12-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
; GFX12-NEXT: s_endpgm
; Uniform <16 x i32> load from constant space (p4) selects one s_load_b512 on
; both targets; the store side is split into four 128-bit global stores.
%a = load <16 x i32>, ptr addrspace(4) %ptra
store <16 x i32> %a, ptr addrspace(1) %out
ret void
}