| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -misched-cluster=0 < %s | FileCheck -check-prefix=GCN %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -misched-cluster=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACTCUTOFF %s |
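; The EXACTCUTOFF run raises the IGroupLP exact solver's branch budget; both
; prefixes expect the same pipelined schedule for these kernels.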
| |
declare i32 @llvm.amdgcn.workitem.id.x()
declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32)
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
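; llvm.amdgcn.sched.group.barrier takes (mask, size, SyncID). The mask bits
; used below select instruction classes: 0x8 = MFMA/WMMA/SWMMAC,
; 0x100 = DS read, 0x200 = DS write.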
| |
| define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { |
| ; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster: |
| ; GCN: ; %bb.0: ; %entry |
| ; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 |
| ; GCN-NEXT: v_mov_b32_e32 v48, 0 |
| ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) |
| ; GCN-NEXT: v_and_b32_e32 v28, 0x3ff0, v0 |
| ; GCN-NEXT: s_wait_kmcnt 0x0 |
| ; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v28 |
| ; GCN-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28 |
| ; GCN-NEXT: ds_load_b128 v[8:11], v0 |
| ; GCN-NEXT: ds_load_b128 v[12:15], v0 offset:512 |
| ; GCN-NEXT: ds_load_b128 v[16:19], v0 offset:1536 |
| ; GCN-NEXT: ds_load_b128 v[20:23], v0 offset:3072 |
| ; GCN-NEXT: ds_load_b128 v[24:27], v0 offset:5120 |
| ; GCN-NEXT: ds_load_b128 v[4:7], v0 offset:11280 |
| ; GCN-NEXT: ds_load_b128 v[0:3], v0 offset:11264 |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(7) SyncID(0) |
| ; GCN-NEXT: s_wait_dscnt 0x6 |
| ; GCN-NEXT: v_mov_b32_e32 v31, v11 |
| ; GCN-NEXT: s_wait_dscnt 0x5 |
| ; GCN-NEXT: v_mov_b32_e32 v35, v15 |
| ; GCN-NEXT: s_wait_dscnt 0x4 |
| ; GCN-NEXT: v_mov_b32_e32 v39, v19 |
| ; GCN-NEXT: s_wait_dscnt 0x3 |
| ; GCN-NEXT: v_mov_b32_e32 v43, v23 |
| ; GCN-NEXT: s_wait_dscnt 0x2 |
| ; GCN-NEXT: v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10 |
| ; GCN-NEXT: v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8 |
| ; GCN-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13 |
| ; GCN-NEXT: v_mov_b32_e32 v32, v12 |
| ; GCN-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17 |
| ; GCN-NEXT: v_mov_b32_e32 v36, v16 |
| ; GCN-NEXT: v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21 |
| ; GCN-NEXT: v_mov_b32_e32 v40, v20 |
| ; GCN-NEXT: v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25 |
| ; GCN-NEXT: v_mov_b32_e32 v44, v24 |
| ; GCN-NEXT: s_wait_dscnt 0x0 |
| ; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48 |
| ; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48 |
| ; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[36:39], v[16:19], v[0:7], v48 |
| ; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[40:43], v[20:23], v[0:7], v48 |
| ; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[44:47], v[24:27], v[0:7], v48 |
| ; GCN-NEXT: ds_store_b128 v49, v[28:31] |
| ; GCN-NEXT: ds_store_b128 v50, v[32:35] offset:512 |
| ; GCN-NEXT: ds_store_b128 v50, v[36:39] offset:1024 |
| ; GCN-NEXT: ds_store_b128 v50, v[40:43] offset:1536 |
| ; GCN-NEXT: ds_store_b128 v50, v[44:47] offset:2048 |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0) |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(5) SyncID(0) |
| ; GCN-NEXT: s_endpgm |
| ; |
| ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster: |
| ; EXACTCUTOFF: ; %bb.0: ; %entry |
| ; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 4, v0 |
| ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v48, 0 |
| ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) |
| ; EXACTCUTOFF-NEXT: v_and_b32_e32 v28, 0x3ff0, v0 |
| ; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0 |
| ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v0, s0, v28 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v0 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[12:15], v0 offset:512 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[16:19], v0 offset:1536 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[20:23], v0 offset:3072 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[24:27], v0 offset:5120 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v0 offset:11280 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v0 offset:11264 |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(7) SyncID(0) |
| ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x6 |
| ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v31, v11 |
| ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x5 |
| ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v35, v15 |
| ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x4 |
| ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v39, v19 |
| ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x3 |
| ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v43, v23 |
| ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x2 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13 |
| ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v32, v12 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17 |
| ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v36, v16 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21 |
| ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v40, v20 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25 |
| ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v44, v24 |
| ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 |
| ; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48 |
| ; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48 |
| ; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[36:39], v[16:19], v[0:7], v48 |
| ; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[40:43], v[20:23], v[0:7], v48 |
| ; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[44:47], v[24:27], v[0:7], v48 |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v49, v[28:31] |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v50, v[32:35] offset:512 |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v50, v[36:39] offset:1024 |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v50, v[40:43] offset:1536 |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v50, v[44:47] offset:2048 |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0) |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(5) SyncID(0) |
| ; EXACTCUTOFF-NEXT: s_endpgm |
| entry: |
| %idx = call i32 @llvm.amdgcn.workitem.id.x() |
| %load.0.addr = getelementptr <8 x half>, ptr addrspace(3) %in, i32 %idx |
| %load.0 = load <8 x half>, ptr addrspace(3) %load.0.addr |
| %load.1.addr = getelementptr <8 x half>, ptr addrspace(3) %load.0.addr, i32 32 |
| %load.1 = load <8 x half>, ptr addrspace(3) %load.1.addr |
| %load.2.addr = getelementptr <8 x half>, ptr addrspace(3) %load.1.addr, i32 64 |
| %load.2 = load <8 x half>, ptr addrspace(3) %load.2.addr |
| %load.3.addr = getelementptr <8 x half>, ptr addrspace(3) %load.2.addr, i32 96 |
| %load.3 = load <8 x half>, ptr addrspace(3) %load.3.addr |
| %load.4.addr = getelementptr <8 x half>, ptr addrspace(3) %load.3.addr, i32 128 |
| %load.4 = load <8 x half>, ptr addrspace(3) %load.4.addr |
| %load.b.addr = getelementptr <16 x half>, ptr addrspace(3) %load.4.addr, i32 192 |
| %load.b = load <16 x half>, ptr addrspace(3) %load.b.addr |
%mai.0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.0, <16 x half> %load.b, <8 x half> %load.0, i16 0)
%mai.1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.1, <16 x half> %load.b, <8 x half> %load.1, i16 0)
%mai.2 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.2, <16 x half> %load.b, <8 x half> %load.2, i16 0)
%mai.3 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.3, <16 x half> %load.b, <8 x half> %load.3, i16 0)
%mai.4 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.4, <16 x half> %load.b, <8 x half> %load.4, i16 0)
| %store.0.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 %idx |
| store <8 x half> %mai.0, ptr addrspace(3) %store.0.addr |
| %store.1.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 32 |
| store <8 x half> %mai.1, ptr addrspace(3) %store.1.addr |
| %store.2.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 64 |
| store <8 x half> %mai.2, ptr addrspace(3) %store.2.addr |
| %store.3.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 96 |
| store <8 x half> %mai.3, ptr addrspace(3) %store.3.addr |
| %store.4.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 128 |
| store <8 x half> %mai.4, ptr addrspace(3) %store.4.addr |
| ; 7 DS read |
| call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 7, i32 0) |
| ; 5 SWMMAC |
| call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 5, i32 0) |
| ; 5 DS write |
| call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 5, i32 0) |
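; Together the three groups pin the pipeline: all 7 DS reads first, then the
; 5 SWMMACs, then the 5 DS writes, as the CHECK lines above reflect.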
| ret void |
| } |
| |
| define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { |
| ; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved: |
| ; GCN: ; %bb.0: ; %entry |
| ; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| ; GCN-NEXT: v_and_b32_e32 v16, 0x3ff, v0 |
| ; GCN-NEXT: v_mov_b32_e32 v18, 0 |
| ; GCN-NEXT: s_wait_kmcnt 0x0 |
| ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GCN-NEXT: v_lshl_add_u32 v17, v16, 5, s0 |
| ; GCN-NEXT: v_lshl_add_u32 v16, v16, 4, s1 |
| ; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:1024 |
| ; GCN-NEXT: ds_load_b128 v[0:3], v17 |
| ; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:16 |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(3) SyncID(0) |
| ; GCN-NEXT: s_wait_dscnt 0x2 |
| ; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 |
| ; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 |
| ; GCN-NEXT: s_wait_dscnt 0x0 |
| ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) |
| ; GCN-NEXT: ds_store_b128 v16, v[12:15] |
| ; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:2560 |
| ; GCN-NEXT: v_mov_b32_e32 v16, s1 |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) |
| ; GCN-NEXT: s_wait_dscnt 0x0 |
| ; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 |
| ; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 |
| ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) |
| ; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:512 |
| ; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:4608 |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) |
| ; GCN-NEXT: s_wait_dscnt 0x0 |
| ; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 |
| ; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 |
| ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) |
| ; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:1024 |
| ; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:7168 |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) |
| ; GCN-NEXT: s_wait_dscnt 0x0 |
| ; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 |
| ; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 |
| ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) |
| ; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:1536 |
| ; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:10240 |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) |
| ; GCN-NEXT: s_wait_dscnt 0x0 |
| ; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 |
| ; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 |
| ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) |
| ; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:2048 |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) |
| ; GCN-NEXT: s_endpgm |
| ; |
| ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved: |
| ; EXACTCUTOFF: ; %bb.0: ; %entry |
| ; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| ; EXACTCUTOFF-NEXT: v_and_b32_e32 v16, 0x3ff, v0 |
| ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v18, 0 |
| ; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0 |
| ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; EXACTCUTOFF-NEXT: v_lshl_add_u32 v17, v16, 5, s0 |
| ; EXACTCUTOFF-NEXT: v_lshl_add_u32 v16, v16, 4, s1 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:1024 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:16 |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(3) SyncID(0) |
| ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x2 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 |
| ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 |
| ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:2560 |
| ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v16, s1 |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) |
| ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 |
| ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:512 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:4608 |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) |
| ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 |
| ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:1024 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:7168 |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) |
| ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 |
| ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:1536 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:10240 |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) |
| ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 |
| ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:2048 |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) |
| ; EXACTCUTOFF-NEXT: s_endpgm |
| entry: |
| %idx = call i32 @llvm.amdgcn.workitem.id.x() |
| %load.b.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx |
| %load.b = load <16 x half>, ptr addrspace(3) %load.b.addr |
| %load.0.addr = getelementptr <8 x half>, ptr addrspace(3) %load.b.addr, i32 64 |
| %load.0 = load <8 x half>, ptr addrspace(3) %load.0.addr |
| %load.1.addr = getelementptr <8 x half>, ptr addrspace(3) %load.0.addr, i32 96 |
| %load.1 = load <8 x half>, ptr addrspace(3) %load.1.addr |
| %load.2.addr = getelementptr <8 x half>, ptr addrspace(3) %load.1.addr, i32 128 |
| %load.2 = load <8 x half>, ptr addrspace(3) %load.2.addr |
| %load.3.addr = getelementptr <8 x half>, ptr addrspace(3) %load.2.addr, i32 160 |
| %load.3 = load <8 x half>, ptr addrspace(3) %load.3.addr |
| %load.4.addr = getelementptr <8 x half>, ptr addrspace(3) %load.3.addr, i32 192 |
| %load.4 = load <8 x half>, ptr addrspace(3) %load.4.addr |
%mai.0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.0, <16 x half> %load.b, <8 x half> %load.0, i16 0)
%mai.1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.1, <16 x half> %load.b, <8 x half> %load.1, i16 0)
%mai.2 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.2, <16 x half> %load.b, <8 x half> %load.2, i16 0)
%mai.3 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.3, <16 x half> %load.b, <8 x half> %load.3, i16 0)
%mai.4 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.4, <16 x half> %load.b, <8 x half> %load.4, i16 0)
| %store.0.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 %idx |
| store <8 x half> %mai.0, ptr addrspace(3) %store.0.addr |
| %store.1.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 32 |
| store <8 x half> %mai.1, ptr addrspace(3) %store.1.addr |
| %store.2.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 64 |
| store <8 x half> %mai.2, ptr addrspace(3) %store.2.addr |
| %store.3.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 96 |
| store <8 x half> %mai.3, ptr addrspace(3) %store.3.addr |
| %store.4.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 128 |
| store <8 x half> %mai.4, ptr addrspace(3) %store.4.addr |
| ; 3 DS read |
| call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 3, i32 0) |
| ; 1 SWMMAC |
| call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) |
| ; 1 DS write |
| call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0) |
| ; 1 DS read |
| call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0) |
| ; 1 SWMMAC |
| call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) |
| ; 1 DS write |
| call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0) |
| ; 1 DS read |
| call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0) |
| ; 1 SWMMAC |
| call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) |
| ; 1 DS write |
| call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0) |
| ; 1 DS read |
| call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0) |
| ; 1 SWMMAC |
| call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) |
| ; 1 DS write |
| call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0) |
| ; 1 DS read |
| call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0) |
| ; 1 SWMMAC |
| call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) |
| ; 1 DS write |
| call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0) |
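; The five (1 DS read, 1 SWMMAC, 1 DS write) triples interleave the
; iterations so each SWMMAC's operand load and result store stay adjacent,
; as the CHECK lines above reflect.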
| ret void |
| } |