| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 |
| ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90a -mattr=+ptx83 | FileCheck %s |
| ; RUN: %if ptxas-isa-8.3 && ptxas-sm_90a %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90a -mattr=+ptx83 | %ptxas-verify -arch=sm_90a %} |
| |
| ; Test: lowering of llvm.nvvm.tensormap.replace.global.address with an i64 value. |
| ; The intrinsic is called once through a global (addrspace(1)) pointer and once |
| ; through a shared (addrspace(3)) pointer; the expected output uses the .global |
| ; and .shared::cta instruction forms respectively, both with a .b64 operand. |
| define void @tensormap_replace_global_address(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr, i64 %value) { |
| ; CHECK-LABEL: tensormap_replace_global_address( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b64 %rd<4>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_global_address_param_0]; |
| ; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_global_address_param_2]; |
| ; CHECK-NEXT: tensormap.replace.tile.global_address.global.b1024.b64 [%rd1], %rd2; |
| ; CHECK-NEXT: ld.param.b64 %rd3, [tensormap_replace_global_address_param_1]; |
| ; CHECK-NEXT: tensormap.replace.tile.global_address.shared::cta.b1024.b64 [%rd3], %rd2; |
| ; CHECK-NEXT: ret; |
| ; %value feeds both calls, so the expected output loads it once (%rd2) and |
| ; reuses that register for the second instruction. |
| call void @llvm.nvvm.tensormap.replace.global.address.p1(ptr addrspace(1) %global_addr, i64 %value) |
| call void @llvm.nvvm.tensormap.replace.global.address.p3(ptr addrspace(3) %shared_addr, i64 %value) |
| ret void |
| } |
| |
| ; Test: lowering of llvm.nvvm.tensormap.replace.rank with an i32 value. |
| ; One call goes through a global (addrspace(1)) pointer and one through a shared |
| ; (addrspace(3)) pointer; the expected output uses the .global and .shared::cta |
| ; instruction forms respectively, both with a .b32 operand. |
| define void @tensormap_replace_rank(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr, i32 %value) { |
| ; CHECK-LABEL: tensormap_replace_rank( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b32 %r<2>; |
| ; CHECK-NEXT: .reg .b64 %rd<3>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_rank_param_0]; |
| ; CHECK-NEXT: ld.param.b32 %r1, [tensormap_replace_rank_param_2]; |
| ; CHECK-NEXT: tensormap.replace.tile.rank.global.b1024.b32 [%rd1], %r1; |
| ; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_rank_param_1]; |
| ; CHECK-NEXT: tensormap.replace.tile.rank.shared::cta.b1024.b32 [%rd2], %r1; |
| ; CHECK-NEXT: ret; |
| ; %value feeds both calls, so the expected output loads it once (%r1) and |
| ; reuses that register for the second instruction. |
| call void @llvm.nvvm.tensormap.replace.rank.p1(ptr addrspace(1) %global_addr, i32 %value) |
| call void @llvm.nvvm.tensormap.replace.rank.p3(ptr addrspace(3) %shared_addr, i32 %value) |
| ret void |
| } |
| |
| ; Test: lowering of llvm.nvvm.tensormap.replace.box.dim with a constant i32 0 |
| ; middle argument and an i32 value. One call goes through a global |
| ; (addrspace(1)) pointer and one through a shared (addrspace(3)) pointer; the |
| ; expected output uses the .global and .shared::cta forms respectively. |
| define void @tensormap_replace_box_dim(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr, i32 %value) { |
| ; CHECK-LABEL: tensormap_replace_box_dim( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b32 %r<2>; |
| ; CHECK-NEXT: .reg .b64 %rd<3>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_box_dim_param_0]; |
| ; CHECK-NEXT: ld.param.b32 %r1, [tensormap_replace_box_dim_param_2]; |
| ; CHECK-NEXT: tensormap.replace.tile.box_dim.global.b1024.b32 [%rd1], 0, %r1; |
| ; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_box_dim_param_1]; |
| ; CHECK-NEXT: tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [%rd2], 0, %r1; |
| ; CHECK-NEXT: ret; |
| ; The constant i32 0 is expected to be printed as a literal operand between the |
| ; address and the value register; only the value 0 is exercised here. |
| call void @llvm.nvvm.tensormap.replace.box.dim.p1(ptr addrspace(1) %global_addr, i32 0, i32 %value) |
| call void @llvm.nvvm.tensormap.replace.box.dim.p3(ptr addrspace(3) %shared_addr, i32 0, i32 %value) |
| ret void |
| } |
| |
| ; Test: lowering of llvm.nvvm.tensormap.replace.global.dim with a constant i32 0 |
| ; middle argument and an i32 value. One call goes through a global |
| ; (addrspace(1)) pointer and one through a shared (addrspace(3)) pointer; the |
| ; expected output uses the .global and .shared::cta forms respectively. |
| define void @tensormap_replace_global_dim(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr, i32 %value) { |
| ; CHECK-LABEL: tensormap_replace_global_dim( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b32 %r<2>; |
| ; CHECK-NEXT: .reg .b64 %rd<3>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_global_dim_param_0]; |
| ; CHECK-NEXT: ld.param.b32 %r1, [tensormap_replace_global_dim_param_2]; |
| ; CHECK-NEXT: tensormap.replace.tile.global_dim.global.b1024.b32 [%rd1], 0, %r1; |
| ; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_global_dim_param_1]; |
| ; CHECK-NEXT: tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%rd2], 0, %r1; |
| ; CHECK-NEXT: ret; |
| ; The constant i32 0 is expected to be printed as a literal operand between the |
| ; address and the value register; only the value 0 is exercised here. |
| call void @llvm.nvvm.tensormap.replace.global.dim.p1(ptr addrspace(1) %global_addr, i32 0, i32 %value) |
| call void @llvm.nvvm.tensormap.replace.global.dim.p3(ptr addrspace(3) %shared_addr, i32 0, i32 %value) |
| ret void |
| } |
| |
| ; Test: lowering of llvm.nvvm.tensormap.replace.global.stride with a constant |
| ; i32 0 middle argument and an i64 value. One call goes through a global |
| ; (addrspace(1)) pointer and one through a shared (addrspace(3)) pointer; the |
| ; expected output uses the .global and .shared::cta forms respectively, both |
| ; with a .b64 operand. |
| define void @tensormap_replace_global_stride(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr, i64 %value) { |
| ; CHECK-LABEL: tensormap_replace_global_stride( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b64 %rd<4>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_global_stride_param_0]; |
| ; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_global_stride_param_2]; |
| ; CHECK-NEXT: tensormap.replace.tile.global_stride.global.b1024.b64 [%rd1], 0, %rd2; |
| ; CHECK-NEXT: ld.param.b64 %rd3, [tensormap_replace_global_stride_param_1]; |
| ; CHECK-NEXT: tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%rd3], 0, %rd2; |
| ; CHECK-NEXT: ret; |
| ; The constant i32 0 is expected to be printed as a literal operand between the |
| ; address and the 64-bit value register; only the value 0 is exercised here. |
| call void @llvm.nvvm.tensormap.replace.global.stride.p1(ptr addrspace(1) %global_addr, i32 0, i64 %value) |
| call void @llvm.nvvm.tensormap.replace.global.stride.p3(ptr addrspace(3) %shared_addr, i32 0, i64 %value) |
| ret void |
| } |
| |
| ; Test: lowering of llvm.nvvm.tensormap.replace.element.stride with a constant |
| ; i32 0 middle argument and an i32 value. One call goes through a global |
| ; (addrspace(1)) pointer and one through a shared (addrspace(3)) pointer; the |
| ; expected output uses the .global and .shared::cta forms respectively. |
| define void @tensormap_replace_element_stride(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr, i32 %value) { |
| ; CHECK-LABEL: tensormap_replace_element_stride( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b32 %r<2>; |
| ; CHECK-NEXT: .reg .b64 %rd<3>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_element_stride_param_0]; |
| ; CHECK-NEXT: ld.param.b32 %r1, [tensormap_replace_element_stride_param_2]; |
| ; CHECK-NEXT: tensormap.replace.tile.element_stride.global.b1024.b32 [%rd1], 0, %r1; |
| ; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_element_stride_param_1]; |
| ; CHECK-NEXT: tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [%rd2], 0, %r1; |
| ; CHECK-NEXT: ret; |
| ; The constant i32 0 is expected to be printed as a literal operand between the |
| ; address and the value register; only the value 0 is exercised here. |
| call void @llvm.nvvm.tensormap.replace.element.stride.p1(ptr addrspace(1) %global_addr, i32 0, i32 %value) |
| call void @llvm.nvvm.tensormap.replace.element.stride.p3(ptr addrspace(3) %shared_addr, i32 0, i32 %value) |
| ret void |
| } |
| |
| ; Test: lowering of llvm.nvvm.tensormap.replace.elemtype for every constant |
| ; value from 0 through 12. Each value is issued once through the global |
| ; (addrspace(1)) pointer and once through the shared (addrspace(3)) pointer, and |
| ; is expected to appear unchanged as the literal operand of the .global and |
| ; .shared::cta instruction forms. The inline /* elemtype=... */ annotations on |
| ; the calls name the element type each value is meant to select. |
| define void @tensormap_replace_elemtype(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr) { |
| ; CHECK-LABEL: tensormap_replace_elemtype( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b64 %rd<3>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_elemtype_param_0]; |
| ; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 0; |
| ; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_elemtype_param_1]; |
| ; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 0; |
| ; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 1; |
| ; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 1; |
| ; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 2; |
| ; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 2; |
| ; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 3; |
| ; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 3; |
| ; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 4; |
| ; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 4; |
| ; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 5; |
| ; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 5; |
| ; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 6; |
| ; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 6; |
| ; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 7; |
| ; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 7; |
| ; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 8; |
| ; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 8; |
| ; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 9; |
| ; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 9; |
| ; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 10; |
| ; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 10; |
| ; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 11; |
| ; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 11; |
| ; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 12; |
| ; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 12; |
| ; CHECK-NEXT: ret; |
| ; The calls below are grouped in global/shared pairs, one pair per value. The |
| ; expected output loads each pointer parameter once and reuses %rd1 / %rd2 for |
| ; every later pair. |
| call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=u8 */ i32 0) |
| call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=u8 */ i32 0) |
| |
| call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=u16 */ i32 1) |
| call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=u16 */ i32 1) |
| |
| call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=u32 */ i32 2) |
| call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=u32 */ i32 2) |
| |
| call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=s32 */ i32 3) |
| call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=s32 */ i32 3) |
| |
| call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=u64 */ i32 4) |
| call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=u64 */ i32 4) |
| |
| call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=s64 */ i32 5) |
| call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=s64 */ i32 5) |
| |
| call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=f16 */ i32 6) |
| call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=f16 */ i32 6) |
| |
| call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=f32 */ i32 7) |
| call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=f32 */ i32 7) |
| |
| call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=f32.ftz */ i32 8) |
| call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=f32.ftz */ i32 8) |
| |
| call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=f64 */ i32 9) |
| call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=f64 */ i32 9) |
| |
| call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=bf16 */ i32 10) |
| call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=bf16 */ i32 10) |
| |
| call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=tf32 */ i32 11) |
| call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=tf32 */ i32 11) |
| |
| call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=tf32.ftz */ i32 12) |
| call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=tf32.ftz */ i32 12) |
| ret void |
| } |
| |
| ; Test: lowering of llvm.nvvm.tensormap.replace.interleave.layout for the |
| ; constant values 0, 1 and 2. Each value is issued once through the global |
| ; (addrspace(1)) pointer and once through the shared (addrspace(3)) pointer, and |
| ; is expected to appear unchanged as the literal operand of the .global and |
| ; .shared::cta instruction forms. The inline /* interleave_layout=... */ |
| ; annotations on the calls name the layout each value is meant to select. |
| define void @tensormap_replace_interleave_layout(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr) { |
| ; CHECK-LABEL: tensormap_replace_interleave_layout( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b64 %rd<3>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_interleave_layout_param_0]; |
| ; CHECK-NEXT: tensormap.replace.tile.interleave_layout.global.b1024.b32 [%rd1], 0; |
| ; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_interleave_layout_param_1]; |
| ; CHECK-NEXT: tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [%rd2], 0; |
| ; CHECK-NEXT: tensormap.replace.tile.interleave_layout.global.b1024.b32 [%rd1], 1; |
| ; CHECK-NEXT: tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [%rd2], 1; |
| ; CHECK-NEXT: tensormap.replace.tile.interleave_layout.global.b1024.b32 [%rd1], 2; |
| ; CHECK-NEXT: tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [%rd2], 2; |
| ; CHECK-NEXT: ret; |
| ; The calls below are grouped in global/shared pairs, one pair per value. The |
| ; expected output loads each pointer parameter once and reuses %rd1 / %rd2 for |
| ; every later pair. |
| call void @llvm.nvvm.tensormap.replace.interleave.layout.p1(ptr addrspace(1) %global_addr, /* interleave_layout=No interleave */ i32 0) |
| call void @llvm.nvvm.tensormap.replace.interleave.layout.p3(ptr addrspace(3) %shared_addr, /* interleave_layout=No interleave */ i32 0) |
| |
| call void @llvm.nvvm.tensormap.replace.interleave.layout.p1(ptr addrspace(1) %global_addr, /* interleave_layout=16B interleave */ i32 1) |
| call void @llvm.nvvm.tensormap.replace.interleave.layout.p3(ptr addrspace(3) %shared_addr, /* interleave_layout=16B interleave */ i32 1) |
| |
| call void @llvm.nvvm.tensormap.replace.interleave.layout.p1(ptr addrspace(1) %global_addr, /* interleave_layout=32B interleave */ i32 2) |
| call void @llvm.nvvm.tensormap.replace.interleave.layout.p3(ptr addrspace(3) %shared_addr, /* interleave_layout=32B interleave */ i32 2) |
| ret void |
| } |
| |
| ; Test: lowering of llvm.nvvm.tensormap.replace.swizzle.mode for the constant |
| ; values 0 through 3. Each value is issued once through the global |
| ; (addrspace(1)) pointer and once through the shared (addrspace(3)) pointer, and |
| ; is expected to appear unchanged as the literal operand of the .global and |
| ; .shared::cta instruction forms. The inline /* swizzle_mode=... */ annotations |
| ; on the calls name the swizzling each value is meant to select. |
| define void @tensormap_replace_swizzle_mode(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr) { |
| ; CHECK-LABEL: tensormap_replace_swizzle_mode( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b64 %rd<3>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_swizzle_mode_param_0]; |
| ; CHECK-NEXT: tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%rd1], 0; |
| ; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_swizzle_mode_param_1]; |
| ; CHECK-NEXT: tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%rd2], 0; |
| ; CHECK-NEXT: tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%rd1], 1; |
| ; CHECK-NEXT: tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%rd2], 1; |
| ; CHECK-NEXT: tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%rd1], 2; |
| ; CHECK-NEXT: tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%rd2], 2; |
| ; CHECK-NEXT: tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%rd1], 3; |
| ; CHECK-NEXT: tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%rd2], 3; |
| ; CHECK-NEXT: ret; |
| ; The calls below are grouped in global/shared pairs, one pair per value. The |
| ; expected output loads each pointer parameter once and reuses %rd1 / %rd2 for |
| ; every later pair. |
| call void @llvm.nvvm.tensormap.replace.swizzle.mode.p1(ptr addrspace(1) %global_addr, /* swizzle_mode=No swizzling */ i32 0) |
| call void @llvm.nvvm.tensormap.replace.swizzle.mode.p3(ptr addrspace(3) %shared_addr, /* swizzle_mode=No swizzling */ i32 0) |
| |
| call void @llvm.nvvm.tensormap.replace.swizzle.mode.p1(ptr addrspace(1) %global_addr, /* swizzle_mode=32B swizzling */ i32 1) |
| call void @llvm.nvvm.tensormap.replace.swizzle.mode.p3(ptr addrspace(3) %shared_addr, /* swizzle_mode=32B swizzling */ i32 1) |
| |
| call void @llvm.nvvm.tensormap.replace.swizzle.mode.p1(ptr addrspace(1) %global_addr, /* swizzle_mode=64B swizzling */ i32 2) |
| call void @llvm.nvvm.tensormap.replace.swizzle.mode.p3(ptr addrspace(3) %shared_addr, /* swizzle_mode=64B swizzling */ i32 2) |
| |
| call void @llvm.nvvm.tensormap.replace.swizzle.mode.p1(ptr addrspace(1) %global_addr, /* swizzle_mode=128B swizzling */ i32 3) |
| call void @llvm.nvvm.tensormap.replace.swizzle.mode.p3(ptr addrspace(3) %shared_addr, /* swizzle_mode=128B swizzling */ i32 3) |
| ret void |
| } |
| |
| ; Test: lowering of llvm.nvvm.tensormap.replace.fill.mode for the constant |
| ; values 0 and 1. Each value is issued once through the global (addrspace(1)) |
| ; pointer and once through the shared (addrspace(3)) pointer, and is expected to |
| ; appear unchanged as the literal operand of the .global and .shared::cta |
| ; instruction forms. The inline /* fill_mode=... */ annotations on the calls |
| ; name the fill behaviour each value is meant to select. |
| define void @tensormap_replace_fill_mode(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr) { |
| ; CHECK-LABEL: tensormap_replace_fill_mode( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b64 %rd<3>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_fill_mode_param_0]; |
| ; CHECK-NEXT: tensormap.replace.tile.fill_mode.global.b1024.b32 [%rd1], 0; |
| ; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_fill_mode_param_1]; |
| ; CHECK-NEXT: tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [%rd2], 0; |
| ; CHECK-NEXT: tensormap.replace.tile.fill_mode.global.b1024.b32 [%rd1], 1; |
| ; CHECK-NEXT: tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [%rd2], 1; |
| ; CHECK-NEXT: ret; |
| ; The calls below are grouped in global/shared pairs, one pair per value. The |
| ; expected output loads each pointer parameter once and reuses %rd1 / %rd2 for |
| ; the second pair. |
| call void @llvm.nvvm.tensormap.replace.fill.mode.p1(ptr addrspace(1) %global_addr, /* fill_mode=Zero fill */ i32 0) |
| call void @llvm.nvvm.tensormap.replace.fill.mode.p3(ptr addrspace(3) %shared_addr, /* fill_mode=Zero fill */ i32 0) |
| |
| call void @llvm.nvvm.tensormap.replace.fill.mode.p1(ptr addrspace(1) %global_addr, /* fill_mode=OOB-NaN fill */ i32 1) |
| call void @llvm.nvvm.tensormap.replace.fill.mode.p3(ptr addrspace(3) %shared_addr, /* fill_mode=OOB-NaN fill */ i32 1) |
| ret void |
| } |