; blob: d81167603d71391d13cb5fc633f864b98a3f6f73 [file] [edit]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90a -mattr=+ptx83 | FileCheck %s
; RUN: %if ptxas-isa-8.3 && ptxas-sm_90a %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90a -mattr=+ptx83 | %ptxas-verify -arch=sm_90a %}
; Lowering test for llvm.nvvm.tensormap.replace.global.address.
; The intrinsic is called once per pointer overload with the same i64 %value:
;   .p1 (ptr addrspace(1)) is expected to produce the ".global" PTX form,
;   .p3 (ptr addrspace(3)) is expected to produce the ".shared::cta" PTX form.
; Both expected instructions use the .b1024.b64 suffix and read %value from the
; single register loaded from param 2.
define void @tensormap_replace_global_address(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr, i64 %value) {
; CHECK-LABEL: tensormap_replace_global_address(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_global_address_param_0];
; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_global_address_param_2];
; CHECK-NEXT: tensormap.replace.tile.global_address.global.b1024.b64 [%rd1], %rd2;
; CHECK-NEXT: ld.param.b64 %rd3, [tensormap_replace_global_address_param_1];
; CHECK-NEXT: tensormap.replace.tile.global_address.shared::cta.b1024.b64 [%rd3], %rd2;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tensormap.replace.global.address.p1(ptr addrspace(1) %global_addr, i64 %value)
call void @llvm.nvvm.tensormap.replace.global.address.p3(ptr addrspace(3) %shared_addr, i64 %value)
ret void
}
; Lowering test for llvm.nvvm.tensormap.replace.rank.
; The intrinsic is called on the .p1 (addrspace(1)) and .p3 (addrspace(3))
; overloads with the same i32 %value; the expected PTX is the ".global" and
; ".shared::cta" form of tensormap.replace.tile.rank respectively, both with
; the .b1024.b32 suffix and the value passed in a 32-bit register.
define void @tensormap_replace_rank(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr, i32 %value) {
; CHECK-LABEL: tensormap_replace_rank(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_rank_param_0];
; CHECK-NEXT: ld.param.b32 %r1, [tensormap_replace_rank_param_2];
; CHECK-NEXT: tensormap.replace.tile.rank.global.b1024.b32 [%rd1], %r1;
; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_rank_param_1];
; CHECK-NEXT: tensormap.replace.tile.rank.shared::cta.b1024.b32 [%rd2], %r1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tensormap.replace.rank.p1(ptr addrspace(1) %global_addr, i32 %value)
call void @llvm.nvvm.tensormap.replace.rank.p3(ptr addrspace(3) %shared_addr, i32 %value)
ret void
}
; Lowering test for llvm.nvvm.tensormap.replace.box.dim.
; Each overload (.p1 / .p3) takes the tensormap pointer, a constant i32 0 and
; the runtime i32 %value. The expected PTX prints the constant as an immediate
; between the address operand and the value register, in the ".global" and
; ".shared::cta" forms respectively.
; NOTE(review): the constant presumably selects which dimension is replaced;
; only the value 0 is exercised here - confirm whether other values need
; coverage.
define void @tensormap_replace_box_dim(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr, i32 %value) {
; CHECK-LABEL: tensormap_replace_box_dim(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_box_dim_param_0];
; CHECK-NEXT: ld.param.b32 %r1, [tensormap_replace_box_dim_param_2];
; CHECK-NEXT: tensormap.replace.tile.box_dim.global.b1024.b32 [%rd1], 0, %r1;
; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_box_dim_param_1];
; CHECK-NEXT: tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [%rd2], 0, %r1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tensormap.replace.box.dim.p1(ptr addrspace(1) %global_addr, i32 0, i32 %value)
call void @llvm.nvvm.tensormap.replace.box.dim.p3(ptr addrspace(3) %shared_addr, i32 0, i32 %value)
ret void
}
; Lowering test for llvm.nvvm.tensormap.replace.global.dim.
; Same shape as the box_dim test: a constant i32 0 followed by the runtime
; i32 %value, called on the .p1 and .p3 overloads. The expected PTX emits the
; constant as an immediate before the 32-bit value register, using the
; ".global" and ".shared::cta" forms with the .b1024.b32 suffix.
; NOTE(review): only the constant 0 is exercised for the middle operand.
define void @tensormap_replace_global_dim(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr, i32 %value) {
; CHECK-LABEL: tensormap_replace_global_dim(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_global_dim_param_0];
; CHECK-NEXT: ld.param.b32 %r1, [tensormap_replace_global_dim_param_2];
; CHECK-NEXT: tensormap.replace.tile.global_dim.global.b1024.b32 [%rd1], 0, %r1;
; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_global_dim_param_1];
; CHECK-NEXT: tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%rd2], 0, %r1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tensormap.replace.global.dim.p1(ptr addrspace(1) %global_addr, i32 0, i32 %value)
call void @llvm.nvvm.tensormap.replace.global.dim.p3(ptr addrspace(3) %shared_addr, i32 0, i32 %value)
ret void
}
; Lowering test for llvm.nvvm.tensormap.replace.global.stride.
; Unlike the other three-operand tests in this file, the runtime value is i64,
; so the expected PTX uses the .b1024.b64 suffix and a 64-bit value register.
; The constant i32 0 middle argument is expected as an immediate; the .p1 and
; .p3 overloads map to the ".global" and ".shared::cta" forms.
; NOTE(review): only the constant 0 is exercised for the middle operand.
define void @tensormap_replace_global_stride(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr, i64 %value) {
; CHECK-LABEL: tensormap_replace_global_stride(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_global_stride_param_0];
; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_global_stride_param_2];
; CHECK-NEXT: tensormap.replace.tile.global_stride.global.b1024.b64 [%rd1], 0, %rd2;
; CHECK-NEXT: ld.param.b64 %rd3, [tensormap_replace_global_stride_param_1];
; CHECK-NEXT: tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%rd3], 0, %rd2;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tensormap.replace.global.stride.p1(ptr addrspace(1) %global_addr, i32 0, i64 %value)
call void @llvm.nvvm.tensormap.replace.global.stride.p3(ptr addrspace(3) %shared_addr, i32 0, i64 %value)
ret void
}
; Lowering test for llvm.nvvm.tensormap.replace.element.stride.
; Called on the .p1 and .p3 overloads with a constant i32 0 and the runtime
; i32 %value. The expected PTX is tensormap.replace.tile.element_stride in the
; ".global" and ".shared::cta" forms (.b1024.b32), with the constant printed as
; an immediate ahead of the value register.
; NOTE(review): only the constant 0 is exercised for the middle operand.
define void @tensormap_replace_element_stride(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr, i32 %value) {
; CHECK-LABEL: tensormap_replace_element_stride(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_element_stride_param_0];
; CHECK-NEXT: ld.param.b32 %r1, [tensormap_replace_element_stride_param_2];
; CHECK-NEXT: tensormap.replace.tile.element_stride.global.b1024.b32 [%rd1], 0, %r1;
; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_element_stride_param_1];
; CHECK-NEXT: tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [%rd2], 0, %r1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tensormap.replace.element.stride.p1(ptr addrspace(1) %global_addr, i32 0, i32 %value)
call void @llvm.nvvm.tensormap.replace.element.stride.p3(ptr addrspace(3) %shared_addr, i32 0, i32 %value)
ret void
}
; Lowering test for llvm.nvvm.tensormap.replace.elemtype.
; There is no runtime value: the second argument is a constant i32. The body
; walks the constants 0 through 12 and, for each one, calls the .p1 overload
; followed by the .p3 overload. The inline /* elemtype=... */ comments on the
; calls name the element type each constant stands for.
; The expected PTX prints each constant as an immediate on the ".global" and
; ".shared::cta" forms (.b1024.b32). Each pointer parameter is loaded once and
; its register is reused by every later instruction of the same form.
define void @tensormap_replace_elemtype(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr) {
; CHECK-LABEL: tensormap_replace_elemtype(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_elemtype_param_0];
; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 0;
; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_elemtype_param_1];
; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 0;
; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 1;
; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 1;
; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 2;
; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 2;
; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 3;
; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 3;
; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 4;
; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 4;
; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 5;
; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 5;
; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 6;
; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 6;
; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 7;
; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 7;
; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 8;
; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 8;
; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 9;
; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 9;
; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 10;
; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 10;
; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 11;
; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 11;
; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 12;
; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 12;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=u8 */ i32 0)
call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=u8 */ i32 0)
call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=u16 */ i32 1)
call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=u16 */ i32 1)
call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=u32 */ i32 2)
call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=u32 */ i32 2)
call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=s32 */ i32 3)
call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=s32 */ i32 3)
call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=u64 */ i32 4)
call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=u64 */ i32 4)
call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=s64 */ i32 5)
call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=s64 */ i32 5)
call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=f16 */ i32 6)
call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=f16 */ i32 6)
call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=f32 */ i32 7)
call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=f32 */ i32 7)
call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=f32.ftz */ i32 8)
call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=f32.ftz */ i32 8)
call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=f64 */ i32 9)
call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=f64 */ i32 9)
call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=bf16 */ i32 10)
call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=bf16 */ i32 10)
call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=tf32 */ i32 11)
call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=tf32 */ i32 11)
call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=tf32.ftz */ i32 12)
call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=tf32.ftz */ i32 12)
ret void
}
; Lowering test for llvm.nvvm.tensormap.replace.interleave.layout.
; The second argument is a constant i32; the body covers the constants 0, 1
; and 2, calling the .p1 overload then the .p3 overload for each. The inline
; /* interleave_layout=... */ comments label the constants (no interleave,
; 16B interleave, 32B interleave).
; The expected PTX prints the constant as an immediate on the ".global" and
; ".shared::cta" forms (.b1024.b32), reusing the two address registers.
define void @tensormap_replace_interleave_layout(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr) {
; CHECK-LABEL: tensormap_replace_interleave_layout(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_interleave_layout_param_0];
; CHECK-NEXT: tensormap.replace.tile.interleave_layout.global.b1024.b32 [%rd1], 0;
; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_interleave_layout_param_1];
; CHECK-NEXT: tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [%rd2], 0;
; CHECK-NEXT: tensormap.replace.tile.interleave_layout.global.b1024.b32 [%rd1], 1;
; CHECK-NEXT: tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [%rd2], 1;
; CHECK-NEXT: tensormap.replace.tile.interleave_layout.global.b1024.b32 [%rd1], 2;
; CHECK-NEXT: tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [%rd2], 2;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tensormap.replace.interleave.layout.p1(ptr addrspace(1) %global_addr, /* interleave_layout=No interleave */ i32 0)
call void @llvm.nvvm.tensormap.replace.interleave.layout.p3(ptr addrspace(3) %shared_addr, /* interleave_layout=No interleave */ i32 0)
call void @llvm.nvvm.tensormap.replace.interleave.layout.p1(ptr addrspace(1) %global_addr, /* interleave_layout=16B interleave */ i32 1)
call void @llvm.nvvm.tensormap.replace.interleave.layout.p3(ptr addrspace(3) %shared_addr, /* interleave_layout=16B interleave */ i32 1)
call void @llvm.nvvm.tensormap.replace.interleave.layout.p1(ptr addrspace(1) %global_addr, /* interleave_layout=32B interleave */ i32 2)
call void @llvm.nvvm.tensormap.replace.interleave.layout.p3(ptr addrspace(3) %shared_addr, /* interleave_layout=32B interleave */ i32 2)
ret void
}
; Lowering test for llvm.nvvm.tensormap.replace.swizzle.mode.
; The second argument is a constant i32; the body covers the constants 0
; through 3, calling the .p1 overload then the .p3 overload for each. The
; inline /* swizzle_mode=... */ comments label the constants (no swizzling,
; 32B, 64B and 128B swizzling).
; The expected PTX prints the constant as an immediate on the ".global" and
; ".shared::cta" forms (.b1024.b32), reusing the two address registers.
define void @tensormap_replace_swizzle_mode(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr) {
; CHECK-LABEL: tensormap_replace_swizzle_mode(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_swizzle_mode_param_0];
; CHECK-NEXT: tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%rd1], 0;
; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_swizzle_mode_param_1];
; CHECK-NEXT: tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%rd2], 0;
; CHECK-NEXT: tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%rd1], 1;
; CHECK-NEXT: tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%rd2], 1;
; CHECK-NEXT: tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%rd1], 2;
; CHECK-NEXT: tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%rd2], 2;
; CHECK-NEXT: tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%rd1], 3;
; CHECK-NEXT: tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%rd2], 3;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tensormap.replace.swizzle.mode.p1(ptr addrspace(1) %global_addr, /* swizzle_mode=No swizzling */ i32 0)
call void @llvm.nvvm.tensormap.replace.swizzle.mode.p3(ptr addrspace(3) %shared_addr, /* swizzle_mode=No swizzling */ i32 0)
call void @llvm.nvvm.tensormap.replace.swizzle.mode.p1(ptr addrspace(1) %global_addr, /* swizzle_mode=32B swizzling */ i32 1)
call void @llvm.nvvm.tensormap.replace.swizzle.mode.p3(ptr addrspace(3) %shared_addr, /* swizzle_mode=32B swizzling */ i32 1)
call void @llvm.nvvm.tensormap.replace.swizzle.mode.p1(ptr addrspace(1) %global_addr, /* swizzle_mode=64B swizzling */ i32 2)
call void @llvm.nvvm.tensormap.replace.swizzle.mode.p3(ptr addrspace(3) %shared_addr, /* swizzle_mode=64B swizzling */ i32 2)
call void @llvm.nvvm.tensormap.replace.swizzle.mode.p1(ptr addrspace(1) %global_addr, /* swizzle_mode=128B swizzling */ i32 3)
call void @llvm.nvvm.tensormap.replace.swizzle.mode.p3(ptr addrspace(3) %shared_addr, /* swizzle_mode=128B swizzling */ i32 3)
ret void
}
; Lowering test for llvm.nvvm.tensormap.replace.fill.mode.
; The second argument is a constant i32; the body covers the constants 0 and
; 1, calling the .p1 overload then the .p3 overload for each. The inline
; /* fill_mode=... */ comments label the constants (zero fill, OOB-NaN fill).
; The expected PTX prints the constant as an immediate on the ".global" and
; ".shared::cta" forms (.b1024.b32), reusing the two address registers.
define void @tensormap_replace_fill_mode(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr) {
; CHECK-LABEL: tensormap_replace_fill_mode(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_fill_mode_param_0];
; CHECK-NEXT: tensormap.replace.tile.fill_mode.global.b1024.b32 [%rd1], 0;
; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_fill_mode_param_1];
; CHECK-NEXT: tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [%rd2], 0;
; CHECK-NEXT: tensormap.replace.tile.fill_mode.global.b1024.b32 [%rd1], 1;
; CHECK-NEXT: tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [%rd2], 1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tensormap.replace.fill.mode.p1(ptr addrspace(1) %global_addr, /* fill_mode=Zero fill */ i32 0)
call void @llvm.nvvm.tensormap.replace.fill.mode.p3(ptr addrspace(3) %shared_addr, /* fill_mode=Zero fill */ i32 0)
call void @llvm.nvvm.tensormap.replace.fill.mode.p1(ptr addrspace(1) %global_addr, /* fill_mode=OOB-NaN fill */ i32 1)
call void @llvm.nvvm.tensormap.replace.fill.mode.p3(ptr addrspace(3) %shared_addr, /* fill_mode=OOB-NaN fill */ i32 1)
ret void
}