blob: 64d643d91530e591052f1e41f64ae8df7269c703 [file]
//@ add-minicore
//@ revisions: gfx11 gfx12
//@ assembly-output: emit-asm
//@ compile-flags: --target amdgcn-amd-amdhsa
//@[gfx11] compile-flags: -Ctarget-cpu=gfx1100
//@[gfx12] compile-flags: -Ctarget-cpu=gfx1200
//@ needs-llvm-components: amdgpu
//@ needs-rust-lld
// ignore-tidy-linelength
#![feature(abi_gpu_kernel, no_core, asm_experimental_arch, repr_simd, f16)]
#![crate_type = "rlib"]
#![no_core]
#![allow(
asm_sub_register,
improper_gpu_kernel_arg,
improper_ctypes_definitions,
non_camel_case_types,
unused_assignments,
unused_variables
)]
extern crate minicore;
use minicore::*;
type ptr = *mut u8;
#[repr(simd)]
pub struct i16x2([i16; 2]);
#[repr(simd)]
pub struct f16x2([f16; 2]);
#[repr(simd)]
pub struct i16x4([i16; 4]);
#[repr(simd)]
pub struct f16x4([f16; 4]);
#[repr(simd)]
pub struct i32x2([i32; 2]);
#[repr(simd)]
pub struct f32x2([f32; 2]);
#[repr(simd)]
pub struct i32x3([i32; 3]);
#[repr(simd)]
pub struct f32x3([f32; 3]);
#[repr(simd)]
pub struct i16x8([i16; 8]);
#[repr(simd)]
pub struct f16x8([f16; 8]);
#[repr(simd)]
pub struct i32x4([i32; 4]);
#[repr(simd)]
pub struct f32x4([f32; 4]);
#[repr(simd)]
pub struct i32x5([i32; 5]);
#[repr(simd)]
pub struct f32x5([f32; 5]);
#[repr(simd)]
pub struct i32x6([i32; 6]);
#[repr(simd)]
pub struct f32x6([f32; 6]);
#[repr(simd)]
pub struct i32x7([i32; 7]);
#[repr(simd)]
pub struct f32x7([f32; 7]);
#[repr(simd)]
pub struct i16x16([i16; 16]);
#[repr(simd)]
pub struct f16x16([f16; 16]);
#[repr(simd)]
pub struct i32x8([i32; 8]);
#[repr(simd)]
pub struct f32x8([f32; 8]);
#[repr(simd)]
pub struct i32x10([i32; 10]);
#[repr(simd)]
pub struct f32x10([f32; 10]);
#[repr(simd)]
pub struct i16x32([i16; 32]);
#[repr(simd)]
pub struct f16x32([f16; 32]);
#[repr(simd)]
pub struct i32x16([i32; 16]);
#[repr(simd)]
pub struct f32x16([f32; 16]);
/// Implements the `Copy` marker trait for every type named in the invocation.
///
/// The argument is a whitespace-separated list of type identifiers; one empty
/// `impl Copy for <type> {}` is emitted per identifier.
macro_rules! impl_copy {
    ($($name:ident)*) => {
        $(impl Copy for $name {})*
    };
}
// Make every SIMD vector type declared in this file `Copy`, listed by total bit width.
impl_copy! {
    i16x2 f16x2
    i16x4 f16x4 i32x2 f32x2
    i32x3 f32x3
    i16x8 f16x8 i32x4 f32x4
    i32x5 f32x5
    i32x6 f32x6
    i32x7 f32x7
    i16x16 f16x16 i32x8 f32x8
    i32x10 f32x10
    i16x32 f16x32 i32x16 f32x16
}
/// Generates a `#[no_mangle]` `extern "gpu-kernel"` function wrapping a single `asm!` statement
/// whose operands are constrained by register *class*.
///
/// Accepted forms (arms are tried in this order):
/// * `check!(name ty class "insn")`: one input `x` and one output, both of type `ty` in `class`;
///   the template is `insn {}, {}`.
/// * `check!(name ret_ty ret_class arg: arg_ty arg_class, ... "insn")`: the output comes first in
///   the template, followed by each argument as a named operand in declaration order.
/// * the previous form followed by `, "tail"`: `tail` is appended verbatim to the template.
macro_rules! check {
    // Unary form. Must stay first: it takes precedence over the zero-argument case of the
    // tail-less form below, which has the same token shape.
    ($name:ident $t:ident $rc:ident $insn:literal) => {
        #[no_mangle]
        pub unsafe extern "gpu-kernel" fn $name(x: $t) {
            let dst: $t;
            asm!(
                concat!($insn, " {}, {}"),
                out($rc) dst,
                in($rc) x
            );
        }
    };
    // No tail supplied: delegate to the general form with an empty tail.
    (
        $name:ident $out_ty:ident $out_rc:ident
        $($arg:ident: $arg_ty:ident $arg_rc:ident,)*
        $insn:literal
    ) => {
        check!($name $out_ty $out_rc $($arg: $arg_ty $arg_rc,)* $insn, "");
    };
    // General form: the output is the only positional operand; every argument is referenced in
    // the template by its own name.
    (
        $name:ident $out_ty:ident $out_rc:ident
        $($arg:ident: $arg_ty:ident $arg_rc:ident,)*
        $insn:literal, $tail:literal
    ) => {
        #[no_mangle]
        pub unsafe extern "gpu-kernel" fn $name($($arg: $arg_ty,)*) {
            let dst: $out_ty;
            asm!(
                concat!($insn, " {}", $(", {", stringify!($arg), "}",)* $tail),
                out($out_rc) dst,
                $($arg = in($arg_rc) $arg,)*
            );
        }
    };
}
/// Explicit-register variant of `check!`: generates a `#[no_mangle]` `extern "gpu-kernel"`
/// function wrapping a single `asm!` statement whose operands are bound to named registers.
///
/// Each register is given as a string literal (for example `"s0"` or `"v[0:3]"`). The same
/// string is pasted into the instruction text and used as the operand constraint.
///
/// Accepted forms (arms are tried in this order):
/// * `check_reg!(name ty "reg" "insn")`: `x: ty` in and a `ty` value out, both bound to `reg`.
/// * `check_reg!(name ret_ty "ret_reg" arg: arg_ty "arg_reg", ... "insn")`: output register
///   first in the instruction text, then the argument registers in declaration order.
/// * the previous form followed by `, "tail"`: `tail` is appended verbatim to the template.
macro_rules! check_reg {
    // Unary form. The same register is named for both the input and the output, which is why
    // the output is declared `lateout`.
    ($name:ident $t:ident $reg:tt $insn:literal) => {
        #[no_mangle]
        pub unsafe extern "gpu-kernel" fn $name(x: $t) {
            let dst: $t;
            asm!(
                concat!($insn, " ", $reg, ", ", $reg),
                lateout($reg) dst,
                in($reg) x
            );
        }
    };
    // No tail supplied: delegate to the general form with an empty tail.
    (
        $name:ident $out_ty:ident $out_reg:tt
        $($arg:ident: $arg_ty:ident $arg_reg:tt,)*
        $insn:literal
    ) => {
        check_reg!($name $out_ty $out_reg $($arg: $arg_ty $arg_reg,)* $insn, "");
    };
    // General form: the template contains no placeholders, only the literal register names.
    (
        $name:ident $out_ty:ident $out_reg:tt
        $($arg:ident: $arg_ty:ident $arg_reg:tt,)*
        $insn:literal, $tail:literal
    ) => {
        #[no_mangle]
        pub unsafe extern "gpu-kernel" fn $name($($arg: $arg_ty,)*) {
            let dst: $out_ty;
            asm!(
                concat!($insn, " ", $out_reg, $(", ", $arg_reg,)* $tail),
                lateout($out_reg) dst,
                $(in($arg_reg) $arg,)*
            );
        }
    };
}
// Register-class tests for the 32-bit packed vectors (`i16x2`, `f16x2`).
// The SGPR tests take the vector as the `sgpr32` output of `s_pack_ll_b32_b16`, fed by two `i16`
// inputs. The VGPR tests use the vector type for both inputs and the output of `v_pk_add_*`.
// CHECK-LABEL: sgpr_i16x2:
// CHECK: #ASMSTART
// CHECK: s_pack_ll_b32_b16 s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}
// CHECK: #ASMEND
check!(sgpr_i16x2 i16x2 sgpr32 x: i16 sgpr32, y: i16 sgpr32, "s_pack_ll_b32_b16");
// CHECK-LABEL: sgpr_f16x2:
// CHECK: #ASMSTART
// CHECK: s_pack_ll_b32_b16 s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}
// CHECK: #ASMEND
check!(sgpr_f16x2 f16x2 sgpr32 x: i16 sgpr32, y: i16 sgpr32, "s_pack_ll_b32_b16");
// CHECK-LABEL: vgpr_i16x2:
// CHECK: #ASMSTART
// CHECK: v_pk_add_i16 v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}}
// CHECK: #ASMEND
check!(vgpr_i16x2 i16x2 vgpr32 x: i16x2 vgpr32, y: i16x2 vgpr32, "v_pk_add_i16");
// CHECK-LABEL: vgpr_f16x2:
// CHECK: #ASMSTART
// CHECK: v_pk_add_f16 v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}}
// CHECK: #ASMEND
check!(vgpr_f16x2 f16x2 vgpr32 x: f16x2 vgpr32, y: f16x2 vgpr32, "v_pk_add_f16");
// Register-class tests for the 64-bit vectors (`i16x4`, `f16x4`, `i32x2`, `f32x2`).
// Each vector is the output operand of a load: `s_load_b64` into `sgpr64` (pointer in `sgpr64`,
// `i32` in `sgpr32`) and `global_load_b64` into `vgpr64` (`i32` in `vgpr32`, pointer in `sgpr64`).
// CHECK-LABEL: sgpr_i16x4:
// CHECK: #ASMSTART
// CHECK: s_load_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check!(sgpr_i16x4 i16x4 sgpr64 x: ptr sgpr64, y: i32 sgpr32, "s_load_b64");
// CHECK-LABEL: sgpr_f16x4:
// CHECK: #ASMSTART
// CHECK: s_load_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check!(sgpr_f16x4 f16x4 sgpr64 x: ptr sgpr64, y: i32 sgpr32, "s_load_b64");
// CHECK-LABEL: sgpr_i32x2:
// CHECK: #ASMSTART
// CHECK: s_load_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check!(sgpr_i32x2 i32x2 sgpr64 x: ptr sgpr64, y: i32 sgpr32, "s_load_b64");
// CHECK-LABEL: sgpr_f32x2:
// CHECK: #ASMSTART
// CHECK: s_load_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check!(sgpr_f32x2 f32x2 sgpr64 x: ptr sgpr64, y: i32 sgpr32, "s_load_b64");
// CHECK-LABEL: vgpr_i16x4:
// CHECK: #ASMSTART
// CHECK: global_load_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
// CHECK: #ASMEND
check!(vgpr_i16x4 i16x4 vgpr64 x: i32 vgpr32, y: ptr sgpr64, "global_load_b64");
// CHECK-LABEL: vgpr_f16x4:
// CHECK: #ASMSTART
// CHECK: global_load_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
// CHECK: #ASMEND
check!(vgpr_f16x4 f16x4 vgpr64 x: i32 vgpr32, y: ptr sgpr64, "global_load_b64");
// CHECK-LABEL: vgpr_i32x2:
// CHECK: #ASMSTART
// CHECK: global_load_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
// CHECK: #ASMEND
check!(vgpr_i32x2 i32x2 vgpr64 x: i32 vgpr32, y: ptr sgpr64, "global_load_b64");
// CHECK-LABEL: vgpr_f32x2:
// CHECK: #ASMSTART
// CHECK: global_load_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
// CHECK: #ASMEND
check!(vgpr_f32x2 f32x2 vgpr64 x: i32 vgpr32, y: ptr sgpr64, "global_load_b64");
// Register-class tests for the 96-bit vectors (`i32x3`, `f32x3`).
// The `s_load_b96` tests into `sgpr96` are compiled only under the `gfx12` cfg; the
// `global_load_b96` tests into `vgpr96` carry no cfg attribute and are built for every revision.
// gfx12-LABEL: sgpr_i32x3:
// gfx12: #ASMSTART
// gfx12: s_load_b96 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// gfx12: #ASMEND
#[cfg(gfx12)]
check!(sgpr_i32x3 i32x3 sgpr96 x: ptr sgpr64, y: i32 sgpr32, "s_load_b96");
// gfx12-LABEL: sgpr_f32x3:
// gfx12: #ASMSTART
// gfx12: s_load_b96 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// gfx12: #ASMEND
#[cfg(gfx12)]
check!(sgpr_f32x3 f32x3 sgpr96 x: ptr sgpr64, y: i32 sgpr32, "s_load_b96");
// CHECK-LABEL: vgpr_i32x3:
// CHECK: #ASMSTART
// CHECK: global_load_b96 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
// CHECK: #ASMEND
check!(vgpr_i32x3 i32x3 vgpr96 x: i32 vgpr32, y: ptr sgpr64, "global_load_b96");
// CHECK-LABEL: vgpr_f32x3:
// CHECK: #ASMSTART
// CHECK: global_load_b96 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
// CHECK: #ASMEND
check!(vgpr_f32x3 f32x3 vgpr96 x: i32 vgpr32, y: ptr sgpr64, "global_load_b96");
// Register-class tests for the 128-bit vectors (`i16x8`, `f16x8`, `i32x4`, `f32x4`).
// Each vector is the output operand of a load: `s_load_b128` into `sgpr128` and
// `global_load_b128` into `vgpr128`.
// CHECK-LABEL: sgpr_i16x8:
// CHECK: #ASMSTART
// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check!(sgpr_i16x8 i16x8 sgpr128 x: ptr sgpr64, y: i32 sgpr32, "s_load_b128");
// CHECK-LABEL: sgpr_f16x8:
// CHECK: #ASMSTART
// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check!(sgpr_f16x8 f16x8 sgpr128 x: ptr sgpr64, y: i32 sgpr32, "s_load_b128");
// CHECK-LABEL: sgpr_i32x4:
// CHECK: #ASMSTART
// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check!(sgpr_i32x4 i32x4 sgpr128 x: ptr sgpr64, y: i32 sgpr32, "s_load_b128");
// CHECK-LABEL: sgpr_f32x4:
// CHECK: #ASMSTART
// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check!(sgpr_f32x4 f32x4 sgpr128 x: ptr sgpr64, y: i32 sgpr32, "s_load_b128");
// CHECK-LABEL: vgpr_i16x8:
// CHECK: #ASMSTART
// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
// CHECK: #ASMEND
check!(vgpr_i16x8 i16x8 vgpr128 x: i32 vgpr32, y: ptr sgpr64, "global_load_b128");
// CHECK-LABEL: vgpr_f16x8:
// CHECK: #ASMSTART
// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
// CHECK: #ASMEND
check!(vgpr_f16x8 f16x8 vgpr128 x: i32 vgpr32, y: ptr sgpr64, "global_load_b128");
// CHECK-LABEL: vgpr_i32x4:
// CHECK: #ASMSTART
// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
// CHECK: #ASMEND
check!(vgpr_i32x4 i32x4 vgpr128 x: i32 vgpr32, y: ptr sgpr64, "global_load_b128");
// CHECK-LABEL: vgpr_f32x4:
// CHECK: #ASMSTART
// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
// CHECK: #ASMEND
check!(vgpr_f32x4 f32x4 vgpr128 x: i32 vgpr32, y: ptr sgpr64, "global_load_b128");
// Register-class tests for the 160-, 192- and 224-bit VGPR vectors, using image instructions.
// The 5-element vectors are the `vgpr160` output of `image_load`. The 6- and 7-element vectors
// are the `x` *input* (`vgpr192` / `vgpr224`) of `image_sample_d` / `image_sample_d_cl`, whose
// output is an `i32x4`; those tests are compiled only under the `gfx11` cfg.
// CHECK-LABEL: vgpr_i32x5:
// CHECK: #ASMSTART
// CHECK: image_load v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_1D tfe
// CHECK: #ASMEND
check!(vgpr_i32x5 i32x5 vgpr160 x: i32 vgpr32, y: i32x8 sgpr256, "image_load",
" dmask:0xf dim:SQ_RSRC_IMG_1D tfe");
// CHECK-LABEL: vgpr_f32x5:
// CHECK: #ASMSTART
// CHECK: image_load v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_1D tfe
// CHECK: #ASMEND
check!(vgpr_f32x5 f32x5 vgpr160 x: i32 vgpr32, y: i32x8 sgpr256, "image_load",
" dmask:0xf dim:SQ_RSRC_IMG_1D tfe");
// gfx11-LABEL: vgpr_i32x6:
// gfx11: #ASMSTART
// gfx11: image_sample_d v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_2D
// gfx11: #ASMEND
#[cfg(gfx11)]
check!(vgpr_i32x6 i32x4 vgpr128 x: i32x6 vgpr192, y: i32x8 sgpr256, z: i32x4 sgpr128,
"image_sample_d", " dmask:0xf dim:SQ_RSRC_IMG_2D");
// gfx11-LABEL: vgpr_f32x6:
// gfx11: #ASMSTART
// gfx11: image_sample_d v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_2D
// gfx11: #ASMEND
#[cfg(gfx11)]
check!(vgpr_f32x6 i32x4 vgpr128 x: f32x6 vgpr192, y: i32x8 sgpr256, z: i32x4 sgpr128,
"image_sample_d", " dmask:0xf dim:SQ_RSRC_IMG_2D");
// gfx11-LABEL: vgpr_i32x7:
// gfx11: #ASMSTART
// gfx11: image_sample_d_cl v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_2D
// gfx11: #ASMEND
#[cfg(gfx11)]
check!(vgpr_i32x7 i32x4 vgpr128 x: i32x7 vgpr224, y: i32x8 sgpr256, z: i32x4 sgpr128,
"image_sample_d_cl", " dmask:0xf dim:SQ_RSRC_IMG_2D");
// gfx11-LABEL: vgpr_f32x7:
// gfx11: #ASMSTART
// gfx11: image_sample_d_cl v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_2D
// gfx11: #ASMEND
#[cfg(gfx11)]
check!(vgpr_f32x7 i32x4 vgpr128 x: f32x7 vgpr224, y: i32x8 sgpr256, z: i32x4 sgpr128,
"image_sample_d_cl", " dmask:0xf dim:SQ_RSRC_IMG_2D");
// Register-class tests for the 256-bit vectors (`i16x16`, `f16x16`, `i32x8`, `f32x8`).
// The SGPR tests load into `sgpr256` with `s_load_b256`. The VGPR tests use `v_wmma_*`
// instructions with the vector in `vgpr256` as an input and/or the output. `vgpr_i16x16`,
// `vgpr_f16x16` and `vgpr_i32x8` are compiled only under the `gfx11` cfg, and `vgpr_f32x8` only
// under the `gfx12` cfg.
// CHECK-LABEL: sgpr_i16x16:
// CHECK: #ASMSTART
// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check!(sgpr_i16x16 i16x16 sgpr256 x: ptr sgpr64, y: i32 sgpr32, "s_load_b256");
// CHECK-LABEL: sgpr_f16x16:
// CHECK: #ASMSTART
// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check!(sgpr_f16x16 f16x16 sgpr256 x: ptr sgpr64, y: i32 sgpr32, "s_load_b256");
// CHECK-LABEL: sgpr_i32x8:
// CHECK: #ASMSTART
// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check!(sgpr_i32x8 i32x8 sgpr256 x: ptr sgpr64, y: i32 sgpr32, "s_load_b256");
// CHECK-LABEL: sgpr_f32x8:
// CHECK: #ASMSTART
// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check!(sgpr_f32x8 f32x8 sgpr256 x: ptr sgpr64, y: i32 sgpr32, "s_load_b256");
// gfx11-LABEL: vgpr_i16x16:
// gfx11: #ASMSTART
// gfx11: v_wmma_f32_16x16x16_bf16 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
// gfx11: #ASMEND
#[cfg(gfx11)]
check!(vgpr_i16x16 f32x8 vgpr256 x: i32x8 vgpr256, y: i16x16 vgpr256, z: f32x8 vgpr256,
"v_wmma_f32_16x16x16_bf16");
// gfx11-LABEL: vgpr_f16x16:
// gfx11: #ASMSTART
// gfx11: v_wmma_f32_16x16x16_f16 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
// gfx11: #ASMEND
#[cfg(gfx11)]
check!(vgpr_f16x16 f32x8 vgpr256 x: f16x16 vgpr256, y: f16x16 vgpr256, z: f32x8 vgpr256,
"v_wmma_f32_16x16x16_f16");
// gfx11-LABEL: vgpr_i32x8:
// gfx11: #ASMSTART
// gfx11: v_wmma_i32_16x16x16_iu8 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
// gfx11: #ASMEND
#[cfg(gfx11)]
check!(vgpr_i32x8 i32x8 vgpr256 x: i32x4 vgpr128, y: i32x4 vgpr128, z: i32x8 vgpr256,
"v_wmma_i32_16x16x16_iu8");
// gfx12-LABEL: vgpr_f32x8:
// gfx12: #ASMSTART
// gfx12: v_wmma_f32_16x16x16_fp8_fp8 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
// gfx12: #ASMEND
#[cfg(gfx12)]
check!(vgpr_f32x8 f32x8 vgpr256 x: f32x2 vgpr64, y: f32x2 vgpr64, z: f32x8 vgpr256,
"v_wmma_f32_16x16x16_fp8_fp8");
// Register-class tests for the 320-bit VGPR vectors (`i32x10`, `f32x10`), compiled only under
// the `gfx12` cfg. The vector is the sole `asm!` operand (the `vgpr320` output); all remaining
// instruction operands are hard-coded register names inside the tail string.
// gfx12-LABEL: vgpr_i32x10:
// gfx12: #ASMSTART
// gfx12: image_bvh8_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3]
// gfx12: #ASMEND
#[cfg(gfx12)]
check!(vgpr_i32x10 i32x10 vgpr320 "image_bvh8_intersect_ray",
", [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3]");
// gfx12-LABEL: vgpr_f32x10:
// gfx12: #ASMSTART
// gfx12: image_bvh8_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3]
// gfx12: #ASMEND
#[cfg(gfx12)]
check!(vgpr_f32x10 f32x10 vgpr320 "image_bvh8_intersect_ray",
", [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3]");
// Register-class tests for the 512-bit vectors (`i16x32`, `f16x32`, `i32x16`, `f32x16`): each is
// the `sgpr512` output of `s_load_b512`.
// CHECK-LABEL: sgpr_i16x32:
// CHECK: #ASMSTART
// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check!(sgpr_i16x32 i16x32 sgpr512 x: ptr sgpr64, y: i32 sgpr32, "s_load_b512");
// CHECK-LABEL: sgpr_f16x32:
// CHECK: #ASMSTART
// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check!(sgpr_f16x32 f16x32 sgpr512 x: ptr sgpr64, y: i32 sgpr32, "s_load_b512");
// CHECK-LABEL: sgpr_i32x16:
// CHECK: #ASMSTART
// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check!(sgpr_i32x16 i32x16 sgpr512 x: ptr sgpr64, y: i32 sgpr32, "s_load_b512");
// CHECK-LABEL: sgpr_f32x16:
// CHECK: #ASMSTART
// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check!(sgpr_f32x16 f32x16 sgpr512 x: ptr sgpr64, y: i32 sgpr32, "s_load_b512");
// Explicit-register tests for the 32-bit packed vectors (`i16x2`, `f16x2`).
// Same instructions as the register-class tests, but the output is pinned to `s0` / `v0` and the
// inputs to `s1`, `s2` / `v1`, `v2`.
// CHECK-LABEL: s0_i16x2:
// CHECK: #ASMSTART
// CHECK: s_pack_ll_b32_b16 s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}
// CHECK: #ASMEND
check_reg!(s0_i16x2 i16x2 "s0" x: i16 "s1", y: i16 "s2", "s_pack_ll_b32_b16");
// CHECK-LABEL: s0_f16x2:
// CHECK: #ASMSTART
// CHECK: s_pack_ll_b32_b16 s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}
// CHECK: #ASMEND
check_reg!(s0_f16x2 f16x2 "s0" x: i16 "s1", y: i16 "s2", "s_pack_ll_b32_b16");
// CHECK-LABEL: v0_i16x2:
// CHECK: #ASMSTART
// CHECK: v_pk_add_i16 v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}}
// CHECK: #ASMEND
check_reg!(v0_i16x2 i16x2 "v0" x: i16x2 "v1", y: i16x2 "v2", "v_pk_add_i16");
// CHECK-LABEL: v0_f16x2:
// CHECK: #ASMSTART
// CHECK: v_pk_add_f16 v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}}
// CHECK: #ASMEND
check_reg!(v0_f16x2 f16x2 "v0" x: f16x2 "v1", y: f16x2 "v2", "v_pk_add_f16");
// Explicit-register tests for the 64-bit vectors (`i16x4`, `f16x4`, `i32x2`, `f32x2`).
// The vector is the load result, pinned to the register pair `s[0:1]` (for `s_load_b64`) or
// `v[0:1]` (for `global_load_b64`).
// CHECK-LABEL: s0_i16x4:
// CHECK: #ASMSTART
// CHECK: s_load_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check_reg!(s0_i16x4 i16x4 "s[0:1]" x: ptr "s[2:3]", y: i32 "s4", "s_load_b64");
// CHECK-LABEL: s0_f16x4:
// CHECK: #ASMSTART
// CHECK: s_load_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check_reg!(s0_f16x4 f16x4 "s[0:1]" x: ptr "s[2:3]", y: i32 "s4", "s_load_b64");
// CHECK-LABEL: s0_i32x2:
// CHECK: #ASMSTART
// CHECK: s_load_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check_reg!(s0_i32x2 i32x2 "s[0:1]" x: ptr "s[2:3]", y: i32 "s4", "s_load_b64");
// CHECK-LABEL: s0_f32x2:
// CHECK: #ASMSTART
// CHECK: s_load_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check_reg!(s0_f32x2 f32x2 "s[0:1]" x: ptr "s[2:3]", y: i32 "s4", "s_load_b64");
// CHECK-LABEL: v0_i16x4:
// CHECK: #ASMSTART
// CHECK: global_load_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
// CHECK: #ASMEND
check_reg!(v0_i16x4 i16x4 "v[0:1]" x: i32 "v2", y: ptr "s[0:1]", "global_load_b64");
// CHECK-LABEL: v0_f16x4:
// CHECK: #ASMSTART
// CHECK: global_load_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
// CHECK: #ASMEND
check_reg!(v0_f16x4 f16x4 "v[0:1]" x: i32 "v2", y: ptr "s[0:1]", "global_load_b64");
// CHECK-LABEL: v0_i32x2:
// CHECK: #ASMSTART
// CHECK: global_load_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
// CHECK: #ASMEND
check_reg!(v0_i32x2 i32x2 "v[0:1]" x: i32 "v2", y: ptr "s[0:1]", "global_load_b64");
// CHECK-LABEL: v0_f32x2:
// CHECK: #ASMSTART
// CHECK: global_load_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
// CHECK: #ASMEND
check_reg!(v0_f32x2 f32x2 "v[0:1]" x: i32 "v2", y: ptr "s[0:1]", "global_load_b64");
// Explicit-register tests for the 96-bit vectors (`i32x3`, `f32x3`).
// The `s_load_b96` tests (result in `s[0:2]`) are compiled only under the `gfx12` cfg; the
// `global_load_b96` tests (result in `v[0:2]`) carry no cfg attribute.
// gfx12-LABEL: s0_i32x3:
// gfx12: #ASMSTART
// gfx12: s_load_b96 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// gfx12: #ASMEND
#[cfg(gfx12)]
check_reg!(s0_i32x3 i32x3 "s[0:2]" x: ptr "s[4:5]", y: i32 "s6", "s_load_b96");
// gfx12-LABEL: s0_f32x3:
// gfx12: #ASMSTART
// gfx12: s_load_b96 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// gfx12: #ASMEND
#[cfg(gfx12)]
check_reg!(s0_f32x3 f32x3 "s[0:2]" x: ptr "s[4:5]", y: i32 "s6", "s_load_b96");
// CHECK-LABEL: v0_i32x3:
// CHECK: #ASMSTART
// CHECK: global_load_b96 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
// CHECK: #ASMEND
check_reg!(v0_i32x3 i32x3 "v[0:2]" x: i32 "v3", y: ptr "s[0:1]", "global_load_b96");
// CHECK-LABEL: v0_f32x3:
// CHECK: #ASMSTART
// CHECK: global_load_b96 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
// CHECK: #ASMEND
check_reg!(v0_f32x3 f32x3 "v[0:2]" x: i32 "v3", y: ptr "s[0:1]", "global_load_b96");
// Explicit-register tests for the 128-bit vectors (`i16x8`, `f16x8`, `i32x4`, `f32x4`).
// The vector is the load result, pinned to `s[0:3]` (for `s_load_b128`) or `v[0:3]` (for
// `global_load_b128`).
// CHECK-LABEL: s0_i16x8:
// CHECK: #ASMSTART
// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check_reg!(s0_i16x8 i16x8 "s[0:3]" x: ptr "s[4:5]", y: i32 "s6", "s_load_b128");
// CHECK-LABEL: s0_f16x8:
// CHECK: #ASMSTART
// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check_reg!(s0_f16x8 f16x8 "s[0:3]" x: ptr "s[4:5]", y: i32 "s6", "s_load_b128");
// CHECK-LABEL: s0_i32x4:
// CHECK: #ASMSTART
// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check_reg!(s0_i32x4 i32x4 "s[0:3]" x: ptr "s[4:5]", y: i32 "s6", "s_load_b128");
// CHECK-LABEL: s0_f32x4:
// CHECK: #ASMSTART
// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check_reg!(s0_f32x4 f32x4 "s[0:3]" x: ptr "s[4:5]", y: i32 "s6", "s_load_b128");
// CHECK-LABEL: v0_i16x8:
// CHECK: #ASMSTART
// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
// CHECK: #ASMEND
check_reg!(v0_i16x8 i16x8 "v[0:3]" x: i32 "v4", y: ptr "s[0:1]", "global_load_b128");
// CHECK-LABEL: v0_f16x8:
// CHECK: #ASMSTART
// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
// CHECK: #ASMEND
check_reg!(v0_f16x8 f16x8 "v[0:3]" x: i32 "v4", y: ptr "s[0:1]", "global_load_b128");
// CHECK-LABEL: v0_i32x4:
// CHECK: #ASMSTART
// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
// CHECK: #ASMEND
check_reg!(v0_i32x4 i32x4 "v[0:3]" x: i32 "v4", y: ptr "s[0:1]", "global_load_b128");
// CHECK-LABEL: v0_f32x4:
// CHECK: #ASMSTART
// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
// CHECK: #ASMEND
check_reg!(v0_f32x4 f32x4 "v[0:3]" x: i32 "v4", y: ptr "s[0:1]", "global_load_b128");
// Explicit-register tests for the 160-, 192- and 224-bit VGPR vectors, using image instructions.
// The 5-element vectors are the `image_load` result in `v[0:4]`. The 6- and 7-element vectors
// are the `x` *input* in `v[4:9]` / `v[4:10]`, with an `i32x4` result in `v[0:3]`; those tests
// are compiled only under the `gfx11` cfg.
// CHECK-LABEL: v0_i32x5:
// CHECK: #ASMSTART
// CHECK: image_load v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_1D tfe
// CHECK: #ASMEND
check_reg!(v0_i32x5 i32x5 "v[0:4]" x: i32 "v5", y: i32x8 "s[0:7]", "image_load",
" dmask:0xf dim:SQ_RSRC_IMG_1D tfe");
// CHECK-LABEL: v0_f32x5:
// CHECK: #ASMSTART
// CHECK: image_load v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_1D tfe
// CHECK: #ASMEND
check_reg!(v0_f32x5 f32x5 "v[0:4]" x: i32 "v5", y: i32x8 "s[0:7]", "image_load",
" dmask:0xf dim:SQ_RSRC_IMG_1D tfe");
// gfx11-LABEL: v0_i32x6:
// gfx11: #ASMSTART
// gfx11: image_sample_d v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_2D
// gfx11: #ASMEND
#[cfg(gfx11)]
check_reg!(v0_i32x6 i32x4 "v[0:3]" x: i32x6 "v[4:9]", y: i32x8 "s[0:7]", z: i32x4 "s[8:11]",
"image_sample_d", " dmask:0xf dim:SQ_RSRC_IMG_2D");
// gfx11-LABEL: v0_f32x6:
// gfx11: #ASMSTART
// gfx11: image_sample_d v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_2D
// gfx11: #ASMEND
#[cfg(gfx11)]
check_reg!(v0_f32x6 i32x4 "v[0:3]" x: f32x6 "v[4:9]", y: i32x8 "s[0:7]", z: i32x4 "s[8:11]",
"image_sample_d", " dmask:0xf dim:SQ_RSRC_IMG_2D");
// gfx11-LABEL: v0_i32x7:
// gfx11: #ASMSTART
// gfx11: image_sample_d_cl v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_2D
// gfx11: #ASMEND
#[cfg(gfx11)]
check_reg!(v0_i32x7 i32x4 "v[0:3]" x: i32x7 "v[4:10]", y: i32x8 "s[0:7]", z: i32x4 "s[8:11]",
"image_sample_d_cl", " dmask:0xf dim:SQ_RSRC_IMG_2D");
// gfx11-LABEL: v0_f32x7:
// gfx11: #ASMSTART
// gfx11: image_sample_d_cl v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_2D
// gfx11: #ASMEND
#[cfg(gfx11)]
check_reg!(v0_f32x7 i32x4 "v[0:3]" x: f32x7 "v[4:10]", y: i32x8 "s[0:7]", z: i32x4 "s[8:11]",
"image_sample_d_cl", " dmask:0xf dim:SQ_RSRC_IMG_2D");
// Explicit-register tests for the 256-bit vectors (`i16x16`, `f16x16`, `i32x8`, `f32x8`).
// The SGPR tests load into `s[0:7]` with `s_load_b256`. The VGPR tests use `v_wmma_*`
// instructions with the result in `v[0:7]` and inputs starting at `v8`, `v16` and `v24`.
// `v0_i16x16`, `v0_f16x16` and `v0_i32x8` are compiled only under the `gfx11` cfg, and
// `v0_f32x8` only under the `gfx12` cfg.
// CHECK-LABEL: s0_i16x16:
// CHECK: #ASMSTART
// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check_reg!(s0_i16x16 i16x16 "s[0:7]" x: ptr "s[8:9]", y: i32 "s10", "s_load_b256");
// CHECK-LABEL: s0_f16x16:
// CHECK: #ASMSTART
// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check_reg!(s0_f16x16 f16x16 "s[0:7]" x: ptr "s[8:9]", y: i32 "s10", "s_load_b256");
// CHECK-LABEL: s0_i32x8:
// CHECK: #ASMSTART
// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check_reg!(s0_i32x8 i32x8 "s[0:7]" x: ptr "s[8:9]", y: i32 "s10", "s_load_b256");
// CHECK-LABEL: s0_f32x8:
// CHECK: #ASMSTART
// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check_reg!(s0_f32x8 f32x8 "s[0:7]" x: ptr "s[8:9]", y: i32 "s10", "s_load_b256");
// gfx11-LABEL: v0_i16x16:
// gfx11: #ASMSTART
// gfx11: v_wmma_f32_16x16x16_bf16 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
// gfx11: #ASMEND
#[cfg(gfx11)]
check_reg!(v0_i16x16 f32x8 "v[0:7]" x: i32x8 "v[8:15]", y: i16x16 "v[16:23]", z: f32x8 "v[24:31]",
"v_wmma_f32_16x16x16_bf16");
// gfx11-LABEL: v0_f16x16:
// gfx11: #ASMSTART
// gfx11: v_wmma_f32_16x16x16_f16 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
// gfx11: #ASMEND
#[cfg(gfx11)]
check_reg!(v0_f16x16 f32x8 "v[0:7]" x: f16x16 "v[8:15]", y: f16x16 "v[16:23]", z: f32x8 "v[24:31]",
"v_wmma_f32_16x16x16_f16");
// gfx11-LABEL: v0_i32x8:
// gfx11: #ASMSTART
// gfx11: v_wmma_i32_16x16x16_iu8 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
// gfx11: #ASMEND
#[cfg(gfx11)]
check_reg!(v0_i32x8 i32x8 "v[0:7]" x: i32x4 "v[8:11]", y: i32x4 "v[16:19]", z: i32x8 "v[24:31]",
"v_wmma_i32_16x16x16_iu8");
// gfx12-LABEL: v0_f32x8:
// gfx12: #ASMSTART
// gfx12: v_wmma_f32_16x16x16_fp8_fp8 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
// gfx12: #ASMEND
#[cfg(gfx12)]
check_reg!(v0_f32x8 f32x8 "v[0:7]" x: f32x2 "v[8:9]", y: f32x2 "v[16:17]", z: f32x8 "v[24:31]",
"v_wmma_f32_16x16x16_fp8_fp8");
// Explicit-register tests for the 320-bit VGPR vectors (`i32x10`, `f32x10`), compiled only
// under the `gfx12` cfg. The vector is the sole `asm!` operand (the result in `v[0:9]`); all
// remaining instruction operands are hard-coded register names inside the tail string.
// gfx12-LABEL: v0_i32x10:
// gfx12: #ASMSTART
// gfx12: image_bvh8_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3]
// gfx12: #ASMEND
#[cfg(gfx12)]
check_reg!(v0_i32x10 i32x10 "v[0:9]" "image_bvh8_intersect_ray",
", [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3]");
// gfx12-LABEL: v0_f32x10:
// gfx12: #ASMSTART
// gfx12: image_bvh8_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3]
// gfx12: #ASMEND
#[cfg(gfx12)]
check_reg!(v0_f32x10 f32x10 "v[0:9]" "image_bvh8_intersect_ray",
", [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3]");
// Explicit-register tests for the 512-bit vectors (`i16x32`, `f16x32`, `i32x16`, `f32x16`):
// each is the `s_load_b512` result pinned to `s[0:15]`.
// CHECK-LABEL: s0_i16x32:
// CHECK: #ASMSTART
// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check_reg!(s0_i16x32 i16x32 "s[0:15]" x: ptr "s[16:17]", y: i32 "s18", "s_load_b512");
// CHECK-LABEL: s0_f16x32:
// CHECK: #ASMSTART
// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check_reg!(s0_f16x32 f16x32 "s[0:15]" x: ptr "s[16:17]", y: i32 "s18", "s_load_b512");
// CHECK-LABEL: s0_i32x16:
// CHECK: #ASMSTART
// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check_reg!(s0_i32x16 i32x16 "s[0:15]" x: ptr "s[16:17]", y: i32 "s18", "s_load_b512");
// CHECK-LABEL: s0_f32x16:
// CHECK: #ASMSTART
// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
// CHECK: #ASMEND
check_reg!(s0_f32x16 f32x16 "s[0:15]" x: ptr "s[16:17]", y: i32 "s18", "s_load_b512");