| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=amdgcn-amd-amdhsa | FileCheck %s |
| |
| ; COM: Test that, unlike on CPU targets, the mask is not bitcast to a scalar. |
| ; COM: On GPUs each i1 takes up at least one register, so the mask elements |
| ; COM: should be extracted and tested separately. |
| |
| ; COM: Two-lane masked gather. The expected output extracts each mask element |
| ; COM: with extractelement and branches on it, loads through that lane's pointer |
| ; COM: only in the taken block, and merges the result with the passthru via phis. |
| define <2 x i32> @scalarize_v2i32(<2 x ptr> %p, <2 x i1> %mask, <2 x i32> %passthru) { |
| ; CHECK-LABEL: define <2 x i32> @scalarize_v2i32( |
| ; CHECK-SAME: <2 x ptr> [[P:%.*]], <2 x i1> [[MASK:%.*]], <2 x i32> [[PASSTHRU:%.*]]) { |
| ; CHECK-NEXT: [[MASK0:%.*]] = extractelement <2 x i1> [[MASK]], i64 0 |
| ; CHECK-NEXT: br i1 [[MASK0]], label %[[COND_LOAD:.*]], label %[[ELSE:.*]] |
| ; CHECK: [[COND_LOAD]]: |
| ; CHECK-NEXT: [[PTR0:%.*]] = extractelement <2 x ptr> [[P]], i64 0 |
| ; CHECK-NEXT: [[LOAD0:%.*]] = load i32, ptr [[PTR0]], align 8 |
| ; CHECK-NEXT: [[RES0:%.*]] = insertelement <2 x i32> [[PASSTHRU]], i32 [[LOAD0]], i64 0 |
| ; CHECK-NEXT: br label %[[ELSE]] |
| ; CHECK: [[ELSE]]: |
| ; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i32> [ [[RES0]], %[[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ] |
| ; CHECK-NEXT: [[MASK1:%.*]] = extractelement <2 x i1> [[MASK]], i64 1 |
| ; CHECK-NEXT: br i1 [[MASK1]], label %[[COND_LOAD1:.*]], label %[[ELSE2:.*]] |
| ; CHECK: [[COND_LOAD1]]: |
| ; CHECK-NEXT: [[PTR1:%.*]] = extractelement <2 x ptr> [[P]], i64 1 |
| ; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[PTR1]], align 8 |
| ; CHECK-NEXT: [[RES1:%.*]] = insertelement <2 x i32> [[RES_PHI_ELSE]], i32 [[LOAD1]], i64 1 |
| ; CHECK-NEXT: br label %[[ELSE2]] |
| ; CHECK: [[ELSE2]]: |
| ; CHECK-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i32> [ [[RES1]], %[[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], %[[ELSE]] ] |
| ; CHECK-NEXT: ret <2 x i32> [[RES_PHI_ELSE3]] |
| ; |
| ; COM: The i32 8 operand is the gather's alignment; it appears as "align 8" on |
| ; COM: both scalar loads in the expected output. |
| %ret = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> %p, i32 8, <2 x i1> %mask, <2 x i32> %passthru) |
| ret <2 x i32> %ret |
| } |
| |
| ; COM: Masked gather intrinsic called by the test function. Operands, in order: |
| ; COM: vector of pointers, i32 alignment, <2 x i1> mask, passthru vector. |
| declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i32>) |