| ; RUN: opt -S -passes=amdgpu-late-codegenprepare \ |
| ; RUN: -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s | FileCheck %s |
| |
| ; Goal: With a loop-header PHI of an illegal vector type and a same-BB |
| ; non-lookthrough user (vector add) in the header, LRO should still coerce |
| ; the PHI to i32 because a profitable sink (store) exists in another BB. |
| |
| ; Kernel under test: a single-block loop whose header carries a <4 x i8> |
| ; accumulator PHI. The PHI has two users: the vector add in the same block |
| ; (which produces the back-edge value) and a bitcast-to-i32 feeding a store |
| ; in the exit block %store. |
| ; %out - addrspace(1) pointer that the final i32 is stored to. |
| ; %v - <4 x i8> value added to the accumulator on every iteration. |
| ; %exit - branch condition: true leaves the loop for %store, false iterates. |
| define amdgpu_kernel void @phi_samebb_nonlookthrough_store( |
| ptr addrspace(1) %out, <4 x i8> %v, i1 %exit) { |
| ; CHECK-LABEL: @phi_samebb_nonlookthrough_store( |
| entry: |
| ; Fall straight into the loop header; entry supplies the PHI's initial value. |
| br label %loop |
| |
| loop: ; preds = %entry, %loop |
| ; Loop-carried PHI in illegal vector type: zero on entry, %acc.next on the |
| ; back edge. |
| %acc = phi <4 x i8> [ zeroinitializer, %entry ], [ %acc.next, %loop ] |
| |
| ; Same-BB non-lookthrough use in header; its result is the PHI's back-edge |
| ; incoming value. |
| %acc.next = add <4 x i8> %acc, %v |
| |
| ; Make it a real loop: either iterate or exit to the sink block. |
| ; %exit is a kernel argument, so the condition is the same on every |
| ; iteration; presumably only the loop-shaped CFG matters for this test. |
| br i1 %exit, label %store, label %loop |
| |
| store: ; preds = %loop |
| ; The across-BB sink: the PHI itself (not %acc.next) is bitcast to i32 and |
| ; stored through %out. |
| %acc.bc = bitcast <4 x i8> %acc to i32 |
| store i32 %acc.bc, ptr addrspace(1) %out, align 4 |
| ret void |
| } |
| |
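| ; After AMDGPULateCodeGenPrepare we expect: |
| ; - The PHI is coerced to i32. |
| ; - A bitcast back to <4 x i8> materializes in the header to feed the add. |
| ; - In the store block, the coerced PHI is cast back to <4 x i8>, and the |
| ; original bitcast to i32 then feeds the store. |
| ; This proves the same-BB non-lookthrough user (add) did not get pruned |
| ; when the def is a PHI. |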
| |
| ; CHECK: loop: |
| ; CHECK: %[[ACC_TC:[^ ]+]] = phi i32 |
| ; CHECK: %[[ACC_TC_BC:[^ ]+]] = bitcast i32 %[[ACC_TC]] to <4 x i8> |
| ; CHECK: %[[ACC_NEXT:[^ ]+]] = add <4 x i8> %[[ACC_TC_BC]], %v |
| ; CHECK: br i1 %exit, label %store, label %loop |
| ; CHECK: store: |
| ; CHECK: %[[ACC_TC_BC2:[^ ]+]] = bitcast i32 %[[ACC_TC]] to <4 x i8> |
| ; CHECK: %[[ST_I32:[^ ]+]] = bitcast <4 x i8> %[[ACC_TC_BC2]] to i32 |
| ; CHECK: store i32 %[[ST_I32]], |
| |