[AArch64][GlobalISel] Fix incorrect handling of fp truncating stores.

When the tablegen patterns fail to select a truncating scalar FPR store,
our manual selection code also silently failed to handle it, trying to
generate an invalid copy instead. Fix this by adding support in the
manual code to generate a proper subreg copy before selecting a
non-truncating store.
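
For example, for the truncating_f32 case in the new MIR test below:

  %0:fpr(s64) = COPY $d0
  %1:gpr(p0) = G_FRAME_INDEX %stack.0.alloca
  G_STORE %0(s64), %1(p0) :: (store (s32) into %ir.alloca)

we now copy the .ssub subregister of the s64 value into an fpr32
register and select a plain STRSui for the narrowed value, rather than
attempting an invalid full-width copy.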

(cherry picked from commit 04fb9b729a53a5f513328159590d86d28336a6da)
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index a17bd1b..7980ddb 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -22,6 +22,7 @@
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "MCTargetDesc/AArch64MCTargetDesc.h"
 #include "llvm/ADT/Optional.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
@@ -2701,8 +2702,9 @@
   case TargetOpcode::G_ZEXTLOAD:
   case TargetOpcode::G_LOAD:
   case TargetOpcode::G_STORE: {
+    GLoadStore &LdSt = cast<GLoadStore>(I);
     bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
-    LLT PtrTy = MRI.getType(I.getOperand(1).getReg());
+    LLT PtrTy = MRI.getType(LdSt.getPointerReg());
 
     if (PtrTy != LLT::pointer(0, 64)) {
       LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
@@ -2710,20 +2712,19 @@
       return false;
     }
 
-    auto &MemOp = **I.memoperands_begin();
-    uint64_t MemSizeInBytes = MemOp.getSize();
-    unsigned MemSizeInBits = MemSizeInBytes * 8;
-    AtomicOrdering Order = MemOp.getSuccessOrdering();
+    uint64_t MemSizeInBytes = LdSt.getMemSize();
+    unsigned MemSizeInBits = LdSt.getMemSizeInBits();
+    AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering();
 
     // Need special instructions for atomics that affect ordering.
     if (Order != AtomicOrdering::NotAtomic &&
         Order != AtomicOrdering::Unordered &&
         Order != AtomicOrdering::Monotonic) {
-      assert(I.getOpcode() != TargetOpcode::G_ZEXTLOAD);
+      assert(!isa<GZExtLoad>(LdSt));
       if (MemSizeInBytes > 64)
         return false;
 
-      if (I.getOpcode() == TargetOpcode::G_LOAD) {
+      if (isa<GLoad>(LdSt)) {
         static unsigned Opcodes[] = {AArch64::LDARB, AArch64::LDARH,
                                      AArch64::LDARW, AArch64::LDARX};
         I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
@@ -2737,7 +2738,7 @@
     }
 
 #ifndef NDEBUG
-    const Register PtrReg = I.getOperand(1).getReg();
+    const Register PtrReg = LdSt.getPointerReg();
     const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
     // Sanity-check the pointer register.
     assert(PtrRB.getID() == AArch64::GPRRegBankID &&
@@ -2746,13 +2747,31 @@
            "Load/Store pointer operand isn't a pointer");
 #endif
 
-    const Register ValReg = I.getOperand(0).getReg();
+    const Register ValReg = LdSt.getReg(0);
+    const LLT ValTy = MRI.getType(ValReg);
     const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
 
+    // The code below doesn't support truncating stores, so we need to split it
+    // again.
+    if (isa<GStore>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
+      unsigned SubReg;
+      LLT MemTy = LdSt.getMMO().getMemoryType();
+      auto *RC = getRegClassForTypeOnBank(MemTy, RB, RBI);
+      if (!getSubRegForClass(RC, TRI, SubReg))
+        return false;
+
+      // Generate a subreg copy.
+      auto Copy = MIB.buildInstr(TargetOpcode::COPY, {MemTy}, {})
+                      .addReg(ValReg, 0, SubReg)
+                      .getReg(0);
+      RBI.constrainGenericRegister(Copy, *RC, MRI);
+      LdSt.getOperand(0).setReg(Copy);
+    }
+
     // Helper lambda for partially selecting I. Either returns the original
     // instruction with an updated opcode, or a new instruction.
     auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
-      bool IsStore = I.getOpcode() == TargetOpcode::G_STORE;
+      bool IsStore = isa<GStore>(I);
       const unsigned NewOpc =
           selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
       if (NewOpc == I.getOpcode())
@@ -2769,7 +2788,8 @@
 
       // Folded something. Create a new instruction and return it.
       auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags());
-      IsStore ? NewInst.addUse(ValReg) : NewInst.addDef(ValReg);
+      Register CurValReg = I.getOperand(0).getReg();
+      IsStore ? NewInst.addUse(CurValReg) : NewInst.addDef(CurValReg);
       NewInst.cloneMemRefs(I);
       for (auto &Fn : *AddrModeFns)
         Fn(NewInst);
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-store-truncating-float.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-store-truncating-float.mir
new file mode 100644
index 0000000..3e06016
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-store-truncating-float.mir
@@ -0,0 +1,116 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+--- |
+  define void @truncating_f32(double %x) {
+    %alloca = alloca i32, align 4
+    %bitcast = bitcast double %x to i64
+    %trunc = trunc i64 %bitcast to i32
+    store i32 %trunc, i32* %alloca, align 4
+    ret void
+  }
+
+  define void @truncating_f16(double %x) {
+    %alloca = alloca i16, align 2
+    %bitcast = bitcast double %x to i64
+    %trunc = trunc i64 %bitcast to i16
+    store i16 %trunc, i16* %alloca, align 2
+    ret void
+  }
+
+  define void @truncating_f8(double %x) {
+    %alloca = alloca i8, align 1
+    %bitcast = bitcast double %x to i64
+    %trunc = trunc i64 %bitcast to i8
+    store i8 %trunc, i8* %alloca, align 1
+    ret void
+  }
+
+...
+---
+name:            truncating_f32
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+liveins:
+  - { reg: '$d0' }
+frameInfo:
+  maxAlignment:    4
+stack:
+  - { id: 0, name: alloca, size: 4, alignment: 4 }
+machineFunctionInfo: {}
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $d0
+
+    ; CHECK-LABEL: name: truncating_f32
+    ; CHECK: liveins: $d0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
+    ; CHECK: [[COPY1:%[0-9]+]]:fpr32 = COPY [[COPY]].ssub
+    ; CHECK: STRSui [[COPY1]], %stack.0.alloca, 0 :: (store (s32) into %ir.alloca)
+    ; CHECK: RET_ReallyLR
+    %0:fpr(s64) = COPY $d0
+    %1:gpr(p0) = G_FRAME_INDEX %stack.0.alloca
+    G_STORE %0(s64), %1(p0) :: (store (s32) into %ir.alloca)
+    RET_ReallyLR
+
+...
+---
+name:            truncating_f16
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+liveins:
+  - { reg: '$d0' }
+frameInfo:
+  maxAlignment:    2
+stack:
+  - { id: 0, name: alloca, size: 2, alignment: 2 }
+machineFunctionInfo: {}
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $d0
+
+    ; CHECK-LABEL: name: truncating_f16
+    ; CHECK: liveins: $d0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
+    ; CHECK: [[COPY1:%[0-9]+]]:fpr16 = COPY [[COPY]].hsub
+    ; CHECK: STRHui [[COPY1]], %stack.0.alloca, 0 :: (store (s16) into %ir.alloca)
+    ; CHECK: RET_ReallyLR
+    %0:fpr(s64) = COPY $d0
+    %1:gpr(p0) = G_FRAME_INDEX %stack.0.alloca
+    G_STORE %0(s64), %1(p0) :: (store (s16) into %ir.alloca)
+    RET_ReallyLR
+
+...
+---
+name:            truncating_f8
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+liveins:
+  - { reg: '$d0' }
+frameInfo:
+  maxAlignment:    1
+stack:
+  - { id: 0, name: alloca, size: 1, alignment: 1 }
+machineFunctionInfo: {}
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $d0
+
+    ; CHECK-LABEL: name: truncating_f8
+    ; CHECK: liveins: $d0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
+    ; CHECK: [[COPY1:%[0-9]+]]:fpr16 = COPY [[COPY]].hsub
+    ; CHECK: [[COPY2:%[0-9]+]]:fpr8 = COPY [[COPY1]]
+    ; CHECK: STRBui [[COPY2]], %stack.0.alloca, 0 :: (store (s8) into %ir.alloca)
+    ; CHECK: RET_ReallyLR
+    %0:fpr(s64) = COPY $d0
+    %1:gpr(p0) = G_FRAME_INDEX %stack.0.alloca
+    G_STORE %0(s64), %1(p0) :: (store (s8) into %ir.alloca)
+    RET_ReallyLR
+
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-with-no-legality-check.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-with-no-legality-check.mir
index e0983db..9d044d1 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select-with-no-legality-check.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-with-no-legality-check.mir
@@ -278,13 +278,13 @@
     ; CHECK-LABEL: name: test_rule96_id2146_at_idx8070
     ; CHECK: liveins: $x0
     ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
-    ; CHECK: [[LDRBui:%[0-9]+]]:fpr8 = LDRBui [[COPY]], 0 :: (load (s1))
+    ; CHECK: [[LDRBui:%[0-9]+]]:fpr8 = LDRBui [[COPY]], 0 :: (load (s8))
     ; CHECK: [[COPY1:%[0-9]+]]:gpr32 = COPY [[LDRBui]]
-    ; CHECK: [[UBFMWri:%[0-9]+]]:gpr32 = UBFMWri [[COPY1]], 0, 0
+    ; CHECK: [[UBFMWri:%[0-9]+]]:gpr32 = UBFMWri [[COPY1]], 0, 7
     ; CHECK: $noreg = PATCHABLE_RET [[UBFMWri]]
     %2:gpr(p0) = COPY $x0
-    %0:fpr(s1) = G_LOAD %2(p0) :: (load (s1))
-    %1:gpr(s32) = G_ZEXT %0(s1)
+    %0:fpr(s8) = G_LOAD %2(p0) :: (load (s8))
+    %1:gpr(s32) = G_ZEXT %0(s8)
     $noreg = PATCHABLE_RET %1(s32)
 
 ...