//===---HexagonLoadStoreWidening.cpp---------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// HexagonStoreWidening:
// Replace sequences of "narrow" stores to adjacent memory locations with
// fewer "wide" stores that have the same effect.
// For example, replace:
// S4_storeirb_io %100, 0, 0 ; store-immediate-byte
// S4_storeirb_io %100, 1, 0 ; store-immediate-byte
// with
// S4_storeirh_io %100, 0, 0 ; store-immediate-halfword
// The above is the general idea. The actual cases handled by the code
// may be a bit more complex.
// The purpose of this pass is to reduce the number of outstanding stores,
// or as one could say, "reduce store queue pressure". Also, wide stores
// mean fewer stores, and since there are only two memory instructions allowed
// per packet, it also means fewer packets, and ultimately fewer cycles.
//
// HexagonLoadWidening does the same thing as HexagonStoreWidening, but
// for loads. Here, we try to replace two adjacent 4-byte loads with one
// register-pair load.
// For example:
// Replace
// %2:intregs = L2_loadri_io %1:intregs, 0 :: (load (s32) from %ptr1, align 8)
// %3:intregs = L2_loadri_io %1:intregs, 4 :: (load (s32) from %ptr2)
// with
// %4:doubleregs = L2_loadrd_io %1:intregs, 0 :: (load (s64) from %ptr1)
// %2:intregs = COPY %4.isub_lo:doubleregs
// %3:intregs = COPY %4.isub_hi:doubleregs
//
// LoadWidening for 8- and 16-bit loads is not useful, as we would end up
// generating 2N insts to replace N loads: 1 widened load, N bitwise ANDs,
// and N - 1 shifts.
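// (For example, N = 2 16-bit loads would become one widened load, two ANDs
// to extract each half, and one shift for the upper half: four instructions
// to replace two loads.)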
//===---------------------------------------------------------------------===//
#include "HexagonInstrInfo.h"
#include "HexagonRegisterInfo.h"
#include "HexagonSubtarget.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <vector>
using namespace llvm;
#define DEBUG_TYPE "hexagon-load-store-widening"
static cl::opt<unsigned> MaxMBBSizeForLoadStoreWidening(
"max-bb-size-for-load-store-widening", cl::Hidden, cl::init(1000),
cl::desc("Limit block size to analyze in load/store widening pass"));
namespace llvm {
FunctionPass *createHexagonStoreWidening();
FunctionPass *createHexagonLoadWidening();
void initializeHexagonStoreWideningPass(PassRegistry &);
void initializeHexagonLoadWideningPass(PassRegistry &);
} // end namespace llvm
namespace {
struct HexagonLoadStoreWidening {
enum WideningMode { Store, Load };
const HexagonInstrInfo *TII;
const HexagonRegisterInfo *TRI;
MachineRegisterInfo *MRI;
AliasAnalysis *AA;
MachineFunction *MF;
public:
HexagonLoadStoreWidening(const HexagonInstrInfo *TII,
const HexagonRegisterInfo *TRI,
MachineRegisterInfo *MRI, AliasAnalysis *AA,
MachineFunction *MF, bool StoreMode)
: TII(TII), TRI(TRI), MRI(MRI), AA(AA), MF(MF),
Mode(StoreMode ? WideningMode::Store : WideningMode::Load),
HII(MF->getSubtarget<HexagonSubtarget>().getInstrInfo()) {}
bool run();
private:
  const WideningMode Mode;
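  // The widest single access created by this pass is a double-word, i.e.
  // 8 bytes (S2_storerd*/L2_loadrd*).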
const unsigned MaxWideSize = 8;
const HexagonInstrInfo *HII = nullptr;
using InstrSet = SmallPtrSet<MachineInstr *, 16>;
using InstrGroup = SmallVector<MachineInstr *, 8>;
using InstrGroupList = SmallVector<InstrGroup, 8>;
InstrSet ProcessedInsts;
unsigned getBaseAddressRegister(const MachineInstr *MI);
int64_t getOffset(const MachineInstr *MI);
int64_t getPostIncrementValue(const MachineInstr *MI);
bool handledInstType(const MachineInstr *MI);
void createGroup(MachineInstr *BaseInst, InstrGroup &Group);
  void createGroups(MachineBasicBlock &MBB, InstrGroupList &Groups);
bool processBasicBlock(MachineBasicBlock &MBB);
bool processGroup(InstrGroup &Group);
bool selectInsts(InstrGroup::iterator Begin, InstrGroup::iterator End,
InstrGroup &OG, unsigned &TotalSize, unsigned MaxSize);
bool createWideInsts(InstrGroup &OG, InstrGroup &NG, unsigned TotalSize);
bool createWideStores(InstrGroup &OG, InstrGroup &NG, unsigned TotalSize);
bool createWideLoads(InstrGroup &OG, InstrGroup &NG, unsigned TotalSize);
bool replaceInsts(InstrGroup &OG, InstrGroup &NG);
bool areAdjacent(const MachineInstr *S1, const MachineInstr *S2);
bool canSwapInstructions(const MachineInstr *A, const MachineInstr *B);
};
struct HexagonStoreWidening : public MachineFunctionPass {
static char ID;
HexagonStoreWidening() : MachineFunctionPass(ID) {
initializeHexagonStoreWideningPass(*PassRegistry::getPassRegistry());
}
StringRef getPassName() const override { return "Hexagon Store Widening"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AAResultsWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
bool runOnMachineFunction(MachineFunction &MFn) override {
if (skipFunction(MFn.getFunction()))
return false;
auto &ST = MFn.getSubtarget<HexagonSubtarget>();
const HexagonInstrInfo *TII = ST.getInstrInfo();
const HexagonRegisterInfo *TRI = ST.getRegisterInfo();
MachineRegisterInfo *MRI = &MFn.getRegInfo();
AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
return HexagonLoadStoreWidening(TII, TRI, MRI, AA, &MFn, true).run();
}
};
struct HexagonLoadWidening : public MachineFunctionPass {
static char ID;
HexagonLoadWidening() : MachineFunctionPass(ID) {
initializeHexagonLoadWideningPass(*PassRegistry::getPassRegistry());
}
StringRef getPassName() const override { return "Hexagon Load Widening"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AAResultsWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
bool runOnMachineFunction(MachineFunction &MFn) override {
if (skipFunction(MFn.getFunction()))
return false;
auto &ST = MFn.getSubtarget<HexagonSubtarget>();
const HexagonInstrInfo *TII = ST.getInstrInfo();
const HexagonRegisterInfo *TRI = ST.getRegisterInfo();
MachineRegisterInfo *MRI = &MFn.getRegInfo();
AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
return HexagonLoadStoreWidening(TII, TRI, MRI, AA, &MFn, false).run();
}
};
char HexagonStoreWidening::ID = 0;
char HexagonLoadWidening::ID = 0;
} // end anonymous namespace
INITIALIZE_PASS_BEGIN(HexagonStoreWidening, "hexagon-widen-stores",
"Hexagon Store Widening", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(HexagonStoreWidening, "hexagon-widen-stores",
"Hexagon Store Widening", false, false)
INITIALIZE_PASS_BEGIN(HexagonLoadWidening, "hexagon-widen-loads",
"Hexagon Load Widening", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(HexagonLoadWidening, "hexagon-widen-loads",
"Hexagon Load Widening", false, false)
static const MachineMemOperand &getMemTarget(const MachineInstr *MI) {
assert(!MI->memoperands_empty() && "Expecting memory operands");
return **MI->memoperands_begin();
}
unsigned
HexagonLoadStoreWidening::getBaseAddressRegister(const MachineInstr *MI) {
assert(HexagonLoadStoreWidening::handledInstType(MI) && "Unhandled opcode");
unsigned Base, Offset;
HII->getBaseAndOffsetPosition(*MI, Base, Offset);
const MachineOperand &MO = MI->getOperand(Base);
assert(MO.isReg() && "Expecting register operand");
return MO.getReg();
}
int64_t HexagonLoadStoreWidening::getOffset(const MachineInstr *MI) {
assert(HexagonLoadStoreWidening::handledInstType(MI) && "Unhandled opcode");
// On Hexagon, post-incs always have an offset of 0
// There is no Offset operand to post-incs
if (HII->isPostIncrement(*MI))
return 0;
unsigned Base, Offset;
HII->getBaseAndOffsetPosition(*MI, Base, Offset);
const MachineOperand &MO = MI->getOperand(Offset);
switch (MO.getType()) {
case MachineOperand::MO_Immediate:
return MO.getImm();
case MachineOperand::MO_GlobalAddress:
return MO.getOffset();
default:
break;
}
llvm_unreachable("Expecting an immediate or global operand");
}
inline int64_t
HexagonLoadStoreWidening::getPostIncrementValue(const MachineInstr *MI) {
unsigned Base, PostIncIdx;
HII->getBaseAndOffsetPosition(*MI, Base, PostIncIdx);
const MachineOperand &MO = MI->getOperand(PostIncIdx);
return MO.getImm();
}
// Filtering function: any load/store whose opcode is not approved by this
// function will not be subjected to widening.
inline bool HexagonLoadStoreWidening::handledInstType(const MachineInstr *MI) {
unsigned Opc = MI->getOpcode();
if (Mode == WideningMode::Store) {
switch (Opc) {
case Hexagon::S4_storeirb_io:
case Hexagon::S4_storeirh_io:
case Hexagon::S4_storeiri_io:
case Hexagon::S2_storeri_io:
// Base address must be a register. (Implement FI later.)
return MI->getOperand(0).isReg();
case Hexagon::S2_storeri_pi:
return MI->getOperand(1).isReg();
}
} else {
    // Widening 8- and 16-bit loads would take 2N instructions to replace N
    // loads, so we only widen 32-bit loads, where we do not need to select
    // the right bits with AND and SHIFT ops.
switch (Opc) {
case Hexagon::L2_loadri_io:
// Base address must be a register and offset must be immediate.
return !MI->memoperands_empty() && MI->getOperand(1).isReg() &&
MI->getOperand(2).isImm();
case Hexagon::L2_loadri_pi:
return !MI->memoperands_empty() && MI->getOperand(2).isReg();
}
}
return false;
}
static void addDefsUsesToList(const MachineInstr *MI,
DenseSet<Register> &RegDefs,
DenseSet<Register> &RegUses) {
for (const auto &Op : MI->operands()) {
if (!Op.isReg())
continue;
if (Op.isDef())
RegDefs.insert(Op.getReg());
if (Op.readsReg())
RegUses.insert(Op.getReg());
}
}
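// Check whether instructions A and B can be safely reordered past each
// other: they must not access potentially aliasing memory when at least one
// of them stores, B must not read or redefine a register that A defines,
// and B must not redefine a register that A reads.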
bool HexagonLoadStoreWidening::canSwapInstructions(const MachineInstr *A,
const MachineInstr *B) {
DenseSet<Register> ARegDefs;
DenseSet<Register> ARegUses;
addDefsUsesToList(A, ARegDefs, ARegUses);
if (A->mayLoadOrStore() && B->mayLoadOrStore() &&
(A->mayStore() || B->mayStore()) && A->mayAlias(AA, *B, true))
return false;
for (const auto &BOp : B->operands()) {
if (!BOp.isReg())
continue;
if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
return false;
if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
return false;
}
return true;
}
// Inspect a machine basic block, and generate groups out of loads/stores
// encountered in the block.
//
// A load/store group is a group of loads or stores that use the same base
// register, and which can be reordered within that group without altering the
// semantics of the program. A single group could be widened as
// a whole, if there existed a single load/store instruction with the same
// semantics as the entire group. In many cases, a single group may need more
// than one wide load or store.
void HexagonLoadStoreWidening::createGroups(MachineBasicBlock &MBB,
                                            InstrGroupList &Groups) {
  // Traverse all instructions and if we encounter a load/store, then try to
  // create a group starting at that instruction, i.e. a sequence of
  // independent loads/stores that can be widened.
for (auto I = MBB.begin(); I != MBB.end(); ++I) {
MachineInstr *MI = &(*I);
if (!handledInstType(MI))
continue;
if (ProcessedInsts.count(MI))
continue;
    // Found a load/store. Try to create a group starting at it.
InstrGroup G;
createGroup(MI, G);
if (G.size() > 1)
      Groups.push_back(G);
}
}
// Create a single load/store group. The instructions need to be independent
// of one another, and there cannot be other instructions between them
// that could read or modify the storage being read from or stored into.
void HexagonLoadStoreWidening::createGroup(MachineInstr *BaseInst,
InstrGroup &Group) {
assert(handledInstType(BaseInst) && "Unexpected instruction");
unsigned BaseReg = getBaseAddressRegister(BaseInst);
InstrGroup Other;
Group.push_back(BaseInst);
LLVM_DEBUG(dbgs() << "BaseInst: "; BaseInst->dump());
auto End = BaseInst->getParent()->end();
auto I = BaseInst->getIterator();
while (true) {
I = std::next(I);
if (I == End)
break;
MachineInstr *MI = &(*I);
// Assume calls are aliased to everything.
if (MI->isCall() || MI->hasUnmodeledSideEffects() ||
MI->hasOrderedMemoryRef())
return;
if (!handledInstType(MI)) {
if (MI->mayLoadOrStore())
Other.push_back(MI);
continue;
}
// We have a handledInstType instruction
// If this load/store instruction is aliased with anything already in the
// group, terminate the group now.
for (auto GI : Group)
if (GI->mayAlias(AA, *MI, true))
return;
if (Mode == WideningMode::Load) {
      // Check if the current load MI can be moved up to the position of the
      // first load in Group. If it cannot be swapped with any intervening
      // memory instruction in Other, terminate the group.
for (auto MemI : Other)
if (!canSwapInstructions(MI, MemI))
return;
} else {
      // Check if the stores already in the group can be moved down to the
      // position of the current store MI. If any of them cannot be swapped
      // with an intervening memory instruction in Other, terminate the group.
for (auto MemI : Other) {
if (std::distance(Group.back()->getIterator(), MemI->getIterator()) <=
0)
continue;
for (auto GI : Group)
if (!canSwapInstructions(MemI, GI))
return;
}
}
unsigned BR = getBaseAddressRegister(MI);
if (BR == BaseReg) {
LLVM_DEBUG(dbgs() << "Added MI to group: "; MI->dump());
Group.push_back(MI);
ProcessedInsts.insert(MI);
}
} // while
}
// Check if load/store instructions S1 and S2 are adjacent. More precisely,
// S2 has to access memory immediately following that accessed by S1.
bool HexagonLoadStoreWidening::areAdjacent(const MachineInstr *S1,
const MachineInstr *S2) {
if (!handledInstType(S1) || !handledInstType(S2))
return false;
const MachineMemOperand &S1MO = getMemTarget(S1);
  // Only constant offsets are handled (immediates or global-address offsets).
int Off1 = getOffset(S1);
int Off2 = getOffset(S2);
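  // Offsets may be negative. When Off1 < 0, the unsigned addition wraps, so
  // the sum is converted back to a signed int before comparing with Off2.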
return (Off1 >= 0) ? Off1 + S1MO.getSize().getValue() == unsigned(Off2)
: int(Off1 + S1MO.getSize().getValue()) == Off2;
}
/// Given a sequence of adjacent loads/stores, and a maximum size of a single
/// wide inst, pick a group of insts that can be replaced by a single load/store
/// of size not exceeding MaxSize. The selected sequence will be recorded
/// in OG ("old group" of instructions).
/// OG should be empty on entry, and should be left empty if the function
/// fails.
bool HexagonLoadStoreWidening::selectInsts(InstrGroup::iterator Begin,
InstrGroup::iterator End,
InstrGroup &OG, unsigned &TotalSize,
unsigned MaxSize) {
assert(Begin != End && "No instructions to analyze");
assert(OG.empty() && "Old group not empty on entry");
if (std::distance(Begin, End) <= 1)
return false;
MachineInstr *FirstMI = *Begin;
assert(!FirstMI->memoperands_empty() && "Expecting some memory operands");
const MachineMemOperand &FirstMMO = getMemTarget(FirstMI);
if (!FirstMMO.getType().isValid())
return false;
unsigned Alignment = FirstMMO.getAlign().value();
unsigned SizeAccum = FirstMMO.getSize().getValue();
unsigned FirstOffset = getOffset(FirstMI);
// The initial value of SizeAccum should always be a power of 2.
assert(isPowerOf2_32(SizeAccum) && "First store size not a power of 2");
  // If the size of the first load/store equals or exceeds the limit, do
  // nothing.
if (SizeAccum >= MaxSize)
return false;
  // If the size of the first load/store is greater than or equal to the
  // alignment of its address, the inst cannot be made any wider.
  if (SizeAccum >= Alignment) {
    LLVM_DEBUG(
        dbgs() << "Size of load/store greater than or equal to alignment\n");
return false;
}
// The offset of a load/store will put restrictions on how wide the inst can
// be. Offsets in loads/stores of size 2^n bytes need to have the n lowest
// bits be 0. If the first inst already exhausts the offset limits, quit.
// Test this by checking if the next wider size would exceed the limit.
// For post-increment instructions, the increment amount needs to follow the
// same rule.
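  // For example, a 4-byte access at offset 12 cannot be doubled to 8 bytes:
  // the widened access would need an offset divisible by 8, but 12 & 7 != 0.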
unsigned OffsetOrIncVal = 0;
if (HII->isPostIncrement(*FirstMI))
OffsetOrIncVal = getPostIncrementValue(FirstMI);
else
OffsetOrIncVal = FirstOffset;
if ((2 * SizeAccum - 1) & OffsetOrIncVal) {
LLVM_DEBUG(dbgs() << "Instruction cannot be widened as the offset/postinc"
<< " value: " << getPostIncrementValue(FirstMI)
<< " is invalid in the widened version\n");
return false;
}
OG.push_back(FirstMI);
MachineInstr *S1 = FirstMI;
// Pow2Num will be the largest number of elements in OG such that the sum
// of sizes of loads/stores 0...Pow2Num-1 will be a power of 2.
unsigned Pow2Num = 1;
unsigned Pow2Size = SizeAccum;
bool HavePostInc = HII->isPostIncrement(*S1);
  // Be greedy: keep accumulating insts as long as they access adjacent
  // memory locations, and as long as the total number of bytes accessed
  // does not exceed the limit (MaxSize).
// Keep track of when the total size covered is a power of 2, since
// this is a size a single load/store can cover.
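  // For example, three adjacent byte stores at offsets 0, 1, and 2 accumulate
  // to 3 bytes; only the first two (Pow2Size = 2) can become a single
  // halfword store, and OG is trimmed back to that prefix below.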
for (InstrGroup::iterator I = Begin + 1; I != End; ++I) {
MachineInstr *S2 = *I;
    // Insts are sorted by offset, so if S1 and S2 are not adjacent, no other
    // inst in the group can fill the "hole".
if (!areAdjacent(S1, S2))
break;
    // Cannot widen two post-increments; that would require returning two
    // registers with incremented values.
if (HavePostInc && HII->isPostIncrement(*S2))
break;
unsigned S2Size = getMemTarget(S2).getSize().getValue();
if (SizeAccum + S2Size > std::min(MaxSize, Alignment))
break;
OG.push_back(S2);
SizeAccum += S2Size;
if (isPowerOf2_32(SizeAccum)) {
Pow2Num = OG.size();
Pow2Size = SizeAccum;
}
if ((2 * Pow2Size - 1) & FirstOffset)
break;
S1 = S2;
}
// The insts don't add up to anything that can be widened. Clean up.
if (Pow2Num <= 1) {
OG.clear();
return false;
}
// Only leave the loads/stores being widened.
OG.resize(Pow2Num);
TotalSize = Pow2Size;
return true;
}
/// Given an "old group" OG of insts, create a "new group" NG of instructions
/// to replace them.
bool HexagonLoadStoreWidening::createWideInsts(InstrGroup &OG, InstrGroup &NG,
unsigned TotalSize) {
if (Mode == WideningMode::Store) {
return createWideStores(OG, NG, TotalSize);
}
return createWideLoads(OG, NG, TotalSize);
}
/// Given an "old group" OG of stores, create a "new group" NG of instructions
/// to replace them. Ideally, NG would only have a single instruction in it,
/// but that may only be possible for store-immediate.
bool HexagonLoadStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG,
unsigned TotalSize) {
// XXX Current limitations:
// - only handle a TotalSize of up to 8
LLVM_DEBUG(dbgs() << "Creating wide stores\n");
if (TotalSize > MaxWideSize)
return false;
uint64_t Acc = 0; // Value accumulator.
unsigned Shift = 0;
bool HaveImm = false;
bool HaveReg = false;
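  // Stores in OG are sorted by increasing offset, so the store at the lowest
  // offset supplies the lowest bits of Acc (little-endian). For example,
  // storing byte 0x12 at offset 0 and byte 0x34 at offset 1 yields
  // Acc = 0x3412, which a single halfword store writes back identically.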
for (MachineInstr *MI : OG) {
const MachineMemOperand &MMO = getMemTarget(MI);
MachineOperand &SO = HII->isPostIncrement(*MI)
? MI->getOperand(3)
: MI->getOperand(2); // Source.
unsigned NBits;
uint64_t Mask;
uint64_t Val;
switch (SO.getType()) {
case MachineOperand::MO_Immediate:
LLVM_DEBUG(dbgs() << "Have store immediate\n");
HaveImm = true;
NBits = MMO.getSizeInBits().toRaw();
Mask = (0xFFFFFFFFFFFFFFFFU >> (64 - NBits));
Val = (SO.getImm() & Mask) << Shift;
Acc |= Val;
Shift += NBits;
break;
case MachineOperand::MO_Register:
HaveReg = true;
break;
default:
LLVM_DEBUG(dbgs() << "Unhandled store\n");
return false;
}
}
if (HaveImm && HaveReg) {
LLVM_DEBUG(dbgs() << "Cannot merge store register and store imm\n");
return false;
}
MachineInstr *FirstSt = OG.front();
DebugLoc DL = OG.back()->getDebugLoc();
const MachineMemOperand &OldM = getMemTarget(FirstSt);
MachineMemOperand *NewM =
MF->getMachineMemOperand(OldM.getPointerInfo(), OldM.getFlags(),
TotalSize, OldM.getAlign(), OldM.getAAInfo());
MachineInstr *StI;
MachineOperand &MR =
(HII->isPostIncrement(*FirstSt) ? FirstSt->getOperand(1)
: FirstSt->getOperand(0));
auto SecondSt = OG.back();
if (HaveReg) {
MachineOperand FReg =
(HII->isPostIncrement(*FirstSt) ? FirstSt->getOperand(3)
: FirstSt->getOperand(2));
// Post increments appear first in the sorted group.
// Cannot have a post increment for the second instruction
assert(!HII->isPostIncrement(*SecondSt) && "Unexpected PostInc");
MachineOperand SReg = SecondSt->getOperand(2);
assert(FReg.isReg() && SReg.isReg() &&
"Cannot merge store register and store imm");
const MCInstrDesc &CombD = TII->get(Hexagon::A2_combinew);
Register VReg =
MF->getRegInfo().createVirtualRegister(&Hexagon::DoubleRegsRegClass);
MachineInstr *CombI = BuildMI(*MF, DL, CombD, VReg).add(SReg).add(FReg);
NG.push_back(CombI);
if (FirstSt->getOpcode() == Hexagon::S2_storeri_pi) {
const MCInstrDesc &StD = TII->get(Hexagon::S2_storerd_pi);
auto IncDestMO = FirstSt->getOperand(0);
auto IncMO = FirstSt->getOperand(2);
StI =
BuildMI(*MF, DL, StD).add(IncDestMO).add(MR).add(IncMO).addReg(VReg);
} else {
const MCInstrDesc &StD = TII->get(Hexagon::S2_storerd_io);
auto OffMO = FirstSt->getOperand(1);
StI = BuildMI(*MF, DL, StD).add(MR).add(OffMO).addReg(VReg);
}
StI->addMemOperand(*MF, NewM);
NG.push_back(StI);
return true;
}
  // Handle store immediates.
  // There are no post-increment store-immediates on Hexagon.
assert(!HII->isPostIncrement(*FirstSt) && "Unexpected PostInc");
auto Off = FirstSt->getOperand(1).getImm();
if (TotalSize == 8) {
// Create vreg = A2_tfrsi #Acc; nreg = combine(#s32, vreg); memd = nreg
uint64_t Mask = 0xFFFFFFFFU;
int LowerAcc = int(Mask & Acc);
int UpperAcc = Acc >> 32;
Register DReg =
MF->getRegInfo().createVirtualRegister(&Hexagon::DoubleRegsRegClass);
MachineInstr *CombI;
if (Acc != 0) {
const MCInstrDesc &TfrD = TII->get(Hexagon::A2_tfrsi);
const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI, *MF);
Register VReg = MF->getRegInfo().createVirtualRegister(RC);
MachineInstr *TfrI = BuildMI(*MF, DL, TfrD, VReg).addImm(LowerAcc);
NG.push_back(TfrI);
const MCInstrDesc &CombD = TII->get(Hexagon::A4_combineir);
CombI = BuildMI(*MF, DL, CombD, DReg)
.addImm(UpperAcc)
.addReg(VReg, RegState::Kill);
}
// If immediates are 0, we do not need A2_tfrsi
else {
const MCInstrDesc &CombD = TII->get(Hexagon::A4_combineii);
CombI = BuildMI(*MF, DL, CombD, DReg).addImm(0).addImm(0);
}
NG.push_back(CombI);
const MCInstrDesc &StD = TII->get(Hexagon::S2_storerd_io);
StI =
BuildMI(*MF, DL, StD).add(MR).addImm(Off).addReg(DReg, RegState::Kill);
} else if (Acc < 0x10000) {
// Create mem[hw] = #Acc
unsigned WOpc = (TotalSize == 2) ? Hexagon::S4_storeirh_io
: (TotalSize == 4) ? Hexagon::S4_storeiri_io
: 0;
assert(WOpc && "Unexpected size");
int Val = (TotalSize == 2) ? int16_t(Acc) : int(Acc);
const MCInstrDesc &StD = TII->get(WOpc);
StI = BuildMI(*MF, DL, StD).add(MR).addImm(Off).addImm(Val);
} else {
// Create vreg = A2_tfrsi #Acc; mem[hw] = vreg
const MCInstrDesc &TfrD = TII->get(Hexagon::A2_tfrsi);
const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI, *MF);
Register VReg = MF->getRegInfo().createVirtualRegister(RC);
MachineInstr *TfrI = BuildMI(*MF, DL, TfrD, VReg).addImm(int(Acc));
NG.push_back(TfrI);
unsigned WOpc = (TotalSize == 2) ? Hexagon::S2_storerh_io
: (TotalSize == 4) ? Hexagon::S2_storeri_io
: 0;
assert(WOpc && "Unexpected size");
const MCInstrDesc &StD = TII->get(WOpc);
StI =
BuildMI(*MF, DL, StD).add(MR).addImm(Off).addReg(VReg, RegState::Kill);
}
StI->addMemOperand(*MF, NewM);
NG.push_back(StI);
return true;
}
/// Given an "old group" OG of loads, create a "new group" NG of instructions
/// to replace them. Ideally, NG would only have a single instruction in it,
/// but that may only be possible for double register loads.
bool HexagonLoadStoreWidening::createWideLoads(InstrGroup &OG, InstrGroup &NG,
unsigned TotalSize) {
LLVM_DEBUG(dbgs() << "Creating wide loads\n");
  // XXX Current limitations:
  // - only expect two adjacent 32-bit loads in OG,
  // - only handle a TotalSize of up to 8
if (TotalSize > MaxWideSize)
return false;
assert(OG.size() == 2 && "Expecting two elements in Instruction Group.");
MachineInstr *FirstLd = OG.front();
const MachineMemOperand &OldM = getMemTarget(FirstLd);
MachineMemOperand *NewM =
MF->getMachineMemOperand(OldM.getPointerInfo(), OldM.getFlags(),
TotalSize, OldM.getAlign(), OldM.getAAInfo());
MachineOperand &MR = FirstLd->getOperand(0);
MachineOperand &MRBase =
(HII->isPostIncrement(*FirstLd) ? FirstLd->getOperand(2)
: FirstLd->getOperand(1));
DebugLoc DL = OG.back()->getDebugLoc();
// Create the double register Load Instruction.
Register NewMR = MRI->createVirtualRegister(&Hexagon::DoubleRegsRegClass);
MachineInstr *LdI;
// Post increments appear first in the sorted group
if (FirstLd->getOpcode() == Hexagon::L2_loadri_pi) {
auto IncDestMO = FirstLd->getOperand(1);
auto IncMO = FirstLd->getOperand(3);
LdI = BuildMI(*MF, DL, TII->get(Hexagon::L2_loadrd_pi))
.addDef(NewMR, getKillRegState(MR.isKill()), MR.getSubReg())
.add(IncDestMO)
.add(MRBase)
.add(IncMO);
LdI->addMemOperand(*MF, NewM);
} else {
auto OffMO = FirstLd->getOperand(2);
LdI = BuildMI(*MF, DL, TII->get(Hexagon::L2_loadrd_io))
.addDef(NewMR, getKillRegState(MR.isKill()), MR.getSubReg())
.add(MRBase)
.add(OffMO);
LdI->addMemOperand(*MF, NewM);
}
NG.push_back(LdI);
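  // Rewire the two original 32-bit destinations as copies of the halves of
  // the wide load: the low subregister replaces the load at the lower offset
  // (OG.front()), the high subregister the one at the higher offset
  // (OG.back()).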
auto getHalfReg = [&](MachineInstr *DoubleReg, unsigned SubReg,
MachineInstr *DstReg) {
Register DestReg = DstReg->getOperand(0).getReg();
return BuildMI(*MF, DL, TII->get(Hexagon::COPY), DestReg)
.addReg(NewMR, getKillRegState(LdI->isKill()), SubReg);
};
MachineInstr *LdI_lo = getHalfReg(LdI, Hexagon::isub_lo, FirstLd);
MachineInstr *LdI_hi = getHalfReg(LdI, Hexagon::isub_hi, OG.back());
NG.push_back(LdI_lo);
NG.push_back(LdI_hi);
return true;
}
// Replace instructions from the old group OG with instructions from the
// new group NG. Conceptually, remove all instructions in OG, and then
// insert all instructions in NG, starting at where the first instruction
// from OG was (in the order in which they appeared in the basic block).
// (The ordering in OG does not have to match the order in the basic block.)
bool HexagonLoadStoreWidening::replaceInsts(InstrGroup &OG, InstrGroup &NG) {
LLVM_DEBUG({
dbgs() << "Replacing:\n";
for (auto I : OG)
dbgs() << " " << *I;
dbgs() << "with\n";
for (auto I : NG)
dbgs() << " " << *I;
});
MachineBasicBlock *MBB = OG.back()->getParent();
MachineBasicBlock::iterator InsertAt = MBB->end();
// Need to establish the insertion point.
// For loads the best one is right before the first load in the OG,
// but in the order in which the insts occur in the program list.
// For stores the best point is right after the last store in the OG.
// Since the ordering in OG does not correspond
// to the order in the program list, we need to do some work to find
// the insertion point.
// Create a set of all instructions in OG (for quick lookup).
InstrSet OldMemInsts;
for (auto *I : OG)
OldMemInsts.insert(I);
if (Mode == WideningMode::Load) {
// Find the first load instruction in the block that is present in OG.
for (auto &I : *MBB) {
if (OldMemInsts.count(&I)) {
InsertAt = I;
break;
}
}
assert((InsertAt != MBB->end()) && "Cannot locate any load from the group");
for (auto *I : NG)
MBB->insert(InsertAt, I);
} else {
// Find the last store instruction in the block that is present in OG.
auto I = MBB->rbegin();
for (; I != MBB->rend(); ++I) {
if (OldMemInsts.count(&(*I))) {
InsertAt = (*I).getIterator();
break;
}
}
assert((I != MBB->rend()) && "Cannot locate any store from the group");
for (auto I = NG.rbegin(); I != NG.rend(); ++I)
MBB->insertAfter(InsertAt, *I);
}
for (auto *I : OG)
I->eraseFromParent();
return true;
}
// Break up the group into smaller groups, each of which can be replaced by
// a single wide load/store. Widen each such smaller group and replace the old
// instructions with the widened ones.
bool HexagonLoadStoreWidening::processGroup(InstrGroup &Group) {
bool Changed = false;
InstrGroup::iterator I = Group.begin(), E = Group.end();
InstrGroup OG, NG; // Old and new groups.
unsigned CollectedSize;
while (I != E) {
OG.clear();
NG.clear();
bool Succ = selectInsts(I++, E, OG, CollectedSize, MaxWideSize) &&
createWideInsts(OG, NG, CollectedSize) && replaceInsts(OG, NG);
if (!Succ)
continue;
assert(OG.size() > 1 && "Created invalid group");
assert(std::distance(I, E) + 1 >= int(OG.size()) && "Too many elements");
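    // I was already advanced past the first instruction of OG by the I++
    // above, so skip only the remaining OG.size() - 1 replaced instructions.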
I += OG.size() - 1;
Changed = true;
}
return Changed;
}
// Process a single basic block: create the load/store groups, and replace
// them with the widened insts, if possible. Processing of each basic block
// is independent from processing of any other basic block. This
// transformation could be stopped after having processed any basic block
// without any ill effects (other than not having performed widening in the
// unprocessed blocks). Also, the basic blocks can be processed in any order.
bool HexagonLoadStoreWidening::processBasicBlock(MachineBasicBlock &MBB) {
InstrGroupList SGs;
bool Changed = false;
  // To prevent long compile time, check for max BB size.
if (MBB.size() > MaxMBBSizeForLoadStoreWidening)
return false;
createGroups(MBB, SGs);
auto Less = [this](const MachineInstr *A, const MachineInstr *B) -> bool {
return getOffset(A) < getOffset(B);
};
for (auto &G : SGs) {
assert(G.size() > 1 && "Group with fewer than 2 elements");
llvm::sort(G, Less);
Changed |= processGroup(G);
}
return Changed;
}
bool HexagonLoadStoreWidening::run() {
bool Changed = false;
for (auto &B : *MF)
Changed |= processBasicBlock(B);
return Changed;
}
FunctionPass *llvm::createHexagonStoreWidening() {
return new HexagonStoreWidening();
}
FunctionPass *llvm::createHexagonLoadWidening() {
return new HexagonLoadWidening();
}