llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp - rust-lang/llvm-project - Git at Google

 //===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This pass performs below peephole optimizations on MIR level.
 //
 // 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
 //    MOVi64imm + ANDXrr ==> ANDXri + ANDXri
 //
 // 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi
 //    MOVi64imm + ADDXrr ==> ANDXri + ANDXri
 //
 // 3. MOVi32imm + SUBWrr ==> SUBWRi + SUBWRi
 //    MOVi64imm + SUBXrr ==> SUBXri + SUBXri
 //
 //    The mov pseudo instruction could be expanded to multiple mov instructions
 //    later. In this case, we could try to split the constant  operand of mov
 //    instruction into two immediates which can be directly encoded into
 //    *Wri/*Xri instructions. It makes two AND/ADD/SUB instructions instead of
 //    multiple `mov` + `and/add/sub` instructions.
 //
 // 4. Remove redundant ORRWrs which is generated by zero-extend.
 //
 //    %3:gpr32 = ORRWrs $wzr, %2, 0
 //    %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32
 //
 //    If AArch64's 32-bit form of instruction defines the source operand of
 //    ORRWrs, we can remove the ORRWrs because the upper 32 bits of the source
 //    operand are set to zero.
 //
 //===----------------------------------------------------------------------===//

 #include "AArch64ExpandImm.h"
 #include "AArch64InstrInfo.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"

 using namespace llvm;

 #define DEBUG_TYPE "aarch64-mi-peephole-opt"

 namespace {

 struct AArch64MIPeepholeOpt : public MachineFunctionPass {
   static char ID;

   AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {
     initializeAArch64MIPeepholeOptPass(*PassRegistry::getPassRegistry());
   }

   const AArch64InstrInfo *TII;
   const AArch64RegisterInfo *TRI;
   MachineLoopInfo *MLI;
   MachineRegisterInfo *MRI;

   template <typename T>
   using SplitAndOpcFunc =
       std::function<Optional<unsigned>(T, unsigned, T &, T &)>;
   using BuildMIFunc =
       std::function<void(MachineInstr &, unsigned, unsigned, unsigned, Register,
                          Register, Register)>;

   /// For instructions where an immediate operand could be split into two
   /// separate immediate instructions, use the splitTwoPartImm two handle the
   /// optimization.
   ///
   /// To implement, the following function types must be passed to
   /// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if
   /// splitting the immediate is valid and returns the associated new opcode. A
   /// BuildMIFunc must be implemented to build the two immediate instructions.
   ///
   /// Example Pattern (where IMM would require 2+ MOV instructions):
   ///     %dst = <Instr>rr %src IMM [...]
   /// becomes:
   ///     %tmp = <Instr>ri %src (encode half IMM) [...]
   ///     %dst = <Instr>ri %tmp (encode half IMM) [...]
   template <typename T>
   bool splitTwoPartImm(MachineInstr &MI,
                        SmallSetVector<MachineInstr *, 8> &ToBeRemoved,
                        SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr);

   bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
                         MachineInstr *&SubregToRegMI);

   template <typename T>
   bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI,
                    SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
   template <typename T>
   bool visitAND(unsigned Opc, MachineInstr &MI,
                 SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
   bool visitORR(MachineInstr &MI,
                 SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
   bool runOnMachineFunction(MachineFunction &MF) override;

   StringRef getPassName() const override {
     return "AArch64 MI Peephole Optimization pass";
   }

   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     AU.addRequired<MachineLoopInfo>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 };

 char AArch64MIPeepholeOpt::ID = 0;

 } // end anonymous namespace

 INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
                 "AArch64 MI Peephole Optimization", false, false)

 template <typename T>
 static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
   T UImm = static_cast<T>(Imm);
   if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
     return false;

   // If this immediate can be handled by one instruction, do not split it.
   SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
   AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
   if (Insn.size() == 1)
     return false;

   // The bitmask immediate consists of consecutive ones.  Let's say there is
   // constant 0b00000000001000000000010000000000 which does not consist of
   // consecutive ones. We can split it in to two bitmask immediate like
   // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111.
   // If we do AND with these two bitmask immediate, we can see original one.
   unsigned LowestBitSet = countTrailingZeros(UImm);
   unsigned HighestBitSet = Log2_64(UImm);

   // Create a mask which is filled with one from the position of lowest bit set
   // to the position of highest bit set.
   T NewImm1 = (static_cast<T>(2) << HighestBitSet) -
               (static_cast<T>(1) << LowestBitSet);
   // Create a mask which is filled with one outside the position of lowest bit
   // set and the position of highest bit set.
   T NewImm2 = UImm | ~NewImm1;

   // If the split value is not valid bitmask immediate, do not split this
   // constant.
   if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
     return false;

   Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
   Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
   return true;
 }

 template <typename T>
 bool AArch64MIPeepholeOpt::visitAND(
     unsigned Opc, MachineInstr &MI,
     SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
   // Try below transformation.
   //
   // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
   // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
   //
   // The mov pseudo instruction could be expanded to multiple mov instructions
   // later. Let's try to split the constant operand of mov instruction into two
   // bitmask immediates. It makes only two AND instructions intead of multiple
   // mov + and instructions.

   return splitTwoPartImm<T>(
       MI, ToBeRemoved,
       [Opc](T Imm, unsigned RegSize, T &Imm0, T &Imm1) -> Optional<unsigned> {
         if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
           return Opc;
         return None;
       },
       [&TII = TII](MachineInstr &MI, unsigned Opcode, unsigned Imm0,
                    unsigned Imm1, Register SrcReg, Register NewTmpReg,
                    Register NewDstReg) {
         DebugLoc DL = MI.getDebugLoc();
         MachineBasicBlock *MBB = MI.getParent();
         BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg)
             .addReg(SrcReg)
             .addImm(Imm0);
         BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg)
             .addReg(NewTmpReg)
             .addImm(Imm1);
       });
 }

 bool AArch64MIPeepholeOpt::visitORR(
     MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
   // Check this ORR comes from below zero-extend pattern.
   //
   // def : Pat<(i64 (zext GPR32:$src)),
   //           (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
   if (MI.getOperand(3).getImm() != 0)
     return false;

   if (MI.getOperand(1).getReg() != AArch64::WZR)
     return false;

   MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
   if (!SrcMI)
     return false;

   // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
   //
   // When you use the 32-bit form of an instruction, the upper 32 bits of the
   // source registers are ignored and the upper 32 bits of the destination
   // register are set to zero.
   //
   // If AArch64's 32-bit form of instruction defines the source operand of
   // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is
   // real AArch64 instruction and if it is not, do not process the opcode
   // conservatively.
   if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
     return false;

   Register DefReg = MI.getOperand(0).getReg();
   Register SrcReg = MI.getOperand(2).getReg();
   MRI->replaceRegWith(DefReg, SrcReg);
   MRI->clearKillFlags(SrcReg);
   // replaceRegWith changes MI's definition register. Keep it for SSA form until
   // deleting MI.
   MI.getOperand(0).setReg(DefReg);
   ToBeRemoved.insert(&MI);

   LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n");

   return true;
 }

 template <typename T>
 static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
   // The immediate must be in the form of ((imm0 << 12) + imm1), in which both
   // imm0 and imm1 are non-zero 12-bit unsigned int.
   if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||
       (Imm & ~static_cast<T>(0xffffff)) != 0)
     return false;

   // The immediate can not be composed via a single instruction.
   SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
   AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
   if (Insn.size() == 1)
     return false;

   // Split Imm into (Imm0 << 12) + Imm1;
   Imm0 = (Imm >> 12) & 0xfff;
   Imm1 = Imm & 0xfff;
   return true;
 }

 template <typename T>
 bool AArch64MIPeepholeOpt::visitADDSUB(
     unsigned PosOpc, unsigned NegOpc, MachineInstr &MI,
     SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
   // Try below transformation.
   //
   // MOVi32imm + ADDWrr ==> ADDWri + ADDWri
   // MOVi64imm + ADDXrr ==> ADDXri + ADDXri
   //
   // MOVi32imm + SUBWrr ==> SUBWri + SUBWri
   // MOVi64imm + SUBXrr ==> SUBXri + SUBXri
   //
   // The mov pseudo instruction could be expanded to multiple mov instructions
   // later. Let's try to split the constant operand of mov instruction into two
   // legal add/sub immediates. It makes only two ADD/SUB instructions intead of
   // multiple `mov` + `and/sub` instructions.

   return splitTwoPartImm<T>(
       MI, ToBeRemoved,
       [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0,
                        T &Imm1) -> Optional<unsigned> {
         if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
           return PosOpc;
         if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
           return NegOpc;
         return None;
       },
       [&TII = TII](MachineInstr &MI, unsigned Opcode, unsigned Imm0,
                    unsigned Imm1, Register SrcReg, Register NewTmpReg,
                    Register NewDstReg) {
         DebugLoc DL = MI.getDebugLoc();
         MachineBasicBlock *MBB = MI.getParent();
         BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg)
             .addReg(SrcReg)
             .addImm(Imm0)
             .addImm(12);
         BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg)
             .addReg(NewTmpReg)
             .addImm(Imm1)
             .addImm(0);
       });
 }

 // Checks if the corresponding MOV immediate instruction is applicable for
 // this peephole optimization.
 bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
                                             MachineInstr *&MovMI,
                                             MachineInstr *&SubregToRegMI) {
   // Check whether current MBB is in loop and the AND is loop invariant.
   MachineBasicBlock *MBB = MI.getParent();
   MachineLoop *L = MLI->getLoopFor(MBB);
   if (L && !L->isLoopInvariant(MI))
     return false;

   // Check whether current MI's operand is MOV with immediate.
   MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
   if (!MovMI)
     return false;

   // If it is SUBREG_TO_REG, check its operand.
   SubregToRegMI = nullptr;
   if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
     SubregToRegMI = MovMI;
     MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
     if (!MovMI)
       return false;
   }

   if (MovMI->getOpcode() != AArch64::MOVi32imm &&
       MovMI->getOpcode() != AArch64::MOVi64imm)
     return false;

   // If the MOV has multiple uses, do not split the immediate because it causes
   // more instructions.
   if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
     return false;
   if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
     return false;

   // It is OK to perform this peephole optimization.
   return true;
 }

 template <typename T>
 bool AArch64MIPeepholeOpt::splitTwoPartImm(
     MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved,
     SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) {
   unsigned RegSize = sizeof(T) * 8;
   assert((RegSize == 32 || RegSize == 64) &&
          "Invalid RegSize for legal immediate peephole optimization");

   // Perform several essential checks against current MI.
   MachineInstr *MovMI, *SubregToRegMI;
   if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
     return false;

   // Split the immediate to Imm0 and Imm1, and calculate the Opcode.
   T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
   // For the 32 bit form of instruction, the upper 32 bits of the destination
   // register are set to zero. If there is SUBREG_TO_REG, set the upper 32 bits
   // of Imm to zero. This is essential if the Immediate value was a negative
   // number since it was sign extended when we assign to the 64-bit Imm.
   if (SubregToRegMI)
     Imm &= 0xFFFFFFFF;
   unsigned Opcode;
   if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1))
     Opcode = R.getValue();
   else
     return false;

   // Create new ADD/SUB MIs.
   MachineFunction *MF = MI.getMF();
   const TargetRegisterClass *RC =
       TII->getRegClass(TII->get(Opcode), 0, TRI, *MF);
   const TargetRegisterClass *ORC =
       TII->getRegClass(TII->get(Opcode), 1, TRI, *MF);
   Register DstReg = MI.getOperand(0).getReg();
   Register SrcReg = MI.getOperand(1).getReg();
   Register NewTmpReg = MRI->createVirtualRegister(RC);
   Register NewDstReg = MRI->createVirtualRegister(RC);

   MRI->constrainRegClass(SrcReg, RC);
   MRI->constrainRegClass(NewTmpReg, ORC);
   MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));

   BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg);

   MRI->replaceRegWith(DstReg, NewDstReg);
   // replaceRegWith changes MI's definition register. Keep it for SSA form until
   // deleting MI.
   MI.getOperand(0).setReg(DstReg);

   // Record the MIs need to be removed.
   ToBeRemoved.insert(&MI);
   if (SubregToRegMI)
     ToBeRemoved.insert(SubregToRegMI);
   ToBeRemoved.insert(MovMI);

   return true;
 }

 bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;

   TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
   TRI = static_cast<const AArch64RegisterInfo *>(
       MF.getSubtarget().getRegisterInfo());
   MLI = &getAnalysis<MachineLoopInfo>();
   MRI = &MF.getRegInfo();

   assert(MRI->isSSA() && "Expected to be run on SSA form!");

   bool Changed = false;
   SmallSetVector<MachineInstr *, 8> ToBeRemoved;

   for (MachineBasicBlock &MBB : MF) {
     for (MachineInstr &MI : MBB) {
       switch (MI.getOpcode()) {
       default:
         break;
       case AArch64::ANDWrr:
         Changed = visitAND<uint32_t>(AArch64::ANDWri, MI, ToBeRemoved);
         break;
       case AArch64::ANDXrr:
         Changed = visitAND<uint64_t>(AArch64::ANDXri, MI, ToBeRemoved);
         break;
       case AArch64::ORRWrs:
         Changed = visitORR(MI, ToBeRemoved);
         break;
       case AArch64::ADDWrr:
         Changed = visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI,
                                         ToBeRemoved);
         break;
       case AArch64::SUBWrr:
         Changed = visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI,
                                         ToBeRemoved);
         break;
       case AArch64::ADDXrr:
         Changed = visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI,
                                         ToBeRemoved);
         break;
       case AArch64::SUBXrr:
         Changed = visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI,
                                         ToBeRemoved);
         break;
       }
     }
   }

   for (MachineInstr *MI : ToBeRemoved)
     MI->eraseFromParent();

   return Changed;
 }

 FunctionPass *llvm::createAArch64MIPeepholeOptPass() {
   return new AArch64MIPeepholeOpt();
 }
	//===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This pass performs below peephole optimizations on MIR level.
	//
	// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
	// MOVi64imm + ANDXrr ==> ANDXri + ANDXri
	//
	// 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi
	// MOVi64imm + ADDXrr ==> ANDXri + ANDXri
	//
	// 3. MOVi32imm + SUBWrr ==> SUBWRi + SUBWRi
	// MOVi64imm + SUBXrr ==> SUBXri + SUBXri
	//
	// The mov pseudo instruction could be expanded to multiple mov instructions
	// later. In this case, we could try to split the constant operand of mov
	// instruction into two immediates which can be directly encoded into
	// Wri/Xri instructions. It makes two AND/ADD/SUB instructions instead of
	// multiple `mov` + `and/add/sub` instructions.
	//
	// 4. Remove redundant ORRWrs which is generated by zero-extend.
	//
	// %3:gpr32 = ORRWrs $wzr, %2, 0
	// %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32
	//
	// If AArch64's 32-bit form of instruction defines the source operand of
	// ORRWrs, we can remove the ORRWrs because the upper 32 bits of the source
	// operand are set to zero.
	//
	//===----------------------------------------------------------------------===//

	#include "AArch64ExpandImm.h"
	#include "AArch64InstrInfo.h"
	#include "MCTargetDesc/AArch64AddressingModes.h"
	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/SetVector.h"
	#include "llvm/CodeGen/MachineDominators.h"
	#include "llvm/CodeGen/MachineLoopInfo.h"

	using namespace llvm;

	#define DEBUG_TYPE "aarch64-mi-peephole-opt"

	namespace {

	struct AArch64MIPeepholeOpt : public MachineFunctionPass {
	static char ID;

	AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {
	initializeAArch64MIPeepholeOptPass(*PassRegistry::getPassRegistry());
	}

	const AArch64InstrInfo *TII;
	const AArch64RegisterInfo *TRI;
	MachineLoopInfo *MLI;
	MachineRegisterInfo *MRI;

	template <typename T>
	using SplitAndOpcFunc =
	std::function<Optional<unsigned>(T, unsigned, T &, T &)>;
	using BuildMIFunc =
	std::function<void(MachineInstr &, unsigned, unsigned, unsigned, Register,
	Register, Register)>;

	/// For instructions where an immediate operand could be split into two
	/// separate immediate instructions, use the splitTwoPartImm two handle the
	/// optimization.
	///
	/// To implement, the following function types must be passed to
	/// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if
	/// splitting the immediate is valid and returns the associated new opcode. A
	/// BuildMIFunc must be implemented to build the two immediate instructions.
	///
	/// Example Pattern (where IMM would require 2+ MOV instructions):
	/// %dst = <Instr>rr %src IMM [...]
	/// becomes:
	/// %tmp = <Instr>ri %src (encode half IMM) [...]
	/// %dst = <Instr>ri %tmp (encode half IMM) [...]
	template <typename T>
	bool splitTwoPartImm(MachineInstr &MI,
	SmallSetVector<MachineInstr *, 8> &ToBeRemoved,
	SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr);

	bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
	MachineInstr *&SubregToRegMI);

	template <typename T>
	bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI,
	SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
	template <typename T>
	bool visitAND(unsigned Opc, MachineInstr &MI,
	SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
	bool visitORR(MachineInstr &MI,
	SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
	bool runOnMachineFunction(MachineFunction &MF) override;

	StringRef getPassName() const override {
	return "AArch64 MI Peephole Optimization pass";
	}

	void getAnalysisUsage(AnalysisUsage &AU) const override {
	AU.setPreservesCFG();
	AU.addRequired<MachineLoopInfo>();
	MachineFunctionPass::getAnalysisUsage(AU);
	}
	};

	char AArch64MIPeepholeOpt::ID = 0;

	} // end anonymous namespace

	INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
	"AArch64 MI Peephole Optimization", false, false)

	template <typename T>
	static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
	T UImm = static_cast<T>(Imm);
	if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
	return false;

	// If this immediate can be handled by one instruction, do not split it.
	SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
	AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
	if (Insn.size() == 1)
	return false;

	// The bitmask immediate consists of consecutive ones. Let's say there is
	// constant 0b00000000001000000000010000000000 which does not consist of
	// consecutive ones. We can split it in to two bitmask immediate like
	// 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111.
	// If we do AND with these two bitmask immediate, we can see original one.
	unsigned LowestBitSet = countTrailingZeros(UImm);
	unsigned HighestBitSet = Log2_64(UImm);

	// Create a mask which is filled with one from the position of lowest bit set
	// to the position of highest bit set.
	T NewImm1 = (static_cast<T>(2) << HighestBitSet) -
	(static_cast<T>(1) << LowestBitSet);
	// Create a mask which is filled with one outside the position of lowest bit
	// set and the position of highest bit set.
	T NewImm2 = UImm \| ~NewImm1;

	// If the split value is not valid bitmask immediate, do not split this
	// constant.
	if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
	return false;

	Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
	Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
	return true;
	}

	template <typename T>
	bool AArch64MIPeepholeOpt::visitAND(
	unsigned Opc, MachineInstr &MI,
	SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
	// Try below transformation.
	//
	// MOVi32imm + ANDWrr ==> ANDWri + ANDWri
	// MOVi64imm + ANDXrr ==> ANDXri + ANDXri
	//
	// The mov pseudo instruction could be expanded to multiple mov instructions
	// later. Let's try to split the constant operand of mov instruction into two
	// bitmask immediates. It makes only two AND instructions intead of multiple
	// mov + and instructions.

	return splitTwoPartImm<T>(
	MI, ToBeRemoved,
	[Opc](T Imm, unsigned RegSize, T &Imm0, T &Imm1) -> Optional<unsigned> {
	if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
	return Opc;
	return None;
	},
	[&TII = TII](MachineInstr &MI, unsigned Opcode, unsigned Imm0,
	unsigned Imm1, Register SrcReg, Register NewTmpReg,
	Register NewDstReg) {
	DebugLoc DL = MI.getDebugLoc();
	MachineBasicBlock *MBB = MI.getParent();
	BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg)
	.addReg(SrcReg)
	.addImm(Imm0);
	BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg)
	.addReg(NewTmpReg)
	.addImm(Imm1);
	});
	}

	bool AArch64MIPeepholeOpt::visitORR(
	MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
	// Check this ORR comes from below zero-extend pattern.
	//
	// def : Pat<(i64 (zext GPR32:$src)),
	// (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
	if (MI.getOperand(3).getImm() != 0)
	return false;

	if (MI.getOperand(1).getReg() != AArch64::WZR)
	return false;

	MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
	if (!SrcMI)
	return false;

	// From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
	//
	// When you use the 32-bit form of an instruction, the upper 32 bits of the
	// source registers are ignored and the upper 32 bits of the destination
	// register are set to zero.
	//
	// If AArch64's 32-bit form of instruction defines the source operand of
	// zero-extend, we do not need the zero-extend. Let's check the MI's opcode is
	// real AArch64 instruction and if it is not, do not process the opcode
	// conservatively.
	if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
	return false;

	Register DefReg = MI.getOperand(0).getReg();
	Register SrcReg = MI.getOperand(2).getReg();
	MRI->replaceRegWith(DefReg, SrcReg);
	MRI->clearKillFlags(SrcReg);
	// replaceRegWith changes MI's definition register. Keep it for SSA form until
	// deleting MI.
	MI.getOperand(0).setReg(DefReg);
	ToBeRemoved.insert(&MI);

	LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n");

	return true;
	}

	template <typename T>
	static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
	// The immediate must be in the form of ((imm0 << 12) + imm1), in which both
	// imm0 and imm1 are non-zero 12-bit unsigned int.
	if ((Imm & 0xfff000) == 0 \|\| (Imm & 0xfff) == 0 \|\|
	(Imm & ~static_cast<T>(0xffffff)) != 0)
	return false;

	// The immediate can not be composed via a single instruction.
	SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
	AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
	if (Insn.size() == 1)
	return false;

	// Split Imm into (Imm0 << 12) + Imm1;
	Imm0 = (Imm >> 12) & 0xfff;
	Imm1 = Imm & 0xfff;
	return true;
	}

	template <typename T>
	bool AArch64MIPeepholeOpt::visitADDSUB(
	unsigned PosOpc, unsigned NegOpc, MachineInstr &MI,
	SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
	// Try below transformation.
	//
	// MOVi32imm + ADDWrr ==> ADDWri + ADDWri
	// MOVi64imm + ADDXrr ==> ADDXri + ADDXri
	//
	// MOVi32imm + SUBWrr ==> SUBWri + SUBWri
	// MOVi64imm + SUBXrr ==> SUBXri + SUBXri
	//
	// The mov pseudo instruction could be expanded to multiple mov instructions
	// later. Let's try to split the constant operand of mov instruction into two
	// legal add/sub immediates. It makes only two ADD/SUB instructions intead of
	// multiple `mov` + `and/sub` instructions.

	return splitTwoPartImm<T>(
	MI, ToBeRemoved,
	[PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0,
	T &Imm1) -> Optional<unsigned> {
	if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
	return PosOpc;
	if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
	return NegOpc;
	return None;
	},
	[&TII = TII](MachineInstr &MI, unsigned Opcode, unsigned Imm0,
	unsigned Imm1, Register SrcReg, Register NewTmpReg,
	Register NewDstReg) {
	DebugLoc DL = MI.getDebugLoc();
	MachineBasicBlock *MBB = MI.getParent();
	BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg)
	.addReg(SrcReg)
	.addImm(Imm0)
	.addImm(12);
	BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg)
	.addReg(NewTmpReg)
	.addImm(Imm1)
	.addImm(0);
	});
	}

	// Checks if the corresponding MOV immediate instruction is applicable for
	// this peephole optimization.
	bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
	MachineInstr *&MovMI,
	MachineInstr *&SubregToRegMI) {
	// Check whether current MBB is in loop and the AND is loop invariant.
	MachineBasicBlock *MBB = MI.getParent();
	MachineLoop *L = MLI->getLoopFor(MBB);
	if (L && !L->isLoopInvariant(MI))
	return false;

	// Check whether current MI's operand is MOV with immediate.
	MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
	if (!MovMI)
	return false;

	// If it is SUBREG_TO_REG, check its operand.
	SubregToRegMI = nullptr;
	if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
	SubregToRegMI = MovMI;
	MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
	if (!MovMI)
	return false;
	}

	if (MovMI->getOpcode() != AArch64::MOVi32imm &&
	MovMI->getOpcode() != AArch64::MOVi64imm)
	return false;

	// If the MOV has multiple uses, do not split the immediate because it causes
	// more instructions.
	if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
	return false;
	if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
	return false;

	// It is OK to perform this peephole optimization.
	return true;
	}

	template <typename T>
	bool AArch64MIPeepholeOpt::splitTwoPartImm(
	MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved,
	SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) {
	unsigned RegSize = sizeof(T) * 8;
	assert((RegSize == 32 \|\| RegSize == 64) &&
	"Invalid RegSize for legal immediate peephole optimization");

	// Perform several essential checks against current MI.
	MachineInstr MovMI, SubregToRegMI;
	if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
	return false;

	// Split the immediate to Imm0 and Imm1, and calculate the Opcode.
	T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
	// For the 32 bit form of instruction, the upper 32 bits of the destination
	// register are set to zero. If there is SUBREG_TO_REG, set the upper 32 bits
	// of Imm to zero. This is essential if the Immediate value was a negative
	// number since it was sign extended when we assign to the 64-bit Imm.
	if (SubregToRegMI)
	Imm &= 0xFFFFFFFF;
	unsigned Opcode;
	if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1))
	Opcode = R.getValue();
	else
	return false;

	// Create new ADD/SUB MIs.
	MachineFunction *MF = MI.getMF();
	const TargetRegisterClass *RC =
	TII->getRegClass(TII->get(Opcode), 0, TRI, *MF);
	const TargetRegisterClass *ORC =
	TII->getRegClass(TII->get(Opcode), 1, TRI, *MF);
	Register DstReg = MI.getOperand(0).getReg();
	Register SrcReg = MI.getOperand(1).getReg();
	Register NewTmpReg = MRI->createVirtualRegister(RC);
	Register NewDstReg = MRI->createVirtualRegister(RC);

	MRI->constrainRegClass(SrcReg, RC);
	MRI->constrainRegClass(NewTmpReg, ORC);
	MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));

	BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg);

	MRI->replaceRegWith(DstReg, NewDstReg);
	// replaceRegWith changes MI's definition register. Keep it for SSA form until
	// deleting MI.
	MI.getOperand(0).setReg(DstReg);

	// Record the MIs need to be removed.
	ToBeRemoved.insert(&MI);
	if (SubregToRegMI)
	ToBeRemoved.insert(SubregToRegMI);
	ToBeRemoved.insert(MovMI);

	return true;
	}

	bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
	if (skipFunction(MF.getFunction()))
	return false;

	TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
	TRI = static_cast<const AArch64RegisterInfo *>(
	MF.getSubtarget().getRegisterInfo());
	MLI = &getAnalysis<MachineLoopInfo>();
	MRI = &MF.getRegInfo();

	assert(MRI->isSSA() && "Expected to be run on SSA form!");

	bool Changed = false;
	SmallSetVector<MachineInstr *, 8> ToBeRemoved;

	for (MachineBasicBlock &MBB : MF) {
	for (MachineInstr &MI : MBB) {
	switch (MI.getOpcode()) {
	default:
	break;
	case AArch64::ANDWrr:
	Changed = visitAND<uint32_t>(AArch64::ANDWri, MI, ToBeRemoved);
	break;
	case AArch64::ANDXrr:
	Changed = visitAND<uint64_t>(AArch64::ANDXri, MI, ToBeRemoved);
	break;
	case AArch64::ORRWrs:
	Changed = visitORR(MI, ToBeRemoved);
	break;
	case AArch64::ADDWrr:
	Changed = visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI,
	ToBeRemoved);
	break;
	case AArch64::SUBWrr:
	Changed = visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI,
	ToBeRemoved);
	break;
	case AArch64::ADDXrr:
	Changed = visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI,
	ToBeRemoved);
	break;
	case AArch64::SUBXrr:
	Changed = visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI,
	ToBeRemoved);
	break;
	}
	}
	}

	for (MachineInstr *MI : ToBeRemoved)
	MI->eraseFromParent();

	return Changed;
	}

	FunctionPass *llvm::createAArch64MIPeepholeOptPass() {
	return new AArch64MIPeepholeOpt();
	}