//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file This file defines a set of schedule DAG mutations that can be used to
// override default scheduler behavior to enforce specific scheduling patterns.
// They should be used in cases where runtime performance considerations, such
// as inter-wavefront interactions, mean that compile-time heuristics cannot
// predict the optimal instruction ordering, or in kernels where optimal
// instruction scheduling is important enough to warrant manual intervention.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUIGroupLP.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetOpcodes.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

namespace {

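// When set, the IGroupLP mutation groups instructions by class and imposes a
// pipelined ordering between the groups; see createIGroupLPDAGMutation().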
static cl::opt<bool>
    EnableIGroupLP("amdgpu-igrouplp",
                   cl::desc("Enable construction of Instruction Groups and "
                            "their ordering for scheduling"),
                   cl::init(false));

static cl::opt<Optional<unsigned>>
    VMEMGroupMaxSize("amdgpu-igrouplp-vmem-group-size", cl::init(None),
                     cl::Hidden,
                     cl::desc("The maximum number of instructions to include "
                              "in VMEM group."));

static cl::opt<Optional<unsigned>>
    MFMAGroupMaxSize("amdgpu-igrouplp-mfma-group-size", cl::init(None),
                     cl::Hidden,
                     cl::desc("The maximum number of instructions to include "
                              "in MFMA group."));

static cl::opt<Optional<unsigned>>
    LDRGroupMaxSize("amdgpu-igrouplp-ldr-group-size", cl::init(None),
                    cl::Hidden,
                    cl::desc("The maximum number of instructions to include "
                             "in lds/gds read group."));

static cl::opt<Optional<unsigned>>
    LDWGroupMaxSize("amdgpu-igrouplp-ldw-group-size", cl::init(None),
                    cl::Hidden,
                    cl::desc("The maximum number of instructions to include "
                             "in lds/gds write group."));

using CanAddMIFn =
    function_ref<bool(const MachineInstr &, const SIInstrInfo *)>;

// Classify instructions into groups to enable fine-tuned control over the
// scheduler. These groups may be more specific than current SchedModel
// instruction classes.
class SchedGroup {
private:
  // Function that returns true if a non-bundle MI may be inserted into this
  // group.
  const CanAddMIFn canAddMI;

  // Maximum number of SUnits that can be added to this group.
  Optional<unsigned> MaxSize;

  // Collection of SUnits that are classified as members of this group.
  SmallVector<SUnit *, 32> Collection;

  ScheduleDAGInstrs *DAG;

  void tryAddEdge(SUnit *A, SUnit *B) {
    if (A != B && DAG->canAddEdge(B, A)) {
      DAG->addEdge(B, SDep(A, SDep::Artificial));
      LLVM_DEBUG(dbgs() << "Adding edge...\n"
                        << "from: SU(" << A->NodeNum << ") " << *A->getInstr()
                        << "to: SU(" << B->NodeNum << ") " << *B->getInstr());
    }
  }

public:
  // Add DAG dependencies between all SUnits in this SchedGroup and SU. If
  // MakePred is true, SU will be a predecessor of the SUnits in this
  // SchedGroup, otherwise SU will be a successor.
  void link(SUnit &SU, bool MakePred = false) {
    for (auto A : Collection) {
      SUnit *B = &SU;
      if (MakePred)
        std::swap(A, B);

      tryAddEdge(A, B);
    }
  }

  // Add DAG dependencies between all SUnits in this SchedGroup and SU. Use
  // the predicate to determine whether SU should be a predecessor (P = true)
  // or a successor (P = false) of this SchedGroup.
  void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P) {
    for (auto A : Collection) {
      SUnit *B = &SU;
      if (P(A, B))
        std::swap(A, B);

      tryAddEdge(A, B);
    }
  }

  // Add DAG dependencies such that SUnits in this group shall be ordered
  // before SUnits in OtherGroup.
  void link(SchedGroup &OtherGroup) {
    for (auto B : OtherGroup.Collection)
      link(*B);
  }

  // Returns true if no more instructions may be added to this group.
  bool isFull() { return MaxSize && Collection.size() >= *MaxSize; }

  // Returns true if SU can be added to this SchedGroup.
  bool canAddSU(SUnit &SU, const SIInstrInfo *TII) {
    if (isFull())
      return false;

    MachineInstr &MI = *SU.getInstr();
    if (MI.getOpcode() != TargetOpcode::BUNDLE)
      return canAddMI(MI, TII);

    // Special case for bundled MIs.
    const MachineBasicBlock *MBB = MI.getParent();
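    // Note that ++B in E's initializer advances B past the BUNDLE header, so
    // both iterators start at the first instruction inside the bundle; E is
    // then moved past the last bundled instruction.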
    MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
    while (E != MBB->end() && E->isBundledWithPred())
      ++E;

    // Return true if all of the bundled MIs can be added to this group.
    return std::all_of(
        B, E, [this, TII](MachineInstr &MI) { return canAddMI(MI, TII); });
  }

  void add(SUnit &SU) { Collection.push_back(&SU); }

  SchedGroup(CanAddMIFn canAddMI, Optional<unsigned> MaxSize,
             ScheduleDAGInstrs *DAG)
      : canAddMI(canAddMI), MaxSize(MaxSize), DAG(DAG) {}
};

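// Predicates used to classify non-bundle MachineInstrs into the SchedGroups
// built by the mutations below.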
bool isMFMASGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isMFMA(MI);
}

bool isVALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isVALU(MI) && !TII->isMFMA(MI);
}

bool isSALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isSALU(MI);
}

bool isVMEMSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI));
}

bool isVMEMReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayLoad() &&
         (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)));
}

bool isVMEMWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayStore() &&
         (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)));
}

bool isDSWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayStore() && TII->isDS(MI);
}

bool isDSReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayLoad() && TII->isDS(MI);
}

class IGroupLPDAGMutation : public ScheduleDAGMutation {
public:
  const SIInstrInfo *TII;
  ScheduleDAGMI *DAG;

  IGroupLPDAGMutation() = default;
  void apply(ScheduleDAGInstrs *DAGInstrs) override;
};

// DAG mutation that coordinates with the SCHED_BARRIER instruction and
// corresponding builtin. The mutation adds edges from specific instruction
// classes determined by the SCHED_BARRIER mask so that they cannot be
// scheduled around the SCHED_BARRIER.
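// For example, a kernel can invoke __builtin_amdgcn_sched_barrier(0) to
// request that no instruction be moved across that point in the program.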
class SchedBarrierDAGMutation : public ScheduleDAGMutation {
private:
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  // Components of the mask that determine which instruction classes may be
  // scheduled across the SCHED_BARRIER; classes whose bits are not set are
  // fenced by it.
  enum class SchedBarrierMasks {
    NONE = 0u,
    ALU = 1u << 0,
    VALU = 1u << 1,
    SALU = 1u << 2,
    MFMA = 1u << 3,
    VMEM = 1u << 4,
    VMEM_READ = 1u << 5,
    VMEM_WRITE = 1u << 6,
    DS = 1u << 7,
    DS_READ = 1u << 8,
    DS_WRITE = 1u << 9,
    LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ DS_WRITE)
  };
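  // For example, a mask with only VMEM_READ set allows VMEM loads to be
  // scheduled across the barrier while every other class above is fenced.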

  // Cache SchedGroups of each type so they can be reused if a region contains
  // multiple SCHED_BARRIERs.
  std::unique_ptr<SchedGroup> MFMASchedGroup = nullptr;
  std::unique_ptr<SchedGroup> VALUSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> SALUSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> VMEMReadSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> VMEMWriteSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> DSWriteSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> DSReadSchedGroup = nullptr;

  // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that
  // should not be reordered across the SCHED_BARRIER.
  void getSchedGroupsFromMask(int32_t Mask,
                              SmallVectorImpl<SchedGroup *> &SchedGroups);

  // Add DAG edges that enforce SCHED_BARRIER ordering.
  void addSchedBarrierEdges(SUnit &SU);

  // Classify instructions and add them to the SchedGroup.
  void initSchedGroup(SchedGroup *SG);

  // Remove all existing edges from a SCHED_BARRIER.
  void resetSchedBarrierEdges(SUnit &SU);

public:
  void apply(ScheduleDAGInstrs *DAGInstrs) override;

  SchedBarrierDAGMutation() = default;
};

void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
  if (!TSchedModel || DAG->SUnits.empty())
    return;

  LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");

  // The order of SchedGroups in this vector defines the order in which edges
  // will be added. In other words, given the present ordering, we will try to
  // make each VMEM instruction a predecessor of each DSRead instruction, and
  // so on.
  SmallVector<SchedGroup, 4> PipelineOrderGroups = {
      SchedGroup(isVMEMSGMember, VMEMGroupMaxSize, DAG),
      SchedGroup(isDSReadSGMember, LDRGroupMaxSize, DAG),
      SchedGroup(isMFMASGMember, MFMAGroupMaxSize, DAG),
      SchedGroup(isDSWriteSGMember, LDWGroupMaxSize, DAG)};

  for (SUnit &SU : DAG->SUnits) {
    LLVM_DEBUG(dbgs() << "Checking Node"; DAG->dumpNode(SU));
    for (auto &SG : PipelineOrderGroups)
      if (SG.canAddSU(SU, TII))
        SG.add(SU);
  }

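  // Link the groups pairwise in pipeline order: every SUnit in an earlier
  // group becomes an artificial predecessor of every SUnit in each later
  // group.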
  for (unsigned i = 0; i < PipelineOrderGroups.size() - 1; i++) {
    auto &GroupA = PipelineOrderGroups[i];
    for (unsigned j = i + 1; j < PipelineOrderGroups.size(); j++) {
      auto &GroupB = PipelineOrderGroups[j];
      GroupA.link(GroupB);
    }
  }
}

void SchedBarrierDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
  if (!TSchedModel || DAGInstrs->SUnits.empty())
    return;

  LLVM_DEBUG(dbgs() << "Applying SchedBarrierDAGMutation...\n");

  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  for (auto &SU : DAG->SUnits)
    if (SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER)
      addSchedBarrierEdges(SU);
}

void SchedBarrierDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
  MachineInstr &MI = *SchedBarrier.getInstr();
  assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
  // Remove all existing edges from the SCHED_BARRIER that were added due to
  // the instruction having side effects.
  resetSchedBarrierEdges(SchedBarrier);
  SmallVector<SchedGroup *, 4> SchedGroups;
  int32_t Mask = MI.getOperand(0).getImm();
  getSchedGroupsFromMask(Mask, SchedGroups);
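  // Link each blocked SchedGroup to the SCHED_BARRIER with a predicate on
  // NodeNum: SUnits that come before the barrier in the region become its
  // predecessors and SUnits that come after it become its successors. The
  // explicit function_ref cast steers overload resolution to the predicate
  // form of link() rather than the bool form.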
  for (auto SG : SchedGroups)
    SG->link(
        SchedBarrier, (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
                          const SUnit *A, const SUnit *B) {
          return A->NodeNum > B->NodeNum;
        });
}

void SchedBarrierDAGMutation::getSchedGroupsFromMask(
    int32_t Mask, SmallVectorImpl<SchedGroup *> &SchedGroups) {
  SchedBarrierMasks SBMask = (SchedBarrierMasks)Mask;
  // See IntrinsicsAMDGPU.td for an explanation of these masks and their
  // mappings.
  if ((SBMask & SchedBarrierMasks::VALU) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
    if (!VALUSchedGroup) {
      VALUSchedGroup = std::make_unique<SchedGroup>(isVALUSGMember, None, DAG);
      initSchedGroup(VALUSchedGroup.get());
    }

    SchedGroups.push_back(VALUSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::SALU) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
    if (!SALUSchedGroup) {
      SALUSchedGroup = std::make_unique<SchedGroup>(isSALUSGMember, None, DAG);
      initSchedGroup(SALUSchedGroup.get());
    }

    SchedGroups.push_back(SALUSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::MFMA) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
    if (!MFMASchedGroup) {
      MFMASchedGroup = std::make_unique<SchedGroup>(isMFMASGMember, None, DAG);
      initSchedGroup(MFMASchedGroup.get());
    }

    SchedGroups.push_back(MFMASchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::VMEM_READ) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) {
    if (!VMEMReadSchedGroup) {
      VMEMReadSchedGroup =
          std::make_unique<SchedGroup>(isVMEMReadSGMember, None, DAG);
      initSchedGroup(VMEMReadSchedGroup.get());
    }

    SchedGroups.push_back(VMEMReadSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::VMEM_WRITE) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) {
    if (!VMEMWriteSchedGroup) {
      VMEMWriteSchedGroup =
          std::make_unique<SchedGroup>(isVMEMWriteSGMember, None, DAG);
      initSchedGroup(VMEMWriteSchedGroup.get());
    }

    SchedGroups.push_back(VMEMWriteSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::DS_READ) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) {
    if (!DSReadSchedGroup) {
      DSReadSchedGroup =
          std::make_unique<SchedGroup>(isDSReadSGMember, None, DAG);
      initSchedGroup(DSReadSchedGroup.get());
    }

    SchedGroups.push_back(DSReadSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::DS_WRITE) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) {
    if (!DSWriteSchedGroup) {
      DSWriteSchedGroup =
          std::make_unique<SchedGroup>(isDSWriteSGMember, None, DAG);
      initSchedGroup(DSWriteSchedGroup.get());
    }

    SchedGroups.push_back(DSWriteSchedGroup.get());
  }
}

void SchedBarrierDAGMutation::initSchedGroup(SchedGroup *SG) {
  assert(SG);
  for (auto &SU : DAG->SUnits)
    if (SG->canAddSU(SU, TII))
      SG->add(SU);
}

void SchedBarrierDAGMutation::resetSchedBarrierEdges(SUnit &SU) {
  assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER);
  // removePred() erases entries from the Preds/Succs lists as it goes, so
  // iterate over copies rather than the vectors that are being mutated.
  SmallVector<SDep, 4> Preds(SU.Preds);
  for (auto &P : Preds)
    SU.removePred(P);

  SmallVector<SDep, 4> Succs(SU.Succs);
  for (auto &S : Succs) {
    SmallVector<SDep, 4> SuccPreds(S.getSUnit()->Preds);
    for (auto &SP : SuccPreds)
      if (SP.getSUnit() == &SU)
        S.getSUnit()->removePred(SP);
  }
}

} // namespace

namespace llvm {

std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {
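  // Returning nullptr when the flag is off is fine; ScheduleDAGMI::addMutation
  // ignores null mutations.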
  return EnableIGroupLP ? std::make_unique<IGroupLPDAGMutation>() : nullptr;
}

std::unique_ptr<ScheduleDAGMutation> createSchedBarrierDAGMutation() {
  return std::make_unique<SchedBarrierDAGMutation>();
}

} // end namespace llvm