//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file This file defines a set of schedule DAG mutations that can be used to
// override default scheduler behavior to enforce specific scheduling patterns.
// They should be used in cases where runtime performance considerations, such
// as inter-wavefront interactions, mean that compile-time heuristics cannot
// predict the optimal instruction ordering, or in kernels where optimal
// instruction scheduling is important enough to warrant manual intervention.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUIGroupLP.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetOpcodes.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

namespace {

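// When set, the IGroupLP mutation groups instructions by class and imposes a
// pipelined ordering between the groups; see createIGroupLPDAGMutation().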
static cl::opt<bool>
    EnableIGroupLP("amdgpu-igrouplp",
                   cl::desc("Enable construction of Instruction Groups and "
                            "their ordering for scheduling"),
                   cl::init(false));

static cl::opt<Optional<unsigned>>
    VMEMGroupMaxSize("amdgpu-igrouplp-vmem-group-size", cl::init(None),
                     cl::Hidden,
                     cl::desc("The maximum number of instructions to include "
                              "in VMEM group."));

static cl::opt<Optional<unsigned>>
    MFMAGroupMaxSize("amdgpu-igrouplp-mfma-group-size", cl::init(None),
                     cl::Hidden,
                     cl::desc("The maximum number of instructions to include "
                              "in MFMA group."));

static cl::opt<Optional<unsigned>>
    LDRGroupMaxSize("amdgpu-igrouplp-ldr-group-size", cl::init(None),
                    cl::Hidden,
                    cl::desc("The maximum number of instructions to include "
                             "in lds/gds read group."));

static cl::opt<Optional<unsigned>>
    LDWGroupMaxSize("amdgpu-igrouplp-ldw-group-size", cl::init(None),
                    cl::Hidden,
                    cl::desc("The maximum number of instructions to include "
                             "in lds/gds write group."));

using CanAddMIFn =
    function_ref<bool(const MachineInstr &, const SIInstrInfo *)>;

// Classify instructions into groups to enable fine-tuned control over the
// scheduler. These groups may be more specific than current SchedModel
// instruction classes.
class SchedGroup {
private:
  // Function that returns true if a non-bundle MI may be inserted into this
  // group.
  const CanAddMIFn canAddMI;

  // Maximum number of SUnits that can be added to this group.
  Optional<unsigned> MaxSize;

  // Collection of SUnits that are classified as members of this group.
  SmallVector<SUnit *, 32> Collection;

  ScheduleDAGInstrs *DAG;

  void tryAddEdge(SUnit *A, SUnit *B) {
    if (A != B && DAG->canAddEdge(B, A)) {
      DAG->addEdge(B, SDep(A, SDep::Artificial));
      LLVM_DEBUG(dbgs() << "Adding edge...\n"
                        << "from: SU(" << A->NodeNum << ") " << *A->getInstr()
                        << "to: SU(" << B->NodeNum << ") " << *B->getInstr());
    }
  }

public:
  // Add DAG dependencies between all SUnits in this SchedGroup and SU. If
  // MakePred is true, SU will be a predecessor of the SUnits in this
  // SchedGroup, otherwise SU will be a successor.
  void link(SUnit &SU, bool MakePred = false) {
    for (auto A : Collection) {
      SUnit *B = &SU;
      if (MakePred)
        std::swap(A, B);

      tryAddEdge(A, B);
    }
  }

  // Add DAG dependencies between all SUnits in this SchedGroup and SU. Use
  // the predicate to determine whether SU should be a predecessor (P = true)
  // or a successor (P = false) of this SchedGroup.
  void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P) {
    for (auto A : Collection) {
      SUnit *B = &SU;
      if (P(A, B))
        std::swap(A, B);

      tryAddEdge(A, B);
    }
  }

  // Add DAG dependencies such that SUnits in this group shall be ordered
  // before SUnits in OtherGroup.
  void link(SchedGroup &OtherGroup) {
    for (auto B : OtherGroup.Collection)
      link(*B);
  }

  // Returns true if no more instructions may be added to this group.
  bool isFull() { return MaxSize && Collection.size() >= *MaxSize; }

  // Returns true if SU can be added to this SchedGroup.
  bool canAddSU(SUnit &SU, const SIInstrInfo *TII) {
    if (isFull())
      return false;

    MachineInstr &MI = *SU.getInstr();
    if (MI.getOpcode() != TargetOpcode::BUNDLE)
      return canAddMI(MI, TII);

    // Special case for bundled MIs.
    const MachineBasicBlock *MBB = MI.getParent();
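    // Note that ++B in E's initializer advances B past the BUNDLE header, so
    // both iterators start at the first instruction inside the bundle; E is
    // then moved past the last bundled instruction.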
    MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
    while (E != MBB->end() && E->isBundledWithPred())
      ++E;

    // Return true if all of the bundled MIs can be added to this group.
    return std::all_of(
        B, E, [this, TII](MachineInstr &MI) { return canAddMI(MI, TII); });
  }

  void add(SUnit &SU) { Collection.push_back(&SU); }

  SchedGroup(CanAddMIFn canAddMI, Optional<unsigned> MaxSize,
             ScheduleDAGInstrs *DAG)
      : canAddMI(canAddMI), MaxSize(MaxSize), DAG(DAG) {}
};

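// Predicates used to classify non-bundle MachineInstrs into the SchedGroups
// built by the mutations below.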
bool isMFMASGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isMFMA(MI);
}

bool isVALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isVALU(MI) && !TII->isMFMA(MI);
}

bool isSALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isSALU(MI);
}

bool isVMEMSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI));
}

bool isVMEMReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayLoad() &&
         (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)));
}

bool isVMEMWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayStore() &&
         (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)));
}

bool isDSWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayStore() && TII->isDS(MI);
}

bool isDSReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayLoad() && TII->isDS(MI);
}

class IGroupLPDAGMutation : public ScheduleDAGMutation {
public:
  const SIInstrInfo *TII;
  ScheduleDAGMI *DAG;

  IGroupLPDAGMutation() = default;
  void apply(ScheduleDAGInstrs *DAGInstrs) override;
};

// DAG mutation that coordinates with the SCHED_BARRIER instruction and
// corresponding builtin. The mutation adds edges from specific instruction
// classes determined by the SCHED_BARRIER mask so that they cannot be
// scheduled around the SCHED_BARRIER.
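// For example, a kernel can invoke __builtin_amdgcn_sched_barrier(0) to
// request that no instruction be moved across that point in the program.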
class SchedBarrierDAGMutation : public ScheduleDAGMutation {
private:
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  // Components of the mask that determine which instruction classes may be
  // scheduled across the SCHED_BARRIER; classes whose bits are not set are
  // fenced by it.
  enum class SchedBarrierMasks {
    NONE = 0u,
    ALU = 1u << 0,
    VALU = 1u << 1,
    SALU = 1u << 2,
    MFMA = 1u << 3,
    VMEM = 1u << 4,
    VMEM_READ = 1u << 5,
    VMEM_WRITE = 1u << 6,
    DS = 1u << 7,
    DS_READ = 1u << 8,
    DS_WRITE = 1u << 9,
    LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ DS_WRITE)
  };
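  // For example, a mask with only VMEM_READ set allows VMEM loads to be
  // scheduled across the barrier while every other class above is fenced.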

  // Cache SchedGroups of each type so they can be reused if a region contains
  // multiple SCHED_BARRIERs.
  std::unique_ptr<SchedGroup> MFMASchedGroup = nullptr;
  std::unique_ptr<SchedGroup> VALUSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> SALUSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> VMEMReadSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> VMEMWriteSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> DSWriteSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> DSReadSchedGroup = nullptr;

  // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that
  // should not be reordered across the SCHED_BARRIER.
  void getSchedGroupsFromMask(int32_t Mask,
                              SmallVectorImpl<SchedGroup *> &SchedGroups);

  // Add DAG edges that enforce SCHED_BARRIER ordering.
  void addSchedBarrierEdges(SUnit &SU);

  // Classify instructions and add them to the SchedGroup.
  void initSchedGroup(SchedGroup *SG);

  // Remove all existing edges from a SCHED_BARRIER.
  void resetSchedBarrierEdges(SUnit &SU);

public:
  void apply(ScheduleDAGInstrs *DAGInstrs) override;

  SchedBarrierDAGMutation() = default;
};

void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
  if (!TSchedModel || DAG->SUnits.empty())
    return;

  LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");

  // The order of SchedGroups in this vector defines the order in which edges
  // will be added. In other words, given the present ordering, we will try to
  // make each VMEM instruction a predecessor of each DSRead instruction, and
  // so on.
  SmallVector<SchedGroup, 4> PipelineOrderGroups = {
      SchedGroup(isVMEMSGMember, VMEMGroupMaxSize, DAG),
      SchedGroup(isDSReadSGMember, LDRGroupMaxSize, DAG),
      SchedGroup(isMFMASGMember, MFMAGroupMaxSize, DAG),
      SchedGroup(isDSWriteSGMember, LDWGroupMaxSize, DAG)};

  for (SUnit &SU : DAG->SUnits) {
    LLVM_DEBUG(dbgs() << "Checking Node"; DAG->dumpNode(SU));
    for (auto &SG : PipelineOrderGroups)
      if (SG.canAddSU(SU, TII))
        SG.add(SU);
  }

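  // Link the groups pairwise in pipeline order: every SUnit in an earlier
  // group becomes an artificial predecessor of every SUnit in each later
  // group.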
  for (unsigned i = 0; i < PipelineOrderGroups.size() - 1; i++) {
    auto &GroupA = PipelineOrderGroups[i];
    for (unsigned j = i + 1; j < PipelineOrderGroups.size(); j++) {
      auto &GroupB = PipelineOrderGroups[j];
      GroupA.link(GroupB);
    }
  }
}

void SchedBarrierDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
  if (!TSchedModel || DAGInstrs->SUnits.empty())
    return;

  LLVM_DEBUG(dbgs() << "Applying SchedBarrierDAGMutation...\n");

  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  for (auto &SU : DAG->SUnits)
    if (SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER)
      addSchedBarrierEdges(SU);
}

void SchedBarrierDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
  MachineInstr &MI = *SchedBarrier.getInstr();
  assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
  // Remove all existing edges from the SCHED_BARRIER that were added due to
  // the instruction having side effects.
  resetSchedBarrierEdges(SchedBarrier);
  SmallVector<SchedGroup *, 4> SchedGroups;
  int32_t Mask = MI.getOperand(0).getImm();
  getSchedGroupsFromMask(Mask, SchedGroups);
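  // Link each blocked SchedGroup to the SCHED_BARRIER with a predicate on
  // NodeNum: SUnits that come before the barrier in the region become its
  // predecessors and SUnits that come after it become its successors. The
  // explicit function_ref cast steers overload resolution to the predicate
  // form of link() rather than the bool form.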
  for (auto SG : SchedGroups)
    SG->link(
        SchedBarrier, (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
                          const SUnit *A, const SUnit *B) {
          return A->NodeNum > B->NodeNum;
        });
}

void SchedBarrierDAGMutation::getSchedGroupsFromMask(
    int32_t Mask, SmallVectorImpl<SchedGroup *> &SchedGroups) {
  SchedBarrierMasks SBMask = (SchedBarrierMasks)Mask;
  // See IntrinsicsAMDGPU.td for an explanation of these masks and their
  // mappings.
  if ((SBMask & SchedBarrierMasks::VALU) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
    if (!VALUSchedGroup) {
      VALUSchedGroup = std::make_unique<SchedGroup>(isVALUSGMember, None, DAG);
      initSchedGroup(VALUSchedGroup.get());
    }

    SchedGroups.push_back(VALUSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::SALU) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
    if (!SALUSchedGroup) {
      SALUSchedGroup = std::make_unique<SchedGroup>(isSALUSGMember, None, DAG);
      initSchedGroup(SALUSchedGroup.get());
    }

    SchedGroups.push_back(SALUSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::MFMA) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
    if (!MFMASchedGroup) {
      MFMASchedGroup = std::make_unique<SchedGroup>(isMFMASGMember, None, DAG);
      initSchedGroup(MFMASchedGroup.get());
    }

    SchedGroups.push_back(MFMASchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::VMEM_READ) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) {
    if (!VMEMReadSchedGroup) {
      VMEMReadSchedGroup =
          std::make_unique<SchedGroup>(isVMEMReadSGMember, None, DAG);
      initSchedGroup(VMEMReadSchedGroup.get());
    }

    SchedGroups.push_back(VMEMReadSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::VMEM_WRITE) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) {
    if (!VMEMWriteSchedGroup) {
      VMEMWriteSchedGroup =
          std::make_unique<SchedGroup>(isVMEMWriteSGMember, None, DAG);
      initSchedGroup(VMEMWriteSchedGroup.get());
    }

    SchedGroups.push_back(VMEMWriteSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::DS_READ) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) {
    if (!DSReadSchedGroup) {
      DSReadSchedGroup =
          std::make_unique<SchedGroup>(isDSReadSGMember, None, DAG);
      initSchedGroup(DSReadSchedGroup.get());
    }

    SchedGroups.push_back(DSReadSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::DS_WRITE) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) {
    if (!DSWriteSchedGroup) {
      DSWriteSchedGroup =
          std::make_unique<SchedGroup>(isDSWriteSGMember, None, DAG);
      initSchedGroup(DSWriteSchedGroup.get());
    }

    SchedGroups.push_back(DSWriteSchedGroup.get());
  }
}

void SchedBarrierDAGMutation::initSchedGroup(SchedGroup *SG) {
  assert(SG);
  for (auto &SU : DAG->SUnits)
    if (SG->canAddSU(SU, TII))
      SG->add(SU);
}

void SchedBarrierDAGMutation::resetSchedBarrierEdges(SUnit &SU) {
  assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER);
  // removePred() erases entries from the Preds/Succs lists as it goes, so
  // iterate over copies rather than the vectors that are being mutated.
  SmallVector<SDep, 4> Preds(SU.Preds);
  for (auto &P : Preds)
    SU.removePred(P);

  SmallVector<SDep, 4> Succs(SU.Succs);
  for (auto &S : Succs) {
    SmallVector<SDep, 4> SuccPreds(S.getSUnit()->Preds);
    for (auto &SP : SuccPreds)
      if (SP.getSUnit() == &SU)
        S.getSUnit()->removePred(SP);
  }
}

} // namespace

namespace llvm {

std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {
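  // Returning nullptr when the flag is off is fine; ScheduleDAGMI::addMutation
  // ignores null mutations.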
  return EnableIGroupLP ? std::make_unique<IGroupLPDAGMutation>() : nullptr;
}

std::unique_ptr<ScheduleDAGMutation> createSchedBarrierDAGMutation() {
  return std::make_unique<SchedBarrierDAGMutation>();
}

} // end namespace llvm