//===------------------ AMDGPUCustomBehaviour.cpp ---------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements methods from the AMDGPUCustomBehaviour class.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCustomBehaviour.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/WithColor.h"

namespace llvm {
namespace mca {

void AMDGPUInstrPostProcess::postProcessInstruction(
    std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
  switch (MCI.getOpcode()) {
  case AMDGPU::S_WAITCNT:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    return processWaitCnt(Inst, MCI);
  }
}

// s_waitcnt instructions encode important information as immediate operands
// which are lost during the MCInst -> mca::Instruction lowering, so we copy
// each operand back onto the mca::Instruction here for later inspection.
void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
                                            const MCInst &MCI) {
  for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
    MCAOperand Op;
    const MCOperand &MCOp = MCI.getOperand(Idx);
    if (MCOp.isReg()) {
      Op = MCAOperand::createReg(MCOp.getReg());
    } else if (MCOp.isImm()) {
      Op = MCAOperand::createImm(MCOp.getImm());
    }
    Op.setIndex(Idx);
    Inst->addOperand(Op);
  }
}

AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                                             const mca::SourceMgr &SrcMgr,
                                             const MCInstrInfo &MCII)
    : CustomBehaviour(STI, SrcMgr, MCII) {
  generateWaitCntInfo();
}

unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
                                                  const InstRef &IR) {
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  // llvm-mca is generally run on fully compiled assembly so we wouldn't see
  // any pseudo instructions here. However, there are plans for the future to
  // make it possible to use mca within backend passes. As such, I have left
  // the pseudo version of s_waitcnt within this switch statement.
  switch (Opcode) {
  default:
    return 0;
  case AMDGPU::S_WAITCNT: // This instruction
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT: // to this instruction are all pseudo.
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    // s_endpgm also behaves as if there is an implicit
    // s_waitcnt 0, but I'm not sure if it would be appropriate
    // to model this in llvm-mca based on how the iterations work
    // while simulating the pipeline over and over.
    return handleWaitCnt(IssuedInst, IR);
  }

  return 0;
}

unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
                                              const InstRef &IR) {
  // Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr.
  // I do not know how that instruction works so I did not attempt to model it.
  // Set the counters to their maximum encodable values to begin.
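  // A counter left at its maximum effectively imposes no wait: computeWaitCnt
  // below only overwrites the counters that this particular s_waitcnt variant
  // constrains (e.g. an s_waitcnt_vscnt only sets Vscnt).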
  unsigned Vmcnt = 63;
  unsigned Expcnt = 7;
  unsigned Lgkmcnt = 31;
  unsigned Vscnt = 63;
  unsigned CurrVmcnt = 0;
  unsigned CurrExpcnt = 0;
  unsigned CurrLgkmcnt = 0;
  unsigned CurrVscnt = 0;
  unsigned CyclesToWaitVm = ~0U;
  unsigned CyclesToWaitExp = ~0U;
  unsigned CyclesToWaitLgkm = ~0U;
  unsigned CyclesToWaitVs = ~0U;

  computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);

  // We will now look at each of the currently executing instructions
  // to find out if this wait instruction still needs to wait.
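  // InstRef::getSourceIndex() keeps growing as the source region is replayed
  // across simulated iterations, so take it modulo the number of source
  // instructions to index into InstrWaitCntInfo.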
  for (const InstRef &PrevIR : IssuedInst) {
    const Instruction &PrevInst = *PrevIR.getInstruction();
    const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
    const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
    const int CyclesLeft = PrevInst.getCyclesLeft();
    assert(CyclesLeft != UNKNOWN_CYCLES &&
           "We should know how many cycles are left for this instruction");
    if (PrevInstWaitInfo.VmCnt) {
      CurrVmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVm)
        CyclesToWaitVm = CyclesLeft;
    }
    if (PrevInstWaitInfo.ExpCnt) {
      CurrExpcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitExp)
        CyclesToWaitExp = CyclesLeft;
    }
    if (PrevInstWaitInfo.LgkmCnt) {
      CurrLgkmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
        CyclesToWaitLgkm = CyclesLeft;
    }
    if (PrevInstWaitInfo.VsCnt) {
      CurrVscnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVs)
        CyclesToWaitVs = CyclesLeft;
    }
  }

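  // A wait is only required for a counter class when the number of outstanding
  // events of that class exceeds the count this s_waitcnt allows to remain.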
  unsigned CyclesToWait = ~0U;
  if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
    CyclesToWait = CyclesToWaitVm;
  if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
    CyclesToWait = CyclesToWaitExp;
  if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
    CyclesToWait = CyclesToWaitLgkm;
  if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
    CyclesToWait = CyclesToWaitVs;

  // We may underestimate how many cycles we need to wait, but this
  // isn't a big deal. Our return value is just how many cycles until
  // this function gets run again. So as long as we don't overestimate
  // the wait time, we'll still end up stalling at this instruction
  // for the correct number of cycles.

  if (CyclesToWait == ~0U)
    return 0;
  return CyclesToWait;
}

void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
                                           unsigned &Expcnt, unsigned &Lgkmcnt,
                                           unsigned &Vscnt) {
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  switch (Opcode) {
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
    // Should probably be checking for nullptr
    // here, but I'm not sure how I should handle the case
    // where we see a nullptr.
    const MCAOperand *OpReg = Inst.getOperand(0);
    const MCAOperand *OpImm = Inst.getOperand(1);
    assert(OpReg && OpReg->isReg() && "First operand should be a register.");
    assert(OpImm && OpImm->isImm() &&
           "Second operand should be an immediate.");
    if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
      // Instruction is using a real register.
      // Since we can't know what value this register will have,
      // we can't compute what the value of this wait should be.
      WithColor::warning() << "The register component of "
                           << MCII.getName(Opcode) << " will be completely "
                           << "ignored. So the wait may not be accurate.\n";
    }
    switch (Opcode) {
    // Redundant switch so I don't have to repeat the code above
    // for each case. There are more clever ways to avoid this
    // extra switch and anyone can feel free to implement one of them.
    case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
      Expcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
      Lgkmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VMCNT_gfx10:
      Vmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VSCNT_gfx10:
      Vscnt = OpImm->getImm();
      break;
    }
    return;
  }
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
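    // The combined s_waitcnt form packs vmcnt, expcnt, and lgkmcnt into a
    // single immediate; decodeWaitcnt splits it according to the target's ISA
    // version. vscnt is not part of this encoding and keeps its default.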
    unsigned WaitCnt = Inst.getOperand(0)->getImm();
    AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
    return;
  }
}

void AMDGPUCustomBehaviour::generateWaitCntInfo() {
  // The core logic from this function is taken from
  // SIInsertWaitcnts::updateEventWaitcntAfter(). In that pass, the
  // instructions that are being looked at are in the MachineInstr format,
  // whereas we have access to the MCInst format. The side effects of this are
  // that we can't use the mayAccessVMEMThroughFlat(Inst) or
  // mayAccessLDSThroughFlat(Inst) functions. Therefore, we conservatively
  // assume that these functions will return true. This may cause a few
  // instructions to be incorrectly tagged with an extra CNT. However, these
  // are instructions that do interact with at least one CNT so giving them an
  // extra CNT shouldn't cause issues in most scenarios.
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  InstrWaitCntInfo.resize(SrcMgr.size());

  int Index = 0;
  for (auto I = SrcMgr.begin(), E = SrcMgr.end(); I != E; ++I, ++Index) {
    const std::unique_ptr<Instruction> &Inst = *I;
    unsigned Opcode = Inst->getOpcode();
    const MCInstrDesc &MCID = MCII.get(Opcode);
    if ((MCID.TSFlags & SIInstrFlags::DS) &&
        (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
      // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
      // and mayAccessLDSThroughFlat(Inst) would both return true for this
      // instruction. We have to do this because those functions use
      // information about the memory operands that we don't have access to.
      InstrWaitCntInfo[Index].LgkmCnt = true;
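      // Without a separate store counter (FeatureVscnt), every VMEM access
      // counts against vmcnt; otherwise loads and returning atomics use vmcnt
      // while everything else uses vscnt.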
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
        InstrWaitCntInfo[Index].VmCnt = true;
      else
        InstrWaitCntInfo[Index].VsCnt = true;
    } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if ((MCID.mayLoad() &&
                !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) ||
               ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
                !MCID.mayStore()))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayStore())
        InstrWaitCntInfo[Index].VsCnt = true;

      // (IV.Major < 7) is meant to represent
      // GCNTarget.vmemWriteNeedsExpWaitcnt()
      // which is defined as
      // { return getGeneration() < SEA_ISLANDS; }
      if (IV.Major < 7 &&
          (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::EXP) {
      InstrWaitCntInfo[Index].ExpCnt = true;
    } else {
      switch (Opcode) {
      case AMDGPU::S_SENDMSG:
      case AMDGPU::S_SENDMSGHALT:
      case AMDGPU::S_MEMTIME:
      case AMDGPU::S_MEMREALTIME:
        InstrWaitCntInfo[Index].LgkmCnt = true;
        break;
      }
    }
  }
}

// taken from SIInstrInfo::isVMEM()
bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
  return MCID.TSFlags & SIInstrFlags::MUBUF ||
         MCID.TSFlags & SIInstrFlags::MTBUF ||
         MCID.TSFlags & SIInstrFlags::MIMG;
}

// taken from SIInstrInfo::hasModifiersSet()
bool AMDGPUCustomBehaviour::hasModifiersSet(
    const std::unique_ptr<Instruction> &Inst, unsigned OpName) const {
  int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
  if (Idx == -1)
    return false;

  const MCAOperand *Op = Inst->getOperand(Idx);
  if (Op == nullptr || !Op->isImm() || !Op->getImm())
    return false;

  return true;
}

// taken from SIInstrInfo::isAlwaysGDS()
bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
  return Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::DS_GWS_INIT ||
         Opcode == AMDGPU::DS_GWS_SEMA_V || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
         Opcode == AMDGPU::DS_GWS_SEMA_P ||
         Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
         Opcode == AMDGPU::DS_GWS_BARRIER;
}

} // namespace mca
} // namespace llvm

using namespace llvm;
using namespace mca;

static CustomBehaviour *
createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                            const mca::SourceMgr &SrcMgr,
                            const MCInstrInfo &MCII) {
  return new AMDGPUCustomBehaviour(STI, SrcMgr, MCII);
}

static InstrPostProcess *
createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
                             const MCInstrInfo &MCII) {
  return new AMDGPUInstrPostProcess(STI, MCII);
}

/// Extern function to initialize the targets for the AMDGPU backend.
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMCA() {
  TargetRegistry::RegisterCustomBehaviour(getTheAMDGPUTarget(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheAMDGPUTarget(),
                                           createAMDGPUInstrPostProcess);

  TargetRegistry::RegisterCustomBehaviour(getTheGCNTarget(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheGCNTarget(),
                                           createAMDGPUInstrPostProcess);
}