//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions which use intrinsics
/// that will impact calling convention lowering.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-annotate-kernel-features"

using namespace llvm;

namespace {
static constexpr StringLiteral ImplicitAttrNames[] = {
    // X ids unnecessarily propagated to kernels.
    "amdgpu-work-item-id-x",  "amdgpu-work-item-id-y",
    "amdgpu-work-item-id-z",  "amdgpu-work-group-id-x",
    "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
    "amdgpu-dispatch-ptr",    "amdgpu-dispatch-id",
    "amdgpu-queue-ptr",       "amdgpu-implicitarg-ptr"};

class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
  const TargetMachine *TM = nullptr;
  SmallVector<CallGraphNode *, 8> NodeList;

  bool addFeatureAttributes(Function &F);
  bool processUniformWorkGroupAttribute();
  bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);

public:
  static char ID;

  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}

  bool doInitialization(CallGraph &CG) override;
  bool runOnSCC(CallGraphSCC &SCC) override;

  StringRef getPassName() const override {
    return "AMDGPU Annotate Kernel Features";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  static bool visitConstantExpr(const ConstantExpr *CE);
  static bool visitConstantExprsRecursively(
      const Constant *EntryC,
      SmallPtrSet<const Constant *, 8> &ConstantExprVisited, bool IsFunc,
      bool HasApertureRegs);
};

} // end anonymous namespace

char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)

// The queue ptr is only needed when casting to flat, not from it.
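// For example, assuming a subtarget without aperture registers, a cast such as
//
//   %flat = addrspacecast i32 addrspace(3)* %lds to i32*
//
// needs the aperture base read from the queue ptr, while the reverse cast
// (flat to local/private) does not.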
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
    return castRequiresQueuePtr(SrcAS);
  }

  return false;
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
    const Constant *EntryC,
    SmallPtrSet<const Constant *, 8> &ConstantExprVisited, bool IsFunc,
    bool HasApertureRegs) {

  if (!ConstantExprVisited.insert(EntryC).second)
    return false;

  SmallVector<const Constant *, 16> Stack;
  Stack.push_back(EntryC);

  while (!Stack.empty()) {
    const Constant *C = Stack.pop_back_val();

    // We need to trap on DS globals in non-entry functions.
    if (IsFunc && isDSAddress(C))
      return true;

    // Check this constant expression.
    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
      if (!HasApertureRegs && visitConstantExpr(CE))
        return true;
    }

    // Visit all sub-expressions.
    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      if (!ConstantExprVisited.insert(OpC).second)
        continue;

      Stack.push_back(OpC);
    }
  }

  return false;
}

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static StringRef intrinsicToAttrName(Intrinsic::ID ID, bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return "amdgpu-kernarg-segment-ptr";
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-implicitarg-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    // TODO: Does not require queue ptr on gfx9+
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}

static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (Callee.hasFnAttribute(Name)) {
    Parent.addFnAttr(Name);
    return true;
  }
  return false;
}

static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
    NeedQueuePtr = true;

  for (StringRef AttrName : ImplicitAttrNames)
    handleAttr(Parent, Callee, AttrName);
}
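// Reconcile the "uniform-work-group-size" attribute across every caller/callee
// pair found on the call edges of the nodes collected in runOnSCC.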
bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
  bool Changed = false;

  for (auto *Node : reverse(NodeList)) {
    Function *Caller = Node->getFunction();

    for (auto I : *Node) {
      Function *Callee = std::get<1>(I)->getFunction();
      if (Callee)
        Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee);
    }
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
    Function &Caller, Function &Callee) {

  // Check for externally defined function
  if (!Callee.hasExactDefinition()) {
    Callee.addFnAttr("uniform-work-group-size", "false");
    if (!Caller.hasFnAttribute("uniform-work-group-size"))
      Caller.addFnAttr("uniform-work-group-size", "false");

    return true;
  }
  // Check if the Caller has the attribute
  if (Caller.hasFnAttribute("uniform-work-group-size")) {
    // Check if the value of the attribute is true
    if (Caller.getFnAttribute("uniform-work-group-size")
            .getValueAsString()
            .equals("true")) {
      // Propagate the attribute to the Callee, if it does not have it
      if (!Callee.hasFnAttribute("uniform-work-group-size")) {
        Callee.addFnAttr("uniform-work-group-size", "true");
        return true;
      }
    } else {
      Callee.addFnAttr("uniform-work-group-size", "false");
      return true;
    }
  } else {
    // If the attribute is absent, set it as false
    Caller.addFnAttr("uniform-work-group-size", "false");
    Callee.addFnAttr("uniform-work-group-size", "false");
    return true;
  }
  return false;
}
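// Scan the body of F and add the attributes implied by what it contains:
// intrinsic uses map to implicit-argument attributes, calls to non-intrinsic
// functions copy the callee's attributes, allocas imply "amdgpu-stack-objects",
// and address space casts from local/private may imply "amdgpu-queue-ptr".
// Returns true if any attribute was added.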
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool HaveStackObjects = false;
  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool HasIndirectCall = false;
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
  CallingConv::ID CC = F.getCallingConv();
  bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);

  // If this function has its address taken, add all attributes corresponding
  // to the implicit args.
  if (CallingConvSupportsAllImplicits &&
      F.hasAddressTaken(nullptr, true, true, true)) {
    for (StringRef AttrName : ImplicitAttrNames) {
      F.addFnAttr(AttrName);
    }
    Changed = true;
  }

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (isa<AllocaInst>(I)) {
        HaveStackObjects = true;
        continue;
      }

      if (auto *CB = dyn_cast<CallBase>(&I)) {
        const Function *Callee =
            dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());

        // Note the occurrence of indirect calls.
        if (!Callee) {
          if (!CB->isInlineAsm()) {
            HasIndirectCall = true;
            HaveCall = true;
          }
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;

          if (!IsFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
            F.addFnAttr("amdgpu-kernarg-segment-ptr");
          } else {
            StringRef AttrName =
                intrinsicToAttrName(IID, NonKernelOnly, NeedQueuePtr);
            if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
              F.addFnAttr(AttrName);
              Changed = true;
            }
          }
        }
      }

      if (NeedQueuePtr || (!IsFunc && HasApertureRegs))
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (!HasApertureRegs && castRequiresQueuePtr(ASC)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited, IsFunc,
                                          HasApertureRegs)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (!IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-calls");
    Changed = true;
  }

  if (HaveStackObjects) {
    F.addFnAttr("amdgpu-stack-objects");
    Changed = true;
  }

  // This pass cannot copy attributes from callees to callers if there is an
  // indirect call, and in such cases hasAddressTaken() would be false for
  // kernels and functions making an indirect call (if they are themselves not
  // indirectly called). We must tag all such kernels/functions with all
  // implicit attributes for correctness.
  // e.g.
  // 1. Kernel K1 makes an indirect call to function F1.
  //    Without detecting an indirect call in K1, this pass will not
  //    add all implicit args to K1 (which is incorrect).
  // 2. Kernel K1 makes a direct call to F1 which makes an indirect call to
  //    function F2.
  //    Without detecting an indirect call in F1 (whose hasAddressTaken() is
  //    false), the pass will not add all implicit args to F1 (which is
  //    essential for correctness).
  if (CallingConvSupportsAllImplicits && HasIndirectCall) {
    for (StringRef AttrName : ImplicitAttrNames) {
      F.addFnAttr(AttrName);
    }
    Changed = true;
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  bool Changed = false;

  for (CallGraphNode *I : SCC) {
    // Build a list of CallGraphNodes from the most used to the least.
    if (I->getNumReferences())
      NodeList.push_back(I);
    else {
      processUniformWorkGroupAttribute();
      NodeList.clear();
    }

    Function *F = I->getFunction();
    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
    if (!F || F->isDeclaration() || AMDGPU::isGraphics(F->getCallingConv()))
      continue;
    // Add feature attributes.
    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    report_fatal_error("TargetMachine is required");

  TM = &TPC->getTM<TargetMachine>();
  return false;
}

Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  return new AMDGPUAnnotateKernelFeatures();
}