//===- PerformanceInliner.cpp - Basic cost based inlining for performance -===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See http://swift.org/LICENSE.txt for license information
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "sil-inliner"
#include "swift/SIL/SILInstruction.h"
#include "swift/SIL/Dominance.h"
#include "swift/SIL/SILModule.h"
#include "swift/SIL/Projection.h"
#include "swift/SILAnalysis/ColdBlockInfo.h"
#include "swift/SILAnalysis/DominanceAnalysis.h"
#include "swift/SILAnalysis/CallGraphAnalysis.h"
#include "swift/SILAnalysis/LoopAnalysis.h"
#include "swift/SILPasses/Passes.h"
#include "swift/SILPasses/Transforms.h"
#include "swift/SILPasses/Utils/Local.h"
#include "swift/SILPasses/Utils/ConstantFolding.h"
#include "swift/SILPasses/Utils/Devirtualize.h"
#include "swift/SILPasses/Utils/Generics.h"
#include "swift/SILPasses/Utils/SILInliner.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/ADT/MapVector.h"
#include <functional>

using namespace swift;

STATISTIC(NumFunctionsInlined, "Number of functions inlined");

namespace {

// Threshold for deterministic testing of the inline heuristic.
// It specifies an instruction cost limit where a simplified model is used
// for the instruction costs: only builtin instructions have a cost of exactly
// 1.
llvm::cl::opt<int> TestThreshold("sil-inline-test-threshold",
                                 llvm::cl::init(-1), llvm::cl::Hidden);

llvm::cl::opt<int> TestOpt("sil-inline-test",
                           llvm::cl::init(0), llvm::cl::Hidden);

// The following constants define the cost model for inlining.

// The base value for every call: it represents the benefit of removing the
// call overhead.
// This value can be overridden with the -sil-inline-threshold option.
const unsigned RemovedCallBenefit = 80;

// The benefit if the condition of a terminator instruction gets constant due
// to inlining.
const unsigned ConstTerminatorBenefit = 2;

// Benefit if the operand of an apply gets constant, e.g. if a closure is
// passed to an apply instruction in the callee.
const unsigned ConstCalleeBenefit = 150;

// Additional benefit for each loop level.
const unsigned LoopBenefitFactor = 40;

// Approximately up to this cost level a function can be inlined without
// increasing the code size.
const unsigned TrivialFunctionThreshold = 20;

// Represents a value in integer constant evaluation.
struct IntConst {
  IntConst() : isValid(false), isFromCaller(false) { }

  IntConst(const APInt &value, bool isFromCaller) :
    value(value), isValid(true), isFromCaller(isFromCaller) { }

  // The actual value.
  APInt value;

  // True if the value is valid, i.e. could be evaluated to a constant.
  bool isValid;

  // True if the value is only valid because a constant is passed to the
  // callee. False if constant propagation could do the same job inside the
  // callee without inlining it.
  bool isFromCaller;
};

// Tracks constants in the caller and callee to get an estimation of what
// values get constant if the callee is inlined.
// This can be seen as a "simulation" of several optimizations: SROA, mem2reg
// and constant propagation.
// Note that this is only a simplified model and not correct in all cases.
// For example, aliasing information is not taken into account.
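//
// As an illustration (hypothetical SIL, not taken from a real test case):
//
//   // caller
//   %c = integer_literal $Builtin.Int64, 27
//   store %c to %addr : $*Builtin.Int64
//   %r = apply %callee(%addr) : ...
//
//   // callee
//   %v = load %arg : $*Builtin.Int64
//
// The tracker links the load in the callee back to the store in the caller,
// so %v is estimated to be the constant 27 if the callee gets inlined.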
class ConstantTracker {
  // Links between loaded and stored values.
  // The key is a load instruction, the value is the corresponding store
  // instruction which stores the loaded value. Both key and value can also
  // be copy_addr instructions.
  llvm::DenseMap<SILInstruction *, SILInstruction *> links;

  // The current stored values at memory addresses.
  // The key is the base address of the memory (after skipping address
  // projections). The values are store (or copy_addr) instructions, which
  // store the current value.
  // This is only an estimation, because e.g. it does not consider potential
  // aliasing.
  llvm::DenseMap<SILValue, SILInstruction *> memoryContent;

  // Cache for evaluated constants.
  llvm::SmallDenseMap<BuiltinInst *, IntConst> constCache;

  // The caller/callee function which is tracked.
  SILFunction *F;

  // The constant tracker of the caller function (null if this is the
  // tracker of the callee).
  ConstantTracker *callerTracker;

  // The apply instruction in the caller (null if this is the tracker of the
  // callee).
  ApplyInst *AI;

  // Walks through address projections and (optionally) collects them.
  // Returns the base address, i.e. the first address which is not a
  // projection.
  SILValue scanProjections(SILValue addr,
                           SmallVectorImpl<Projection> *Result = nullptr);

  // Get the stored value for a load. The loadInst can be either a real load
  // or a copy_addr.
  SILValue getStoredValue(SILInstruction *loadInst,
                          ProjectionPath &projStack);

  // Gets the parameter in the caller for a function argument.
  SILValue getParam(SILValue value) {
    if (SILArgument *arg = dyn_cast<SILArgument>(value)) {
      if (AI && arg->isFunctionArg() && arg->getFunction() == F) {
        // Continue at the caller.
        return AI->getArgument(arg->getIndex());
      }
    }
    return SILValue();
  }

  SILInstruction *getMemoryContent(SILValue addr) {
    // The memory content can be stored in this ConstantTracker or in the
    // caller's ConstantTracker.
    SILInstruction *storeInst = memoryContent[addr];
    if (storeInst)
      return storeInst;
    if (callerTracker)
      return callerTracker->getMemoryContent(addr);
    return nullptr;
  }

  // Gets the estimated definition of a value.
  SILInstruction *getDef(SILValue val, ProjectionPath &projStack);

  // Gets the estimated integer constant result of a builtin.
  IntConst getBuiltinConst(BuiltinInst *BI, int depth);

public:

  // Constructor for the caller function.
  ConstantTracker(SILFunction *function) :
    F(function), callerTracker(nullptr), AI(nullptr)
  { }

  // Constructor for the callee function.
  ConstantTracker(SILFunction *function, ConstantTracker *caller,
                  ApplyInst *callerApply) :
    F(function), callerTracker(caller), AI(callerApply)
  { }

  void beginBlock() {
    // Currently we don't do any sophisticated dataflow analysis, so we keep
    // the memoryContent alive only for a single block.
    memoryContent.clear();
  }

  // Must be called for each instruction visited in dominance order.
  void trackInst(SILInstruction *inst);

  // Gets the estimated definition of a value.
  SILInstruction *getDef(SILValue val) {
    ProjectionPath projStack;
    return getDef(val, projStack);
  }

  // Gets the estimated definition of a value if it is in the caller.
  SILInstruction *getDefInCaller(SILValue val) {
    SILInstruction *def = getDef(val);
    if (def && def->getFunction() != F)
      return def;
    return nullptr;
  }

  // Gets the estimated integer constant of a value.
  IntConst getIntConst(SILValue val, int depth = 0);
};
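// A minimal usage sketch (assumed, mirroring how isProfitableToInline below
// wires the trackers together): the callee's tracker is chained to the
// caller's tracker through the apply instruction, so queries on callee
// values can be answered with constants that only exist in the caller.
//
//   ConstantTracker callerTracker(Caller);
//   // ... callerTracker.trackInst(&I) for each instruction, in dominance
//   // order, calling beginBlock() at each block entry ...
//   ConstantTracker calleeTracker(Callee, &callerTracker, AI);
//   IntConst c = calleeTracker.getIntConst(someCalleeValue);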
// Controls the decision to inline functions with @_semantics, @effects and
// global_init attributes.
enum class InlineSelection {
  Everything,
  NoGlobalInit, // and no availability semantics calls
  NoSemanticsAndGlobalInit
};

class SILPerformanceInliner {
  /// The inline threshold.
  const int InlineCostThreshold;
  /// Specifies which functions not to inline, based on @_semantics and
  /// global_init attributes.
  InlineSelection WhatToInline;

  /// A map for each newly cloned apply back to the apply that it
  /// was cloned from. This is used in order to stop unbounded
  /// recursion in the case where we are cloning functions that are
  /// themselves recursive, especially when they are indirectly
  /// recursive in a way that is only exposed through inlining
  /// followed by devirtualization.
  llvm::DenseMap<FullApplySite, FullApplySite> OriginMap;
  llvm::DenseSet<FullApplySite> RemovedApplies;

  SILFunction *getEligibleFunction(ApplyInst *AI);

  bool isProfitableToInline(ApplyInst *AI, unsigned loopDepthOfAI,
                            DominanceAnalysis *DA,
                            SILLoopAnalysis *LA,
                            ConstantTracker &constTracker);

  void visitColdBlocks(SmallVectorImpl<ApplyInst *> &AppliesToInline,
                       SILBasicBlock *root, DominanceInfo *DT);

  void collectAppliesToInline(SILFunction *Caller,
                              SmallVectorImpl<ApplyInst *> &Applies,
                              DominanceAnalysis *DA, SILLoopAnalysis *LA,
                              CallGraph &CG);

  bool applyTargetsOriginFunction(FullApplySite Apply, SILFunction *Callee);

  void removeApply(FullApplySite Apply, CallGraph &CG,
                   llvm::SmallVectorImpl<FullApplySite> &NewApplies);

  FullApplySite devirtualizeUpdatingCallGraph(FullApplySite Apply,
                                              CallGraph &CG);

  bool devirtualizeAndSpecializeApplies(
      llvm::SmallVectorImpl<FullApplySite> &Applies,
      CallGraphAnalysis *CGA,
      SILModuleTransform *MT,
      llvm::SmallVectorImpl<SILFunction *> &WorkList);

  FullApplySite specializeGenericUpdatingCallGraph(
      FullApplySite Apply, CallGraph &CG,
      llvm::SmallVectorImpl<FullApplySite> &NewApplies);

  bool inlineCallsIntoFunction(SILFunction *F, DominanceAnalysis *DA,
                               SILLoopAnalysis *LA, CallGraph &CG,
                               llvm::SmallVectorImpl<FullApplySite> &NewApplies);

public:
  SILPerformanceInliner(int threshold, InlineSelection WhatToInline)
      : InlineCostThreshold(threshold), WhatToInline(WhatToInline) {}

  void inlineDevirtualizeAndSpecialize(SILFunction *WorkItem,
                                       SILModuleTransform *MT,
                                       CallGraphAnalysis *CGA,
                                       DominanceAnalysis *DA,
                                       SILLoopAnalysis *LA);
};

} // end anonymous namespace

//===----------------------------------------------------------------------===//
//                             ConstantTracker
//===----------------------------------------------------------------------===//

void ConstantTracker::trackInst(SILInstruction *inst) {
  if (LoadInst *LI = dyn_cast<LoadInst>(inst)) {
    SILValue baseAddr = scanProjections(LI->getOperand());
    if (SILInstruction *loadLink = getMemoryContent(baseAddr))
      links[LI] = loadLink;
  } else if (StoreInst *SI = dyn_cast<StoreInst>(inst)) {
    SILValue baseAddr = scanProjections(SI->getOperand(1));
    memoryContent[baseAddr] = SI;
  } else if (CopyAddrInst *CAI = dyn_cast<CopyAddrInst>(inst)) {
    if (!CAI->isTakeOfSrc()) {
      // Treat a copy_addr as a load + store.
      SILValue loadAddr = scanProjections(CAI->getOperand(0));
      if (SILInstruction *loadLink = getMemoryContent(loadAddr)) {
        links[CAI] = loadLink;
        SILValue storeAddr = scanProjections(CAI->getOperand(1));
        memoryContent[storeAddr] = CAI;
      }
    }
  }
}

SILValue ConstantTracker::scanProjections(SILValue addr,
                                          SmallVectorImpl<Projection> *Result) {
  for (;;) {
    if (Projection::isAddrProjection(addr)) {
      SILInstruction *I = cast<SILInstruction>(addr.getDef());
      if (Result) {
        Optional<Projection> P =
            Projection::addressProjectionForInstruction(I);
        Result->push_back(P.getValue());
      }
      addr = I->getOperand(0);
      continue;
    }
    if (SILValue param = getParam(addr)) {
      // Go to the caller.
      addr = param;
      continue;
    }
    // Return the base address = the first address which is not a projection.
    return addr;
  }
}
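// For example (hypothetical SIL): given the address
//
//   %a = alloc_stack $S
//   %f = struct_element_addr %a : $*S, #S.x
//
// scanProjections(%f) returns %a as the base address and, if requested,
// records the #S.x projection in Result.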
SILValue ConstantTracker::getStoredValue(SILInstruction *loadInst,
                                         ProjectionPath &projStack) {
  SILInstruction *store = links[loadInst];
  if (!store && callerTracker)
    store = callerTracker->links[loadInst];
  if (!store)
    return SILValue();

  assert(isa<LoadInst>(loadInst) || isa<CopyAddrInst>(loadInst));

  // Push the address projections of the load onto the stack.
  SmallVector<Projection, 4> loadProjections;
  scanProjections(loadInst->getOperand(0), &loadProjections);
  for (const Projection &proj : loadProjections) {
    projStack.push_back(proj);
  }

  // Pop the address projections of the store from the stack.
  SmallVector<Projection, 4> storeProjections;
  scanProjections(store->getOperand(1), &storeProjections);
  for (auto iter = storeProjections.rbegin(); iter != storeProjections.rend();
       ++iter) {
    const Projection &proj = *iter;
    // The corresponding load-projection must match the store-projection.
    if (projStack.empty() || projStack.back() != proj)
      return SILValue();
    projStack.pop_back();
  }

  if (isa<StoreInst>(store))
    return store->getOperand(0);

  // The copy_addr instruction is both a load and a store. So we follow the
  // link again.
  assert(isa<CopyAddrInst>(store));
  return getStoredValue(store, projStack);
}

// Get the aggregate member based on the top of the projection stack.
static SILValue getMember(SILInstruction *inst, ProjectionPath &projStack) {
  if (!projStack.empty()) {
    const Projection &proj = projStack.back();
    return proj.getOperandForAggregate(inst);
  }
  return SILValue();
}

SILInstruction *ConstantTracker::getDef(SILValue val,
                                        ProjectionPath &projStack) {
  // Track the value up the dominator tree.
  for (;;) {
    if (SILInstruction *inst = dyn_cast<SILInstruction>(val)) {
      if (auto proj = Projection::valueProjectionForInstruction(inst)) {
        // Extract a member from a struct/tuple/enum.
        projStack.push_back(proj.getValue());
        val = inst->getOperand(0);
        continue;
      } else if (SILValue member = getMember(inst, projStack)) {
        // The opposite of a projection instruction: composing a struct/tuple.
        projStack.pop_back();
        val = member;
        continue;
      } else if (SILValue loadedVal = getStoredValue(inst, projStack)) {
        // A value loaded from memory.
        val = loadedVal;
        continue;
      } else if (isa<ThinToThickFunctionInst>(inst)) {
        val = inst->getOperand(0);
        continue;
      }
      return inst;
    } else if (SILValue param = getParam(val)) {
      // Continue in the caller.
      val = param;
      continue;
    }
    return nullptr;
  }
}
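// For example (hypothetical SIL): getDef looks through matching
// compose/extract pairs,
//
//   %s = struct $S (%x : $Int, %y : $Int)
//   %e = struct_extract %s : $S, #S.y
//
// so getDef(%e) yields the definition of %y, even across the caller/callee
// boundary if %s is a function argument.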
IntConst ConstantTracker::getBuiltinConst(BuiltinInst *BI, int depth) {
  const BuiltinInfo &Builtin = BI->getBuiltinInfo();
  OperandValueArrayRef Args = BI->getArguments();
  switch (Builtin.ID) {
    default: break;

      // Fold comparison predicates.
#define BUILTIN(id, name, Attrs)
#define BUILTIN_BINARY_PREDICATE(id, name, attrs, overload) \
case BuiltinValueKind::id:
#include "swift/AST/Builtins.def"
      {
        IntConst lhs = getIntConst(Args[0], depth);
        IntConst rhs = getIntConst(Args[1], depth);
        if (lhs.isValid && rhs.isValid) {
          return IntConst(constantFoldComparison(lhs.value, rhs.value,
                                                 Builtin.ID),
                          lhs.isFromCaller || rhs.isFromCaller);
        }
        break;
      }

    case BuiltinValueKind::SAddOver:
    case BuiltinValueKind::UAddOver:
    case BuiltinValueKind::SSubOver:
    case BuiltinValueKind::USubOver:
    case BuiltinValueKind::SMulOver:
    case BuiltinValueKind::UMulOver: {
      IntConst lhs = getIntConst(Args[0], depth);
      IntConst rhs = getIntConst(Args[1], depth);
      if (lhs.isValid && rhs.isValid) {
        bool IgnoredOverflow;
        return IntConst(constantFoldBinaryWithOverflow(lhs.value, rhs.value,
                        IgnoredOverflow,
                        getLLVMIntrinsicIDForBuiltinWithOverflow(Builtin.ID)),
                        lhs.isFromCaller || rhs.isFromCaller);
      }
      break;
    }

    case BuiltinValueKind::SDiv:
    case BuiltinValueKind::SRem:
    case BuiltinValueKind::UDiv:
    case BuiltinValueKind::URem: {
      IntConst lhs = getIntConst(Args[0], depth);
      IntConst rhs = getIntConst(Args[1], depth);
      if (lhs.isValid && rhs.isValid && rhs.value != 0) {
        bool IgnoredOverflow;
        return IntConst(constantFoldDiv(lhs.value, rhs.value,
                                        IgnoredOverflow, Builtin.ID),
                        lhs.isFromCaller || rhs.isFromCaller);
      }
      break;
    }

    case BuiltinValueKind::And:
    case BuiltinValueKind::AShr:
    case BuiltinValueKind::LShr:
    case BuiltinValueKind::Or:
    case BuiltinValueKind::Shl:
    case BuiltinValueKind::Xor: {
      IntConst lhs = getIntConst(Args[0], depth);
      IntConst rhs = getIntConst(Args[1], depth);
      if (lhs.isValid && rhs.isValid) {
        return IntConst(constantFoldBitOperation(lhs.value, rhs.value,
                                                 Builtin.ID),
                        lhs.isFromCaller || rhs.isFromCaller);
      }
      break;
    }

    case BuiltinValueKind::Trunc:
    case BuiltinValueKind::ZExt:
    case BuiltinValueKind::SExt:
    case BuiltinValueKind::TruncOrBitCast:
    case BuiltinValueKind::ZExtOrBitCast:
    case BuiltinValueKind::SExtOrBitCast: {
      IntConst val = getIntConst(Args[0], depth);
      if (val.isValid) {
        return IntConst(constantFoldCast(val.value, Builtin), val.isFromCaller);
      }
      break;
    }
  }
  return IntConst();
}

// Tries to evaluate the integer constant of a value. The \p depth is used
// to limit the complexity.
IntConst ConstantTracker::getIntConst(SILValue val, int depth) {

  // Don't spend too much time with constant evaluation.
  if (depth >= 10)
    return IntConst();

  SILInstruction *I = getDef(val);
  if (!I)
    return IntConst();

  if (auto *IL = dyn_cast<IntegerLiteralInst>(I)) {
    return IntConst(IL->getValue(), IL->getFunction() != F);
  }
  if (auto *BI = dyn_cast<BuiltinInst>(I)) {
    if (constCache.count(BI) != 0)
      return constCache[BI];

    IntConst builtinConst = getBuiltinConst(BI, depth + 1);
    constCache[BI] = builtinConst;
    return builtinConst;
  }
  return IntConst();
}
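// A worked example of the isFromCaller flag (hypothetical SIL):
//
//   %c = integer_literal $Builtin.Int64, 2      // in the caller
//   %r = apply %callee(%c) : ...
//
//   %k = integer_literal $Builtin.Int64, 10     // in the callee
//   %b = builtin "cmp_slt_Int64"(%arg, %k)
//
// getIntConst(%b) resolves %arg to the caller-side literal 2 and folds the
// comparison; the result is marked isFromCaller because one operand could
// only be proven constant by looking into the caller, i.e. the fold is a
// benefit of inlining rather than of local constant propagation.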
//===----------------------------------------------------------------------===//
//                           Performance Inliner
//===----------------------------------------------------------------------===//

// Returns the referenced function of an apply_inst if it is a direct call.
static SILFunction *getReferencedFunction(FullApplySite Apply) {
  auto *FRI = dyn_cast<FunctionRefInst>(Apply.getCallee());
  if (!FRI)
    return nullptr;

  return FRI->getReferencedFunction();
}

// Given an apply, determine if it, or any apply it was cloned from, was
// originally in Callee. If so, we can end up in an infinite loop when trying
// to inline recursive cycles in the code (some of which are only exposed via
// devirtualization).
bool SILPerformanceInliner::applyTargetsOriginFunction(FullApplySite Apply,
                                                       SILFunction *Callee) {
  assert(Apply && "Expected non-null apply!");
  assert(!RemovedApplies.count(Apply) && "Apply cannot have been removed!");

  while (Apply.getFunction() != Callee) {
    auto Found = OriginMap.find(Apply);
    if (Found == OriginMap.end())
      return false;

    assert(Found->second && "Expected non-null apply!");
    Apply = Found->second;

    // Bail if we hit an apply that has been removed since we cannot
    // determine if we'll end up in a recursive inlining situation.
    if (RemovedApplies.count(Apply))
      return true;
  }

  return true;
}

// Returns the callee of an apply_inst if it is basically inlinable.
SILFunction *SILPerformanceInliner::getEligibleFunction(ApplyInst *AI) {
  SILFunction *Callee = getReferencedFunction(AI);

  if (!Callee) {
    DEBUG(llvm::dbgs() << "        FAIL: Cannot find inlinable callee.\n");
    return nullptr;
  }

  // Don't inline functions that are marked with the @_semantics or @effects
  // attribute if the inliner is asked not to inline them.
  if (Callee->hasDefinedSemantics() || Callee->hasEffectsKind()) {
    if (WhatToInline == InlineSelection::NoSemanticsAndGlobalInit) {
      DEBUG(llvm::dbgs() << "        FAIL: Function " << Callee->getName()
            << " has special semantics or effects attribute.\n");
      return nullptr;
    }
    // The "availability" semantics attribute is treated like global-init.
    if (Callee->hasDefinedSemantics() &&
        WhatToInline != InlineSelection::Everything &&
        Callee->getSemanticsString().startswith("availability")) {
      return nullptr;
    }
  } else if (Callee->isGlobalInit()) {
    if (WhatToInline != InlineSelection::Everything) {
      DEBUG(llvm::dbgs() << "        FAIL: Function " << Callee->getName()
            << " has the global-init attribute.\n");
      return nullptr;
    }
  }

  // We can't inline external declarations.
  if (Callee->empty() || Callee->isExternalDeclaration()) {
    DEBUG(llvm::dbgs() << "        FAIL: Cannot inline external "
          << Callee->getName() << ".\n");
    return nullptr;
  }

  // Explicitly disabled inlining.
  if (Callee->getInlineStrategy() == NoInline) {
    DEBUG(llvm::dbgs() << "        FAIL: noinline attribute on "
          << Callee->getName() << ".\n");
    return nullptr;
  }

  // We don't support this yet.
  if (AI->hasSubstitutions()) {
    DEBUG(llvm::dbgs() << "        FAIL: Generic substitutions on "
          << Callee->getName() << ".\n");
    return nullptr;
  }

  SILFunction *Caller = AI->getFunction();

  // Check for trivial recursions.
  if (Callee == Caller) {
    DEBUG(llvm::dbgs() << "        FAIL: Skipping recursive calls on "
          << Callee->getName() << ".\n");
    return nullptr;
  }

  // Check for non-trivial recursion.
  if (applyTargetsOriginFunction(AI, Callee)) {
    DEBUG(llvm::dbgs() << "        FAIL: Non-trivial recursion calling "
          << Callee->getName() << ".\n");
    return nullptr;
  }

  // A non-fragile function may not be inlined into a fragile function.
  if (Caller->isFragile() && !Callee->isFragile()) {
    DEBUG(llvm::dbgs() << "        FAIL: Can't inline fragile "
          << Callee->getName() << ".\n");
    return nullptr;
  }

  DEBUG(llvm::dbgs() << "        Eligible callee: " << Callee->getName()
        << "\n");
  return Callee;
}

// Gets the cost of an instruction by using the simplified test-model: only
// builtin instructions have a cost, and that cost is exactly 1.
static unsigned testCost(SILInstruction *I) {
  switch (I->getKind()) {
    case ValueKind::BuiltinInst:
      return 1;
    default:
      return 0;
  }
}
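// Under -sil-inline-test-threshold only builtins are counted, so a callee
// consisting of, say, two builtins plus any number of struct/tuple and
// memory instructions has a deterministic test cost of exactly 2.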
// Returns the taken block of a terminator instruction if the condition turns
// out to be constant.
static SILBasicBlock *getTakenBlock(TermInst *term,
                                    ConstantTracker &constTracker) {
  if (CondBranchInst *CBI = dyn_cast<CondBranchInst>(term)) {
    IntConst condConst = constTracker.getIntConst(CBI->getCondition());
    if (condConst.isFromCaller) {
      return condConst.value != 0 ? CBI->getTrueBB() : CBI->getFalseBB();
    }
    return nullptr;
  }
  if (SwitchValueInst *SVI = dyn_cast<SwitchValueInst>(term)) {
    IntConst switchConst = constTracker.getIntConst(SVI->getOperand());
    if (switchConst.isFromCaller) {
      for (unsigned Idx = 0; Idx < SVI->getNumCases(); ++Idx) {
        auto switchCase = SVI->getCase(Idx);
        if (auto *IL = dyn_cast<IntegerLiteralInst>(switchCase.first)) {
          if (switchConst.value == IL->getValue())
            return switchCase.second;
        } else {
          return nullptr;
        }
      }
      if (SVI->hasDefault())
        return SVI->getDefaultBB();
    }
    return nullptr;
  }
  if (SwitchEnumInst *SEI = dyn_cast<SwitchEnumInst>(term)) {
    if (SILInstruction *def = constTracker.getDefInCaller(SEI->getOperand())) {
      if (EnumInst *EI = dyn_cast<EnumInst>(def)) {
        for (unsigned Idx = 0; Idx < SEI->getNumCases(); ++Idx) {
          auto enumCase = SEI->getCase(Idx);
          if (enumCase.first == EI->getElement())
            return enumCase.second;
        }
        if (SEI->hasDefault())
          return SEI->getDefaultBB();
      }
    }
    return nullptr;
  }
  if (CheckedCastBranchInst *CCB = dyn_cast<CheckedCastBranchInst>(term)) {
    if (SILInstruction *def = constTracker.getDefInCaller(CCB->getOperand())) {
      if (UpcastInst *UCI = dyn_cast<UpcastInst>(def)) {
        SILType castType = UCI->getOperand()->getType(0);
        if (CCB->getCastType().isSuperclassOf(castType)) {
          return CCB->getSuccessBB();
        }
        if (!castType.isSuperclassOf(CCB->getCastType())) {
          return CCB->getFailureBB();
        }
      }
    }
  }
  return nullptr;
}
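// For example (hypothetical SIL in the callee):
//
//   cond_br %flag, bb1, bb2
//
// If %flag turns out to be a constant passed in by the caller, getTakenBlock
// returns bb1 or bb2, and the cost calculation below skips the blocks that
// become dead after inlining.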
/// Return true if inlining this call site is profitable.
bool SILPerformanceInliner::isProfitableToInline(ApplyInst *AI,
                                                 unsigned loopDepthOfAI,
                                                 DominanceAnalysis *DA,
                                                 SILLoopAnalysis *LA,
                                                 ConstantTracker &callerTracker) {
  SILFunction *Callee = getReferencedFunction(AI);

  if (Callee->getInlineStrategy() == AlwaysInline)
    return true;

  ConstantTracker constTracker(Callee, &callerTracker, AI);

  DominanceInfo *DT = DA->get(Callee);
  SILLoopInfo *LI = LA->getLoopInfo(Callee);

  DominanceOrder domOrder(&Callee->front(), DT, Callee->size());

  // Calculate the inlining cost of the callee.
  unsigned CalleeCost = 0;
  unsigned Benefit = InlineCostThreshold > 0 ? InlineCostThreshold :
                                               RemovedCallBenefit;
  Benefit += loopDepthOfAI * LoopBenefitFactor;
  int testThreshold = TestThreshold;

  while (SILBasicBlock *block = domOrder.getNext()) {
    constTracker.beginBlock();
    unsigned loopDepth = LI->getLoopDepth(block);
    for (SILInstruction &I : *block) {
      constTracker.trackInst(&I);

      auto ICost = instructionInlineCost(I);

      if (testThreshold >= 0) {
        // We are in test-mode: use a simplified cost model.
        CalleeCost += testCost(&I);
      } else {
        // Use the regular cost model.
        CalleeCost += unsigned(ICost);
      }

      if (ApplyInst *AI = dyn_cast<ApplyInst>(&I)) {
        // Check if the callee is passed as an argument. If so, increase the
        // threshold, because inlining will (probably) eliminate the closure.
        SILInstruction *def = constTracker.getDefInCaller(AI->getCallee());
        if (def && (isa<FunctionRefInst>(def) || isa<PartialApplyInst>(def))) {
          DEBUG(llvm::dbgs() << "        Boost: apply const function at"
                             << *AI);
          Benefit += ConstCalleeBenefit + loopDepth * LoopBenefitFactor;
          testThreshold *= 2;
        }
      }
    }
    // Don't count costs in blocks which are dead after inlining.
    SILBasicBlock *takenBlock = getTakenBlock(block->getTerminator(),
                                              constTracker);
    if (takenBlock) {
      Benefit += ConstTerminatorBenefit + TestOpt;
      DEBUG(llvm::dbgs() << "      Take bb" << takenBlock->getDebugID()
            << " of" << *block->getTerminator());
      domOrder.pushChildrenIf(block, [=] (SILBasicBlock *child) {
        return child->getSinglePredecessor() != block || child == takenBlock;
      });
    } else {
      domOrder.pushChildren(block);
    }
  }

  unsigned Threshold = Benefit; // The default.
  if (testThreshold >= 0) {
    // We are in testing mode.
    Threshold = testThreshold;
  } else if (AI->getFunction()->isThunk()) {
    // Only inline trivial functions into thunks (which will not increase the
    // code size).
    Threshold = TrivialFunctionThreshold;
  }

  if (CalleeCost > Threshold) {
    DEBUG(llvm::dbgs() << "        NO: Function too big to inline, "
          "cost: " << CalleeCost << ", threshold: " << Threshold << "\n");
    return false;
  }
  DEBUG(llvm::dbgs() << "        YES: ready to inline, "
        "cost: " << CalleeCost << ", threshold: " << Threshold << "\n");
  return true;
}

/// Return true if inlining this call site into a cold block is profitable.
static bool isProfitableInColdBlock(SILFunction *Callee) {
  if (Callee->getInlineStrategy() == AlwaysInline)
    return true;

  // Testing with the TestThreshold disables inlining into cold blocks.
  if (TestThreshold >= 0)
    return false;

  unsigned CalleeCost = 0;

  for (SILBasicBlock &Block : *Callee) {
    for (SILInstruction &I : Block) {
      auto ICost = instructionInlineCost(I);
      CalleeCost += (unsigned)ICost;

      if (CalleeCost > TrivialFunctionThreshold)
        return false;
    }
  }

  DEBUG(llvm::dbgs() << "        YES: ready to inline into cold block, cost:"
        << CalleeCost << "\n");
  return true;
}
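// Rough arithmetic of the cost model above, using the constants defined at
// the top of this file: a call at loop depth 2 starts with
// Benefit = 80 + 2 * 40 = 160, so a callee costing up to 160 is inlined;
// a constant closure argument adds another 150 (plus 40 per loop level of
// the apply inside the callee).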
// Attempt to devirtualize, maintaining the call graph if
// successful. When successful, replaces the old apply with the new
// one and returns the new one. When unsuccessful returns an empty
// apply site.
FullApplySite SILPerformanceInliner::devirtualizeUpdatingCallGraph(
                                                          FullApplySite Apply,
                                                          CallGraph &CG) {
  auto *AI = cast<ApplyInst>(Apply.getInstruction());
  auto *NewInst = tryDevirtualizeApply(AI);
  if (!NewInst)
    return FullApplySite();

  if (auto *Edge = CG.getCallGraphEdge(AI))
    CG.removeEdge(Edge);

  auto *NewAI = findApplyFromDevirtualizedResult(NewInst);
  // In cases where devirtualization results in having to
  // insert code to match the result type of the original
  // function, we need to find the original apply. It's
  // currently simple enough to always do this, and we'll end
  // up with a better call graph if we try to maintain the
  // property that we can always find the apply that resulted
  // from devirtualizing.
  assert(NewAI && "Expected to find an apply!");

  assert(!OriginMap.count(NewAI) && "Unexpected apply in map!");
  if (OriginMap.count(AI))
    OriginMap[NewAI] = OriginMap[AI];
  RemovedApplies.insert(AI);
  replaceDeadApply(AI, NewInst);

  auto *F = getReferencedFunction(NewAI);
  assert(F && "Expected direct function referenced!");

  // If we devirtualized to a function declaration, attempt to link it
  // in and update the call graph, or else just return the newly
  // devirtualized instruction.
  if (F->isExternalDeclaration()) {
    auto &M = F->getModule();
    if (!M.linkFunction(F, SILModule::LinkingMode::LinkAll,
                        CallGraphLinkerEditor(CG).getCallback()))
      return NewAI;
  }

  CG.addEdgesForApply(NewAI);

  return NewAI;
}

FullApplySite SILPerformanceInliner::specializeGenericUpdatingCallGraph(
                           FullApplySite Apply, CallGraph &CG,
                           llvm::SmallVectorImpl<FullApplySite> &NewApplies) {
  assert(NewApplies.empty() && "Expected out parameter for new applies!");

  if (!Apply.hasSubstitutions())
    return FullApplySite();

  auto *Callee = getReferencedFunction(Apply);

  if (!Callee)
    return FullApplySite();

  if (Callee->isExternalDeclaration()) {
    auto &M = Callee->getModule();
    if (!M.linkFunction(Callee, SILModule::LinkingMode::LinkAll,
                        CallGraphLinkerEditor(CG).getCallback()))
      return FullApplySite();
  }

  auto *AI = Apply.getInstruction();
  SILFunction *SpecializedFunction;
  llvm::SmallVector<std::pair<FullApplySite, FullApplySite>, 4> NewApplyPairs;
  auto Specialized = trySpecializeApplyOfGeneric(ApplySite(AI),
                                                 SpecializedFunction,
                                                 NewApplyPairs);

  if (!Specialized)
    return FullApplySite();

  assert(FullApplySite::isa(Specialized.getInstruction()) &&
         "Expected full apply site!");
  auto SpecializedFullApply = FullApplySite(Specialized.getInstruction());

  // Update the call graph based on the specialization.
  CallGraphEditor Editor(CG);
  if (SpecializedFunction)
    Editor.addCallGraphNode(SpecializedFunction);

  Editor.replaceApplyWithNew(Apply, SpecializedFullApply);

  for (auto NewApply : NewApplyPairs) {
    NewApplies.push_back(NewApply.first);
    Editor.addEdgesForApply(NewApply.first);

    assert(!OriginMap.count(NewApply.first) && "Unexpected apply in map!");
    OriginMap[NewApply.first] = NewApply.second;
  }

  assert(!OriginMap.count(SpecializedFullApply) && "Unexpected apply in map!");
  if (OriginMap.count(Apply))
    OriginMap[SpecializedFullApply] = OriginMap[Apply];
  RemovedApplies.insert(Apply);

  // Replace the old apply with the new and delete the old.
  replaceDeadApply(Apply, SpecializedFullApply.getInstruction());

  return SpecializedFullApply;
}

static void collectAllAppliesInFunction(SILFunction *F,
                              llvm::SmallVectorImpl<FullApplySite> &Applies) {
  assert(Applies.empty() && "Expected empty vector to store into!");

  for (auto &B : *F)
    for (auto &I : B)
      if (auto *AI = dyn_cast<ApplyInst>(&I))
        Applies.push_back(FullApplySite(AI));
}
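// Note on specializeGenericUpdatingCallGraph (illustrative): an apply such as
//
//   %r = apply %f<Int>(%x)            // hypothetical SIL
//
// is rewritten to apply a newly cloned specialization of %f for Int. The
// clone's body can itself contain applies; those are returned in NewApplies
// so the driver below can consider them for further devirtualization,
// specialization and inlining.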
// Devirtualize and specialize a group of applies, updating the call
// graph and returning a worklist of newly exposed function references
// that should be considered for inlining before continuing with the
// caller that has the passed-in applies.
//
// The returned worklist is stacked such that the last things we want
// to process are earlier on the list.
//
// Returns true if any changes were made.
bool SILPerformanceInliner::devirtualizeAndSpecializeApplies(
                              llvm::SmallVectorImpl<FullApplySite> &Applies,
                              CallGraphAnalysis *CGA,
                              SILModuleTransform *MT,
                              llvm::SmallVectorImpl<SILFunction *> &WorkList) {
  assert(WorkList.empty() && "Expected empty worklist for return results!");

  auto &CG = CGA->getCallGraph();
  bool ChangedAny = false;

  // The set of all new function references generated by
  // devirtualization and specialization.
  llvm::SetVector<SILFunction *> NewRefs;

  // Process all applies passed in, plus any new ones that are pushed
  // on as a result of specializing the referenced functions.
  while (!Applies.empty()) {
    auto Apply = Applies.back();
    Applies.pop_back();

    bool ChangedApply = false;
    if (auto NewApply = devirtualizeUpdatingCallGraph(Apply, CG)) {
      ChangedApply = true;

      Apply = NewApply;
    }

    llvm::SmallVector<FullApplySite, 4> NewApplies;
    if (auto NewApply = specializeGenericUpdatingCallGraph(Apply, CG,
                                                           NewApplies)) {
      ChangedApply = true;

      Apply = NewApply;
      Applies.insert(Applies.end(), NewApplies.begin(), NewApplies.end());
    }

    if (ChangedApply) {
      ChangedAny = true;

      auto *NewCallee = getReferencedFunction(Apply);
      assert(NewCallee && "Expected directly referenced function!");

      // Track all new references to function definitions.
      if (NewCallee->isDefinition())
        NewRefs.insert(NewCallee);

      // Invalidate analyses, but lock the call graph since we
      // maintain it, and also indicate that we preserve branches
      // since we've only touched applies.
      CGA->lockInvalidation();
      MT->invalidateAnalysis(Apply.getFunction(),
                             SILAnalysis::PreserveKind::Branches);
      CGA->unlockInvalidation();
    }
  }

  // Copy out all the new function references gathered.
  if (ChangedAny)
    WorkList.insert(WorkList.end(), NewRefs.begin(), NewRefs.end());

  return ChangedAny;
}

void SILPerformanceInliner::collectAppliesToInline(SILFunction *Caller,
                                        SmallVectorImpl<ApplyInst *> &Applies,
                                        DominanceAnalysis *DA,
                                        SILLoopAnalysis *LA, CallGraph &CG) {
  DominanceInfo *DT = DA->get(Caller);
  SILLoopInfo *LI = LA->getLoopInfo(Caller);

  ConstantTracker constTracker(Caller);
  DominanceOrder domOrder(&Caller->front(), DT, Caller->size());

  // Go through all instructions and find candidates for inlining.
  // We do this in dominance order for the constTracker.
  SmallVector<ApplyInst *, 8> InitialCandidates;
  while (SILBasicBlock *block = domOrder.getNext()) {
    constTracker.beginBlock();
    unsigned loopDepth = LI->getLoopDepth(block);
    for (auto I = block->begin(), E = block->end(); I != E; ++I) {
      constTracker.trackInst(&*I);

      auto *AI = dyn_cast<ApplyInst>(&*I);
      if (!AI)
        continue;

      DEBUG(llvm::dbgs() << "    Check:" << *AI);

      auto *Callee = getEligibleFunction(AI);
      if (Callee) {
        if (isProfitableToInline(AI, loopDepth, DA, LA, constTracker))
          InitialCandidates.push_back(AI);
      }
    }
    domOrder.pushChildrenIf(block, [&] (SILBasicBlock *child) {
      if (ColdBlockInfo::isSlowPath(block, child)) {
        // Handle cold blocks separately.
        visitColdBlocks(InitialCandidates, child, DT);
        return false;
      }
      return true;
    });
  }

  // Calculate how many times a callee is called from this caller.
  llvm::DenseMap<SILFunction *, unsigned> CalleeCount;
  for (auto AI : InitialCandidates) {
    SILFunction *Callee = getReferencedFunction(AI);
    assert(Callee && "apply_inst does not have a direct callee anymore");
    CalleeCount[Callee]++;
  }

  // Now copy each candidate callee that has a small enough number of
  // call sites into the final set of call sites.
  for (auto AI : InitialCandidates) {
    SILFunction *Callee = getReferencedFunction(AI);
    assert(Callee && "apply_inst does not have a direct callee anymore");

    const unsigned CallsToCalleeThreshold = 1024;
    if (CalleeCount[Callee] <= CallsToCalleeThreshold)
      Applies.push_back(AI);
  }
}
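// With CallsToCalleeThreshold = 1024 above, a callee that appears at, for
// example, 2000 profitable call sites within one caller is dropped from all
// of them, limiting worst-case code growth in pathological callers.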
/// \brief Attempt to inline all calls smaller than our threshold.
/// Returns true if a function was inlined.
bool SILPerformanceInliner::inlineCallsIntoFunction(SILFunction *Caller,
                              DominanceAnalysis *DA,
                              SILLoopAnalysis *LA, CallGraph &CG,
                              llvm::SmallVectorImpl<FullApplySite> &NewApplies) {
  // Don't optimize functions that are marked with the opt.never attribute.
  if (!Caller->shouldOptimize())
    return false;

  DEBUG(llvm::dbgs() << "Visiting Function: " << Caller->getName() << "\n");

  assert(NewApplies.empty() && "Expected empty vector to store results in!");

  // First step: collect all the functions we want to inline. We
  // don't change anything yet so that the dominator information
  // remains valid.
  SmallVector<ApplyInst *, 8> AppliesToInline;
  collectAppliesToInline(Caller, AppliesToInline, DA, LA, CG);

  if (AppliesToInline.empty())
    return false;

  // Second step: do the actual inlining.
  for (auto AI : AppliesToInline) {
    SILFunction *Callee = getReferencedFunction(AI);
    assert(Callee && "apply_inst does not have a direct callee anymore");

    DEBUG(llvm::dbgs() << "    Inline:" << *AI);

    SmallVector<SILValue, 8> Args;
    for (const auto &Arg : AI->getArguments())
      Args.push_back(Arg);

    FullApplyCollector Collector;

    // Notice that we will skip all of the newly inlined ApplyInsts. That's
    // okay because we will visit them in our next invocation of the inliner.
    TypeSubstitutionMap ContextSubs;
    SILInliner Inliner(*Caller, *Callee,
                       SILInliner::InlineKind::PerformanceInline,
                       ContextSubs, AI->getSubstitutions(),
                       Collector.getCallback());

    auto Success = Inliner.inlineFunction(AI, Args);
    (void) Success;
    // We've already determined we should be able to inline this, so
    // we expect it to have happened.
    assert(Success && "Expected inliner to inline this function!");
    llvm::SmallVector<FullApplySite, 4> AppliesFromInlinee;
    for (auto &P : Collector.getApplyPairs()) {
      AppliesFromInlinee.push_back(P.first);

      // Maintain a mapping for all new applies back to the apply they
      // originated from.
      assert(!OriginMap.count(P.first) && "Did not expect apply to be in map!");
      assert(P.second && "Expected non-null apply site!");
      OriginMap[P.first] = P.second;
    }

    CallGraphEditor Editor(CG);
    Editor.replaceApplyWithNew(AI, AppliesFromInlinee);

    RemovedApplies.insert(AI);
    recursivelyDeleteTriviallyDeadInstructions(AI, true);

    NewApplies.insert(NewApplies.end(), AppliesFromInlinee.begin(),
                      AppliesFromInlinee.end());
    DA->invalidate(Caller, SILAnalysis::PreserveKind::Nothing);
    NumFunctionsInlined++;
  }

  DEBUG(llvm::dbgs() << "\n");
  return true;
}
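// A sketch of the bookkeeping above (names are illustrative): if inlining
// apply A of callee f introduces applies B and C from f's body, then
// OriginMap[B] = A and OriginMap[C] = A, and A is added to RemovedApplies.
// This is the chain that applyTargetsOriginFunction walks to detect
// recursion that only becomes visible after inlining and devirtualization.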
void SILPerformanceInliner::inlineDevirtualizeAndSpecialize(
                                                      SILFunction *Caller,
                                                      SILModuleTransform *MT,
                                                      CallGraphAnalysis *CGA,
                                                      DominanceAnalysis *DA,
                                                      SILLoopAnalysis *LA) {
  assert(Caller->isDefinition() &&
         "Expected only defined functions in the call graph!");

  llvm::SmallVector<SILFunction *, 4> WorkList;
  WorkList.push_back(Caller);

  auto &CG = CGA->getOrBuildCallGraph();
  OriginMap.clear();
  RemovedApplies.clear();

  while (!WorkList.empty()) {
    llvm::SmallVector<FullApplySite, 4> WorkItemApplies;
    collectAllAppliesInFunction(WorkList.back(), WorkItemApplies);

    // Devirtualize and specialize any applies we've collected,
    // and collect new functions we should inline into as we do
    // so.
    llvm::SmallVector<SILFunction *, 4> NewFuncs;
    if (devirtualizeAndSpecializeApplies(WorkItemApplies, CGA, MT, NewFuncs)) {
      WorkList.insert(WorkList.end(), NewFuncs.begin(), NewFuncs.end());
      NewFuncs.clear();
    }
    assert(WorkItemApplies.empty() && "Expected all applies to be processed!");

    // We want to inline into each function on the worklist, starting
    // with any new ones that were exposed as a result of
    // devirtualization (to ensure we're inlining into callees first).
    //
    // After inlining, we may have new opportunities for
    // devirtualization, e.g. as a result of exposing the dynamic type
    // of an object. When those opportunities arise we want to attempt
    // devirtualization and then again attempt to inline into the
    // newly exposed functions, etc. until we're back to the function
    // we began with.
    auto *Initial = WorkList.back();

    // In practice we rarely exceed 5, but in a perf test we iterate 51 times.
    const unsigned MaxLaps = 150;
    unsigned Lap = 0;
    while (1) {
      auto *WorkItem = WorkList.back();
      assert(WorkItem->isDefinition() &&
             "Expected function definition on work list!");

      // Devirtualization and specialization might have exposed new
      // function references. We want to inline within those functions
      // before inlining within our original function.
      //
      // Inlining in turn might result in new applies that we should
      // consider for devirtualization and specialization.
      llvm::SmallVector<FullApplySite, 4> NewApplies;
      if (inlineCallsIntoFunction(WorkItem, DA, LA, CG, NewApplies)) {
        // Invalidate analyses, but lock the call graph since we
        // maintain it.
        CGA->lockInvalidation();
        MT->invalidateAnalysis(WorkItem, SILAnalysis::PreserveKind::Nothing);
        CGA->unlockInvalidation();

        // FIXME: Update inlineCallsIntoFunction to collect all
        //        remaining applies after inlining, not just those
        //        resulting from inlining code.
        llvm::SmallVector<FullApplySite, 4> WorkItemApplies;
        collectAllAppliesInFunction(WorkItem, WorkItemApplies);

        if (devirtualizeAndSpecializeApplies(WorkItemApplies, CGA, MT,
                                             NewFuncs)) {
          WorkList.insert(WorkList.end(), NewFuncs.begin(), NewFuncs.end());
          NewFuncs.clear();
          assert(WorkItemApplies.empty() &&
                 "Expected all applies to be processed!");
        }
      } else if (WorkItem == Initial) {
        break;
      } else {
        WorkList.pop_back();
      }

      Lap++;
      if (Lap > MaxLaps)
        // It's possible to construct real code where this will hit, but
        // it's more likely that there is an issue tracking recursive
        // inlining, in which case we want to know about it in internal
        // builds, and not hang on bots or user machines.
        assert(Lap <= MaxLaps && "Possible bug tracking recursion!");

      // Give up and move along.
      if (Lap > MaxLaps) {
        while (WorkList.back() != Initial)
          WorkList.pop_back();
        break;
      }
    }

    assert(WorkList.back() == Initial &&
           "Expected to exit with same element on top of stack!");
    WorkList.pop_back();
  }
}

// Find functions in cold blocks which are forced to be inlined.
// All other functions are not inlined in cold blocks.
void SILPerformanceInliner::visitColdBlocks(
                                SmallVectorImpl<ApplyInst *> &AppliesToInline,
                                SILBasicBlock *Root, DominanceInfo *DT) {
  DominanceOrder domOrder(Root, DT);
  while (SILBasicBlock *block = domOrder.getNext()) {
    for (SILInstruction &I : *block) {
      ApplyInst *AI = dyn_cast<ApplyInst>(&I);
      if (!AI)
        continue;

      auto *Callee = getEligibleFunction(AI);
      if (Callee && isProfitableInColdBlock(Callee)) {
        DEBUG(llvm::dbgs() << "    inline in cold block:" << *AI);
        AppliesToInline.push_back(AI);
      }
    }
    domOrder.pushChildren(block);
  }
}
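// In effect, only callees marked @inline(__always) or whose entire body is
// within TrivialFunctionThreshold (20) instructions are inlined into cold
// (slow-path) blocks; everything else is kept out to keep cold code small.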
//===----------------------------------------------------------------------===//
//                          Performance Inliner Pass
//===----------------------------------------------------------------------===//

namespace {
class SILPerformanceInlinerPass : public SILModuleTransform {
  /// Specifies which functions not to inline, based on @_semantics and
  /// global_init attributes.
  InlineSelection WhatToInline;
  std::string PassName;

public:
  SILPerformanceInlinerPass(InlineSelection WhatToInline, StringRef LevelName):
    WhatToInline(WhatToInline), PassName(LevelName) {
    PassName.append(" Performance Inliner");
  }

  void run() override {
    CallGraphAnalysis *CGA = PM->getAnalysis<CallGraphAnalysis>();
    DominanceAnalysis *DA = PM->getAnalysis<DominanceAnalysis>();
    SILLoopAnalysis *LA = PM->getAnalysis<SILLoopAnalysis>();

    if (getOptions().InlineThreshold == 0) {
      DEBUG(llvm::dbgs() << "*** The Performance Inliner is disabled ***\n");
      return;
    }

    SILPerformanceInliner Inliner(getOptions().InlineThreshold,
                                  WhatToInline);

    auto &CG = CGA->getOrBuildCallGraph();
    auto &BottomUpFunctions = CG.getBottomUpFunctionOrder();

    // Copy the bottom-up function list into a worklist.
    llvm::SmallVector<SILFunction *, 32> WorkList;
    // FIXME: std::reverse_copy would be better, but it crashes.
    for (auto I = BottomUpFunctions.rbegin(), E = BottomUpFunctions.rend();
         I != E; ++I)
      WorkList.push_back(*I);

    // Inline functions bottom-up, starting from the leaves.
    while (!WorkList.empty()) {
      Inliner.inlineDevirtualizeAndSpecialize(WorkList.back(), this, CGA, DA,
                                              LA);
      WorkList.pop_back();
    }
  }

  StringRef getName() override { return PassName; }
};
} // end anonymous namespace

/// Create an inliner pass that does not inline functions that are marked with
/// the @_semantics, @effects or global_init attributes.
SILTransform *swift::createEarlyInliner() {
  return new SILPerformanceInlinerPass(
    InlineSelection::NoSemanticsAndGlobalInit, "Early");
}

/// Create an inliner pass that does not inline functions that are marked with
/// the global_init attribute or have an "availability" semantics attribute.
SILTransform *swift::createPerfInliner() {
  return new SILPerformanceInlinerPass(InlineSelection::NoGlobalInit, "Middle");
}

/// Create an inliner pass that inlines all functions that are marked with
/// the @_semantics, @effects or global_init attributes.
SILTransform *swift::createLateInliner() {
  return new SILPerformanceInlinerPass(InlineSelection::Everything, "Late");
}
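// For orientation (a sketch of the intended pipeline, not enforced here):
// the "Early" inliner leaves @_semantics, @effects and global_init functions
// alone so that higher-level optimizations can still recognize them, the
// "Middle" inliner only continues to protect global_init and "availability"
// semantics calls, and the "Late" inliner inlines everything that remains.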