mirror of https://github.com/apple/swift.git
synced 2025-12-21 12:14:44 +01:00
1563 lines · 53 KiB · C++
//===--- PerformanceInliner.cpp - Basic cost based performance inlining ---===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2016 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See http://swift.org/LICENSE.txt for license information
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "sil-inliner"
|
|
#include "swift/SIL/SILInstruction.h"
|
|
#include "swift/SIL/Dominance.h"
|
|
#include "swift/SIL/SILModule.h"
|
|
#include "swift/SIL/Projection.h"
|
|
#include "swift/SIL/InstructionUtils.h"
|
|
#include "swift/SILOptimizer/Analysis/ColdBlockInfo.h"
|
|
#include "swift/SILOptimizer/Analysis/DominanceAnalysis.h"
|
|
#include "swift/SILOptimizer/Analysis/FunctionOrder.h"
|
|
#include "swift/SILOptimizer/Analysis/LoopAnalysis.h"
|
|
#include "swift/SILOptimizer/Analysis/ArraySemantic.h"
|
|
#include "swift/SILOptimizer/PassManager/Passes.h"
|
|
#include "swift/SILOptimizer/PassManager/Transforms.h"
|
|
#include "swift/SILOptimizer/Utils/Local.h"
|
|
#include "swift/SILOptimizer/Utils/ConstantFolding.h"
|
|
#include "swift/SILOptimizer/Utils/SILInliner.h"
|
|
#include "llvm/ADT/SetVector.h"
|
|
#include "llvm/ADT/SmallVector.h"
|
|
#include "llvm/ADT/Statistic.h"
|
|
#include "llvm/Support/Debug.h"
|
|
#include "llvm/Support/CommandLine.h"
|
|
#include "llvm/ADT/MapVector.h"
|
|
#include <functional>
|
|
|
|
|
|
using namespace swift;

// Counts how many functions were inlined by this pass (-stats output).
STATISTIC(NumFunctionsInlined, "Number of functions inlined");

// Debugging aid: when set, the ShortestPathAnalysis results are printed.
llvm::cl::opt<bool> PrintShortestPathInfo(
    "print-shortest-path-info", llvm::cl::init(false),
    llvm::cl::desc("Print shortest-path information for inlining"));
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// ConstantTracker
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// Tracks constants in the caller and callee to get an estimation of what
// values get constant if the callee is inlined.
// This can be seen as a "simulation" of several optimizations: SROA, mem2reg
// and constant propagation.
// Note that this is only a simplified model and not correct in all cases.
// For example aliasing information is not taken into account.
class ConstantTracker {

  // Represents a value in integer constant evaluation.
  struct IntConst {
    IntConst() : isValid(false), isFromCaller(false) { }

    IntConst(const APInt &value, bool isFromCaller) :
      value(value), isValid(true), isFromCaller(isFromCaller) { }

    // The actual value.
    APInt value;

    // True if the value is valid, i.e. could be evaluated to a constant.
    bool isValid;

    // True if the value is only valid, because a constant is passed to the
    // callee. False if constant propagation could do the same job inside the
    // callee without inlining it.
    bool isFromCaller;
  };

  // Links between loaded and stored values.
  // The key is a load instruction, the value is the corresponding store
  // instruction which stores the loaded value. Both, key and value can also
  // be copy_addr instructions.
  llvm::DenseMap<SILInstruction *, SILInstruction *> links;

  // The current stored values at memory addresses.
  // The key is the base address of the memory (after skipping address
  // projections). The value are store (or copy_addr) instructions, which
  // store the current value.
  // This is only an estimation, because e.g. it does not consider potential
  // aliasing.
  llvm::DenseMap<SILValue, SILInstruction *> memoryContent;

  // Cache for evaluated constants, keyed by the builtin instruction.
  llvm::SmallDenseMap<BuiltinInst *, IntConst> constCache;

  // The caller/callee function which is tracked.
  SILFunction *F;

  // The constant tracker of the caller function (null if this is the
  // tracker of the callee).
  ConstantTracker *callerTracker;

  // The apply instruction in the caller (null if this is the tracker of the
  // callee).
  FullApplySite AI;

  // Walks through address projections and (optionally) collects them.
  // Returns the base address, i.e. the first address which is not a
  // projection.
  SILValue scanProjections(SILValue addr,
                           SmallVectorImpl<Projection> *Result = nullptr);

  // Get the stored value for a load. The loadInst can be either a real load
  // or a copy_addr.
  SILValue getStoredValue(SILInstruction *loadInst,
                          ProjectionPath &projStack);

  // Gets the parameter in the caller for a function argument, or a null
  // SILValue if \p value is not a function argument of the tracked callee.
  SILValue getParam(SILValue value) {
    if (SILArgument *arg = dyn_cast<SILArgument>(value)) {
      if (AI && arg->isFunctionArg() && arg->getFunction() == F) {
        // Continue at the caller.
        return AI.getArgument(arg->getIndex());
      }
    }
    return SILValue();
  }

  // Returns the store (or copy_addr) which defines the current content of
  // the memory at \p addr, or null if unknown.
  SILInstruction *getMemoryContent(SILValue addr) {
    // The memory content can be stored in this ConstantTracker or in the
    // caller's ConstantTracker.
    SILInstruction *storeInst = memoryContent[addr];
    if (storeInst)
      return storeInst;
    if (callerTracker)
      return callerTracker->getMemoryContent(addr);
    return nullptr;
  }

  // Gets the estimated definition of a value.
  SILInstruction *getDef(SILValue val, ProjectionPath &projStack);

  // Gets the estimated integer constant result of a builtin.
  IntConst getBuiltinConst(BuiltinInst *BI, int depth);

public:

  // Constructor for the caller function.
  ConstantTracker(SILFunction *function) :
    F(function), callerTracker(nullptr), AI()
  { }

  // Constructor for the callee function.
  ConstantTracker(SILFunction *function, ConstantTracker *caller,
                  FullApplySite callerApply) :
    F(function), callerTracker(caller), AI(callerApply)
  { }

  void beginBlock() {
    // Currently we don't do any sophisticated dataflow analysis, so we keep
    // the memoryContent alive only for a single block.
    memoryContent.clear();
  }

  // Must be called for each instruction visited in dominance order.
  void trackInst(SILInstruction *inst);

  // Gets the estimated definition of a value.
  SILInstruction *getDef(SILValue val) {
    ProjectionPath projStack(val->getType());
    return getDef(val, projStack);
  }

  // Gets the estimated definition of a value if it is in the caller.
  SILInstruction *getDefInCaller(SILValue val) {
    SILInstruction *def = getDef(val);
    if (def && def->getFunction() != F)
      return def;
    return nullptr;
  }

  // Returns true if \p val maps (through the caller's arguments) to an
  // alloc_stack address in the caller.
  bool isStackAddrInCaller(SILValue val) {
    if (SILValue Param = getParam(stripAddressProjections(val))) {
      return isa<AllocStackInst>(stripAddressProjections(Param));
    }
    return false;
  }

  // Gets the estimated integer constant of a value.
  IntConst getIntConst(SILValue val, int depth = 0);

  // Returns the taken successor block of \p term if its condition can be
  // evaluated to a constant, null otherwise.
  SILBasicBlock *getTakenBlock(TermInst *term);
};
|
|
|
|
// Records the effect of \p inst on the simulated memory state: loads are
// linked to the store which produced the loaded value, stores update the
// current memory content. Must be called in dominance order (see header).
void ConstantTracker::trackInst(SILInstruction *inst) {
  if (auto *LI = dyn_cast<LoadInst>(inst)) {
    // Link the load to the store which currently defines the loaded memory.
    SILValue baseAddr = scanProjections(LI->getOperand());
    if (SILInstruction *loadLink = getMemoryContent(baseAddr))
      links[LI] = loadLink;
  } else if (StoreInst *SI = dyn_cast<StoreInst>(inst)) {
    // Operand 1 of a store is the destination address.
    SILValue baseAddr = scanProjections(SI->getOperand(1));
    memoryContent[baseAddr] = SI;
  } else if (CopyAddrInst *CAI = dyn_cast<CopyAddrInst>(inst)) {
    if (!CAI->isTakeOfSrc()) {
      // Treat a copy_addr as a load + store
      SILValue loadAddr = scanProjections(CAI->getOperand(0));
      if (SILInstruction *loadLink = getMemoryContent(loadAddr)) {
        links[CAI] = loadLink;
        SILValue storeAddr = scanProjections(CAI->getOperand(1));
        memoryContent[storeAddr] = CAI;
      }
    }
  }
}
|
|
|
|
// Walks through address projections (and through caller arguments) and
// optionally collects the projections into \p Result.
// Returns the base address, i.e. the first address which is not a projection.
SILValue ConstantTracker::scanProjections(SILValue addr,
                                          SmallVectorImpl<Projection> *Result) {
  for (;;) {
    if (Projection::isAddressProjection(addr)) {
      SILInstruction *I = cast<SILInstruction>(addr);
      if (Result) {
        Result->push_back(Projection(I));
      }
      // Step to the operand the projection is based on.
      addr = I->getOperand(0);
      continue;
    }
    if (SILValue param = getParam(addr)) {
      // Go to the caller.
      addr = param;
      continue;
    }
    // Return the base address = the first address which is not a projection.
    return addr;
  }
}
|
|
|
|
// Returns the value which was stored at the memory location read by
// \p loadInst (a load or copy_addr), following the previously recorded
// load->store links. The address projections of the load and the store must
// match up on \p projStack; otherwise a null SILValue is returned.
SILValue ConstantTracker::getStoredValue(SILInstruction *loadInst,
                                         ProjectionPath &projStack) {
  SILInstruction *store = links[loadInst];
  if (!store && callerTracker)
    store = callerTracker->links[loadInst];
  if (!store) return SILValue();

  assert(isa<LoadInst>(loadInst) || isa<CopyAddrInst>(loadInst));

  // Push the address projections of the load onto the stack.
  SmallVector<Projection, 4> loadProjections;
  scanProjections(loadInst->getOperand(0), &loadProjections);
  for (const Projection &proj : loadProjections) {
    projStack.push_back(proj);
  }

  // Pop the address projections of the store from the stack.
  SmallVector<Projection, 4> storeProjections;
  scanProjections(store->getOperand(1), &storeProjections);
  for (auto iter = storeProjections.rbegin(); iter != storeProjections.rend();
       ++iter) {
    const Projection &proj = *iter;
    // The corresponding load-projection must match the store-projection.
    if (projStack.empty() || projStack.back() != proj)
      return SILValue();
    projStack.pop_back();
  }

  // For a plain store, operand 0 is the stored value.
  if (isa<StoreInst>(store))
    return store->getOperand(0);

  // The copy_addr instruction is both a load and a store. So we follow the link
  // again.
  assert(isa<CopyAddrInst>(store));
  return getStoredValue(store, projStack);
}
|
|
|
|
// Get the aggregate member based on the top of the projection stack.
|
|
static SILValue getMember(SILInstruction *inst, ProjectionPath &projStack) {
|
|
if (!projStack.empty()) {
|
|
const Projection &proj = projStack.back();
|
|
return proj.getOperandForAggregate(inst);
|
|
}
|
|
return SILValue();
|
|
}
|
|
|
|
// Returns the instruction which (by estimation) defines \p val, looking
// through projections, aggregate construction, loads and thin_to_thick casts,
// and following function arguments into the caller. \p projStack tracks the
// currently "open" object projections. Returns null if no defining
// instruction can be found.
SILInstruction *ConstantTracker::getDef(SILValue val,
                                        ProjectionPath &projStack) {

  // Track the value up the dominator tree.
  for (;;) {
    if (SILInstruction *inst = dyn_cast<SILInstruction>(val)) {
      if (Projection::isObjectProjection(inst)) {
        // Extract a member from a struct/tuple/enum.
        projStack.push_back(Projection(inst));
        val = inst->getOperand(0);
        continue;
      } else if (SILValue member = getMember(inst, projStack)) {
        // The opposite of a projection instruction: composing a struct/tuple.
        projStack.pop_back();
        val = member;
        continue;
      } else if (SILValue loadedVal = getStoredValue(inst, projStack)) {
        // A value loaded from memory.
        val = loadedVal;
        continue;
      } else if (isa<ThinToThickFunctionInst>(inst)) {
        // Look through the function-type conversion.
        val = inst->getOperand(0);
        continue;
      }
      return inst;
    } else if (SILValue param = getParam(val)) {
      // Continue in the caller.
      val = param;
      continue;
    }
    return nullptr;
  }
}
|
|
|
|
// Constant-folds the result of builtin \p BI if all of its (relevant)
// arguments can be evaluated to integer constants. The result is marked as
// isFromCaller if any folded operand came from the caller. Returns an
// invalid IntConst if the builtin cannot be folded.
ConstantTracker::IntConst ConstantTracker::getBuiltinConst(BuiltinInst *BI, int depth) {
  const BuiltinInfo &Builtin = BI->getBuiltinInfo();
  OperandValueArrayRef Args = BI->getArguments();
  switch (Builtin.ID) {
    default: break;

    // Fold comparison predicates.
#define BUILTIN(id, name, Attrs)
#define BUILTIN_BINARY_PREDICATE(id, name, attrs, overload) \
case BuiltinValueKind::id:
#include "swift/AST/Builtins.def"
      {
        IntConst lhs = getIntConst(Args[0], depth);
        IntConst rhs = getIntConst(Args[1], depth);
        if (lhs.isValid && rhs.isValid) {
          return IntConst(constantFoldComparison(lhs.value, rhs.value,
                                                 Builtin.ID),
                          lhs.isFromCaller || rhs.isFromCaller);
        }
        break;
      }

    case BuiltinValueKind::SAddOver:
    case BuiltinValueKind::UAddOver:
    case BuiltinValueKind::SSubOver:
    case BuiltinValueKind::USubOver:
    case BuiltinValueKind::SMulOver:
    case BuiltinValueKind::UMulOver: {
      IntConst lhs = getIntConst(Args[0], depth);
      IntConst rhs = getIntConst(Args[1], depth);
      if (lhs.isValid && rhs.isValid) {
        // The overflow flag is not needed for the folded value itself.
        bool IgnoredOverflow;
        return IntConst(constantFoldBinaryWithOverflow(lhs.value, rhs.value,
                          IgnoredOverflow,
                          getLLVMIntrinsicIDForBuiltinWithOverflow(Builtin.ID)),
                        lhs.isFromCaller || rhs.isFromCaller);
      }
      break;
    }

    case BuiltinValueKind::SDiv:
    case BuiltinValueKind::SRem:
    case BuiltinValueKind::UDiv:
    case BuiltinValueKind::URem: {
      IntConst lhs = getIntConst(Args[0], depth);
      IntConst rhs = getIntConst(Args[1], depth);
      // Never fold a division by zero.
      if (lhs.isValid && rhs.isValid && rhs.value != 0) {
        bool IgnoredOverflow;
        return IntConst(constantFoldDiv(lhs.value, rhs.value,
                                        IgnoredOverflow, Builtin.ID),
                        lhs.isFromCaller || rhs.isFromCaller);
      }
      break;
    }

    case BuiltinValueKind::And:
    case BuiltinValueKind::AShr:
    case BuiltinValueKind::LShr:
    case BuiltinValueKind::Or:
    case BuiltinValueKind::Shl:
    case BuiltinValueKind::Xor: {
      IntConst lhs = getIntConst(Args[0], depth);
      IntConst rhs = getIntConst(Args[1], depth);
      if (lhs.isValid && rhs.isValid) {
        return IntConst(constantFoldBitOperation(lhs.value, rhs.value,
                                                 Builtin.ID),
                        lhs.isFromCaller || rhs.isFromCaller);
      }
      break;
    }

    case BuiltinValueKind::Trunc:
    case BuiltinValueKind::ZExt:
    case BuiltinValueKind::SExt:
    case BuiltinValueKind::TruncOrBitCast:
    case BuiltinValueKind::ZExtOrBitCast:
    case BuiltinValueKind::SExtOrBitCast: {
      IntConst val = getIntConst(Args[0], depth);
      if (val.isValid) {
        return IntConst(constantFoldCast(val.value, Builtin), val.isFromCaller);
      }
      break;
    }
  }
  return IntConst();
}
|
|
|
|
// Tries to evaluate the integer constant of a value. The \p depth is used
|
|
// to limit the complexity.
|
|
ConstantTracker::IntConst ConstantTracker::getIntConst(SILValue val, int depth) {
|
|
|
|
// Don't spend too much time with constant evaluation.
|
|
if (depth >= 10)
|
|
return IntConst();
|
|
|
|
SILInstruction *I = getDef(val);
|
|
if (!I)
|
|
return IntConst();
|
|
|
|
if (auto *IL = dyn_cast<IntegerLiteralInst>(I)) {
|
|
return IntConst(IL->getValue(), IL->getFunction() != F);
|
|
}
|
|
if (auto *BI = dyn_cast<BuiltinInst>(I)) {
|
|
if (constCache.count(BI) != 0)
|
|
return constCache[BI];
|
|
|
|
IntConst builtinConst = getBuiltinConst(BI, depth + 1);
|
|
constCache[BI] = builtinConst;
|
|
return builtinConst;
|
|
}
|
|
return IntConst();
|
|
}
|
|
|
|
// Returns the taken block of a terminator instruction if the condition turns
// out to be constant. Only returns a block when the constant originates in
// the caller (isFromCaller), i.e. when inlining would actually enable the
// branch folding. Returns null if nothing can be decided.
SILBasicBlock *ConstantTracker::getTakenBlock(TermInst *term) {
  if (CondBranchInst *CBI = dyn_cast<CondBranchInst>(term)) {
    IntConst condConst = getIntConst(CBI->getCondition());
    if (condConst.isFromCaller) {
      return condConst.value != 0 ? CBI->getTrueBB() : CBI->getFalseBB();
    }
    return nullptr;
  }
  if (SwitchValueInst *SVI = dyn_cast<SwitchValueInst>(term)) {
    IntConst switchConst = getIntConst(SVI->getOperand());
    if (switchConst.isFromCaller) {
      for (unsigned Idx = 0; Idx < SVI->getNumCases(); ++Idx) {
        auto switchCase = SVI->getCase(Idx);
        if (auto *IL = dyn_cast<IntegerLiteralInst>(switchCase.first)) {
          if (switchConst.value == IL->getValue())
            return switchCase.second;
        } else {
          // A non-literal case value: give up on the whole switch.
          return nullptr;
        }
      }
      // No case matched: the default block is taken, if there is one.
      if (SVI->hasDefault())
        return SVI->getDefaultBB();
    }
    return nullptr;
  }
  if (SwitchEnumInst *SEI = dyn_cast<SwitchEnumInst>(term)) {
    // Fold a switch_enum if the operand is an enum construction in the caller.
    if (SILInstruction *def = getDefInCaller(SEI->getOperand())) {
      if (EnumInst *EI = dyn_cast<EnumInst>(def)) {
        for (unsigned Idx = 0; Idx < SEI->getNumCases(); ++Idx) {
          auto enumCase = SEI->getCase(Idx);
          if (enumCase.first == EI->getElement())
            return enumCase.second;
        }
        if (SEI->hasDefault())
          return SEI->getDefaultBB();
      }
    }
    return nullptr;
  }
  if (CheckedCastBranchInst *CCB = dyn_cast<CheckedCastBranchInst>(term)) {
    // Fold a checked_cast_br if the operand is an upcast in the caller whose
    // source type statically decides the cast.
    if (SILInstruction *def = getDefInCaller(CCB->getOperand())) {
      if (UpcastInst *UCI = dyn_cast<UpcastInst>(def)) {
        SILType castType = UCI->getOperand()->getType();
        if (CCB->getCastType().isExactSuperclassOf(castType)) {
          return CCB->getSuccessBB();
        }
        if (!castType.isBindableToSuperclassOf(CCB->getCastType())) {
          return CCB->getFailureBB();
        }
      }
    }
  }
  return nullptr;
}
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// Shortest path analysis
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
/// Computes the length of the shortest path through a block in a scope.
///
/// A scope is either a loop or the whole function. The "length" is an
/// estimation of the execution time. For example if the shortest path of a
/// block B is N, it means that the execution time of the scope (either
/// function or a single loop iteration), when execution includes the block, is
/// at least N, regardless of what branches are taken.
///
/// The shortest path of a block is the sum of the shortest path from the scope
/// entry and the shortest path to the scope exit.
class ShortestPathAnalysis {
public:
  enum {
    /// We assume that cold blocks take very long to execute.
    ColdBlockLength = 1000,

    /// Our assumption on how many times a loop is executed.
    LoopCount = 10,

    /// To keep things simple we only analyse up to this number of nested loops.
    MaxNumLoopLevels = 4,

    /// The "weight" for the benefit which a single loop nest gives.
    SingleLoopWeight = 4,

    /// Pretty large but small enough to add something without overflowing.
    InitialDist = (1 << 29)
  };

  /// A weight for an inlining benefit.
  class Weight {
    /// How long does it take to execute the scope (either loop or the whole
    /// function) of the call to inline? The larger it is the less important it
    /// is to inline.
    int ScopeLength;

    /// Represents the loop nest. The larger it is the more important it is to
    /// inline.
    int LoopWeight;

    friend class ShortestPathAnalysis;

  public:
    Weight(int ScopeLength, int LoopWeight) : ScopeLength(ScopeLength),
                                              LoopWeight(LoopWeight) { }

    Weight(const Weight &RHS, int AdditionalLoopWeight) :
      Weight(RHS.ScopeLength, RHS.LoopWeight + AdditionalLoopWeight) { }

    // Default-constructed weights are invalid (negative scope length).
    Weight() : ScopeLength(-1), LoopWeight(0) {
      assert(!isValid());
    }

    bool isValid() const { return ScopeLength >= 0; }

    /// Updates the \p Benefit by weighting the \p Importance.
    void updateBenefit(int &Benefit, int Importance) const {
      assert(isValid());
      int newBenefit = 0;

      // Use some heuristics. The basic idea is: length is bad, loops are good.
      if (ScopeLength > 320) {
        newBenefit = Importance;
      } else if (ScopeLength > 160) {
        newBenefit = Importance + LoopWeight * 4;
      } else if (ScopeLength > 80) {
        newBenefit = Importance + LoopWeight * 8;
      } else if (ScopeLength > 40) {
        newBenefit = Importance + LoopWeight * 12;
      } else if (ScopeLength > 20) {
        newBenefit = Importance + LoopWeight * 16;
      } else {
        newBenefit = Importance + 20 + LoopWeight * 16;
      }
      // We don't accumulate the benefit instead we max it.
      if (newBenefit > Benefit)
        Benefit = newBenefit;
    }

#ifndef NDEBUG
    friend raw_ostream &operator<<(raw_ostream &os, const Weight &W) {
      os << W.LoopWeight << '/' << W.ScopeLength;
      return os;
    }
#endif
  };

private:
  /// The distances for a block in its scope (either loop or function).
  struct Distances {
    /// The shortest distance from the scope entry to the block entry.
    int DistFromEntry = InitialDist;

    /// The shortest distance from the block entry to the scope exit.
    int DistToExit = InitialDist;

    /// Additional length to account for loop iterations. It's != 0 for loop
    /// headers (or loop predecessors).
    int LoopHeaderLength = 0;
  };

  /// Holds the distance information for a block in all its scopes, i.e. in all
  /// its containing loops and the function itself.
  struct BlockInfo {
  private:
    /// Distances for all scopes. Element 0 refers to the function, elements
    /// > 0 to loops.
    Distances Dists[MaxNumLoopLevels];
  public:
    /// The length of the block itself.
    int Length = 0;

    /// Returns the distances for the loop with nesting level \p LoopDepth.
    Distances &getDistances(int LoopDepth) {
      assert(LoopDepth >= 0 && LoopDepth < MaxNumLoopLevels);
      return Dists[LoopDepth];
    }

    /// Returns the length including the LoopHeaderLength for a given
    /// \p LoopDepth.
    int getLength(int LoopDepth) {
      return Length + getDistances(LoopDepth).LoopHeaderLength;
    }

    /// Returns the length of the shortest path of this block in the loop with
    /// nesting level \p LoopDepth.
    int getScopeLength(int LoopDepth) {
      const Distances &D = getDistances(LoopDepth);
      return D.DistFromEntry + D.DistToExit;
    }
  };

  // The analyzed function and its loop information.
  SILFunction *F;
  SILLoopInfo *LI;
  // Mapping from block to its BlockInfo; storage is owned by
  // BlockInfoStorage (one entry per block, stable addresses).
  llvm::DenseMap<const SILBasicBlock *, BlockInfo *> BlockInfos;
  std::vector<BlockInfo> BlockInfoStorage;

  BlockInfo *getBlockInfo(const SILBasicBlock *BB) {
    BlockInfo *BI = BlockInfos[BB];
    assert(BI);
    return BI;
  }

  // Utility functions to make the template solveDataFlow compilable for a block
  // list containing references _and_ a list containing pointers.

  const SILBasicBlock *getBlock(const SILBasicBlock *BB) { return BB; }
  const SILBasicBlock *getBlock(const SILBasicBlock &BB) { return &BB; }

  /// Returns the minimum distance from all predecessor blocks of \p BB.
  int getEntryDistFromPreds(const SILBasicBlock *BB, int LoopDepth);

  /// Returns the minimum distance from all successor blocks of \p BB.
  int getExitDistFromSuccs(const SILBasicBlock *BB, int LoopDepth);

  /// Computes the distances by solving the dataflow problem for all \p Blocks
  /// in a scope. Iterates forward (entry distances) and backward (exit
  /// distances) until a fixed point is reached.
  template <typename BlockList>
  void solveDataFlow(const BlockList &Blocks, int LoopDepth) {
    bool Changed = false;

    // Compute the distances from the entry block.
    do {
      Changed = false;
      for (auto &Block : Blocks) {
        const SILBasicBlock *BB = getBlock(Block);
        BlockInfo *BBInfo = getBlockInfo(BB);
        int DistFromEntry = getEntryDistFromPreds(BB, LoopDepth);
        Distances &BBDists = BBInfo->getDistances(LoopDepth);
        if (DistFromEntry < BBDists.DistFromEntry) {
          BBDists.DistFromEntry = DistFromEntry;
          Changed = true;
        }
      }
    } while (Changed);

    // Compute the distances to the exit block.
    do {
      Changed = false;
      for (auto &Block : reverse(Blocks)) {
        const SILBasicBlock *BB = getBlock(Block);
        BlockInfo *BBInfo = getBlockInfo(BB);
        int DistToExit =
          getExitDistFromSuccs(BB, LoopDepth) + BBInfo->getLength(LoopDepth);
        Distances &BBDists = BBInfo->getDistances(LoopDepth);
        if (DistToExit < BBDists.DistToExit) {
          BBDists.DistToExit = DistToExit;
          Changed = true;
        }
      }
    } while (Changed);
  }

  /// Analyze \p Loop and all its inner loops.
  void analyzeLoopsRecursively(SILLoop *Loop, int LoopDepth);

  void printFunction(llvm::raw_ostream &OS);

  void printLoop(llvm::raw_ostream &OS, SILLoop *Loop, int LoopDepth);

  void printBlockInfo(llvm::raw_ostream &OS, SILBasicBlock *BB, int LoopDepth);

public:
  ShortestPathAnalysis(SILFunction *F, SILLoopInfo *LI) : F(F), LI(LI) { }

  // True once analyze() has run.
  bool isValid() const { return !BlockInfos.empty(); }

  /// Compute the distances. The function \p getApplyLength returns the length
  /// of a function call.
  template <typename Func>
  void analyze(ColdBlockInfo &CBI, Func getApplyLength) {
    assert(!isValid());

    BlockInfoStorage.resize(F->size());

    // First step: compute the length of the blocks.
    unsigned BlockIdx = 0;
    for (SILBasicBlock &BB : *F) {
      int Length = 0;
      if (CBI.isCold(&BB)) {
        Length = ColdBlockLength;
      } else {
        for (SILInstruction &I : BB) {
          if (auto FAS = FullApplySite::isa(&I)) {
            Length += getApplyLength(FAS);
          } else {
            Length += (int)instructionInlineCost(I);
          }
        }
      }
      BlockInfo *BBInfo = &BlockInfoStorage[BlockIdx++];
      BlockInfos[&BB] = BBInfo;
      BBInfo->Length = Length;

      // Initialize the distances for the entry and exit blocks, used when
      // computing the distances for the function itself.
      if (&BB == &F->front())
        BBInfo->getDistances(0).DistFromEntry = 0;

      if (isa<ReturnInst>(BB.getTerminator()))
        BBInfo->getDistances(0).DistToExit = Length;
      else if (isa<ThrowInst>(BB.getTerminator()))
        // Throwing exits are penalized like cold blocks.
        BBInfo->getDistances(0).DistToExit = Length + ColdBlockLength;
    }
    // Compute the distances for all loops in the function.
    for (SILLoop *Loop : *LI) {
      analyzeLoopsRecursively(Loop, 1);
    }
    // Compute the distances for the function itself.
    solveDataFlow(F->getBlocks(), 0);
  }

  /// Returns the length of the shortest path of the block \p BB in the loop
  /// with nesting level \p LoopDepth. If \p LoopDepth is 0 it returns the
  /// shortest path in the function.
  int getScopeLength(SILBasicBlock *BB, int LoopDepth) {
    assert(BB->getParent() == F);
    if (LoopDepth >= MaxNumLoopLevels)
      LoopDepth = MaxNumLoopLevels - 1;
    return getBlockInfo(BB)->getScopeLength(LoopDepth);
  }

  /// Returns the weight of block \p BB also considering the \p CallerWeight
  /// which is the weight of the call site's block in the caller.
  Weight getWeight(SILBasicBlock *BB, Weight CallerWeight);

  void dump();
};
|
|
|
|
int ShortestPathAnalysis::getEntryDistFromPreds(const SILBasicBlock *BB,
|
|
int LoopDepth) {
|
|
int MinDist = InitialDist;
|
|
for (SILBasicBlock *Pred : BB->getPreds()) {
|
|
BlockInfo *PredInfo = getBlockInfo(Pred);
|
|
Distances &PDists = PredInfo->getDistances(LoopDepth);
|
|
int DistFromEntry = PDists.DistFromEntry + PredInfo->Length +
|
|
PDists.LoopHeaderLength;
|
|
assert(DistFromEntry >= 0);
|
|
if (DistFromEntry < MinDist)
|
|
MinDist = DistFromEntry;
|
|
}
|
|
return MinDist;
|
|
}
|
|
|
|
int ShortestPathAnalysis::getExitDistFromSuccs(const SILBasicBlock *BB,
|
|
int LoopDepth) {
|
|
int MinDist = InitialDist;
|
|
for (const SILSuccessor &Succ : BB->getSuccessors()) {
|
|
BlockInfo *SuccInfo = getBlockInfo(Succ);
|
|
Distances &SDists = SuccInfo->getDistances(LoopDepth);
|
|
if (SDists.DistToExit < MinDist)
|
|
MinDist = SDists.DistToExit;
|
|
}
|
|
return MinDist;
|
|
}
|
|
|
|
/// Detect an edge from the loop pre-header's predecessor to the loop exit
/// block. Such an edge "short-cuts" a loop if it is never iterated. But usually
/// it is the less frequent case and we want to ignore it.
/// E.g. it handles the case of N==0 for
///     for i in 0..<N { ... }
/// If the \p Loop has such an edge the source block of this edge is returned,
/// which is the predecessor of the loop pre-header.
static SILBasicBlock *detectLoopBypassPreheader(SILLoop *Loop) {
  SILBasicBlock *Pred = Loop->getLoopPreheader();
  if (!Pred)
    return nullptr;

  SILBasicBlock *PredPred = Pred->getSinglePredecessor();
  if (!PredPred)
    return nullptr;

  // The pre-pre-header must end in a conditional branch: one side enters the
  // loop (via the pre-header), the other side bypasses it.
  auto *CBR = dyn_cast<CondBranchInst>(PredPred->getTerminator());
  if (!CBR)
    return nullptr;

  SILBasicBlock *Succ = (CBR->getTrueBB() == Pred ? CBR->getFalseBB() :
                                                    CBR->getTrueBB());

  // The bypass target must also be reachable from inside the loop, i.e. it
  // must be (or be behind) a loop exit.
  for (SILBasicBlock *PredOfSucc : Succ->getPreds()) {
    SILBasicBlock *Exiting = PredOfSucc->getSinglePredecessor();
    if (!Exiting)
      Exiting = PredOfSucc;
    if (Loop->contains(Exiting))
      return PredPred;
  }
  return nullptr;
}
|
|
|
|
/// Computes the distance information for \p Loop (at nesting level
/// \p LoopDepth) and all of its inner loops, then accounts the loop's full
/// estimated length (LoopCount * iteration length) in the parent scope via
/// the header's (or bypass pre-pre-header's) LoopHeaderLength.
void ShortestPathAnalysis::analyzeLoopsRecursively(SILLoop *Loop, int LoopDepth) {
  if (LoopDepth >= MaxNumLoopLevels)
    return;

  // First dive into the inner loops.
  for (SILLoop *SubLoop : Loop->getSubLoops()) {
    analyzeLoopsRecursively(SubLoop, LoopDepth + 1);
  }

  BlockInfo *HeaderInfo = getBlockInfo(Loop->getHeader());
  Distances &HeaderDists = HeaderInfo->getDistances(LoopDepth);

  // Initial values for the entry (== header) and exit-predecessor (== header as
  // well).
  HeaderDists.DistFromEntry = 0;
  HeaderDists.DistToExit = 0;

  solveDataFlow(Loop->getBlocks(), LoopDepth);

  // Length of one loop iteration: back to the header and through it.
  int LoopLength = getExitDistFromSuccs(Loop->getHeader(), LoopDepth) +
                   HeaderInfo->getLength(LoopDepth);
  HeaderDists.DistToExit = LoopLength;

  // If there is a loop bypass edge, add the loop length to the loop pre-pre-
  // header instead to the header. This actually let us ignore the loop bypass
  // edge in the length calculation for the loop's parent scope.
  if (SILBasicBlock *Bypass = detectLoopBypassPreheader(Loop))
    HeaderInfo = getBlockInfo(Bypass);

  // Add the full loop length (= assumed-iteration-count * length) to the loop
  // header so that it is considered in the parent scope.
  HeaderInfo->getDistances(LoopDepth - 1).LoopHeaderLength =
    LoopCount * LoopLength;
}
|
|
|
|
/// Returns the weight of block \p BB, combining its scope length with the
/// loop weights of all loops it is nested in, and folding in the
/// \p CallerWeight (the weight of the call site's block in the caller).
ShortestPathAnalysis::Weight ShortestPathAnalysis::
getWeight(SILBasicBlock *BB, Weight CallerWeight) {
  assert(BB->getParent() == F);

  SILLoop *Loop = LI->getLoopFor(BB);
  if (!Loop) {
    // We are not in a loop. So just account the length of our function scope
    // in to the length of the CallerWeight.
    return Weight(CallerWeight.ScopeLength + getScopeLength(BB, 0),
                  CallerWeight.LoopWeight);
  }
  int LoopDepth = Loop->getLoopDepth();
  // Deal with the corner case of having more than 4 nested loops.
  while (LoopDepth >= MaxNumLoopLevels) {
    --LoopDepth;
    Loop = Loop->getParentLoop();
  }
  Weight W(getScopeLength(BB, LoopDepth), SingleLoopWeight);

  // Add weights for all the loops BB is in.
  while (Loop) {
    assert(LoopDepth > 0);
    BlockInfo *HeaderInfo = getBlockInfo(Loop->getHeader());
    int InnerLoopLength = HeaderInfo->getScopeLength(LoopDepth) *
                          ShortestPathAnalysis::LoopCount;
    int OuterLoopWeight = SingleLoopWeight;
    int OuterScopeLength = HeaderInfo->getScopeLength(LoopDepth - 1);

    // Reaching the outermost loop, we use the CallerWeight to get the outer
    // length+loopweight.
    if (LoopDepth == 1) {
      // If the apply in the caller is not in a significant loop, just stop with
      // what we have now.
      if (CallerWeight.LoopWeight < 4)
        return W;

      // If this function is part of the caller's scope length take the caller's
      // scope length. Note: this is not the case e.g. if the apply is in a
      // then-branch of an if-then-else in the caller and the else-branch is
      // the short path.
      if (CallerWeight.ScopeLength > OuterScopeLength)
        OuterScopeLength = CallerWeight.ScopeLength;
      OuterLoopWeight = CallerWeight.LoopWeight;
    }
    assert(OuterScopeLength >= InnerLoopLength);

    // If the current loop is only a small part of its outer loop, we don't
    // take the outer loop that much into account. Only if the current loop is
    // actually the "main part" in the outer loop we add the full loop weight
    // for the outer loop.
    if (OuterScopeLength < InnerLoopLength * 2) {
      W.LoopWeight += OuterLoopWeight - 1;
    } else if (OuterScopeLength < InnerLoopLength * 3) {
      W.LoopWeight += OuterLoopWeight - 2;
    } else if (OuterScopeLength < InnerLoopLength * 4) {
      W.LoopWeight += OuterLoopWeight - 3;
    } else {
      return W;
    }
    --LoopDepth;
    Loop = Loop->getParentLoop();
  }
  assert(LoopDepth == 0);
  return W;
}
|
|
|
|
/// Debug helper: print the shortest-path information for the whole function
/// to stderr.
void ShortestPathAnalysis::dump() {
  printFunction(llvm::errs());
}
|
|
|
|
/// Print the shortest-path info for the analyzed function: a header with the
/// function name, the per-block info at scope level 0, and then each
/// top-level loop recursively.
void ShortestPathAnalysis::printFunction(llvm::raw_ostream &OS) {
  OS << "SPA @" << F->getName() << "\n";
  for (SILBasicBlock &Block : *F)
    printBlockInfo(OS, &Block, 0);
  for (SILLoop *TopLevelLoop : *LI)
    printLoop(OS, TopLevelLoop, 1);
}
|
|
|
|
/// Recursively print the block info of \p Loop and all of its sub-loops.
/// Loops nested at MaxNumLoopLevels or deeper are skipped, because no
/// per-level distance data is maintained for them.
void ShortestPathAnalysis::printLoop(llvm::raw_ostream &OS, SILLoop *Loop,
                                     int LoopDepth) {
  if (LoopDepth >= MaxNumLoopLevels)
    return;
  assert(LoopDepth == (int)Loop->getLoopDepth());

  OS << "Loop bb" << Loop->getHeader()->getDebugID() << ":\n";
  for (SILBasicBlock *Block : Loop->getBlocks())
    printBlockInfo(OS, Block, LoopDepth);
  for (SILLoop *Inner : Loop->getSubLoops())
    printLoop(OS, Inner, LoopDepth + 1);
}
|
|
|
|
/// Print one line of shortest-path data for \p BB at loop level \p LoopDepth:
/// the block's own length plus its loop-header length, and the distances from
/// the scope entry and to the scope exit.
void ShortestPathAnalysis::printBlockInfo(llvm::raw_ostream &OS,
                                          SILBasicBlock *BB, int LoopDepth) {
  BlockInfo *Info = getBlockInfo(BB);
  Distances &Dist = Info->getDistances(LoopDepth);
  OS << "  bb" << BB->getDebugID() << ": length=" << Info->Length << '+'
     << Dist.LoopHeaderLength << ", d-entry=" << Dist.DistFromEntry
     << ", d-exit=" << Dist.DistToExit << '\n';
}
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// Performance Inliner
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
namespace {

/// Controls the decision to inline functions with @_semantics, @effects and
/// global_init attributes.
enum class InlineSelection {
  Everything,
  NoGlobalInit, // and no availability semantics calls
  NoSemanticsAndGlobalInit
};

using Weight = ShortestPathAnalysis::Weight;

/// The heuristic-driven performance inliner. Collects inlining candidates in
/// a caller (collectAppliesToInline) based on a cost/benefit model and a
/// shortest-path analysis, then performs the actual inlining
/// (inlineCallsIntoFunction).
class SILPerformanceInliner {
  /// Specifies which functions not to inline, based on @_semantics and
  /// global_init attributes.
  InlineSelection WhatToInline;

  DominanceAnalysis *DA;
  SILLoopAnalysis *LA;

  // For keys of SILFunction and SILLoop.
  llvm::DenseMap<SILFunction *, ShortestPathAnalysis *> SPAs;
  // Owns the ShortestPathAnalysis objects stored in SPAs.
  llvm::SpecificBumpPtrAllocator<ShortestPathAnalysis> SPAAllocator;

  ColdBlockInfo CBI;

  /// The following constants define the cost model for inlining. Some constants
  /// are also defined in ShortestPathAnalysis.
  enum {
    /// The base value for every call: it represents the benefit of removing the
    /// call overhead itself.
    RemovedCallBenefit = 20,

    /// The benefit if the operand of an apply gets constant, e.g. if a closure
    /// is passed to an apply instruction in the callee.
    RemovedClosureBenefit = RemovedCallBenefit + 50,

    /// The benefit if a load can (probably) be eliminated because it loads from
    /// a stack location in the caller.
    RemovedLoadBenefit = RemovedCallBenefit + 5,

    /// The benefit if a store can (probably) be eliminated because it stores to
    /// a stack location in the caller.
    RemovedStoreBenefit = RemovedCallBenefit + 10,

    /// The benefit if the condition of a terminator instruction gets constant
    /// due to inlining.
    RemovedTerminatorBenefit = RemovedCallBenefit + 10,

    /// The benefit if a retain/release can (probably) be eliminated after
    /// inlining.
    RefCountBenefit = RemovedCallBenefit + 20,

    /// Approximately up to this cost level a function can be inlined without
    /// increasing the code size.
    TrivialFunctionThreshold = 18,

    /// Configuration for the caller block limit.
    BlockLimitDenominator = 10000,

    /// The assumed execution length of a function call.
    DefaultApplyLength = 10
  };

#ifndef NDEBUG
  SILFunction *LastPrintedCaller = nullptr;
  /// Print the caller's name, but only when it differs from the last printed
  /// one, to reduce noise in the debug output.
  void dumpCaller(SILFunction *Caller) {
    if (Caller != LastPrintedCaller) {
      llvm::dbgs() << "\nInline into caller: " << Caller->getName() << '\n';
      LastPrintedCaller = Caller;
    }
  }
#endif

  /// Return the cached ShortestPathAnalysis for \p F, lazily creating it on
  /// first use. The created objects are owned by SPAAllocator.
  ShortestPathAnalysis *getSPA(SILFunction *F, SILLoopInfo *LI) {
    ShortestPathAnalysis *&SPA = SPAs[F];
    if (!SPA) {
      SPA = new (SPAAllocator.Allocate()) ShortestPathAnalysis(F, LI);
    }
    return SPA;
  }

  SILFunction *getEligibleFunction(FullApplySite AI);

  bool isProfitableToInline(FullApplySite AI,
                            Weight CallerWeight,
                            ConstantTracker &constTracker,
                            int &NumCallerBlocks);

  bool isProfitableInColdBlock(FullApplySite AI, SILFunction *Callee);

  void visitColdBlocks(SmallVectorImpl<FullApplySite> &AppliesToInline,
                       SILBasicBlock *root, DominanceInfo *DT);

  void collectAppliesToInline(SILFunction *Caller,
                              SmallVectorImpl<FullApplySite> &Applies);

public:
  SILPerformanceInliner(InlineSelection WhatToInline, DominanceAnalysis *DA,
                        SILLoopAnalysis *LA)
      : WhatToInline(WhatToInline), DA(DA), LA(LA), CBI(DA) {}

  bool inlineCallsIntoFunction(SILFunction *F);
};

} // namespace
|
|
|
|
// Return true if the callee has self-recursive calls.
|
|
static bool calleeIsSelfRecursive(SILFunction *Callee) {
|
|
for (auto &BB : *Callee)
|
|
for (auto &I : BB)
|
|
if (auto Apply = FullApplySite::isa(&I))
|
|
if (Apply.getReferencedFunction() == Callee)
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
// Returns the callee of an apply_inst if it is basically inlineable.
|
|
SILFunction *SILPerformanceInliner::getEligibleFunction(FullApplySite AI) {
|
|
|
|
SILFunction *Callee = AI.getReferencedFunction();
|
|
|
|
if (!Callee) {
|
|
return nullptr;
|
|
}
|
|
|
|
// Don't inline functions that are marked with the @_semantics or @effects
|
|
// attribute if the inliner is asked not to inline them.
|
|
if (Callee->hasSemanticsAttrs() || Callee->hasEffectsKind()) {
|
|
if (WhatToInline == InlineSelection::NoSemanticsAndGlobalInit) {
|
|
return nullptr;
|
|
}
|
|
// The "availability" semantics attribute is treated like global-init.
|
|
if (Callee->hasSemanticsAttrs() &&
|
|
WhatToInline != InlineSelection::Everything &&
|
|
Callee->hasSemanticsAttrThatStartsWith("availability")) {
|
|
return nullptr;
|
|
}
|
|
} else if (Callee->isGlobalInit()) {
|
|
if (WhatToInline != InlineSelection::Everything) {
|
|
return nullptr;
|
|
}
|
|
}
|
|
|
|
// We can't inline external declarations.
|
|
if (Callee->empty() || Callee->isExternalDeclaration()) {
|
|
return nullptr;
|
|
}
|
|
|
|
// Explicitly disabled inlining.
|
|
if (Callee->getInlineStrategy() == NoInline) {
|
|
return nullptr;
|
|
}
|
|
|
|
if (!Callee->shouldOptimize()) {
|
|
return nullptr;
|
|
}
|
|
|
|
// We don't support this yet.
|
|
if (AI.hasSubstitutions()) {
|
|
return nullptr;
|
|
}
|
|
|
|
// We don't support inlining a function that binds dynamic self because we
|
|
// have no mechanism to preserve the original function's local self metadata.
|
|
if (computeMayBindDynamicSelf(Callee)) {
|
|
return nullptr;
|
|
}
|
|
|
|
SILFunction *Caller = AI.getFunction();
|
|
|
|
// Detect self-recursive calls.
|
|
if (Caller == Callee) {
|
|
return nullptr;
|
|
}
|
|
|
|
// A non-fragile function may not be inlined into a fragile function.
|
|
if (Caller->isFragile() && !Callee->isFragile()) {
|
|
return nullptr;
|
|
}
|
|
|
|
// Inlining self-recursive functions into other functions can result
|
|
// in excessive code duplication since we run the inliner multiple
|
|
// times in our pipeline
|
|
if (calleeIsSelfRecursive(Callee)) {
|
|
return nullptr;
|
|
}
|
|
|
|
return Callee;
|
|
}
|
|
|
|
/// Return true if inlining this call site is profitable.
///
/// Walks the callee's blocks in dominance order, summing an inlining cost per
/// instruction and accumulating benefits for operations that likely become
/// optimizable after inlining (constant closures, stack loads/stores from the
/// caller, refcount operations on guaranteed arguments, foldable terminators).
///
/// \p CallerWeight is the weight of the apply site in the caller.
/// \p callerTracker is the caller's constant tracker; it is used as the
///    parent of the callee's tracker so constants flowing into the callee
///    can be followed.
/// \p NumCallerBlocks is the caller's current block count; on a positive
///    decision it is increased by the callee's block count.
bool SILPerformanceInliner::isProfitableToInline(FullApplySite AI,
                                                 Weight CallerWeight,
                                                 ConstantTracker &callerTracker,
                                                 int &NumCallerBlocks) {
  SILFunction *Callee = AI.getReferencedFunction();

  // @inline(__always) overrides the cost model entirely.
  if (Callee->getInlineStrategy() == AlwaysInline)
    return true;

  SILLoopInfo *LI = LA->get(Callee);
  ShortestPathAnalysis *SPA = getSPA(Callee, LI);
  assert(SPA->isValid());

  // Track constants inside the callee, chained to the caller's tracker via
  // the apply site's arguments.
  ConstantTracker constTracker(Callee, &callerTracker, AI);
  DominanceInfo *DT = DA->get(Callee);
  SILBasicBlock *CalleeEntry = &Callee->front();
  DominanceOrder domOrder(CalleeEntry, DT, Callee->size());

  // Calculate the inlining cost of the callee.
  int CalleeCost = 0;
  int Benefit = 0;

  // Start with a base benefit.
  int BaseBenefit = RemovedCallBenefit;
  const SILOptions &Opts = Callee->getModule().getOptions();

  // For some reason -Ounchecked can accept a higher base benefit without
  // increasing the code size too much.
  if (Opts.Optimization == SILOptions::SILOptMode::OptimizeUnchecked)
    BaseBenefit *= 2;

  CallerWeight.updateBenefit(Benefit, BaseBenefit);

  // Go through all blocks of the function, accumulate the cost and find
  // benefits.
  while (SILBasicBlock *block = domOrder.getNext()) {
    constTracker.beginBlock();
    Weight BlockW = SPA->getWeight(block, CallerWeight);

    for (SILInstruction &I : *block) {
      constTracker.trackInst(&I);

      CalleeCost += (int)instructionInlineCost(I);

      if (FullApplySite AI = FullApplySite::isa(&I)) {
        // Check if the callee is passed as an argument. If so, increase the
        // threshold, because inlining will (probably) eliminate the closure.
        SILInstruction *def = constTracker.getDefInCaller(AI.getCallee());
        if (def && (isa<FunctionRefInst>(def) || isa<PartialApplyInst>(def)))
          BlockW.updateBenefit(Benefit, RemovedClosureBenefit);
      } else if (auto *LI = dyn_cast<LoadInst>(&I)) {
        // Check if it's a load from a stack location in the caller. Such a load
        // might be optimized away if inlined.
        if (constTracker.isStackAddrInCaller(LI->getOperand()))
          BlockW.updateBenefit(Benefit, RemovedLoadBenefit);
      } else if (auto *SI = dyn_cast<StoreInst>(&I)) {
        // Check if it's a store to a stack location in the caller. Such a
        // store might be optimized away if inlined.
        if (constTracker.isStackAddrInCaller(SI->getDest()))
          BlockW.updateBenefit(Benefit, RemovedStoreBenefit);
      } else if (isa<StrongReleaseInst>(&I) || isa<ReleaseValueInst>(&I)) {
        // A release of a guaranteed function argument pairs with a retain
        // in the caller and can (probably) be eliminated after inlining.
        SILValue Op = stripCasts(I.getOperand(0));
        if (SILArgument *Arg = dyn_cast<SILArgument>(Op)) {
          if (Arg->isFunctionArg() && Arg->getArgumentConvention() ==
              SILArgumentConvention::Direct_Guaranteed) {
            BlockW.updateBenefit(Benefit, RefCountBenefit);
          }
        }
      }
    }
    // Don't count costs in blocks which are dead after inlining.
    SILBasicBlock *takenBlock = constTracker.getTakenBlock(block->getTerminator());
    if (takenBlock) {
      BlockW.updateBenefit(Benefit, RemovedTerminatorBenefit);
      // Only visit the taken successor; skip children that are reachable
      // solely through this (folded) terminator.
      domOrder.pushChildrenIf(block, [=] (SILBasicBlock *child) {
        return child->getSinglePredecessor() != block || child == takenBlock;
      });
    } else {
      domOrder.pushChildren(block);
    }
  }

  if (AI.getFunction()->isThunk()) {
    // Only inline trivial functions into thunks (which will not increase the
    // code size).
    if (CalleeCost > TrivialFunctionThreshold)
      return false;

    DEBUG(

      dumpCaller(AI.getFunction());
      llvm::dbgs() << " decision {" << CalleeCost << " into thunk} " <<
          Callee->getName() << '\n';
    );
    return true;
  }

  // We reduce the benefit if the caller is too large. For this we use a
  // cubic function on the number of caller blocks. This starts to prevent
  // inlining at about 800 - 1000 caller blocks.
  int blockMinus =
    (NumCallerBlocks * NumCallerBlocks) / BlockLimitDenominator *
        NumCallerBlocks / BlockLimitDenominator;
  Benefit -= blockMinus;

  // This is the final inlining decision.
  if (CalleeCost > Benefit) {
    return false;
  }

  NumCallerBlocks += Callee->size();

  DEBUG(
    dumpCaller(AI.getFunction());
    llvm::dbgs() << " decision {c=" << CalleeCost << ", b=" << Benefit <<
        ", l=" << SPA->getScopeLength(CalleeEntry, 0) <<
        ", c-w=" << CallerWeight << ", bb=" << Callee->size() <<
        ", c-bb=" << NumCallerBlocks << "} " << Callee->getName() << '\n';
  );
  return true;
}
|
|
|
|
/// Return true if inlining this call site into a cold block is profitable.
|
|
bool SILPerformanceInliner::isProfitableInColdBlock(FullApplySite AI,
|
|
SILFunction *Callee) {
|
|
if (Callee->getInlineStrategy() == AlwaysInline)
|
|
return true;
|
|
|
|
int CalleeCost = 0;
|
|
|
|
for (SILBasicBlock &Block : *Callee) {
|
|
for (SILInstruction &I : Block) {
|
|
CalleeCost += int(instructionInlineCost(I));
|
|
if (CalleeCost > TrivialFunctionThreshold)
|
|
return false;
|
|
}
|
|
}
|
|
DEBUG(
|
|
dumpCaller(AI.getFunction());
|
|
llvm::dbgs() << " cold decision {" << CalleeCost << "} " <<
|
|
Callee->getName() << '\n';
|
|
);
|
|
return true;
|
|
}
|
|
|
|
/// Record additional weight increases.
|
|
///
|
|
/// Why can't we just add the weight when we call isProfitableToInline? Because
|
|
/// the additional weight is for _another_ function than the current handled
|
|
/// callee.
|
|
static void addWeightCorrection(FullApplySite FAS,
|
|
llvm::DenseMap<FullApplySite, int> &WeightCorrections) {
|
|
SILFunction *Callee = FAS.getReferencedFunction();
|
|
if (Callee && Callee->hasSemanticsAttr("array.uninitialized")) {
|
|
// We want to inline the argument to an array.uninitialized call, because
|
|
// this argument is most likely a call to a function which contains the
|
|
// buffer allocation for the array. It is essential to inline it for stack
|
|
// promotion of the array buffer.
|
|
SILValue BufferArg = FAS.getArgument(0);
|
|
SILValue Base = stripValueProjections(stripCasts(BufferArg));
|
|
if (auto BaseApply = FullApplySite::isa(Base))
|
|
WeightCorrections[BaseApply] += 6;
|
|
}
|
|
}
|
|
|
|
/// Collect into \p Applies all full apply sites in \p Caller which are
/// eligible and profitable to inline, in dominance order. Candidates whose
/// callee appears more than CallsToCalleeThreshold times among the candidates
/// are dropped to limit code growth. Cold (slow-path) blocks are handled
/// separately by visitColdBlocks.
void SILPerformanceInliner::collectAppliesToInline(
    SILFunction *Caller, SmallVectorImpl<FullApplySite> &Applies) {
  DominanceInfo *DT = DA->get(Caller);
  SILLoopInfo *LI = LA->get(Caller);

  llvm::DenseMap<FullApplySite, int> WeightCorrections;

  // Compute the shortest-path analysis for the caller.
  ShortestPathAnalysis *SPA = getSPA(Caller, LI);
  SPA->analyze(CBI, [&](FullApplySite FAS) -> int {

    // This closure returns the length of a called function.

    // At this occasion we record additional weight increases.
    addWeightCorrection(FAS, WeightCorrections);

    if (SILFunction *Callee = getEligibleFunction(FAS)) {
      // Compute the shortest-path analysis for the callee.
      SILLoopInfo *CalleeLI = LA->get(Callee);
      ShortestPathAnalysis *CalleeSPA = getSPA(Callee, CalleeLI);
      if (!CalleeSPA->isValid()) {
        CalleeSPA->analyze(CBI, [](FullApplySite FAS) {
          // We don't compute SPA for another call-level. Functions called from
          // the callee are assumed to have DefaultApplyLength.
          return DefaultApplyLength;
        });
      }
      int CalleeLength = CalleeSPA->getScopeLength(&Callee->front(), 0);
      // Just in case the callee is a noreturn function.
      if (CalleeLength >= ShortestPathAnalysis::InitialDist)
        return DefaultApplyLength;
      return CalleeLength;
    }
    // Some unknown function.
    return DefaultApplyLength;
  });

#ifndef NDEBUG
  if (PrintShortestPathInfo) {
    SPA->dump();
  }
#endif

  ConstantTracker constTracker(Caller);
  DominanceOrder domOrder(&Caller->front(), DT, Caller->size());
  int NumCallerBlocks = (int)Caller->size();

  // Go through all instructions and find candidates for inlining.
  // We do this in dominance order for the constTracker.
  SmallVector<FullApplySite, 8> InitialCandidates;
  while (SILBasicBlock *block = domOrder.getNext()) {
    constTracker.beginBlock();
    // Lazily computed below, only when the block actually contains an
    // eligible apply.
    Weight BlockWeight;

    for (auto I = block->begin(), E = block->end(); I != E; ++I) {
      constTracker.trackInst(&*I);

      if (!FullApplySite::isa(&*I))
        continue;

      FullApplySite AI = FullApplySite(&*I);

      auto *Callee = getEligibleFunction(AI);
      if (Callee) {
        if (!BlockWeight.isValid())
          BlockWeight = SPA->getWeight(block, Weight(0, 0));

        // The actual weight including a possible weight correction.
        Weight W(BlockWeight, WeightCorrections.lookup(AI));

        if (isProfitableToInline(AI, W, constTracker, NumCallerBlocks))
          InitialCandidates.push_back(AI);
      }
    }
    domOrder.pushChildrenIf(block, [&] (SILBasicBlock *child) {
      if (CBI.isSlowPath(block, child)) {
        // Handle cold blocks separately.
        visitColdBlocks(InitialCandidates, child, DT);
        return false;
      }
      return true;
    });
  }

  // Calculate how many times a callee is called from this caller.
  llvm::DenseMap<SILFunction *, unsigned> CalleeCount;
  for (auto AI : InitialCandidates) {
    SILFunction *Callee = AI.getReferencedFunction();
    assert(Callee && "apply_inst does not have a direct callee anymore");
    CalleeCount[Callee]++;
  }

  // Now copy each candidate callee that has a small enough number of
  // call sites into the final set of call sites.
  for (auto AI : InitialCandidates) {
    SILFunction *Callee = AI.getReferencedFunction();
    assert(Callee && "apply_inst does not have a direct callee anymore");

    const unsigned CallsToCalleeThreshold = 1024;
    if (CalleeCount[Callee] <= CallsToCalleeThreshold)
      Applies.push_back(AI);
  }
}
|
|
|
|
/// \brief Attempt to inline all calls smaller than our threshold.
/// Returns true if a function was inlined.
///
/// Candidates are collected first (while the dominator information is still
/// valid) and only then inlined, so the CFG is not mutated during analysis.
bool SILPerformanceInliner::inlineCallsIntoFunction(SILFunction *Caller) {
  // Don't optimize functions that are marked with the opt.never attribute.
  if (!Caller->shouldOptimize())
    return false;

  // First step: collect all the functions we want to inline. We
  // don't change anything yet so that the dominator information
  // remains valid.
  SmallVector<FullApplySite, 8> AppliesToInline;
  collectAppliesToInline(Caller, AppliesToInline);

  if (AppliesToInline.empty())
    return false;

  // Second step: do the actual inlining.
  for (auto AI : AppliesToInline) {
    SILFunction *Callee = AI.getReferencedFunction();
    assert(Callee && "apply_inst does not have a direct callee anymore");

    if (!Callee->shouldOptimize()) {
      continue;
    }

    // Capture the apply arguments before the apply is consumed by inlining.
    SmallVector<SILValue, 8> Args;
    for (const auto &Arg : AI.getArguments())
      Args.push_back(Arg);

    DEBUG(
      dumpCaller(Caller);
      llvm::dbgs() << " inline [" << Callee->size() << "->" <<
          Caller->size() << "] " << Callee->getName() << "\n";
    );

    // Notice that we will skip all of the newly inlined ApplyInsts. That's
    // okay because we will visit them in our next invocation of the inliner.
    TypeSubstitutionMap ContextSubs;
    SILInliner Inliner(*Caller, *Callee,
                       SILInliner::InlineKind::PerformanceInline, ContextSubs,
                       AI.getSubstitutions());

    auto Success = Inliner.inlineFunction(AI, Args);
    (void) Success;
    // We've already determined we should be able to inline this, so
    // we expect it to have happened.
    assert(Success && "Expected inliner to inline this function!");

    // Clean up the (now dead) apply and anything it transitively leaves dead.
    recursivelyDeleteTriviallyDeadInstructions(AI.getInstruction(), true);

    NumFunctionsInlined++;
  }

  return true;
}
|
|
|
|
// Find functions in cold blocks which are forced to be inlined.
|
|
// All other functions are not inlined in cold blocks.
|
|
void SILPerformanceInliner::visitColdBlocks(
|
|
SmallVectorImpl<FullApplySite> &AppliesToInline, SILBasicBlock *Root,
|
|
DominanceInfo *DT) {
|
|
DominanceOrder domOrder(Root, DT);
|
|
while (SILBasicBlock *block = domOrder.getNext()) {
|
|
for (SILInstruction &I : *block) {
|
|
ApplyInst *AI = dyn_cast<ApplyInst>(&I);
|
|
if (!AI)
|
|
continue;
|
|
|
|
auto *Callee = getEligibleFunction(AI);
|
|
if (Callee && isProfitableInColdBlock(AI, Callee)) {
|
|
AppliesToInline.push_back(AI);
|
|
}
|
|
}
|
|
domOrder.pushChildren(block);
|
|
}
|
|
}
|
|
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// Performance Inliner Pass
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
namespace {
/// The function pass driving SILPerformanceInliner. One instance exists per
/// optimization-pipeline stage (Early/Middle/Late), differing only in which
/// attributed functions it refuses to inline.
class SILPerformanceInlinerPass : public SILFunctionTransform {
  /// Specifies which functions not to inline, based on @_semantics and
  /// global_init attributes.
  InlineSelection WhatToInline;
  std::string PassName;

public:
  SILPerformanceInlinerPass(InlineSelection WhatToInline, StringRef LevelName):
    WhatToInline(WhatToInline), PassName(LevelName) {
    PassName.append(" Performance Inliner");
  }

  void run() override {
    DominanceAnalysis *DA = PM->getAnalysis<DominanceAnalysis>();
    SILLoopAnalysis *LA = PM->getAnalysis<SILLoopAnalysis>();

    // An InlineThreshold of 0 disables inlining entirely.
    if (getOptions().InlineThreshold == 0) {
      return;
    }

    SILPerformanceInliner Inliner(WhatToInline, DA, LA);

    assert(getFunction()->isDefinition() &&
           "Expected only functions with bodies!");

    // Inline things into this function, and if we do so invalidate
    // analyses for this function and restart the pipeline so that we
    // can further optimize this function before attempting to inline
    // in it again.
    if (Inliner.inlineCallsIntoFunction(getFunction())) {
      invalidateAnalysis(SILAnalysis::InvalidationKind::FunctionBody);
      restartPassPipeline();
    }
  }

  StringRef getName() override { return PassName; }
};
} // end anonymous namespace
|
|
|
|
/// Create an inliner pass that does not inline functions that are marked with
|
|
/// the @_semantics, @effects or global_init attributes.
|
|
SILTransform *swift::createEarlyInliner() {
|
|
return new SILPerformanceInlinerPass(
|
|
InlineSelection::NoSemanticsAndGlobalInit, "Early");
|
|
}
|
|
|
|
/// Create an inliner pass that does not inline functions that are marked with
|
|
/// the global_init attribute or have an "availability" semantics attribute.
|
|
SILTransform *swift::createPerfInliner() {
|
|
return new SILPerformanceInlinerPass(InlineSelection::NoGlobalInit, "Middle");
|
|
}
|
|
|
|
/// Create an inliner pass that inlines all functions that are marked with
|
|
/// the @_semantics, @effects or global_init attributes.
|
|
SILTransform *swift::createLateInliner() {
|
|
return new SILPerformanceInlinerPass(InlineSelection::Everything, "Late");
|
|
}
|