mirror of https://github.com/apple/swift.git
synced 2025-12-21 12:14:44 +01:00
1563 lines · 53 KiB · C++
//===--- PerformanceInliner.cpp - Basic cost based performance inlining ---===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2016 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See http://swift.org/LICENSE.txt for license information
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "sil-inliner"
|
|
#include "swift/SIL/SILInstruction.h"
|
|
#include "swift/SIL/Dominance.h"
|
|
#include "swift/SIL/SILModule.h"
|
|
#include "swift/SIL/Projection.h"
|
|
#include "swift/SIL/InstructionUtils.h"
|
|
#include "swift/SILOptimizer/Analysis/ColdBlockInfo.h"
|
|
#include "swift/SILOptimizer/Analysis/DominanceAnalysis.h"
|
|
#include "swift/SILOptimizer/Analysis/FunctionOrder.h"
|
|
#include "swift/SILOptimizer/Analysis/LoopAnalysis.h"
|
|
#include "swift/SILOptimizer/Analysis/ArraySemantic.h"
|
|
#include "swift/SILOptimizer/PassManager/Passes.h"
|
|
#include "swift/SILOptimizer/PassManager/Transforms.h"
|
|
#include "swift/SILOptimizer/Utils/Local.h"
|
|
#include "swift/SILOptimizer/Utils/ConstantFolding.h"
|
|
#include "swift/SILOptimizer/Utils/SILInliner.h"
|
|
#include "llvm/ADT/SetVector.h"
|
|
#include "llvm/ADT/SmallVector.h"
|
|
#include "llvm/ADT/Statistic.h"
|
|
#include "llvm/Support/Debug.h"
|
|
#include "llvm/Support/CommandLine.h"
|
|
#include "llvm/ADT/MapVector.h"
|
|
#include <functional>
|
|
|
|
|
|
using namespace swift;

// Counts how many functions were inlined by this pass (-stats output).
STATISTIC(NumFunctionsInlined, "Number of functions inlined");

// Debugging aid: when set, the ShortestPathAnalysis results are printed.
llvm::cl::opt<bool> PrintShortestPathInfo(
    "print-shortest-path-info", llvm::cl::init(false),
    llvm::cl::desc("Print shortest-path information for inlining"));
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// ConstantTracker
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// Tracks constants in the caller and callee to get an estimation of what
// values get constant if the callee is inlined.
// This can be seen as a "simulation" of several optimizations: SROA, mem2reg
// and constant propagation.
// Note that this is only a simplified model and not correct in all cases.
// For example aliasing information is not taken into account.
class ConstantTracker {

  // Represents a value in integer constant evaluation.
  struct IntConst {
    IntConst() : isValid(false), isFromCaller(false) { }

    IntConst(const APInt &value, bool isFromCaller) :
      value(value), isValid(true), isFromCaller(isFromCaller) { }

    // The actual value.
    APInt value;

    // True if the value is valid, i.e. could be evaluated to a constant.
    bool isValid;

    // True if the value is only valid, because a constant is passed to the
    // callee. False if constant propagation could do the same job inside the
    // callee without inlining it.
    bool isFromCaller;
  };

  // Links between loaded and stored values.
  // The key is a load instruction, the value is the corresponding store
  // instruction which stores the loaded value. Both, key and value can also
  // be copy_addr instructions.
  llvm::DenseMap<SILInstruction *, SILInstruction *> links;

  // The current stored values at memory addresses.
  // The key is the base address of the memory (after skipping address
  // projections). The value are store (or copy_addr) instructions, which
  // store the current value.
  // This is only an estimation, because e.g. it does not consider potential
  // aliasing.
  llvm::DenseMap<SILValue, SILInstruction *> memoryContent;

  // Cache for evaluated constants, keyed by the builtin instruction.
  llvm::SmallDenseMap<BuiltinInst *, IntConst> constCache;

  // The caller/callee function which is tracked.
  SILFunction *F;

  // The constant tracker of the caller function (null if this is the
  // tracker of the callee).
  ConstantTracker *callerTracker;

  // The apply instruction in the caller (null if this is the tracker of the
  // callee).
  FullApplySite AI;

  // Walks through address projections and (optionally) collects them.
  // Returns the base address, i.e. the first address which is not a
  // projection.
  SILValue scanProjections(SILValue addr,
                           SmallVectorImpl<Projection> *Result = nullptr);

  // Get the stored value for a load. The loadInst can be either a real load
  // or a copy_addr.
  SILValue getStoredValue(SILInstruction *loadInst,
                          ProjectionPath &projStack);

  // Gets the parameter in the caller for a function argument, or a null
  // SILValue if \p value is not a function argument of the tracked callee.
  SILValue getParam(SILValue value) {
    if (SILArgument *arg = dyn_cast<SILArgument>(value)) {
      if (AI && arg->isFunctionArg() && arg->getFunction() == F) {
        // Continue at the caller.
        return AI.getArgument(arg->getIndex());
      }
    }
    return SILValue();
  }

  // Returns the store (or copy_addr) which defines the current content of
  // the memory at \p addr, or null if unknown.
  SILInstruction *getMemoryContent(SILValue addr) {
    // The memory content can be stored in this ConstantTracker or in the
    // caller's ConstantTracker.
    SILInstruction *storeInst = memoryContent[addr];
    if (storeInst)
      return storeInst;
    if (callerTracker)
      return callerTracker->getMemoryContent(addr);
    return nullptr;
  }

  // Gets the estimated definition of a value.
  SILInstruction *getDef(SILValue val, ProjectionPath &projStack);

  // Gets the estimated integer constant result of a builtin.
  IntConst getBuiltinConst(BuiltinInst *BI, int depth);

public:

  // Constructor for the caller function.
  ConstantTracker(SILFunction *function) :
    F(function), callerTracker(nullptr), AI()
  { }

  // Constructor for the callee function.
  ConstantTracker(SILFunction *function, ConstantTracker *caller,
                  FullApplySite callerApply) :
    F(function), callerTracker(caller), AI(callerApply)
  { }

  void beginBlock() {
    // Currently we don't do any sophisticated dataflow analysis, so we keep
    // the memoryContent alive only for a single block.
    memoryContent.clear();
  }

  // Must be called for each instruction visited in dominance order.
  void trackInst(SILInstruction *inst);

  // Gets the estimated definition of a value.
  SILInstruction *getDef(SILValue val) {
    ProjectionPath projStack(val->getType());
    return getDef(val, projStack);
  }

  // Gets the estimated definition of a value if it is in the caller.
  SILInstruction *getDefInCaller(SILValue val) {
    SILInstruction *def = getDef(val);
    if (def && def->getFunction() != F)
      return def;
    return nullptr;
  }

  // Returns true if \p val maps (through the caller's arguments) to an
  // alloc_stack address in the caller.
  bool isStackAddrInCaller(SILValue val) {
    if (SILValue Param = getParam(stripAddressProjections(val))) {
      return isa<AllocStackInst>(stripAddressProjections(Param));
    }
    return false;
  }

  // Gets the estimated integer constant of a value.
  IntConst getIntConst(SILValue val, int depth = 0);

  // Returns the taken successor block of \p term if its condition can be
  // evaluated to a constant, null otherwise.
  SILBasicBlock *getTakenBlock(TermInst *term);
};
|
|
|
|
// Records the effect of \p inst on the simulated memory state: loads are
// linked to the store which produced the loaded value, stores update the
// current memory content. Must be called in dominance order (see header).
void ConstantTracker::trackInst(SILInstruction *inst) {
  if (auto *LI = dyn_cast<LoadInst>(inst)) {
    // Link the load to the store which currently defines the loaded memory.
    SILValue baseAddr = scanProjections(LI->getOperand());
    if (SILInstruction *loadLink = getMemoryContent(baseAddr))
      links[LI] = loadLink;
  } else if (StoreInst *SI = dyn_cast<StoreInst>(inst)) {
    // Operand 1 of a store is the destination address.
    SILValue baseAddr = scanProjections(SI->getOperand(1));
    memoryContent[baseAddr] = SI;
  } else if (CopyAddrInst *CAI = dyn_cast<CopyAddrInst>(inst)) {
    if (!CAI->isTakeOfSrc()) {
      // Treat a copy_addr as a load + store
      SILValue loadAddr = scanProjections(CAI->getOperand(0));
      if (SILInstruction *loadLink = getMemoryContent(loadAddr)) {
        links[CAI] = loadLink;
        SILValue storeAddr = scanProjections(CAI->getOperand(1));
        memoryContent[storeAddr] = CAI;
      }
    }
  }
}
|
|
|
|
// Walks through address projections (and through caller arguments) and
// optionally collects the projections into \p Result.
// Returns the base address, i.e. the first address which is not a projection.
SILValue ConstantTracker::scanProjections(SILValue addr,
                                          SmallVectorImpl<Projection> *Result) {
  for (;;) {
    if (Projection::isAddressProjection(addr)) {
      SILInstruction *I = cast<SILInstruction>(addr);
      if (Result) {
        Result->push_back(Projection(I));
      }
      // Step to the operand the projection is based on.
      addr = I->getOperand(0);
      continue;
    }
    if (SILValue param = getParam(addr)) {
      // Go to the caller.
      addr = param;
      continue;
    }
    // Return the base address = the first address which is not a projection.
    return addr;
  }
}
|
|
|
|
// Returns the value which was stored at the memory location read by
// \p loadInst (a load or copy_addr), following the previously recorded
// load->store links. The address projections of the load and the store must
// match up on \p projStack; otherwise a null SILValue is returned.
SILValue ConstantTracker::getStoredValue(SILInstruction *loadInst,
                                         ProjectionPath &projStack) {
  SILInstruction *store = links[loadInst];
  if (!store && callerTracker)
    store = callerTracker->links[loadInst];
  if (!store) return SILValue();

  assert(isa<LoadInst>(loadInst) || isa<CopyAddrInst>(loadInst));

  // Push the address projections of the load onto the stack.
  SmallVector<Projection, 4> loadProjections;
  scanProjections(loadInst->getOperand(0), &loadProjections);
  for (const Projection &proj : loadProjections) {
    projStack.push_back(proj);
  }

  // Pop the address projections of the store from the stack.
  SmallVector<Projection, 4> storeProjections;
  scanProjections(store->getOperand(1), &storeProjections);
  for (auto iter = storeProjections.rbegin(); iter != storeProjections.rend();
       ++iter) {
    const Projection &proj = *iter;
    // The corresponding load-projection must match the store-projection.
    if (projStack.empty() || projStack.back() != proj)
      return SILValue();
    projStack.pop_back();
  }

  // For a plain store, operand 0 is the stored value.
  if (isa<StoreInst>(store))
    return store->getOperand(0);

  // The copy_addr instruction is both a load and a store. So we follow the link
  // again.
  assert(isa<CopyAddrInst>(store));
  return getStoredValue(store, projStack);
}
|
|
|
|
// Get the aggregate member based on the top of the projection stack.
|
|
static SILValue getMember(SILInstruction *inst, ProjectionPath &projStack) {
|
|
if (!projStack.empty()) {
|
|
const Projection &proj = projStack.back();
|
|
return proj.getOperandForAggregate(inst);
|
|
}
|
|
return SILValue();
|
|
}
|
|
|
|
// Returns the instruction which (by estimation) defines \p val, looking
// through projections, aggregate construction, loads and thin_to_thick casts,
// and following function arguments into the caller. \p projStack tracks the
// currently "open" object projections. Returns null if no defining
// instruction can be found.
SILInstruction *ConstantTracker::getDef(SILValue val,
                                        ProjectionPath &projStack) {

  // Track the value up the dominator tree.
  for (;;) {
    if (SILInstruction *inst = dyn_cast<SILInstruction>(val)) {
      if (Projection::isObjectProjection(inst)) {
        // Extract a member from a struct/tuple/enum.
        projStack.push_back(Projection(inst));
        val = inst->getOperand(0);
        continue;
      } else if (SILValue member = getMember(inst, projStack)) {
        // The opposite of a projection instruction: composing a struct/tuple.
        projStack.pop_back();
        val = member;
        continue;
      } else if (SILValue loadedVal = getStoredValue(inst, projStack)) {
        // A value loaded from memory.
        val = loadedVal;
        continue;
      } else if (isa<ThinToThickFunctionInst>(inst)) {
        // Look through the function-type conversion.
        val = inst->getOperand(0);
        continue;
      }
      return inst;
    } else if (SILValue param = getParam(val)) {
      // Continue in the caller.
      val = param;
      continue;
    }
    return nullptr;
  }
}
|
|
|
|
// Constant-folds the result of builtin \p BI if all of its (relevant)
// arguments can be evaluated to integer constants. The result is marked as
// isFromCaller if any folded operand came from the caller. Returns an
// invalid IntConst if the builtin cannot be folded.
ConstantTracker::IntConst ConstantTracker::getBuiltinConst(BuiltinInst *BI, int depth) {
  const BuiltinInfo &Builtin = BI->getBuiltinInfo();
  OperandValueArrayRef Args = BI->getArguments();
  switch (Builtin.ID) {
    default: break;

    // Fold comparison predicates.
#define BUILTIN(id, name, Attrs)
#define BUILTIN_BINARY_PREDICATE(id, name, attrs, overload) \
case BuiltinValueKind::id:
#include "swift/AST/Builtins.def"
      {
        IntConst lhs = getIntConst(Args[0], depth);
        IntConst rhs = getIntConst(Args[1], depth);
        if (lhs.isValid && rhs.isValid) {
          return IntConst(constantFoldComparison(lhs.value, rhs.value,
                                                 Builtin.ID),
                          lhs.isFromCaller || rhs.isFromCaller);
        }
        break;
      }

    case BuiltinValueKind::SAddOver:
    case BuiltinValueKind::UAddOver:
    case BuiltinValueKind::SSubOver:
    case BuiltinValueKind::USubOver:
    case BuiltinValueKind::SMulOver:
    case BuiltinValueKind::UMulOver: {
      IntConst lhs = getIntConst(Args[0], depth);
      IntConst rhs = getIntConst(Args[1], depth);
      if (lhs.isValid && rhs.isValid) {
        // The overflow flag is not needed for the folded value itself.
        bool IgnoredOverflow;
        return IntConst(constantFoldBinaryWithOverflow(lhs.value, rhs.value,
                          IgnoredOverflow,
                          getLLVMIntrinsicIDForBuiltinWithOverflow(Builtin.ID)),
                        lhs.isFromCaller || rhs.isFromCaller);
      }
      break;
    }

    case BuiltinValueKind::SDiv:
    case BuiltinValueKind::SRem:
    case BuiltinValueKind::UDiv:
    case BuiltinValueKind::URem: {
      IntConst lhs = getIntConst(Args[0], depth);
      IntConst rhs = getIntConst(Args[1], depth);
      // Never fold a division by zero.
      if (lhs.isValid && rhs.isValid && rhs.value != 0) {
        bool IgnoredOverflow;
        return IntConst(constantFoldDiv(lhs.value, rhs.value,
                                        IgnoredOverflow, Builtin.ID),
                        lhs.isFromCaller || rhs.isFromCaller);
      }
      break;
    }

    case BuiltinValueKind::And:
    case BuiltinValueKind::AShr:
    case BuiltinValueKind::LShr:
    case BuiltinValueKind::Or:
    case BuiltinValueKind::Shl:
    case BuiltinValueKind::Xor: {
      IntConst lhs = getIntConst(Args[0], depth);
      IntConst rhs = getIntConst(Args[1], depth);
      if (lhs.isValid && rhs.isValid) {
        return IntConst(constantFoldBitOperation(lhs.value, rhs.value,
                                                 Builtin.ID),
                        lhs.isFromCaller || rhs.isFromCaller);
      }
      break;
    }

    case BuiltinValueKind::Trunc:
    case BuiltinValueKind::ZExt:
    case BuiltinValueKind::SExt:
    case BuiltinValueKind::TruncOrBitCast:
    case BuiltinValueKind::ZExtOrBitCast:
    case BuiltinValueKind::SExtOrBitCast: {
      IntConst val = getIntConst(Args[0], depth);
      if (val.isValid) {
        return IntConst(constantFoldCast(val.value, Builtin), val.isFromCaller);
      }
      break;
    }
  }
  return IntConst();
}
|
|
|
|
// Tries to evaluate the integer constant of a value. The \p depth is used
|
|
// to limit the complexity.
|
|
ConstantTracker::IntConst ConstantTracker::getIntConst(SILValue val, int depth) {
|
|
|
|
// Don't spend too much time with constant evaluation.
|
|
if (depth >= 10)
|
|
return IntConst();
|
|
|
|
SILInstruction *I = getDef(val);
|
|
if (!I)
|
|
return IntConst();
|
|
|
|
if (auto *IL = dyn_cast<IntegerLiteralInst>(I)) {
|
|
return IntConst(IL->getValue(), IL->getFunction() != F);
|
|
}
|
|
if (auto *BI = dyn_cast<BuiltinInst>(I)) {
|
|
if (constCache.count(BI) != 0)
|
|
return constCache[BI];
|
|
|
|
IntConst builtinConst = getBuiltinConst(BI, depth + 1);
|
|
constCache[BI] = builtinConst;
|
|
return builtinConst;
|
|
}
|
|
return IntConst();
|
|
}
|
|
|
|
// Returns the taken block of a terminator instruction if the condition turns
// out to be constant. Only returns a block when the constant originates in
// the caller (isFromCaller), i.e. when inlining would actually enable the
// branch folding. Returns null if nothing can be decided.
SILBasicBlock *ConstantTracker::getTakenBlock(TermInst *term) {
  if (CondBranchInst *CBI = dyn_cast<CondBranchInst>(term)) {
    IntConst condConst = getIntConst(CBI->getCondition());
    if (condConst.isFromCaller) {
      return condConst.value != 0 ? CBI->getTrueBB() : CBI->getFalseBB();
    }
    return nullptr;
  }
  if (SwitchValueInst *SVI = dyn_cast<SwitchValueInst>(term)) {
    IntConst switchConst = getIntConst(SVI->getOperand());
    if (switchConst.isFromCaller) {
      for (unsigned Idx = 0; Idx < SVI->getNumCases(); ++Idx) {
        auto switchCase = SVI->getCase(Idx);
        if (auto *IL = dyn_cast<IntegerLiteralInst>(switchCase.first)) {
          if (switchConst.value == IL->getValue())
            return switchCase.second;
        } else {
          // A non-literal case value: give up on the whole switch.
          return nullptr;
        }
      }
      // No case matched: the default block is taken, if there is one.
      if (SVI->hasDefault())
        return SVI->getDefaultBB();
    }
    return nullptr;
  }
  if (SwitchEnumInst *SEI = dyn_cast<SwitchEnumInst>(term)) {
    // Fold a switch_enum if the operand is an enum construction in the caller.
    if (SILInstruction *def = getDefInCaller(SEI->getOperand())) {
      if (EnumInst *EI = dyn_cast<EnumInst>(def)) {
        for (unsigned Idx = 0; Idx < SEI->getNumCases(); ++Idx) {
          auto enumCase = SEI->getCase(Idx);
          if (enumCase.first == EI->getElement())
            return enumCase.second;
        }
        if (SEI->hasDefault())
          return SEI->getDefaultBB();
      }
    }
    return nullptr;
  }
  if (CheckedCastBranchInst *CCB = dyn_cast<CheckedCastBranchInst>(term)) {
    // Fold a checked_cast_br if the operand is an upcast in the caller whose
    // source type statically decides the cast.
    if (SILInstruction *def = getDefInCaller(CCB->getOperand())) {
      if (UpcastInst *UCI = dyn_cast<UpcastInst>(def)) {
        SILType castType = UCI->getOperand()->getType();
        if (CCB->getCastType().isExactSuperclassOf(castType)) {
          return CCB->getSuccessBB();
        }
        if (!castType.isBindableToSuperclassOf(CCB->getCastType())) {
          return CCB->getFailureBB();
        }
      }
    }
  }
  return nullptr;
}
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// Shortest path analysis
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
/// Computes the length of the shortest path through a block in a scope.
///
/// A scope is either a loop or the whole function. The "length" is an
/// estimation of the execution time. For example if the shortest path of a
/// block B is N, it means that the execution time of the scope (either
/// function or a single loop iteration), when execution includes the block, is
/// at least N, regardless of what branches are taken.
///
/// The shortest path of a block is the sum of the shortest path from the scope
/// entry and the shortest path to the scope exit.
class ShortestPathAnalysis {
public:
  enum {
    /// We assume that cold blocks take very long to execute.
    ColdBlockLength = 1000,

    /// Our assumption on how many times a loop is executed.
    LoopCount = 10,

    /// To keep things simple we only analyse up to this number of nested loops.
    MaxNumLoopLevels = 4,

    /// The "weight" for the benefit which a single loop nest gives.
    SingleLoopWeight = 4,

    /// Pretty large but small enough to add something without overflowing.
    InitialDist = (1 << 29)
  };

  /// A weight for an inlining benefit.
  class Weight {
    /// How long does it take to execute the scope (either loop or the whole
    /// function) of the call to inline? The larger it is the less important it
    /// is to inline.
    int ScopeLength;

    /// Represents the loop nest. The larger it is the more important it is to
    /// inline.
    int LoopWeight;

    friend class ShortestPathAnalysis;

  public:
    Weight(int ScopeLength, int LoopWeight) : ScopeLength(ScopeLength),
                                              LoopWeight(LoopWeight) { }

    Weight(const Weight &RHS, int AdditionalLoopWeight) :
      Weight(RHS.ScopeLength, RHS.LoopWeight + AdditionalLoopWeight) { }

    // Default-constructed weights are invalid (negative scope length).
    Weight() : ScopeLength(-1), LoopWeight(0) {
      assert(!isValid());
    }

    bool isValid() const { return ScopeLength >= 0; }

    /// Updates the \p Benefit by weighting the \p Importance.
    void updateBenefit(int &Benefit, int Importance) const {
      assert(isValid());
      int newBenefit = 0;

      // Use some heuristics. The basic idea is: length is bad, loops are good.
      if (ScopeLength > 320) {
        newBenefit = Importance;
      } else if (ScopeLength > 160) {
        newBenefit = Importance + LoopWeight * 4;
      } else if (ScopeLength > 80) {
        newBenefit = Importance + LoopWeight * 8;
      } else if (ScopeLength > 40) {
        newBenefit = Importance + LoopWeight * 12;
      } else if (ScopeLength > 20) {
        newBenefit = Importance + LoopWeight * 16;
      } else {
        newBenefit = Importance + 20 + LoopWeight * 16;
      }
      // We don't accumulate the benefit instead we max it.
      if (newBenefit > Benefit)
        Benefit = newBenefit;
    }

#ifndef NDEBUG
    friend raw_ostream &operator<<(raw_ostream &os, const Weight &W) {
      os << W.LoopWeight << '/' << W.ScopeLength;
      return os;
    }
#endif
  };

private:
  /// The distances for a block in its scope (either loop or function).
  struct Distances {
    /// The shortest distance from the scope entry to the block entry.
    int DistFromEntry = InitialDist;

    /// The shortest distance from the block entry to the scope exit.
    int DistToExit = InitialDist;

    /// Additional length to account for loop iterations. It's != 0 for loop
    /// headers (or loop predecessors).
    int LoopHeaderLength = 0;
  };

  /// Holds the distance information for a block in all its scopes, i.e. in all
  /// its containing loops and the function itself.
  struct BlockInfo {
  private:
    /// Distances for all scopes. Element 0 refers to the function, elements
    /// > 0 to loops.
    Distances Dists[MaxNumLoopLevels];
  public:
    /// The length of the block itself.
    int Length = 0;

    /// Returns the distances for the loop with nesting level \p LoopDepth.
    Distances &getDistances(int LoopDepth) {
      assert(LoopDepth >= 0 && LoopDepth < MaxNumLoopLevels);
      return Dists[LoopDepth];
    }

    /// Returns the length including the LoopHeaderLength for a given
    /// \p LoopDepth.
    int getLength(int LoopDepth) {
      return Length + getDistances(LoopDepth).LoopHeaderLength;
    }

    /// Returns the length of the shortest path of this block in the loop with
    /// nesting level \p LoopDepth.
    int getScopeLength(int LoopDepth) {
      const Distances &D = getDistances(LoopDepth);
      return D.DistFromEntry + D.DistToExit;
    }
  };

  // The analyzed function and its loop information.
  SILFunction *F;
  SILLoopInfo *LI;
  // Mapping from block to its BlockInfo; storage is owned by
  // BlockInfoStorage (one entry per block, stable addresses).
  llvm::DenseMap<const SILBasicBlock *, BlockInfo *> BlockInfos;
  std::vector<BlockInfo> BlockInfoStorage;

  BlockInfo *getBlockInfo(const SILBasicBlock *BB) {
    BlockInfo *BI = BlockInfos[BB];
    assert(BI);
    return BI;
  }

  // Utility functions to make the template solveDataFlow compilable for a block
  // list containing references _and_ a list containing pointers.

  const SILBasicBlock *getBlock(const SILBasicBlock *BB) { return BB; }
  const SILBasicBlock *getBlock(const SILBasicBlock &BB) { return &BB; }

  /// Returns the minimum distance from all predecessor blocks of \p BB.
  int getEntryDistFromPreds(const SILBasicBlock *BB, int LoopDepth);

  /// Returns the minimum distance from all successor blocks of \p BB.
  int getExitDistFromSuccs(const SILBasicBlock *BB, int LoopDepth);

  /// Computes the distances by solving the dataflow problem for all \p Blocks
  /// in a scope. Iterates forward (entry distances) and backward (exit
  /// distances) until a fixed point is reached.
  template <typename BlockList>
  void solveDataFlow(const BlockList &Blocks, int LoopDepth) {
    bool Changed = false;

    // Compute the distances from the entry block.
    do {
      Changed = false;
      for (auto &Block : Blocks) {
        const SILBasicBlock *BB = getBlock(Block);
        BlockInfo *BBInfo = getBlockInfo(BB);
        int DistFromEntry = getEntryDistFromPreds(BB, LoopDepth);
        Distances &BBDists = BBInfo->getDistances(LoopDepth);
        if (DistFromEntry < BBDists.DistFromEntry) {
          BBDists.DistFromEntry = DistFromEntry;
          Changed = true;
        }
      }
    } while (Changed);

    // Compute the distances to the exit block.
    do {
      Changed = false;
      for (auto &Block : reverse(Blocks)) {
        const SILBasicBlock *BB = getBlock(Block);
        BlockInfo *BBInfo = getBlockInfo(BB);
        int DistToExit =
          getExitDistFromSuccs(BB, LoopDepth) + BBInfo->getLength(LoopDepth);
        Distances &BBDists = BBInfo->getDistances(LoopDepth);
        if (DistToExit < BBDists.DistToExit) {
          BBDists.DistToExit = DistToExit;
          Changed = true;
        }
      }
    } while (Changed);
  }

  /// Analyze \p Loop and all its inner loops.
  void analyzeLoopsRecursively(SILLoop *Loop, int LoopDepth);

  void printFunction(llvm::raw_ostream &OS);

  void printLoop(llvm::raw_ostream &OS, SILLoop *Loop, int LoopDepth);

  void printBlockInfo(llvm::raw_ostream &OS, SILBasicBlock *BB, int LoopDepth);

public:
  ShortestPathAnalysis(SILFunction *F, SILLoopInfo *LI) : F(F), LI(LI) { }

  // True once analyze() has run.
  bool isValid() const { return !BlockInfos.empty(); }

  /// Compute the distances. The function \p getApplyLength returns the length
  /// of a function call.
  template <typename Func>
  void analyze(ColdBlockInfo &CBI, Func getApplyLength) {
    assert(!isValid());

    BlockInfoStorage.resize(F->size());

    // First step: compute the length of the blocks.
    unsigned BlockIdx = 0;
    for (SILBasicBlock &BB : *F) {
      int Length = 0;
      if (CBI.isCold(&BB)) {
        Length = ColdBlockLength;
      } else {
        for (SILInstruction &I : BB) {
          if (auto FAS = FullApplySite::isa(&I)) {
            Length += getApplyLength(FAS);
          } else {
            Length += (int)instructionInlineCost(I);
          }
        }
      }
      BlockInfo *BBInfo = &BlockInfoStorage[BlockIdx++];
      BlockInfos[&BB] = BBInfo;
      BBInfo->Length = Length;

      // Initialize the distances for the entry and exit blocks, used when
      // computing the distances for the function itself.
      if (&BB == &F->front())
        BBInfo->getDistances(0).DistFromEntry = 0;

      if (isa<ReturnInst>(BB.getTerminator()))
        BBInfo->getDistances(0).DistToExit = Length;
      else if (isa<ThrowInst>(BB.getTerminator()))
        // Throwing exits are penalized like cold blocks.
        BBInfo->getDistances(0).DistToExit = Length + ColdBlockLength;
    }
    // Compute the distances for all loops in the function.
    for (SILLoop *Loop : *LI) {
      analyzeLoopsRecursively(Loop, 1);
    }
    // Compute the distances for the function itself.
    solveDataFlow(F->getBlocks(), 0);
  }

  /// Returns the length of the shortest path of the block \p BB in the loop
  /// with nesting level \p LoopDepth. If \p LoopDepth is 0 it returns the
  /// shortest path in the function.
  int getScopeLength(SILBasicBlock *BB, int LoopDepth) {
    assert(BB->getParent() == F);
    if (LoopDepth >= MaxNumLoopLevels)
      LoopDepth = MaxNumLoopLevels - 1;
    return getBlockInfo(BB)->getScopeLength(LoopDepth);
  }

  /// Returns the weight of block \p BB also considering the \p CallerWeight
  /// which is the weight of the call site's block in the caller.
  Weight getWeight(SILBasicBlock *BB, Weight CallerWeight);

  void dump();
};
|
|
|
|
int ShortestPathAnalysis::getEntryDistFromPreds(const SILBasicBlock *BB,
|
|
int LoopDepth) {
|
|
int MinDist = InitialDist;
|
|
for (SILBasicBlock *Pred : BB->getPreds()) {
|
|
BlockInfo *PredInfo = getBlockInfo(Pred);
|
|
Distances &PDists = PredInfo->getDistances(LoopDepth);
|
|
int DistFromEntry = PDists.DistFromEntry + PredInfo->Length +
|
|
PDists.LoopHeaderLength;
|
|
assert(DistFromEntry >= 0);
|
|
if (DistFromEntry < MinDist)
|
|
MinDist = DistFromEntry;
|
|
}
|
|
return MinDist;
|
|
}
|
|
|
|
int ShortestPathAnalysis::getExitDistFromSuccs(const SILBasicBlock *BB,
|
|
int LoopDepth) {
|
|
int MinDist = InitialDist;
|
|
for (const SILSuccessor &Succ : BB->getSuccessors()) {
|
|
BlockInfo *SuccInfo = getBlockInfo(Succ);
|
|
Distances &SDists = SuccInfo->getDistances(LoopDepth);
|
|
if (SDists.DistToExit < MinDist)
|
|
MinDist = SDists.DistToExit;
|
|
}
|
|
return MinDist;
|
|
}
|
|
|
|
/// Detect an edge from the loop pre-header's predecessor to the loop exit
/// block. Such an edge "short-cuts" a loop if it is never iterated. But usually
/// it is the less frequent case and we want to ignore it.
/// E.g. it handles the case of N==0 for
///     for i in 0..<N { ... }
/// If the \p Loop has such an edge the source block of this edge is returned,
/// which is the predecessor of the loop pre-header.
static SILBasicBlock *detectLoopBypassPreheader(SILLoop *Loop) {
  SILBasicBlock *Pred = Loop->getLoopPreheader();
  if (!Pred)
    return nullptr;

  SILBasicBlock *PredPred = Pred->getSinglePredecessor();
  if (!PredPred)
    return nullptr;

  // The pre-pre-header must end in a conditional branch: one side enters the
  // loop (via the pre-header), the other side bypasses it.
  auto *CBR = dyn_cast<CondBranchInst>(PredPred->getTerminator());
  if (!CBR)
    return nullptr;

  SILBasicBlock *Succ = (CBR->getTrueBB() == Pred ? CBR->getFalseBB() :
                                                    CBR->getTrueBB());

  // The bypass target must also be reachable from inside the loop, i.e. it
  // must be (or be behind) a loop exit.
  for (SILBasicBlock *PredOfSucc : Succ->getPreds()) {
    SILBasicBlock *Exiting = PredOfSucc->getSinglePredecessor();
    if (!Exiting)
      Exiting = PredOfSucc;
    if (Loop->contains(Exiting))
      return PredPred;
  }
  return nullptr;
}
|
|
|
|
/// Computes the distance information for \p Loop (at nesting level
/// \p LoopDepth) and all of its inner loops, then accounts the loop's full
/// estimated length (LoopCount * iteration length) in the parent scope via
/// the header's (or bypass pre-pre-header's) LoopHeaderLength.
void ShortestPathAnalysis::analyzeLoopsRecursively(SILLoop *Loop, int LoopDepth) {
  if (LoopDepth >= MaxNumLoopLevels)
    return;

  // First dive into the inner loops.
  for (SILLoop *SubLoop : Loop->getSubLoops()) {
    analyzeLoopsRecursively(SubLoop, LoopDepth + 1);
  }

  BlockInfo *HeaderInfo = getBlockInfo(Loop->getHeader());
  Distances &HeaderDists = HeaderInfo->getDistances(LoopDepth);

  // Initial values for the entry (== header) and exit-predecessor (== header as
  // well).
  HeaderDists.DistFromEntry = 0;
  HeaderDists.DistToExit = 0;

  solveDataFlow(Loop->getBlocks(), LoopDepth);

  // Length of one loop iteration: back to the header and through it.
  int LoopLength = getExitDistFromSuccs(Loop->getHeader(), LoopDepth) +
                   HeaderInfo->getLength(LoopDepth);
  HeaderDists.DistToExit = LoopLength;

  // If there is a loop bypass edge, add the loop length to the loop pre-pre-
  // header instead to the header. This actually let us ignore the loop bypass
  // edge in the length calculation for the loop's parent scope.
  if (SILBasicBlock *Bypass = detectLoopBypassPreheader(Loop))
    HeaderInfo = getBlockInfo(Bypass);

  // Add the full loop length (= assumed-iteration-count * length) to the loop
  // header so that it is considered in the parent scope.
  HeaderInfo->getDistances(LoopDepth - 1).LoopHeaderLength =
    LoopCount * LoopLength;
}
|
|
|
|
/// Returns the weight of block \p BB, combining its scope length with the
/// loop weights of all loops it is nested in, and folding in the
/// \p CallerWeight (the weight of the call site's block in the caller).
ShortestPathAnalysis::Weight ShortestPathAnalysis::
getWeight(SILBasicBlock *BB, Weight CallerWeight) {
  assert(BB->getParent() == F);

  SILLoop *Loop = LI->getLoopFor(BB);
  if (!Loop) {
    // We are not in a loop. So just account the length of our function scope
    // in to the length of the CallerWeight.
    return Weight(CallerWeight.ScopeLength + getScopeLength(BB, 0),
                  CallerWeight.LoopWeight);
  }
  int LoopDepth = Loop->getLoopDepth();
  // Deal with the corner case of having more than 4 nested loops.
  while (LoopDepth >= MaxNumLoopLevels) {
    --LoopDepth;
    Loop = Loop->getParentLoop();
  }
  Weight W(getScopeLength(BB, LoopDepth), SingleLoopWeight);

  // Add weights for all the loops BB is in.
  while (Loop) {
    assert(LoopDepth > 0);
    BlockInfo *HeaderInfo = getBlockInfo(Loop->getHeader());
    int InnerLoopLength = HeaderInfo->getScopeLength(LoopDepth) *
                          ShortestPathAnalysis::LoopCount;
    int OuterLoopWeight = SingleLoopWeight;
    int OuterScopeLength = HeaderInfo->getScopeLength(LoopDepth - 1);

    // Reaching the outermost loop, we use the CallerWeight to get the outer
    // length+loopweight.
    if (LoopDepth == 1) {
      // If the apply in the caller is not in a significant loop, just stop with
      // what we have now.
      if (CallerWeight.LoopWeight < 4)
        return W;

      // If this function is part of the caller's scope length take the caller's
      // scope length. Note: this is not the case e.g. if the apply is in a
      // then-branch of an if-then-else in the caller and the else-branch is
      // the short path.
      if (CallerWeight.ScopeLength > OuterScopeLength)
        OuterScopeLength = CallerWeight.ScopeLength;
      OuterLoopWeight = CallerWeight.LoopWeight;
    }
    assert(OuterScopeLength >= InnerLoopLength);

    // If the current loop is only a small part of its outer loop, we don't
    // take the outer loop that much into account. Only if the current loop is
    // actually the "main part" in the outer loop we add the full loop weight
    // for the outer loop.
    if (OuterScopeLength < InnerLoopLength * 2) {
      W.LoopWeight += OuterLoopWeight - 1;
    } else if (OuterScopeLength < InnerLoopLength * 3) {
      W.LoopWeight += OuterLoopWeight - 2;
    } else if (OuterScopeLength < InnerLoopLength * 4) {
      W.LoopWeight += OuterLoopWeight - 3;
    } else {
      return W;
    }
    --LoopDepth;
    Loop = Loop->getParentLoop();
  }
  assert(LoopDepth == 0);
  return W;
}
|
|
|
|
/// Debug helper: print the shortest-path information for the whole function
/// to stderr.
void ShortestPathAnalysis::dump() {
  printFunction(llvm::errs());
}
|
|
|
|
/// Print the shortest-path info for the analyzed function: a header with the
/// function name, the per-block info at scope level 0, and then each
/// top-level loop recursively.
void ShortestPathAnalysis::printFunction(llvm::raw_ostream &OS) {
  OS << "SPA @" << F->getName() << "\n";
  for (SILBasicBlock &Block : *F)
    printBlockInfo(OS, &Block, 0);
  for (SILLoop *TopLevelLoop : *LI)
    printLoop(OS, TopLevelLoop, 1);
}
|
|
|
|
/// Recursively print the block info of \p Loop and all of its sub-loops.
/// Loops nested at MaxNumLoopLevels or deeper are skipped, because no
/// per-level distance data is maintained for them.
void ShortestPathAnalysis::printLoop(llvm::raw_ostream &OS, SILLoop *Loop,
                                     int LoopDepth) {
  if (LoopDepth >= MaxNumLoopLevels)
    return;
  assert(LoopDepth == (int)Loop->getLoopDepth());

  OS << "Loop bb" << Loop->getHeader()->getDebugID() << ":\n";
  for (SILBasicBlock *Block : Loop->getBlocks())
    printBlockInfo(OS, Block, LoopDepth);
  for (SILLoop *Inner : Loop->getSubLoops())
    printLoop(OS, Inner, LoopDepth + 1);
}
|
|
|
|
/// Print one line of shortest-path data for \p BB at loop level \p LoopDepth:
/// the block's own length plus its loop-header length, and the distances from
/// the scope entry and to the scope exit.
void ShortestPathAnalysis::printBlockInfo(llvm::raw_ostream &OS,
                                          SILBasicBlock *BB, int LoopDepth) {
  BlockInfo *Info = getBlockInfo(BB);
  Distances &Dist = Info->getDistances(LoopDepth);
  OS << "  bb" << BB->getDebugID() << ": length=" << Info->Length << '+'
     << Dist.LoopHeaderLength << ", d-entry=" << Dist.DistFromEntry
     << ", d-exit=" << Dist.DistToExit << '\n';
}
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// Performance Inliner
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
namespace {

/// Controls the decision to inline functions with @_semantics, @effects and
/// global_init attributes.
enum class InlineSelection {
  Everything,
  NoGlobalInit, // and no availability semantics calls
  NoSemanticsAndGlobalInit
};

using Weight = ShortestPathAnalysis::Weight;

/// The heuristic-driven performance inliner. Collects inlining candidates in
/// a caller (collectAppliesToInline) based on a cost/benefit model and a
/// shortest-path analysis, then performs the actual inlining
/// (inlineCallsIntoFunction).
class SILPerformanceInliner {
  /// Specifies which functions not to inline, based on @_semantics and
  /// global_init attributes.
  InlineSelection WhatToInline;

  DominanceAnalysis *DA;
  SILLoopAnalysis *LA;

  // For keys of SILFunction and SILLoop.
  llvm::DenseMap<SILFunction *, ShortestPathAnalysis *> SPAs;
  // Owns the ShortestPathAnalysis objects stored in SPAs.
  llvm::SpecificBumpPtrAllocator<ShortestPathAnalysis> SPAAllocator;

  ColdBlockInfo CBI;

  /// The following constants define the cost model for inlining. Some constants
  /// are also defined in ShortestPathAnalysis.
  enum {
    /// The base value for every call: it represents the benefit of removing the
    /// call overhead itself.
    RemovedCallBenefit = 20,

    /// The benefit if the operand of an apply gets constant, e.g. if a closure
    /// is passed to an apply instruction in the callee.
    RemovedClosureBenefit = RemovedCallBenefit + 50,

    /// The benefit if a load can (probably) be eliminated because it loads from
    /// a stack location in the caller.
    RemovedLoadBenefit = RemovedCallBenefit + 5,

    /// The benefit if a store can (probably) be eliminated because it stores to
    /// a stack location in the caller.
    RemovedStoreBenefit = RemovedCallBenefit + 10,

    /// The benefit if the condition of a terminator instruction gets constant
    /// due to inlining.
    RemovedTerminatorBenefit = RemovedCallBenefit + 10,

    /// The benefit if a retain/release can (probably) be eliminated after
    /// inlining.
    RefCountBenefit = RemovedCallBenefit + 20,

    /// Approximately up to this cost level a function can be inlined without
    /// increasing the code size.
    TrivialFunctionThreshold = 18,

    /// Configuration for the caller block limit.
    BlockLimitDenominator = 10000,

    /// The assumed execution length of a function call.
    DefaultApplyLength = 10
  };

#ifndef NDEBUG
  SILFunction *LastPrintedCaller = nullptr;
  /// Print the caller's name, but only when it differs from the last printed
  /// one, to reduce noise in the debug output.
  void dumpCaller(SILFunction *Caller) {
    if (Caller != LastPrintedCaller) {
      llvm::dbgs() << "\nInline into caller: " << Caller->getName() << '\n';
      LastPrintedCaller = Caller;
    }
  }
#endif

  /// Return the cached ShortestPathAnalysis for \p F, lazily creating it on
  /// first use. The created objects are owned by SPAAllocator.
  ShortestPathAnalysis *getSPA(SILFunction *F, SILLoopInfo *LI) {
    ShortestPathAnalysis *&SPA = SPAs[F];
    if (!SPA) {
      SPA = new (SPAAllocator.Allocate()) ShortestPathAnalysis(F, LI);
    }
    return SPA;
  }

  SILFunction *getEligibleFunction(FullApplySite AI);

  bool isProfitableToInline(FullApplySite AI,
                            Weight CallerWeight,
                            ConstantTracker &constTracker,
                            int &NumCallerBlocks);

  bool isProfitableInColdBlock(FullApplySite AI, SILFunction *Callee);

  void visitColdBlocks(SmallVectorImpl<FullApplySite> &AppliesToInline,
                       SILBasicBlock *root, DominanceInfo *DT);

  void collectAppliesToInline(SILFunction *Caller,
                              SmallVectorImpl<FullApplySite> &Applies);

public:
  SILPerformanceInliner(InlineSelection WhatToInline, DominanceAnalysis *DA,
                        SILLoopAnalysis *LA)
      : WhatToInline(WhatToInline), DA(DA), LA(LA), CBI(DA) {}

  bool inlineCallsIntoFunction(SILFunction *F);
};

} // namespace
|
|
|
|
// Return true if the callee has self-recursive calls.
|
|
static bool calleeIsSelfRecursive(SILFunction *Callee) {
|
|
for (auto &BB : *Callee)
|
|
for (auto &I : BB)
|
|
if (auto Apply = FullApplySite::isa(&I))
|
|
if (Apply.getReferencedFunction() == Callee)
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
// Returns the callee of an apply_inst if it is basically inlineable.
|
|
SILFunction *SILPerformanceInliner::getEligibleFunction(FullApplySite AI) {
|
|
|
|
SILFunction *Callee = AI.getReferencedFunction();
|
|
|
|
if (!Callee) {
|
|
return nullptr;
|
|
}
|
|
|
|
// Don't inline functions that are marked with the @_semantics or @effects
|
|
// attribute if the inliner is asked not to inline them.
|
|
if (Callee->hasSemanticsAttrs() || Callee->hasEffectsKind()) {
|
|
if (WhatToInline == InlineSelection::NoSemanticsAndGlobalInit) {
|
|
return nullptr;
|
|
}
|
|
// The "availability" semantics attribute is treated like global-init.
|
|
if (Callee->hasSemanticsAttrs() &&
|
|
WhatToInline != InlineSelection::Everything &&
|
|
Callee->hasSemanticsAttrThatStartsWith("availability")) {
|
|
return nullptr;
|
|
}
|
|
} else if (Callee->isGlobalInit()) {
|
|
if (WhatToInline != InlineSelection::Everything) {
|
|
return nullptr;
|
|
}
|
|
}
|
|
|
|
// We can't inline external declarations.
|
|
if (Callee->empty() || Callee->isExternalDeclaration()) {
|
|
return nullptr;
|
|
}
|
|
|
|
// Explicitly disabled inlining.
|
|
if (Callee->getInlineStrategy() == NoInline) {
|
|
return nullptr;
|
|
}
|
|
|
|
if (!Callee->shouldOptimize()) {
|
|
return nullptr;
|
|
}
|
|
|
|
// We don't support this yet.
|
|
if (AI.hasSubstitutions()) {
|
|
return nullptr;
|
|
}
|
|
|
|
// We don't support inlining a function that binds dynamic self because we
|
|
// have no mechanism to preserve the original function's local self metadata.
|
|
if (computeMayBindDynamicSelf(Callee)) {
|
|
return nullptr;
|
|
}
|
|
|
|
SILFunction *Caller = AI.getFunction();
|
|
|
|
// Detect self-recursive calls.
|
|
if (Caller == Callee) {
|
|
return nullptr;
|
|
}
|
|
|
|
// A non-fragile function may not be inlined into a fragile function.
|
|
if (Caller->isFragile() && !Callee->isFragile()) {
|
|
return nullptr;
|
|
}
|
|
|
|
// Inlining self-recursive functions into other functions can result
|
|
// in excessive code duplication since we run the inliner multiple
|
|
// times in our pipeline
|
|
if (calleeIsSelfRecursive(Callee)) {
|
|
return nullptr;
|
|
}
|
|
|
|
return Callee;
|
|
}
|
|
|
|
/// Return true if inlining this call site is profitable.
///
/// Walks the callee's blocks in dominance order, summing an inlining cost per
/// instruction and accumulating benefits for operations that likely become
/// optimizable after inlining (constant closures, stack loads/stores from the
/// caller, refcount operations on guaranteed arguments, foldable terminators).
///
/// \p CallerWeight is the weight of the apply site in the caller.
/// \p callerTracker is the caller's constant tracker; it is used as the
///    parent of the callee's tracker so constants flowing into the callee
///    can be followed.
/// \p NumCallerBlocks is the caller's current block count; on a positive
///    decision it is increased by the callee's block count.
bool SILPerformanceInliner::isProfitableToInline(FullApplySite AI,
                                                 Weight CallerWeight,
                                                 ConstantTracker &callerTracker,
                                                 int &NumCallerBlocks) {
  SILFunction *Callee = AI.getReferencedFunction();

  // @inline(__always) overrides the cost model entirely.
  if (Callee->getInlineStrategy() == AlwaysInline)
    return true;

  SILLoopInfo *LI = LA->get(Callee);
  ShortestPathAnalysis *SPA = getSPA(Callee, LI);
  assert(SPA->isValid());

  // Track constants inside the callee, chained to the caller's tracker via
  // the apply site's arguments.
  ConstantTracker constTracker(Callee, &callerTracker, AI);
  DominanceInfo *DT = DA->get(Callee);
  SILBasicBlock *CalleeEntry = &Callee->front();
  DominanceOrder domOrder(CalleeEntry, DT, Callee->size());

  // Calculate the inlining cost of the callee.
  int CalleeCost = 0;
  int Benefit = 0;

  // Start with a base benefit.
  int BaseBenefit = RemovedCallBenefit;
  const SILOptions &Opts = Callee->getModule().getOptions();

  // For some reason -Ounchecked can accept a higher base benefit without
  // increasing the code size too much.
  if (Opts.Optimization == SILOptions::SILOptMode::OptimizeUnchecked)
    BaseBenefit *= 2;

  CallerWeight.updateBenefit(Benefit, BaseBenefit);

  // Go through all blocks of the function, accumulate the cost and find
  // benefits.
  while (SILBasicBlock *block = domOrder.getNext()) {
    constTracker.beginBlock();
    Weight BlockW = SPA->getWeight(block, CallerWeight);

    for (SILInstruction &I : *block) {
      constTracker.trackInst(&I);

      CalleeCost += (int)instructionInlineCost(I);

      if (FullApplySite AI = FullApplySite::isa(&I)) {
        // Check if the callee is passed as an argument. If so, increase the
        // threshold, because inlining will (probably) eliminate the closure.
        SILInstruction *def = constTracker.getDefInCaller(AI.getCallee());
        if (def && (isa<FunctionRefInst>(def) || isa<PartialApplyInst>(def)))
          BlockW.updateBenefit(Benefit, RemovedClosureBenefit);
      } else if (auto *LI = dyn_cast<LoadInst>(&I)) {
        // Check if it's a load from a stack location in the caller. Such a load
        // might be optimized away if inlined.
        if (constTracker.isStackAddrInCaller(LI->getOperand()))
          BlockW.updateBenefit(Benefit, RemovedLoadBenefit);
      } else if (auto *SI = dyn_cast<StoreInst>(&I)) {
        // Check if it's a store to a stack location in the caller. Such a
        // store might be optimized away if inlined.
        if (constTracker.isStackAddrInCaller(SI->getDest()))
          BlockW.updateBenefit(Benefit, RemovedStoreBenefit);
      } else if (isa<StrongReleaseInst>(&I) || isa<ReleaseValueInst>(&I)) {
        // A release of a guaranteed function argument pairs with a retain
        // in the caller and can (probably) be eliminated after inlining.
        SILValue Op = stripCasts(I.getOperand(0));
        if (SILArgument *Arg = dyn_cast<SILArgument>(Op)) {
          if (Arg->isFunctionArg() && Arg->getArgumentConvention() ==
              SILArgumentConvention::Direct_Guaranteed) {
            BlockW.updateBenefit(Benefit, RefCountBenefit);
          }
        }
      }
    }
    // Don't count costs in blocks which are dead after inlining.
    SILBasicBlock *takenBlock = constTracker.getTakenBlock(block->getTerminator());
    if (takenBlock) {
      BlockW.updateBenefit(Benefit, RemovedTerminatorBenefit);
      // Only visit the taken successor; skip children that are reachable
      // solely through this (folded) terminator.
      domOrder.pushChildrenIf(block, [=] (SILBasicBlock *child) {
        return child->getSinglePredecessor() != block || child == takenBlock;
      });
    } else {
      domOrder.pushChildren(block);
    }
  }

  if (AI.getFunction()->isThunk()) {
    // Only inline trivial functions into thunks (which will not increase the
    // code size).
    if (CalleeCost > TrivialFunctionThreshold)
      return false;

    DEBUG(

      dumpCaller(AI.getFunction());
      llvm::dbgs() << " decision {" << CalleeCost << " into thunk} " <<
          Callee->getName() << '\n';
    );
    return true;
  }

  // We reduce the benefit if the caller is too large. For this we use a
  // cubic function on the number of caller blocks. This starts to prevent
  // inlining at about 800 - 1000 caller blocks.
  int blockMinus =
    (NumCallerBlocks * NumCallerBlocks) / BlockLimitDenominator *
        NumCallerBlocks / BlockLimitDenominator;
  Benefit -= blockMinus;

  // This is the final inlining decision.
  if (CalleeCost > Benefit) {
    return false;
  }

  NumCallerBlocks += Callee->size();

  DEBUG(
    dumpCaller(AI.getFunction());
    llvm::dbgs() << " decision {c=" << CalleeCost << ", b=" << Benefit <<
        ", l=" << SPA->getScopeLength(CalleeEntry, 0) <<
        ", c-w=" << CallerWeight << ", bb=" << Callee->size() <<
        ", c-bb=" << NumCallerBlocks << "} " << Callee->getName() << '\n';
  );
  return true;
}
|
|
|
|
/// Return true if inlining this call site into a cold block is profitable.
|
|
bool SILPerformanceInliner::isProfitableInColdBlock(FullApplySite AI,
|
|
SILFunction *Callee) {
|
|
if (Callee->getInlineStrategy() == AlwaysInline)
|
|
return true;
|
|
|
|
int CalleeCost = 0;
|
|
|
|
for (SILBasicBlock &Block : *Callee) {
|
|
for (SILInstruction &I : Block) {
|
|
CalleeCost += int(instructionInlineCost(I));
|
|
if (CalleeCost > TrivialFunctionThreshold)
|
|
return false;
|
|
}
|
|
}
|
|
DEBUG(
|
|
dumpCaller(AI.getFunction());
|
|
llvm::dbgs() << " cold decision {" << CalleeCost << "} " <<
|
|
Callee->getName() << '\n';
|
|
);
|
|
return true;
|
|
}
|
|
|
|
/// Record additional weight increases.
|
|
///
|
|
/// Why can't we just add the weight when we call isProfitableToInline? Because
|
|
/// the additional weight is for _another_ function than the current handled
|
|
/// callee.
|
|
static void addWeightCorrection(FullApplySite FAS,
|
|
llvm::DenseMap<FullApplySite, int> &WeightCorrections) {
|
|
SILFunction *Callee = FAS.getReferencedFunction();
|
|
if (Callee && Callee->hasSemanticsAttr("array.uninitialized")) {
|
|
// We want to inline the argument to an array.uninitialized call, because
|
|
// this argument is most likely a call to a function which contains the
|
|
// buffer allocation for the array. It is essential to inline it for stack
|
|
// promotion of the array buffer.
|
|
SILValue BufferArg = FAS.getArgument(0);
|
|
SILValue Base = stripValueProjections(stripCasts(BufferArg));
|
|
if (auto BaseApply = FullApplySite::isa(Base))
|
|
WeightCorrections[BaseApply] += 6;
|
|
}
|
|
}
|
|
|
|
/// Collect into \p Applies all full apply sites in \p Caller which are
/// eligible and profitable to inline, in dominance order. Candidates whose
/// callee appears more than CallsToCalleeThreshold times among the candidates
/// are dropped to limit code growth. Cold (slow-path) blocks are handled
/// separately by visitColdBlocks.
void SILPerformanceInliner::collectAppliesToInline(
    SILFunction *Caller, SmallVectorImpl<FullApplySite> &Applies) {
  DominanceInfo *DT = DA->get(Caller);
  SILLoopInfo *LI = LA->get(Caller);

  llvm::DenseMap<FullApplySite, int> WeightCorrections;

  // Compute the shortest-path analysis for the caller.
  ShortestPathAnalysis *SPA = getSPA(Caller, LI);
  SPA->analyze(CBI, [&](FullApplySite FAS) -> int {

    // This closure returns the length of a called function.

    // At this occasion we record additional weight increases.
    addWeightCorrection(FAS, WeightCorrections);

    if (SILFunction *Callee = getEligibleFunction(FAS)) {
      // Compute the shortest-path analysis for the callee.
      SILLoopInfo *CalleeLI = LA->get(Callee);
      ShortestPathAnalysis *CalleeSPA = getSPA(Callee, CalleeLI);
      if (!CalleeSPA->isValid()) {
        CalleeSPA->analyze(CBI, [](FullApplySite FAS) {
          // We don't compute SPA for another call-level. Functions called from
          // the callee are assumed to have DefaultApplyLength.
          return DefaultApplyLength;
        });
      }
      int CalleeLength = CalleeSPA->getScopeLength(&Callee->front(), 0);
      // Just in case the callee is a noreturn function.
      if (CalleeLength >= ShortestPathAnalysis::InitialDist)
        return DefaultApplyLength;
      return CalleeLength;
    }
    // Some unknown function.
    return DefaultApplyLength;
  });

#ifndef NDEBUG
  if (PrintShortestPathInfo) {
    SPA->dump();
  }
#endif

  ConstantTracker constTracker(Caller);
  DominanceOrder domOrder(&Caller->front(), DT, Caller->size());
  int NumCallerBlocks = (int)Caller->size();

  // Go through all instructions and find candidates for inlining.
  // We do this in dominance order for the constTracker.
  SmallVector<FullApplySite, 8> InitialCandidates;
  while (SILBasicBlock *block = domOrder.getNext()) {
    constTracker.beginBlock();
    // Lazily computed below, only when the block actually contains an
    // eligible apply.
    Weight BlockWeight;

    for (auto I = block->begin(), E = block->end(); I != E; ++I) {
      constTracker.trackInst(&*I);

      if (!FullApplySite::isa(&*I))
        continue;

      FullApplySite AI = FullApplySite(&*I);

      auto *Callee = getEligibleFunction(AI);
      if (Callee) {
        if (!BlockWeight.isValid())
          BlockWeight = SPA->getWeight(block, Weight(0, 0));

        // The actual weight including a possible weight correction.
        Weight W(BlockWeight, WeightCorrections.lookup(AI));

        if (isProfitableToInline(AI, W, constTracker, NumCallerBlocks))
          InitialCandidates.push_back(AI);
      }
    }
    domOrder.pushChildrenIf(block, [&] (SILBasicBlock *child) {
      if (CBI.isSlowPath(block, child)) {
        // Handle cold blocks separately.
        visitColdBlocks(InitialCandidates, child, DT);
        return false;
      }
      return true;
    });
  }

  // Calculate how many times a callee is called from this caller.
  llvm::DenseMap<SILFunction *, unsigned> CalleeCount;
  for (auto AI : InitialCandidates) {
    SILFunction *Callee = AI.getReferencedFunction();
    assert(Callee && "apply_inst does not have a direct callee anymore");
    CalleeCount[Callee]++;
  }

  // Now copy each candidate callee that has a small enough number of
  // call sites into the final set of call sites.
  for (auto AI : InitialCandidates) {
    SILFunction *Callee = AI.getReferencedFunction();
    assert(Callee && "apply_inst does not have a direct callee anymore");

    const unsigned CallsToCalleeThreshold = 1024;
    if (CalleeCount[Callee] <= CallsToCalleeThreshold)
      Applies.push_back(AI);
  }
}
|
|
|
|
/// \brief Attempt to inline all calls smaller than our threshold.
/// Returns true if a function was inlined.
///
/// Candidates are collected first (while the dominator information is still
/// valid) and only then inlined, so the CFG is not mutated during analysis.
bool SILPerformanceInliner::inlineCallsIntoFunction(SILFunction *Caller) {
  // Don't optimize functions that are marked with the opt.never attribute.
  if (!Caller->shouldOptimize())
    return false;

  // First step: collect all the functions we want to inline. We
  // don't change anything yet so that the dominator information
  // remains valid.
  SmallVector<FullApplySite, 8> AppliesToInline;
  collectAppliesToInline(Caller, AppliesToInline);

  if (AppliesToInline.empty())
    return false;

  // Second step: do the actual inlining.
  for (auto AI : AppliesToInline) {
    SILFunction *Callee = AI.getReferencedFunction();
    assert(Callee && "apply_inst does not have a direct callee anymore");

    if (!Callee->shouldOptimize()) {
      continue;
    }

    // Capture the apply arguments before the apply is consumed by inlining.
    SmallVector<SILValue, 8> Args;
    for (const auto &Arg : AI.getArguments())
      Args.push_back(Arg);

    DEBUG(
      dumpCaller(Caller);
      llvm::dbgs() << " inline [" << Callee->size() << "->" <<
          Caller->size() << "] " << Callee->getName() << "\n";
    );

    // Notice that we will skip all of the newly inlined ApplyInsts. That's
    // okay because we will visit them in our next invocation of the inliner.
    TypeSubstitutionMap ContextSubs;
    SILInliner Inliner(*Caller, *Callee,
                       SILInliner::InlineKind::PerformanceInline, ContextSubs,
                       AI.getSubstitutions());

    auto Success = Inliner.inlineFunction(AI, Args);
    (void) Success;
    // We've already determined we should be able to inline this, so
    // we expect it to have happened.
    assert(Success && "Expected inliner to inline this function!");

    // Clean up the (now dead) apply and anything it transitively leaves dead.
    recursivelyDeleteTriviallyDeadInstructions(AI.getInstruction(), true);

    NumFunctionsInlined++;
  }

  return true;
}
|
|
|
|
// Find functions in cold blocks which are forced to be inlined.
|
|
// All other functions are not inlined in cold blocks.
|
|
void SILPerformanceInliner::visitColdBlocks(
|
|
SmallVectorImpl<FullApplySite> &AppliesToInline, SILBasicBlock *Root,
|
|
DominanceInfo *DT) {
|
|
DominanceOrder domOrder(Root, DT);
|
|
while (SILBasicBlock *block = domOrder.getNext()) {
|
|
for (SILInstruction &I : *block) {
|
|
ApplyInst *AI = dyn_cast<ApplyInst>(&I);
|
|
if (!AI)
|
|
continue;
|
|
|
|
auto *Callee = getEligibleFunction(AI);
|
|
if (Callee && isProfitableInColdBlock(AI, Callee)) {
|
|
AppliesToInline.push_back(AI);
|
|
}
|
|
}
|
|
domOrder.pushChildren(block);
|
|
}
|
|
}
|
|
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// Performance Inliner Pass
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
namespace {
/// The function pass driving SILPerformanceInliner. One instance exists per
/// optimization-pipeline stage (Early/Middle/Late), differing only in which
/// attributed functions it refuses to inline.
class SILPerformanceInlinerPass : public SILFunctionTransform {
  /// Specifies which functions not to inline, based on @_semantics and
  /// global_init attributes.
  InlineSelection WhatToInline;
  std::string PassName;

public:
  SILPerformanceInlinerPass(InlineSelection WhatToInline, StringRef LevelName):
    WhatToInline(WhatToInline), PassName(LevelName) {
    PassName.append(" Performance Inliner");
  }

  void run() override {
    DominanceAnalysis *DA = PM->getAnalysis<DominanceAnalysis>();
    SILLoopAnalysis *LA = PM->getAnalysis<SILLoopAnalysis>();

    // An InlineThreshold of 0 disables inlining entirely.
    if (getOptions().InlineThreshold == 0) {
      return;
    }

    SILPerformanceInliner Inliner(WhatToInline, DA, LA);

    assert(getFunction()->isDefinition() &&
           "Expected only functions with bodies!");

    // Inline things into this function, and if we do so invalidate
    // analyses for this function and restart the pipeline so that we
    // can further optimize this function before attempting to inline
    // in it again.
    if (Inliner.inlineCallsIntoFunction(getFunction())) {
      invalidateAnalysis(SILAnalysis::InvalidationKind::FunctionBody);
      restartPassPipeline();
    }
  }

  StringRef getName() override { return PassName; }
};
} // end anonymous namespace
|
|
|
|
/// Create an inliner pass that does not inline functions that are marked with
|
|
/// the @_semantics, @effects or global_init attributes.
|
|
SILTransform *swift::createEarlyInliner() {
|
|
return new SILPerformanceInlinerPass(
|
|
InlineSelection::NoSemanticsAndGlobalInit, "Early");
|
|
}
|
|
|
|
/// Create an inliner pass that does not inline functions that are marked with
|
|
/// the global_init attribute or have an "availability" semantics attribute.
|
|
SILTransform *swift::createPerfInliner() {
|
|
return new SILPerformanceInlinerPass(InlineSelection::NoGlobalInit, "Middle");
|
|
}
|
|
|
|
/// Create an inliner pass that inlines all functions that are marked with
|
|
/// the @_semantics, @effects or global_init attributes.
|
|
SILTransform *swift::createLateInliner() {
|
|
return new SILPerformanceInlinerPass(InlineSelection::Everything, "Late");
|
|
}
|