//===--- ARCCodeMotion.cpp - SIL ARC Code Motion --------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
///
/// \file
///
/// This pass moves retains down and releases up. This, hopefully, will help
/// ARC sequence opt to remove retain and release pairs without worrying too
/// much about control flows.
///
/// It uses an optimistic iterative data flow to compute where to insert the
/// retains and releases for every reference-counted root. It then removes all
/// the old retain and release instructions and create the new ones.
///
/// This pass is more sophisticated than SILCodeMotion, as arc optimizations
/// can be very beneficial, use an optimistic global data flow to achieve
/// optimality.
///
/// Proof of Correctness:
/// -------------------
///
/// 1. Retains are blocked by MayDecrements. Its straightforward to prove that
/// retain sinking is correct. 
/// 
/// If a retain is sunk from Region A to Region B, that means there is no
/// blocking operation between where the retain was in Region A to where it is
/// sunk to in Region B. Since we only sink retains (we do not move any other
/// instructions) which themselves are NOT MayDecrement operations, and moving
/// retains can't turn non-decrement instruction MayDecrement.
///
/// 2. Releases are blocked by MayInterfere. If a release is hoisted from
/// Region B to Region A, that means there is no blocking operation from where
/// the release was in Region B and where the release is hoisted to in Region A.
///
/// The question is whether we can introduce such operation while we hoist
/// other releases. The answer is NO. because if such releases exist, they
/// would be blocked by the old release (we remove old release and recreate new
/// ones at the end of the pass) and will not be able to be hoisted beyond the
/// old release.
///
/// This proof also hinges on the fact that if release A interferes with
/// releases B then release B must interfere with release A. i.e. the 2
/// releases must have the symmetric property. Consider the 2 releases as 2
/// function calls, i.e. CallA (release A) and CallB (release B), if CallA
/// interferes with CallB, that means CallA must share some program states
/// (through read or write) with CallB. Then it is not possible for CallB
/// to not share any states with CallA. And if they do share states, then
/// its not possible for CallB to block CallA and CallA not to block CallB.
///
/// TODO: Sinking retains can block releases to be hoisted, and hoisting
/// releases can block retains to be sunk. Investigate when to sink retains and
/// when to hoist releases and their ordering in the pass pipeline.
///
/// TODO: Consider doing retain hoisting and release sinking. This can help
/// to discover disjoint lifetimes and we can try to stitch them together.
///
/// TODO: There are a lot of code duplications between retain and release code
/// motion in the data flow part. Consider whether we can share them.
/// Essentially, we can implement the release code motion by inverting the 
/// retain code motion, but this can also make the code less readable.
///
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "sil-rr-code-motion"
#include "swift/SIL/SILBuilder.h"
#include "swift/SILOptimizer/Analysis/AliasAnalysis.h"
#include "swift/SILOptimizer/Analysis/ARCAnalysis.h"
#include "swift/SILOptimizer/Analysis/EscapeAnalysis.h"
#include "swift/SILOptimizer/Analysis/PostOrderAnalysis.h"
#include "swift/SILOptimizer/Analysis/RCIdentityAnalysis.h"
#include "swift/SILOptimizer/PassManager/Passes.h"
#include "swift/SILOptimizer/PassManager/Transforms.h"
#include "swift/SILOptimizer/Utils/CFG.h"
#include "swift/SILOptimizer/Utils/Local.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"

using namespace swift;

STATISTIC(NumRetainsSunk, "Number of retains sunk");
STATISTIC(NumReleasesHoisted, "Number of releases hoisted");

llvm::cl::opt<bool> DisableARCCodeMotion("disable-arc-cm", llvm::cl::init(false));

/// Disable optimization if we have to break critical edges in the function.
llvm::cl::opt<bool>
DisableIfWithCriticalEdge("disable-with-critical-edge", llvm::cl::init(false));

//===----------------------------------------------------------------------===//
//                             Block State 
//===----------------------------------------------------------------------===//

struct BlockState {
  /// A bit vector for which the ith bit represents the ith refcounted root in
  /// RCRootVault.
  ///
  /// NOTE: we could do the data flow with BBSetIn or BBSetOut, but that would
  /// require us to create a temporary copy to check whether the BBSet has
  /// changed after the genset and killset has been applied.
  llvm::SmallBitVector BBSetIn;

  /// A bit vector for which the ith bit represents the ith refcounted root in
  /// RCRootVault.
  llvm::SmallBitVector BBSetOut;

  /// A bit vector for which the ith bit represents the ith refcounted root in
  /// RCRootVault. If the bit is set, that means this basic block creates a
  /// retain which can be sunk or a release which can be hoisted.
  llvm::SmallBitVector BBGenSet;

  /// A bit vector for which the ith bit represents the ith refcounted root in
  /// RCRootVault. If this bit is set, that means this basic block stops retain
  /// or release of the refcounted root to be moved across.
  llvm::SmallBitVector BBKillSet;

  /// A bit vector for which the ith bit represents the ith refcounted root in
  /// RCRootVault. If this bit is set, that means this is potentially a retain
  /// or release that can be sunk or hoisted to this point. This is used to
  /// optimize the time for computing genset and killset.
  ///
  /// NOTE: this vector contains an approximation of whether there will be a
  /// retain or release to a certain point of a basic block.
  llvm::SmallBitVector BBMaxSet;
};

/// CodeMotionContext - This is the base class which retain code motion and
/// release code motion inherits from. It defines an interface as to how the
/// code motion procedure should be.
class CodeMotionContext {
protected:
  /// Dataflow needs multiple iteration to converge. If this is false, then we
  /// do not need to generate the genset or killset, i.e. we can simply do 1
  /// pessimistic data flow iteration.
  bool MultiIteration;

  /// The allocator we are currently using.
  llvm::SpecificBumpPtrAllocator<BlockState> &BPA;

  /// Current function we are analyzing.
  SILFunction *F;

  /// Current post-order we are using.
  PostOrderFunctionInfo *PO;

  /// Current alias analysis we are using.
  AliasAnalysis *AA;

  /// Current rc-identity we are using.
  RCIdentityFunctionInfo *RCFI;

  /// All the unique refcount roots retained or released in the function.
  llvm::SetVector<SILValue> RCRootVault;

  /// Contains a map between RC roots to their index in the RCRootVault.
  /// used to facilitate fast RC roots to index lookup.
  llvm::DenseMap<SILValue, unsigned> RCRootIndex;

  /// All the retains or releases originally in the function. Eventually
  /// they will all be removed after all the new ones are generated.
  llvm::SmallPtrSet<SILInstruction *, 8> RCInstructions;

  /// All the places to place the new retains or releases after code motion.
  using InsertPointList = llvm::SmallVector<SILInstruction *, 2>;
  llvm::SmallDenseMap<SILValue, InsertPointList> InsertPoints;

  /// These are the blocks that have an RC instruction to process or it blocks
  /// some RC instructions. If the basic block has neither, we do not need to
  /// process the block again in the last iteration. We populate this set when
  /// we compute the genset and killset.
  llvm::SmallPtrSet<SILBasicBlock *, 8> InterestBlocks;

  /// Return the rc-identity root of the SILValue.
  SILValue getRCRoot(SILValue R) {
     return RCFI->getRCIdentityRoot(R);
  }

  /// Return the rc-identity root of the RC instruction, i.e.
  /// retain or release.
  SILValue getRCRoot(SILInstruction *I) {
    assert(isRetainInstruction(I) || isReleaseInstruction(I) &&
           "Extracting RC root from invalid instruction");
    return getRCRoot(I->getOperand(0));
  }

public:
  /// Constructor.
  CodeMotionContext(llvm::SpecificBumpPtrAllocator<BlockState> &BPA,
                    SILFunction *F,
                    PostOrderFunctionInfo *PO, AliasAnalysis *AA,
                    RCIdentityFunctionInfo *RCFI)
    : MultiIteration(true), BPA(BPA), F(F), PO(PO), AA(AA), RCFI(RCFI) {}

  /// virtual destructor.
  virtual ~CodeMotionContext() {}

  /// Run the data flow to move retains and releases.
  bool run();

  /// Check whether we need to run an optimistic iteration data flow.
  /// or a pessimistic would suffice.
  virtual bool requireIteration() = 0;

  /// Initialize necessary things to run the iterative data flow.
  virtual void initializeCodeMotionDataFlow() = 0;

  /// Initialize the basic block maximum refcounted set.
  virtual void initializeCodeMotionBBMaxSet() = 0;

  /// Compute the genset and killset for every root in every basic block.
  virtual void computeCodeMotionGenKillSet() = 0;

  /// Run the iterative data flow to converge.
  virtual void convergeCodeMotionDataFlow() = 0; 

  /// Use the data flow results, come up with places to insert the new inst.
  virtual void computeCodeMotionInsertPoints() = 0;

  /// Remove the old retains and create the new *moved* refcounted instructions
  virtual bool performCodeMotion() = 0;

  /// Merge the data flow states.
  virtual void mergeBBDataFlowStates(SILBasicBlock *BB) = 0;

  /// Compute the BBSetIn and BBSetOut for the current basic
  /// block with the generated gen and kill set.
  virtual bool processBBWithGenKillSet(SILBasicBlock *BB) = 0;

  /// Return true if the instruction blocks the Ptr to be moved further.
  virtual bool mayBlockCodeMotion(SILInstruction *II, SILValue Ptr) = 0;
};

bool CodeMotionContext::run() {
  // Initialize the data flow.
  initializeCodeMotionDataFlow();

  // Converge the BBSetOut with iterative data flow.
  if (MultiIteration) {
    initializeCodeMotionBBMaxSet();
    computeCodeMotionGenKillSet();
    convergeCodeMotionDataFlow();
  }

  // Compute the insertion point where each RC root can be moved to.
  computeCodeMotionInsertPoints();

  // Finally, generate new retains and remove the old retains.
  return performCodeMotion();
}

//===----------------------------------------------------------------------===//
//                          Retain Code Motion 
//===----------------------------------------------------------------------===//

class RetainBlockState : public BlockState {
public:
  /// Check whether the BBSetOut has changed. If it does, we need to rerun
  /// the data flow on this block's successors to reach fixed point.
  bool updateBBSetOut(llvm::SmallBitVector &X) {
    if (BBSetOut == X)
      return false;
    BBSetOut = X;
    return true;
  }

  /// constructor.
  RetainBlockState(bool IsEntry, unsigned size, bool MultiIteration) {
    // Iterative forward data flow.
    BBSetIn.resize(size, false);
    // Initialize to true if we are running optimistic data flow, i.e.
    // MultiIteration is true.
    BBSetOut.resize(size, MultiIteration);
    BBMaxSet.resize(size, !IsEntry && MultiIteration);
  
    // Genset and Killset are initially empty.
    BBGenSet.resize(size, false);
    BBKillSet.resize(size, false);
  }
};

/// RetainCodeMotionContext - Context to perform retain code motion.
class RetainCodeMotionContext : public CodeMotionContext {
  /// All the retain block state for all the basic blocks in the function. 
  llvm::SmallDenseMap<SILBasicBlock *, RetainBlockState *> BlockStates;

  /// Return true if the instruction blocks the Ptr to be moved further.
  bool mayBlockCodeMotion(SILInstruction *II, SILValue Ptr) override {
    // NOTE: If more checks are to be added, place the most expensive in the
    // end, this function is called many times.
    //
    // These terminator instructions block.
    if (isa<ReturnInst>(II) || isa<ThrowInst>(II) || isa<UnreachableInst>(II))
      return true;
    // Identical RC root blocks code motion, we will be able to move this retain
    // further once we move the blocking retain.
    if (isRetainInstruction(II) && getRCRoot(II) == Ptr)
      return true;
    // Ref count checks do not have side effects, but are barriers for retains.
    if (mayCheckRefCount(II))
      return true;
    // mayDecrement reference count stops code motion.
    if (mayDecrementRefCount(II, Ptr, AA)) 
      return true;
    // This instruction does not block the retain code motion.
    return false;
  }

  /// Return the previous instruction if it happens to be a retain with the
  /// given RC root, nullptr otherwise.
  SILInstruction *getPrevReusableInst(SILInstruction *I, SILValue Root) {
    if (&*I->getParent()->begin() == I)
      return nullptr;
    auto Prev = &*std::prev(SILBasicBlock::iterator(I));
    if (isRetainInstruction(Prev) && getRCRoot(Prev) == Root)
      return Prev;
    return nullptr;
  }

public:
  /// Constructor.
  RetainCodeMotionContext(llvm::SpecificBumpPtrAllocator<BlockState> &BPA,
                          SILFunction *F, PostOrderFunctionInfo *PO,
                          AliasAnalysis *AA, RCIdentityFunctionInfo *RCFI)
    : CodeMotionContext(BPA, F, PO, AA, RCFI) {
    MultiIteration = requireIteration();
  }

  /// virtual destructor.
  ~RetainCodeMotionContext() override {}

  /// Return true if we do not need optimistic data flow.
  bool requireIteration() override;

  /// Initialize necessary things to run the iterative data flow.
  void initializeCodeMotionDataFlow() override;

  /// Initialize the basic block maximum refcounted set.
  void initializeCodeMotionBBMaxSet() override;

  /// Compute the genset and killset for every root in every basic block.
  void computeCodeMotionGenKillSet() override;

  /// Run the iterative data flow to converge.
  void convergeCodeMotionDataFlow() override;

  /// Use the data flow results, come up with places to insert the new inst.
  void computeCodeMotionInsertPoints() override;

  /// Remove the old retains and create the new *moved* refcounted instructions
  bool performCodeMotion() override;

  /// Compute the BBSetIn and BBSetOut for the current basic block with the
  /// generated gen and kill set.
  bool processBBWithGenKillSet(SILBasicBlock *BB) override;

  /// Merge the data flow states.
  void mergeBBDataFlowStates(SILBasicBlock *BB) override;
};

bool RetainCodeMotionContext::requireIteration() {
  // If all basic blocks will have their predecessors processed if the basic
  // blocks in the functions are iterated in reverse post order. Then this
  // function can be processed in one iteration, i.e. no need to generate the
  // genset and killset.
  llvm::SmallPtrSet<SILBasicBlock *, 4> PBBs;
  for (SILBasicBlock *B : PO->getReversePostOrder()) {
    for (auto X : B->getPredecessorBlocks()) {
      if (!PBBs.count(X))
        return true;
    }
    PBBs.insert(B);
  }
  return false;
}

void RetainCodeMotionContext::initializeCodeMotionDataFlow() {
  // Find all the RC roots in the function.
  for (auto &BB : *F) {
    for (auto &II : BB) {
      if (!isRetainInstruction(&II))
        continue;
      RCInstructions.insert(&II);
      SILValue Root = getRCRoot(&II);
      if (RCRootIndex.find(Root) != RCRootIndex.end())
        continue;
      RCRootIndex[Root] = RCRootVault.size();
      RCRootVault.insert(Root);
    }
  }

  // Initialize all the data flow bit vector for all basic blocks.
  for (auto &BB : *F) {
    BlockStates[&BB] = new (BPA.Allocate())
                            RetainBlockState(&BB == &*F->begin(),
                            RCRootVault.size(), MultiIteration);
  }
}

void RetainCodeMotionContext::initializeCodeMotionBBMaxSet() {
  for (SILBasicBlock *BB : PO->getReversePostOrder()) {
    // If basic block has no predecessor, do nothing.
    BlockState *State = BlockStates[BB];
    if (BB->pred_empty()) {
      State->BBMaxSet.reset();
    } else {
      // Intersect in all predecessors' BBSetOut.
      State->BBMaxSet.set();
      for (auto E = BB->pred_end(), I = BB->pred_begin(); I != E; ++I) {
        State->BBMaxSet &= BlockStates[*I]->BBMaxSet;
     }
   }

   // Process the instructions in the basic block to find what refcounted
   // roots are retained. If we know that an RC root can't be retained at a
   // basic block, then we know we do not need to consider it for the killset.
   // NOTE: this is a conservative approximation, because some retains may be
   // blocked before it reaches this block.
   for (auto &II : *BB) {
      if (!isRetainInstruction(&II))
        continue;
      State->BBMaxSet.set(RCRootIndex[getRCRoot(&II)]);
    }
  }
}

void RetainCodeMotionContext::computeCodeMotionGenKillSet() {
  for (SILBasicBlock *BB : PO->getReversePostOrder()) {
    auto *State = BlockStates[BB];
    bool InterestBlock = false; 
    for (auto &I : *BB) {
      // Check whether this instruction blocks any RC root code motion.
      for (unsigned i = 0; i < RCRootVault.size(); ++i) {
        if (!State->BBMaxSet.test(i) || !mayBlockCodeMotion(&I, RCRootVault[i]))
          continue;
        // This is a blocking instruction for the rcroot.
        InterestBlock = true;
        State->BBKillSet.set(i);
        State->BBGenSet.reset(i);
      }
      // If this is a retain instruction, it also generates.
      if (isRetainInstruction(&I)) {
        unsigned idx = RCRootIndex[getRCRoot(&I)];
        State->BBGenSet.set(idx);
        assert(State->BBKillSet.test(idx) && "Killset computed incorrectly");
        State->BBKillSet.reset(idx);
        InterestBlock = true;
      }
    }

    // Is this a block that is interesting to the last iteration of the data
    // flow.
    if (!InterestBlock)
      continue;
    InterestBlocks.insert(BB);
  }
}

bool RetainCodeMotionContext::performCodeMotion() {
  bool Changed = false;
  // Create the new retain instructions.
  for (auto RC : RCRootVault) {
    auto Iter = InsertPoints.find(RC);
    if (Iter == InsertPoints.end())
      continue;
    for (auto IP : Iter->second) {
      // we are about to insert a new retain instruction before the insertion
      // point. Check if the previous instruction is reusable, reuse it, do
      // not insert new instruction and delete old one.
      if (auto I = getPrevReusableInst(IP, Iter->first)) {
        RCInstructions.erase(I);
        continue;
      }
      createIncrementBefore(Iter->first, IP);
      Changed = true;
    }
  }
  // Remove the old retain instructions.
  for (auto R : RCInstructions) {
    ++NumRetainsSunk;
    recursivelyDeleteTriviallyDeadInstructions(R, true);
  }
  return Changed;
}

void RetainCodeMotionContext::mergeBBDataFlowStates(SILBasicBlock *BB) {
  BlockState *State = BlockStates[BB];
  State->BBSetIn.reset();
  // If basic block has no predecessor, simply reset and return.
  if (BB->pred_empty())
    return;

  // Intersect in all predecessors' BBSetOuts.
  auto Iter = BB->pred_begin();
  State->BBSetIn = BlockStates[*Iter]->BBSetOut;
  Iter = std::next(Iter);
  for (auto E = BB->pred_end(); Iter != E; ++Iter) {
    State->BBSetIn &= BlockStates[*Iter]->BBSetOut;
  }
}

bool RetainCodeMotionContext::processBBWithGenKillSet(SILBasicBlock *BB) {
  RetainBlockState *State = BlockStates[BB];
  // Compute the BBSetOut at the end of the basic block.
  mergeBBDataFlowStates(BB);

  // Compute the BBSetIn at the beginning of the basic block.
  State->BBSetIn.reset(State->BBKillSet);
  State->BBSetIn |= State->BBGenSet;
 
  // If BBSetIn changes, then keep iterating until reached a fixed point.
  return State->updateBBSetOut(State->BBSetIn);
}

void RetainCodeMotionContext::convergeCodeMotionDataFlow() {
  // Process each basic block with the genset and killset. Every time the
  // BBSetOut of a basic block changes, the optimization is rerun on its
  // successors. 
  llvm::SmallVector<SILBasicBlock *, 16> WorkList;
  llvm::SmallPtrSet<SILBasicBlock *, 4> HandledBBs;
  // Push into reverse post order so that we can pop from the back and get
  // post order.
  for (SILBasicBlock *B : PO->getReversePostOrder()) {
    WorkList.push_back(B);
    HandledBBs.insert(B);
  }
  while (!WorkList.empty()) {
    SILBasicBlock *BB = WorkList.pop_back_val();
    HandledBBs.erase(BB);
    if (processBBWithGenKillSet(BB)) {
      for (auto &X : BB->getSuccessors()) {
        // We do not push basic block into the worklist if its already 
        // in the worklist.
        if (HandledBBs.count(X))
          continue;
        WorkList.push_back(X);
      }
    }
  }
}

void RetainCodeMotionContext::computeCodeMotionInsertPoints() {
  // The BBSetOuts have converged, run last iteration and figure out
  // insertion point for each refcounted root.
  for (SILBasicBlock *BB : PO->getReversePostOrder()) {
    mergeBBDataFlowStates(BB);
    RetainBlockState *S = BlockStates[BB];

    // Compute insertion point generated by the edge value transition.
    // If there is a transition from 1 to 0, that means we have a partial
    // merge, which means the retain can NOT be sunk to the current block,
    // so place it at the end of the predecessors.
    for (unsigned i = 0; i < RCRootVault.size(); ++i) {
      if (S->BBSetIn[i])
        continue;
      for (auto Pred : BB->getPredecessorBlocks()) {
        BlockState *PBB = BlockStates[Pred];
        if (!PBB->BBSetOut[i])
          continue;
        InsertPoints[RCRootVault[i]].push_back(Pred->getTerminator());
      }
    }

    // Is this block interesting. If we are sure this block does not generate
    // retains nor does it block any retains (i.e. no insertion point will be
    // created), we can skip it, as the BBSetOut has been converged if this is
    // a multi-iteration function.
    if (MultiIteration && !InterestBlocks.count(BB))
      continue;

    // Compute insertion point within the basic block. Process instructions in
    // the basic block in reverse post-order fashion.
    for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
      for (unsigned i = 0; i < RCRootVault.size(); ++i) {
        if (!S->BBSetIn[i] || !mayBlockCodeMotion(&*I, RCRootVault[i]))
          continue;
        S->BBSetIn.reset(i);
        InsertPoints[RCRootVault[i]].push_back(&*I);
      }

      // If this is a retain instruction, it also generates.
      if (isRetainInstruction(&*I)) {
        S->BBSetIn.set(RCRootIndex[getRCRoot(&*I)]);
      }
    }

    // Lastly update the BBSetOut, only necessary when we are running a single
    // iteration dataflow.
    if (!MultiIteration) {
      S->updateBBSetOut(S->BBSetIn);
    }
  }
}

//===----------------------------------------------------------------------===//
//                          Release Code Motion 
//===----------------------------------------------------------------------===//

class ReleaseBlockState : public BlockState {
public:
  /// Check whether the BBSetIn has changed. If it does, we need to rerun
  /// the data flow on this block's predecessors to reach fixed point.
  bool updateBBSetIn(llvm::SmallBitVector &X) {
    if (BBSetIn == X)
      return false;
    BBSetIn = X;
    return true;
  }

  /// constructor.
  ///
  /// If \p InitOptimistic is true, the block in-bits are initialized to 1
  /// which enables optimistic data flow evaluation.
  ReleaseBlockState(bool InitOptimistic, unsigned size) {
    // backward data flow.
    // Initialize to true if we are running optimistic data flow, i.e.
    // MultiIteration is true.
    BBSetIn.resize(size, InitOptimistic);
    BBSetOut.resize(size, false);
    BBMaxSet.resize(size, InitOptimistic);
  
    // Genset and Killset are initially empty.
    BBGenSet.resize(size, false);
    BBKillSet.resize(size, false);
  }
};

/// ReleaseCodeMotionContext - Context to perform release code motion.
class ReleaseCodeMotionContext : public CodeMotionContext {
  /// All the release block state for all the basic blocks in the function. 
  llvm::SmallDenseMap<SILBasicBlock *, ReleaseBlockState *> BlockStates;

  /// We are not moving epilogue releases.
  bool FreezeEpilogueReleases;

  /// The epilogue release matcher we are currently using.
  ConsumedArgToEpilogueReleaseMatcher &ERM;

  /// Return true if the instruction blocks the Ptr to be moved further.
  bool mayBlockCodeMotion(SILInstruction *II, SILValue Ptr) override {
    // NOTE: If more checks are to be added, place the most expensive in the end.
    // This function is called many times.
    //
    // We can not move a release above the instruction that defines the
    // released value.
    if (II == Ptr->getDefiningInstruction())
      return true;
    // Identical RC root blocks code motion, we will be able to move this release
    // further once we move the blocking release.
    if (isReleaseInstruction(II) && getRCRoot(II) == Ptr)
      return true;
    // Stop at may interfere.
    if (mayHaveSymmetricInterference(II, Ptr, AA))
      return true;
    // This instruction does not block the release.
    return false;
  }

  /// Return the successor instruction if it happens to be a release with the
  /// given RC root, nullptr otherwise.
  SILInstruction *getPrevReusableInst(SILInstruction *I, SILValue Root) {
    if (&*I->getParent()->begin() == I)
      return nullptr;
    auto Prev = &*std::prev(SILBasicBlock::iterator(I));
    if (isReleaseInstruction(Prev) && getRCRoot(Prev) == Root)
      return Prev;
    return nullptr;
  }

public:
  /// Constructor.
  ReleaseCodeMotionContext(llvm::SpecificBumpPtrAllocator<BlockState> &BPA,
                           SILFunction *F, PostOrderFunctionInfo *PO,
                           AliasAnalysis *AA, RCIdentityFunctionInfo *RCFI,
                           bool FreezeEpilogueReleases,
                           ConsumedArgToEpilogueReleaseMatcher &ERM)
    : CodeMotionContext(BPA, F, PO, AA, RCFI),
      FreezeEpilogueReleases(FreezeEpilogueReleases), ERM(ERM) {
    MultiIteration = requireIteration();
  } 

  /// virtual destructor.
  ~ReleaseCodeMotionContext() override {}

  /// Return true if the data flow can converge in 1 iteration.
  bool requireIteration() override;

  /// Initialize necessary things to run the iterative data flow.
  void initializeCodeMotionDataFlow() override;

  /// Initialize the basic block maximum refcounted set.
  void initializeCodeMotionBBMaxSet() override;

  /// Compute the genset and killset for every root in every basic block.
  void computeCodeMotionGenKillSet() override;

  /// Run the iterative data flow to converge.
  void convergeCodeMotionDataFlow() override;

  /// Use the data flow results, come up with places to insert the new inst.
  void computeCodeMotionInsertPoints() override;

  /// Remove the old retains and create the new *moved* refcounted instructions
  bool performCodeMotion() override;

  /// Compute the BBSetIn and BBSetOut for the current basic
  /// block with the generated gen and kill set.
  bool processBBWithGenKillSet(SILBasicBlock *BB) override;

  /// Merge the data flow states.
  void mergeBBDataFlowStates(SILBasicBlock *BB) override;
};

bool ReleaseCodeMotionContext::requireIteration() {
  // If all basic blocks will have their successors processed if the basic
  // blocks in the functions are iterated in post order. Then this function
  // can be processed in one iteration, i.e. no need to generate the genset
  // and killset.
  llvm::SmallPtrSet<SILBasicBlock *, 4> PBBs;
  for (SILBasicBlock *B : PO->getPostOrder()) {
    for (auto &X : B->getSuccessors()) {
      if (!PBBs.count(X))
        return true;
    }
    PBBs.insert(B);
  }
  return false;
}

void ReleaseCodeMotionContext::initializeCodeMotionDataFlow() {
  // All blocks which are initialized with 1-bits. These are all blocks which
  // eventually reach the function exit (return, throw), excluding the
  // function exit blocks themselves.
  // Optimistic initialization enables moving releases across loops. On the
  // other hand, blocks, which never reach the function exit, e.g. infinite
  // loop blocks, must be excluded. Otherwise we would end up inserting
  // completely unrelated release instructions in such blocks.
  llvm::SmallPtrSet<SILBasicBlock *, 32> BlocksInitOptimistically;

  llvm::SmallVector<SILBasicBlock *, 32> Worklist;
  
  // Find all the RC roots in the function.
  for (auto &BB : *F) {
    for (auto &II : BB) {
      if (!isReleaseInstruction(&II))
        continue;
      // Do not try to enumerate if we are not hoisting epilogue releases.
      if (FreezeEpilogueReleases && ERM.isEpilogueRelease(&II))
        continue;
      SILValue Root = getRCRoot(&II);
      RCInstructions.insert(&II);
      if (RCRootIndex.find(Root) != RCRootIndex.end())
        continue;
      RCRootIndex[Root] = RCRootVault.size();
      RCRootVault.insert(Root);
    }
    if (MultiIteration && BB.getTerminator()->isFunctionExiting())
      Worklist.push_back(&BB);
  }

  // Find all blocks from which there is a path to the function exit.
  // Note: the Worklist is empty if we are not in MultiIteration mode.
  while (!Worklist.empty()) {
    SILBasicBlock *BB = Worklist.pop_back_val();
    for (SILBasicBlock *Pred : BB->getPredecessorBlocks()) {
      if (BlocksInitOptimistically.insert(Pred).second)
        Worklist.push_back(Pred);
    }
  }

  // Initialize all the data flow bit vector for all basic blocks.
  for (auto &BB : *F) {
    BlockStates[&BB] = new (BPA.Allocate())
            ReleaseBlockState(BlocksInitOptimistically.count(&BB) != 0,
                              RCRootVault.size());
  }
}

void ReleaseCodeMotionContext::initializeCodeMotionBBMaxSet() {
  for (SILBasicBlock *BB : PO->getPostOrder()) {
    // If basic block has no successor, do nothing.
    BlockState *State = BlockStates[BB];
    if (BB->succ_empty()) {
      State->BBMaxSet.reset();
    } else {
      // Intersect in all successors' BBMaxOuts.
      State->BBMaxSet.set();
      for (auto E = BB->succ_end(), I = BB->succ_begin(); I != E; ++I) {
        State->BBMaxSet &= BlockStates[*I]->BBMaxSet;
     }
   }

   // Process the instructions in the basic block to find what refcounted
   // roots are released. If we know that an RC root can't be released at a
   // basic block, then we know we do not need to consider it for the killset.
   // NOTE: this is a conservative approximation, because some releases may be
   // blocked before it reaches this block.
   for (auto II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
      if (!isReleaseInstruction(&*II))
        continue;
      State->BBMaxSet.set(RCRootIndex[getRCRoot(&*II)]);
    }
  }
}

void ReleaseCodeMotionContext::computeCodeMotionGenKillSet() {
  for (SILBasicBlock *BB : PO->getPostOrder()) {
    auto *State = BlockStates[BB];
    bool InterestBlock = false;
    for (auto I = BB->rbegin(), E = BB->rend(); I != E; ++I) {
      // Check whether this instruction blocks any RC root code motion.
      for (unsigned i = 0; i < RCRootVault.size(); ++i) {
        if (!State->BBMaxSet.test(i) || !mayBlockCodeMotion(&*I, RCRootVault[i]))
          continue;
        // This instruction blocks this RC root.
        InterestBlock = true;
        State->BBKillSet.set(i);
        State->BBGenSet.reset(i);
      }

      // If this is an epilogue release and we are freezing epilogue release
      // simply continue.
      if (FreezeEpilogueReleases && ERM.isEpilogueRelease(&*I))
        continue;

      // If this is a release instruction, it also generates.
      if (isReleaseInstruction(&*I)) {
        unsigned idx = RCRootIndex[getRCRoot(&*I)];
        State->BBGenSet.set(idx);
        assert(State->BBKillSet.test(idx) && "Killset computed incorrectly");
        State->BBKillSet.reset(idx);
        InterestBlock = true;
      }
    }

    // Handle SILArgument, SILArgument can invalidate.
    for (unsigned i = 0; i < RCRootVault.size(); ++i) {
      auto *A = dyn_cast<SILArgument>(RCRootVault[i]);
      if (!A || A->getParent() != BB)
        continue;
      InterestBlock = true;
      State->BBKillSet.set(i);
      State->BBGenSet.reset(i);
    }

    // Is this interesting to the last iteration of the data flow.
    if (!InterestBlock)
      continue;
    InterestBlocks.insert(BB);
  }
}

void ReleaseCodeMotionContext::mergeBBDataFlowStates(SILBasicBlock *BB) {
  BlockState *State = BlockStates[BB];
  State->BBSetOut.reset();
  // If basic block has no successor, simply reset and return.
  if (BB->succ_empty())
    return;

  // Intersect in all successors' BBSetIn.
  auto Iter = BB->succ_begin();
  State->BBSetOut = BlockStates[*Iter]->BBSetIn;
  Iter = std::next(Iter);
  for (auto E = BB->succ_end(); Iter != E; ++Iter) {
    State->BBSetOut &= BlockStates[*Iter]->BBSetIn;
  }
}

bool ReleaseCodeMotionContext::performCodeMotion() {
  bool Changed = false;
  SmallVector<SILInstruction *, 8> NewReleases;
  // Create the new releases at each anchor point.
  for (auto RC : RCRootVault) {
    auto Iter = InsertPoints.find(RC);
    if (Iter == InsertPoints.end())
      continue;
    for (auto IP : Iter->second) {
      // we are about to insert a new release instruction before the insertion
      // point. Check if the successor instruction is reusable, reuse it, do
      // not insert new instruction and delete old one.
      if (auto I = getPrevReusableInst(IP, Iter->first)) {
        if (RCInstructions.erase(I))
          NewReleases.push_back(I);
        continue;
      }
      if (SILInstruction *I = createDecrementBefore(Iter->first, IP).getPtrOrNull())
        NewReleases.push_back(I);
      Changed = true;
    }
  }
  // Remove the old release instructions.
  for (auto R : RCInstructions) {
    ++NumReleasesHoisted;
    recursivelyDeleteTriviallyDeadInstructions(R, true);
  }

  // Eliminate pairs of retain-release if they are adjacent to each other and
  // retain/release the same RCRoot, e.g.
  //    strong_retain %2
  //    strong_release %2
  for (SILInstruction *ReleaseInst : NewReleases) {
    auto InstIter = ReleaseInst->getIterator();
    if (InstIter == ReleaseInst->getParent()->begin())
      continue;

    SILInstruction *PrevInst = &*std::prev(InstIter);
    if (isRetainInstruction(PrevInst) && getRCRoot(PrevInst) == getRCRoot(ReleaseInst)) {
      recursivelyDeleteTriviallyDeadInstructions(PrevInst, true);
      recursivelyDeleteTriviallyDeadInstructions(ReleaseInst, true);
    }
  }

  return Changed;
}

bool ReleaseCodeMotionContext::processBBWithGenKillSet(SILBasicBlock *BB) {
  ReleaseBlockState *State = BlockStates[BB];
  // Compute the BBSetOut at the end of the basic block.
  mergeBBDataFlowStates(BB);

  // Compute the BBSetIn at the beginning of the basic block.
  State->BBSetOut.reset(State->BBKillSet);
  State->BBSetOut |= State->BBGenSet;
 
  // If BBSetIn changes, then keep iterating until reached a fixed point.
  return State->updateBBSetIn(State->BBSetOut);
}

void ReleaseCodeMotionContext::convergeCodeMotionDataFlow() {
  // Process each basic block with the gen and kill set. Every time the
  // BBSetIn of a basic block changes, the optimization is rerun on its
  // predecessors.
  llvm::SmallVector<SILBasicBlock *, 16> WorkList;
  llvm::SmallPtrSet<SILBasicBlock *, 8> HandledBBs;
  // Push into reverse post order so that we can pop from the back and get
  // post order.
  for (SILBasicBlock *B : PO->getPostOrder()) {
    WorkList.push_back(B);
    HandledBBs.insert(B);
  }
  while (!WorkList.empty()) {
    SILBasicBlock *BB = WorkList.pop_back_val();
    HandledBBs.erase(BB);
    if (processBBWithGenKillSet(BB)) {
      for (auto X : BB->getPredecessorBlocks()) {
        // We do not push basic block into the worklist if its already 
        // in the worklist.
        if (HandledBBs.count(X))
          continue;
        WorkList.push_back(X);
      }
    }
  }
}

void ReleaseCodeMotionContext::computeCodeMotionInsertPoints() {
  // The BBSetIns have converged, run last iteration and figure out insertion
  // point for each RC root.
  for (SILBasicBlock *BB : PO->getPostOrder()) {
    // Intersect in the successor BBSetIns.
    mergeBBDataFlowStates(BB);
    ReleaseBlockState *S = BlockStates[BB];

    // Compute insertion point generated by the edge value transition.
    // If there is a transition from 1 to 0, that means we have a partial
    // merge, which means the release can NOT be hoisted to the current block.
    // place it at the successors.
    for (unsigned i = 0; i < RCRootVault.size(); ++i) {  
      if (S->BBSetOut[i])
        continue;
      for (auto &Succ : BB->getSuccessors()) {
        BlockState *SBB = BlockStates[Succ];
        if (!SBB->BBSetIn[i])
          continue;
        InsertPoints[RCRootVault[i]].push_back(&*(*Succ).begin());
      }
    }

    // Is this block interesting ?
    if (MultiIteration && !InterestBlocks.count(BB))
      continue;

    // Compute insertion point generated by MayUse terminator inst.
    // If terminator instruction can block the RC root. We will have no
    // choice but to anchor the release instructions in the successor blocks.
    for (unsigned i = 0; i < RCRootVault.size(); ++i) {
      SILInstruction *Term = BB->getTerminator();
      if (!S->BBSetOut[i] || !mayBlockCodeMotion(Term, RCRootVault[i]))
        continue;
      for (auto &Succ : BB->getSuccessors()) {
        BlockState *SBB = BlockStates[Succ];
        if (!SBB->BBSetIn[i])
          continue;
        InsertPoints[RCRootVault[i]].push_back(&*(*Succ).begin());
      }
      S->BBSetOut.reset(i);
    }

    // Compute insertion point generated within the basic block. Process
    // instructions in post-order fashion.
    for (auto I = std::next(BB->rbegin()), E = BB->rend(); I != E; ++I) {
      for (unsigned i = 0; i < RCRootVault.size(); ++i) {
        if (!S->BBSetOut[i] || !mayBlockCodeMotion(&*I, RCRootVault[i]))
          continue;
        auto *InsertPt = &*std::next(SILBasicBlock::iterator(&*I));
        InsertPoints[RCRootVault[i]].push_back(InsertPt);
        S->BBSetOut.reset(i);
      }

      // If we are freezing this epilogue release. Simply continue.
      if (FreezeEpilogueReleases && ERM.isEpilogueRelease(&*I))
        continue;

      // This release generates.
      if (isReleaseInstruction(&*I)) {
        S->BBSetOut.set(RCRootIndex[getRCRoot(&*I)]);
      }
    }

    // Compute insertion point generated by SILArgument. SILArgument blocks if
    // it defines the released value.
    for (unsigned i = 0; i < RCRootVault.size(); ++i) {
      if (!S->BBSetOut[i]) 
        continue;
      auto *A = dyn_cast<SILArgument>(RCRootVault[i]);
      if (!A || A->getParent() != BB)
        continue;
      InsertPoints[RCRootVault[i]].push_back(&*BB->begin());
      S->BBSetOut.reset(i);
    }
   
    // Lastly update the BBSetIn, only necessary when we are running a single
    // iteration dataflow.
    if (!MultiIteration) {
      S->updateBBSetIn(S->BBSetOut);
    }
  }
}

//===----------------------------------------------------------------------===//
//                           Top Level Entry Point
//===----------------------------------------------------------------------===//

namespace {

/// Code motion kind.
enum CodeMotionKind : unsigned { Retain = 0, Release = 1};

class ARCCodeMotion : public SILFunctionTransform {
  /// Whether to hoist releases or sink retains.
  CodeMotionKind Kind;

  /// Freeze epilogue release or not.
  bool FreezeEpilogueReleases;

public:
  /// Constructor.
  ARCCodeMotion(CodeMotionKind H, bool F) : Kind(H), FreezeEpilogueReleases(F) {}

  /// The entry point to the transformation.
  void run() override {
    // Code motion disabled.
    if (DisableARCCodeMotion)
      return;

    // Respect function no.optimize.
    SILFunction *F = getFunction();
    if (!F->shouldOptimize())
      return;

    // Return if there is critical edge and we are disabling critical edge
    // splitting.
    if (DisableIfWithCriticalEdge && hasCriticalEdges(*F, false))
      return;

    DEBUG(llvm::dbgs() << "*** ARCCM on function: " << F->getName() << " ***\n");

    PostOrderAnalysis *POA = PM->getAnalysis<PostOrderAnalysis>();

    // Split all critical edges.
    //
    // TODO: maybe we can do this lazily or maybe we should disallow SIL passes
    // to create critical edges.
    bool EdgeChanged = splitAllCriticalEdges(*F, false, nullptr, nullptr);
    if (EdgeChanged)
      POA->invalidateFunction(F);

    auto *PO = POA->get(F);
    auto *AA = PM->getAnalysis<AliasAnalysis>();
    auto *RCFI = PM->getAnalysis<RCIdentityAnalysis>()->get(F);

    llvm::SpecificBumpPtrAllocator<BlockState> BPA;
    bool InstChanged = false;
    if (Kind == Release) {
      // TODO: we should consider Throw block as well, or better we should
      // abstract the Return block or Throw block away in the matcher.
      SILArgumentConvention Conv[] = {SILArgumentConvention::Direct_Owned};
      ConsumedArgToEpilogueReleaseMatcher ERM(RCFI, F,
            Conv,
            ConsumedArgToEpilogueReleaseMatcher::ExitKind::Return);

      ReleaseCodeMotionContext RelCM(BPA, F, PO, AA, RCFI, 
                                     FreezeEpilogueReleases, ERM); 
      // Run release hoisting.
      InstChanged |= RelCM.run();
    } else {
      RetainCodeMotionContext RetCM(BPA, F, PO, AA, RCFI);
      // Run retain sinking.
      InstChanged |= RetCM.run();
    }

    if (EdgeChanged) {
      // We splitted critical edges.
      invalidateAnalysis(SILAnalysis::InvalidationKind::FunctionBody);
      return;
    }
    if (InstChanged) {
      // We moved instructions.
      invalidateAnalysis(SILAnalysis::InvalidationKind::Instructions);
    }
  }
};

} // end anonymous namespace

/// Sink Retains.
SILTransform *swift::createRetainSinking() {
  return new ARCCodeMotion(CodeMotionKind::Retain, false);
}

/// Hoist releases, but not epilogue release. ASO relies on epilogue releases
/// to prove knownsafety on enclosed releases.
SILTransform *swift::createReleaseHoisting() {
  return new ARCCodeMotion(CodeMotionKind::Release, true);
}

/// Hoist all releases.
SILTransform *swift::createLateReleaseHoisting() {
  return new ARCCodeMotion(CodeMotionKind::Release, false);
}