mirror of https://github.com/apple/swift.git (synced 2025-12-14 20:36:38 +01:00)
Instead of caching alias results globally for the module, make AliasAnalysis a FunctionAnalysisBase which caches the alias results per function.
Why?
* So far the result caches could only grow. They were reset when they reached a certain size. This was not ideal. Now, they are invalidated whenever the function changes.
* It was not possible to actually invalidate an alias analysis result. This is required, for example in TempRValueOpt and TempLValueOpt (so far it was done manually with invalidateInstruction).
* Type-based alias analysis results were also cached for the whole module, although they are actually function-dependent, because they depend on the function's resilience expansion. This was a potential bug.
I also added a new PassManager API to directly get a function-based analysis:
getAnalysis(SILFunction *f)
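For illustration, a function pass could query the per-function alias analysis roughly like this (a sketch only; the exact SILFunctionTransform hooks and template spelling are assumptions, not verbatim from the tree):

    // Hypothetical sketch inside a SILFunctionTransform's run() method.
    void run() override {
      SILFunction *f = getFunction();
      // The result is cached per function and invalidated when f changes.
      AliasAnalysis *AA = getPassManager()->getAnalysis<AliasAnalysis>(f);
      // ... use AA, e.g. AA->mayWriteToMemory(inst, addr) ...
    }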
The second change of this commit is the removal of the instruction-index indirection for the cache keys. The cache keys now work directly on instruction pointers instead of instruction indices, which reduces the number of hash table lookups per cache lookup from 3 to 1.
This indirection was needed to avoid dangling instruction pointers in the cache keys, but it is no longer needed because of the new delayed instruction deletion mechanism.
1705 lines
60 KiB
C++
//===--- RedundantLoadElimination.cpp - SIL Load Forwarding ---------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
///
/// \file
///
/// This pass eliminates redundant loads.
///
/// A load can be eliminated if its value has already been held somewhere,
/// i.e. generated by a previous load or stored to the LSLocation by a known
/// value.
///
/// In this case, one can replace the load instruction with the previous
/// result.
///
/// Redundant Load Elimination (RLE) eliminates such loads by:
///
/// 1. Introducing a notion of a LSLocation that is used to model object
/// fields. (See below for more details).
///
/// 2. Introducing a notion of a LSValue that is used to model the value
/// that currently resides in the associated LSLocation on the particular
/// program path. (See below for more details).
///
/// 3. Performing a RPO walk over the control flow graph, tracking any
/// LSLocations that are read from or stored into in each basic block. The
/// read or stored value, kept in a map between LSLocation and LSValue,
/// becomes the available value for the LSLocation.
///
/// 4. An optimistic iterative intersection-based dataflow is performed on the
/// gensets until convergence.
///
/// At the core of RLE, there is the LSLocation class. A LSLocation is an
/// abstraction of an object field in the program. It consists of a base and a
/// projection path to the field accessed.
///
/// In SIL, one can access an aggregate as a whole, i.e. store to a struct with
/// 2 Int fields. A store like this will generate 2 *indivisible* LSLocations,
/// 1 for each field, and in addition to keeping a list of LSLocations, RLE
/// also keeps their available LSValues. We call them *indivisible* because
/// they cannot be broken down into smaller LSLocations.
///
/// LSValue consists of a base - a SILValue from the load or store inst -
/// as well as a projection path to the field it represents. So, a
/// store to a 2-field struct as mentioned above will generate 2 LSLocations
/// and 2 LSValues.
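///
/// For illustration, a hypothetical SIL-like sketch (not taken from a real
/// function): a store of a struct S with two Int fields x and y
///
///   store %v to %addr : $*S
///
/// is modeled as two indivisible LSLocations, (%addr, #S.x) and (%addr, #S.y),
/// whose available LSValues are (%v, #S.x) and (%v, #S.y).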
///
/// Every basic block keeps a map between LSLocation and LSValue. By
/// keeping the LSLocation and LSValue in their indivisible form, one
/// can easily find which part of the load is redundant and how to compute its
/// forwarding value.
///
/// Given the case in which the 2 fields of the struct both have available
/// values, RLE can find their LSValues (maybe by struct_extract from a larger
/// value) and then aggregate them.
///
/// However, this may introduce a lot of extraction and aggregation which may
/// not be necessary, i.e. a store to the struct followed by a load from the
/// struct. To solve this problem, when RLE detects that a load instruction
/// can be replaced by a forwarded value, it will try to find the minimum
/// number of extractions necessary to form the forwarded value. It will group
/// the available values by the LSValue base, i.e. the LSValues that come from
/// the same instruction, and then use extraction to obtain the needed
/// components of the base.
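///
/// Continuing the hypothetical example above: if both (%addr, #S.x) and
/// (%addr, #S.y) are available from the same LSValue base %v, RLE can forward
/// %v directly instead of extracting both fields and re-aggregating them.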
///
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "sil-redundant-load-elim"
#include "swift/SIL/Projection.h"
#include "swift/SIL/SILArgument.h"
#include "swift/SIL/SILBuilder.h"
#include "swift/SIL/BasicBlockDatastructures.h"
#include "swift/SILOptimizer/Analysis/ARCAnalysis.h"
#include "swift/SILOptimizer/Analysis/AliasAnalysis.h"
#include "swift/SILOptimizer/Analysis/DeadEndBlocksAnalysis.h"
#include "swift/SILOptimizer/Analysis/DominanceAnalysis.h"
#include "swift/SILOptimizer/Analysis/PostOrderAnalysis.h"
#include "swift/SILOptimizer/Analysis/ValueTracking.h"
#include "swift/SILOptimizer/PassManager/Passes.h"
#include "swift/SILOptimizer/PassManager/Transforms.h"
#include "swift/SILOptimizer/Utils/CFGOptUtils.h"
#include "swift/SILOptimizer/Utils/InstOptUtils.h"
#include "swift/SILOptimizer/Utils/LoadStoreOptUtils.h"
#include "swift/SILOptimizer/Utils/SILSSAUpdater.h"
#include "swift/SIL/BasicBlockData.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"

using namespace swift;

STATISTIC(NumForwardedLoads, "Number of loads forwarded");

static AllocStackInst *findAllocStackInst(DeallocStackInst *I) {
|
|
// It's allowed to be undef in unreachable code.
|
|
return dyn_cast<AllocStackInst>(I->getOperand());
|
|
}
|
|
|
|
/// ComputeAvailSetMax - If we ignore all unknown writes, what is the max
/// available set that can reach a certain point in a basic block. This
/// helps generate the genset and killset, i.e. if there is no downward visible
/// value that can reach the end of a basic block, then we know that the genset
/// and killset for the location need not be set.
|
|
///
|
|
/// ComputeAvailGenKillSet - Build the genset and killset of the basic block.
|
|
///
|
|
/// ComputeAvailValue - Compute the available value at the end of the basic
|
|
/// block.
|
|
///
|
|
/// PerformRLE - Perform the actual redundant load elimination.
|
|
enum class RLEKind : unsigned {
|
|
ComputeAvailSetMax = 0,
|
|
ComputeAvailGenKillSet = 1,
|
|
ComputeAvailValue = 2,
|
|
PerformRLE = 3,
|
|
};
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// Utility Functions
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
static bool inline isComputeAvailSetMax(RLEKind Kind) {
|
|
return Kind == RLEKind::ComputeAvailSetMax;
|
|
}
|
|
|
|
static bool inline isComputeAvailGenKillSet(RLEKind Kind) {
|
|
return Kind == RLEKind::ComputeAvailGenKillSet;
|
|
}
|
|
|
|
static bool inline isComputeAvailValue(RLEKind Kind) {
|
|
return Kind == RLEKind::ComputeAvailValue;
|
|
}
|
|
|
|
static bool inline isPerformingRLE(RLEKind Kind) {
|
|
return Kind == RLEKind::PerformRLE;
|
|
}
|
|
|
|
/// Returns true if this is an instruction that may have side effects in a
/// general sense but is inert from a load store perspective.
|
|
static bool isRLEInertInstruction(SILInstruction *Inst) {
|
|
switch (Inst->getKind()) {
|
|
case SILInstructionKind::DeallocStackInst:
|
|
case SILInstructionKind::CondFailInst:
|
|
case SILInstructionKind::IsEscapingClosureInst:
|
|
case SILInstructionKind::IsUniqueInst:
|
|
case SILInstructionKind::EndCOWMutationInst:
|
|
case SILInstructionKind::FixLifetimeInst:
|
|
case SILInstructionKind::EndAccessInst:
|
|
case SILInstructionKind::SetDeallocatingInst:
|
|
case SILInstructionKind::DeallocRefInst:
|
|
case SILInstructionKind::BeginBorrowInst:
|
|
case SILInstructionKind::EndBorrowInst:
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// Basic Block Location State
|
|
//===----------------------------------------------------------------------===//
|
|
namespace {
|
|
|
|
/// If this function has too many basic blocks or too many locations, it may
/// take a long time to compute the genset and killset. The number of memory
/// behavior or alias queries we need to do in the worst case is roughly linear
/// in the number of basic blocks times the number of locations.
///
/// We could run RLE on functions with up to 128 basic blocks and 128
/// locations, which is a large function.
constexpr unsigned MaxLSLocationBBMultiplicationNone = 128*128;

/// We could run optimistic RLE on functions with fewer than 64 basic blocks
/// and 64 locations, which is a sizable function.
constexpr unsigned MaxLSLocationBBMultiplicationPessimistic = 64*64;

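// For illustration (hypothetical numbers; see getProcessFunctionKind below):
// a function with 100 blocks and 50 locations yields 100 * 50 = 5,000, which
// is above 64*64 = 4,096 but below 128*128 = 16,384, so only a single
// pessimistic iteration is run; a function with 200 blocks and 100 locations
// (20,000) is not processed at all.
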
/// Forward declaration.
|
|
class RLEContext;
|
|
|
|
/// State of the load store in one basic block which allows for forwarding from
|
|
/// loads, stores -> loads
|
|
class BlockState {
|
|
public:
|
|
enum class ValueState : unsigned {
|
|
CoverValues = 0,
|
|
ConcreteValues = 1,
|
|
CoverAndConcreteValues = 2,
|
|
};
|
|
|
|
private:
|
|
/// # of locations in the LocationVault.
|
|
unsigned LocationNum;
|
|
|
|
/// The basic block that we are optimizing.
|
|
SILBasicBlock *BB;
|
|
|
|
/// A bit vector for which the ith bit represents the ith LSLocation in
/// LocationVault. If the bit is set, then the location currently has a
/// downward visible value at the beginning of the basic block.
|
|
SmallBitVector ForwardSetIn;
|
|
|
|
/// A bit vector for which the ith bit represents the ith LSLocation in
/// LocationVault. If the bit is set, then the location currently has a
/// downward visible value at the end of the basic block.
|
|
SmallBitVector ForwardSetOut;
|
|
|
|
/// A bit vector for which the ith bit represents the ith LSLocation in
/// LocationVault. If we ignore all unknown writes, what's the maximum set
/// of available locations at the current position in the basic block.
|
|
SmallBitVector ForwardSetMax;
|
|
|
|
/// A bit vector for which the ith bit represents the ith LSLocation in
|
|
/// LocationVault. If the bit is set, then the basic block generates a
|
|
/// value for the location.
|
|
SmallBitVector BBGenSet;
|
|
|
|
/// A bit vector for which the ith bit represents the ith LSLocation in
|
|
/// LocationVault. If the bit is set, then the basic block kills the
|
|
/// value for the location.
|
|
SmallBitVector BBKillSet;
|
|
|
|
/// This is a map between LSLocations and their available values at the
/// beginning of this basic block.
|
|
ValueTableMap ForwardValIn;
|
|
|
|
/// This is a map between LSLocations and their available values at the end of
/// this basic block.
|
|
ValueTableMap ForwardValOut;
|
|
|
|
/// Keeps a list of replaceable instructions in the current basic block as
|
|
/// well as their SILValue replacement.
|
|
llvm::DenseMap<SingleValueInstruction *, SILValue> RedundantLoads;
|
|
|
|
/// LSLocation read or written has been extracted, expanded and mapped to the
|
|
/// bit position in the bitvector. Update it in the ForwardSetIn of the
|
|
/// current basic block.
|
|
void updateForwardSetForRead(RLEContext &Ctx, unsigned B);
|
|
void updateForwardSetForWrite(RLEContext &Ctx, unsigned B);
|
|
|
|
/// LSLocation read or written has been extracted, expanded and mapped to the
/// bit position in the bitvector. Update it in the genset and killset of the
/// current basic block.
|
|
void updateGenKillSetForRead(RLEContext &Ctx, unsigned B);
|
|
void updateGenKillSetForWrite(RLEContext &Ctx, unsigned B);
|
|
|
|
/// LSLocation read or written has been extracted, expanded and mapped to the
/// bit position in the bitvector. Update it in the MaxAvailForwardSet of the
/// current basic block.
|
|
void updateMaxAvailSetForRead(RLEContext &Ctx, unsigned B);
|
|
void updateMaxAvailSetForWrite(RLEContext &Ctx, unsigned B);
|
|
|
|
/// LSLocation written has been extracted, expanded and mapped to the bit
/// position in the bitvector. Process it using the bit position.
|
|
void updateForwardSetAndValForRead(RLEContext &Ctx, unsigned L, unsigned V);
|
|
void updateForwardSetAndValForWrite(RLEContext &Ctx, unsigned L, unsigned V);
|
|
|
|
/// There is a read to a LSLocation, expand the LSLocation into individual
|
|
/// fields before processing them.
|
|
void processRead(RLEContext &Ctx, SILInstruction *I, SILValue Mem,
|
|
SILValue Val, RLEKind Kind);
|
|
|
|
/// There is a write to a LSLocation, expand the LSLocation into individual
|
|
/// fields before processing them.
|
|
void processWrite(RLEContext &Ctx, SILInstruction *I, SILValue Mem,
|
|
SILValue Val, RLEKind Kind);
|
|
|
|
/// BitVector manipulation functions.
|
|
void startTrackingLocation(SmallBitVector &BV, unsigned B);
|
|
void stopTrackingLocation(SmallBitVector &BV, unsigned B);
|
|
bool isTrackingLocation(SmallBitVector &BV, unsigned B);
|
|
void startTrackingValue(ValueTableMap &VM, unsigned L, unsigned V);
|
|
void stopTrackingValue(ValueTableMap &VM, unsigned B);
|
|
|
|
public:
|
|
BlockState() = default;
|
|
|
|
void init(SILBasicBlock *NewBB, unsigned bitcnt, bool optimistic) {
|
|
BB = NewBB;
|
|
LocationNum = bitcnt;
|
|
// For reachable basic blocks, the initial state of ForwardSetOut should be
|
|
// all 1's. Otherwise the dataflow solution could be too conservative.
|
|
//
|
|
// Consider this case: the forwardable value from var a = 10 before the loop
// will not be forwarded if the ForwardSetOut is set to 0 initially.
|
|
//
|
|
// var a = 10
|
|
// for _ in 0...1024 {}
|
|
// use(a);
|
|
//
|
|
// However, by doing so, we can only do the data forwarding after the
|
|
// data flow stabilizes.
|
|
//
|
|
// We set the initial state of unreachable blocks to 0, as we do not have
// a value for the location.
//
// This is a bit conservative as we could be missing forwarding
// opportunities, i.e. a join block with 1 predecessor being an
// unreachable block.
//
// We rely on other passes to clean up unreachable blocks.
|
|
ForwardSetIn.resize(LocationNum, false);
|
|
ForwardSetOut.resize(LocationNum, optimistic);
|
|
|
|
// If we are running an optimistic data flow, set forward max to true
|
|
// initially.
|
|
ForwardSetMax.resize(LocationNum, optimistic);
|
|
|
|
BBGenSet.resize(LocationNum, false);
|
|
BBKillSet.resize(LocationNum, false);
|
|
}
|
|
|
|
/// Initialize the AvailSetMax by intersecting this basic block's
|
|
/// predecessors' AvailSetMax.
|
|
void mergePredecessorsAvailSetMax(RLEContext &Ctx);
|
|
|
|
/// Initialize the AvailSet by intersecting this basic block' predecessors'
|
|
/// AvailSet.
|
|
void mergePredecessorAvailSet(RLEContext &Ctx);
|
|
|
|
/// Initialize the AvailSet and AvailVal of the current basic block.
|
|
void mergePredecessorAvailSetAndValue(RLEContext &Ctx);
|
|
|
|
/// Reached the end of the basic block, update the ForwardValOut with the
|
|
/// ForwardValIn.
|
|
void updateForwardValOut() { ForwardValOut = ForwardValIn; }
|
|
|
|
/// Check whether the ForwardSetOut has changed. If it does, we need to
|
|
/// rerun the data flow to reach fixed point.
|
|
bool updateForwardSetOut() {
|
|
if (ForwardSetIn == ForwardSetOut)
|
|
return false;
|
|
ForwardSetOut = ForwardSetIn;
|
|
return true;
|
|
}
|
|
|
|
/// Returns the current basic block we are processing.
|
|
SILBasicBlock *getBB() const { return BB; }
|
|
|
|
/// Returns the ForwardValIn for the current basic block.
|
|
ValueTableMap &getForwardValIn() { return ForwardValIn; }
|
|
|
|
/// Returns the ForwardValOut for the current basic block.
|
|
ValueTableMap &getForwardValOut() { return ForwardValOut; }
|
|
|
|
/// Returns the redundant loads and their replacements in the current basic
/// block.
|
|
llvm::DenseMap<SingleValueInstruction *, SILValue> &getRL() {
|
|
return RedundantLoads;
|
|
}
|
|
|
|
/// Look into the value for the given LSLocation at the end of the basic block
/// and return one of the three ValueState types.
|
|
ValueState getValueStateAtEndOfBlock(RLEContext &Ctx, LSLocation &L);
|
|
|
|
/// Wrappers to query the value state of the location in this BlockState.
|
|
bool isCoverValues(RLEContext &Ctx, LSLocation &L) {
|
|
return getValueStateAtEndOfBlock(Ctx, L) == ValueState::CoverValues;
|
|
}
|
|
bool isConcreteValues(RLEContext &Ctx, LSLocation &L) {
|
|
return getValueStateAtEndOfBlock(Ctx, L) == ValueState::ConcreteValues;
|
|
}
|
|
|
|
/// Iterate over the instructions in the basic block in forward order and
|
|
/// process them w.r.t. the given \p Kind.
|
|
void processInstructionWithKind(RLEContext &Ctx, SILInstruction *I,
|
|
RLEKind Kind);
|
|
void processBasicBlockWithKind(RLEContext &Ctx, RLEKind Kind);
|
|
|
|
/// Process the current basic block with the genset and killset. Return true
|
|
/// if the ForwardSetOut changes.
|
|
bool processBasicBlockWithGenKillSet();
|
|
|
|
/// Set up the value for redundant load elimination.
|
|
bool setupRLE(RLEContext &Ctx, SILInstruction *I, SILValue Mem);
|
|
|
|
/// Process Instruction which writes to memory in an unknown way.
|
|
void processUnknownWriteInst(RLEContext &Ctx, SILInstruction *I,
|
|
RLEKind Kind);
|
|
void processUnknownWriteInstForGenKillSet(RLEContext &Ctx, SILInstruction *I);
|
|
void processUnknownWriteInstForRLE(RLEContext &Ctx, SILInstruction *I);
|
|
|
|
|
|
void processDeallocStackInst(RLEContext &Ctx, DeallocStackInst *I,
|
|
RLEKind Kind);
|
|
void processDeallocStackInstForGenKillSet(RLEContext &Ctx, DeallocStackInst *I);
|
|
void processDeallocStackInstForRLE(RLEContext &Ctx, DeallocStackInst *I);
|
|
|
|
/// Process LoadInst. Extract LSLocations from LoadInst.
|
|
void processLoadInst(RLEContext &Ctx, LoadInst *LI, RLEKind Kind);
|
|
|
|
/// Process StoreInst. Extract LSLocations from StoreInst.
|
|
void processStoreInst(RLEContext &Ctx, StoreInst *SI, RLEKind Kind);
|
|
|
|
/// Returns a *single* forwardable SILValue for the given LSLocation right
|
|
/// before the InsertPt instruction.
|
|
SILValue reduceValuesAtEndOfBlock(RLEContext &Ctx, LSLocation &L);
|
|
|
|
#ifndef NDEBUG
|
|
void dump(RLEContext &Ctx);
|
|
#endif
|
|
};
|
|
|
|
} // end anonymous namespace
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// RLEContext Interface
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
namespace {
|
|
|
|
/// This class stores global state that we use when computing redundant loads
/// and their replacements in each basic block.
|
|
class RLEContext {
|
|
enum class ProcessKind {
|
|
ProcessMultipleIterations = 0,
|
|
ProcessOneIteration = 1,
|
|
ProcessNone = 2,
|
|
};
|
|
private:
|
|
/// The function we are currently processing.
|
|
SILFunction *Fn;
|
|
|
|
/// The passmanager we are using.
|
|
SILPassManager *PM;
|
|
|
|
/// The alias analysis that we will use during all computations.
|
|
AliasAnalysis *AA;
|
|
|
|
/// The type expansion analysis we will use during all computations.
|
|
TypeExpansionAnalysis *TE;
|
|
|
|
/// The SSA updater we use to materialize covering values.
|
|
SILSSAUpdater Updater;
|
|
|
|
/// The range that we use to iterate over the post order and reverse post
|
|
/// order of the given function.
|
|
PostOrderFunctionInfo *PO;
|
|
|
|
/// Epilogue release analysis.
|
|
EpilogueARCFunctionInfo *EAFI;
|
|
|
|
/// Keeps all the locations for the current function. The BitVector in each
|
|
/// BlockState is then laid on top of it to keep track of which LSLocation
|
|
/// has a downward available value.
|
|
std::vector<LSLocation> LocationVault;
|
|
|
|
/// Contains a map between LSLocations and their indices in the LocationVault.
/// Used for fast lookup.
|
|
LSLocationIndexMap LocToBitIndex;
|
|
|
|
/// Keeps a map between the accessed SILValue and the location.
|
|
LSLocationBaseMap BaseToLocIndex;
|
|
|
|
/// Keeps all the load store values for the current function. The BitVector in
/// each BlockState is then laid on top of it to keep track of which LSLocation
/// has a downward available value.
|
|
std::vector<LSValue> LSValueVault;
|
|
|
|
/// Contains a map between LSValues and their indices in the LSValueVault.
/// Used for fast lookup.
|
|
llvm::DenseMap<LSValue, unsigned> ValToBitIndex;
|
|
|
|
/// A map from each BasicBlock to its BlockState.
|
|
BasicBlockData<BlockState> BBToLocState;
|
|
|
|
/// Keeps a list of basic blocks that have LoadInsts. If a basic block does
/// not have a LoadInst, we do not actually perform the last iteration where
/// RLE is actually performed on the basic block.
///
/// NOTE: This is never populated for functions which will only require 1
/// data flow iteration. For functions that require more than 1 iteration of
/// the data flow, this is populated the first time the function is
/// walked, i.e. when we generate the genset and killset.
|
|
BasicBlockSet BBWithLoads;
|
|
|
|
/// If set, RLE ignores loads from that array type.
|
|
NominalTypeDecl *ArrayType;
|
|
|
|
#ifndef NDEBUG
|
|
SILPrintContext printCtx;
|
|
#endif
|
|
|
|
public:
|
|
RLEContext(SILFunction *F, SILPassManager *PM, AliasAnalysis *AA,
|
|
TypeExpansionAnalysis *TE, PostOrderFunctionInfo *PO,
|
|
EpilogueARCFunctionInfo *EAFI, bool disableArrayLoads);
|
|
|
|
RLEContext(const RLEContext &) = delete;
|
|
RLEContext(RLEContext &&) = delete;
|
|
RLEContext &operator=(const RLEContext &) = delete;
|
|
RLEContext &operator=(RLEContext &&) = delete;
|
|
~RLEContext() = default;
|
|
|
|
/// Entry point to redundant load elimination.
|
|
bool run();
|
|
|
|
SILFunction *getFunction() const { return Fn; }
|
|
|
|
/// Use a set of ad hoc rules to tell whether we should run a pessimistic
|
|
/// one iteration data flow on the function.
|
|
ProcessKind getProcessFunctionKind(unsigned LoadCount, unsigned StoreCount);
|
|
|
|
/// Run the iterative data flow until convergence.
|
|
void runIterativeRLE();
|
|
|
|
/// Process the basic blocks for the gen and kill set.
|
|
void processBasicBlocksForGenKillSet();
|
|
|
|
/// Process the basic blocks with the gen and kill set.
|
|
void processBasicBlocksWithGenKillSet();
|
|
|
|
/// Process the basic block for values generated in the current basic
|
|
/// block.
|
|
void processBasicBlocksForAvailValue();
|
|
|
|
/// Process basic blocks to perform the redundant load elimination.
|
|
void processBasicBlocksForRLE(bool Optimistic);
|
|
|
|
/// Returns the alias analysis we will use during all computations.
|
|
AliasAnalysis *getAA() const { return AA; }
|
|
|
|
/// Returns the type expansion analysis we are using.
|
|
TypeExpansionAnalysis *getTE() const { return TE; }
|
|
|
|
/// Returns current epilogue release function info we are using.
|
|
EpilogueARCFunctionInfo *getEAFI() const { return EAFI; }
|
|
|
|
/// Returns the map from SILValue bases to LSLocations.
|
|
LSLocationBaseMap &getBM() { return BaseToLocIndex; }
|
|
|
|
/// Return the BlockState for the given basic block.
|
|
BlockState &getBlockState(SILBasicBlock *B) { return BBToLocState[B]; }
|
|
|
|
/// Get the bit representing the LSLocation in the LocationVault.
|
|
unsigned getLocationBit(const LSLocation &L);
|
|
|
|
/// Given the bit, get the LSLocation from the LocationVault.
|
|
LSLocation &getLocation(const unsigned index);
|
|
|
|
/// Get the bit representing the LSValue in the LSValueVault.
|
|
unsigned getValueBit(const LSValue &L);
|
|
|
|
/// Given the bit, get the LSValue from the LSValueVault.
|
|
LSValue &getValue(const unsigned index);
|
|
|
|
/// Given a LSLocation, try to collect all the LSValues for this LSLocation
|
|
/// in the given basic block. If part of the locations have covering values,
|
|
/// find the values in its predecessors.
|
|
bool collectLocationValues(SILBasicBlock *BB, LSLocation &L,
|
|
LSLocationValueMap &Values, ValueTableMap &VM);
|
|
|
|
/// Transitively collect all the values that make up this location and
|
|
/// create a SILArgument out of them.
|
|
SILValue computePredecessorLocationValue(SILBasicBlock *BB, LSLocation &L);
|
|
|
|
/// Returns the LoadInst if \p Inst is a load inst we want to handle.
|
|
LoadInst *isLoadInstToHandle(SILInstruction *Inst) {
|
|
if (auto *LI = dyn_cast<LoadInst>(Inst)) {
|
|
if (!ArrayType ||
|
|
LI->getType().getNominalOrBoundGenericNominal() != ArrayType) {
|
|
return LI;
|
|
}
|
|
}
|
|
return nullptr;
|
|
}
|
|
};
|
|
|
|
} // end anonymous namespace
|
|
|
|
void BlockState::startTrackingValue(ValueTableMap &VM, unsigned L, unsigned V) {
|
|
VM[L] = V;
|
|
}
|
|
|
|
void BlockState::stopTrackingValue(ValueTableMap &VM, unsigned B) {
|
|
VM.erase(B);
|
|
}
|
|
|
|
bool BlockState::isTrackingLocation(SmallBitVector &BV, unsigned B) {
|
|
return BV.test(B);
|
|
}
|
|
|
|
void BlockState::startTrackingLocation(SmallBitVector &BV, unsigned B) {
|
|
BV.set(B);
|
|
}
|
|
|
|
void BlockState::stopTrackingLocation(SmallBitVector &BV, unsigned B) {
|
|
BV.reset(B);
|
|
}
|
|
|
|
void BlockState::mergePredecessorsAvailSetMax(RLEContext &Ctx) {
|
|
if (BB->pred_empty()) {
|
|
ForwardSetMax.reset();
|
|
return;
|
|
}
|
|
|
|
auto Iter = BB->pred_begin();
|
|
ForwardSetMax = Ctx.getBlockState(*Iter).ForwardSetMax;
|
|
Iter = std::next(Iter);
|
|
for (auto EndIter = BB->pred_end(); Iter != EndIter; ++Iter) {
|
|
ForwardSetMax &= Ctx.getBlockState(*Iter).ForwardSetMax;
|
|
}
|
|
}
|
|
|
|
void BlockState::mergePredecessorAvailSet(RLEContext &Ctx) {
|
|
// Clear the state if the basic block has no predecessor.
|
|
if (BB->getPredecessorBlocks().begin() == BB->getPredecessorBlocks().end()) {
|
|
ForwardSetIn.reset();
|
|
return;
|
|
}
|
|
|
|
auto Iter = BB->pred_begin();
|
|
ForwardSetIn = Ctx.getBlockState(*Iter).ForwardSetOut;
|
|
Iter = std::next(Iter);
|
|
for (auto EndIter = BB->pred_end(); Iter != EndIter; ++Iter) {
|
|
ForwardSetIn &= Ctx.getBlockState(*Iter).ForwardSetOut;
|
|
}
|
|
}
|
|
|
|
void BlockState::mergePredecessorAvailSetAndValue(RLEContext &Ctx) {
|
|
// Clear the state if the basic block has no predecessor.
|
|
if (BB->getPredecessorBlocks().begin() == BB->getPredecessorBlocks().end()) {
|
|
ForwardSetIn.reset();
|
|
ForwardValIn.clear();
|
|
return;
|
|
}
|
|
|
|
auto Iter = BB->pred_begin();
|
|
ForwardSetIn = Ctx.getBlockState(*Iter).ForwardSetOut;
|
|
ForwardValIn = Ctx.getBlockState(*Iter).ForwardValOut;
|
|
Iter = std::next(Iter);
|
|
for (auto EndIter = BB->pred_end(); Iter != EndIter; ++Iter) {
|
|
BlockState &OtherState = Ctx.getBlockState(*Iter);
|
|
ForwardSetIn &= OtherState.ForwardSetOut;
|
|
|
|
// Merge in the predecessor state.
|
|
for (unsigned i = 0; i < LocationNum; ++i) {
|
|
if (OtherState.ForwardSetOut[i]) {
|
|
// There are multiple values from multiple predecessors, set this as
|
|
// a covering value. We do not need to track the value itself, as we
|
|
// can always go to the predecessors BlockState to find it.
|
|
ForwardValIn[i] = Ctx.getValueBit(LSValue(true));
|
|
continue;
|
|
}
|
|
// If this location does not have an available value in this predecessor,
// then clear it.
|
|
stopTrackingValue(ForwardValIn, i);
|
|
stopTrackingLocation(ForwardSetIn, i);
|
|
}
|
|
}
|
|
}
|
|
|
|
void BlockState::processBasicBlockWithKind(RLEContext &Ctx, RLEKind Kind) {
|
|
// Iterate over instructions in forward order.
|
|
for (auto &II : *BB) {
|
|
processInstructionWithKind(Ctx, &II, Kind);
|
|
}
|
|
}
|
|
|
|
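/// Apply this block's transfer function to the incoming state. A sketch of the
/// equation computed below: ForwardSetOut = (ForwardSetIn & ~BBKillSet) | BBGenSet,
/// where ForwardSetIn has already been merged from the predecessors' ForwardSetOut.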
bool BlockState::processBasicBlockWithGenKillSet() {
|
|
ForwardSetIn.reset(BBKillSet);
|
|
ForwardSetIn |= BBGenSet;
|
|
return updateForwardSetOut();
|
|
}
|
|
|
|
SILValue BlockState::reduceValuesAtEndOfBlock(RLEContext &Ctx, LSLocation &L) {
|
|
// First, collect current available locations and their corresponding values
|
|
// into a map.
|
|
LSLocationValueMap Values;
|
|
|
|
LSLocationList Locs;
|
|
LSLocation::expand(L, &BB->getModule(),
|
|
TypeExpansionContext(*BB->getParent()), Locs, Ctx.getTE());
|
|
|
|
// Find the values that this basic block defines and the locations which
|
|
// we do not have a concrete value in the current basic block.
|
|
ValueTableMap &OTM = getForwardValOut();
|
|
for (unsigned i = 0; i < Locs.size(); ++i) {
|
|
auto Val = Ctx.getValue(OTM[Ctx.getLocationBit(Locs[i])]);
|
|
auto AvailVal = makeCopiedValueAvailable(Val.getBase(), BB);
|
|
Values[Locs[i]] = LSValue(AvailVal, Val.getPath().getValue());
|
|
}
|
|
|
|
// Second, reduce the available values into a single SILValue we can use to
|
|
// forward.
|
|
SILValue TheForwardingValue =
|
|
LSValue::reduce(L, &BB->getModule(), Values, BB->getTerminator());
|
|
/// Return the forwarding value.
|
|
return TheForwardingValue;
|
|
}
|
|
|
|
bool BlockState::setupRLE(RLEContext &Ctx, SILInstruction *I, SILValue Mem) {
|
|
// Try to construct a SILValue for the current LSLocation.
|
|
//
|
|
// Collect the locations and their corresponding values into a map.
|
|
LSLocation L;
|
|
LSLocationBaseMap &BaseToLocIndex = Ctx.getBM();
|
|
if (BaseToLocIndex.find(Mem) != BaseToLocIndex.end()) {
|
|
L = BaseToLocIndex[Mem];
|
|
} else {
|
|
SILValue UO = getUnderlyingObject(Mem);
|
|
L = LSLocation(UO, ProjectionPath::getProjectionPath(UO, Mem));
|
|
}
|
|
|
|
LSLocationValueMap Values;
|
|
// Use the ForwardValIn as we are currently processing the basic block.
|
|
if (!Ctx.collectLocationValues(I->getParent(), L, Values, getForwardValIn()))
|
|
return false;
|
|
|
|
// Reduce the available values into a single SILValue we can use to forward.
|
|
SILModule *Mod = &I->getModule();
|
|
SILValue TheForwardingValue =
|
|
LSValue::reduce(L, Mod, Values, I);
|
|
|
|
if (!TheForwardingValue)
|
|
return false;
|
|
|
|
// Now we have the forwarding value, record it for forwarding!
|
|
//
|
|
// NOTE: we do not perform the RLE right here because doing so could introduce
|
|
// new LSLocations.
|
|
//
|
|
// e.g.
|
|
// %0 = load %x
|
|
// %1 = load %x
|
|
// %2 = extract_struct %1, #a
|
|
// %3 = load %2
|
|
//
|
|
// If we perform the RLE and replace %1 with %0, we end up having a memory
|
|
// location we do not have before, i.e. Base == %0, and Path == #a.
|
|
//
|
|
// We may be able to add the LSLocation to the vault, but it gets
|
|
// complicated very quickly, e.g. we need to resize the bit vectors size,
|
|
// etc.
|
|
//
|
|
// However, since we already know the instruction to replace and the value to
// replace it with, we can record it for now and forward it after all the
// forwardable values are recorded in the function.
|
|
//
|
|
RedundantLoads[cast<SingleValueInstruction>(I)] = TheForwardingValue;
|
|
|
|
LLVM_DEBUG(llvm::dbgs() << "FORWARD " << TheForwardingValue << " to" << *I);
|
|
return true;
|
|
}
|
|
|
|
void BlockState::updateForwardSetForRead(RLEContext &Ctx, unsigned B) {
|
|
startTrackingLocation(ForwardSetIn, B);
|
|
}
|
|
|
|
void BlockState::updateGenKillSetForRead(RLEContext &Ctx, unsigned B) {
|
|
startTrackingLocation(BBGenSet, B);
|
|
stopTrackingLocation(BBKillSet, B);
|
|
}
|
|
|
|
void BlockState::updateForwardSetAndValForRead(RLEContext &Ctx, unsigned L,
|
|
unsigned V) {
|
|
// Track the new location and value.
|
|
startTrackingValue(ForwardValIn, L, V);
|
|
startTrackingLocation(ForwardSetIn, L);
|
|
}
|
|
|
|
void BlockState::updateGenKillSetForWrite(RLEContext &Ctx, unsigned B) {
|
|
// This is a store, invalidate any location that this location may alias, as
|
|
// their values can no longer be forwarded.
|
|
LSLocation &R = Ctx.getLocation(B);
|
|
for (unsigned i = 0; i < LocationNum; ++i) {
|
|
if (!isTrackingLocation(ForwardSetMax, i))
|
|
continue;
|
|
LSLocation &L = Ctx.getLocation(i);
|
|
if (!L.isMayAliasLSLocation(R, Ctx.getAA()))
|
|
continue;
|
|
// MayAlias, invalidate the location.
|
|
stopTrackingLocation(BBGenSet, i);
|
|
startTrackingLocation(BBKillSet, i);
|
|
}
|
|
|
|
// Start tracking this location.
|
|
startTrackingLocation(BBGenSet, B);
|
|
stopTrackingLocation(BBKillSet, B);
|
|
}
|
|
|
|
void BlockState::updateMaxAvailSetForWrite(RLEContext &Ctx, unsigned B) {
|
|
startTrackingLocation(ForwardSetMax, B);
|
|
}
|
|
|
|
void BlockState::updateMaxAvailSetForRead(RLEContext &Ctx, unsigned B) {
|
|
startTrackingLocation(ForwardSetMax, B);
|
|
}
|
|
|
|
void BlockState::updateForwardSetForWrite(RLEContext &Ctx, unsigned B) {
|
|
// This is a store, invalidate any location that this location may alias, as
|
|
// their values can no longer be forwarded.
|
|
LSLocation &R = Ctx.getLocation(B);
|
|
for (unsigned i = 0; i < LocationNum; ++i) {
|
|
if (!isTrackingLocation(ForwardSetIn, i))
|
|
continue;
|
|
LSLocation &L = Ctx.getLocation(i);
|
|
if (!L.isMayAliasLSLocation(R, Ctx.getAA()))
|
|
continue;
|
|
// MayAlias, invalidate the location.
|
|
stopTrackingLocation(ForwardSetIn, i);
|
|
}
|
|
|
|
// Start tracking this location.
|
|
startTrackingLocation(ForwardSetIn, B);
|
|
}
|
|
|
|
void BlockState::updateForwardSetAndValForWrite(RLEContext &Ctx, unsigned L,
|
|
unsigned V) {
|
|
// This is a store, invalidate any location that this location may alias, as
|
|
// their values can no longer be forwarded.
|
|
LSLocation &R = Ctx.getLocation(L);
|
|
for (unsigned i = 0; i < LocationNum; ++i) {
|
|
if (!isTrackingLocation(ForwardSetIn, i))
|
|
continue;
|
|
LSLocation &L = Ctx.getLocation(i);
|
|
if (!L.isMayAliasLSLocation(R, Ctx.getAA()))
|
|
continue;
|
|
// MayAlias, invalidate the location and value.
|
|
stopTrackingValue(ForwardValIn, i);
|
|
stopTrackingLocation(ForwardSetIn, i);
|
|
}
|
|
|
|
// Start tracking this location and value.
|
|
startTrackingLocation(ForwardSetIn, L);
|
|
startTrackingValue(ForwardValIn, L, V);
|
|
}
|
|
|
|
void BlockState::processWrite(RLEContext &Ctx, SILInstruction *I, SILValue Mem,
|
|
SILValue Val, RLEKind Kind) {
|
|
// Initialize the LSLocation.
|
|
LSLocation L;
|
|
LSLocationBaseMap &BaseToLocIndex = Ctx.getBM();
|
|
if (BaseToLocIndex.find(Mem) != BaseToLocIndex.end()) {
|
|
L = BaseToLocIndex[Mem];
|
|
} else {
|
|
SILValue UO = getUnderlyingObject(Mem);
|
|
L = LSLocation(UO, ProjectionPath::getProjectionPath(UO, Mem));
|
|
}
|
|
|
|
// If we can't figure out the Base or Projection Path for the write,
|
|
// process it as an unknown memory instruction.
|
|
if (!L.isValid()) {
|
|
// we can ignore unknown store instructions if we are computing the
|
|
// AvailSetMax.
|
|
if (!isComputeAvailSetMax(Kind)) {
|
|
processUnknownWriteInst(Ctx, I, Kind);
|
|
}
|
|
return;
|
|
}
|
|
|
|
auto *Fn = I->getFunction();
|
|
// Expand the given location and val into individual fields and process
|
|
// them as separate writes.
|
|
LSLocationList Locs;
|
|
LSLocation::expand(L, &I->getModule(), TypeExpansionContext(*Fn), Locs,
|
|
Ctx.getTE());
|
|
|
|
if (isComputeAvailSetMax(Kind)) {
|
|
for (unsigned i = 0; i < Locs.size(); ++i) {
|
|
updateMaxAvailSetForWrite(Ctx, Ctx.getLocationBit(Locs[i]));
|
|
}
|
|
return;
|
|
}
|
|
|
|
// Are we computing the genset and killset ?
|
|
if (isComputeAvailGenKillSet(Kind)) {
|
|
for (unsigned i = 0; i < Locs.size(); ++i) {
|
|
updateGenKillSetForWrite(Ctx, Ctx.getLocationBit(Locs[i]));
|
|
}
|
|
return;
|
|
}
|
|
|
|
// Are we computing available value or performing RLE?
|
|
LSValueList Vals;
|
|
LSValue::expand(Val, &I->getModule(), TypeExpansionContext(*Fn), Vals,
|
|
Ctx.getTE());
|
|
if (isComputeAvailValue(Kind) || isPerformingRLE(Kind)) {
|
|
for (unsigned i = 0; i < Locs.size(); ++i) {
|
|
updateForwardSetAndValForWrite(Ctx, Ctx.getLocationBit(Locs[i]),
|
|
Ctx.getValueBit(Vals[i]));
|
|
}
|
|
return;
|
|
}
|
|
|
|
llvm_unreachable("Unknown RLE compute kind");
|
|
}
|
|
|
|
void BlockState::processRead(RLEContext &Ctx, SILInstruction *I, SILValue Mem,
|
|
SILValue Val, RLEKind Kind) {
|
|
// Initialize the LSLocation.
|
|
LSLocation L;
|
|
LSLocationBaseMap &BaseToLocIndex = Ctx.getBM();
|
|
if (BaseToLocIndex.find(Mem) != BaseToLocIndex.end()) {
|
|
L = BaseToLocIndex[Mem];
|
|
} else {
|
|
SILValue UO = getUnderlyingObject(Mem);
|
|
L = LSLocation(UO, ProjectionPath::getProjectionPath(UO, Mem));
|
|
}
|
|
|
|
// If we can't figure out the Base or Projection Path for the read, simply
|
|
// ignore it for now.
|
|
if (!L.isValid())
|
|
return;
|
|
|
|
auto *Fn = I->getFunction();
|
|
// Expand the given LSLocation and Val into individual fields and process
|
|
// them as separate reads.
|
|
LSLocationList Locs;
|
|
LSLocation::expand(L, &I->getModule(), TypeExpansionContext(*Fn), Locs,
|
|
Ctx.getTE());
|
|
|
|
if (isComputeAvailSetMax(Kind)) {
|
|
for (unsigned i = 0; i < Locs.size(); ++i) {
|
|
updateMaxAvailSetForRead(Ctx, Ctx.getLocationBit(Locs[i]));
|
|
}
|
|
return;
|
|
}
|
|
|
|
// Are we computing the genset and killset.
|
|
if (isComputeAvailGenKillSet(Kind)) {
|
|
for (unsigned i = 0; i < Locs.size(); ++i) {
|
|
updateGenKillSetForRead(Ctx, Ctx.getLocationBit(Locs[i]));
|
|
}
|
|
return;
|
|
}
|
|
|
|
// Are we computing available values ?.
|
|
bool CanForward = true;
|
|
LSValueList Vals;
|
|
LSValue::expand(Val, &I->getModule(), TypeExpansionContext(*Fn), Vals,
|
|
Ctx.getTE());
|
|
if (isComputeAvailValue(Kind) || isPerformingRLE(Kind)) {
|
|
for (unsigned i = 0; i < Locs.size(); ++i) {
|
|
if (isTrackingLocation(ForwardSetIn, Ctx.getLocationBit(Locs[i])))
|
|
continue;
|
|
updateForwardSetAndValForRead(Ctx, Ctx.getLocationBit(Locs[i]),
|
|
Ctx.getValueBit(Vals[i]));
|
|
// We can not perform the forwarding as we are at least missing
|
|
// some pieces of the read location.
|
|
CanForward = false;
|
|
}
|
|
}
|
|
|
|
// Simply return if we are not performing RLE or we do not have all the
|
|
// values available to perform RLE.
|
|
if (!isPerformingRLE(Kind) || !CanForward)
|
|
return;
|
|
|
|
// Lastly, forward value to the load.
|
|
setupRLE(Ctx, I, Mem);
|
|
}
|
|
|
|
void BlockState::processStoreInst(RLEContext &Ctx, StoreInst *SI, RLEKind Kind) {
|
|
processWrite(Ctx, SI, SI->getDest(), SI->getSrc(), Kind);
|
|
}
|
|
|
|
void BlockState::processLoadInst(RLEContext &Ctx, LoadInst *LI, RLEKind Kind) {
|
|
processRead(Ctx, LI, LI->getOperand(), SILValue(LI), Kind);
|
|
}
|
|
|
|
void BlockState::processUnknownWriteInstForGenKillSet(RLEContext &Ctx,
|
|
SILInstruction *I) {
|
|
auto *AA = Ctx.getAA();
|
|
for (unsigned i = 0; i < LocationNum; ++i) {
|
|
if (!isTrackingLocation(ForwardSetMax, i))
|
|
continue;
|
|
// Invalidate any location this instruction may write to.
|
|
//
|
|
// TODO: checking may alias with Base is overly conservative,
|
|
// we should check may alias with base plus projection path.
|
|
LSLocation &R = Ctx.getLocation(i);
|
|
if (!AA->mayWriteToMemory(I, R.getBase()))
|
|
continue;
|
|
// MayAlias.
|
|
stopTrackingLocation(BBGenSet, i);
|
|
startTrackingLocation(BBKillSet, i);
|
|
}
|
|
}
|
|
|
|
void BlockState::processUnknownWriteInstForRLE(RLEContext &Ctx,
|
|
SILInstruction *I) {
|
|
auto *AA = Ctx.getAA();
|
|
for (unsigned i = 0; i < LocationNum; ++i) {
|
|
if (!isTrackingLocation(ForwardSetIn, i))
|
|
continue;
|
|
// Invalidate any location this instruction may write to.
|
|
//
|
|
// TODO: checking may alias with Base is overly conservative,
|
|
// we should check may alias with base plus projection path.
|
|
LSLocation &R = Ctx.getLocation(i);
|
|
if (!AA->mayWriteToMemory(I, R.getBase()))
|
|
continue;
|
|
// MayAlias.
|
|
stopTrackingLocation(ForwardSetIn, i);
|
|
stopTrackingValue(ForwardValIn, i);
|
|
}
|
|
}
|
|
|
|
void BlockState::processUnknownWriteInst(RLEContext &Ctx, SILInstruction *I,
|
|
RLEKind Kind) {
|
|
// If this is a release on a guaranteed parameter, it can not call deinit,
|
|
// which might read or write memory.
|
|
if (isIntermediateRelease(I, Ctx.getEAFI()))
|
|
return;
|
|
|
|
// Are we computing the genset and killset ?
|
|
if (isComputeAvailGenKillSet(Kind)) {
|
|
processUnknownWriteInstForGenKillSet(Ctx, I);
|
|
return;
|
|
}
|
|
|
|
// Are we computing the available value or doing RLE ?
|
|
if (isComputeAvailValue(Kind) || isPerformingRLE(Kind)) {
|
|
processUnknownWriteInstForRLE(Ctx, I);
|
|
return;
|
|
}
|
|
|
|
llvm_unreachable("Unknown RLE compute kind");
|
|
}
|
|
|
|
void BlockState::
|
|
processDeallocStackInstForGenKillSet(RLEContext &Ctx, DeallocStackInst *I) {
|
|
SILValue ASI = findAllocStackInst(I);
|
|
for (unsigned i = 0; i < LocationNum; ++i) {
|
|
LSLocation &R = Ctx.getLocation(i);
|
|
if (R.getBase() != ASI)
|
|
continue;
|
|
// MayAlias.
|
|
stopTrackingLocation(BBGenSet, i);
|
|
startTrackingLocation(BBKillSet, i);
|
|
}
|
|
}
|
|
|
|
void BlockState::
|
|
processDeallocStackInstForRLE(RLEContext &Ctx, DeallocStackInst *I) {
|
|
SILValue ASI = findAllocStackInst(I);
|
|
for (unsigned i = 0; i < LocationNum; ++i) {
|
|
LSLocation &R = Ctx.getLocation(i);
|
|
if (R.getBase() != ASI)
|
|
continue;
|
|
// MayAlias.
|
|
stopTrackingLocation(ForwardSetIn, i);
|
|
stopTrackingValue(ForwardValIn, i);
|
|
}
|
|
}
|
|
|
|
void BlockState::
|
|
processDeallocStackInst(RLEContext &Ctx, DeallocStackInst *I, RLEKind Kind) {
|
|
// Are we computing the genset and killset ?
|
|
if (isComputeAvailGenKillSet(Kind)) {
|
|
processDeallocStackInstForGenKillSet(Ctx, I);
|
|
return;
|
|
}
|
|
|
|
// Are we computing the available value or doing RLE ?
|
|
if (isComputeAvailValue(Kind) || isPerformingRLE(Kind)) {
|
|
processDeallocStackInstForRLE(Ctx, I);
|
|
return;
|
|
}
|
|
|
|
llvm_unreachable("Unknown RLE compute kind");
|
|
}
|
|
|
|
|
|
void BlockState::processInstructionWithKind(RLEContext &Ctx,
|
|
SILInstruction *Inst,
|
|
RLEKind Kind) {
|
|
// This is a StoreInst, try to see whether it clobbers any forwarding value
|
|
if (auto *SI = dyn_cast<StoreInst>(Inst)) {
|
|
processStoreInst(Ctx, SI, Kind);
|
|
return;
|
|
}
|
|
|
|
// This is a LoadInst. Let's see if we can find a previous loaded, stored
|
|
// value to use instead of this load.
|
|
if (auto *LI = Ctx.isLoadInstToHandle(Inst)) {
|
|
processLoadInst(Ctx, LI, Kind);
|
|
return;
|
|
}
|
|
|
|
if (auto *DSI = dyn_cast<DeallocStackInst>(Inst)) {
|
|
processDeallocStackInst(Ctx, DSI, Kind);
|
|
return;
|
|
}
|
|
|
|
// If this instruction has side effects, but is inert from a load store
|
|
// perspective, skip it.
|
|
if (isRLEInertInstruction(Inst))
|
|
return;
|
|
|
|
// If this instruction does not read or write memory, we can skip it.
|
|
if (!Inst->mayReadOrWriteMemory())
|
|
return;
|
|
|
|
// If we have an instruction that may write to memory and we cannot prove
|
|
// that it and its operands cannot alias a load we have visited,
|
|
// invalidate that load.
|
|
if (Inst->mayWriteToMemory()) {
|
|
LLVM_DEBUG(llvm::dbgs() << "WRITE " << *Inst);
|
|
processUnknownWriteInst(Ctx, Inst, Kind);
|
|
return;
|
|
}
|
|
LLVM_DEBUG(llvm::dbgs() << "READ " << *Inst);
|
|
}
|
|
|
|
RLEContext::ProcessKind
|
|
RLEContext::
|
|
getProcessFunctionKind(unsigned LoadCount, unsigned StoreCount) {
|
|
// Don't optimize functions that are marked as 'no.optimize'.
|
|
if (!Fn->shouldOptimize())
|
|
return ProcessKind::ProcessNone;
|
|
|
|
// Really no point optimizing here as there are no forwardable loads.
|
|
if (LoadCount + StoreCount < 2)
|
|
return ProcessKind::ProcessNone;
|
|
|
|
bool RunOneIteration = true;
|
|
unsigned BBCount = 0;
|
|
unsigned LocationCount = LocationVault.size();
|
|
|
|
if (LocationCount == 0)
|
|
return ProcessKind::ProcessNone;
|
|
|
|
// If every basic block has its predecessors processed before it when
// the basic blocks in the function are iterated in reverse post order,
// then this function can be processed in one iteration, i.e. there is no
// need to generate the genset and killset.
|
|
auto *PO = PM->getAnalysis<PostOrderAnalysis>()->get(Fn);
|
|
BasicBlockSet HandledBBs(Fn);
|
|
for (SILBasicBlock *B : PO->getReversePostOrder()) {
|
|
++BBCount;
|
|
for (SILBasicBlock *pred : B->getPredecessorBlocks()) {
|
|
if (!HandledBBs.contains(pred)) {
|
|
RunOneIteration = false;
|
|
break;
|
|
}
|
|
}
|
|
HandledBBs.insert(B);
|
|
}
|
|
|
|
// Data flow may take too long to run.
|
|
if (BBCount * LocationCount > MaxLSLocationBBMultiplicationNone)
|
|
return ProcessKind::ProcessNone;
|
|
|
|
// This function's data flow would converge in 1 iteration.
|
|
if (RunOneIteration)
|
|
return ProcessKind::ProcessOneIteration;
|
|
|
|
// We run one pessimistic data flow to do redundant load elimination on
// the function.
|
|
if (BBCount * LocationCount > MaxLSLocationBBMultiplicationPessimistic)
|
|
return ProcessKind::ProcessOneIteration;
|
|
|
|
return ProcessKind::ProcessMultipleIterations;
|
|
}
|
|
|
|
#ifndef NDEBUG
|
|
void BlockState::dump(RLEContext &Ctx) {
|
|
for (unsigned i = 0; i < LocationNum; ++i) {
|
|
if (!isTrackingLocation(ForwardSetMax, i))
|
|
continue;
|
|
|
|
llvm::dbgs() << "Loc #" << i << ":" << (BBGenSet[i] ? " Gen" : "")
|
|
<< (BBKillSet[i] ? " Kill" : "");
|
|
if (!ForwardSetIn.empty() && ForwardSetIn.test(i)) {
|
|
llvm::dbgs() << " IN ";
|
|
ValueTableMap::const_iterator inIter = ForwardValIn.find(i);
|
|
if (inIter != ForwardValIn.end()) {
|
|
if (SILValue base = Ctx.getValue(inIter->second).getBase())
|
|
llvm::dbgs() << base;
|
|
else
|
|
llvm::dbgs() << "no base";
|
|
}
|
|
}
|
|
if (!ForwardSetOut.empty() && ForwardSetOut.test(i)) {
|
|
llvm::dbgs() << " OUT ";
|
|
ValueTableMap::const_iterator outIter = ForwardValOut.find(i);
|
|
if (outIter != ForwardValOut.end()) {
|
|
if (SILValue base = Ctx.getValue(outIter->second).getBase())
|
|
llvm::dbgs() << base;
|
|
else
|
|
llvm::dbgs() << "no base";
|
|
}
|
|
}
|
|
llvm::dbgs() << "\n";
|
|
}
|
|
}
|
|
#endif
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// RLEContext Implementation
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
RLEContext::RLEContext(SILFunction *F, SILPassManager *PM, AliasAnalysis *AA,
|
|
TypeExpansionAnalysis *TE, PostOrderFunctionInfo *PO,
|
|
EpilogueARCFunctionInfo *EAFI, bool disableArrayLoads)
|
|
: Fn(F), PM(PM), AA(AA), TE(TE), PO(PO), EAFI(EAFI), BBToLocState(F),
|
|
BBWithLoads(F),
|
|
ArrayType(disableArrayLoads
|
|
? F->getModule().getASTContext().getArrayDecl()
|
|
: nullptr)
|
|
#ifndef NDEBUG
|
|
,
|
|
printCtx(llvm::dbgs(), /*Verbose=*/false, /*Sorted=*/true)
|
|
#endif
|
|
{
|
|
}
|
|
|
|
LSLocation &RLEContext::getLocation(const unsigned index) {
|
|
return LocationVault[index];
|
|
}
|
|
|
|
unsigned RLEContext::getLocationBit(const LSLocation &Loc) {
|
|
// Return the bit position of the given Loc in the LocationVault. The bit
|
|
// position is then used to set/reset the bitvector kept by each BlockState.
|
|
//
|
|
// We should have the location populated by the enumerateLSLocation at this
|
|
// point.
|
|
auto Iter = LocToBitIndex.find(Loc);
|
|
assert(Iter != LocToBitIndex.end() && "Location should have been enum'ed");
|
|
return Iter->second;
|
|
}
|
|
|
|
LSValue &RLEContext::getValue(const unsigned index) {
|
|
return LSValueVault[index];
|
|
}
|
|
|
|
unsigned RLEContext::getValueBit(const LSValue &Val) {
|
|
// Return the bit position of the given Val in the LSValueVault. The bit
|
|
// position is then used to set/reset the bitvector kept by each BlockState.
|
|
auto Iter = ValToBitIndex.find(Val);
|
|
|
|
// We do not walk over the function and find all the possible LSValues
// in this function, as some of these values will not be used, i.e.
// if the LoadInst that generates this value is actually RLE'ed.
// Instead, we create the LSValues when we need them.
|
|
if (Iter == ValToBitIndex.end()) {
|
|
ValToBitIndex[Val] = LSValueVault.size();
|
|
LSValueVault.push_back(Val);
|
|
return ValToBitIndex[Val];
|
|
}
|
|
return Iter->second;
|
|
}
|
|
|
|
BlockState::ValueState BlockState::getValueStateAtEndOfBlock(RLEContext &Ctx,
|
|
LSLocation &L) {
|
|
// Find number of covering value and concrete values for the locations
|
|
// expanded from the given location.
|
|
unsigned CSCount = 0, CTCount = 0;
|
|
LSLocationList Locs;
|
|
LSLocation::expand(L, &BB->getModule(),
|
|
TypeExpansionContext(*BB->getParent()), Locs, Ctx.getTE());
|
|
|
|
ValueTableMap &OTM = getForwardValOut();
|
|
for (auto &X : Locs) {
|
|
LSValue &V = Ctx.getValue(OTM[Ctx.getLocationBit(X)]);
|
|
if (V.isCoveringValue()) {
|
|
++CSCount;
|
|
continue;
|
|
}
|
|
++CTCount;
|
|
}
|
|
|
|
if (CSCount == Locs.size())
|
|
return ValueState::CoverValues;
|
|
if (CTCount == Locs.size())
|
|
return ValueState::ConcreteValues;
|
|
return ValueState::CoverAndConcreteValues;
|
|
}
|
|
|
|
SILValue RLEContext::computePredecessorLocationValue(SILBasicBlock *BB,
|
|
LSLocation &L) {
|
|
llvm::SmallVector<std::pair<SILBasicBlock *, SILValue>, 8> Values;
|
|
BasicBlockWorklist WorkList(Fn);
|
|
|
|
// Push in all the predecessors to get started.
|
|
for (auto Pred : BB->getPredecessorBlocks()) {
|
|
WorkList.pushIfNotVisited(Pred);
|
|
}
|
|
|
|
while (SILBasicBlock *CurBB = WorkList.pop()) {
|
|
BlockState &Forwarder = getBlockState(CurBB);
|
|
|
|
// There are 3 cases that can happen here.
|
|
//
|
|
// 1. The current basic block contains concrete values for the entire
|
|
// location.
|
|
// 2. The current basic block contains covering values for the entire
|
|
// location.
|
|
// 3. The current basic block contains concrete value for part of the
|
|
// location and covering values for the rest.
|
|
//
|
|
// This BlockState contains concrete values for all the expanded
|
|
// locations, collect and reduce them into a single value in the current
|
|
// basic block.
|
|
if (Forwarder.isConcreteValues(*this, L)) {
|
|
Values.push_back({CurBB, Forwarder.reduceValuesAtEndOfBlock(*this, L)});
|
|
continue;
|
|
}
|
|
|
|
// This BlockState does not contain concrete value for any of the expanded
|
|
// locations, collect in this block's predecessors.
|
|
if (Forwarder.isCoverValues(*this, L)) {
|
|
for (auto Pred : CurBB->getPredecessorBlocks()) {
|
|
WorkList.pushIfNotVisited(Pred);
|
|
}
|
|
continue;
|
|
}
|
|
|
|
// This block contains concrete values for some but not all the expanded
|
|
// locations, recursively call collectLocationValues to materialize the
|
|
// value that reaches this basic block.
|
|
LSLocationValueMap LSValues;
|
|
if (!collectLocationValues(CurBB, L, LSValues, Forwarder.getForwardValOut()))
|
|
return SILValue();
|
|
|
|
// Reduce the available values into a single SILValue we can use to forward
|
|
SILInstruction *IPt = CurBB->getTerminator();
|
|
Values.push_back({CurBB, LSValue::reduce(L, &BB->getModule(), LSValues, IPt)});
|
|
}
|
|
|
|
auto ownershipRange =
|
|
makeTransformRange(llvm::make_range(Values.begin(), Values.end()),
|
|
[](std::pair<SILBasicBlock *, SILValue> v) {
|
|
return v.second.getOwnershipKind();
|
|
});
|
|
|
|
auto mergedOwnershipKind = ValueOwnershipKind::merge(ownershipRange);
|
|
|
|
// Finally, collect all the values for the SILArgument, materialize it using
|
|
// the SSAUpdater.
|
|
Updater.initialize(
|
|
L.getType(&BB->getModule(), TypeExpansionContext(*BB->getParent()))
|
|
.getObjectType(),
|
|
mergedOwnershipKind);
|
|
|
|
SmallVector<SILPhiArgument *, 8> insertedPhis;
|
|
Updater.setInsertedPhis(&insertedPhis);
|
|
|
|
for (auto V : Values) {
|
|
Updater.addAvailableValue(V.first, V.second);
|
|
}
|
|
|
|
auto Val = Updater.getValueInMiddleOfBlock(BB);
|
|
|
|
for (auto *phi : insertedPhis) {
|
|
if (phi == Val) {
|
|
continue;
|
|
}
|
|
// Fix lifetime of intermediate phis
|
|
SmallVector<SILBasicBlock *, 4> userBBs;
|
|
for (auto use : phi->getUses()) {
|
|
userBBs.push_back(use->getParentBlock());
|
|
}
|
|
endLifetimeAtLeakingBlocks(phi, userBBs);
|
|
}
|
|
return makeNewValueAvailable(Val, BB);
|
|
}
|
|
|
|
bool RLEContext::collectLocationValues(SILBasicBlock *BB, LSLocation &L,
|
|
LSLocationValueMap &Values,
|
|
ValueTableMap &VM) {
|
|
LSLocationList CSLocs;
|
|
LSLocationList Locs;
|
|
LSLocation::expand(L, &BB->getModule(),
|
|
TypeExpansionContext(*BB->getParent()), Locs, TE);
|
|
|
|
auto *Mod = &BB->getModule();
|
|
// Find the locations that this basic block defines and the locations which
|
|
// we do not have a concrete value in the current basic block.
|
|
for (auto &X : Locs) {
|
|
auto Val = getValue(VM[getLocationBit(X)]);
|
|
if (!Val.isCoveringValue()) {
|
|
auto AvailValue =
|
|
makeCopiedValueAvailable(Val.getBase(), BB);
|
|
Values[X] = LSValue(AvailValue, Val.getPath().getValue());
|
|
continue;
|
|
}
|
|
Values[X] = Val;
|
|
CSLocs.push_back(X);
|
|
}
|
|
|
|
// For locations which we do not have concrete values for in this basic
|
|
// block, try to reduce it to the minimum # of locations possible, this
|
|
// will help us to generate as few SILArguments as possible.
|
|
LSLocation::reduce(L, Mod, TypeExpansionContext(*BB->getParent()), CSLocs);
|
|
|
|
// To handle covering value, we need to go to the predecessors and
|
|
// materialize them there.
|
|
for (auto &X : CSLocs) {
|
|
SILValue V = computePredecessorLocationValue(BB, X);
|
|
if (!V)
|
|
return false;
|
|
// We've constructed a concrete value for the covering value. Expand and
|
|
// collect the newly created forwardable values.
|
|
LSLocationList Locs;
|
|
LSValueList Vals;
|
|
auto expansionContext = TypeExpansionContext(*BB->getParent());
|
|
LSLocation::expand(X, Mod, expansionContext, Locs, TE);
|
|
LSValue::expand(V, Mod, expansionContext, Vals, TE);
|
|
|
|
for (unsigned i = 0; i < Locs.size(); ++i) {
|
|
Values[Locs[i]] = Vals[i];
|
|
assert(Values[Locs[i]].isValid() && "Invalid load store value");
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
void RLEContext::processBasicBlocksForGenKillSet() {
|
|
for (SILBasicBlock *BB : PO->getReversePostOrder()) {
|
|
LLVM_DEBUG(llvm::dbgs() << "PROCESS " << printCtx.getID(BB)
|
|
<< " for Gen/Kill:\n";
|
|
BB->print(printCtx));
|
|
|
|
BlockState &S = getBlockState(BB);
|
|
|
|
// Compute the AvailSetMax at the beginning of the basic block.
|
|
S.mergePredecessorsAvailSetMax(*this);
|
|
|
|
// Compute the genset and killset.
|
|
//
|
|
// To optimize this process, we also compute the AvailSetMax at particular
|
|
// point in the basic block.
|
|
for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
|
|
if (auto *LI = dyn_cast<LoadInst>(&*I)) {
|
|
if (!BBWithLoads.contains(BB))
|
|
BBWithLoads.insert(BB);
|
|
S.processLoadInst(*this, LI, RLEKind::ComputeAvailSetMax);
|
|
}
|
|
if (auto *SI = dyn_cast<StoreInst>(&*I)) {
|
|
S.processStoreInst(*this, SI, RLEKind::ComputeAvailSetMax);
|
|
}
|
|
|
|
S.processInstructionWithKind(*this, &*I, RLEKind::ComputeAvailGenKillSet);
|
|
}
|
|
LLVM_DEBUG(S.dump(*this));
|
|
}
|
|
}
|
|
|
|
void RLEContext::processBasicBlocksWithGenKillSet() {
|
|
// Process each basic block with the gen and kill set. Every time the
|
|
// ForwardSetOut of a basic block changes, the optimization is rerun on its
|
|
// successors.
|
|
BasicBlockWorklist WorkList(Fn);
|
|
|
|
// Push into the worklist in post order so that we can pop from the back and
|
|
// get reverse post order.
|
|
for (SILBasicBlock *BB : PO->getPostOrder()) {
|
|
WorkList.push(BB);
|
|
}
  while (SILBasicBlock *BB = WorkList.popAndForget()) {
    LLVM_DEBUG(llvm::dbgs() << "PROCESS " << printCtx.getID(BB)
                            << " with Gen/Kill.\n");
    // Intersection.
    BlockState &Forwarder = getBlockState(BB);
    // Compute the ForwardSetIn at the beginning of the basic block.
    Forwarder.mergePredecessorAvailSet(*this);

    if (Forwarder.processBasicBlockWithGenKillSet()) {
      for (SILBasicBlock *succ : BB->getSuccessors()) {
        WorkList.pushIfNotVisited(succ);
      }
    }
    LLVM_DEBUG(Forwarder.dump(*this));
  }
}

void RLEContext::processBasicBlocksForAvailValue() {
  for (SILBasicBlock *BB : PO->getReversePostOrder()) {
    LLVM_DEBUG(llvm::dbgs() << "PROCESS " << printCtx.getID(BB)
                            << " for available.\n");

    BlockState &Forwarder = getBlockState(BB);

    // Merge the predecessors. After merging, BlockState now contains
    // lists of available LSLocations and their values that reach the
    // beginning of the basic block along all paths.
    Forwarder.mergePredecessorAvailSetAndValue(*this);

    // Merge duplicate loads, and forward stores to loads. We also update the
    // lists of stores/loads to reflect the end of the basic block.
    Forwarder.processBasicBlockWithKind(*this, RLEKind::ComputeAvailValue);

    // Update the locations with available values. We do not need to update
    // the available BitVector here as it should have been initialized and
    // stabilized in processBasicBlocksWithGenKillSet.
    Forwarder.updateForwardValOut();

    LLVM_DEBUG(Forwarder.dump(*this));
  }
}

void RLEContext::processBasicBlocksForRLE(bool Optimistic) {
  for (SILBasicBlock *BB : PO->getReversePostOrder()) {
    LLVM_DEBUG(llvm::dbgs() << "PROCESS " << printCtx.getID(BB)
                            << " for RLE.\n");

    // In the multiple-iteration (optimistic) case the forward sets have
    // already been computed and converged. If, in addition, this basic block
    // does not have any LoadInsts, there is no point in processing every
    // instruction in the block again, as no load will be eliminated.
    if (Optimistic && !BBWithLoads.contains(BB))
      continue;

    BlockState &Forwarder = getBlockState(BB);

    // Merge the predecessors. After merging, BlockState now contains
    // lists of available LSLocations and their values that reach the
    // beginning of the basic block along all paths.
    Forwarder.mergePredecessorAvailSetAndValue(*this);

    LLVM_DEBUG(Forwarder.dump(*this));

    // Perform the actual redundant load elimination.
    Forwarder.processBasicBlockWithKind(*this, RLEKind::PerformRLE);

    // If this is not a one-iteration data flow, the forward sets have
    // already been computed.
    if (Optimistic)
      continue;

    // Update the available locations and their values.
    Forwarder.updateForwardSetOut();
    Forwarder.updateForwardValOut();
  }
}

void RLEContext::runIterativeRLE() {
  // Generate the genset and killset for every basic block.
  processBasicBlocksForGenKillSet();

  // Process basic blocks in RPO. After the data flow converges, run the last
  // iteration and perform load forwarding.
  processBasicBlocksWithGenKillSet();

  // We have computed the available value bits; now go through every basic
  // block and compute the forwarding value locally. This is necessary because,
  // when we perform the RLE in the last iteration, we must handle loops, i.e.
  // predecessor blocks which have not yet been processed when a basic block
  // is processed.
  processBasicBlocksForAvailValue();
}

bool RLEContext::run() {
  // We perform redundant load elimination in the following phases.
  //
  // Phase 1. Compute the genset and killset for every basic block.
  //
  // Phase 2. Use an iterative data flow to compute whether there is an
  // available value at a given point; we do not yet care what the value
  // actually is.
  //
  // Phase 3. Compute the real forwardable value at a given point.
  //
  // Phase 4. Perform the redundant load elimination.
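  //
  // For orientation: Phases 1-3 correspond to the three calls in
  // runIterativeRLE() above and Phase 4 to processBasicBlocksForRLE(). When
  // getProcessFunctionKind() decides that a single iteration is enough,
  // Phases 1-3 are skipped and Phase 4 runs on its own.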
  // Walk over the function and find all the locations accessed by
  // this function.
  std::pair<int, int> LSCount = std::make_pair(0, 0);
  LSLocation::enumerateLSLocations(*Fn, LocationVault,
                                   LocToBitIndex,
                                   BaseToLocIndex, TE,
                                   LSCount);

  // Check how to optimize this function.
  ProcessKind Kind = getProcessFunctionKind(LSCount.first, LSCount.second);

  // We do not optimize this function at all.
  if (Kind == ProcessKind::ProcessNone)
    return false;

  // Do we run a multi-iteration data flow?
  const bool Optimistic = (Kind == ProcessKind::ProcessMultipleIterations);
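  // Optimistic means the iterative data flow (runIterativeRLE) is run to a
  // fixed point before any load is eliminated; otherwise we settle for what
  // is known after a single reverse-post-order pass in
  // processBasicBlocksForRLE().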

  // This is the set of basic blocks that we actually process.
  // We do not process unreachable blocks; instead we set their liveouts to nil.
  BasicBlockSet BBToProcess(Fn);
  for (auto X : PO->getPostOrder())
    BBToProcess.insert(X);

  // For all basic blocks in the function, initialize a BB state. Since we
  // know all the locations accessed in this function, we can resize the bit
  // vector to the appropriate size.
  for (auto bs : BBToLocState) {
    bs.data.init(&bs.block, LocationVault.size(),
                 Optimistic && BBToProcess.contains(&bs.block));
  }

  LLVM_DEBUG(for (unsigned i = 0; i < LocationVault.size(); ++i) {
    llvm::dbgs() << "LSLocation #" << i;
    getLocation(i).print(llvm::dbgs());
  });

  if (Optimistic)
    runIterativeRLE();

  // We have the available value bits computed and the local forwarding
  // values. Set up the load forwarding.
  processBasicBlocksForRLE(Optimistic);

  // Finally, perform the redundant load replacements.
  llvm::SmallVector<SILInstruction *, 16> InstsToDelete;
  bool SILChanged = false;
  for (auto bs : BBToLocState) {
    auto &Loads = bs.data.getRL();
    // Nothing to forward.
    if (Loads.empty())
      continue;
    // We iterate the instructions in the basic block in a deterministic order
    // and use this order to perform the load forwarding.
    //
    // NOTE: we could end up with different SIL depending on the order in
    // which load forwarding is performed.
    for (auto I = bs.block.rbegin(), E = bs.block.rend(); I != E; ++I) {
      auto V = dyn_cast<SingleValueInstruction>(&*I);
      if (!V)
        continue;
      auto Iter = Loads.find(V);
      if (Iter == Loads.end())
        continue;
      LLVM_DEBUG(llvm::dbgs() << "Replacing " << SILValue(Iter->first)
                              << "With " << Iter->second);
      auto *origLoad = cast<LoadInst>(Iter->first);
      SILValue newValue = Iter->second;
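      // A load [take] is responsible for consuming the value it moves out of
      // memory. Since the load goes away and its result is replaced by an
      // already available value, compensate with a destroy_addr of the source
      // address.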
      if (origLoad->getOwnershipQualifier() == LoadOwnershipQualifier::Take) {
        SILBuilderWithScope(origLoad).createDestroyAddr(origLoad->getLoc(),
                                                        origLoad->getOperand());
      }
      SILChanged = true;
      origLoad->replaceAllUsesWith(newValue);
      InstsToDelete.push_back(origLoad);
      ++NumForwardedLoads;
    }
  }

  // Erase the instructions recursively; this way, we get rid of a pass
  // dependence on DCE.
  for (auto &X : InstsToDelete) {
    // It is possible that the instruction still has uses, because it could be
    // used as the replacement value, i.e. Iter->second, for some other RLE
    // pairs.
    //
    // TODO: we should fix this; otherwise we are missing RLE opportunities.
    if (X->hasUsesOfAnyResult())
      continue;
    recursivelyDeleteTriviallyDeadInstructions(X, true);
  }
  return SILChanged;
}

//===----------------------------------------------------------------------===//
//                           Top Level Entry Point
//===----------------------------------------------------------------------===//

namespace {

class RedundantLoadElimination : public SILFunctionTransform {

private:
  bool disableArrayLoads;

public:

  RedundantLoadElimination(bool disableArrayLoads)
      : disableArrayLoads(disableArrayLoads) { }

  /// The entry point to the transformation.
  void run() override {
    SILFunction *F = getFunction();

    LLVM_DEBUG(llvm::dbgs() << "*** RLE on function: " << F->getName()
                            << " ***\n");

    auto *AA = PM->getAnalysis<AliasAnalysis>(F);
    auto *TE = PM->getAnalysis<TypeExpansionAnalysis>();
    auto *PO = PM->getAnalysis<PostOrderAnalysis>()->get(F);
    auto *EAFI = PM->getAnalysis<EpilogueARCAnalysis>()->get(F);

    RLEContext RLE(F, PM, AA, TE, PO, EAFI, disableArrayLoads);
    if (RLE.run()) {
      invalidateAnalysis(SILAnalysis::InvalidationKind::Instructions);
    }
  }
};

} // end anonymous namespace
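
// Factory functions for the two flavors of this pass; they differ only in the
// disableArrayLoads flag that is passed to the transform.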
SILTransform *swift::createEarlyRedundantLoadElimination() {
  return new RedundantLoadElimination(/*disableArrayLoads=*/true);
}

SILTransform *swift::createRedundantLoadElimination() {
  return new RedundantLoadElimination(/*disableArrayLoads=*/false);
}