swift-mirror/include/swift/Demangling/ManglingUtils.h

//===--- ManglingUtils.h - Utilities for Swift name mangling ----*- C++ -*-===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

#ifndef SWIFT_DEMANGLING_MANGLINGUTILS_H
#define SWIFT_DEMANGLING_MANGLINGUTILS_H

#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringRef.h"
#include "swift/Demangling/NamespaceMacros.h"
#include "swift/Demangling/Punycode.h"

namespace swift {
namespace Mangle {
SWIFT_BEGIN_INLINE_NAMESPACE

using llvm::StringRef;

inline bool isLowerLetter(char ch) {
  return ch >= 'a' && ch <= 'z';
}

inline bool isUpperLetter(char ch) {
  return ch >= 'A' && ch <= 'Z';
}

inline bool isDigit(char ch) {
  return ch >= '0' && ch <= '9';
}

inline bool isLetter(char ch) {
  return isLowerLetter(ch) || isUpperLetter(ch);
}

/// Returns true if \p ch is a character which defines the begin of a
/// substitution word.
inline bool isWordStart(char ch) {
  return !isDigit(ch) && ch != '_' && ch != 0;
}

/// Returns true if \p ch is a character (following \p prevCh) which defines
/// the end of a substitution word.
inline bool isWordEnd(char ch, char prevCh) {
  if (ch == '_' || ch == 0)
    return true;

  if (!isUpperLetter(prevCh) && isUpperLetter(ch))
    return true;

  return false;
}

/// Returns true if \p ch is a valid character which may appear in a symbol
/// mangling.
inline bool isValidSymbolChar(char ch) {
  return isLetter(ch) || isDigit(ch) || ch == '_' || ch == '$';
}

/// Returns true if \p str contains any character which may not appear in a
/// mangled symbol string and therefore must be punycode encoded.
bool needsPunycodeEncoding(StringRef str);

/// Returns true if \p str contains any non-ASCII character.
bool isNonAscii(StringRef str);

/// Describes a Word in a mangled identifier.
struct SubstitutionWord {

  /// The position of the first word character in the mangled string.
  size_t start;

  /// The length of the word.
  size_t length;
};

/// Helper struct which represents a word substitution.
struct WordReplacement {
  /// The position in the identifier where the word is substituted.
  size_t StringPos;

  /// The index into the mangler's Words array (-1 if invalid).
  int WordIdx;
};

/// Translate the given operator character into its mangled form.
///
/// Current operator characters:   @/=-+*%<>!&|^~ and the special operator '..'
char translateOperatorChar(char op);

/// Returns a string where all characters of the operator \p Op are translated
/// to their mangled form.
std::string translateOperator(StringRef Op);

/// Returns the standard type kind for an 'S' substitution, e.g. 'i' for "Int".
///
/// \param allowConcurrencyManglings When true, allows the standard
/// substitutions for types in the _Concurrency module that were introduced in
/// Swift 5.5.
llvm::Optional<StringRef> getStandardTypeSubst(
    StringRef TypeName, bool allowConcurrencyManglings);

/// Mangles an identifier using a generic Mangler class.
///
/// The Mangler class must provide the following:
/// *) Words: An array of SubstitutionWord which holds the current list of
///           found words which can be used for substitutions.
/// *) SubstWordsInIdent: An array of WordReplacement, which is just used
///           as a temporary storage during mangling. Must be empty.
/// *) Buffer: A stream where the mangled identifier is written to.
/// *) getBufferStr(): Returns a StringRef of the current content of Buffer.
/// *) UsePunycode: A flag indicating if punycode encoding should be done.
template <typename Mangler>
void mangleIdentifier(Mangler &M, StringRef ident) {

  size_t WordsInBuffer = M.Words.size();
  assert(M.SubstWordsInIdent.empty());
  if (M.UsePunycode && needsPunycodeEncoding(ident)) {
    // If the identifier contains non-ASCII character, we mangle
    // with an initial '00' and Punycode the identifier string.
    std::string punycodeBuf;
    Punycode::encodePunycodeUTF8(ident, punycodeBuf,
                                 /*mapNonSymbolChars*/ true);
    StringRef pcIdent = punycodeBuf;
    M.Buffer << "00" << pcIdent.size();
    if (isDigit(pcIdent[0]) || pcIdent[0] == '_')
      M.Buffer << '_';
    M.Buffer << pcIdent;
    return;
  }
  // Search for word substitutions and for new words.
  const size_t NotInsideWord = ~0;
  size_t wordStartPos = NotInsideWord;
  for (size_t Pos = 0, Len = ident.size(); Pos <= Len; ++Pos) {
    char ch = (Pos < Len ? ident[Pos] : 0);
    if (wordStartPos != NotInsideWord && isWordEnd(ch, ident[Pos - 1])) {
      // This position is the end of a word, i.e. the next character after a
      // word.
      assert(Pos > wordStartPos);
      size_t wordLen = Pos - wordStartPos;
      StringRef Word = ident.substr(wordStartPos, wordLen);

      // Helper function to lookup the Word in a string.
      auto lookupWord = [&] (StringRef Str,
                             size_t FromWordIdx, size_t ToWordIdx) -> int {
        for (size_t Idx = FromWordIdx; Idx < ToWordIdx; ++Idx) {
          const SubstitutionWord &w = M.Words[Idx];
          StringRef existingWord =  Str.substr(w.start, w.length);
          if (Word == existingWord)
            return (int)Idx;
        }
        return -1;
      };

      // Is the word already present in the so far mangled string?
      int WordIdx = lookupWord(M.getBufferStr(), 0, WordsInBuffer);
      // Otherwise, is the word already present in this identifier?
      if (WordIdx < 0)
        WordIdx = lookupWord(ident, WordsInBuffer, M.Words.size());

      if (WordIdx >= 0) {
        // We found a word substitution!
        assert(WordIdx < 26);
        M.addSubstWordsInIdent({wordStartPos, WordIdx});
      } else if (wordLen >= 2 && M.Words.size() < M.MaxNumWords) {
        // It's a new word: remember it.
        // Note: at this time the word's start position is relative to the
        // begin of the identifier. We must update it afterwards so that it is
        // relative to the begin of the whole mangled Buffer.
        M.addWord({wordStartPos, wordLen});
      }
      wordStartPos = NotInsideWord;
    }
    if (wordStartPos == NotInsideWord && isWordStart(ch)) {
      // This position is the begin of a word.
      wordStartPos = Pos;
    }
  }
  // If we have word substitutions mangle an initial '0'.
  if (!M.SubstWordsInIdent.empty())
    M.Buffer << '0';

  size_t Pos = 0;
  // Add a dummy-word at the end of the list.
  M.addSubstWordsInIdent({ident.size(), -1});

  // Mangle a sequence of word substitutions and sub-strings.
  for (size_t Idx = 0, End = M.SubstWordsInIdent.size(); Idx < End; ++Idx) {
    const WordReplacement &Repl = M.SubstWordsInIdent[Idx];
    if (Pos < Repl.StringPos) {
      // Mangle the sub-string up to the next word substitution (or to the end
      // of the identifier - that's why we added the dummy-word).
      // The first thing: we add the encoded sub-string length.
      bool first = true;
      M.Buffer << (Repl.StringPos - Pos);
      do {
        // Update the start position of new added words, so that they refer to
        // the begin of the whole mangled Buffer.
        if (WordsInBuffer < M.Words.size() &&
            M.Words[WordsInBuffer].start == Pos) {
          M.Words[WordsInBuffer].start = M.getBufferStr().size();
          WordsInBuffer++;
        }
        // Error recovery. We sometimes need to mangle identifiers coming
        // from invalid code.
        if (first && isDigit(ident[Pos]))
          M.Buffer << 'X';
        // Add a literal character of the sub-string.
        else
          M.Buffer << ident[Pos];

        Pos++;
        first = false;
      } while (Pos < Repl.StringPos);
    }
    // Is it a "real" word substitution (and not the dummy-word)?
    if (Repl.WordIdx >= 0) {
      assert(Repl.WordIdx <= (int)WordsInBuffer);
      Pos += M.Words[Repl.WordIdx].length;
      if (Idx < End - 2) {
        M.Buffer << (char)(Repl.WordIdx + 'a');
      } else {
        // The last word substitution is a capital letter.
        M.Buffer << (char)(Repl.WordIdx + 'A');
        if (Pos == ident.size())
          M.Buffer << '0';
      }
    }
  }
  M.SubstWordsInIdent.clear();
}

/// Utility class for mangling merged substitutions.
///
/// Used in the Mangler and Remangler.
class SubstitutionMerging {

  /// The position of the last substitution mangling,
  /// e.g. 3 for 'AabC' and 'Aab4C'
  size_t lastSubstPosition = 0;

  /// The size of the last substitution mangling,
  /// e.g. 1 for 'AabC' or 2 for 'Aab4C'
  size_t lastSubstSize = 0;

  /// The repeat count of the last substitution,
  /// e.g. 1 for 'AabC' or 4 for 'Aab4C'
  size_t lastNumSubsts = 0;

  /// True if the last substitution is an 'S' substitution,
  /// false if the last substitution is an 'A' substitution.
  bool lastSubstIsStandardSubst = false;

public:

  // The only reason to limit the number of repeated substitutions is that we
  // don't want that the demangler blows up on a bogus substitution, e.g.
  // ...A832456823746582B...
  enum { MaxRepeatCount = 2048 };

  void clear() {
    lastNumSubsts = 0;
  }

  /// Tries to merge the substitution \p Subst with a previously mangled
  /// substitution.
  ///
  /// Returns true on success. In case of false, the caller must mangle the
  /// substitution separately in the form 'S<Subst>' or 'A<Subst>'.
  ///
  /// The Mangler class must provide the following:
  /// *) Buffer: A stream where the mangled identifier is written to.
  /// *) getBufferStr(): Returns a StringRef of the current content of Buffer.
  /// *) resetBuffer(size_t): Resets the buffer to an old position.
  template <typename Mangler>
  bool tryMergeSubst(Mangler &M, StringRef Subst, bool isStandardSubst) {
    assert(isUpperLetter(Subst.back()) ||
           (isStandardSubst && isLowerLetter(Subst.back())));
    StringRef BufferStr = M.getBufferStr();
    if (lastNumSubsts > 0 && lastNumSubsts < MaxRepeatCount
        && BufferStr.size() == lastSubstPosition + lastSubstSize
        && lastSubstIsStandardSubst == isStandardSubst) {

      // The last mangled thing is a substitution.
      assert(lastSubstPosition > 0 && lastSubstPosition < BufferStr.size());
      assert(lastSubstSize > 0);
      StringRef lastSubst = BufferStr.take_back(lastSubstSize)
            .drop_while([](char c) {
        return isDigit(c);
      });
      assert(isUpperLetter(lastSubst.back())
             || (isStandardSubst && isLowerLetter(lastSubst.back())));
      if (lastSubst != Subst && !isStandardSubst) {
        // We can merge with a different 'A' substitution,
        // e.g. 'AB' -> 'AbC'.
        lastSubstPosition = BufferStr.size();
        lastNumSubsts = 1;
        M.resetBuffer(BufferStr.size() - 1);
        assert(isUpperLetter(lastSubst.back()));
        M.Buffer << (char)(lastSubst.back() - 'A' + 'a') << Subst;
        lastSubstSize = 1;
        return true;
      }
      if (lastSubst == Subst) {
        // We can merge with the same 'A' or 'S' substitution,
        // e.g. 'AB' -> 'A2B', or 'S3i' -> 'S4i'
        lastNumSubsts++;
        M.resetBuffer(lastSubstPosition);
        M.Buffer << lastNumSubsts;
        M.Buffer << Subst;
        lastSubstSize = M.getBufferStr().size() - lastSubstPosition;
        return true;
      }
    }
    // We can't merge with the previous substitution, but let's remember this
    // substitution which will be mangled by the caller.
    lastSubstPosition = BufferStr.size() + 1;
    lastSubstSize = Subst.size();
    lastNumSubsts = 1;
    lastSubstIsStandardSubst = isStandardSubst;
    return false;
  }
};

SWIFT_END_INLINE_NAMESPACE
} // end namespace Mangle
} // end namespace swift

#endif // SWIFT_DEMANGLING_MANGLINGUTILS_H