Implement SE-0200 (extended escaping in string literals)
Supports string literals like #"foo"\n"bar"#.
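
For context, a quick sketch (not part of the commit) of what such literals accept once this lands; the exact rules are those of SE-0200 and of the tests added at the end of this diff:

    // Inside #"..."#, backslashes and quotes are ordinary characters.
    let path  = #"c:\windows\system32"#            // no need to write \\
    let regex = #"enum\s+.+\{.*case\s+[:upper:]"#  // regex-friendly
    let quote = #"say "hello" without escaping"#   // embedded " is fine
    // Use more '#'s when the text itself contains "#:
    let meta  = ##"a literal ending in "#"##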
committed by Brent Royal-Gordon
parent 5e2b705f6d
commit 4da8cbe655
@@ -138,6 +138,10 @@ ERROR(lex_invalid_u_escape,none,
   "\\u{...} escape sequence expects between 1 and 8 hex digits", ())
 ERROR(lex_invalid_u_escape_rbrace,none,
   "expected '}' in \\u{...} escape sequence", ())
+ERROR(lex_invalid_escape_delimiter,none,
+  "too many '#' characters in delimited escape", ())
+ERROR(lex_invalid_closing_delimiter,none,
+  "too many '#' characters in closing delimiter", ())
 ERROR(lex_invalid_unicode_scalar,none,
   "invalid unicode scalar", ())
@@ -364,12 +364,13 @@ public:
   enum : char { Literal, Expr } Kind;
   // Loc+Length for the segment inside the string literal, without quotes.
   SourceLoc Loc;
-  unsigned Length, IndentToStrip;
+  unsigned Length, IndentToStrip, CustomDelimiterLen;
   bool IsFirstSegment, IsLastSegment;

   static StringSegment getLiteral(SourceLoc Loc, unsigned Length,
                                   bool IsFirstSegment, bool IsLastSegment,
-                                  unsigned IndentToStrip) {
+                                  unsigned IndentToStrip,
+                                  unsigned CustomDelimiterLen) {
     StringSegment Result;
     Result.Kind = Literal;
     Result.Loc = Loc;
@@ -377,6 +378,7 @@ public:
     Result.IsFirstSegment = IsFirstSegment;
     Result.IsLastSegment = IsLastSegment;
     Result.IndentToStrip = IndentToStrip;
+    Result.CustomDelimiterLen = CustomDelimiterLen;
     return Result;
   }

@@ -388,6 +390,7 @@ public:
     Result.IsFirstSegment = false;
     Result.IsLastSegment = false;
     Result.IndentToStrip = 0;
+    Result.CustomDelimiterLen = 0;
     return Result;
   }

@@ -404,13 +407,14 @@ public:
                                    SmallVectorImpl<char> &Buffer,
                                    bool IsFirstSegment = false,
                                    bool IsLastSegment = false,
-                                   unsigned IndentToStrip = 0);
+                                   unsigned IndentToStrip = 0,
+                                   unsigned CustomDelimiterLen = 0);
   StringRef getEncodedStringSegment(StringSegment Segment,
                                     SmallVectorImpl<char> &Buffer) const {
     return getEncodedStringSegment(
         StringRef(getBufferPtrForSourceLoc(Segment.Loc), Segment.Length),
         Buffer, Segment.IsFirstSegment, Segment.IsLastSegment,
-        Segment.IndentToStrip);
+        Segment.IndentToStrip, Segment.CustomDelimiterLen);
   }

   /// \brief Given a string literal token, separate it into string/expr segments
@@ -474,7 +478,8 @@ private:
     return diagnose(Loc, Diagnostic(DiagID, std::forward<ArgTypes>(Args)...));
   }

-  void formToken(tok Kind, const char *TokStart, bool MultilineString = false);
+  void formToken(tok Kind, const char *TokStart, bool IsMultilineString = false,
+                 unsigned CustomDelimiterLen = 0);
   void formEscapedIdentifierToken(const char *TokStart);

   /// Advance to the end of the line.
@@ -498,10 +503,10 @@ private:
   void lexTrivia(syntax::Trivia &T, bool IsForTrailingTrivia);
   static unsigned lexUnicodeEscape(const char *&CurPtr, Lexer *Diags);

-  unsigned lexCharacter(const char *&CurPtr,
-                        char StopQuote, bool EmitDiagnostics,
-                        bool MultilineString = false);
-  void lexStringLiteral();
+  unsigned lexCharacter(const char *&CurPtr, char StopQuote,
+                        bool EmitDiagnostics, bool IsMultilineString = false,
+                        unsigned CustomDelimiterLen = 0);
+  void lexStringLiteral(unsigned CustomDelimiterLen = 0);
   void lexEscapedIdentifier();

   void tryLexEditorPlaceholder();
@@ -45,7 +45,10 @@ class Token {
   /// Modifiers for string literals
   unsigned MultilineString : 1;

-  // Padding bits == 32 - sizeof(Kind) * 8 - 3;
+  /// Length of custom delimiter of "raw" string literals
+  unsigned CustomDelimiterLen : 8;
+
+  // Padding bits == 32 - 11;

   /// \brief The length of the comment that precedes the token.
   unsigned CommentLength;
@@ -62,8 +65,8 @@ class Token {
 public:
   Token(tok Kind, StringRef Text, unsigned CommentLength = 0)
       : Kind(Kind), AtStartOfLine(false), EscapedIdentifier(false),
-        MultilineString(false), CommentLength(CommentLength),
-        Text(Text) {}
+        MultilineString(false), CustomDelimiterLen(0),
+        CommentLength(CommentLength), Text(Text) {}

   Token() : Token(tok::NUM_TOKENS, {}, 0) {}

@@ -266,17 +269,24 @@ public:
   /// \brief Set the token to the specified kind and source range.
   void setToken(tok K, StringRef T, unsigned CommentLength = 0,
-                bool MultilineString = false) {
+                bool IsMultilineString = false, unsigned CustomDelimiterLen = 0) {
     Kind = K;
     Text = T;
     this->CommentLength = CommentLength;
     EscapedIdentifier = false;
-    this->MultilineString = MultilineString;
+    this->MultilineString = IsMultilineString;
+    this->CustomDelimiterLen = CustomDelimiterLen;
+    assert(this->CustomDelimiterLen == CustomDelimiterLen &&
+           "custom string delimiter length > 255");
   }

-  bool IsMultilineString() const {
+  bool isMultilineString() const {
     return MultilineString;
   }

+  unsigned getCustomDelimiterLen() const {
+    return CustomDelimiterLen;
+  }
 };

 } // end namespace swift
@@ -272,7 +272,8 @@ Token Lexer::getTokenAt(SourceLoc Loc) {
   return Result;
 }

-void Lexer::formToken(tok Kind, const char *TokStart, bool MultilineString) {
+void Lexer::formToken(tok Kind, const char *TokStart,
+                      bool IsMultilineString, unsigned CustomDelimiterLen) {
   assert(CurPtr >= BufferStart &&
          CurPtr <= BufferEnd && "Current pointer out of range!");

@@ -304,7 +305,8 @@ void Lexer::formToken(tok Kind, const char *TokStart, bool MultilineString) {
     lexTrivia(TrailingTrivia, /* IsForTrailingTrivia */ true);
   }

-  NextToken.setToken(Kind, TokenText, CommentLength, MultilineString);
+  NextToken.setToken(Kind, TokenText, CommentLength,
+                     IsMultilineString, CustomDelimiterLen);
 }

 void Lexer::formEscapedIdentifierToken(const char *TokStart) {
@@ -1211,6 +1213,69 @@ static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) {
   }
 }

+/// diagnoseZeroWidthMatchAndAdvance - Error invisible characters in delimiters.
+/// An invisible character in the middle of a delimiter can be used to extend
+/// the literal beyond what it would appear creating potential security bugs.
+static bool diagnoseZeroWidthMatchAndAdvance(char Target, const char *&CurPtr,
+                                             DiagnosticEngine *Diags) {
+  // TODO: Detect, diagnose and skip over zero-width characters if required.
+  // See https://github.com/apple/swift/pull/17668 for possible implementation.
+  return *CurPtr == Target && CurPtr++;
+}
+
+/// advanceIfMultilineDelimiter - Centralized check for multiline delimiter.
+static bool advanceIfMultilineDelimiter(const char *&CurPtr,
+                                        DiagnosticEngine *Diags) {
+  const char *TmpPtr = CurPtr;
+  if (*(TmpPtr - 1) == '"' &&
+      diagnoseZeroWidthMatchAndAdvance('"', TmpPtr, Diags) &&
+      diagnoseZeroWidthMatchAndAdvance('"', TmpPtr, Diags)) {
+    CurPtr = TmpPtr;
+    return true;
+  }
+  return false;
+}
+
+/// advanceIfCustomDelimiter - Extracts/detects any custom delimiter on
+/// opening a string literal, advances CurPtr if a delimiter is found and
+/// returns a non-zero delimiter length. CurPtr[-1] generally '#' when called.
+static unsigned advanceIfCustomDelimiter(const char *&CurPtr,
+                                         DiagnosticEngine *Diags) {
+  const char *TmpPtr = CurPtr;
+  unsigned CustomDelimiterLen = 1;
+  while (diagnoseZeroWidthMatchAndAdvance('#', TmpPtr, Diags))
+    CustomDelimiterLen++;
+  if (diagnoseZeroWidthMatchAndAdvance('"', TmpPtr, Diags)) {
+    CurPtr = TmpPtr;
+    return CustomDelimiterLen;
+  }
+  return 0;
+}
+
+/// delimiterMatches - Does custom delimiter ('#' characters surrounding quotes)
+/// match the number of '#' characters after '\' inside the string? This allows
+/// interpolation inside a "raw" string. Normal/cooked string processing is
+/// the degenerate case of there being no '#' characters surrounding the quotes.
+/// If delimiter matches, advances byte pointer passed in and returns true.
+/// Also used to detect the final delimiter of a string when IsClosing == true.
+static bool delimiterMatches(unsigned CustomDelimiterLen, const char *&BytesPtr,
+                             DiagnosticEngine *Diags, bool IsClosing = false) {
+  if (!CustomDelimiterLen)
+    return true;
+  const char *TmpPtr = BytesPtr;
+  while (CustomDelimiterLen--)
+    if (!diagnoseZeroWidthMatchAndAdvance('#', TmpPtr, Diags))
+      return false;
+  BytesPtr = TmpPtr;
+  if (*BytesPtr == '#' && Diags)
+    Diags->diagnose(Lexer::getSourceLoc(BytesPtr), IsClosing ?
+                    diag::lex_invalid_closing_delimiter :
+                    diag::lex_invalid_escape_delimiter)
+        .fixItRemoveChars(Lexer::getSourceLoc(BytesPtr),
+                          Lexer::getSourceLoc(BytesPtr + 1));
+  return true;
+}
+
 /// lexCharacter - Read a character and return its UTF32 code. If this is the
 /// end of enclosing string/character sequence (i.e. the character is equal to
 /// 'StopQuote'), this returns ~0U and leaves 'CurPtr' pointing to the terminal
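
As an aside (not part of the diff), the rule delimiterMatches encodes is that an escape only fires when its '#' count equals the literal's delimiter length; a short Swift sketch of the behaviour this enables, per SE-0200:

    let a = #"keep \n as two characters"#   // one '#': plain \n stays literal text
    let b = #"break\#nhere"#                // \#n matches the single-# delimiter: newline
    let c = ##"nest \##(1 + 1) safely"##    // with ##...##, interpolation needs \##( )
    // Writing more '#'s than the delimiter, e.g. \##( inside #"..."#,
    // triggers the new lex_invalid_escape_delimiter diagnostic above.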
@@ -1220,7 +1285,8 @@ static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) {
 /// character_escape ::= [\][\] | [\]t | [\]n | [\]r | [\]" | [\]' | [\]0
 /// character_escape ::= unicode_character_escape
 unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote,
-                             bool EmitDiagnostics, bool MultilineString) {
+                             bool EmitDiagnostics, bool IsMultilineString,
+                             unsigned CustomDelimiterLen) {
   const char *CharStart = CurPtr;

   switch (*CurPtr++) {
@@ -1228,7 +1294,7 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote,
     // If this is a "high" UTF-8 character, validate it.
     if ((signed char)(CurPtr[-1]) >= 0) {
       if (isPrintable(CurPtr[-1]) == 0)
-        if (!(MultilineString && (CurPtr[-1] == '\t')))
+        if (!(IsMultilineString && (CurPtr[-1] == '\t')))
           if (EmitDiagnostics)
             diagnose(CharStart, diag::lex_unprintable_ascii_character);
       return CurPtr[-1];
@@ -1263,12 +1329,15 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote,
     return ~1U;
   case '\n': // String literals cannot have \n or \r in them.
   case '\r':
-    if (MultilineString) // ... unless they are multiline
+    if (IsMultilineString) // ... unless they are multiline
       return CurPtr[-1];
     if (EmitDiagnostics)
       diagnose(CurPtr-1, diag::lex_unterminated_string);
     return ~1U;
   case '\\': // Escapes.
+    if (!delimiterMatches(CustomDelimiterLen, CurPtr,
+                          EmitDiagnostics ? Diags : nullptr))
+      return '\\';
     break;
   }

@@ -1276,7 +1345,7 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote,
   // Escape processing. We already ate the "\".
   switch (*CurPtr) {
   case ' ': case '\t': case '\n': case '\r':
-    if (MultilineString && maybeConsumeNewlineEscape(CurPtr, 0))
+    if (IsMultilineString && maybeConsumeNewlineEscape(CurPtr, 0))
       return '\n';
     LLVM_FALLTHROUGH;
   default: // Invalid escape.
@@ -1334,10 +1403,11 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote,
 static const char *skipToEndOfInterpolatedExpression(const char *CurPtr,
                                                      const char *EndPtr,
                                                      DiagnosticEngine *Diags,
-                                                     bool MultilineString) {
-  llvm::SmallVector<char, 4> OpenDelimiters;
-  llvm::SmallVector<bool, 4> AllowNewline;
-  AllowNewline.push_back(MultilineString);
+                                                     bool IsMultilineString) {
+  SmallVector<char, 4> OpenDelimiters;
+  SmallVector<bool, 4> AllowNewline;
+  SmallVector<unsigned, 4> CustomDelimiter;
+  AllowNewline.push_back(IsMultilineString);

   auto inStringLiteral = [&]() {
     return !OpenDelimiters.empty() &&
@@ -1352,6 +1422,7 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr,
     // On success scanning the expression body, the real lexer will be used to
     // relex the body when parsing the expressions. We let it diagnose any
     // issues with malformed tokens or other problems.
+    unsigned CustomDelimiterLen = 0;
     switch (*CurPtr++) {
     // String literals in general cannot be split across multiple lines;
     // interpolated ones are no exception - unless multiline literals.
@@ -1362,43 +1433,52 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr,
       // Will be diagnosed as an unterminated string literal.
       return CurPtr-1;

+    case '#':
+      if (inStringLiteral() ||
+          !(CustomDelimiterLen = advanceIfCustomDelimiter(CurPtr, Diags)))
+        continue;
+      LLVM_FALLTHROUGH;
+
     case '"':
     case '\'': {
       if (!AllowNewline.back() && inStringLiteral()) {
-        if (OpenDelimiters.back() == CurPtr[-1]) {
+        if (OpenDelimiters.back() == CurPtr[-1] &&
+            delimiterMatches(CustomDelimiter.back(), CurPtr, Diags, true)) {
           // Closing single line string literal.
           OpenDelimiters.pop_back();
           AllowNewline.pop_back();
+          CustomDelimiter.pop_back();
         }
         // Otherwise, it's just a quote in string literal. e.g. "foo's".
         continue;
       }

-      bool isMultilineQuote = (
-          *CurPtr == '"' && *(CurPtr + 1) == '"' && *(CurPtr - 1) == '"');
-      if (isMultilineQuote)
-        CurPtr += 2;
+      bool isMultilineQuote = advanceIfMultilineDelimiter(CurPtr, Diags);

       if (!inStringLiteral()) {
         // Open string literal
         OpenDelimiters.push_back(CurPtr[-1]);
         AllowNewline.push_back(isMultilineQuote);
+        CustomDelimiter.push_back(CustomDelimiterLen);
         continue;
       }

       // We are in multiline string literal.
       assert(AllowNewline.back() && "other cases must be handled above");
-      if (isMultilineQuote) {
+      if (isMultilineQuote &&
+          delimiterMatches(CustomDelimiter.back(), CurPtr, Diags, true)) {
         // Close multiline string literal.
         OpenDelimiters.pop_back();
         AllowNewline.pop_back();
+        CustomDelimiter.pop_back();
       }

       // Otherwise, it's just a normal character in multiline string.
       continue;
     }
     case '\\':
-      if (inStringLiteral()) {
+      if (inStringLiteral() &&
+          delimiterMatches(CustomDelimiter.back(), CurPtr, Diags)) {
         char escapedChar = *CurPtr++;
         switch (escapedChar) {
         case '(':
@@ -1458,7 +1538,10 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr,
 static StringRef getStringLiteralContent(const Token &Str) {
   StringRef Bytes = Str.getText();

-  if (Str.IsMultilineString())
+  if (unsigned CustomDelimiterLen = Str.getCustomDelimiterLen())
+    Bytes = Bytes.drop_front(CustomDelimiterLen).drop_back(CustomDelimiterLen);
+
+  if (Str.isMultilineString())
     Bytes = Bytes.drop_front(3).drop_back(3);
   else
     Bytes = Bytes.drop_front().drop_back();
@@ -1496,7 +1579,7 @@ getMultilineTrailingIndent(const Token &Str, DiagnosticEngine *Diags) {
   auto string = StringRef(start, end - start);

   // Disallow escaped newline in the last line.
-  if (Diags) {
+  if (Diags && Str.getCustomDelimiterLen() == 0) {
     auto *Ptr = start - 1;
     if (*Ptr == '\n') --Ptr;
     if (*Ptr == '\r') --Ptr;
@@ -1652,30 +1735,31 @@ static void validateMultilineIndents(const Token &Str,
 /// lexStringLiteral:
 /// string_literal ::= ["]([^"\\\n\r]|character_escape)*["]
 /// string_literal ::= ["]["]["].*["]["]["] - approximately
-void Lexer::lexStringLiteral() {
+/// string_literal ::= (#+)("")?".*"(\2\1) - "raw" strings
+void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) {
   const char *TokStart = CurPtr-1;
   assert((*TokStart == '"' || *TokStart == '\'') && "Unexpected start");
   // NOTE: We only allow single-quote string literals so we can emit useful
   // diagnostics about changing them to double quotes.

-  bool wasErroneous = false, MultilineString = false;
+  bool wasErroneous = false, IsMultilineString = false;

   // Is this the start of a multiline string literal?
-  if (*TokStart == '"' && *CurPtr == '"' && *(CurPtr + 1) == '"') {
-    MultilineString = true;
-    CurPtr += 2;
+  if ((IsMultilineString = advanceIfMultilineDelimiter(CurPtr, Diags))) {
     if (*CurPtr != '\n' && *CurPtr != '\r')
       diagnose(CurPtr, diag::lex_illegal_multiline_string_start)
         .fixItInsert(Lexer::getSourceLoc(CurPtr), "\n");
   }

   while (true) {
-    if (*CurPtr == '\\' && *(CurPtr + 1) == '(') {
+    const char *TmpPtr = CurPtr + 1;
+    if (*CurPtr == '\\' && delimiterMatches(CustomDelimiterLen, TmpPtr, nullptr)
+        && *TmpPtr == '(') {
       // Consume tokens until we hit the corresponding ')'.
-      CurPtr += 2;
+      CurPtr = TmpPtr + 1;
       const char *EndPtr =
           skipToEndOfInterpolatedExpression(CurPtr, BufferEnd,
-                                            Diags, MultilineString);
+                                            Diags, IsMultilineString);

       if (*EndPtr == ')') {
         // Successfully scanned the body of the expression literal.
@@ -1688,21 +1772,21 @@ void Lexer::lexStringLiteral() {
     }

     // String literals cannot have \n or \r in them (unless multiline).
-    if (((*CurPtr == '\r' || *CurPtr == '\n') && !MultilineString)
+    if (((*CurPtr == '\r' || *CurPtr == '\n') && !IsMultilineString)
         || CurPtr == BufferEnd) {
+      TokStart -= CustomDelimiterLen;
       diagnose(TokStart, diag::lex_unterminated_string);
       return formToken(tok::unknown, TokStart);
     }

-    unsigned CharValue = lexCharacter(CurPtr, *TokStart, true, MultilineString);
+    unsigned CharValue = lexCharacter(CurPtr, *TokStart, true,
+                                      IsMultilineString, CustomDelimiterLen);
     wasErroneous |= CharValue == ~1U;

     // If this is the end of string, we are done. If it is a normal character
     // or an already-diagnosed error, just munch it.
     if (CharValue == ~0U) {
       ++CurPtr;
       if (wasErroneous)
         return formToken(tok::unknown, TokStart);

       if (*TokStart == '\'') {
         // Complain about single-quote string and suggest replacement with
@@ -1738,20 +1822,19 @@ void Lexer::lexStringLiteral() {
                  replacement);
       }

-      // Is this the end of a multiline string literal?
-      if (MultilineString) {
-        if (*CurPtr == '"' && *(CurPtr + 1) == '"' && *(CurPtr + 2) != '"') {
-          CurPtr += 2;
-          formToken(tok::string_literal, TokStart, MultilineString);
-          if (Diags)
+      // Is this the end of multiline/custom-delimited string literal?
+      if ((!IsMultilineString || advanceIfMultilineDelimiter(CurPtr, Diags)) &&
+          delimiterMatches(CustomDelimiterLen, CurPtr, Diags, true)) {
+        TokStart -= CustomDelimiterLen;
+        if (wasErroneous)
+          return formToken(tok::unknown, TokStart);
+
+        formToken(tok::string_literal, TokStart,
+                  IsMultilineString, CustomDelimiterLen);
+        if (IsMultilineString && Diags)
           validateMultilineIndents(NextToken, Diags);
         return;
       }
-        else
-          continue;
-      }
-
-      return formToken(tok::string_literal, TokStart, MultilineString);
     }
   }
 }
@@ -2016,13 +2099,35 @@ StringRef Lexer::getEncodedStringSegment(StringRef Bytes,
                                          SmallVectorImpl<char> &TempString,
                                          bool IsFirstSegment,
                                          bool IsLastSegment,
-                                         unsigned IndentToStrip) {
+                                         unsigned IndentToStrip,
+                                         unsigned CustomDelimiterLen) {

   TempString.clear();
   // Note that it is always safe to read one over the end of "Bytes" because
   // we know that there is a terminating " character. Use BytesPtr to avoid a
   // range check subscripting on the StringRef.
   const char *BytesPtr = Bytes.begin();
+
+  // Special case when being called from EncodedDiagnosticMessage(...).
+  // This allows multiline and delimited strings to work in attributes.
+  // The string has already been validated by the initial parse.
+  if (IndentToStrip == ~0u && CustomDelimiterLen == ~0u) {
+    IndentToStrip = CustomDelimiterLen = 0;
+
+    // Restore trailing indent removal for multiline.
+    const char *Backtrack = BytesPtr - 1;
+    if (Backtrack[-1] == '"' && Backtrack[-2] == '"') {
+      Backtrack -= 2;
+      for (const char *Trailing = Bytes.end() - 1;
+           *Trailing == ' ' || *Trailing == '\t'; Trailing--)
+        IndentToStrip++;
+    }
+
+    // Restore delimiter if any.
+    while (*--Backtrack == '#')
+      CustomDelimiterLen++;
+  }
+
   bool IsEscapedNewline = false;
   while (BytesPtr < Bytes.end()) {
     char CurChar = *BytesPtr++;
@@ -2043,7 +2148,8 @@ StringRef Lexer::getEncodedStringSegment(StringRef Bytes,
       continue;
     }

-    if (CurChar != '\\') {
+    if (CurChar != '\\' ||
+        !delimiterMatches(CustomDelimiterLen, BytesPtr, nullptr)) {
       TempString.push_back(CurChar);
       continue;
     }
@@ -2113,8 +2219,8 @@ void Lexer::getStringLiteralSegments(

   // Are substitutions required either for indent stripping or line ending
   // normalization?
-  bool MultilineString = Str.IsMultilineString(), IsFirstSegment = true;
-  unsigned IndentToStrip = 0;
+  bool MultilineString = Str.isMultilineString(), IsFirstSegment = true;
+  unsigned IndentToStrip = 0, CustomDelimiterLen = Str.getCustomDelimiterLen();
   if (MultilineString)
     IndentToStrip =
         std::get<0>(getMultilineTrailingIndent(Str, /*Diags=*/nullptr)).size();
@@ -2124,13 +2230,12 @@ void Lexer::getStringLiteralSegments(
   // range check subscripting on the StringRef.
   const char *SegmentStartPtr = Bytes.begin();
   const char *BytesPtr = SegmentStartPtr;
-  // FIXME: Use SSE to scan for '\'.
-  while (BytesPtr != Bytes.end()) {
-    char CurChar = *BytesPtr++;
-    if (CurChar != '\\')
-      continue;
+  size_t pos;
+  while ((pos = Bytes.find('\\', BytesPtr-Bytes.begin())) != StringRef::npos) {
+    BytesPtr = Bytes.begin() + pos + 1;

-    if (*BytesPtr++ != '(')
+    if (!delimiterMatches(CustomDelimiterLen, BytesPtr, Diags) ||
+        *BytesPtr++ != '(')
       continue;

     // String interpolation.
@@ -2138,8 +2243,9 @@ void Lexer::getStringLiteralSegments(
     // Push the current segment.
     Segments.push_back(
         StringSegment::getLiteral(getSourceLoc(SegmentStartPtr),
-                                  BytesPtr-SegmentStartPtr-2,
-                                  IsFirstSegment, false, IndentToStrip));
+                                  BytesPtr-SegmentStartPtr-2-CustomDelimiterLen,
+                                  IsFirstSegment, false, IndentToStrip,
+                                  CustomDelimiterLen));
     IsFirstSegment = false;

     // Find the closing ')'.
@@ -2162,7 +2268,8 @@ void Lexer::getStringLiteralSegments(
   Segments.push_back(
       StringSegment::getLiteral(getSourceLoc(SegmentStartPtr),
                                 Bytes.end()-SegmentStartPtr,
-                                IsFirstSegment, true, IndentToStrip));
+                                IsFirstSegment, true, IndentToStrip,
+                                CustomDelimiterLen));
 }

@@ -2261,6 +2368,8 @@ void Lexer::lexImpl() {
   case '\\': return formToken(tok::backslash, TokStart);

   case '#':
+    if (unsigned CustomDelimiterLen = advanceIfCustomDelimiter(CurPtr, Diags))
+      return lexStringLiteral(CustomDelimiterLen);
     return lexHash();

   // Operator characters.
@@ -1923,7 +1923,7 @@ ParserResult<Expr> Parser::parseExprStringLiteral() {
   LocalContext.setCreateSyntax(SyntaxKind::StringInterpolationExpr);
   StringRef Quote;
   tok QuoteKind;
-  std::tie(Quote, QuoteKind) = Tok.IsMultilineString() ?
+  std::tie(Quote, QuoteKind) = Tok.isMultilineString() ?
     std::make_tuple("\"\"\"", tok::multiline_string_quote) :
     std::make_tuple("\"", tok::string_quote);

@@ -216,8 +216,9 @@ static void getStringPartTokens(const Token &Tok, const LangOptions &LangOpts,
                                 const SourceManager &SM,
                                 int BufID, std::vector<Token> &Toks) {
   assert(Tok.is(tok::string_literal));
-  bool IsMultiline = Tok.IsMultilineString();
-  unsigned QuoteLen = IsMultiline ? 3 : 1;
+  bool IsMultiline = Tok.isMultilineString();
+  unsigned CustomDelimiterLen = Tok.getCustomDelimiterLen();
+  unsigned QuoteLen = (IsMultiline ? 3 : 1) + CustomDelimiterLen;
   SmallVector<Lexer::StringSegment, 4> Segments;
   Lexer::getStringLiteralSegments(Tok, Segments, /*Diags=*/nullptr);
   for (unsigned i = 0, e = Segments.size(); i != e; ++i) {
@@ -239,7 +240,8 @@ static void getStringPartTokens(const Token &Tok, const LangOptions &LangOpts,

       StringRef Text = SM.extractText({ Loc, Len });
       Token NewTok;
-      NewTok.setToken(tok::string_literal, Text, IsMultiline);
+      NewTok.setToken(tok::string_literal, Text,
+                      IsMultiline, CustomDelimiterLen);
       Toks.push_back(NewTok);

     } else {
@@ -372,7 +374,7 @@ class TokenRecorder: public ConsumeTokenReceiver {
   }

   void relexComment(CharSourceRange CommentRange,
-                    llvm::SmallVectorImpl<Token> &Scracth) {
+                    llvm::SmallVectorImpl<Token> &Scratch) {
     Lexer L(Ctx.LangOpts, Ctx.SourceMgr, BufferID, nullptr, /*InSILMode=*/false,
             HashbangMode::Disallowed,
             CommentRetentionMode::ReturnAsTokens,
@@ -385,7 +387,7 @@ class TokenRecorder: public ConsumeTokenReceiver {
       if (Result.is(tok::eof))
         break;
       assert(Result.is(tok::comment));
-      Scracth.push_back(Result);
+      Scratch.push_back(Result);
     }
   }

@@ -2164,7 +2164,7 @@ class EncodedDiagnosticMessage {
 public:
   /// \param S A string with an encoded message
   EncodedDiagnosticMessage(StringRef S)
-      : Message(Lexer::getEncodedStringSegment(S, Buf)) {}
+      : Message(Lexer::getEncodedStringSegment(S, Buf, true, true, ~0, ~0)) {}

   /// The unescaped message to display to the user.
   const StringRef Message;
test/Parse/raw_string.swift (new file, 134 lines)
@@ -0,0 +1,134 @@
// RUN: %target-swift-frontend -dump-ast %s 2>&1 | %FileCheck --strict-whitespace %s

import Swift

_ = #"""
###################################################################
## This source file is part of the Swift.org open source project ##
###################################################################
"""#
// CHECK: "###################################################################\n## This source file is part of the Swift.org open source project ##\n###################################################################"

_ = #"""
# H1 #
## H2 ##
### H3 ###
"""#
// CHECK: "# H1 #\n## H2 ##\n### H3 ###"

// ===---------- Multiline RawString --------===

_ = ##"""
One
""Alpha""
"""##
// CHECK: "One\n\"\"Alpha\"\""

_ = ##"""
Two
Beta
"""##
// CHECK: " Two\nBeta"

_ = #"""
Three\r
Gamma\
"""#
// CHECK: " Three\\r\n Gamma\\"

_ = ###"""
Four \(foo)
Delta
"""###
// CHECK: " Four \\(foo)\n Delta"

_ = ##"""
print("""
Five\##n\##n\##nEpsilon
""")
"""##
// CHECK: "print(\"\"\"\n Five\n\n\nEpsilon\n \"\"\")"

// ===---------- Single line --------===

_ = #""Zeta""#
// CHECK: "\"Zeta\""

_ = #""Eta"\#n\#n\#n\#""#
// CHECK: "\"Eta\"\n\n\n\""

_ = #""Iota"\n\n\n\""#
// CHECK: "\"Iota\"\\n\\n\\n\\\""

_ = #"a raw string with \" in it"#
// CHECK: "a raw string with \\\" in it"

_ = ##"""
a raw string with """ in it
"""##
// CHECK: "a raw string with \"\"\" in it"

let foo = "Interpolation"
_ = #"\b\b \#(foo)\#(foo) Kappa"#
// CHECK: "\\b\\b "
// CHECK: " Kappa"

_ = """
interpolating \(##"""
delimited \##("string")\#n\##n
"""##)
"""

// CHECK: "interpolating "
// CHECK: "delimited "
// CHECK: "string"
// CHECK: "\\#n\n"

#"unused literal"#
// CHECK: "unused literal"

// ===---------- From proposal --------===

_ = #"This is a string"#
// CHECK: "This is a string"

_ = #####"This is a string"#####
// CHECK: "This is a string"

_ = #"enum\s+.+\{.*case\s+[:upper:]"#
// CHECK: "enum\\s+.+\\{.*case\\s+[:upper:]"

_ = #"Alice: "How long is forever?" White Rabbit: "Sometimes, just one second.""#
// CHECK: "Alice: \"How long is forever?\" White Rabbit: \"Sometimes, just one second.\""

_ = #"\#\#1"#
// CHECK: "\\#1"

_ = ##"\#1"##
// CHECK: "\\#1"

_ = #"c:\windows\system32"#
// CHECK: "c:\\windows\\system32"

_ = #"\d{3) \d{3} \d{4}"#
// CHECK: "\\d{3) \\d{3} \\d{4}"

_ = #"""
a string with
"""
in it
"""#
// CHECK: "a string with\n\"\"\"\nin it"

_ = #"a raw string containing \r\n"#
// CHECK: "a raw string containing \\r\\n"

_ = #"""
[
{
"id": "12345",
"title": "A title that \"contains\" \\\""
}
]
"""#
// CHECK: "[\n {\n \"id\": \"12345\",\n \"title\": \"A title that \\\"contains\\\" \\\\\\\"\"\n }\n]"
test/Parse/raw_string_errors.swift (new file, 14 lines)
@@ -0,0 +1,14 @@
// RUN: %target-typecheck-verify-swift

#"\##("invalid")"#
// expected-error@-1{{too many '#' characters in delimited escape}}
// expected-error@-2{{invalid escape sequence in literal}}

####"invalid"###
// expected-error@-1{{unterminated string literal}}

###"invalid"####
// expected-error@-1{{too many '#' characters in closing delimiter}}
// expected-error@-2{{consecutive statements on a line must be separated by ';'}}
// expected-error@-3{{expected expression}}
// expected-warning@-4{{string literal is unused}}