Implement SE-0200 (extended escaping in string literals)

Adds support for custom-delimited ("raw") string literals such as #"foo"\n"bar"#, in which the inner quotes and the \n are taken literally.
This commit is contained in:
John Holdsworth
2018-09-06 23:19:52 +01:00
committed by Brent Royal-Gordon
parent 5e2b705f6d
commit 4da8cbe655
9 changed files with 357 additions and 79 deletions

View File

@@ -138,6 +138,10 @@ ERROR(lex_invalid_u_escape,none,
"\\u{...} escape sequence expects between 1 and 8 hex digits", ()) "\\u{...} escape sequence expects between 1 and 8 hex digits", ())
ERROR(lex_invalid_u_escape_rbrace,none, ERROR(lex_invalid_u_escape_rbrace,none,
"expected '}' in \\u{...} escape sequence", ()) "expected '}' in \\u{...} escape sequence", ())
ERROR(lex_invalid_escape_delimiter,none,
"too many '#' characters in delimited escape", ())
ERROR(lex_invalid_closing_delimiter,none,
"too many '#' characters in closing delimiter", ())
ERROR(lex_invalid_unicode_scalar,none, ERROR(lex_invalid_unicode_scalar,none,
"invalid unicode scalar", ()) "invalid unicode scalar", ())

View File

@@ -364,12 +364,13 @@ public:
enum : char { Literal, Expr } Kind; enum : char { Literal, Expr } Kind;
// Loc+Length for the segment inside the string literal, without quotes. // Loc+Length for the segment inside the string literal, without quotes.
SourceLoc Loc; SourceLoc Loc;
unsigned Length, IndentToStrip; unsigned Length, IndentToStrip, CustomDelimiterLen;
bool IsFirstSegment, IsLastSegment; bool IsFirstSegment, IsLastSegment;
static StringSegment getLiteral(SourceLoc Loc, unsigned Length, static StringSegment getLiteral(SourceLoc Loc, unsigned Length,
bool IsFirstSegment, bool IsLastSegment, bool IsFirstSegment, bool IsLastSegment,
unsigned IndentToStrip) { unsigned IndentToStrip,
unsigned CustomDelimiterLen) {
StringSegment Result; StringSegment Result;
Result.Kind = Literal; Result.Kind = Literal;
Result.Loc = Loc; Result.Loc = Loc;
@@ -377,6 +378,7 @@ public:
Result.IsFirstSegment = IsFirstSegment; Result.IsFirstSegment = IsFirstSegment;
Result.IsLastSegment = IsLastSegment; Result.IsLastSegment = IsLastSegment;
Result.IndentToStrip = IndentToStrip; Result.IndentToStrip = IndentToStrip;
Result.CustomDelimiterLen = CustomDelimiterLen;
return Result; return Result;
} }
@@ -388,6 +390,7 @@ public:
Result.IsFirstSegment = false; Result.IsFirstSegment = false;
Result.IsLastSegment = false; Result.IsLastSegment = false;
Result.IndentToStrip = 0; Result.IndentToStrip = 0;
Result.CustomDelimiterLen = 0;
return Result; return Result;
} }
@@ -404,13 +407,14 @@ public:
SmallVectorImpl<char> &Buffer, SmallVectorImpl<char> &Buffer,
bool IsFirstSegment = false, bool IsFirstSegment = false,
bool IsLastSegment = false, bool IsLastSegment = false,
unsigned IndentToStrip = 0); unsigned IndentToStrip = 0,
unsigned CustomDelimiterLen = 0);
StringRef getEncodedStringSegment(StringSegment Segment, StringRef getEncodedStringSegment(StringSegment Segment,
SmallVectorImpl<char> &Buffer) const { SmallVectorImpl<char> &Buffer) const {
return getEncodedStringSegment( return getEncodedStringSegment(
StringRef(getBufferPtrForSourceLoc(Segment.Loc), Segment.Length), StringRef(getBufferPtrForSourceLoc(Segment.Loc), Segment.Length),
Buffer, Segment.IsFirstSegment, Segment.IsLastSegment, Buffer, Segment.IsFirstSegment, Segment.IsLastSegment,
Segment.IndentToStrip); Segment.IndentToStrip, Segment.CustomDelimiterLen);
} }
/// \brief Given a string literal token, separate it into string/expr segments /// \brief Given a string literal token, separate it into string/expr segments
@@ -474,7 +478,8 @@ private:
return diagnose(Loc, Diagnostic(DiagID, std::forward<ArgTypes>(Args)...)); return diagnose(Loc, Diagnostic(DiagID, std::forward<ArgTypes>(Args)...));
} }
void formToken(tok Kind, const char *TokStart, bool MultilineString = false); void formToken(tok Kind, const char *TokStart, bool IsMultilineString = false,
unsigned CustomDelimiterLen = 0);
void formEscapedIdentifierToken(const char *TokStart); void formEscapedIdentifierToken(const char *TokStart);
/// Advance to the end of the line. /// Advance to the end of the line.
@@ -498,10 +503,10 @@ private:
void lexTrivia(syntax::Trivia &T, bool IsForTrailingTrivia); void lexTrivia(syntax::Trivia &T, bool IsForTrailingTrivia);
static unsigned lexUnicodeEscape(const char *&CurPtr, Lexer *Diags); static unsigned lexUnicodeEscape(const char *&CurPtr, Lexer *Diags);
unsigned lexCharacter(const char *&CurPtr, unsigned lexCharacter(const char *&CurPtr, char StopQuote,
char StopQuote, bool EmitDiagnostics, bool EmitDiagnostics, bool IsMultilineString = false,
bool MultilineString = false); unsigned CustomDelimiterLen = 0);
void lexStringLiteral(); void lexStringLiteral(unsigned CustomDelimiterLen = 0);
void lexEscapedIdentifier(); void lexEscapedIdentifier();
void tryLexEditorPlaceholder(); void tryLexEditorPlaceholder();

View File

@@ -45,7 +45,10 @@ class Token {
/// Modifiers for string literals /// Modifiers for string literals
unsigned MultilineString : 1; unsigned MultilineString : 1;
// Padding bits == 32 - sizeof(Kind) * 8 - 3; /// Length of custom delimiter of "raw" string literals
unsigned CustomDelimiterLen : 8;
// Padding bits == 32 - 11;
/// \brief The length of the comment that precedes the token. /// \brief The length of the comment that precedes the token.
unsigned CommentLength; unsigned CommentLength;
@@ -62,8 +65,8 @@ class Token {
public: public:
Token(tok Kind, StringRef Text, unsigned CommentLength = 0) Token(tok Kind, StringRef Text, unsigned CommentLength = 0)
: Kind(Kind), AtStartOfLine(false), EscapedIdentifier(false), : Kind(Kind), AtStartOfLine(false), EscapedIdentifier(false),
MultilineString(false), CommentLength(CommentLength), MultilineString(false), CustomDelimiterLen(0),
Text(Text) {} CommentLength(CommentLength), Text(Text) {}
Token() : Token(tok::NUM_TOKENS, {}, 0) {} Token() : Token(tok::NUM_TOKENS, {}, 0) {}
@@ -266,17 +269,24 @@ public:
/// \brief Set the token to the specified kind and source range. /// \brief Set the token to the specified kind and source range.
void setToken(tok K, StringRef T, unsigned CommentLength = 0, void setToken(tok K, StringRef T, unsigned CommentLength = 0,
bool MultilineString = false) { bool IsMultilineString = false, unsigned CustomDelimiterLen = 0) {
Kind = K; Kind = K;
Text = T; Text = T;
this->CommentLength = CommentLength; this->CommentLength = CommentLength;
EscapedIdentifier = false; EscapedIdentifier = false;
this->MultilineString = MultilineString; this->MultilineString = IsMultilineString;
this->CustomDelimiterLen = CustomDelimiterLen;
assert(this->CustomDelimiterLen == CustomDelimiterLen &&
"custom string delimiter length > 255");
} }
bool IsMultilineString() const { bool isMultilineString() const {
return MultilineString; return MultilineString;
} }
unsigned getCustomDelimiterLen() const {
return CustomDelimiterLen;
}
}; };
} // end namespace swift } // end namespace swift

View File

@@ -272,7 +272,8 @@ Token Lexer::getTokenAt(SourceLoc Loc) {
return Result; return Result;
} }
void Lexer::formToken(tok Kind, const char *TokStart, bool MultilineString) { void Lexer::formToken(tok Kind, const char *TokStart,
bool IsMultilineString, unsigned CustomDelimiterLen) {
assert(CurPtr >= BufferStart && assert(CurPtr >= BufferStart &&
CurPtr <= BufferEnd && "Current pointer out of range!"); CurPtr <= BufferEnd && "Current pointer out of range!");
@@ -304,7 +305,8 @@ void Lexer::formToken(tok Kind, const char *TokStart, bool MultilineString) {
lexTrivia(TrailingTrivia, /* IsForTrailingTrivia */ true); lexTrivia(TrailingTrivia, /* IsForTrailingTrivia */ true);
} }
NextToken.setToken(Kind, TokenText, CommentLength, MultilineString); NextToken.setToken(Kind, TokenText, CommentLength,
IsMultilineString, CustomDelimiterLen);
} }
void Lexer::formEscapedIdentifierToken(const char *TokStart) { void Lexer::formEscapedIdentifierToken(const char *TokStart) {
@@ -1211,6 +1213,69 @@ static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) {
} }
} }
/// diagnoseZeroWidthMatchAndAdvance - Match a single delimiter byte.
/// When the byte at \p CurPtr equals \p Target, \p CurPtr is stepped past it
/// and the result is true; otherwise \p CurPtr is left alone and the result
/// is false.
///
/// All delimiter matching is funnelled through here because an invisible
/// character in the middle of a delimiter could extend a literal beyond what
/// it appears to cover, creating potential security bugs. \p Diags is taken
/// for reporting such characters but is not used yet (see the TODO).
static bool diagnoseZeroWidthMatchAndAdvance(char Target, const char *&CurPtr,
                                             DiagnosticEngine *Diags) {
  // TODO: Detect, diagnose and skip over zero-width characters if required.
  // See https://github.com/apple/swift/pull/17668 for possible implementation.
  if (*CurPtr != Target)
    return false;
  ++CurPtr;
  return true;
}
/// advanceIfMultilineDelimiter - Single place where the triple-quote
/// multiline delimiter is recognised.
/// The byte just before \p CurPtr must be a '"' and the two bytes starting at
/// \p CurPtr must also be '"'. In that case \p CurPtr is moved past those two
/// bytes and true is returned; in every other case \p CurPtr is unchanged and
/// false is returned.
static bool advanceIfMultilineDelimiter(const char *&CurPtr,
                                        DiagnosticEngine *Diags) {
  if (CurPtr[-1] != '"')
    return false;

  // Scan on a copy so a partial match does not move the caller's pointer.
  const char *Lookahead = CurPtr;
  for (unsigned QuotesLeft = 2; QuotesLeft != 0; --QuotesLeft) {
    if (!diagnoseZeroWidthMatchAndAdvance('"', Lookahead, Diags))
      return false;
  }

  CurPtr = Lookahead;
  return true;
}
/// advanceIfCustomDelimiter - Detects the custom delimiter that opens a
/// string literal. CurPtr[-1] is generally '#' when this is called.
///
/// Counts the run of '#' characters starting at \p CurPtr (plus one for the
/// '#' assumed to precede it) and requires a '"' right after the run. On
/// success \p CurPtr is left just past that '"' and the delimiter length
/// (always non-zero) is returned. If no '"' follows, \p CurPtr is unchanged
/// and 0 is returned.
static unsigned advanceIfCustomDelimiter(const char *&CurPtr,
                                         DiagnosticEngine *Diags) {
  // Work on a copy; the caller's pointer only moves on a full match.
  const char *Lookahead = CurPtr;

  // Starts at 1 to account for the '#' the caller is expected to have eaten.
  unsigned HashCount = 1;
  while (diagnoseZeroWidthMatchAndAdvance('#', Lookahead, Diags))
    ++HashCount;

  if (!diagnoseZeroWidthMatchAndAdvance('"', Lookahead, Diags))
    return 0;

  CurPtr = Lookahead;
  return HashCount;
}
/// delimiterMatches - Checks that \p CustomDelimiterLen '#' characters start
/// at \p BytesPtr.
///
/// Used after a '\' inside a string to decide whether the backslash starts an
/// escape: in a "raw" string the backslash must be followed by as many '#'
/// characters as surround the quotes, which is what allows interpolation
/// inside a "raw" string. Normal/cooked strings are the degenerate case of a
/// zero-length delimiter, which always matches without moving \p BytesPtr.
/// With \p IsClosing set, the same check is used to detect the final
/// delimiter of a string.
///
/// On a match \p BytesPtr is advanced past the '#' characters and true is
/// returned; on a mismatch \p BytesPtr is unchanged and false is returned.
/// If one further '#' follows a matched delimiter and \p Diags is non-null,
/// an error with a fix-it removing that character is emitted; the extra '#'
/// is not consumed and the result is still true.
static bool delimiterMatches(unsigned CustomDelimiterLen, const char *&BytesPtr,
                             DiagnosticEngine *Diags, bool IsClosing = false) {
  // No custom delimiter: nothing to match, and no surplus-'#' check either.
  if (CustomDelimiterLen == 0)
    return true;

  // Match on a copy so that a partial match leaves BytesPtr where it was.
  const char *Cursor = BytesPtr;
  for (unsigned Remaining = CustomDelimiterLen; Remaining != 0; --Remaining) {
    if (!diagnoseZeroWidthMatchAndAdvance('#', Cursor, Diags))
      return false;
  }
  BytesPtr = Cursor;

  // A surplus '#' directly after the delimiter is reported but tolerated.
  if (*BytesPtr == '#' && Diags) {
    const auto ExtraHashLoc = Lexer::getSourceLoc(BytesPtr);
    Diags->diagnose(ExtraHashLoc,
                    IsClosing ? diag::lex_invalid_closing_delimiter
                              : diag::lex_invalid_escape_delimiter)
        .fixItRemoveChars(ExtraHashLoc, Lexer::getSourceLoc(BytesPtr + 1));
  }
  return true;
}
/// lexCharacter - Read a character and return its UTF32 code. If this is the /// lexCharacter - Read a character and return its UTF32 code. If this is the
/// end of enclosing string/character sequence (i.e. the character is equal to /// end of enclosing string/character sequence (i.e. the character is equal to
/// 'StopQuote'), this returns ~0U and leaves 'CurPtr' pointing to the terminal /// 'StopQuote'), this returns ~0U and leaves 'CurPtr' pointing to the terminal
@@ -1220,7 +1285,8 @@ static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) {
/// character_escape ::= [\][\] | [\]t | [\]n | [\]r | [\]" | [\]' | [\]0 /// character_escape ::= [\][\] | [\]t | [\]n | [\]r | [\]" | [\]' | [\]0
/// character_escape ::= unicode_character_escape /// character_escape ::= unicode_character_escape
unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote,
bool EmitDiagnostics, bool MultilineString) { bool EmitDiagnostics, bool IsMultilineString,
unsigned CustomDelimiterLen) {
const char *CharStart = CurPtr; const char *CharStart = CurPtr;
switch (*CurPtr++) { switch (*CurPtr++) {
@@ -1228,7 +1294,7 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote,
// If this is a "high" UTF-8 character, validate it. // If this is a "high" UTF-8 character, validate it.
if ((signed char)(CurPtr[-1]) >= 0) { if ((signed char)(CurPtr[-1]) >= 0) {
if (isPrintable(CurPtr[-1]) == 0) if (isPrintable(CurPtr[-1]) == 0)
if (!(MultilineString && (CurPtr[-1] == '\t'))) if (!(IsMultilineString && (CurPtr[-1] == '\t')))
if (EmitDiagnostics) if (EmitDiagnostics)
diagnose(CharStart, diag::lex_unprintable_ascii_character); diagnose(CharStart, diag::lex_unprintable_ascii_character);
return CurPtr[-1]; return CurPtr[-1];
@@ -1263,12 +1329,15 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote,
return ~1U; return ~1U;
case '\n': // String literals cannot have \n or \r in them. case '\n': // String literals cannot have \n or \r in them.
case '\r': case '\r':
if (MultilineString) // ... unless they are multiline if (IsMultilineString) // ... unless they are multiline
return CurPtr[-1]; return CurPtr[-1];
if (EmitDiagnostics) if (EmitDiagnostics)
diagnose(CurPtr-1, diag::lex_unterminated_string); diagnose(CurPtr-1, diag::lex_unterminated_string);
return ~1U; return ~1U;
case '\\': // Escapes. case '\\': // Escapes.
if (!delimiterMatches(CustomDelimiterLen, CurPtr,
EmitDiagnostics ? Diags : nullptr))
return '\\';
break; break;
} }
@@ -1276,7 +1345,7 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote,
// Escape processing. We already ate the "\". // Escape processing. We already ate the "\".
switch (*CurPtr) { switch (*CurPtr) {
case ' ': case '\t': case '\n': case '\r': case ' ': case '\t': case '\n': case '\r':
if (MultilineString && maybeConsumeNewlineEscape(CurPtr, 0)) if (IsMultilineString && maybeConsumeNewlineEscape(CurPtr, 0))
return '\n'; return '\n';
LLVM_FALLTHROUGH; LLVM_FALLTHROUGH;
default: // Invalid escape. default: // Invalid escape.
@@ -1334,10 +1403,11 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote,
static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, static const char *skipToEndOfInterpolatedExpression(const char *CurPtr,
const char *EndPtr, const char *EndPtr,
DiagnosticEngine *Diags, DiagnosticEngine *Diags,
bool MultilineString) { bool IsMultilineString) {
llvm::SmallVector<char, 4> OpenDelimiters; SmallVector<char, 4> OpenDelimiters;
llvm::SmallVector<bool, 4> AllowNewline; SmallVector<bool, 4> AllowNewline;
AllowNewline.push_back(MultilineString); SmallVector<unsigned, 4> CustomDelimiter;
AllowNewline.push_back(IsMultilineString);
auto inStringLiteral = [&]() { auto inStringLiteral = [&]() {
return !OpenDelimiters.empty() && return !OpenDelimiters.empty() &&
@@ -1352,6 +1422,7 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr,
// On success scanning the expression body, the real lexer will be used to // On success scanning the expression body, the real lexer will be used to
// relex the body when parsing the expressions. We let it diagnose any // relex the body when parsing the expressions. We let it diagnose any
// issues with malformed tokens or other problems. // issues with malformed tokens or other problems.
unsigned CustomDelimiterLen = 0;
switch (*CurPtr++) { switch (*CurPtr++) {
// String literals in general cannot be split across multiple lines; // String literals in general cannot be split across multiple lines;
// interpolated ones are no exception - unless multiline literals. // interpolated ones are no exception - unless multiline literals.
@@ -1362,43 +1433,52 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr,
// Will be diagnosed as an unterminated string literal. // Will be diagnosed as an unterminated string literal.
return CurPtr-1; return CurPtr-1;
case '#':
if (inStringLiteral() ||
!(CustomDelimiterLen = advanceIfCustomDelimiter(CurPtr, Diags)))
continue;
LLVM_FALLTHROUGH;
case '"': case '"':
case '\'': { case '\'': {
if (!AllowNewline.back() && inStringLiteral()) { if (!AllowNewline.back() && inStringLiteral()) {
if (OpenDelimiters.back() == CurPtr[-1]) { if (OpenDelimiters.back() == CurPtr[-1] &&
delimiterMatches(CustomDelimiter.back(), CurPtr, Diags, true)) {
// Closing single line string literal. // Closing single line string literal.
OpenDelimiters.pop_back(); OpenDelimiters.pop_back();
AllowNewline.pop_back(); AllowNewline.pop_back();
CustomDelimiter.pop_back();
} }
// Otherwise, it's just a quote in string literal. e.g. "foo's". // Otherwise, it's just a quote in string literal. e.g. "foo's".
continue; continue;
} }
bool isMultilineQuote = ( bool isMultilineQuote = advanceIfMultilineDelimiter(CurPtr, Diags);
*CurPtr == '"' && *(CurPtr + 1) == '"' && *(CurPtr - 1) == '"');
if (isMultilineQuote)
CurPtr += 2;
if (!inStringLiteral()) { if (!inStringLiteral()) {
// Open string literal // Open string literal
OpenDelimiters.push_back(CurPtr[-1]); OpenDelimiters.push_back(CurPtr[-1]);
AllowNewline.push_back(isMultilineQuote); AllowNewline.push_back(isMultilineQuote);
CustomDelimiter.push_back(CustomDelimiterLen);
continue; continue;
} }
// We are in multiline string literal. // We are in multiline string literal.
assert(AllowNewline.back() && "other cases must be handled above"); assert(AllowNewline.back() && "other cases must be handled above");
if (isMultilineQuote) { if (isMultilineQuote &&
delimiterMatches(CustomDelimiter.back(), CurPtr, Diags, true)) {
// Close multiline string literal. // Close multiline string literal.
OpenDelimiters.pop_back(); OpenDelimiters.pop_back();
AllowNewline.pop_back(); AllowNewline.pop_back();
CustomDelimiter.pop_back();
} }
// Otherwise, it's just a normal character in multiline string. // Otherwise, it's just a normal character in multiline string.
continue; continue;
} }
case '\\': case '\\':
if (inStringLiteral()) { if (inStringLiteral() &&
delimiterMatches(CustomDelimiter.back(), CurPtr, Diags)) {
char escapedChar = *CurPtr++; char escapedChar = *CurPtr++;
switch (escapedChar) { switch (escapedChar) {
case '(': case '(':
@@ -1458,7 +1538,10 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr,
static StringRef getStringLiteralContent(const Token &Str) { static StringRef getStringLiteralContent(const Token &Str) {
StringRef Bytes = Str.getText(); StringRef Bytes = Str.getText();
if (Str.IsMultilineString()) if (unsigned CustomDelimiterLen = Str.getCustomDelimiterLen())
Bytes = Bytes.drop_front(CustomDelimiterLen).drop_back(CustomDelimiterLen);
if (Str.isMultilineString())
Bytes = Bytes.drop_front(3).drop_back(3); Bytes = Bytes.drop_front(3).drop_back(3);
else else
Bytes = Bytes.drop_front().drop_back(); Bytes = Bytes.drop_front().drop_back();
@@ -1496,7 +1579,7 @@ getMultilineTrailingIndent(const Token &Str, DiagnosticEngine *Diags) {
auto string = StringRef(start, end - start); auto string = StringRef(start, end - start);
// Disallow escaped newline in the last line. // Disallow escaped newline in the last line.
if (Diags) { if (Diags && Str.getCustomDelimiterLen() == 0) {
auto *Ptr = start - 1; auto *Ptr = start - 1;
if (*Ptr == '\n') --Ptr; if (*Ptr == '\n') --Ptr;
if (*Ptr == '\r') --Ptr; if (*Ptr == '\r') --Ptr;
@@ -1652,30 +1735,31 @@ static void validateMultilineIndents(const Token &Str,
/// lexStringLiteral: /// lexStringLiteral:
/// string_literal ::= ["]([^"\\\n\r]|character_escape)*["] /// string_literal ::= ["]([^"\\\n\r]|character_escape)*["]
/// string_literal ::= ["]["]["].*["]["]["] - approximately /// string_literal ::= ["]["]["].*["]["]["] - approximately
void Lexer::lexStringLiteral() { /// string_literal ::= (#+)("")?".*"(\2\1) - "raw" strings
void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) {
const char *TokStart = CurPtr-1; const char *TokStart = CurPtr-1;
assert((*TokStart == '"' || *TokStart == '\'') && "Unexpected start"); assert((*TokStart == '"' || *TokStart == '\'') && "Unexpected start");
// NOTE: We only allow single-quote string literals so we can emit useful // NOTE: We only allow single-quote string literals so we can emit useful
// diagnostics about changing them to double quotes. // diagnostics about changing them to double quotes.
bool wasErroneous = false, MultilineString = false; bool wasErroneous = false, IsMultilineString = false;
// Is this the start of a multiline string literal? // Is this the start of a multiline string literal?
if (*TokStart == '"' && *CurPtr == '"' && *(CurPtr + 1) == '"') { if ((IsMultilineString = advanceIfMultilineDelimiter(CurPtr, Diags))) {
MultilineString = true;
CurPtr += 2;
if (*CurPtr != '\n' && *CurPtr != '\r') if (*CurPtr != '\n' && *CurPtr != '\r')
diagnose(CurPtr, diag::lex_illegal_multiline_string_start) diagnose(CurPtr, diag::lex_illegal_multiline_string_start)
.fixItInsert(Lexer::getSourceLoc(CurPtr), "\n"); .fixItInsert(Lexer::getSourceLoc(CurPtr), "\n");
} }
while (true) { while (true) {
if (*CurPtr == '\\' && *(CurPtr + 1) == '(') { const char *TmpPtr = CurPtr + 1;
if (*CurPtr == '\\' && delimiterMatches(CustomDelimiterLen, TmpPtr, nullptr)
&& *TmpPtr == '(') {
// Consume tokens until we hit the corresponding ')'. // Consume tokens until we hit the corresponding ')'.
CurPtr += 2; CurPtr = TmpPtr + 1;
const char *EndPtr = const char *EndPtr =
skipToEndOfInterpolatedExpression(CurPtr, BufferEnd, skipToEndOfInterpolatedExpression(CurPtr, BufferEnd,
Diags, MultilineString); Diags, IsMultilineString);
if (*EndPtr == ')') { if (*EndPtr == ')') {
// Successfully scanned the body of the expression literal. // Successfully scanned the body of the expression literal.
@@ -1688,21 +1772,21 @@ void Lexer::lexStringLiteral() {
} }
// String literals cannot have \n or \r in them (unless multiline). // String literals cannot have \n or \r in them (unless multiline).
if (((*CurPtr == '\r' || *CurPtr == '\n') && !MultilineString) if (((*CurPtr == '\r' || *CurPtr == '\n') && !IsMultilineString)
|| CurPtr == BufferEnd) { || CurPtr == BufferEnd) {
TokStart -= CustomDelimiterLen;
diagnose(TokStart, diag::lex_unterminated_string); diagnose(TokStart, diag::lex_unterminated_string);
return formToken(tok::unknown, TokStart); return formToken(tok::unknown, TokStart);
} }
unsigned CharValue = lexCharacter(CurPtr, *TokStart, true, MultilineString); unsigned CharValue = lexCharacter(CurPtr, *TokStart, true,
IsMultilineString, CustomDelimiterLen);
wasErroneous |= CharValue == ~1U; wasErroneous |= CharValue == ~1U;
// If this is the end of string, we are done. If it is a normal character // If this is the end of string, we are done. If it is a normal character
// or an already-diagnosed error, just munch it. // or an already-diagnosed error, just munch it.
if (CharValue == ~0U) { if (CharValue == ~0U) {
++CurPtr; ++CurPtr;
if (wasErroneous)
return formToken(tok::unknown, TokStart);
if (*TokStart == '\'') { if (*TokStart == '\'') {
// Complain about single-quote string and suggest replacement with // Complain about single-quote string and suggest replacement with
@@ -1738,20 +1822,19 @@ void Lexer::lexStringLiteral() {
replacement); replacement);
} }
// Is this the end of a multiline string literal? // Is this the end of multiline/custom-delimited string literal?
if (MultilineString) { if ((!IsMultilineString || advanceIfMultilineDelimiter(CurPtr, Diags)) &&
if (*CurPtr == '"' && *(CurPtr + 1) == '"' && *(CurPtr + 2) != '"') { delimiterMatches(CustomDelimiterLen, CurPtr, Diags, true)) {
CurPtr += 2; TokStart -= CustomDelimiterLen;
formToken(tok::string_literal, TokStart, MultilineString); if (wasErroneous)
if (Diags) return formToken(tok::unknown, TokStart);
validateMultilineIndents(NextToken, Diags);
return;
}
else
continue;
}
return formToken(tok::string_literal, TokStart, MultilineString); formToken(tok::string_literal, TokStart,
IsMultilineString, CustomDelimiterLen);
if (IsMultilineString && Diags)
validateMultilineIndents(NextToken, Diags);
return;
}
} }
} }
} }
@@ -2016,13 +2099,35 @@ StringRef Lexer::getEncodedStringSegment(StringRef Bytes,
SmallVectorImpl<char> &TempString, SmallVectorImpl<char> &TempString,
bool IsFirstSegment, bool IsFirstSegment,
bool IsLastSegment, bool IsLastSegment,
unsigned IndentToStrip) { unsigned IndentToStrip,
unsigned CustomDelimiterLen) {
TempString.clear(); TempString.clear();
// Note that it is always safe to read one over the end of "Bytes" because // Note that it is always safe to read one over the end of "Bytes" because
// we know that there is a terminating " character. Use BytesPtr to avoid a // we know that there is a terminating " character. Use BytesPtr to avoid a
// range check subscripting on the StringRef. // range check subscripting on the StringRef.
const char *BytesPtr = Bytes.begin(); const char *BytesPtr = Bytes.begin();
// Special case when being called from EncodedDiagnosticMessage(...).
// This allows multiline and delimited strings to work in attributes.
// The string has already been validated by the initial parse.
if (IndentToStrip == ~0u && CustomDelimiterLen == ~0u) {
IndentToStrip = CustomDelimiterLen = 0;
// Restore trailing indent removal for multiline.
const char *Backtrack = BytesPtr - 1;
if (Backtrack[-1] == '"' && Backtrack[-2] == '"') {
Backtrack -= 2;
for (const char *Trailing = Bytes.end() - 1;
*Trailing == ' ' || *Trailing == '\t'; Trailing--)
IndentToStrip++;
}
// Restore delimiter if any.
while (*--Backtrack == '#')
CustomDelimiterLen++;
}
bool IsEscapedNewline = false; bool IsEscapedNewline = false;
while (BytesPtr < Bytes.end()) { while (BytesPtr < Bytes.end()) {
char CurChar = *BytesPtr++; char CurChar = *BytesPtr++;
@@ -2043,7 +2148,8 @@ StringRef Lexer::getEncodedStringSegment(StringRef Bytes,
continue; continue;
} }
if (CurChar != '\\') { if (CurChar != '\\' ||
!delimiterMatches(CustomDelimiterLen, BytesPtr, nullptr)) {
TempString.push_back(CurChar); TempString.push_back(CurChar);
continue; continue;
} }
@@ -2113,8 +2219,8 @@ void Lexer::getStringLiteralSegments(
// Are substitutions required either for indent stripping or line ending // Are substitutions required either for indent stripping or line ending
// normalization? // normalization?
bool MultilineString = Str.IsMultilineString(), IsFirstSegment = true; bool MultilineString = Str.isMultilineString(), IsFirstSegment = true;
unsigned IndentToStrip = 0; unsigned IndentToStrip = 0, CustomDelimiterLen = Str.getCustomDelimiterLen();
if (MultilineString) if (MultilineString)
IndentToStrip = IndentToStrip =
std::get<0>(getMultilineTrailingIndent(Str, /*Diags=*/nullptr)).size(); std::get<0>(getMultilineTrailingIndent(Str, /*Diags=*/nullptr)).size();
@@ -2124,13 +2230,12 @@ void Lexer::getStringLiteralSegments(
// range check subscripting on the StringRef. // range check subscripting on the StringRef.
const char *SegmentStartPtr = Bytes.begin(); const char *SegmentStartPtr = Bytes.begin();
const char *BytesPtr = SegmentStartPtr; const char *BytesPtr = SegmentStartPtr;
// FIXME: Use SSE to scan for '\'. size_t pos;
while (BytesPtr != Bytes.end()) { while ((pos = Bytes.find('\\', BytesPtr-Bytes.begin())) != StringRef::npos) {
char CurChar = *BytesPtr++; BytesPtr = Bytes.begin() + pos + 1;
if (CurChar != '\\')
continue;
if (*BytesPtr++ != '(') if (!delimiterMatches(CustomDelimiterLen, BytesPtr, Diags) ||
*BytesPtr++ != '(')
continue; continue;
// String interpolation. // String interpolation.
@@ -2138,8 +2243,9 @@ void Lexer::getStringLiteralSegments(
// Push the current segment. // Push the current segment.
Segments.push_back( Segments.push_back(
StringSegment::getLiteral(getSourceLoc(SegmentStartPtr), StringSegment::getLiteral(getSourceLoc(SegmentStartPtr),
BytesPtr-SegmentStartPtr-2, BytesPtr-SegmentStartPtr-2-CustomDelimiterLen,
IsFirstSegment, false, IndentToStrip)); IsFirstSegment, false, IndentToStrip,
CustomDelimiterLen));
IsFirstSegment = false; IsFirstSegment = false;
// Find the closing ')'. // Find the closing ')'.
@@ -2162,7 +2268,8 @@ void Lexer::getStringLiteralSegments(
Segments.push_back( Segments.push_back(
StringSegment::getLiteral(getSourceLoc(SegmentStartPtr), StringSegment::getLiteral(getSourceLoc(SegmentStartPtr),
Bytes.end()-SegmentStartPtr, Bytes.end()-SegmentStartPtr,
IsFirstSegment, true, IndentToStrip)); IsFirstSegment, true, IndentToStrip,
CustomDelimiterLen));
} }
@@ -2261,6 +2368,8 @@ void Lexer::lexImpl() {
case '\\': return formToken(tok::backslash, TokStart); case '\\': return formToken(tok::backslash, TokStart);
case '#': case '#':
if (unsigned CustomDelimiterLen = advanceIfCustomDelimiter(CurPtr, Diags))
return lexStringLiteral(CustomDelimiterLen);
return lexHash(); return lexHash();
// Operator characters. // Operator characters.

View File

@@ -1923,7 +1923,7 @@ ParserResult<Expr> Parser::parseExprStringLiteral() {
LocalContext.setCreateSyntax(SyntaxKind::StringInterpolationExpr); LocalContext.setCreateSyntax(SyntaxKind::StringInterpolationExpr);
StringRef Quote; StringRef Quote;
tok QuoteKind; tok QuoteKind;
std::tie(Quote, QuoteKind) = Tok.IsMultilineString() ? std::tie(Quote, QuoteKind) = Tok.isMultilineString() ?
std::make_tuple("\"\"\"", tok::multiline_string_quote) : std::make_tuple("\"\"\"", tok::multiline_string_quote) :
std::make_tuple("\"", tok::string_quote); std::make_tuple("\"", tok::string_quote);

View File

@@ -216,8 +216,9 @@ static void getStringPartTokens(const Token &Tok, const LangOptions &LangOpts,
const SourceManager &SM, const SourceManager &SM,
int BufID, std::vector<Token> &Toks) { int BufID, std::vector<Token> &Toks) {
assert(Tok.is(tok::string_literal)); assert(Tok.is(tok::string_literal));
bool IsMultiline = Tok.IsMultilineString(); bool IsMultiline = Tok.isMultilineString();
unsigned QuoteLen = IsMultiline ? 3 : 1; unsigned CustomDelimiterLen = Tok.getCustomDelimiterLen();
unsigned QuoteLen = (IsMultiline ? 3 : 1) + CustomDelimiterLen;
SmallVector<Lexer::StringSegment, 4> Segments; SmallVector<Lexer::StringSegment, 4> Segments;
Lexer::getStringLiteralSegments(Tok, Segments, /*Diags=*/nullptr); Lexer::getStringLiteralSegments(Tok, Segments, /*Diags=*/nullptr);
for (unsigned i = 0, e = Segments.size(); i != e; ++i) { for (unsigned i = 0, e = Segments.size(); i != e; ++i) {
@@ -239,7 +240,8 @@ static void getStringPartTokens(const Token &Tok, const LangOptions &LangOpts,
StringRef Text = SM.extractText({ Loc, Len }); StringRef Text = SM.extractText({ Loc, Len });
Token NewTok; Token NewTok;
NewTok.setToken(tok::string_literal, Text, IsMultiline); NewTok.setToken(tok::string_literal, Text,
IsMultiline, CustomDelimiterLen);
Toks.push_back(NewTok); Toks.push_back(NewTok);
} else { } else {
@@ -372,7 +374,7 @@ class TokenRecorder: public ConsumeTokenReceiver {
} }
void relexComment(CharSourceRange CommentRange, void relexComment(CharSourceRange CommentRange,
llvm::SmallVectorImpl<Token> &Scracth) { llvm::SmallVectorImpl<Token> &Scratch) {
Lexer L(Ctx.LangOpts, Ctx.SourceMgr, BufferID, nullptr, /*InSILMode=*/false, Lexer L(Ctx.LangOpts, Ctx.SourceMgr, BufferID, nullptr, /*InSILMode=*/false,
HashbangMode::Disallowed, HashbangMode::Disallowed,
CommentRetentionMode::ReturnAsTokens, CommentRetentionMode::ReturnAsTokens,
@@ -385,7 +387,7 @@ class TokenRecorder: public ConsumeTokenReceiver {
if (Result.is(tok::eof)) if (Result.is(tok::eof))
break; break;
assert(Result.is(tok::comment)); assert(Result.is(tok::comment));
Scracth.push_back(Result); Scratch.push_back(Result);
} }
} }

View File

@@ -2164,7 +2164,7 @@ class EncodedDiagnosticMessage {
public: public:
/// \param S A string with an encoded message /// \param S A string with an encoded message
EncodedDiagnosticMessage(StringRef S) EncodedDiagnosticMessage(StringRef S)
: Message(Lexer::getEncodedStringSegment(S, Buf)) {} : Message(Lexer::getEncodedStringSegment(S, Buf, true, true, ~0, ~0)) {}
/// The unescaped message to display to the user. /// The unescaped message to display to the user.
const StringRef Message; const StringRef Message;

134
test/Parse/raw_string.swift Normal file
View File

@@ -0,0 +1,134 @@
// RUN: %target-swift-frontend -dump-ast %s 2>&1 | %FileCheck --strict-whitespace %s
// Exercises string literals written with custom '#' delimiters. The file is
// dumped as an AST and each expected literal value below is matched against
// that dump, so the quoted patterns show the value in escaped form.
import Swift
_ = #"""
###################################################################
## This source file is part of the Swift.org open source project ##
###################################################################
"""#
// CHECK: "###################################################################\n## This source file is part of the Swift.org open source project ##\n###################################################################"
_ = #"""
# H1 #
## H2 ##
### H3 ###
"""#
// CHECK: "# H1 #\n## H2 ##\n### H3 ###"
// ===---------- Multiline RawString --------===
_ = ##"""
One
""Alpha""
"""##
// CHECK: "One\n\"\"Alpha\"\""
_ = ##"""
Two
Beta
"""##
// CHECK: " Two\nBeta"
_ = #"""
Three\r
Gamma\
"""#
// CHECK: " Three\\r\n Gamma\\"
_ = ###"""
Four \(foo)
Delta
"""###
// CHECK: " Four \\(foo)\n Delta"
// Escapes carrying the same number of '#' as the delimiter are still processed.
_ = ##"""
print("""
Five\##n\##n\##nEpsilon
""")
"""##
// CHECK: "print(\"\"\"\n Five\n\n\nEpsilon\n \"\"\")"
// ===---------- Single line --------===
_ = #""Zeta""#
// CHECK: "\"Zeta\""
_ = #""Eta"\#n\#n\#n\#""#
// CHECK: "\"Eta\"\n\n\n\""
_ = #""Iota"\n\n\n\""#
// CHECK: "\"Iota\"\\n\\n\\n\\\""
_ = #"a raw string with \" in it"#
// CHECK: "a raw string with \\\" in it"
_ = ##"""
a raw string with """ in it
"""##
// CHECK: "a raw string with \"\"\" in it"
// Interpolation is written with the delimiter's '#' count after the backslash.
let foo = "Interpolation"
_ = #"\b\b \#(foo)\#(foo) Kappa"#
// CHECK: "\\b\\b "
// CHECK: " Kappa"
_ = """
interpolating \(##"""
delimited \##("string")\#n\##n
"""##)
"""
// CHECK: "interpolating "
// CHECK: "delimited "
// CHECK: "string"
// CHECK: "\\#n\n"
#"unused literal"#
// CHECK: "unused literal"
// ===---------- From proposal --------===
_ = #"This is a string"#
// CHECK: "This is a string"
_ = #####"This is a string"#####
// CHECK: "This is a string"
_ = #"enum\s+.+\{.*case\s+[:upper:]"#
// CHECK: "enum\\s+.+\\{.*case\\s+[:upper:]"
_ = #"Alice: "How long is forever?" White Rabbit: "Sometimes, just one second.""#
// CHECK: "Alice: \"How long is forever?\" White Rabbit: \"Sometimes, just one second.\""
_ = #"\#\#1"#
// CHECK: "\\#1"
_ = ##"\#1"##
// CHECK: "\\#1"
_ = #"c:\windows\system32"#
// CHECK: "c:\\windows\\system32"
_ = #"\d{3) \d{3} \d{4}"#
// CHECK: "\\d{3) \\d{3} \\d{4}"
_ = #"""
a string with
"""
in it
"""#
// CHECK: "a string with\n\"\"\"\nin it"
_ = #"a raw string containing \r\n"#
// CHECK: "a raw string containing \\r\\n"
_ = #"""
[
{
"id": "12345",
"title": "A title that \"contains\" \\\""
}
]
"""#
// CHECK: "[\n {\n \"id\": \"12345\",\n \"title\": \"A title that \\\"contains\\\" \\\\\\\"\"\n }\n]"

View File

@@ -0,0 +1,14 @@
// RUN: %target-typecheck-verify-swift
// Negative cases: the number of '#' characters in an escape or in the closing
// delimiter does not agree with the opening delimiter.

// The escape is written with two '#', but the literal is delimited by one.
#"\##("invalid")"#
// expected-error@-1{{too many '#' characters in delimited escape}}
// expected-error@-2{{invalid escape sequence in literal}}
// Opened with four '#' but closed with only three.
####"invalid"###
// expected-error@-1{{unterminated string literal}}
// Opened with three '#' but closed with four.
###"invalid"####
// expected-error@-1{{too many '#' characters in closing delimiter}}
// expected-error@-2{{consecutive statements on a line must be separated by ';'}}
// expected-error@-3{{expected expression}}
// expected-warning@-4{{string literal is unused}}