//===--- Lexer.cpp - Swift Language Lexer ---------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See http://swift.org/LICENSE.txt for license information
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
//
// This file implements the Lexer and Token interfaces.
//
//===----------------------------------------------------------------------===//

#include "swift/Parse/Lexer.h"
#include "swift/AST/ASTContext.h"
#include "swift/AST/Diagnostics.h"
#include "swift/AST/Identifier.h"
#include "swift/Basic/Fallthrough.h"
#include "swift/Basic/SourceManager.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"

using namespace swift;

//===----------------------------------------------------------------------===//
// UTF8 Validation/Encoding/Decoding helper functions
//===----------------------------------------------------------------------===//

/// EncodeToUTF8 - Encode the specified code point into a UTF8 stream.  Return
/// true if it is an erroneous code point.
static bool EncodeToUTF8(unsigned CharValue,
                         llvm::SmallVectorImpl<char> &Result) {
  assert(CharValue >= 0x80 && "Single-byte encoding should be already handled");
  // Number of bits in the value, ignoring leading zeros.
  unsigned NumBits = 32-llvm::countLeadingZeros(CharValue);

  unsigned LowHalf = CharValue & 0xFFFF;
  // Reserved values in each plane.
  if (LowHalf == 0xFFFE || LowHalf == 0xFFFF)
    return true;

  // Handle the leading byte, based on the number of bits in the value.
  unsigned NumTrailingBytes;
  if (NumBits <= 5+6) {
    // Encoding is 0b110aaaaa 10bbbbbb
    Result.push_back(char(0xC0 | (CharValue >> 6)));
    NumTrailingBytes = 1;
  } else if (NumBits <= 4+6+6) {
    // Encoding is 0b1110aaaa 10bbbbbb 10cccccc
    Result.push_back(char(0xE0 | (CharValue >> (6+6))));
    NumTrailingBytes = 2;

    // UTF-16 surrogate pair values are not valid code points.
    if (CharValue >= 0xD800 && CharValue <= 0xDFFF)
      return true;
    // U+FDD0...U+FDEF are also reserved.
    if (CharValue >= 0xFDD0 && CharValue <= 0xFDEF)
      return true;
  } else if (NumBits <= 3+6+6+6) {
    // Encoding is 0b11110aaa 10bbbbbb 10cccccc 10dddddd
    Result.push_back(char(0xF0 | (CharValue >> (6+6+6))));
    NumTrailingBytes = 3;
    // Reject over-large code points.  These cannot be encoded as UTF-16
    // surrogate pairs, so UTF-32 doesn't allow them.
    if (CharValue > 0x10FFFF)
      return true;
  } else {
    return true;  // UTF8 can encode these, but they aren't valid code points.
  }

  // Emit all of the trailing bytes.
  while (NumTrailingBytes--)
    Result.push_back(char(0x80 | (0x3F & (CharValue >> (NumTrailingBytes*6)))));
  return false;
}

/// CLO8 - Return the number of leading ones in the specified 8-bit value.
static unsigned CLO8(unsigned char C) {
  return llvm::CountLeadingOnes_32(uint32_t(C) << 24);
}

/// isStartOfUTF8Character - Return true if this isn't a UTF8 continuation
/// character, which will be of the form 0b10XXXXXX.
static bool isStartOfUTF8Character(unsigned char C) {
  return (signed char)C >= 0 || C >= 0xC0;  // C0 = 0b11000000
}

/// validateUTF8CharacterAndAdvance - Given a pointer to the starting byte of a
/// UTF8 character, validate it and advance the lexer past it.
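/// For example, the byte sequence E2 82 AC decodes to U+20AC, while a lone
/// continuation byte such as 0x80, or a surrogate encoding such as ED A0 80,
/// is rejected.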
/// This returns the
/// encoded character or ~0U if the encoding is invalid.
static uint32_t validateUTF8CharacterAndAdvance(const char *&Ptr,
                                                const char *End) {
  if (Ptr >= End)
    return ~0U;

  unsigned char CurByte = *Ptr++;
  if (CurByte < 0x80)
    return CurByte;

  // Read the number of high bits set, which indicates the number of bytes in
  // the character.
  unsigned EncodedBytes = CLO8(CurByte);

  // If this is 0b10XXXXXX, then it is a continuation character.
  if (EncodedBytes == 1 ||
      // If the number of encoded bytes is > 4, then this is an invalid
      // character in the range of 0xF5 and above.  These would start an
      // encoding for something that couldn't be represented with UTF-16
      // surrogate pairs, so Unicode rejects them.
      EncodedBytes > 4) {
    // Skip until we get the start of another character.  This is guaranteed to
    // at least stop at the nul at the end of the buffer.
    while (Ptr < End && !isStartOfUTF8Character(*Ptr))
      ++Ptr;
    return ~0U;
  }

  // Drop the high bits indicating the # bytes of the result.
  unsigned CharValue = (unsigned char)(CurByte << EncodedBytes) >> EncodedBytes;

  // Read and validate the continuation bytes.
  for (unsigned i = 1; i != EncodedBytes; ++i) {
    if (Ptr >= End)
      return ~0U;
    CurByte = *Ptr;
    // If the high bit isn't set or the second bit isn't clear, then this is
    // not a continuation byte!
    if (CurByte < 0x80 || CurByte >= 0xC0)
      return ~0U;

    // Accumulate our result.
    CharValue <<= 6;
    CharValue |= CurByte & 0x3F;
    ++Ptr;
  }

  // UTF-16 surrogate pair values are not valid code points.
  if (CharValue >= 0xD800 && CharValue <= 0xDFFF)
    return ~0U;

  // If we got here, we read the appropriate number of accumulated bytes.
  // Verify that the encoding was actually minimal.
  // Number of bits in the value, ignoring leading zeros.
  unsigned NumBits = 32-llvm::countLeadingZeros(CharValue);

  if (NumBits <= 5+6)
    return EncodedBytes == 2 ? CharValue : ~0U;
  if (NumBits <= 4+6+6)
    return EncodedBytes == 3 ? CharValue : ~0U;
  return EncodedBytes == 4 ? CharValue : ~0U;
}

//===----------------------------------------------------------------------===//
// Setup and Helper Methods
//===----------------------------------------------------------------------===//

Lexer::Lexer(SourceManager &SourceMgr, DiagnosticEngine *Diags,
             unsigned BufferID, bool InSILMode, bool KeepComments)
  : SourceMgr(SourceMgr), Diags(Diags), BufferID(BufferID),
    InSILMode(InSILMode), KeepComments(KeepComments) {
  // Initialize buffer pointers.
  auto *Buffer = SourceMgr->getMemoryBuffer(BufferID);
  BufferStart = Buffer->getBufferStart();
  BufferEnd = Buffer->getBufferEnd();
  CurPtr = BufferStart;

  // Initialize code completion.
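  // A nul byte placed at this offset will later be lexed as a
  // tok::code_complete token (see the 'case 0' handling in lexImpl below).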
  if (BufferID == SourceMgr.getCodeCompletionBufferID()) {
    const char *Ptr = BufferStart + SourceMgr.getCodeCompletionOffset();
    if (Ptr >= BufferStart && Ptr <= BufferEnd)
      CodeCompletionPtr = Ptr;
  }
}

Lexer::Lexer(SourceManager &SourceMgr, unsigned BufferID,
             DiagnosticEngine *Diags, bool InSILMode, bool KeepComments)
  : Lexer(SourceMgr, Diags, BufferID, InSILMode, KeepComments) {
  primeLexer();
}

void Lexer::primeLexer() {
  assert(NextToken.is(tok::NUM_TOKENS));
  lexImpl();
  assert((NextToken.isAtStartOfLine() || CurPtr != BufferStart) &&
         "The token should be at the beginning of the line, "
         "or we should be lexing from the middle of the buffer");
}

void Lexer::initSubLexer(Lexer &Parent, State BeginState, State EndState) {
  assert(BufferID == static_cast<unsigned>(SourceMgr->FindBufferContainingLoc(
                         BeginState.Loc.Value)) &&
         "state for the wrong buffer");
  assert(BufferID == static_cast<unsigned>(SourceMgr->FindBufferContainingLoc(
                         EndState.Loc.Value)) &&
         "state for the wrong buffer");

  // If the parent lexer should stop prematurely, and the ArtificialEOF
  // position is in this subrange, then we should stop at that point, too.
  if (Parent.ArtificialEOF &&
      Parent.ArtificialEOF >= BufferStart && Parent.ArtificialEOF <= BufferEnd) {
    ArtificialEOF = Parent.ArtificialEOF;
  } else
    ArtificialEOF = EndState.Loc.Value.getPointer();

  primeLexer();
  restoreState(BeginState);
}

InFlightDiagnostic Lexer::diagnose(const char *Loc, Diag<> ID) {
  if (Diags)
    return Diags->diagnose(getSourceLoc(Loc), ID);
  return InFlightDiagnostic();
}

Token Lexer::getTokenAt(SourceLoc Loc) {
  assert(BufferID == static_cast<unsigned>(
                         SourceMgr->FindBufferContainingLoc(Loc.Value)) &&
         "location from the wrong buffer");

  Lexer L(SourceMgr, BufferID, Diags, InSILMode, /*KeepComments=*/false);
  L.restoreState(State(Loc));
  Token Result;
  L.lex(Result);
  return Result;
}

void Lexer::formToken(tok Kind, const char *TokStart) {
  // When we are lexing a subrange from the middle of a file buffer, we will
  // run past the end of the range, but will stay within the file.  Check if
  // we are past the imaginary EOF, and synthesize a tok::eof in this case.
  if (Kind != tok::eof && ArtificialEOF && TokStart >= ArtificialEOF) {
    Kind = tok::eof;
  }
  NextToken.setToken(Kind, StringRef(TokStart, CurPtr-TokStart));
}

Lexer::State Lexer::getStateForBeginningOfTokenLoc(SourceLoc Loc) const {
  const char *Ptr = Loc.Value.getPointer();
  assert(Ptr >= BufferStart && Ptr <= BufferEnd);
  // Skip whitespace backwards until we hit a newline.  This is needed to
  // correctly lex the token if it is at the beginning of the line.
  while (Ptr >= BufferStart + 1) {
    char C = Ptr[-1];
    if (C == ' ' || C == '\t' || C == 0) {
      Ptr--;
      continue;
    }
    if (C == '\n' || C == '\r') {
      Ptr--;
      break;
    }
    break;
  }
  return State(SourceLoc(llvm::SMLoc::getFromPointer(Ptr)));
}

//===----------------------------------------------------------------------===//
// Lexer Subroutines
//===----------------------------------------------------------------------===//

static void diagnoseEmbeddedNul(DiagnosticEngine *Diags, const char *Ptr) {
  assert(Ptr && "invalid source location");
  assert(*Ptr == '\0' && "not an embedded null");

  if (!Diags)
    return;

  SourceLoc NulLoc = Lexer::getSourceLoc(Ptr);
  SourceLoc NulEndLoc = Lexer::getSourceLoc(Ptr+1);
  Diags->diagnose(NulLoc, diag::lex_nul_character)
    .fixItRemove(DiagnosticInfo::Range(NulLoc, NulEndLoc));
}

void Lexer::skipToEndOfLine() {
  while (1) {
    switch (*CurPtr++) {
    case '\n':
    case '\r':
      NextToken.setAtStartOfLine(true);
      return;  // If we found the end of the line, return.
    default:
      // If this is a "high" UTF-8 character, validate it.
      if ((signed char)(CurPtr[-1]) < 0) {
        --CurPtr;
        const char *CharStart = CurPtr;
        if (validateUTF8CharacterAndAdvance(CurPtr, BufferEnd) == ~0U)
          diagnose(CharStart, diag::lex_invalid_utf8_character);
      }
      break;  // Otherwise, eat other characters.
    case 0:
      // If this is a random nul character in the middle of a buffer, skip it
      // as whitespace.
      if (CurPtr-1 != BufferEnd) {
        diagnoseEmbeddedNul(Diags, CurPtr-1);
        break;
      }

      // Otherwise, the last line of the file does not have a newline.
      --CurPtr;
      return;
    }
  }
}

void Lexer::skipSlashSlashComment() {
  assert(CurPtr[-1] == '/' && CurPtr[0] == '/' && "Not a // comment");
  skipToEndOfLine();
}

void Lexer::skipHashbang() {
  assert(CurPtr == BufferStart && CurPtr[0] == '#' && CurPtr[1] == '!' &&
         "Not a hashbang");
  skipToEndOfLine();
}

/// skipSlashStarComment - /**/ comments are skipped (treated as whitespace).
/// Note that (unlike in C) block comments can be nested.
void Lexer::skipSlashStarComment() {
  const char *StartPtr = CurPtr-1;
  assert(CurPtr[-1] == '/' && CurPtr[0] == '*' && "Not a /* comment");

  // Make sure to advance over the * so that we don't incorrectly handle /*/ as
  // the beginning and end of the comment.
  ++CurPtr;

  // /**/ comments can be nested, keep track of how deep we've gone.
  unsigned Depth = 1;

  while (1) {
    switch (*CurPtr++) {
    case '*':
      // Check for a '*/'.
      if (*CurPtr == '/') {
        ++CurPtr;
        if (--Depth == 0)
          return;
      }
      break;
    case '/':
      // Check for a '/*'.
      if (*CurPtr == '*') {
        ++CurPtr;
        ++Depth;
      }
      break;

    case '\n':
    case '\r':
      NextToken.setAtStartOfLine(true);
      break;

    default:
      // If this is a "high" UTF-8 character, validate it.
      if ((signed char)(CurPtr[-1]) < 0) {
        --CurPtr;
        const char *CharStart = CurPtr;
        if (validateUTF8CharacterAndAdvance(CurPtr, BufferEnd) == ~0U)
          diagnose(CharStart, diag::lex_invalid_utf8_character);
      }
      break;  // Otherwise, eat other characters.
    case 0:
      // If this is a random nul character in the middle of a buffer, skip it
      // as whitespace.
      if (CurPtr-1 != BufferEnd) {
        diagnoseEmbeddedNul(Diags, CurPtr-1);
        break;
      }

      // Otherwise, we have an unterminated /* comment.
      --CurPtr;

      // Count how many levels deep we are.
      llvm::SmallString<8> Terminator("*/");
      while (--Depth != 0)
        Terminator += "*/";

      const char *EOL = (CurPtr[-1] == '\n') ?
          (CurPtr - 1) : CurPtr;
      diagnose(EOL, diag::lex_unterminated_block_comment)
        .fixItInsert(getSourceLoc(EOL), Terminator);
      diagnose(StartPtr, diag::lex_comment_start);
      return;
    }
  }
}

static bool isValidIdentifierContinuationCodePoint(uint32_t c) {
  if (c < 0x80)
    return isalnum(c) || c == '_' || c == '$';

  // N1518: Recommendations for extended identifier characters for C and C++
  // Proposed Annex X.1: Ranges of characters allowed
  return c == 0x00A8 || c == 0x00AA || c == 0x00AD || c == 0x00AF
      || (c >= 0x00B2 && c <= 0x00B5) || (c >= 0x00B7 && c <= 0x00BA)
      || (c >= 0x00BC && c <= 0x00BE) || (c >= 0x00C0 && c <= 0x00D6)
      || (c >= 0x00D8 && c <= 0x00F6) || (c >= 0x00F8 && c <= 0x00FF)
      || (c >= 0x0100 && c <= 0x167F)
      || (c >= 0x1681 && c <= 0x180D)
      || (c >= 0x180F && c <= 0x1FFF)
      || (c >= 0x200B && c <= 0x200D)
      || (c >= 0x202A && c <= 0x202E)
      || (c >= 0x203F && c <= 0x2040)
      || c == 0x2054
      || (c >= 0x2060 && c <= 0x206F)
      || (c >= 0x2070 && c <= 0x218F)
      || (c >= 0x2460 && c <= 0x24FF)
      || (c >= 0x2776 && c <= 0x2793)
      || (c >= 0x2C00 && c <= 0x2DFF)
      || (c >= 0x2E80 && c <= 0x2FFF)
      || (c >= 0x3004 && c <= 0x3007)
      || (c >= 0x3021 && c <= 0x302F)
      || (c >= 0x3031 && c <= 0x303F)
      || (c >= 0x3040 && c <= 0xD7FF)
      || (c >= 0xF900 && c <= 0xFD3D)
      || (c >= 0xFD40 && c <= 0xFDCF)
      || (c >= 0xFDF0 && c <= 0xFE44)
      || (c >= 0xFE47 && c <= 0xFFFD)
      || (c >= 0x10000 && c <= 0x1FFFD) || (c >= 0x20000 && c <= 0x2FFFD)
      || (c >= 0x30000 && c <= 0x3FFFD) || (c >= 0x40000 && c <= 0x4FFFD)
      || (c >= 0x50000 && c <= 0x5FFFD) || (c >= 0x60000 && c <= 0x6FFFD)
      || (c >= 0x70000 && c <= 0x7FFFD) || (c >= 0x80000 && c <= 0x8FFFD)
      || (c >= 0x90000 && c <= 0x9FFFD) || (c >= 0xA0000 && c <= 0xAFFFD)
      || (c >= 0xB0000 && c <= 0xBFFFD) || (c >= 0xC0000 && c <= 0xCFFFD)
      || (c >= 0xD0000 && c <= 0xDFFFD) || (c >= 0xE0000 && c <= 0xEFFFD);
}

static bool isValidIdentifierStartCodePoint(uint32_t c) {
  if (!isValidIdentifierContinuationCodePoint(c))
    return false;
  if (c < 0x80 && (isnumber(c) || c == '$'))
    return false;

  // N1518: Recommendations for extended identifier characters for C and C++
  // Proposed Annex X.2: Ranges of characters disallowed initially
  if ((c >= 0x0300 && c <= 0x036F) ||
      (c >= 0x1DC0 && c <= 0x1DFF) ||
      (c >= 0x20D0 && c <= 0x20FF) ||
      (c >= 0xFE20 && c <= 0xFE2F))
    return false;

  return true;
}

static bool advanceIf(char const *&ptr, char const *end,
                      bool (*predicate)(uint32_t)) {
  char const *next = ptr;
  uint32_t c = validateUTF8CharacterAndAdvance(next, end);
  if (c == ~0U)
    return false;
  if (predicate(c)) {
    ptr = next;
    return true;
  }
  return false;
}

static bool advanceIfValidStartOfIdentifier(char const *&ptr,
                                            char const *end) {
  return advanceIf(ptr, end, isValidIdentifierStartCodePoint);
}

static bool advanceIfValidContinuationOfIdentifier(char const *&ptr,
                                                   char const *end) {
  return advanceIf(ptr, end, isValidIdentifierContinuationCodePoint);
}

static bool advanceIfValidStartOfOperator(char const *&ptr,
                                          char const *end) {
  return advanceIf(ptr, end, Identifier::isOperatorStartCodePoint);
}

static bool advanceIfValidContinuationOfOperator(char const *&ptr,
                                                 char const *end) {
  return advanceIf(ptr, end, Identifier::isOperatorContinuationCodePoint);
}

/// isIdentifier - Checks whether a string matches the identifier regex.
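/// For example, "foo_bar1" and "a$b" satisfy it, while "1foo" and "$foo" do
/// not (identifiers may not start with a digit or '$').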
bool Lexer::isIdentifier(llvm::StringRef string) {
  if (string.empty()) return false;
  char const *p = string.data(), *end = string.end();
  if (!advanceIfValidStartOfIdentifier(p, end))
    return false;
  while (p < end && advanceIfValidContinuationOfIdentifier(p, end));
  return p == end;
}

/// lexIdentifier - Match [a-zA-Z_][a-zA-Z_$0-9]*
///
/// FIXME: We should also allow unicode characters in identifiers.
void Lexer::lexIdentifier() {
  const char *TokStart = CurPtr-1;
  CurPtr = TokStart;
  bool didStart = advanceIfValidStartOfIdentifier(CurPtr, BufferEnd);
  assert(didStart && "Unexpected start");
  (void) didStart;

  // Lex [a-zA-Z_$0-9[[:XID_Continue:]]]*
  while (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd));

  tok Kind = llvm::StringSwitch<tok>(StringRef(TokStart, CurPtr-TokStart))
#define KEYWORD(kw) \
    .Case(#kw, tok::kw_##kw)
#include "swift/Parse/Tokens.def"
    .Default(tok::identifier);

  // "sil" and "sil_stage" are only keywords in SIL mode.
  if ((Kind == tok::kw_sil || Kind == tok::kw_sil_stage) && !InSILMode)
    Kind = tok::identifier;

  return formToken(Kind, TokStart);
}

/// Is the operator beginning at the given character "left-bound"?
static bool isLeftBound(const char *tokBegin, const char *bufferBegin) {
  // The first character in the file is not left-bound.
  if (tokBegin == bufferBegin) return false;

  switch (tokBegin[-1]) {
  case ' ': case '\r': case '\n': case '\t': // whitespace
  case '(': case '[': case '{':              // opening delimiters
  case ',': case ';': case ':':              // expression separators
  case '\0':                                 // whitespace / last char in file
    return false;
  default:
    return true;
  }
}

/// Is the operator ending at the given character (actually one past the end)
/// "right-bound"?
static bool isRightBound(const char *tokEnd) {
  switch (*tokEnd) {
  case ' ': case '\r': case '\n': case '\t': // whitespace
  case ')': case ']': case '}':              // closing delimiters
  case ',': case ';': case ':':              // expression separators
  case '\0':                                 // whitespace / last char in file
    return false;
  default:
    return true;
  }
}

/// lexOperatorIdentifier - Match identifiers formed out of punctuation.
void Lexer::lexOperatorIdentifier() {
  const char *TokStart = CurPtr-1;

  // We only allow '.' in a series.
  if (*TokStart == '.') {
    while (*CurPtr == '.')
      ++CurPtr;
    if (CurPtr-TokStart > 3) {
      diagnose(TokStart, diag::lex_unexpected_long_period_series);
      return formToken(tok::unknown, TokStart);
    }
  } else {
    CurPtr = TokStart;
    bool didStart = advanceIfValidStartOfOperator(CurPtr, BufferEnd);
    assert(didStart && "unexpected operator start");
    (void) didStart;

    while (advanceIfValidContinuationOfOperator(CurPtr, BufferEnd));
  }

  // Decide between the binary, prefix, and postfix cases.
  // It's binary if either both sides are bound or both sides are not bound.
  // Otherwise, it's postfix if left-bound and prefix if right-bound.
  bool leftBound = isLeftBound(TokStart, BufferStart);
  bool rightBound = isRightBound(CurPtr);

  // Match various reserved words.
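  // A lone '=', a prefix '&', a plain '.', "->", "*/", and "..." are claimed
  // by the language below rather than being formed into user operator tokens.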
  if (CurPtr-TokStart == 1) {
    switch (TokStart[0]) {
    case '=':
      if (leftBound != rightBound)
        diagnose(TokStart, diag::lex_unary_equal_is_reserved);
      // Always emit 'tok::equal' to avoid trickle-down parse errors.
      return formToken(tok::equal, TokStart);
    case '&':
      if (leftBound == rightBound || leftBound)
        break;
      return formToken(tok::amp_prefix, TokStart);
    case '.':
      if (leftBound == rightBound)
        return formToken(tok::period, TokStart);
      if (rightBound)
        return formToken(tok::period_prefix, TokStart);
      diagnose(TokStart, diag::lex_unary_postfix_dot_is_reserved);
      // Always emit 'tok::period' to avoid trickle-down parse errors.
      return formToken(tok::period, TokStart);
    }
  } else if (CurPtr-TokStart == 2) {
    switch ((TokStart[0] << 8) | TokStart[1]) {
    case ('-' << 8) | '>': // ->
      return formToken(tok::arrow, TokStart);
    case ('*' << 8) | '/': // */
      diagnose(TokStart, diag::lex_unexpected_block_comment_end);
      return formToken(tok::unknown, TokStart);
    }
  } else {
    if (CurPtr-TokStart == 3) {
      switch ((TokStart[0] << 16) | (TokStart[1] << 8) | TokStart[2]) {
      case ('.' << 16) | ('.' << 8) | '.':
        return formToken(tok::ellipsis, TokStart);
      }
    }

    // If there is a "//" in the middle of an identifier token, it starts
    // a single-line comment.
    auto Pos = StringRef(TokStart, CurPtr-TokStart).find("//");
    if (Pos != StringRef::npos)
      CurPtr = TokStart+Pos;

    // If there is a "/*" in the middle of an identifier token, it starts
    // a multi-line comment.
    Pos = StringRef(TokStart, CurPtr-TokStart).find("/*");
    if (Pos != StringRef::npos)
      CurPtr = TokStart+Pos;

    // Verify there is no "*/" in the middle of the identifier token; we reject
    // it as potentially ending a block comment.
    Pos = StringRef(TokStart, CurPtr-TokStart).find("*/");
    if (Pos != StringRef::npos) {
      diagnose(TokStart+Pos, diag::lex_unexpected_block_comment_end);
      return formToken(tok::unknown, TokStart);
    }
  }

  if (leftBound == rightBound)
    return formToken(tok::oper_binary, TokStart);

  return formToken(leftBound ? tok::oper_postfix : tok::oper_prefix, TokStart);
}

/// lexDollarIdent - Match $[0-9a-zA-Z_$]*
void Lexer::lexDollarIdent() {
  const char *TokStart = CurPtr-1;
  assert(*TokStart == '$');

  // In a SIL function body, '$' is a token by itself.
  if (InSILBody)
    return formToken(tok::sil_dollar, TokStart);

  // Lex [a-zA-Z_$0-9]*
  while (isalnum(*CurPtr) || *CurPtr == '_' || *CurPtr == '$')
    ++CurPtr;

  return formToken(tok::dollarident, TokStart);
}

void Lexer::lexHexNumber() {
  // We assume we're starting from the 'x' in a '0x...' literal.
  assert(*CurPtr == 'x' && "not a hex literal");
  const char *TokStart = CurPtr-1;
  assert(*TokStart == '0' && "not a hex literal");

  // 0x[0-9a-fA-F][0-9a-fA-F_]*
  ++CurPtr;
  if (!isxdigit(*CurPtr)) {
    diagnose(CurPtr, diag::lex_expected_digit_in_int_literal);
    while (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd));
    return formToken(tok::unknown, TokStart);
  }

  while (isxdigit(*CurPtr) || *CurPtr == '_')
    ++CurPtr;

  if (CurPtr - TokStart == 2) {
    diagnose(CurPtr, diag::lex_expected_digit_in_int_literal);
    while (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd));
    return formToken(tok::unknown, TokStart);
  }

  if (*CurPtr != '.' && *CurPtr != 'p' && *CurPtr != 'P')
    return formToken(tok::integer_literal, TokStart);

  // (\.[0-9A-Fa-f][0-9A-Fa-f_]*)?
  if (*CurPtr == '.') {
    ++CurPtr;

    // If the character after the '.' is not a digit, assume we have an int
    // literal followed by a dot expression.
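    // (For example, "0x1F.max" lexes as the integer literal 0x1F followed by
    // a '.' and an identifier.)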
    if (!isxdigit(*CurPtr)) {
      --CurPtr;
      return formToken(tok::integer_literal, TokStart);
    }

    while (isxdigit(*CurPtr) || *CurPtr == '_')
      ++CurPtr;

    if (*CurPtr != 'p' && *CurPtr != 'P') {
      diagnose(CurPtr, diag::lex_expected_binary_exponent_in_hex_float_literal);
      return formToken(tok::unknown, TokStart);
    }
  }

  // [pP][+-]?[0-9][0-9_]*
  assert((*CurPtr == 'p' || *CurPtr == 'P') && "not at a hex float exponent?!");
  ++CurPtr;

  if (*CurPtr == '+' || *CurPtr == '-')
    ++CurPtr;  // Eat the sign.

  if (!isdigit(*CurPtr)) {
    diagnose(CurPtr, diag::lex_expected_digit_in_fp_exponent);
    return formToken(tok::unknown, TokStart);
  }

  while (isdigit(*CurPtr) || *CurPtr == '_')
    ++CurPtr;

  return formToken(tok::floating_literal, TokStart);
}

/// lexNumber:
///   integer_literal  ::= [0-9][0-9_]*
///   integer_literal  ::= 0x[0-9a-fA-F][0-9a-fA-F_]*
///   integer_literal  ::= 0o[0-7][0-7_]*
///   integer_literal  ::= 0b[01][01_]*
///   floating_literal ::= [0-9][0-9_]*\.[0-9][0-9_]*
///   floating_literal ::= [0-9][0-9_]*\.[0-9][0-9_]*[eE][+-]?[0-9][0-9_]*
///   floating_literal ::= [0-9][0-9_]*[eE][+-]?[0-9][0-9_]*
///   floating_literal ::= 0x[0-9A-Fa-f][0-9A-Fa-f_]*
///                          (\.[0-9A-Fa-f][0-9A-Fa-f_]*)?[pP][+-]?[0-9][0-9_]*
void Lexer::lexNumber() {
  const char *TokStart = CurPtr-1;
  assert((isdigit(*TokStart) || *TokStart == '.') && "Unexpected start");

  auto expected_digit = [&](const char *loc, Diag<> msg) {
    diagnose(loc, msg);
    while (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd));
    return formToken(tok::unknown, TokStart);
  };

  if (*TokStart == '0' && *CurPtr == 'x')
    return lexHexNumber();

  if (*TokStart == '0' && *CurPtr == 'o') {
    // 0o[0-7][0-7_]*
    ++CurPtr;
    if (*CurPtr < '0' || *CurPtr > '7')
      return expected_digit(CurPtr, diag::lex_expected_digit_in_int_literal);

    while ((*CurPtr >= '0' && *CurPtr <= '7') || *CurPtr == '_')
      ++CurPtr;
    if (CurPtr - TokStart == 2)
      return expected_digit(CurPtr, diag::lex_expected_digit_in_int_literal);
    return formToken(tok::integer_literal, TokStart);
  }

  if (*TokStart == '0' && *CurPtr == 'b') {
    // 0b[01][01_]*
    ++CurPtr;
    if (*CurPtr != '0' && *CurPtr != '1')
      return expected_digit(CurPtr, diag::lex_expected_digit_in_int_literal);

    while (*CurPtr == '0' || *CurPtr == '1' || *CurPtr == '_')
      ++CurPtr;
    if (CurPtr - TokStart == 2)
      return expected_digit(CurPtr, diag::lex_expected_digit_in_int_literal);
    return formToken(tok::integer_literal, TokStart);
  }

  // Handle a leading [0-9]+, lexing an integer or falling through if we have a
  // floating point value.
  while (isdigit(*CurPtr) || *CurPtr == '_')
    ++CurPtr;

  // Lex things like 4.x as '4' followed by a tok::period.
  if (*CurPtr == '.') {
    // NextToken is the soon-to-be previous token.
    // Therefore: x.0.1 is sub-tuple access, not x.float_literal.
    if (!isdigit(CurPtr[1]) || NextToken.is(tok::period))
      return formToken(tok::integer_literal, TokStart);
  } else {
    // Floating literals must have '.', 'e', or 'E' after digits.  If it is
    // something else, then this is the end of the token.
    if (*CurPtr != 'e' && *CurPtr != 'E') {
      char const *tmp = CurPtr;
      if (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd))
        return expected_digit(tmp, diag::lex_expected_digit_in_int_literal);

      return formToken(tok::integer_literal, TokStart);
    }
  }

  // Lex decimal point.
  if (*CurPtr == '.') {
    ++CurPtr;

    // Lex any digits after the decimal point.
    while (isdigit(*CurPtr) || *CurPtr == '_')
      ++CurPtr;
  }

  // Lex exponent.
  if (*CurPtr == 'e' || *CurPtr == 'E') {
    ++CurPtr;  // Eat the 'e'.
    if (*CurPtr == '+' || *CurPtr == '-')
      ++CurPtr;  // Eat the sign.
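    // After the optional sign there must be at least one digit, so "2.5e-3"
    // is a single floating literal while "2.5e-" is diagnosed just below.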
    if (!isdigit(*CurPtr))
      return expected_digit(CurPtr, diag::lex_expected_digit_in_fp_exponent);

    while (isdigit(*CurPtr) || *CurPtr == '_')
      ++CurPtr;
  }

  return formToken(tok::floating_literal, TokStart);
}

/// lexCharacter - Read a character and return its UTF32 code.  If this is the
/// end of the enclosing string/character sequence, this returns ~0U.  If this
/// is a malformed character sequence, it emits a diagnostic (when
/// EmitDiagnostics is true) and returns ~1U.
///
///   character_escape  ::= [\][\] | [\]t | [\]n | [\]r | [\]" | [\]' | [\]0
///   character_escape  ::= [\]x hex hex
///   character_escape  ::= [\]u hex hex hex hex
///   character_escape  ::= [\]U hex hex hex hex hex hex hex hex
///   hex               ::= [0-9a-fA-F]
unsigned Lexer::lexCharacter(const char *&CurPtr, bool StopAtDoubleQuote,
                             bool EmitDiagnostics) {
  const char *CharStart = CurPtr;

  switch (*CurPtr++) {
  default: {  // Normal characters are part of the string.
    // If this is a "high" UTF-8 character, validate it.
    if ((signed char)(CurPtr[-1]) >= 0) {
      if (isprint(CurPtr[-1]) == 0)
        diagnose(CharStart, diag::lex_unprintable_ascii_character);
      return CurPtr[-1];
    }
    --CurPtr;
    unsigned CharValue = validateUTF8CharacterAndAdvance(CurPtr, BufferEnd);
    if (CharValue != ~0U) return CharValue;
    if (EmitDiagnostics)
      diagnose(CharStart, diag::lex_invalid_utf8_character);
    return ~1U;
  }
  case '"':
    // If we found the closing " character, we're done.
    if (StopAtDoubleQuote) {
      --CurPtr;
      return ~0U;
    }
    // In a single-quoted string, this is just a character.
    return CurPtr[-1];
  case '\'':
    if (!StopAtDoubleQuote) {
      --CurPtr;
      return ~0U;
    }
    // In a double-quoted string, this is just a character.
    return CurPtr[-1];
  case 0:
    if (CurPtr-1 != BufferEnd) {
      if (EmitDiagnostics)
        diagnose(CurPtr-1, diag::lex_nul_character);
      return 0;
    }
    SWIFT_FALLTHROUGH;
  case '\n':  // String literals cannot have \n or \r in them.
  case '\r':
    diagnose(CurPtr-1, diag::lex_unterminated_string);
    return ~1U;
  case '\\':  // Escapes.
    break;
  }

  unsigned CharValue = 0;
  // Escape processing.  We already ate the "\".
  switch (*CurPtr) {
  default:  // Invalid escape.
    diagnose(CurPtr, diag::lex_invalid_escape);
    // If this looks like a plausible escape character, recover as though this
    // is an invalid escape.
    if (isalnum(*CurPtr)) ++CurPtr;
    return ~1U;

  // Simple single-character escapes.
  case '0': ++CurPtr; return '\0';
  case 'n': ++CurPtr; return '\n';
  case 'r': ++CurPtr; return '\r';
  case 't': ++CurPtr; return '\t';
  case '"': ++CurPtr; return '"';
  case '\'': ++CurPtr; return '\'';
  case '\\': ++CurPtr; return '\\';

  // Unicode escapes of various lengths.
  case 'x': {  // \x HEX HEX
    unsigned ValidChars = !!isxdigit(CurPtr[1]) + !!isxdigit(CurPtr[2]);
    if (ValidChars != 2) {
      if (EmitDiagnostics)
        diagnose(CurPtr, diag::lex_invalid_x_escape);
      CurPtr += 1 + ValidChars;
      return ~1U;
    }
    StringRef(CurPtr+1, ValidChars).getAsInteger(16, CharValue);
    CurPtr += 1 + ValidChars;

    // Reject \x80 and above, since it is going to encode into a multibyte
    // unicode encoding, which is something that C folks may not expect.
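    // (In other words, "\xC3\xA9" is not a way to spell U+00E9 here; a \u or
    // \U escape such as "\u00E9" covers code points above 0x7F.)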
    if (CharValue >= 0x80) {
      if (EmitDiagnostics)
        diagnose(CurPtr, diag::lex_invalid_hex_escape);
      return ~1U;
    }
    break;
  }
  case 'u': {  // \u HEX HEX HEX HEX
    unsigned ValidChars = !!isxdigit(CurPtr[1]) + !!isxdigit(CurPtr[2]) +
                          !!isxdigit(CurPtr[3]) + !!isxdigit(CurPtr[4]);
    StringRef(CurPtr+1, ValidChars).getAsInteger(16, CharValue);
    if (ValidChars != 4) {
      if (EmitDiagnostics)
        diagnose(CurPtr, diag::lex_invalid_u_escape);
      CurPtr += 1 + ValidChars;
      return ~1U;
    }
    CurPtr += 1 + ValidChars;
    break;
  }
  case 'U': {  // \U HEX HEX HEX HEX HEX HEX HEX HEX
    unsigned ValidChars = !!isxdigit(CurPtr[1]) + !!isxdigit(CurPtr[2]) +
                          !!isxdigit(CurPtr[3]) + !!isxdigit(CurPtr[4]) +
                          !!isxdigit(CurPtr[5]) + !!isxdigit(CurPtr[6]) +
                          !!isxdigit(CurPtr[7]) + !!isxdigit(CurPtr[8]);
    StringRef(CurPtr+1, ValidChars).getAsInteger(16, CharValue);
    if (ValidChars != 8) {
      if (EmitDiagnostics)
        diagnose(CurPtr, diag::lex_invalid_U_escape);
      CurPtr += 1 + ValidChars;
      return ~1U;
    }
    CurPtr += 1 + ValidChars;
    break;
  }
  }

  // Check to see if the encoding is valid.
  llvm::SmallString<64> TempString;
  if (CharValue >= 0x80 && EncodeToUTF8(CharValue, TempString)) {
    if (EmitDiagnostics)
      diagnose(CharStart, diag::lex_invalid_unicode_code_point);
    return ~1U;
  }

  return CharValue;
}

/// lexCharacterLiteral:
///   character_literal ::= '([^'\\\n\r]|character_escape)'
void Lexer::lexCharacterLiteral() {
  const char *TokStart = CurPtr-1;
  assert(*TokStart == '\'' && "Unexpected start");

  unsigned CharValue = lexCharacter(CurPtr, false, true);

  // If we have '', we have an invalid character literal.
  if (CharValue == ~0U) {
    diagnose(TokStart, diag::lex_invalid_character_literal);
    return formToken(tok::unknown, TokStart);
  }

  // If this wasn't a normal character, then this is a malformed character.
  if (CharValue == ~1U) {
    // If we successfully skipped to the ending ', eat it now to avoid
    // confusing error recovery.
    if (*CurPtr == '\'')
      ++CurPtr;
    return formToken(tok::unknown, TokStart);
  }

  if (*CurPtr != '\'') {
    diagnose(TokStart, diag::lex_invalid_character_literal);
    return formToken(tok::unknown, TokStart);
  }
  ++CurPtr;
  return formToken(tok::character_literal, TokStart);
}

/// getEncodedCharacterLiteral - Return the UTF32 codepoint for the specified
/// character literal.
uint32_t Lexer::getEncodedCharacterLiteral(const Token &Str) {
  const char *CharStart = Str.getText().data()+1;
  return lexCharacter(CharStart, false, false);
}

/// skipToEndOfInterpolatedExpression - Given the first character after a \(
/// sequence in a string literal (the start of an interpolated expression),
/// scan forward to the end of the interpolated expression and return the end.
/// On success, the returned pointer will point to a ')'.  On failure, it will
/// point to something else.  This basically just does brace matching.
static const char *skipToEndOfInterpolatedExpression(const char *CurPtr,
                                                     const char *EndPtr,
                                                     DiagnosticEngine *Diags) {
  SourceLoc InterpStart = Lexer::getSourceLoc(CurPtr-1);
  unsigned ParenCount = 1;
  while (true) {
    // This is a very simple scanner.  The implications of this include not
    // being able to use string literals in an interpolated string, and not
    // being able to break an expression over multiple lines in an
    // interpolated string.  Both of these limitations make this simple and
    // allow us to recover from common errors though.
    //
    // On success scanning the expression body, the real lexer will be used to
    // relex the body when parsing the expressions.  We let it diagnose any
    // issues with malformed tokens or other problems.
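    // For example, given "\((a+b)-(c*d)) bar", scanning starts just past the
    // "\(" with ParenCount == 1 and returns a pointer to the ')' that brings
    // the count back to zero.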
    switch (*CurPtr++) {
    // String literals in general cannot be split across multiple lines;
    // interpolated ones are no exception.
    case '\n':
    case '\r':
      // Will be diagnosed as an unterminated string literal.
      return CurPtr-1;

    // String literals cannot be used in interpolated string literals.
    case '"':
      if (Diags)
        Diags->diagnose(Lexer::getSourceLoc(CurPtr - 1),
                        diag::lex_unexpected_quote_string_interpolation)
          .highlight(Diagnostic::Range(InterpStart,
                                       Lexer::getSourceLoc(CurPtr-1)));
      return CurPtr-1;

    case 0:
      // If we hit EOF, we fail.
      if (CurPtr-1 == EndPtr) {
        if (Diags)
          Diags->diagnose(Lexer::getSourceLoc(CurPtr-1),
                          diag::lex_unterminated_string);
        return CurPtr-1;
      }
      continue;

    // Paren nesting deeper to support "foo = \((a+b)-(c*d)) bar".
    case '(':
      ++ParenCount;
      continue;
    case ')':
      // If this is the last level of nesting, then we're done!
      if (--ParenCount == 0)
        return CurPtr-1;
      continue;

    default:
      // Normal token character.
      continue;
    }
  }
}

/// lexStringLiteral:
///   string_literal ::= ["]([^"\\\n\r]|character_escape)*["]
void Lexer::lexStringLiteral() {
  const char *TokStart = CurPtr-1;
  assert(*TokStart == '"' && "Unexpected start");

  bool wasErroneous = false;
  while (1) {
    if (*CurPtr == '\\' && *(CurPtr + 1) == '(') {
      // Consume tokens until we hit the corresponding ')'.
      CurPtr += 2;
      const char *EndPtr =
          skipToEndOfInterpolatedExpression(CurPtr, BufferEnd, Diags);

      if (*EndPtr == ')') {
        // Successfully scanned the body of the expression literal.
        CurPtr = EndPtr+1;
      } else {
        wasErroneous = true;
      }
      continue;
    }

    // String literals cannot have \n or \r in them.
    if (*CurPtr == '\r' || *CurPtr == '\n') {
      diagnose(TokStart, diag::lex_unterminated_string);
      return formToken(tok::unknown, TokStart);
    }

    unsigned CharValue = lexCharacter(CurPtr, true, true);
    wasErroneous |= CharValue == ~1U;

    // If this is the end of string, we are done.  If it is a normal character
    // or an already-diagnosed error, just munch it.
    if (CharValue == ~0U) {
      ++CurPtr;
      if (wasErroneous)
        return formToken(tok::unknown, TokStart);
      return formToken(tok::string_literal, TokStart);
    }
  }
}

StringRef Lexer::getEncodedStringSegment(StringRef Bytes,
                                         llvm::SmallVectorImpl<char> &TempString) {
  TempString.clear();
  // Note that it is always safe to read one over the end of "Bytes" because
  // we know that there is a terminating " character.  Use BytesPtr to avoid a
  // range check subscripting on the StringRef.
  const char *BytesPtr = Bytes.begin();
  while (BytesPtr != Bytes.end()) {
    char CurChar = *BytesPtr++;
    if (CurChar != '\\') {
      TempString.push_back(CurChar);
      continue;
    }

    // Invalid escapes are accepted by the lexer but diagnosed as an error.  We
    // just ignore them here.
    unsigned CharValue = 0;  // Unicode character value for \x, \u, \U.
    switch (*BytesPtr++) {
    default:
      continue;  // Invalid escape, ignore it.

    // Simple single-character escapes.
    case '0': TempString.push_back('\0'); continue;
    case 'n': TempString.push_back('\n'); continue;
    case 'r': TempString.push_back('\r'); continue;
    case 't': TempString.push_back('\t'); continue;
    case '"': TempString.push_back('"'); continue;
    case '\'': TempString.push_back('\''); continue;
    case '\\': TempString.push_back('\\'); continue;

    // String interpolation.
    case '(':
      llvm_unreachable("string contained interpolated segments");

    // Unicode escapes of various lengths.
    case 'x':  // \x HEX HEX
      if (!isxdigit(BytesPtr[0]) || !isxdigit(BytesPtr[1]))
        continue;  // Ignore invalid escapes.
      StringRef(BytesPtr, 2).getAsInteger(16, CharValue);
      BytesPtr += 2;
      break;
    case 'u':  // \u HEX HEX HEX HEX
      if (!isxdigit(BytesPtr[0]) || !isxdigit(BytesPtr[1]) ||
          !isxdigit(BytesPtr[2]) || !isxdigit(BytesPtr[3]))
        continue;  // Ignore invalid escapes.
      StringRef(BytesPtr, 4).getAsInteger(16, CharValue);
      BytesPtr += 4;
      break;
    case 'U':  // \U HEX HEX HEX HEX HEX HEX HEX HEX
      if (!isxdigit(BytesPtr[0]) || !isxdigit(BytesPtr[1]) ||
          !isxdigit(BytesPtr[2]) || !isxdigit(BytesPtr[3]) ||
          !isxdigit(BytesPtr[4]) || !isxdigit(BytesPtr[5]) ||
          !isxdigit(BytesPtr[6]) || !isxdigit(BytesPtr[7]))
        continue;  // Ignore invalid escapes.
      StringRef(BytesPtr, 8).getAsInteger(16, CharValue);
      BytesPtr += 8;
      break;
    }

    if (CharValue < 0x80)
      TempString.push_back(CharValue);
    else
      EncodeToUTF8(CharValue, TempString);
  }

  // If we didn't escape or reprocess anything, then we don't need to use the
  // temporary string; just point to the original one.  We know that this
  // is safe because unescaped strings are always shorter than their escaped
  // forms (in a valid string).
  if (TempString.size() == Bytes.size()) {
    TempString.clear();
    return Bytes;
  }

  return StringRef(TempString.begin(), TempString.size());
}

void Lexer::getStringLiteralSegments(const Token &Str,
                                     llvm::SmallVectorImpl<StringSegment> &Segments,
                                     DiagnosticEngine *Diags) {
  assert(Str.is(tok::string_literal));
  // Get the bytes behind the string literal, dropping the double quotes.
  StringRef Bytes = Str.getText().drop_front().drop_back();

  // Note that it is always safe to read one over the end of "Bytes" because
  // we know that there is a terminating " character.  Use BytesPtr to avoid a
  // range check subscripting on the StringRef.
  const char *SegmentStartPtr = Bytes.begin();
  const char *BytesPtr = SegmentStartPtr;
  // FIXME: Use SSE to scan for '\'.
  while (BytesPtr != Bytes.end()) {
    char CurChar = *BytesPtr++;
    if (CurChar != '\\')
      continue;

    if (*BytesPtr++ != '(')
      continue;

    // String interpolation.

    // Push the current segment.
    if (BytesPtr-SegmentStartPtr > 2) {
      Segments.push_back(
          StringSegment::getLiteral(getSourceLoc(SegmentStartPtr),
                                    BytesPtr-SegmentStartPtr-2));
    }

    // Find the closing ')'.
    const char *End = skipToEndOfInterpolatedExpression(BytesPtr,
                                                        Str.getText().end(),
                                                        Diags);
    assert(*End == ')' && "invalid string literal interpolations should"
           " not be returned as string literals");
    ++End;

    // Add an expression segment.
    Segments.push_back(
        StringSegment::getExpr(getSourceLoc(BytesPtr-1), End-BytesPtr+1));

    // Reset the beginning of the segment to the string that remains to be
    // consumed.
    SegmentStartPtr = BytesPtr = End;
  }

  if (Segments.empty() || SegmentStartPtr < Bytes.end()) {
    Segments.push_back(
        StringSegment::getLiteral(getSourceLoc(SegmentStartPtr),
                                  Bytes.end()-SegmentStartPtr));
  }
}

//===----------------------------------------------------------------------===//
// Main Lexer Loop
//===----------------------------------------------------------------------===//

void Lexer::lexImpl() {
  assert(CurPtr >= BufferStart &&
         CurPtr <= BufferEnd && "Cur Char Pointer out of range!");

  NextToken.setAtStartOfLine(CurPtr == BufferStart);

Restart:
  // Remember the start of the token so we can form the text range.
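  // Whitespace, embedded nul bytes, and (when KeepComments is false) comments
  // jump back to Restart so that only real tokens reach formToken.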
  const char *TokStart = CurPtr;

  switch (*CurPtr++) {
  default: {
    char const *tmp = CurPtr-1;
    if (advanceIfValidStartOfIdentifier(tmp, BufferEnd))
      return lexIdentifier();

    if (advanceIfValidStartOfOperator(tmp, BufferEnd))
      return lexOperatorIdentifier();

    if (advanceIfValidContinuationOfIdentifier(tmp, BufferEnd)) {
      // If this is a valid identifier continuation, but not a valid identifier
      // start, attempt to recover by eating more continuation characters.
      diagnose(CurPtr-1, diag::lex_invalid_identifier_start_character);
      while (advanceIfValidContinuationOfIdentifier(tmp, BufferEnd));
    } else {
      // This character isn't allowed in Swift source.
      validateUTF8CharacterAndAdvance(tmp, BufferEnd);
      diagnose(CurPtr-1, diag::lex_invalid_character);
    }

    CurPtr = tmp;
    return formToken(tok::unknown, TokStart);
  }

  case '\n':
  case '\r':
    NextToken.setAtStartOfLine(true);
    goto Restart;  // Skip whitespace.

  case ' ':
  case '\t':
    goto Restart;  // Skip whitespace.

  case 0:
    if (CurPtr-1 == CodeCompletionPtr)
      return formToken(tok::code_complete, TokStart);

    // If this is a random nul character in the middle of a buffer, skip it as
    // whitespace.
    if (CurPtr-1 != BufferEnd) {
      diagnoseEmbeddedNul(Diags, CurPtr-1);
      goto Restart;
    }

    // Otherwise, this is the real end of the buffer.
    if (BufferStart != BufferEnd &&
        CurPtr[-2] != '\n' && CurPtr[-2] != '\r') {
      // While we are not C, we should not ignore the strong Unix command-line
      // tool conventions that motivate this warning.
      diagnose(CurPtr-1, diag::lex_missing_newline_eof)
        .fixItInsert(getSourceLoc(CurPtr-1), "\n");
    }

    // Put CurPtr back into buffer bounds.
    CurPtr--;
    // Return EOF.
    return formToken(tok::eof, TokStart);

  case '{': return formToken(tok::l_brace, TokStart);
  case '[': return formToken(tok::l_square, TokStart);
  case '(': return formToken(tok::l_paren, TokStart);
  case '}': return formToken(tok::r_brace, TokStart);
  case ']': return formToken(tok::r_square, TokStart);
  case ')': return formToken(tok::r_paren, TokStart);

  case ',': return formToken(tok::comma, TokStart);
  case ';': return formToken(tok::semi, TokStart);
  case ':': return formToken(tok::colon, TokStart);
  case '?': return formToken(tok::question, TokStart);

  case '@':
    // @ is only a token in SIL mode.
    if (InSILMode)
      return formToken(tok::sil_at_sign, TokStart);
    diagnose(CurPtr-1, diag::lex_invalid_character);
    goto Restart;

  case '#':
    // # is only a token in SIL mode.
    if (InSILMode)
      return formToken(tok::sil_pound, TokStart);

    // Allow a hashbang #! line at the beginning of the file.
    if (CurPtr - 1 == BufferStart && *CurPtr == '!') {
      CurPtr--;
      if (BufferID != SourceMgr.getHashbangBufferID())
        diagnose(CurPtr, diag::lex_hashbang_not_allowed);
      skipHashbang();
      goto Restart;
    }

    diagnose(CurPtr-1, diag::lex_invalid_character);
    goto Restart;

  // Operator characters.
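  // '/' is handled first because "//" and "/*" introduce comments rather than
  // operator tokens.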
  case '/':
    if (CurPtr[0] == '/') {  // "//"
      skipSlashSlashComment();
      if (KeepComments)
        return formToken(tok::comment, TokStart);
      goto Restart;
    }
    if (CurPtr[0] == '*') {  // "/*"
      skipSlashStarComment();
      if (KeepComments)
        return formToken(tok::comment, TokStart);
      goto Restart;
    }
    return lexOperatorIdentifier();

  case '%':
    // Lex %[0-9a-zA-Z]+ as a local SIL value.
    if (InSILBody && isalnum(CurPtr[0])) {
      do {
        ++CurPtr;
      } while (isalnum(CurPtr[0]));
      return formToken(tok::sil_local_name, TokStart);
    }
    return lexOperatorIdentifier();

  case '!':
    if (InSILBody)
      return formToken(tok::sil_exclamation, TokStart);
    return lexOperatorIdentifier();

  case '=': case '-': case '+': case '*':
  case '<': case '>': case '&': case '|': case '^': case '~':
    return lexOperatorIdentifier();
  case '.':
    return lexOperatorIdentifier();

  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
  case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
  case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
  case 'V': case 'W': case 'X': case 'Y': case 'Z':
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
  case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
  case 'v': case 'w': case 'x': case 'y': case 'z':
  case '_':
    return lexIdentifier();

  case '$':
    return lexDollarIdent();

  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    return lexNumber();

  case '\'':
    return lexCharacterLiteral();
  case '"':
    return lexStringLiteral();
  }
}

SourceLoc Lexer::getLocForEndOfToken(SourceManager &SM, SourceLoc Loc) {
  // Don't try to do anything with an invalid location.
  if (!Loc.isValid())
    return Loc;

  // Figure out which buffer contains this location.
  int BufferID = SM->FindBufferContainingLoc(Loc.Value);
  if (BufferID < 0)
    return SourceLoc();

  const llvm::MemoryBuffer *Buffer = SM->getMemoryBuffer(BufferID);
  if (!Buffer)
    return SourceLoc();

  // KeepComments is true because either the caller skipped comments and
  // normally we won't be at the beginning of a comment token (making
  // KeepComments irrelevant), or the caller lexed comments and KeepComments
  // must be true.
  Lexer L(SM, BufferID, nullptr, /*InSILMode=*/false, /*KeepComments=*/true);
  L.restoreState(State(Loc));
  unsigned Length = L.peekNextToken().getLength();
  return Loc.getAdvancedLoc(Length);
}