//===--- Lexer.cpp - Swift Language Lexer ---------------------------------===// // // This source file is part of the Swift.org open source project // // Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors // Licensed under Apache License v2.0 with Runtime Library Exception // // See http://swift.org/LICENSE.txt for license information // See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors // //===----------------------------------------------------------------------===// // // This file implements the Lexer and Token interfaces. // //===----------------------------------------------------------------------===// #include "swift/Parse/Lexer.h" #include "swift/AST/ASTContext.h" #include "swift/AST/Diagnostics.h" #include "swift/AST/Identifier.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SourceMgr.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" using namespace swift; //===----------------------------------------------------------------------===// // UTF8 Validation/Encoding/Decoding helper functions //===----------------------------------------------------------------------===// /// EncodeToUTF8 - Encode the specified code point into a UTF8 stream. Return /// true if it is an erroneous code point. static bool EncodeToUTF8(unsigned CharValue, llvm::SmallVectorImpl &Result) { assert(CharValue >= 0x80 && "Single-byte encoding should be already handled"); // Number of bits in the value, ignoring leading zeros. unsigned NumBits = 32-llvm::CountLeadingZeros_32(CharValue); // Handle the leading byte, based on the number of bits in the value. unsigned NumTrailingBytes; if (NumBits <= 5+6) { // Encoding is 0x110aaaaa 10bbbbbb Result.push_back(char(0xC0 | (CharValue >> 6))); NumTrailingBytes = 1; } else if (NumBits <= 4+6+6) { // Encoding is 0x1110aaaa 10bbbbbb 10cccccc Result.push_back(char(0xE0 | (CharValue >> (6+6)))); NumTrailingBytes = 2; // UTF-16 surrogate pair values are not valid code points. if (CharValue >= 0xD800 && CharValue <= 0xDFFF) return true; } else if (NumBits <= 3+6+6+6) { // Encoding is 0x11110aaa 10bbbbbb 10cccccc 10dddddd Result.push_back(char(0xF0 | (CharValue >> (6+6+6)))); NumTrailingBytes = 3; // Reject over-large code points. These cannot be encoded as UTF-16 // surrogate pairs, so UTF-32 doesn't allow them. if (CharValue > 0x10FFFF) return true; } else { return true; // UTF8 can encode these, but they aren't valid code points. } // Emit all of the trailing bytes. while (NumTrailingBytes--) Result.push_back(char(0x80 | (0x3F & (CharValue >> (NumTrailingBytes*6))))); return false; } /// CLO8 - Return the number of leading ones in the specified 8-bit value. static unsigned CLO8(unsigned char C) { return llvm::CountLeadingOnes_32(uint32_t(C) << 24); } /// isStartOfUTF8Character - Return true if this isn't a UTF8 continuation /// character, which will be of the form 0b10XXXXXX static bool isStartOfUTF8Character(unsigned char C) { return (signed char)C >= 0 || C >= 0xC0; // C0 = 0b11000000 } /// validateUTF8CharacterAndAdvance - Given a pointer to the starting byte of a /// UTF8 character, validate it and advance the lexer past it. This returns the /// encoded character or ~0U if the encoding is invalid. static uint32_t validateUTF8CharacterAndAdvance(const char *&Ptr) { assert((signed char)(*Ptr) < 0 && "Not the start of an encoded letter"); unsigned char CurByte = *Ptr++; // Read the number of high bits set, which indicates the number of bytes in // the character. unsigned EncodedBytes = CLO8(CurByte); // If this is 0b10XXXXXX, then it is a continuation character. if (EncodedBytes == 1 || // If the number of encoded bytes is > 4, then this is an invalid // character in the range of 0xF5 and above. These would start an // encoding for something that couldn't be represented with UTF16 // digraphs, so Unicode rejects them. EncodedBytes > 4) { // Skip until we get the start of another character. This is guaranteed to // at least stop at the nul at the end of the buffer. while (!isStartOfUTF8Character(*Ptr)) ++Ptr; return ~0U; } // Drop the high bits indicating the # bytes of the result. unsigned CharValue = (unsigned char)(CurByte << EncodedBytes) >> EncodedBytes; // Read and validate the continuation bytes. for (unsigned i = 1; i != EncodedBytes; ++i) { CurByte = *Ptr; // If the high bit isn't set or the second bit isn't clear, then this is not // a continuation byte! if (CurByte < 0x80 || CurByte >= 0xC0) return ~0U; // Accumulate our result. CharValue <<= 6; CharValue |= CurByte & 0x3F; ++Ptr; } // UTF-16 surrogate pair values are not valid code points. if (CharValue >= 0xD800 && CharValue <= 0xDFFF) return ~0U; // If we got here, we read the appropriate number of accumulated bytes. // Verify that the encoding was actually minimal. // Number of bits in the value, ignoring leading zeros. unsigned NumBits = 32-llvm::CountLeadingZeros_32(CharValue); if (NumBits <= 5+6) return EncodedBytes == 2 ? CharValue : ~0U; if (NumBits <= 4+6+6) return EncodedBytes == 3 ? CharValue : ~0U; return EncodedBytes == 4 ? CharValue : ~0U; } //===----------------------------------------------------------------------===// // Setup and Helper Methods //===----------------------------------------------------------------------===// Lexer::Lexer(StringRef Buffer, llvm::SourceMgr &SourceMgr, DiagnosticEngine *Diags, const char *CurrentPosition) : SourceMgr(SourceMgr), Diags(Diags) { BufferStart = Buffer.begin(); BufferEnd = Buffer.end(); CurPtr = CurrentPosition; assert(CurPtr >= BufferStart && CurPtr <= BufferEnd && "Current position is out-of-range"); // Prime the lexer. lexImpl(); } InFlightDiagnostic Lexer::diagnose(const char *Loc, Diag<> ID) { if (Diags) Diags->diagnose(getSourceLoc(Loc), ID); return InFlightDiagnostic(); } void Lexer::formToken(tok Kind, const char *TokStart) { NextToken.setToken(Kind, StringRef(TokStart, CurPtr-TokStart)); } bool Lexer::isPrecededBySpace() { // For these purposes, the start of the file is considered to be // preceeded by infinite whitespace. if (CurPtr - 1 == BufferStart) return true; // Otherwise, our list of whitespace characters is pretty short. char LastChar = *(CurPtr - 2); return (isspace(LastChar) || LastChar == '\0'); } //===----------------------------------------------------------------------===// // Lexer Subroutines //===----------------------------------------------------------------------===// /// skipSlashSlashComment - Skip to the end of the line of a // comment. void Lexer::skipSlashSlashComment() { assert(CurPtr[-1] == '/' && CurPtr[0] == '/' && "Not a // comment"); while (1) { switch (*CurPtr++) { case '\n': case '\r': return; // If we found the end of the line, return. default: // If this is a "high" UTF-8 character, validate it. if ((signed char)(CurPtr[-1]) < 0) { --CurPtr; const char *CharStart = CurPtr; if (validateUTF8CharacterAndAdvance(CurPtr) == ~0U) diagnose(CharStart, diag::lex_invalid_utf8_character); } break; // Otherwise, eat other characters. case 0: // If this is a random nul character in the middle of a buffer, skip it as // whitespace. if (CurPtr-1 != BufferEnd) { diagnose(CurPtr-1, diag::lex_nul_character); break; } // Otherwise, we have a // comment at end of file. --CurPtr; return; } } } /// skipSlashStarComment - /**/ comments are skipped (treated as whitespace). /// Note that (unlike in C) block comments can be nested. void Lexer::skipSlashStarComment() { const char *StartPtr = CurPtr-1; assert(CurPtr[-1] == '/' && CurPtr[0] == '*' && "Not a /* comment"); // Make sure to advance over the * so that we don't incorrectly handle /*/ as // the beginning and end of the comment. ++CurPtr; // /**/ comments can be nested, keep track of how deep we've gone. unsigned Depth = 1; while (1) { switch (*CurPtr++) { case '*': // Check for a '*/' if (*CurPtr == '/') { ++CurPtr; if (--Depth == 0) return; } break; case '/': // Check for a '/*' if (*CurPtr == '*') { ++CurPtr; ++Depth; } break; default: // If this is a "high" UTF-8 character, validate it. if ((signed char)(CurPtr[-1]) < 0) { --CurPtr; const char *CharStart = CurPtr; if (validateUTF8CharacterAndAdvance(CurPtr) == ~0U) diagnose(CharStart, diag::lex_invalid_utf8_character); } break; // Otherwise, eat other characters. case 0: // If this is a random nul character in the middle of a buffer, skip it as // whitespace. if (CurPtr-1 != BufferEnd) { diagnose(CurPtr-1, diag::lex_nul_character); break; } // Otherwise, we have an unterminated /* comment. --CurPtr; diagnose(CurPtr-(CurPtr[-1] == '\n'), diag::lex_unterminated_block_comment); diagnose(StartPtr, diag::lex_comment_start); return; } } } static bool isValidStartOfIdentifier(char c) { return isalpha(c) || c == '_'; } static bool isValidContinuationOfIdentifier(char c) { return isalnum(c) || c == '_' || c == '$'; } /// isIdentifier - Checks whether a string matches the identifier regex. bool Lexer::isIdentifier(llvm::StringRef string) { if (string.empty()) return false; if (!isValidStartOfIdentifier(string[0])) return false; for (unsigned i = 1, e = string.size(); i != e; ++i) if (!isValidContinuationOfIdentifier(string[i])) return false; return true; } /// lexIdentifier - Match [a-zA-Z_][a-zA-Z_$0-9]* /// /// FIXME: We should also allow unicode characters in identifiers. void Lexer::lexIdentifier() { const char *TokStart = CurPtr-1; assert(isValidStartOfIdentifier(*TokStart) && "Unexpected start"); // Lex [a-zA-Z_$0-9]* while (isValidContinuationOfIdentifier(*CurPtr)) ++CurPtr; tok Kind = llvm::StringSwitch(StringRef(TokStart, CurPtr-TokStart)) // decl and type keywords .Case("extension", tok::kw_extension) .Case("import", tok::kw_import) .Case("func", tok::kw_func) .Case("oneof", tok::kw_oneof) .Case("protocol", tok::kw_protocol) .Case("struct", tok::kw_struct) .Case("typealias", tok::kw_typealias) .Case("var", tok::kw_var) .Case("static", tok::kw_static) .Case("subscript", tok::kw_subscript) // Statements .Case("if", tok::kw_if) .Case("else", tok::kw_else) .Case("for", tok::kw_for) .Case("while", tok::kw_while) .Case("return", tok::kw_return) // Expressions .Case("new", tok::kw_new) .Default(tok::identifier); return formToken(Kind, TokStart); } /// lexOperatorIdentifier - Match identifiers formed out of punctuation. void Lexer::lexOperatorIdentifier() { const char *TokStart = CurPtr-1; while (Identifier::isOperatorChar(*CurPtr) && *CurPtr != '.') ++CurPtr; // Match various reserved words. if (CurPtr-TokStart == 1) { switch (TokStart[0]) { case '=': return formToken(tok::equal, TokStart); } } else if (CurPtr-TokStart == 2) { switch ((TokStart[0] << 8) | TokStart[1]) { case ('-' << 8) | '>': // -> return formToken(tok::arrow, TokStart); } } return formToken(tok::oper, TokStart); } /// lexDollarIdent - Match $[0-9a-zA-Z_$]* void Lexer::lexDollarIdent() { const char *TokStart = CurPtr-1; assert(*TokStart == '$'); // Lex [a-zA-Z_$0-9]* while (isalnum(*CurPtr) || *CurPtr == '_' || *CurPtr == '$') ++CurPtr; return formToken(tok::dollarident, TokStart); } // Return true if the string starts with "[eE][+-][0-9]" static bool isValidExponent(const char *P) { if (*P != 'e' && *P != 'E') return false; ++P; if (*P != '+' && *P != '-') return false; ++P; return isdigit(*P); } /// lexNumber: /// integer_literal ::= [0-9]+ /// integer_literal ::= 0x[0-9a-fA-F]+ /// integer_literal ::= 0o[0-7]+ /// integer_literal ::= 0b[01]+ /// floating_literal ::= [0-9]+\.[0-9]+ /// floating_literal ::= [0-9]+(\.[0-9]*)?[eE][+-][0-9]+ /// floating_literal ::= \.[0-9]+([eE][+-][0-9]+)? void Lexer::lexNumber() { const char *TokStart = CurPtr-1; assert((isdigit(*TokStart) || *TokStart == '.') && "Unexpected start"); if (*TokStart == '0' && *CurPtr == 'x') { // 0x[0-9a-fA-F]+ ++CurPtr; while (isdigit(*CurPtr) || (*CurPtr >= 'a' && *CurPtr <= 'f') || (*CurPtr >= 'A' && *CurPtr <= 'F')) ++CurPtr; if (CurPtr - TokStart == 2) { diagnose(CurPtr, diag::lex_expected_digit_in_int_literal); return formToken(tok::unknown, TokStart); } return formToken(tok::integer_literal, TokStart); } else if (*TokStart == '0' && *CurPtr == 'o') { // 0o[0-7]+ ++CurPtr; while (*CurPtr >= '0' && *CurPtr <= '7') ++CurPtr; if (CurPtr - TokStart == 2) { diagnose(CurPtr, diag::lex_expected_digit_in_int_literal); return formToken(tok::unknown, TokStart); } return formToken(tok::integer_literal, TokStart); } else if (*TokStart == '0' && *CurPtr == 'b') { // 0b[01]+ ++CurPtr; while (*CurPtr == '0' || *CurPtr == '1') ++CurPtr; if (CurPtr - TokStart == 2) { diagnose(CurPtr, diag::lex_expected_digit_in_int_literal); return formToken(tok::unknown, TokStart); } return formToken(tok::integer_literal, TokStart); } // Handle the leading character here as well. --CurPtr; // Handle a leading [0-9]+, lexing an integer or falling through if we have a // floating point value. if (isdigit(*CurPtr)) { while (isdigit(*CurPtr)) ++CurPtr; // Floating literals must have '.', 'e', or 'E' after digits. If it is // something else, then this is the end of the token. if (*CurPtr != '.' && *CurPtr != 'e' && *CurPtr != 'E') return formToken(tok::integer_literal, TokStart); // Lex things like 4.x as '4' followed by a tok::period. if (*CurPtr == '.' && !isdigit(CurPtr[1]) && !isValidExponent(CurPtr+1)) return formToken(tok::integer_literal, TokStart); } // Lex decimal point. if (*CurPtr == '.') { ++CurPtr; // Lex any digits after the decimal point. while (isdigit(*CurPtr)) ++CurPtr; } // Lex exponent. if (*CurPtr == 'e' || *CurPtr == 'E') { ++CurPtr; // Eat the 'e' if (*CurPtr != '+' && *CurPtr != '-') { diagnose(CurPtr, diag::lex_expected_sign_in_fp); return formToken(tok::unknown, TokStart); } ++CurPtr; // Eat the sign. if (!isdigit(*CurPtr)) { diagnose(CurPtr, diag::lex_expected_digit_in_fp_exponent); return formToken(tok::unknown, TokStart); } while (isdigit(*CurPtr)) ++CurPtr; } return formToken(tok::floating_literal, TokStart); } /// lexStringLiteral: /// string_literal ::= ["]([^"\\\n\r]|string_escape)*["] /// /// string_escape ::= [\][\] | [\]t | [\]n | [\]r | [\]" | [\]' /// string_escape ::= [\]x hex hex /// string_escape ::= [\]u hex hex hex hex /// string_escape ::= [\]U hex hex hex hex hex hex hex hex /// hex ::= [0-9a-fA-F] void Lexer::lexStringLiteral() { const char *TokStart = CurPtr-1; assert(*TokStart == '"' && "Unexpected start"); llvm::SmallString<64> TempString; while (1) { unsigned CharValue = 0; const char *CharStart = CurPtr; switch (*CurPtr++) { default: // Normal characters are part of the string. // If this is a "high" UTF-8 character, validate it. if ((signed char)(CurPtr[-1]) < 0) { --CurPtr; if (validateUTF8CharacterAndAdvance(CurPtr) == ~0U) diagnose(CharStart, diag::lex_invalid_utf8_character); } continue; // If we found the closing " character, we're done. case '"': return formToken(tok::string_literal, TokStart); case 0: if (CurPtr-2 != BufferEnd) { diagnose(CurPtr-2, diag::lex_nul_character); continue; } --CurPtr; // FALL THROUGH. case '\n': // String literals cannot have \n or \r in them. case '\r': diagnose(TokStart, diag::lex_unterminated_string); return; case '\\': // Escapes. switch (*CurPtr) { default: // Invalid escape. diagnose(CurPtr, diag::lex_invalid_string_escape); continue; // Simple single-character escapes. case 't': case 'n': case 'r': case '"': case '\'': case '\\': ++CurPtr; continue; // Unicode escapes of various lengths. case 'x': // \x HEX HEX if (!isxdigit(CurPtr[1]) || !isxdigit(CurPtr[2])) { diagnose(CurPtr, diag::lex_invalid_string_x_escape); continue; } StringRef(CurPtr+1, 2).getAsInteger(16, CharValue); // Reject \x80 and above, since it is going to encode into a multibyte // unicode encoding, which is something that C folks may not expect. if (CharValue >= 0x80) diagnose(CurPtr, diag::lex_invalid_hex_escape); CurPtr += 3; break; case 'u': // \u HEX HEX HEX HEX if (!isxdigit(CurPtr[1]) || !isxdigit(CurPtr[2]) || !isxdigit(CurPtr[3]) || !isxdigit(CurPtr[4])) { diagnose(CurPtr, diag::lex_invalid_string_u_escape); continue; } StringRef(CurPtr+1, 4).getAsInteger(16, CharValue); CurPtr += 5; break; case 'U': // \U HEX HEX HEX HEX HEX HEX HEX HEX if (!isxdigit(CurPtr[1]) || !isxdigit(CurPtr[2]) || !isxdigit(CurPtr[3]) || !isxdigit(CurPtr[4]) || !isxdigit(CurPtr[5]) || !isxdigit(CurPtr[6]) || !isxdigit(CurPtr[7]) || !isxdigit(CurPtr[8])) { diagnose(CurPtr, diag::lex_invalid_string_U_escape); continue; } StringRef(CurPtr+1, 8).getAsInteger(16, CharValue); CurPtr += 9; break; } } // Check to see if the encoding is valid. if (CharValue >= 0x80 && EncodeToUTF8(CharValue, TempString)) diagnose(CharStart, diag::lex_invalid_unicode_code_point); } } /// getEncodedStringLiteral - Given a string literal token, return the bytes /// that the actual string literal should codegen to. If a copy needs to be /// made, it will be allocated out of the ASTContext allocator. StringRef Lexer::getEncodedStringLiteral(const Token &Str, ASTContext &Ctx) { // Get the bytes behind the string literal, dropping the double quotes. StringRef Bytes = Str.getText().drop_front().drop_back(); llvm::SmallString<64> TempString; // Note that it is always safe to read one over the end of "Bytes" because // we know that there is a terminating " character. Use BytesPtr to avoid a // range check subscripting on the StringRef. const char *BytesPtr = Bytes.begin(); while (BytesPtr != Bytes.end()) { char CurChar = *BytesPtr++; if (CurChar != '\\') { TempString += CurChar; continue; } // Invalid escapes are accepted by the lexer but diagnosed as an error. We // just ignore them here. unsigned CharValue; // Unicode character value for \x, \u, \U. switch (*BytesPtr++) { default: continue; // Invalid escape, ignore it. // Simple single-character escapes. case 't': TempString += '\t'; continue; case 'n': TempString += '\n'; continue; case 'r': TempString += '\r'; continue; case '"': TempString += '"'; continue; case '\'': TempString += '\''; continue; case '\\': TempString += '\\'; continue; // Unicode escapes of various lengths. case 'x': // \x HEX HEX if (!isxdigit(BytesPtr[0]) || !isxdigit(BytesPtr[1])) continue; // Ignore invalid escapes. StringRef(BytesPtr, 2).getAsInteger(16, CharValue); BytesPtr += 2; break; case 'u': // \u HEX HEX HEX HEX if (!isxdigit(BytesPtr[0]) || !isxdigit(BytesPtr[1]) || !isxdigit(BytesPtr[2]) || !isxdigit(BytesPtr[3])) continue; // Ignore invalid escapes. StringRef(BytesPtr, 4).getAsInteger(16, CharValue); BytesPtr += 4; break; case 'U': // \U HEX HEX HEX HEX HEX HEX HEX HEX if (!isxdigit(BytesPtr[0]) || !isxdigit(BytesPtr[1]) || !isxdigit(BytesPtr[2]) || !isxdigit(BytesPtr[3]) || !isxdigit(BytesPtr[4]) || !isxdigit(BytesPtr[5]) || !isxdigit(BytesPtr[6]) || !isxdigit(BytesPtr[7])) continue; // Ignore invalid escapes. StringRef(BytesPtr, 8).getAsInteger(16, CharValue); BytesPtr += 8; break; } if (CharValue < 0x80) TempString += (char)CharValue; else EncodeToUTF8(CharValue, TempString); } // If we didn't escape or reprocess anything, then we don't need to reallocate // a copy of the string, just point to the lexer's version. We know that this // is safe because unescaped strings are always shorter than their escaped // forms (in a valid string). if (TempString.size() == Bytes.size()) return Bytes; auto Res = Ctx.AllocateCopy(TempString); return StringRef(Res.data(), Res.size()); // ArrayRef to StringRef. } //===----------------------------------------------------------------------===// // Main Lexer Loop //===----------------------------------------------------------------------===// void Lexer::lexImpl() { assert(CurPtr >= BufferStart && CurPtr <= BufferEnd && "Cur Char Pointer out of range!"); Restart: // Remember the start of the token so we can form the text range. const char *TokStart = CurPtr; switch (*CurPtr++) { default: diagnose(CurPtr-1, diag::lex_invalid_character); return formToken(tok::unknown, TokStart); case ' ': case '\t': case '\n': case '\r': goto Restart; // Skip whitespace. case 0: // If this is a random nul character in the middle of a buffer, skip it as // whitespace. if (CurPtr-1 != BufferEnd) { diagnose(CurPtr-1, diag::lex_nul_character); goto Restart; } // Otherwise, this is the end of the buffer. Return EOF. return formToken(tok::eof, TokStart); case '(': return formToken(isPrecededBySpace() ? tok::l_paren_space : tok::l_paren, TokStart); case ')': return formToken(tok::r_paren, TokStart); case '{': return formToken(tok::l_brace, TokStart); case '}': return formToken(tok::r_brace, TokStart); case '[': return formToken(isPrecededBySpace() ? tok::l_square_space : tok::l_square, TokStart); case ']': return formToken(tok::r_square, TokStart); case '.': if (isdigit(CurPtr[0])) // .42 return lexNumber(); if (CurPtr[0] == '.') { ++CurPtr; return formToken(tok::oper, TokStart); } return formToken(tok::period, TokStart); case ',': return formToken(tok::comma, TokStart); case ';': return formToken(tok::semi, TokStart); case ':': return formToken(tok::colon, TokStart); // Punctuator identifier characters. case '/': if (CurPtr[0] == '/') { // "//" skipSlashSlashComment(); goto Restart; } if (CurPtr[0] == '*') { // "/*" skipSlashStarComment(); goto Restart; } // '/' starts an operator identifier. return lexOperatorIdentifier(); case '=': case '-': case '+': case '*': case '%': case '<': case '>': case '!': case '&': case '|': case '^': case '~': return lexOperatorIdentifier(); case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': case '_': return lexIdentifier(); case '$': return lexDollarIdent(); case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': return lexNumber(); case '"': return lexStringLiteral(); } } SourceLoc Lexer::getLocForEndOfToken(llvm::SourceMgr &SM, SourceLoc Loc) { // Don't try to do anything with an invalid location. if (!Loc.isValid()) return Loc; // Figure out which buffer contains this location. int BufferID = SM.FindBufferContainingLoc(Loc.Value); if (BufferID < 0) return SourceLoc(); const llvm::MemoryBuffer *Buffer = SM.getMemoryBuffer(BufferID); if (!Buffer) return SourceLoc(); Lexer L(Buffer->getBuffer(), SM, 0, Loc.Value.getPointer()); unsigned Length = L.peekNextToken().getLength(); return Loc.getAdvancedLoc(Length); }