mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
752 lines
24 KiB
C++
752 lines
24 KiB
C++
//===--- Lexer.cpp - Swift Language Lexer ---------------------------------===//
|
|
//
|
|
// This source file is part of the Swift.org open source project
|
|
//
|
|
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
|
|
// Licensed under Apache License v2.0 with Runtime Library Exception
|
|
//
|
|
// See http://swift.org/LICENSE.txt for license information
|
|
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This file implements the Lexer and Token interfaces.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "swift/Parse/Lexer.h"
|
|
#include "swift/AST/ASTContext.h"
|
|
#include "swift/AST/Diagnostics.h"
|
|
#include "swift/AST/Identifier.h"
|
|
#include "llvm/Support/MathExtras.h"
|
|
#include "llvm/Support/MemoryBuffer.h"
|
|
#include "llvm/Support/SourceMgr.h"
|
|
#include "llvm/ADT/SmallString.h"
|
|
#include "llvm/ADT/StringSwitch.h"
|
|
#include "llvm/ADT/Twine.h"
|
|
using namespace swift;
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// Setup and Helper Methods
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
/// Construct a lexer over \p Buffer that begins scanning at
/// \p CurrentPosition, which must lie within [Buffer.begin(), Buffer.end()].
/// \p Diags may be null. The first token is lexed before the constructor
/// returns.
Lexer::Lexer(StringRef Buffer, llvm::SourceMgr &SourceMgr,
             DiagnosticEngine *Diags, const char *CurrentPosition)
  : SourceMgr(SourceMgr), Diags(Diags) {
  // Record the extent of the text being lexed and the starting cursor.
  CurPtr = CurrentPosition;
  BufferStart = Buffer.begin();
  BufferEnd = Buffer.end();
  assert(BufferStart <= CurPtr && BufferEnd >= CurPtr &&
         "Current position is out-of-range");

  // Lex the first token up front.
  lexImpl();
}
|
|
|
|
/// Report diagnostic \p ID at buffer position \p Loc if a diagnostic engine
/// is attached; otherwise do nothing. The value returned to the caller is
/// always a default-constructed InFlightDiagnostic.
InFlightDiagnostic Lexer::diagnose(const char *Loc, Diag<> ID) {
  // Lexers created without a diagnostic engine stay silent.
  if (!Diags)
    return InFlightDiagnostic();

  Diags->diagnose(getSourceLoc(Loc), ID);
  return InFlightDiagnostic();
}
|
|
|
|
/// Set NextToken to a token of kind \p Kind whose text spans from
/// \p TokStart up to (but not including) the current lexer position.
void Lexer::formToken(tok Kind, const char *TokStart) {
  StringRef TokenText(TokStart, CurPtr - TokStart);
  NextToken.setToken(Kind, TokenText);
}
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// Lexer Subroutines
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
/// skipSlashSlashComment - Skip to the end of the line of a // comment.
|
|
void Lexer::skipSlashSlashComment() {
|
|
assert(CurPtr[-1] == '/' && CurPtr[0] == '/' && "Not a // comment");
|
|
while (1) {
|
|
switch (*CurPtr++) {
|
|
case '\n':
|
|
case '\r':
|
|
return; // If we found the end of the line, return.
|
|
default:
|
|
break; // Otherwise, eat other characters.
|
|
case 0:
|
|
// If this is a random nul character in the middle of a buffer, skip it as
|
|
// whitespace.
|
|
if (CurPtr-1 != BufferEnd) {
|
|
diagnose(CurPtr-1, diag::lex_nul_character);
|
|
break;
|
|
}
|
|
|
|
// Otherwise, we have a // comment at end of file.
|
|
--CurPtr;
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
/// skipSlashStarComment - /**/ comments are skipped (treated as whitespace).
|
|
/// Note that (unlike in C) block comments can be nested.
|
|
void Lexer::skipSlashStarComment() {
|
|
const char *StartPtr = CurPtr-1;
|
|
assert(CurPtr[-1] == '/' && CurPtr[0] == '*' && "Not a /* comment");
|
|
// Make sure to advance over the * so that we don't incorrectly handle /*/ as
|
|
// the beginning and end of the comment.
|
|
++CurPtr;
|
|
|
|
// /**/ comments can be nested, keep track of how deep we've gone.
|
|
unsigned Depth = 1;
|
|
|
|
while (1) {
|
|
switch (*CurPtr++) {
|
|
case '*':
|
|
// Check for a '*/'
|
|
if (*CurPtr == '/') {
|
|
++CurPtr;
|
|
if (--Depth == 0)
|
|
return;
|
|
}
|
|
break;
|
|
case '/':
|
|
// Check for a '/*'
|
|
if (*CurPtr == '*') {
|
|
++CurPtr;
|
|
++Depth;
|
|
}
|
|
break;
|
|
default:
|
|
break; // Otherwise, eat other characters.
|
|
case 0:
|
|
// If this is a random nul character in the middle of a buffer, skip it as
|
|
// whitespace.
|
|
if (CurPtr-1 != BufferEnd) {
|
|
diagnose(CurPtr-1, diag::lex_nul_character);
|
|
break;
|
|
}
|
|
|
|
// Otherwise, we have an unterminated /* comment.
|
|
--CurPtr;
|
|
diagnose(CurPtr-(CurPtr[-1] == '\n'),
|
|
diag::lex_unterminated_block_comment);
|
|
diagnose(StartPtr, diag::lex_comment_start);
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Return true if \p c may begin an identifier, i.e. matches [a-zA-Z_].
///
/// This uses explicit ASCII range checks instead of isalpha(): passing a
/// negative char (such as a byte of a UTF-8 sequence) to the <cctype>
/// functions is undefined behavior, and their result depends on the current
/// locale, whereas the identifier grammar is fixed.
static bool isValidStartOfIdentifier(char c) {
  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';
}
|
|
/// Return true if \p c may appear after the first character of an identifier,
/// i.e. matches [a-zA-Z_$0-9].
///
/// This uses explicit ASCII range checks instead of isalnum(): passing a
/// negative char (such as a byte of a UTF-8 sequence) to the <cctype>
/// functions is undefined behavior, and their result depends on the current
/// locale, whereas the identifier grammar is fixed.
static bool isValidContinuationOfIdentifier(char c) {
  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
         (c >= '0' && c <= '9') || c == '_' || c == '$';
}
|
|
|
|
/// isIdentifier - Checks whether a string matches the identifier regex.
|
|
bool Lexer::isIdentifier(llvm::StringRef string) {
|
|
if (string.empty()) return false;
|
|
if (!isValidStartOfIdentifier(string[0])) return false;
|
|
for (unsigned i = 1, e = string.size(); i != e; ++i)
|
|
if (!isValidContinuationOfIdentifier(string[i]))
|
|
return false;
|
|
return true;
|
|
}
|
|
|
|
/// lexIdentifier - Match [a-zA-Z_][a-zA-Z_$0-9]*
|
|
void Lexer::lexIdentifier() {
|
|
const char *TokStart = CurPtr-1;
|
|
assert(isValidStartOfIdentifier(*TokStart) && "Unexpected start");
|
|
|
|
// Lex [a-zA-Z_$0-9]*
|
|
while (isValidContinuationOfIdentifier(*CurPtr))
|
|
++CurPtr;
|
|
|
|
tok Kind =
|
|
llvm::StringSwitch<tok>(StringRef(TokStart, CurPtr-TokStart))
|
|
// decl and type keywords
|
|
.Case("extension", tok::kw_extension)
|
|
.Case("import", tok::kw_import)
|
|
.Case("func", tok::kw_func)
|
|
.Case("oneof", tok::kw_oneof)
|
|
.Case("protocol", tok::kw_protocol)
|
|
.Case("struct", tok::kw_struct)
|
|
.Case("typealias", tok::kw_typealias)
|
|
.Case("var", tok::kw_var)
|
|
.Case("static", tok::kw_static)
|
|
|
|
// Statements
|
|
.Case("if", tok::kw_if)
|
|
.Case("else", tok::kw_else)
|
|
.Case("while", tok::kw_while)
|
|
.Case("return", tok::kw_return)
|
|
.Default(tok::identifier);
|
|
|
|
return formToken(Kind, TokStart);
|
|
}
|
|
|
|
/// lexOperatorIdentifier - Match identifiers formed out of punctuation.
|
|
void Lexer::lexOperatorIdentifier() {
|
|
const char *TokStart = CurPtr-1;
|
|
|
|
while (Identifier::isOperatorChar(*CurPtr))
|
|
++CurPtr;
|
|
|
|
// Match various reserved words.
|
|
if (CurPtr-TokStart == 1) {
|
|
switch (TokStart[0]) {
|
|
case '=': return formToken(tok::equal, TokStart);
|
|
}
|
|
} else if (CurPtr-TokStart == 2) {
|
|
switch ((TokStart[0] << 8) | TokStart[1]) {
|
|
case ('-' << 8) | '>': // ->
|
|
return formToken(tok::arrow, TokStart);
|
|
}
|
|
}
|
|
|
|
return formToken(tok::oper, TokStart);
|
|
}
|
|
|
|
/// lexDollarIdent - Match $[0-9a-zA-Z_$]*
|
|
void Lexer::lexDollarIdent() {
|
|
const char *TokStart = CurPtr-1;
|
|
assert(*TokStart == '$');
|
|
|
|
// Lex [a-zA-Z_$0-9]*
|
|
while (isalnum(*CurPtr) || *CurPtr == '_' || *CurPtr == '$')
|
|
++CurPtr;
|
|
|
|
return formToken(tok::dollarident, TokStart);
|
|
}
|
|
|
|
|
|
/// Return true if the nul-terminated string \p P starts with "[eE][+-][0-9]".
///
/// The checks short-circuit, so no character past a mismatch (in particular
/// past the terminating nul) is read. The digit test is an explicit range
/// check because isdigit() has undefined behavior for negative char values.
static bool isValidExponent(const char *P) {
  if (P[0] != 'e' && P[0] != 'E')
    return false;
  if (P[1] != '+' && P[1] != '-')
    return false;
  return P[2] >= '0' && P[2] <= '9';
}
|
|
|
|
/// lexNumber:
|
|
/// integer_literal ::= [0-9]+
|
|
/// integer_literal ::= 0x[0-9a-fA-F]+
|
|
/// integer_literal ::= 0o[0-7]+
|
|
/// integer_literal ::= 0b[01]+
|
|
/// floating_literal ::= [0-9]+\.[0-9]+
|
|
/// floating_literal ::= [0-9]+(\.[0-9]*)?[eE][+-][0-9]+
|
|
/// floating_literal ::= \.[0-9]+([eE][+-][0-9]+)?
|
|
void Lexer::lexNumber() {
|
|
const char *TokStart = CurPtr-1;
|
|
assert((isdigit(*TokStart) || *TokStart == '.') && "Unexpected start");
|
|
|
|
if (*TokStart == '0' && *CurPtr == 'x') {
|
|
// 0x[0-9a-fA-F]+
|
|
++CurPtr;
|
|
while (isdigit(*CurPtr) ||
|
|
(*CurPtr >= 'a' && *CurPtr <= 'f') ||
|
|
(*CurPtr >= 'A' && *CurPtr <= 'F'))
|
|
++CurPtr;
|
|
if (CurPtr - TokStart == 2) {
|
|
diagnose(CurPtr, diag::lex_expected_digit_in_int_literal);
|
|
return formToken(tok::unknown, TokStart);
|
|
}
|
|
return formToken(tok::integer_literal, TokStart);
|
|
} else if (*TokStart == '0' && *CurPtr == 'o') {
|
|
// 0o[0-7]+
|
|
++CurPtr;
|
|
while (*CurPtr >= '0' && *CurPtr <= '7')
|
|
++CurPtr;
|
|
if (CurPtr - TokStart == 2) {
|
|
diagnose(CurPtr, diag::lex_expected_digit_in_int_literal);
|
|
return formToken(tok::unknown, TokStart);
|
|
}
|
|
return formToken(tok::integer_literal, TokStart);
|
|
} else if (*TokStart == '0' && *CurPtr == 'b') {
|
|
// 0b[01]+
|
|
++CurPtr;
|
|
while (*CurPtr == '0' || *CurPtr == '1')
|
|
++CurPtr;
|
|
if (CurPtr - TokStart == 2) {
|
|
diagnose(CurPtr, diag::lex_expected_digit_in_int_literal);
|
|
return formToken(tok::unknown, TokStart);
|
|
}
|
|
return formToken(tok::integer_literal, TokStart);
|
|
}
|
|
|
|
// Handle the leading character here as well.
|
|
--CurPtr;
|
|
|
|
// Handle a leading [0-9]+, lexing an integer or falling through if we have a
|
|
// floating point value.
|
|
if (isdigit(*CurPtr)) {
|
|
while (isdigit(*CurPtr))
|
|
++CurPtr;
|
|
|
|
// Floating literals must have '.', 'e', or 'E' after digits. If it is
|
|
// something else, then this is the end of the token.
|
|
if (*CurPtr != '.' && *CurPtr != 'e' && *CurPtr != 'E')
|
|
return formToken(tok::integer_literal, TokStart);
|
|
|
|
// Lex things like 4.x as '4' followed by a tok::period.
|
|
if (*CurPtr == '.' && !isdigit(CurPtr[1]) && !isValidExponent(CurPtr+1))
|
|
return formToken(tok::integer_literal, TokStart);
|
|
}
|
|
|
|
// Lex decimal point.
|
|
if (*CurPtr == '.') {
|
|
++CurPtr;
|
|
|
|
// Lex any digits after the decimal point.
|
|
while (isdigit(*CurPtr))
|
|
++CurPtr;
|
|
}
|
|
|
|
// Lex exponent.
|
|
if (*CurPtr == 'e' || *CurPtr == 'E') {
|
|
++CurPtr; // Eat the 'e'
|
|
if (*CurPtr != '+' && *CurPtr != '-') {
|
|
diagnose(CurPtr, diag::lex_expected_sign_in_fp);
|
|
return formToken(tok::unknown, TokStart);
|
|
}
|
|
++CurPtr; // Eat the sign.
|
|
|
|
if (!isdigit(*CurPtr)) {
|
|
diagnose(CurPtr, diag::lex_expected_digit_in_fp_exponent);
|
|
return formToken(tok::unknown, TokStart);
|
|
}
|
|
|
|
while (isdigit(*CurPtr))
|
|
++CurPtr;
|
|
}
|
|
|
|
return formToken(tok::floating_literal, TokStart);
|
|
}
|
|
|
|
/// EncodeToUTF8 - Encode the specified code point into a UTF8 stream. Return
|
|
/// true if it is an erroneous code point.
|
|
static bool EncodeToUTF8(unsigned CharValue,
|
|
llvm::SmallVectorImpl<char> &Result) {
|
|
assert(CharValue >= 0x80 && "Single-byte encoding should be already handled");
|
|
// Number of bits in the value, ignoring leading zeros.
|
|
unsigned NumBits = 32-llvm::CountLeadingZeros_32(CharValue);
|
|
|
|
// Handle the leading byte, based on the number of bits in the value.
|
|
unsigned NumTrailingBytes;
|
|
if (NumBits <= 5+6) {
|
|
// Encoding is 0x110aaaaa 10bbbbbb
|
|
Result.push_back(char(0xC0 | (CharValue >> 6)));
|
|
NumTrailingBytes = 1;
|
|
} else if (NumBits <= 4+6+6) {
|
|
// Encoding is 0x1110aaaa 10bbbbbb 10cccccc
|
|
Result.push_back(char(0xE0 | (CharValue >> (6+6))));
|
|
NumTrailingBytes = 2;
|
|
|
|
// UTF-16 surrogate pair values are not valid code points.
|
|
if (CharValue >= 0xD800 && CharValue <= 0xDFFF)
|
|
return true;
|
|
} else if (NumBits <= 3+6+6+6) {
|
|
// Encoding is 0x11110aaa 10bbbbbb 10cccccc 10dddddd
|
|
Result.push_back(char(0xF0 | (CharValue >> (6+6+6))));
|
|
NumTrailingBytes = 3;
|
|
// Reject over-large code points. These cannot be encoded as UTF-16
|
|
// surrogate pairs, so UTF-32 doesn't allow them.
|
|
if (CharValue > 0x10FFFF)
|
|
return true;
|
|
} else {
|
|
return true; // UTF8 can encode these, but they aren't valid code points.
|
|
}
|
|
|
|
// Emit all of the trailing bytes.
|
|
while (NumTrailingBytes--)
|
|
Result.push_back(char(0x80 | (0x3F & (CharValue >> (NumTrailingBytes*6)))));
|
|
return false;
|
|
}
|
|
|
|
|
|
/// CLO8 - Return the number of leading ones in the specified 8-bit value.
|
|
static unsigned CLO8(unsigned char C) {
|
|
return llvm::CountLeadingOnes_32(uint32_t(C) << 24);
|
|
}
|
|
|
|
/// isStartOfUTF8Character - Return true unless \p C is a UTF8 continuation
/// byte. Continuation bytes have the form 0b10XXXXXX, so everything below
/// 0x80 or at/above 0xC0 counts as a start.
static bool isStartOfUTF8Character(unsigned char C) {
  // Mask off the top two bits and compare them against the 0b10 pattern.
  return (C & 0xC0) != 0x80;
}
|
|
|
|
/// validateUTF8CharacterAndAdvance - Given a pointer to the starting byte of a
|
|
/// UTF8 character, validate it and advance the lexer past it. This returns the
|
|
/// encoded character or ~0U if the encoding is invalid.
|
|
static uint32_t validateUTF8CharacterAndAdvance(const char *&Ptr) {
|
|
assert((signed char)(*Ptr) < 0 && "Not the start of an encoded letter");
|
|
|
|
unsigned char CurByte = *Ptr++;
|
|
|
|
// Read the number of high bits set, which indicates the number of bytes in
|
|
// the character.
|
|
unsigned EncodedBytes = CLO8(CurByte);
|
|
|
|
// If this is 0b10XXXXXX, then it is a continuation character.
|
|
if (EncodedBytes == 1 ||
|
|
// If the number of encoded bytes is > 4, then this is an invalid
|
|
// character in the range of 0xF5 and above. These would start an
|
|
// encoding for something that couldn't be represented with UTF16
|
|
// digraphs, so Unicode rejects them.
|
|
EncodedBytes > 4) {
|
|
// Skip until we get the start of another character. This is guaranteed to
|
|
// at least stop at the nul at the end of the buffer.
|
|
while (!isStartOfUTF8Character(*Ptr))
|
|
++Ptr;
|
|
return ~0U;
|
|
}
|
|
|
|
// Drop the high bits indicating the # bytes of the result.
|
|
unsigned CharValue = (unsigned char)(CurByte << EncodedBytes) >> EncodedBytes;
|
|
|
|
// Read and validate the continuation bytes.
|
|
for (unsigned i = 1; i != EncodedBytes; ++i) {
|
|
CurByte = *Ptr;
|
|
// If the high bit isn't set or the second bit isn't clear, then this is not
|
|
// a continuation byte!
|
|
if (CurByte < 0x80 || CurByte >= 0xC0) return ~0U;
|
|
|
|
// Accumulate our result.
|
|
CharValue <<= 6;
|
|
CharValue |= CurByte & 0x3F;
|
|
++Ptr;
|
|
}
|
|
|
|
// If we got here, we read the appropriate number of accumulated bytes.
|
|
// Verify that the encoding was actually minimal.
|
|
// Number of bits in the value, ignoring leading zeros.
|
|
unsigned NumBits = 32-llvm::CountLeadingZeros_32(CharValue);
|
|
|
|
if (NumBits <= 5+6)
|
|
return EncodedBytes == 2 ? CharValue : ~0U;
|
|
if (NumBits <= 4+6+6)
|
|
return EncodedBytes == 3 ? CharValue : ~0U;
|
|
if (NumBits <= 3+6+6+6)
|
|
return EncodedBytes == 4 ? CharValue : ~0U;
|
|
return EncodedBytes == 5 ? CharValue : ~0U;
|
|
}
|
|
|
|
/// lexStringLiteral:
|
|
/// string_literal ::= ["]([^"\\\n\r]|string_escape)*["]
|
|
///
|
|
/// string_escape ::= [\][\] | [\]t | [\]n | [\]r | [\]" | [\]'
|
|
/// string_escape ::= [\]x hex hex
|
|
/// string_escape ::= [\]u hex hex hex hex
|
|
/// string_escape ::= [\]U hex hex hex hex hex hex hex hex
|
|
/// hex ::= [0-9a-fA-F]
|
|
void Lexer::lexStringLiteral() {
|
|
const char *TokStart = CurPtr-1;
|
|
assert(*TokStart == '"' && "Unexpected start");
|
|
|
|
llvm::SmallString<64> TempString;
|
|
|
|
while (1) {
|
|
unsigned CharValue = 0;
|
|
const char *CharStart = CurPtr;
|
|
switch (*CurPtr++) {
|
|
default: // Normal characters are part of the string.
|
|
// If this is a "high" UTF-8 character, validate it.
|
|
if ((signed char)(CurPtr[-1]) < 0) {
|
|
--CurPtr;
|
|
if (validateUTF8CharacterAndAdvance(CurPtr) == ~0U)
|
|
diagnose(CharStart, diag::lex_invalid_utf8_character);
|
|
}
|
|
continue;
|
|
// If we found the closing " character, we're done.
|
|
case '"':
|
|
return formToken(tok::string_literal, TokStart);
|
|
case 0:
|
|
if (CurPtr-2 != BufferEnd) {
|
|
diagnose(CurPtr-2, diag::lex_nul_character);
|
|
continue;
|
|
}
|
|
--CurPtr;
|
|
// FALL THROUGH.
|
|
case '\n': // String literals cannot have \n or \r in them.
|
|
case '\r':
|
|
diagnose(TokStart, diag::lex_unterminated_string);
|
|
return;
|
|
case '\\': // Escapes.
|
|
switch (*CurPtr) {
|
|
default: // Invalid escape.
|
|
diagnose(CurPtr, diag::lex_invalid_string_escape);
|
|
continue;
|
|
|
|
// Simple single-character escapes.
|
|
case 't':
|
|
case 'n':
|
|
case 'r':
|
|
case '"':
|
|
case '\'':
|
|
case '\\':
|
|
++CurPtr;
|
|
continue;
|
|
// Unicode escapes of various lengths.
|
|
case 'x': // \x HEX HEX
|
|
if (!isxdigit(CurPtr[1]) || !isxdigit(CurPtr[2])) {
|
|
diagnose(CurPtr, diag::lex_invalid_string_x_escape);
|
|
continue;
|
|
}
|
|
|
|
StringRef(CurPtr+1, 2).getAsInteger(16, CharValue);
|
|
|
|
// Reject \x80 and above, since it is going to encode into a multibyte
|
|
// unicode encoding, which is something that C folks may not expect.
|
|
if (CharValue >= 0x80)
|
|
diagnose(CurPtr, diag::lex_invalid_hex_escape);
|
|
|
|
CurPtr += 3;
|
|
break;
|
|
|
|
case 'u': // \u HEX HEX HEX HEX
|
|
if (!isxdigit(CurPtr[1]) || !isxdigit(CurPtr[2]) ||
|
|
!isxdigit(CurPtr[3]) || !isxdigit(CurPtr[4])) {
|
|
diagnose(CurPtr, diag::lex_invalid_string_u_escape);
|
|
continue;
|
|
}
|
|
|
|
StringRef(CurPtr+1, 4).getAsInteger(16, CharValue);
|
|
CurPtr += 5;
|
|
break;
|
|
case 'U': // \U HEX HEX HEX HEX HEX HEX HEX HEX
|
|
if (!isxdigit(CurPtr[1]) || !isxdigit(CurPtr[2]) ||
|
|
!isxdigit(CurPtr[3]) || !isxdigit(CurPtr[4]) ||
|
|
!isxdigit(CurPtr[5]) || !isxdigit(CurPtr[6]) ||
|
|
!isxdigit(CurPtr[7]) || !isxdigit(CurPtr[8])) {
|
|
diagnose(CurPtr, diag::lex_invalid_string_U_escape);
|
|
continue;
|
|
}
|
|
StringRef(CurPtr+1, 8).getAsInteger(16, CharValue);
|
|
CurPtr += 9;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Check to see if the encoding is valid.
|
|
if (CharValue >= 0x80 && EncodeToUTF8(CharValue, TempString))
|
|
diagnose(CharStart, diag::lex_invalid_unicode_code_point);
|
|
}
|
|
}
|
|
|
|
/// getEncodedStringLiteral - Given a string literal token, return the bytes
|
|
/// that the actual string literal should codegen to. If a copy needs to be
|
|
/// made, it will be allocated out of the ASTContext allocator.
|
|
StringRef Lexer::getEncodedStringLiteral(const Token &Str, ASTContext &Ctx) {
|
|
// Get the bytes behind the string literal, dropping the double quotes.
|
|
StringRef Bytes = Str.getText().drop_front().drop_back();
|
|
llvm::SmallString<64> TempString;
|
|
|
|
// Note that it is always safe to read one over the end of "Bytes" because
|
|
// we know that there is a terminating " character. Use BytesPtr to avoid a
|
|
// range check subscripting on the StringRef.
|
|
const char *BytesPtr = Bytes.begin();
|
|
while (BytesPtr != Bytes.end()) {
|
|
char CurChar = *BytesPtr++;
|
|
if (CurChar != '\\') {
|
|
TempString += CurChar;
|
|
continue;
|
|
}
|
|
|
|
// Invalid escapes are accepted by the lexer but diagnosed as an error. We
|
|
// just ignore them here.
|
|
unsigned CharValue; // Unicode character value for \x, \u, \U.
|
|
switch (*BytesPtr++) {
|
|
default:
|
|
continue; // Invalid escape, ignore it.
|
|
|
|
// Simple single-character escapes.
|
|
case 't': TempString += '\t'; continue;
|
|
case 'n': TempString += '\n'; continue;
|
|
case 'r': TempString += '\r'; continue;
|
|
case '"': TempString += '"'; continue;
|
|
case '\'': TempString += '\''; continue;
|
|
case '\\': TempString += '\\'; continue;
|
|
|
|
// Unicode escapes of various lengths.
|
|
case 'x': // \x HEX HEX
|
|
if (!isxdigit(BytesPtr[0]) || !isxdigit(BytesPtr[1]))
|
|
continue; // Ignore invalid escapes.
|
|
|
|
StringRef(BytesPtr, 2).getAsInteger(16, CharValue);
|
|
BytesPtr += 2;
|
|
break;
|
|
case 'u': // \u HEX HEX HEX HEX
|
|
if (!isxdigit(BytesPtr[0]) || !isxdigit(BytesPtr[1]) ||
|
|
!isxdigit(BytesPtr[2]) || !isxdigit(BytesPtr[3]))
|
|
continue; // Ignore invalid escapes.
|
|
|
|
StringRef(BytesPtr, 4).getAsInteger(16, CharValue);
|
|
BytesPtr += 4;
|
|
break;
|
|
case 'U': // \U HEX HEX HEX HEX HEX HEX HEX HEX
|
|
if (!isxdigit(BytesPtr[0]) || !isxdigit(BytesPtr[1]) ||
|
|
!isxdigit(BytesPtr[2]) || !isxdigit(BytesPtr[3]) ||
|
|
!isxdigit(BytesPtr[4]) || !isxdigit(BytesPtr[5]) ||
|
|
!isxdigit(BytesPtr[6]) || !isxdigit(BytesPtr[7]))
|
|
continue; // Ignore invalid escapes.
|
|
|
|
StringRef(BytesPtr, 8).getAsInteger(16, CharValue);
|
|
BytesPtr += 8;
|
|
break;
|
|
}
|
|
|
|
if (CharValue < 0x80)
|
|
TempString += (char)CharValue;
|
|
else
|
|
EncodeToUTF8(CharValue, TempString);
|
|
}
|
|
|
|
// If we didn't escape or reprocess anything, then we don't need to reallocate
|
|
// a copy of the string, just point to the lexer's version. We know that this
|
|
// is safe because unescaped strings are always shorter than their escaped
|
|
// forms (in a valid string).
|
|
if (TempString.size() == Bytes.size())
|
|
return Bytes;
|
|
|
|
auto Res = Ctx.AllocateCopy(TempString);
|
|
return StringRef(Res.data(), Res.size()); // ArrayRef to StringRef.
|
|
}
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// Main Lexer Loop
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
void Lexer::lexImpl() {
|
|
assert(CurPtr >= BufferStart &&
|
|
CurPtr <= BufferEnd && "Cur Char Pointer out of range!");
|
|
|
|
Restart:
|
|
// Remember the start of the token so we can form the text range.
|
|
const char *TokStart = CurPtr;
|
|
|
|
switch (*CurPtr++) {
|
|
default:
|
|
diagnose(CurPtr-1, diag::lex_invalid_character);
|
|
return formToken(tok::unknown, TokStart);
|
|
|
|
case ' ':
|
|
case '\t':
|
|
case '\n':
|
|
case '\r':
|
|
goto Restart; // Skip whitespace.
|
|
case 0:
|
|
// If this is a random nul character in the middle of a buffer, skip it as
|
|
// whitespace.
|
|
if (CurPtr-1 != BufferEnd) {
|
|
diagnose(CurPtr-1, diag::lex_nul_character);
|
|
goto Restart;
|
|
}
|
|
|
|
// Otherwise, this is the end of the buffer. Return EOF.
|
|
return formToken(tok::eof, TokStart);
|
|
|
|
case '(': {
|
|
// This is either l_paren or l_paren_space depending on whether there is
|
|
// whitespace before it.
|
|
bool PrecededBySpace;
|
|
|
|
// For these purposes, the start of the file is considered to be
|
|
// preceeded by infinite whitespace.
|
|
if (CurPtr - 1 == BufferStart) {
|
|
PrecededBySpace = true;
|
|
|
|
// Otherwise, our list of whitespace characters is pretty short.
|
|
} else {
|
|
char LastChar = *(CurPtr - 2);
|
|
PrecededBySpace = (isspace(LastChar) || LastChar == '\0');
|
|
}
|
|
|
|
if (PrecededBySpace)
|
|
return formToken(tok::l_paren_space, TokStart);
|
|
|
|
return formToken(tok::l_paren, TokStart);
|
|
}
|
|
case ')': return formToken(tok::r_paren, TokStart);
|
|
case '{': return formToken(tok::l_brace, TokStart);
|
|
case '}': return formToken(tok::r_brace, TokStart);
|
|
case '[': return formToken(tok::l_square, TokStart);
|
|
case ']': return formToken(tok::r_square, TokStart);
|
|
|
|
case '.':
|
|
if (isdigit(CurPtr[0])) // .42
|
|
return lexNumber();
|
|
|
|
return formToken(tok::period, TokStart);
|
|
case ',': return formToken(tok::comma, TokStart);
|
|
case ';': return formToken(tok::semi, TokStart);
|
|
case ':': return formToken(tok::colon, TokStart);
|
|
|
|
// Punctuator identifier characters.
|
|
case '/':
|
|
if (CurPtr[0] == '/') { // "//"
|
|
skipSlashSlashComment();
|
|
goto Restart;
|
|
}
|
|
|
|
if (CurPtr[0] == '*') { // "/*"
|
|
skipSlashStarComment();
|
|
goto Restart;
|
|
}
|
|
|
|
// '/' starts an operator identifier.
|
|
return lexOperatorIdentifier();
|
|
|
|
case '=':
|
|
case '-':
|
|
case '+':
|
|
case '*':
|
|
case '%':
|
|
case '<':
|
|
case '>':
|
|
case '!':
|
|
case '&':
|
|
case '|':
|
|
case '^':
|
|
return lexOperatorIdentifier();
|
|
|
|
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
|
|
case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
|
|
case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
|
|
case 'V': case 'W': case 'X': case 'Y': case 'Z':
|
|
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
|
|
case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
|
|
case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
|
|
case 'v': case 'w': case 'x': case 'y': case 'z':
|
|
case '_':
|
|
return lexIdentifier();
|
|
|
|
case '$':
|
|
return lexDollarIdent();
|
|
|
|
case '0': case '1': case '2': case '3': case '4':
|
|
case '5': case '6': case '7': case '8': case '9':
|
|
return lexNumber();
|
|
case '"':
|
|
return lexStringLiteral();
|
|
}
|
|
}
|
|
|
|
/// Return the location just past the token that starts at \p Loc.
///
/// An invalid \p Loc is returned unchanged. If no buffer in \p SM contains
/// \p Loc, or that buffer cannot be retrieved, an empty SourceLoc is
/// returned. Otherwise the token at \p Loc is lexed with a temporary lexer
/// that has no diagnostic engine, and \p Loc advanced by that token's length
/// is returned.
SourceLoc Lexer::getLocForEndOfToken(llvm::SourceMgr &SM, SourceLoc Loc) {
  // Nothing sensible can be computed from an invalid location.
  if (!Loc.isValid())
    return Loc;

  // Locate the buffer that owns this location.
  const int OwningBufferID = SM.FindBufferContainingLoc(Loc.Value);
  if (OwningBufferID < 0)
    return SourceLoc();

  const llvm::MemoryBuffer *OwningBuffer = SM.getMemoryBuffer(OwningBufferID);
  if (!OwningBuffer)
    return SourceLoc();

  // Lex exactly one token starting at Loc; the constructor primes it.
  Lexer TokenLexer(OwningBuffer->getBuffer(), SM, /*Diags=*/nullptr,
                   Loc.Value.getPointer());
  const unsigned TokenLength = TokenLexer.peekNextToken().getLength();
  return Loc.getAdvancedLoc(TokenLength);
}
|
|
|