mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
* replaced yet another variant of isWhitespace with the version from
clang/Basic/CharInfo.h. The major difference is that our variant used to
consider '\0' whitespace.
* made sure that we don't construct StringRefs that point after the end of the
buffer. If the buffer ends with "#", then MemoryBuffer will only guarantee
that there is one additional NUL character. memcmp(), OTOH, is allowed to
access the complete span of the provided memory. I colud not actually get
this to crash on OSX 10.10, but I do remember similar crashes we fixed in Clang.
* added checks to reject extra tokens at the end of the build configuration
directive -- see tests, that code used to compile without diagnostics. The
lexer tried to do this, but in a self-referential way -- by checking the
NextToken variable (which is actually the previous token, when viewed from
the point of lexImpl()). The checks I added are a little too strict, they
reject comments at the end of the directive, but at least we don't accept
strange constructs. Allowing comments would not be hard, just requires
factoring out lexer's routines to skip comments so that they accept a pointer
to the buffer and return the comment end point. Filed
<rdar://problem/16301704> Allow comments at the end of bulid configuration directives
for that.
Found by inspection... I was grepping the codebase for 'isWhitespace'.
Swift SVN r14959
388 lines
13 KiB
C++
388 lines
13 KiB
C++
//===--- Lexer.h - Swift Language Lexer -------------------------*- C++ -*-===//
|
|
//
|
|
// This source file is part of the Swift.org open source project
|
|
//
|
|
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
|
|
// Licensed under Apache License v2.0 with Runtime Library Exception
|
|
//
|
|
// See http://swift.org/LICENSE.txt for license information
|
|
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This file defines the Lexer interface.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#ifndef SWIFT_LEXER_H
|
|
#define SWIFT_LEXER_H
|
|
|
|
#include "swift/Basic/SourceLoc.h"
|
|
#include "swift/Basic/SourceManager.h"
|
|
#include "swift/Parse/Token.h"
|
|
#include "llvm/ADT/SmallVector.h"
|
|
|
|
namespace swift {
|
|
class DiagnosticEngine;
|
|
class InFlightDiagnostic;
|
|
class LangOptions;
|
|
|
|
template<typename ...T> struct Diag;
|
|
|
|
enum class CommentRetentionMode {
|
|
None,
|
|
AttachToNextToken,
|
|
ReturnAsTokens,
|
|
};
|
|
|
|
class Lexer {
|
|
const LangOptions &LangOpts;
|
|
const SourceManager &SourceMgr;
|
|
DiagnosticEngine *Diags;
|
|
const unsigned BufferID;
|
|
|
|
/// Pointer to the first character of the buffer, even in a lexer that
|
|
/// scans a subrange of the buffer.
|
|
const char *BufferStart;
|
|
|
|
/// Pointer to one past the end character of the buffer, even in a lexer
|
|
/// that scans a subrange of the buffer. Because the buffer is always
|
|
/// NUL-terminated, this points to the NUL terminator.
|
|
const char *BufferEnd;
|
|
|
|
/// Pointer to the artificial EOF that is located before BufferEnd. Useful
|
|
/// for lexing subranges of a buffer.
|
|
const char *ArtificialEOF = nullptr;
|
|
|
|
/// If non-null, points to the '\0' character in the buffer where we should
|
|
/// produce a code completion token.
|
|
const char *CodeCompletionPtr = nullptr;
|
|
|
|
/// Pointer to the next not consumed character.
|
|
const char *CurPtr;
|
|
|
|
/// @{
|
|
/// Members that are *not* permanent lexer state. The values only make sense
|
|
/// during the lexImpl() invocation. These variables are declared as members
|
|
/// rather than locals so that we don't have to thread them through to all
|
|
/// lexing helpers.
|
|
|
|
/// Points to the point in the source buffer where we started scanning for
|
|
/// the current token. Thus, the range [LastCommentBlockStart, CurPtr)
|
|
/// covers all comments and whitespace that we skipped, and the token itself.
|
|
const char *LastCommentBlockStart = nullptr;
|
|
|
|
/// True if we have seen a comment while scanning for the current token.
|
|
bool SeenComment = false;
|
|
|
|
/// @}
|
|
|
|
Token NextToken;
|
|
|
|
/// \brief This is true if we're lexing a .sil file instead of a .swift
|
|
/// file. This enables the 'sil' keyword.
|
|
const bool InSILMode;
|
|
|
|
const CommentRetentionMode RetainComments;
|
|
|
|
/// InSILBody - This is true when we're lexing the body of a SIL declaration
|
|
/// in a SIL file. This enables some context-sensitive lexing.
|
|
bool InSILBody = false;
|
|
|
|
public:
|
|
/// \brief Lexer state can be saved/restored to/from objects of this class.
|
|
class State {
|
|
public:
|
|
State() {}
|
|
|
|
bool isValid() const {
|
|
return Loc.isValid();
|
|
}
|
|
|
|
State advance(unsigned Offset) const {
|
|
assert(isValid());
|
|
return State(Loc.getAdvancedLoc(Offset));
|
|
}
|
|
|
|
private:
|
|
explicit State(SourceLoc Loc): Loc(Loc) {}
|
|
SourceLoc Loc;
|
|
friend class Lexer;
|
|
};
|
|
|
|
private:
|
|
Lexer(const Lexer&) = delete;
|
|
void operator=(const Lexer&) = delete;
|
|
|
|
/// The principal constructor used by public constructors below.
|
|
/// Don't use this constructor for other purposes, it does not initialize
|
|
/// everything.
|
|
Lexer(const LangOptions &Options,
|
|
const SourceManager &SourceMgr, DiagnosticEngine *Diags,
|
|
unsigned BufferID, bool InSILMode,
|
|
CommentRetentionMode RetainComments);
|
|
|
|
/// @{
|
|
/// Helper routines used in \c Lexer constructors.
|
|
void primeLexer();
|
|
void initSubLexer(Lexer &Parent, State BeginState, State EndState);
|
|
/// @}
|
|
|
|
public:
|
|
/// \brief Create a normal lexer that scans the whole source buffer.
|
|
///
|
|
/// \param Options - the language options under which to lex. By
|
|
/// design, language options only affect whether a token is valid
|
|
/// and/or the exact token kind produced (e.g. keyword or
|
|
/// identifier), but not things like how many characters are
|
|
/// consumed. If that changes, APIs like getLocForEndOfToken will
|
|
/// need to take a LangOptions explicitly.
|
|
/// \param InSILMode - whether we're parsing a SIL source file.
|
|
/// Unlike language options, this does affect primitive lexing, which
|
|
/// means that APIs like getLofForEndOfToken really ought to take
|
|
/// this flag; it's just that we don't care that much about fidelity
|
|
/// when parsing SIL files.
|
|
Lexer(const LangOptions &Options,
|
|
const SourceManager &SourceMgr, unsigned BufferID,
|
|
DiagnosticEngine *Diags, bool InSILMode,
|
|
CommentRetentionMode RetainComments = CommentRetentionMode::None)
|
|
: Lexer(Options, SourceMgr, Diags, BufferID, InSILMode, RetainComments) {
|
|
primeLexer();
|
|
}
|
|
|
|
/// \brief Create a lexer that scans a subrange of the source buffer.
|
|
Lexer(const LangOptions &Options,
|
|
const SourceManager &SourceMgr, unsigned BufferID,
|
|
DiagnosticEngine *Diags, bool InSILMode,
|
|
CommentRetentionMode RetainComments,
|
|
unsigned Offset, unsigned EndOffset)
|
|
: Lexer(Options, SourceMgr, Diags, BufferID, InSILMode, RetainComments) {
|
|
assert(Offset <= EndOffset && "invalid range");
|
|
initSubLexer(
|
|
*this,
|
|
State(getLocForStartOfBuffer().getAdvancedLoc(Offset)),
|
|
State(getLocForStartOfBuffer().getAdvancedLoc(EndOffset)));
|
|
}
|
|
|
|
/// \brief Create a sub-lexer that lexes from the same buffer, but scans
|
|
/// a subrange of the buffer.
|
|
///
|
|
/// \param Parent the parent lexer that scans the whole buffer
|
|
/// \param BeginState start of the subrange
|
|
/// \param EndState end of the subrange
|
|
Lexer(Lexer &Parent, State BeginState, State EndState)
|
|
: Lexer(Parent.LangOpts, Parent.SourceMgr, Parent.Diags, Parent.BufferID,
|
|
Parent.InSILMode, Parent.RetainComments) {
|
|
initSubLexer(Parent, BeginState, EndState);
|
|
}
|
|
|
|
/// \brief Returns true if this lexer will produce a code completion token.
|
|
bool isCodeCompletion() const {
|
|
return CodeCompletionPtr != nullptr;
|
|
}
|
|
|
|
void lex(Token &Result) {
|
|
Result = NextToken;
|
|
if (Result.isNot(tok::eof))
|
|
lexImpl();
|
|
}
|
|
|
|
bool isKeepingComments() const {
|
|
return RetainComments == CommentRetentionMode::ReturnAsTokens;
|
|
}
|
|
|
|
/// peekNextToken - Return the next token to be returned by Lex without
|
|
/// actually lexing it.
|
|
const Token &peekNextToken() const { return NextToken; }
|
|
|
|
/// \brief Returns the lexer state for the beginning of the given token
|
|
/// location. After restoring the state, lexer will return this token and
|
|
/// continue from there.
|
|
State getStateForBeginningOfTokenLoc(SourceLoc Loc) const;
|
|
|
|
/// \brief Returns the lexer state for the beginning of the given token.
|
|
/// After restoring the state, lexer will return this token and continue from
|
|
/// there.
|
|
State getStateForBeginningOfToken(const Token &Tok) const {
|
|
return getStateForBeginningOfTokenLoc(Tok.getLoc());
|
|
}
|
|
|
|
State getStateForEndOfTokenLoc(SourceLoc Loc) const {
|
|
return State(getLocForEndOfToken(SourceMgr, Loc));
|
|
}
|
|
|
|
/// \brief Restore the lexer state to a given one, that can be located either
|
|
/// before or after the current position.
|
|
void restoreState(State S) {
|
|
assert(S.isValid());
|
|
CurPtr = getBufferPtrForSourceLoc(S.Loc);
|
|
// Don't reemit diagnostics while readvancing the lexer.
|
|
auto TmpDiags = Diags;
|
|
Diags = nullptr;
|
|
lexImpl();
|
|
Diags = TmpDiags;
|
|
}
|
|
|
|
/// \brief Restore the lexer state to a given state that is located before
|
|
/// current position.
|
|
void backtrackToState(State S) {
|
|
assert(getBufferPtrForSourceLoc(S.Loc) <= CurPtr &&
|
|
"can't backtrack forward");
|
|
restoreState(S);
|
|
}
|
|
|
|
/// \brief Retrieve the source location that points just past the
|
|
/// end of the token refered to by \c Loc.
|
|
///
|
|
/// \param SM The source manager in which the given source location
|
|
/// resides.
|
|
///
|
|
/// \param Loc The source location of the beginning of a token.
|
|
static SourceLoc getLocForEndOfToken(const SourceManager &SM, SourceLoc Loc);
|
|
|
|
/// Return the start location of the token that the offset in the given buffer
|
|
/// points to.
|
|
///
|
|
/// Note that this is more expensive than \c getLocForEndOfToken because it
|
|
/// finds and re-lexes from the beginning of the line.
|
|
///
|
|
/// Due to the parser splitting tokens the adjustment may be incorrect, e.g:
|
|
/// \code
|
|
/// func +<T>(a : T, b : T)
|
|
/// \endcode
|
|
/// The start of the '<' token is '<', but the lexer will produce "+<" before
|
|
/// the parser splits it up.
|
|
////
|
|
/// If the offset points to whitespace the returned source location will point
|
|
/// to the whitespace offset.
|
|
static SourceLoc getLocForStartOfToken(SourceManager &SM, unsigned BufferID,
|
|
unsigned Offset);
|
|
|
|
/// \brief Determines if the given string is a valid non-operator
|
|
/// identifier, without escaping characters.
|
|
static bool isIdentifier(StringRef identifier);
|
|
|
|
SourceLoc getLocForStartOfBuffer() const {
|
|
return SourceLoc(llvm::SMLoc::getFromPointer(BufferStart));
|
|
}
|
|
|
|
/// StringSegment - A segment of a (potentially interpolated) string.
|
|
struct StringSegment {
|
|
enum : char { Literal, Expr } Kind;
|
|
// Loc+Length for the segment inside the string literal, without quotes.
|
|
SourceLoc Loc;
|
|
unsigned Length;
|
|
|
|
static StringSegment getLiteral(SourceLoc Loc, unsigned Length) {
|
|
StringSegment Result;
|
|
Result.Kind = Literal;
|
|
Result.Loc = Loc;
|
|
Result.Length = Length;
|
|
return Result;
|
|
}
|
|
|
|
static StringSegment getExpr(SourceLoc Loc, unsigned Length) {
|
|
StringSegment Result;
|
|
Result.Kind = Expr;
|
|
Result.Loc = Loc;
|
|
Result.Length = Length;
|
|
return Result;
|
|
}
|
|
};
|
|
|
|
/// \brief Compute the bytes that the actual string literal should codegen to.
|
|
/// If a copy needs to be made, it will be allocated out of the provided
|
|
/// Buffer.
|
|
static StringRef getEncodedStringSegment(StringRef Str,
|
|
SmallVectorImpl<char> &Buffer);
|
|
StringRef getEncodedStringSegment(StringSegment Segment,
|
|
SmallVectorImpl<char> &Buffer) const {
|
|
return getEncodedStringSegment(
|
|
StringRef(getBufferPtrForSourceLoc(Segment.Loc), Segment.Length),
|
|
Buffer);
|
|
}
|
|
|
|
/// \brief Given a string literal token, separate it into string/expr segments
|
|
/// of a potentially interpolated string.
|
|
static void getStringLiteralSegments(
|
|
const Token &Str,
|
|
SmallVectorImpl<StringSegment> &Segments,
|
|
DiagnosticEngine *Diags);
|
|
|
|
void getStringLiteralSegments(const Token &Str,
|
|
SmallVectorImpl<StringSegment> &Segments) {
|
|
return getStringLiteralSegments(Str, Segments, Diags);
|
|
}
|
|
|
|
/// \brief Return the UTF32 codepoint for the specified character literal.
|
|
uint32_t getEncodedCharacterLiteral(const Token &Tok);
|
|
|
|
static SourceLoc getSourceLoc(const char *Loc) {
|
|
return SourceLoc(llvm::SMLoc::getFromPointer(Loc));
|
|
}
|
|
|
|
/// Get the token that starts at the given location.
|
|
Token getTokenAt(SourceLoc Loc);
|
|
|
|
/// SILBodyRAII - This helper class is used when parsing a SIL body to inform
|
|
/// the lexer that SIL-specific lexing should be enabled.
|
|
struct SILBodyRAII {
|
|
Lexer &L;
|
|
SILBodyRAII(Lexer &L) : L(L) {
|
|
assert(!L.InSILBody && "Already in a sil body?");
|
|
L.InSILBody = true;
|
|
}
|
|
~SILBodyRAII() {
|
|
assert(L.InSILBody && "Left sil body already?");
|
|
L.InSILBody = false;
|
|
}
|
|
SILBodyRAII(const SILBodyRAII&) = delete;
|
|
void operator=(const SILBodyRAII&) = delete;
|
|
};
|
|
|
|
private:
|
|
/// For a source location in the current buffer, returns the corresponding
|
|
/// pointer.
|
|
const char *getBufferPtrForSourceLoc(SourceLoc Loc) const {
|
|
return BufferStart + SourceMgr.getLocOffsetInBuffer(Loc, BufferID);
|
|
}
|
|
|
|
StringRef getSubstring(const char *Start, unsigned Length) const {
|
|
assert(Start >= BufferStart && Start <= BufferEnd);
|
|
unsigned BytesUntilBufferEnd = BufferEnd - Start;
|
|
if (Length > BytesUntilBufferEnd)
|
|
Length = BytesUntilBufferEnd;
|
|
return StringRef(Start, Length);
|
|
}
|
|
|
|
void lexImpl();
|
|
InFlightDiagnostic diagnose(const char *Loc, Diag<> ID);
|
|
void formToken(tok Kind, const char *TokStart);
|
|
|
|
void skipToEndOfLine();
|
|
|
|
/// Skip to the end of the line of a // comment.
|
|
void skipSlashSlashComment();
|
|
|
|
/// Skip a #! hashbang line.
|
|
void skipHashbang();
|
|
|
|
void skipSlashStarComment();
|
|
void lexIdentifier();
|
|
void lexDollarIdent();
|
|
void lexOperatorIdentifier();
|
|
void lexHexNumber();
|
|
void lexNumber();
|
|
|
|
unsigned lexCharacter(const char *&CurPtr,
|
|
bool StopAtDoubleQuote, bool EmitDiagnostics);
|
|
void lexCharacterLiteral();
|
|
void lexStringLiteral();
|
|
void lexEscapedIdentifier();
|
|
};
|
|
|
|
|
|
} // end namespace swift
|
|
|
|
#endif
|