mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
LLVM is presumably moving towards `std::string_view` - `StringRef::startswith` is deprecated on tip. `SmallString::startswith` was simply renamed there (maybe with a short deprecation period in between, but if so, we've missed it). The `SmallString::startswith` references were moved to `.str().starts_with()`, rather than adding `starts_with` on `stable/20230725`, as we only had a few of them. Open to switching that over if anyone feels strongly though.
815 lines
25 KiB
C++
815 lines
25 KiB
C++
#include "swift/AST/DiagnosticConsumer.h"
|
|
#include "swift/AST/DiagnosticEngine.h"
|
|
#include "swift/Basic/Defer.h"
|
|
#include "swift/Basic/LangOptions.h"
|
|
#include "swift/Basic/SourceManager.h"
|
|
#include "swift/Parse/Lexer.h"
|
|
#include "swift/Subsystems.h"
|
|
#include "llvm/Support/MemoryBuffer.h"
|
|
#include "llvm/Support/Process.h"
|
|
#include "gtest/gtest.h"
|
|
|
|
#if __has_include(<sys/mman.h>)
|
|
# include <sys/mman.h>
|
|
# define HAS_MMAP 1
|
|
#else
|
|
# define HAS_MMAP 0
|
|
#endif
|
|
|
|
using namespace swift;
|
|
using namespace llvm;
|
|
|
|
// The test fixture.
|
|
class LexerTest : public ::testing::Test {
|
|
public:
|
|
LangOptions LangOpts;
|
|
SourceManager SourceMgr;
|
|
|
|
std::vector<Token> tokenizeAndKeepEOF(unsigned BufferID) {
|
|
Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr,
|
|
LexerMode::Swift);
|
|
std::vector<Token> Tokens;
|
|
do {
|
|
Tokens.emplace_back();
|
|
L.lex(Tokens.back());
|
|
} while (Tokens.back().isNot(tok::eof));
|
|
return Tokens;
|
|
}
|
|
|
|
std::vector<Token> checkLex(StringRef Source,
|
|
ArrayRef<tok> ExpectedTokens,
|
|
bool KeepComments = false,
|
|
bool KeepEOF = false) {
|
|
unsigned BufID = SourceMgr.addMemBufferCopy(Source);
|
|
|
|
std::vector<Token> Toks;
|
|
if (KeepEOF)
|
|
Toks = tokenizeAndKeepEOF(BufID);
|
|
else
|
|
Toks = tokenize(LangOpts, SourceMgr, BufID, 0, 0, /*Diags=*/nullptr, KeepComments);
|
|
EXPECT_EQ(ExpectedTokens.size(), Toks.size());
|
|
for (unsigned i = 0, e = ExpectedTokens.size(); i != e; ++i) {
|
|
EXPECT_EQ(ExpectedTokens[i], Toks[i].getKind()) << "i = " << i;
|
|
}
|
|
|
|
return Toks;
|
|
}
|
|
|
|
SourceLoc getLocForEndOfToken(SourceLoc Loc) {
|
|
return Lexer::getLocForEndOfToken(SourceMgr, Loc);
|
|
}
|
|
};
|
|
|
|
// With comment retention off, only the two parentheses are expected.
TEST_F(LexerTest, TokenizeSkipComments) {
  const char *Input =
      "// Blah\n"
      "(/*yo*/)";
  const std::vector<tok> Expected = {tok::l_paren, tok::r_paren};
  checkLex(Input, Expected, /*KeepComments=*/false);
}
|
|
|
|
// With comment retention on, both comments are expected as tokens, with the
// lengths checked below.
TEST_F(LexerTest, TokenizeWithComments) {
  const char *Input =
      "// Blah\n"
      "(/*yo*/)";
  const std::vector<tok> Expected = {
      tok::comment, tok::l_paren, tok::comment, tok::r_paren};

  const std::vector<Token> Lexed =
      checkLex(Input, Expected, /*KeepComments=*/true);

  EXPECT_EQ(Lexed[0].getLength(), 8U);
  EXPECT_EQ(Lexed[2].getLength(), 6U);

  // The end of the first comment must be its start advanced by its length.
  const SourceLoc FirstLoc = Lexed[0].getLoc();
  EXPECT_EQ(getLocForEndOfToken(FirstLoc), FirstLoc.getAdvancedLoc(8));
}
|
|
|
|
// The tok::eof token that follows the identifier must have length zero.
TEST_F(LexerTest, EOFTokenLengthIsZero) {
  const char *Input = "meow";
  const std::vector<tok> Expected = {tok::identifier, tok::eof};

  const std::vector<Token> Lexed = checkLex(Input, Expected,
                                            /*KeepComments=*/true,
                                            /*KeepEOF=*/true);

  EXPECT_EQ(Lexed[1].getLength(), 0U);
}
|
|
|
|
// An unterminated string literal ending in a NUL byte is expected to lex as
// a single tok::unknown covering all six bytes, followed by an empty eof.
TEST_F(LexerTest, BrokenStringLiteral1) {
  const StringRef Input("\"meow\0", 6);
  const std::vector<tok> Expected = {tok::unknown, tok::eof};

  const std::vector<Token> Lexed = checkLex(Input, Expected,
                                            /*KeepComments=*/true,
                                            /*KeepEOF=*/true);

  EXPECT_EQ(Lexed[0].getLength(), 6U);
  EXPECT_EQ(Lexed[1].getLength(), 0U);
}
|
|
|
|
// An unterminated literal containing an unterminated interpolation and a
// trailing NUL is expected to lex as one tok::unknown covering all eight
// bytes, followed by an empty eof.
TEST_F(LexerTest, BrokenStringLiteral2) {
  const StringRef Input("\"\\(meow\0", 8);
  const std::vector<tok> Expected = {tok::unknown, tok::eof};

  const std::vector<Token> Lexed = checkLex(Input, Expected,
                                            /*KeepComments=*/true,
                                            /*KeepEOF=*/true);

  EXPECT_EQ(Lexed[0].getLength(), 8U);
  EXPECT_EQ(Lexed[1].getLength(), 0U);
}
|
|
|
|
// A terminated string literal whose only content is a NUL byte is expected
// to lex as a three-byte tok::string_literal, followed by an empty eof.
TEST_F(LexerTest, StringLiteralWithNUL1) {
  const StringRef Input("\"\0\"", 3);
  const std::vector<tok> Expected = {tok::string_literal, tok::eof};

  const std::vector<Token> Lexed = checkLex(Input, Expected,
                                            /*KeepComments=*/true,
                                            /*KeepEOF=*/true);

  EXPECT_EQ(Lexed[0].getLength(), 3U);
  EXPECT_EQ(Lexed[1].getLength(), 0U);
}
|
|
|
|
// The first token lexed from a buffer that begins with a hashbang line is
// expected to be the identifier on the following line (offset 17).
TEST_F(LexerTest, ContentStartHashbangSkip) {
  const char *Input = "#!/usr/bin/swift\naaa";

  LangOptions Opts;
  SourceManager SM;
  const unsigned BufID = SM.addMemBufferCopy(StringRef(Input));

  Lexer Lex(Opts, SM, BufID, /*Diags=*/nullptr, LexerMode::Swift);

  Token Cur;
  Lex.lex(Cur);

  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("aaa", Cur.getText());
  ASSERT_EQ(SM.getLocForOffset(BufID, 17), Cur.getLoc());
}
|
|
|
|
// Same as ContentStartHashbangSkip, but the buffer starts with a 3-byte
// UTF-8 BOM, so the identifier is expected at offset 20.
TEST_F(LexerTest, ContentStartHashbangSkipUTF8BOM) {
  const char *Input = "\xEF\xBB\xBF" "#!/usr/bin/swift\naaa";

  LangOptions Opts;
  SourceManager SM;
  const unsigned BufID = SM.addMemBufferCopy(StringRef(Input));

  Lexer Lex(Opts, SM, BufID, /*Diags=*/nullptr, LexerMode::Swift);

  Token Cur;
  Lex.lex(Cur);

  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("aaa", Cur.getText());
  ASSERT_EQ(SM.getLocForOffset(BufID, 20), Cur.getLoc());
}
|
|
|
|
// An operator at the very start of the buffer, directly followed by an
// identifier, is expected to lex as a prefix operator at offset 0.
TEST_F(LexerTest, ContentStartOperatorLeftBound) {
  const char *Input = "+a";

  LangOptions Opts;
  SourceManager SM;
  const unsigned BufID = SM.addMemBufferCopy(StringRef(Input));

  Lexer Lex(Opts, SM, BufID, /*Diags=*/nullptr, LexerMode::Swift);

  Token Cur;
  Lex.lex(Cur);

  ASSERT_EQ(tok::oper_prefix, Cur.getKind());
  ASSERT_EQ("+", Cur.getText());
  ASSERT_EQ(SM.getLocForOffset(BufID, 0), Cur.getLoc());
}
|
|
|
|
// Same as ContentStartOperatorLeftBound, but preceded by a UTF-8 BOM: the
// operator is still expected to be a prefix operator, now at offset 3.
TEST_F(LexerTest, ContentStartOperatorLeftBoundUTF8BOM) {
  const char *Input = "\xEF\xBB\xBF" "+a";

  LangOptions Opts;
  SourceManager SM;
  const unsigned BufID = SM.addMemBufferCopy(StringRef(Input));

  Lexer Lex(Opts, SM, BufID, /*Diags=*/nullptr, LexerMode::Swift);

  Token Cur;
  Lex.lex(Cur);

  ASSERT_EQ(tok::oper_prefix, Cur.getKind());
  ASSERT_EQ("+", Cur.getText());
  ASSERT_EQ(SM.getLocForOffset(BufID, 3), Cur.getLoc());
}
|
|
|
|
// When the buffer opens with a version-control conflict region, the first
// token returned is expected to be the identifier that follows the region.
TEST_F(LexerTest, ContentStartConflictMarker) {
  const char *Input =
      "<<<<<<< HEAD\n"
      "xxx\n"
      "=======\n"
      "yyy\n"
      ">>>>>>> 12345670\n"
      "aaa";

  LangOptions Opts;
  SourceManager SM;
  const unsigned BufID = SM.addMemBufferCopy(StringRef(Input));

  Lexer Lex(Opts, SM, BufID, /*Diags=*/nullptr, LexerMode::Swift);

  Token Cur;
  Lex.lex(Cur);

  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("aaa", Cur.getText());
}
|
|
|
|
// Same as ContentStartConflictMarker, with a UTF-8 BOM in front of the
// conflict region.
TEST_F(LexerTest, ContentStartConflictMarkerUTF8BOM) {
  const char *Input =
      "\xEF\xBB\xBF"
      "<<<<<<< HEAD\n"
      "xxx\n"
      "=======\n"
      "yyy\n"
      ">>>>>>> 12345670\n"
      "aaa";

  LangOptions Opts;
  SourceManager SM;
  const unsigned BufID = SM.addMemBufferCopy(StringRef(Input));

  Lexer Lex(Opts, SM, BufID, /*Diags=*/nullptr, LexerMode::Swift);

  Token Cur;
  Lex.lex(Cur);

  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("aaa", Cur.getText());
}
|
|
|
|
// The very first token of a buffer is expected to be flagged as being at
// the start of a line.
TEST_F(LexerTest, ContentStartTokenIsStartOfLine) {
  const char *Input = "aaa";

  LangOptions Opts;
  SourceManager SM;
  const unsigned BufID = SM.addMemBufferCopy(StringRef(Input));

  Lexer Lex(Opts, SM, BufID, /*Diags=*/nullptr, LexerMode::Swift);

  Token Cur;
  Lex.lex(Cur);

  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("aaa", Cur.getText());
  ASSERT_EQ(SM.getLocForOffset(BufID, 0), Cur.getLoc());
  ASSERT_TRUE(Cur.isAtStartOfLine());
}
|
|
|
|
// Same as ContentStartTokenIsStartOfLine, with a leading UTF-8 BOM: the
// token sits at offset 3 and is still expected to be at the start of a line.
TEST_F(LexerTest, ContentStartTokenIsStartOfLineUTF8BOM) {
  const char *Input = "\xEF\xBB\xBF" "aaa";

  LangOptions Opts;
  SourceManager SM;
  const unsigned BufID = SM.addMemBufferCopy(StringRef(Input));

  Lexer Lex(Opts, SM, BufID, /*Diags=*/nullptr, LexerMode::Swift);

  Token Cur;
  Lex.lex(Cur);

  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("aaa", Cur.getText());
  ASSERT_EQ(SM.getLocForOffset(BufID, 3), Cur.getLoc());
  ASSERT_TRUE(Cur.isAtStartOfLine());
}
|
|
|
|
// With CommentRetentionMode::None, comments produce no tokens and every
// token is expected to carry an empty comment range located at the token.
TEST_F(LexerTest, BOMNoCommentNoTrivia) {
  const char *Input = "\xEF\xBB\xBF" "// comment\naaa //xx \n/* x */";

  LangOptions Opts;
  SourceManager SM;
  const unsigned BufID = SM.addMemBufferCopy(StringRef(Input));

  Lexer Lex(Opts, SM, BufID, /*Diags=*/nullptr, LexerMode::Swift,
            HashbangMode::Disallowed, CommentRetentionMode::None);

  Token Cur;

  // Identifier `aaa` at offset 14.
  Lex.lex(Cur);
  const SourceLoc IdentLoc = SM.getLocForOffset(BufID, 14);
  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("aaa", Cur.getText());
  ASSERT_EQ(IdentLoc, Cur.getLoc());
  ASSERT_EQ(IdentLoc, Cur.getCommentRange().getStart());
  ASSERT_EQ(0u, Cur.getCommentRange().getByteLength());

  // End of file at offset 31.
  Lex.lex(Cur);
  const SourceLoc EndLoc = SM.getLocForOffset(BufID, 31);
  ASSERT_EQ(tok::eof, Cur.getKind());
  ASSERT_EQ(EndLoc, Cur.getLoc());
  ASSERT_EQ(EndLoc, Cur.getCommentRange().getStart());
  ASSERT_EQ(0u, Cur.getCommentRange().getByteLength());
}
|
|
|
|
// With CommentRetentionMode::ReturnAsTokens, each comment is expected as its
// own tok::comment token, and every token carries an empty comment range.
TEST_F(LexerTest, BOMTokenCommentNoTrivia) {
  const char *Input = "\xEF\xBB\xBF" "// comment\naaa //xx \n/* x */";

  LangOptions Opts;
  SourceManager SM;
  const unsigned BufID = SM.addMemBufferCopy(StringRef(Input));

  Lexer Lex(Opts, SM, BufID, /*Diags=*/nullptr, LexerMode::Swift,
            HashbangMode::Disallowed, CommentRetentionMode::ReturnAsTokens);

  Token Cur;

  // Leading line comment, right after the BOM (offset 3).
  Lex.lex(Cur);
  const SourceLoc FirstCommentLoc = SM.getLocForOffset(BufID, 3);
  ASSERT_EQ(tok::comment, Cur.getKind());
  ASSERT_EQ("// comment\n", Cur.getText());
  ASSERT_EQ(FirstCommentLoc, Cur.getLoc());
  ASSERT_EQ(FirstCommentLoc, Cur.getCommentRange().getStart());
  ASSERT_EQ(0u, Cur.getCommentRange().getByteLength());

  // Identifier `aaa` at offset 14.
  Lex.lex(Cur);
  const SourceLoc IdentLoc = SM.getLocForOffset(BufID, 14);
  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("aaa", Cur.getText());
  ASSERT_EQ(IdentLoc, Cur.getLoc());
  ASSERT_EQ(IdentLoc, Cur.getCommentRange().getStart());
  ASSERT_EQ(0u, Cur.getCommentRange().getByteLength());

  // Trailing line comment at offset 18.
  Lex.lex(Cur);
  const SourceLoc SecondCommentLoc = SM.getLocForOffset(BufID, 18);
  ASSERT_EQ(tok::comment, Cur.getKind());
  ASSERT_EQ("//xx \n", Cur.getText());
  ASSERT_EQ(SecondCommentLoc, Cur.getLoc());
  ASSERT_EQ(SecondCommentLoc, Cur.getCommentRange().getStart());
  ASSERT_EQ(0u, Cur.getCommentRange().getByteLength());

  // Block comment at offset 24.
  Lex.lex(Cur);
  const SourceLoc BlockCommentLoc = SM.getLocForOffset(BufID, 24);
  ASSERT_EQ(tok::comment, Cur.getKind());
  ASSERT_EQ("/* x */", Cur.getText());
  ASSERT_EQ(BlockCommentLoc, Cur.getLoc());
  ASSERT_EQ(BlockCommentLoc, Cur.getCommentRange().getStart());
  ASSERT_EQ(0u, Cur.getCommentRange().getByteLength());

  // End of file at offset 31.
  Lex.lex(Cur);
  const SourceLoc EndLoc = SM.getLocForOffset(BufID, 31);
  ASSERT_EQ(tok::eof, Cur.getKind());
  ASSERT_EQ(EndLoc, Cur.getLoc());
  ASSERT_EQ(EndLoc, Cur.getCommentRange().getStart());
  ASSERT_EQ(0u, Cur.getCommentRange().getByteLength());
}
|
|
|
|
// With CommentRetentionMode::AttachToNextToken, comments produce no tokens
// of their own; the following token's comment range is expected to cover
// them instead.
TEST_F(LexerTest, BOMAttachCommentNoTrivia) {
  const char *Input = "\xEF\xBB\xBF" "// comment\naaa //xx \n/* x */";

  LangOptions Opts;
  SourceManager SM;
  const unsigned BufID = SM.addMemBufferCopy(StringRef(Input));

  Lexer Lex(Opts, SM, BufID, /*Diags=*/nullptr, LexerMode::Swift,
            HashbangMode::Disallowed, CommentRetentionMode::AttachToNextToken);

  Token Cur;

  // Identifier at offset 14; its comment range starts at offset 3 and is
  // 10 bytes long.
  Lex.lex(Cur);
  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("aaa", Cur.getText());
  ASSERT_EQ(SM.getLocForOffset(BufID, 14), Cur.getLoc());
  ASSERT_EQ(SM.getLocForOffset(BufID, 3), Cur.getCommentRange().getStart());
  ASSERT_EQ(10u, Cur.getCommentRange().getByteLength());

  // EOF at offset 31; its comment range starts at offset 18 and is 13 bytes
  // long.
  Lex.lex(Cur);
  ASSERT_EQ(tok::eof, Cur.getKind());
  ASSERT_EQ(SM.getLocForOffset(BufID, 31), Cur.getLoc());
  ASSERT_EQ(SM.getLocForOffset(BufID, 18), Cur.getCommentRange().getStart());
  ASSERT_EQ(13u, Cur.getCommentRange().getByteLength());
}
|
|
|
|
// Saves the lexer state at the start of `bbb`, lexes through to EOF, then
// restores and checks that the same tokens are produced again.
TEST_F(LexerTest, RestoreBasic) {
  const char *Input = "aaa \t\0 bbb ccc";

  LangOptions Opts;
  SourceManager SM;
  // Explicit length: the source contains an embedded NUL byte.
  const unsigned BufID = SM.addMemBufferCopy(StringRef(Input, 14));

  Lexer Lex(Opts, SM, BufID, /*Diags=*/nullptr, LexerMode::Swift);

  Token Cur;

  Lex.lex(Cur);
  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("aaa", Cur.getText());
  ASSERT_TRUE(Cur.isAtStartOfLine());

  Lex.lex(Cur);
  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("bbb", Cur.getText());
  ASSERT_FALSE(Cur.isAtStartOfLine());

  // Checkpoint at the beginning of `bbb`.
  LexerState Saved = Lex.getStateForBeginningOfToken(Cur);

  Lex.lex(Cur);
  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("ccc", Cur.getText());
  ASSERT_FALSE(Cur.isAtStartOfLine());

  Lex.lex(Cur);
  ASSERT_EQ(tok::eof, Cur.getKind());

  // Rewind and replay from the checkpoint.
  Lex.restoreState(Saved);

  Lex.lex(Cur);
  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("bbb", Cur.getText());
  ASSERT_FALSE(Cur.isAtStartOfLine());

  Lex.lex(Cur);
  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("ccc", Cur.getText());
  ASSERT_FALSE(Cur.isAtStartOfLine());

  Lex.lex(Cur);
  ASSERT_EQ(tok::eof, Cur.getKind());
}
|
|
|
|
// Like RestoreBasic, but every token is on its own line: the start-of-line
// flag is expected to be reported identically after restoring the state.
TEST_F(LexerTest, RestoreNewlineFlag) {
  const char *Input = "aaa \n \0\tbbb \nccc";

  LangOptions Opts;
  SourceManager SM;
  // Explicit length: the source contains an embedded NUL byte.
  const unsigned BufID = SM.addMemBufferCopy(StringRef(Input, 16));

  Lexer Lex(Opts, SM, BufID, /*Diags=*/nullptr, LexerMode::Swift);

  Token Cur;

  Lex.lex(Cur);
  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("aaa", Cur.getText());
  ASSERT_TRUE(Cur.isAtStartOfLine());

  Lex.lex(Cur);
  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("bbb", Cur.getText());
  ASSERT_TRUE(Cur.isAtStartOfLine());

  // Checkpoint at the beginning of `bbb`.
  LexerState Saved = Lex.getStateForBeginningOfToken(Cur);

  Lex.lex(Cur);
  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("ccc", Cur.getText());
  ASSERT_TRUE(Cur.isAtStartOfLine());

  Lex.lex(Cur);
  ASSERT_EQ(tok::eof, Cur.getKind());

  // Rewind and replay from the checkpoint.
  Lex.restoreState(Saved);

  Lex.lex(Cur);
  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("bbb", Cur.getText());
  ASSERT_TRUE(Cur.isAtStartOfLine());

  Lex.lex(Cur);
  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("ccc", Cur.getText());
  ASSERT_TRUE(Cur.isAtStartOfLine());

  Lex.lex(Cur);
  ASSERT_EQ(tok::eof, Cur.getKind());
}
|
|
|
|
// Marks offset 6 (the embedded NUL byte) as the IDE inspection target and
// checks that a state saved after the resulting tok::code_complete does not
// reproduce that token when restored.
TEST_F(LexerTest, RestoreStopAtCodeCompletion) {
  const char *Input = "aaa \n \0\tbbb \nccc";

  LangOptions Opts;
  SourceManager SM;
  const unsigned BufID = SM.addMemBufferCopy(StringRef(Input, 16));
  SM.setIDEInspectionTarget(BufID, 6);

  Lexer Lex(Opts, SM, BufID, /*Diags=*/nullptr, LexerMode::Swift);

  Token Cur;

  Lex.lex(Cur);
  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("aaa", Cur.getText());
  ASSERT_TRUE(Cur.isAtStartOfLine());

  Lex.lex(Cur);
  ASSERT_EQ(tok::code_complete, Cur.getKind());
  ASSERT_TRUE(Cur.isAtStartOfLine());

  Lex.lex(Cur);
  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("bbb", Cur.getText());
  ASSERT_FALSE(Cur.isAtStartOfLine());

  // Checkpoint at the beginning of `bbb`, i.e. after the completion token.
  LexerState Saved = Lex.getStateForBeginningOfToken(Cur);

  Lex.lex(Cur);
  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("ccc", Cur.getText());
  ASSERT_TRUE(Cur.isAtStartOfLine());

  Lex.lex(Cur);
  ASSERT_EQ(tok::eof, Cur.getKind());

  Lex.restoreState(Saved);

  // The saved position is past tok::code_complete, so replaying from it
  // must start with `bbb` rather than the completion token.

  Lex.lex(Cur);
  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("bbb", Cur.getText());
  ASSERT_FALSE(Cur.isAtStartOfLine());

  Lex.lex(Cur);
  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("ccc", Cur.getText());
  ASSERT_TRUE(Cur.isAtStartOfLine());

  Lex.lex(Cur);
  ASSERT_EQ(tok::eof, Cur.getKind());
}
|
|
|
|
// UTF-8 continuation bytes lie in the inclusive range 0x80..0xBF. Each
// identifier below contains a continuation byte at one of those two bounds:
//
//   À(0xC3 0x80), 㗀(0xE3 0x97 0x80), 🀀(0xF0 0x9F 0x80 0x80),
//   ÿ(0xC3 0xBF), 俿(0xE4 0xBF 0xBF), 𐐿(0xF0 0x90 0x90 0xBF)
TEST_F(LexerTest, CharactersContainTheEdgeContinuationByte) {
  const char *Input = "À 㗀 🀀 ÿ 俿 𐐿";

  LangOptions Opts;
  SourceManager SM;
  const unsigned BufID = SM.addMemBufferCopy(Input);

  Lexer Lex(Opts, SM, BufID, /*Diags=*/nullptr, LexerMode::Swift);

  Token Cur;

  Lex.lex(Cur);
  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("À", Cur.getText());

  Lex.lex(Cur);
  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("㗀", Cur.getText());

  Lex.lex(Cur);
  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("🀀", Cur.getText());

  Lex.lex(Cur);
  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("ÿ", Cur.getText());

  Lex.lex(Cur);
  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("俿", Cur.getText());

  Lex.lex(Cur);
  ASSERT_EQ(tok::identifier, Cur.getKind());
  ASSERT_EQ("𐐿", Cur.getText());

  Lex.lex(Cur);
  ASSERT_EQ(tok::eof, Cur.getKind());
}
|
|
|
|
// For a set of byte offsets, checks which offset
// Lexer::getLocForStartOfToken reports as the start of the enclosing token.
TEST_F(LexerTest, getLocForStartOfToken) {
  const char *Input = "aaa \n \tbbb \"hello\" \"-\\(val)-\"";

  const unsigned BufID = SourceMgr.addMemBufferCopy(Input);

  // Each row is {queried character offset, expected token start offset}.
  const unsigned Cases[][2] = {
      {1, 0}, {2, 0}, {3, 3}, {4, 4}, {6, 6}, {9, 7}, {14, 11},
      // interpolated string
      {20, 19}, {23, 23}, {24, 23}, {25, 23}, {26, 26}, {27, 19}};

  for (const auto &Case : Cases) {
    const unsigned CharOffset = Case[0];
    const unsigned TokenOffset = Case[1];
    ASSERT_EQ(Lexer::getLocForStartOfToken(SourceMgr, BufID, CharOffset),
              SourceMgr.getLocForOffset(BufID, TokenOffset));
  }
}
|
|
|
|
// Same checks as getLocForStartOfToken, with a #sourceLocation directive
// inserted after the first line.
TEST_F(LexerTest, getLocForStartOfTokenWithCustomSourceLocation) {
  const char *Input =
      "aaa \n"
      // The directive line is exactly 50 bytes, so offsets after it are the
      // previous test's offsets plus 50.
      "#sourceLocation(file: \"custom-50.swuft\", line: 9)\n"
      " \tbbb \"hello\" \"-\\(val)-\"";

  const unsigned BufID = SourceMgr.addMemBufferCopy(Input);

  // Each row is {queried character offset, expected token start offset}.
  const unsigned Cases[][2] = {
      {1, 0}, {2, 0}, {3, 3}, {4, 4},
      {56, 56}, {59, 57}, {64, 61},
      // interpolated string
      {70, 69}, {73, 73}, {74, 73}, {75, 73}, {76, 76}, {77, 69}};

  for (const auto &Case : Cases) {
    const unsigned CharOffset = Case[0];
    const unsigned TokenOffset = Case[1];
    ASSERT_EQ(Lexer::getLocForStartOfToken(SourceMgr, BufID, CharOffset),
              SourceMgr.getLocForOffset(BufID, TokenOffset));
  }
}
|
|
|
|
// Builds a lexer over the whole buffer, then a sub-lexer over a token range
// of it, then a sub-lexer of that sub-lexer, checking the tokens each yields.
TEST_F(LexerTest, NestedSubLexers) {
  const char *Input = "aaa0 bbb1 ccc2 ddd3 eee4 fff5 ggg6";

  const unsigned BufID = SourceMgr.addMemBufferCopy(Input);

  // Drains a lexer, returning every token up to and including tok::eof.
  auto lexUntilEOF = [](Lexer &Lx) {
    std::vector<Token> Out;
    do {
      Out.emplace_back();
      Lx.lex(Out.back());
    } while (Out.back().isNot(tok::eof));
    return Out;
  };

  Lexer Primary(LangOpts, SourceMgr, BufID, /*Diags=*/nullptr,
                LexerMode::Swift);
  std::vector<Token> TokensPrimary = lexUntilEOF(Primary);
  ASSERT_EQ(8U, TokensPrimary.size());
  ASSERT_EQ(tok::eof, TokensPrimary.back().getKind());

  // First sub-lexer: from the second token up to the last real token.
  Lexer Sub1(Primary, Primary.getStateForBeginningOfToken(TokensPrimary[1]),
             Primary.getStateForBeginningOfToken(
                 TokensPrimary[TokensPrimary.size() - 2]));
  std::vector<Token> TokensSub1 = lexUntilEOF(Sub1);
  ASSERT_EQ(6U, TokensSub1.size());
  ASSERT_EQ("bbb1", TokensSub1.front().getText());
  ASSERT_EQ("fff5", TokensSub1[TokensSub1.size() - 2].getText());
  ASSERT_EQ(tok::eof, TokensSub1.back().getKind());
  // The eof token's text is the token at which the sub-lexer was cut off.
  ASSERT_EQ("ggg6", TokensSub1.back().getText());

  // Second sub-lexer: the same narrowing applied to the first sub-lexer.
  Lexer Sub2(Sub1, Sub1.getStateForBeginningOfToken(TokensSub1[1]),
             Sub1.getStateForBeginningOfToken(
                 TokensSub1[TokensSub1.size() - 2]));
  std::vector<Token> TokensSub2 = lexUntilEOF(Sub2);
  ASSERT_EQ(4U, TokensSub2.size());
  ASSERT_EQ("ccc2", TokensSub2.front().getText());
  ASSERT_EQ("eee4", TokensSub2[TokensSub2.size() - 2].getText());
  ASSERT_EQ(tok::eof, TokensSub2.back().getKind());
  ASSERT_EQ("fff5", TokensSub2.back().getText());
}
|
|
|
|
// Editor placeholders (`<#...#>`) are expected to lex as single identifier
// tokens whose text includes the delimiters.
TEST_F(LexerTest, TokenizePlaceholder) {
  const char *Input = "aa <#one#> bb <# two #>";
  const std::vector<tok> Expected = {
      tok::identifier, tok::identifier, tok::identifier, tok::identifier};

  const std::vector<Token> Lexed = checkLex(Input, Expected);

  EXPECT_EQ("aa", Lexed[0].getText());
  EXPECT_EQ("<#one#>", Lexed[1].getText());
  EXPECT_EQ("bb", Lexed[2].getText());
  EXPECT_EQ("<# two #>", Lexed[3].getText());
}
|
|
|
|
// Malformed placeholder-like inputs: the first token of each must not be an
// identifier.
TEST_F(LexerTest, NoPlaceholder) {
  auto expectFirstTokenNotIdentifier = [&](StringRef Text) {
    const unsigned BufID = SourceMgr.addMemBufferCopy(Text);
    const std::vector<Token> Lexed = tokenize(LangOpts, SourceMgr, BufID, 0, 0,
                                              /*Diags=*/nullptr, false);
    ASSERT_FALSE(Lexed.empty());
    EXPECT_NE(tok::identifier, Lexed[0].getKind());
  };

  const char *const Inputs[] = {"<#", "<#a#", "<#a\n#>", "< #a#>"};
  for (const char *Input : Inputs)
    expectFirstTokenNotIdentifier(Input);
}
|
|
|
|
// For a placeholder nested inside another, only the inner `<#aa#>` is
// expected to lex as an identifier; the outer delimiters become operator
// and pound tokens.
TEST_F(LexerTest, NestedPlaceholder) {
  const char *Input = "<#<#aa#>#>";
  const std::vector<tok> Expected = {
      tok::oper_prefix, tok::pound, tok::identifier, tok::pound,
      tok::oper_postfix};

  const std::vector<Token> Lexed = checkLex(Input, Expected);

  EXPECT_EQ("<#aa#>", Lexed[2].getText());
}
|
|
|
|
class StringCaptureDiagnosticConsumer : public DiagnosticConsumer {
|
|
public:
|
|
virtual void handleDiagnostic(SourceManager &SM,
|
|
const swift::DiagnosticInfo &Info) override {
|
|
std::string DiagMsg;
|
|
llvm::raw_string_ostream DiagOS(DiagMsg);
|
|
DiagnosticEngine::formatDiagnosticText(DiagOS, Info.FormatString,
|
|
Info.FormatArgs);
|
|
auto LC = SM.getPresumedLineAndColumnForLoc(Info.Loc);
|
|
std::ostringstream StrOS;
|
|
StrOS << LC.first << ", " << LC.second << ": " << DiagOS.str();
|
|
messages.push_back(StrOS.str());
|
|
}
|
|
|
|
std::vector<std::string> messages;
|
|
};
|
|
|
|
/// Returns true if at least one string in \p strs begins with \p prefix.
///
/// An empty \p prefix matches any string; an empty \p strs never matches.
bool containsPrefix(const std::vector<std::string> &strs,
                    const std::string &prefix) {
  for (const std::string &candidate : strs) {
    // compare() clamps the length to the candidate's size, so a candidate
    // shorter than the prefix simply compares unequal.
    if (candidate.compare(0, prefix.size(), prefix) == 0)
      return true;
  }
  return false;
}
|
|
|
|
// Lexing a buffer with NUL bytes ahead of the first token is expected to
// report an "embedded nul" diagnostic at columns 2 and 4 of line 1.
TEST_F(LexerTest, DiagnoseEmbeddedNul) {
  const char Input[] = " \0 \0 aaa \0 \0 bbb";
  // Drop the implicit terminator; the embedded NULs are part of the input.
  const size_t InputLen = sizeof(Input) - 1;

  LangOptions Opts;
  SourceManager SM;
  const unsigned BufID = SM.addMemBufferCopy(StringRef(Input, InputLen));

  StringCaptureDiagnosticConsumer Captured;
  DiagnosticEngine Engine(SM);
  Engine.addConsumer(Captured);

  Lexer Lex(Opts, SM, BufID, &Engine,
            LexerMode::Swift, HashbangMode::Disallowed,
            CommentRetentionMode::None);

  Token Cur;
  Lex.lex(Cur);

  ASSERT_TRUE(containsPrefix(Captured.messages,
                             "1, 2: nul character embedded in middle of file"));
  ASSERT_TRUE(containsPrefix(Captured.messages,
                             "1, 4: nul character embedded in middle of file"));
}
|
|
|
|
// When the lexer is constructed with a start offset of 5, the NUL bytes that
// precede that offset (columns 2 and 4) must not be diagnosed.
TEST_F(LexerTest, DiagnoseEmbeddedNulOffset) {
  const char Input[] = " \0 \0 aaa \0 \0 bbb";
  // Drop the implicit terminator; the embedded NULs are part of the input.
  const size_t InputLen = sizeof(Input) - 1;

  LangOptions Opts;
  SourceManager SM;
  const unsigned BufID = SM.addMemBufferCopy(StringRef(Input, InputLen));

  StringCaptureDiagnosticConsumer Captured;
  DiagnosticEngine Engine(SM);
  Engine.addConsumer(Captured);

  Lexer Lex(Opts, SM, BufID, &Engine,
            LexerMode::Swift, HashbangMode::Disallowed,
            CommentRetentionMode::None,
            /*Offset=*/5, /*EndOffset=*/InputLen);

  ASSERT_FALSE(containsPrefix(
      Captured.messages, "1, 2: nul character embedded in middle of file"));
  ASSERT_FALSE(containsPrefix(
      Captured.messages, "1, 4: nul character embedded in middle of file"));
}
|
|
|
|
// A buffer holding only a stray UTF-8 continuation byte must produce exactly
// one diagnostic, mentioning invalid UTF-8.
TEST_F(LexerTest, InvalidUTF8Bytes) {
  const char *Source = "\x80";

  LangOptions LangOpts;
  SourceManager SourceMgr;
  unsigned BufferID = SourceMgr.addMemBufferCopy(Source);

  StringCaptureDiagnosticConsumer DiagConsumer;
  DiagnosticEngine Diags(SourceMgr);
  Diags.addConsumer(DiagConsumer);

  Lexer L(LangOpts, SourceMgr, BufferID, &Diags, LexerMode::Swift);

  Token Tok;

  L.lex(Tok);

  // Unsigned literal: avoids a signed/unsigned comparison against size().
  ASSERT_EQ(DiagConsumer.messages.size(), 1U);
  // Bind by reference; there is no need to copy the captured message.
  const std::string &Message = DiagConsumer.messages.front();
  ASSERT_TRUE(Message.find("invalid UTF-8 found in source file") !=
              std::string::npos);
}
|
|
|
|
#if HAS_MMAP
|
|
|
|
// This test requires mmap because llvm::sys::Memory doesn't support protecting
|
|
// pages to have no permissions.
|
|
// Places each input flush against the end of a readable page that is
// immediately followed by an inaccessible page, so any read by
// Lexer::getEncodedStringSegment past the end of the input would fault.
TEST_F(LexerTest, EncodedStringSegmentPastTheEnd) {
  Expected<size_t> ExptPageSize = llvm::sys::Process::getPageSize();
  ASSERT_TRUE(bool(ExptPageSize));
  const size_t PageSize = *ExptPageSize;
  const size_t MappingSize = PageSize * 2;

  // Reserve two pages with no access at all.
  void *FirstPage = mmap(/*addr*/nullptr, MappingSize, PROT_NONE,
                         MAP_PRIVATE | MAP_ANON, /*fd*/-1, /*offset*/0);
  // Verify the mapping before registering the cleanup, so that munmap is
  // never invoked on MAP_FAILED.
  ASSERT_NE(FirstPage, MAP_FAILED);
  SWIFT_DEFER { (void)munmap(FirstPage, MappingSize); };

  // Open up only the first page; the second one stays inaccessible.
  int ProtectResult = mprotect(FirstPage, PageSize, PROT_READ | PROT_WRITE);
  ASSERT_EQ(ProtectResult, 0);

  // Copies Input so that it ends exactly at the page boundary, then checks
  // the encoded segment against ExpectedText.
  auto check = [FirstPage, PageSize](StringRef Input, StringRef ExpectedText) {
    char *StartPtr = static_cast<char *>(FirstPage) + PageSize - Input.size();
    memcpy(StartPtr, Input.data(), Input.size());

    SmallString<64> Buffer;
    StringRef Escaped = Lexer::getEncodedStringSegment({StartPtr, Input.size()},
                                                       Buffer);
    EXPECT_EQ(Escaped, ExpectedText);
  };

  check("needs escaping\\r",
        "needs escaping\r");
  check("does not need escaping",
        "does not need escaping");
  check("invalid escape at the end \\",
        "invalid escape at the end ");
}
|
|
|
|
#endif // HAS_MMAP
|