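// Unit tests for the Swift Lexer: basic tokenization, comment and trivia
// retention modes, BOM and hashbang handling at the start of a buffer,
// lexer state save/restore, editor placeholders, embedded-NUL diagnostics,
// and string segment decoding.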
#include "swift/AST/DiagnosticConsumer.h"
|
|
#include "swift/AST/DiagnosticEngine.h"
|
|
#include "swift/Basic/Defer.h"
|
|
#include "swift/Basic/LangOptions.h"
|
|
#include "swift/Basic/SourceManager.h"
|
|
#include "swift/Parse/Lexer.h"
|
|
#include "swift/Subsystems.h"
|
|
#include "llvm/Support/MemoryBuffer.h"
|
|
#include "llvm/Support/Process.h"
|
|
#include "gtest/gtest.h"
|
|
|
|
#if __has_include(<sys/mman.h>)
|
|
# include <sys/mman.h>
|
|
# define HAS_MMAP 1
|
|
#else
|
|
# define HAS_MMAP 0
|
|
#endif
|
|
|
|
using namespace swift;
|
|
using namespace llvm;
|
|
|
|
// The test fixture.
|
|
class LexerTest : public ::testing::Test {
|
|
public:
|
|
LangOptions LangOpts;
|
|
SourceManager SourceMgr;
|
|
|
|
std::vector<Token> tokenizeAndKeepEOF(unsigned BufferID) {
|
|
Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr,
|
|
/*InSILMode=*/false);
|
|
std::vector<Token> Tokens;
|
|
do {
|
|
Tokens.emplace_back();
|
|
L.lex(Tokens.back());
|
|
} while (Tokens.back().isNot(tok::eof));
|
|
return Tokens;
|
|
}
|
|
|
|
std::vector<Token> checkLex(StringRef Source,
|
|
ArrayRef<tok> ExpectedTokens,
|
|
bool KeepComments = false,
|
|
bool KeepEOF = false) {
|
|
unsigned BufID = SourceMgr.addMemBufferCopy(Source);
|
|
|
|
std::vector<Token> Toks;
|
|
if (KeepEOF)
|
|
Toks = tokenizeAndKeepEOF(BufID);
|
|
else
|
|
Toks = tokenize(LangOpts, SourceMgr, BufID, 0, 0, /*Diags=*/nullptr, KeepComments);
|
|
EXPECT_EQ(ExpectedTokens.size(), Toks.size());
|
|
for (unsigned i = 0, e = ExpectedTokens.size(); i != e; ++i) {
|
|
EXPECT_EQ(ExpectedTokens[i], Toks[i].getKind()) << "i = " << i;
|
|
}
|
|
|
|
return Toks;
|
|
}
|
|
|
|
SourceLoc getLocForEndOfToken(SourceLoc Loc) {
|
|
return Lexer::getLocForEndOfToken(SourceMgr, Loc);
|
|
}
|
|
};
|
|
|
|
TEST_F(LexerTest, TokenizeSkipComments) {
  const char *Source =
    "// Blah\n"
    "(/*yo*/)";
  std::vector<tok> ExpectedTokens{ tok::l_paren, tok::r_paren };
  checkLex(Source, ExpectedTokens, /*KeepComments=*/false);
}

TEST_F(LexerTest, TokenizeWithComments) {
  const char *Source =
    "// Blah\n"
    "(/*yo*/)";
  std::vector<tok> ExpectedTokens{
    tok::comment, tok::l_paren, tok::comment, tok::r_paren
  };
  std::vector<Token> Toks = checkLex(Source, ExpectedTokens,
                                     /*KeepComments=*/true);
  EXPECT_EQ(Toks[0].getLength(), 8U);
  EXPECT_EQ(Toks[2].getLength(), 6U);
  EXPECT_EQ(getLocForEndOfToken(Toks[0].getLoc()),
            Toks[0].getLoc().getAdvancedLoc(8));
}

TEST_F(LexerTest, EOFTokenLengthIsZero) {
  const char *Source = "meow";
  std::vector<tok> ExpectedTokens{ tok::identifier, tok::eof };
  std::vector<Token> Toks = checkLex(Source, ExpectedTokens,
                                     /*KeepComments=*/true,
                                     /*KeepEOF=*/true);
  EXPECT_EQ(Toks[1].getLength(), 0U);
}

TEST_F(LexerTest, BrokenStringLiteral1) {
  StringRef Source("\"meow\0", 6);
  std::vector<tok> ExpectedTokens{ tok::unknown, tok::eof };
  std::vector<Token> Toks = checkLex(Source, ExpectedTokens,
                                     /*KeepComments=*/true,
                                     /*KeepEOF=*/true);
  EXPECT_EQ(Toks[0].getLength(), 6U);
  EXPECT_EQ(Toks[1].getLength(), 0U);
}

TEST_F(LexerTest, BrokenStringLiteral2) {
  StringRef Source("\"\\(meow\0", 8);
  std::vector<tok> ExpectedTokens{ tok::unknown, tok::eof };
  std::vector<Token> Toks = checkLex(Source, ExpectedTokens,
                                     /*KeepComments=*/true,
                                     /*KeepEOF=*/true);
  EXPECT_EQ(Toks[0].getLength(), 8U);
  EXPECT_EQ(Toks[1].getLength(), 0U);
}

TEST_F(LexerTest, StringLiteralWithNUL1) {
  StringRef Source("\"\0\"", 3);
  std::vector<tok> ExpectedTokens{ tok::string_literal, tok::eof };
  std::vector<Token> Toks = checkLex(Source, ExpectedTokens,
                                     /*KeepComments=*/true,
                                     /*KeepEOF=*/true);
  EXPECT_EQ(Toks[0].getLength(), 3U);
  EXPECT_EQ(Toks[1].getLength(), 0U);
}

TEST_F(LexerTest, ContentStartHashbangSkip) {
  const char *Source = "#!/usr/bin/swift\naaa";

  LangOptions LangOpts;
  SourceManager SourceMgr;
  unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source));

  Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false);

  Token Tok;

  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("aaa", Tok.getText());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 17), Tok.getLoc());
}

TEST_F(LexerTest, ContentStartHashbangSkipUTF8BOM) {
  const char *Source = "\xEF\xBB\xBF" "#!/usr/bin/swift\naaa";

  LangOptions LangOpts;
  SourceManager SourceMgr;
  unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source));

  Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false);

  Token Tok;

  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("aaa", Tok.getText());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 20), Tok.getLoc());
}

TEST_F(LexerTest, ContentStartOperatorLeftBound) {
  const char *Source = "+a";

  LangOptions LangOpts;
  SourceManager SourceMgr;
  unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source));

  Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false);

  Token Tok;

  L.lex(Tok);
  ASSERT_EQ(tok::oper_prefix, Tok.getKind());
  ASSERT_EQ("+", Tok.getText());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 0), Tok.getLoc());
}

TEST_F(LexerTest, ContentStartOperatorLeftBoundUTF8BOM) {
  const char *Source = "\xEF\xBB\xBF" "+a";

  LangOptions LangOpts;
  SourceManager SourceMgr;
  unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source));

  Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false);

  Token Tok;

  L.lex(Tok);
  ASSERT_EQ(tok::oper_prefix, Tok.getKind());
  ASSERT_EQ("+", Tok.getText());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 3), Tok.getLoc());
}

TEST_F(LexerTest, ContentStartConflictMarker) {
  const char *Source =
    "<<<<<<< HEAD\n"
    "xxx\n"
    "=======\n"
    "yyy\n"
    ">>>>>>> 12345670\n"
    "aaa";

  LangOptions LangOpts;
  SourceManager SourceMgr;
  unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source));

  Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false);

  Token Tok;

  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("aaa", Tok.getText());
}

TEST_F(LexerTest, ContentStartConflictMarkerUTF8BOM) {
  const char *Source =
    "\xEF\xBB\xBF"
    "<<<<<<< HEAD\n"
    "xxx\n"
    "=======\n"
    "yyy\n"
    ">>>>>>> 12345670\n"
    "aaa";

  LangOptions LangOpts;
  SourceManager SourceMgr;
  unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source));

  Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false);

  Token Tok;

  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("aaa", Tok.getText());
}

TEST_F(LexerTest, ContentStartTokenIsStartOfLine) {
  const char *Source = "aaa";

  LangOptions LangOpts;
  SourceManager SourceMgr;
  unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source));

  Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false);

  Token Tok;

  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("aaa", Tok.getText());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 0), Tok.getLoc());
  ASSERT_TRUE(Tok.isAtStartOfLine());
}

TEST_F(LexerTest, ContentStartTokenIsStartOfLineUTF8BOM) {
  const char *Source = "\xEF\xBB\xBF" "aaa";

  LangOptions LangOpts;
  SourceManager SourceMgr;
  unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source));

  Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false);

  Token Tok;

  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("aaa", Tok.getText());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 3), Tok.getLoc());
  ASSERT_TRUE(Tok.isAtStartOfLine());
}

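// The following BOM* tests all lex the same buffer,
// "\xEF\xBB\xBF" "// comment\naaa //xx \n/* x */", under different comment
// and trivia retention modes. For reference, the BOM occupies offsets 0-2,
// "// comment\n" starts at offset 3, "aaa" at 14, "//xx \n" at 18,
// "/* x */" at 24, and the end of the buffer is at offset 31.
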
TEST_F(LexerTest, BOMNoCommentNoTrivia) {
  const char *Source = "\xEF\xBB\xBF" "// comment\naaa //xx \n/* x */";

  LangOptions LangOpts;
  SourceManager SourceMgr;
  unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source));

  Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false,
          HashbangMode::Disallowed, CommentRetentionMode::None,
          TriviaRetentionMode::WithoutTrivia);

  Token Tok;
  syntax::Trivia LeadingTrivia, TrailingTrivia;

  L.lex(Tok, LeadingTrivia, TrailingTrivia);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("aaa", Tok.getText());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getLoc());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getCommentRange().getStart());
  ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
  ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia);
  ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);

  L.lex(Tok, LeadingTrivia, TrailingTrivia);
  ASSERT_EQ(tok::eof, Tok.getKind());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 31), Tok.getLoc());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 31), Tok.getCommentRange().getStart());
  ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
  ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia);
  ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
}

TEST_F(LexerTest, BOMTokenCommentNoTrivia) {
  const char *Source = "\xEF\xBB\xBF" "// comment\naaa //xx \n/* x */";

  LangOptions LangOpts;
  SourceManager SourceMgr;
  unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source));

  Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false,
          HashbangMode::Disallowed, CommentRetentionMode::ReturnAsTokens,
          TriviaRetentionMode::WithoutTrivia);

  Token Tok;
  syntax::Trivia LeadingTrivia, TrailingTrivia;

  L.lex(Tok, LeadingTrivia, TrailingTrivia);
  ASSERT_EQ(tok::comment, Tok.getKind());
  ASSERT_EQ("// comment\n", Tok.getText());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 3), Tok.getLoc());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 3), Tok.getCommentRange().getStart());
  ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
  ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia);
  ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);

  L.lex(Tok, LeadingTrivia, TrailingTrivia);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("aaa", Tok.getText());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getLoc());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getCommentRange().getStart());
  ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
  ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia);
  ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);

  L.lex(Tok, LeadingTrivia, TrailingTrivia);
  ASSERT_EQ(tok::comment, Tok.getKind());
  ASSERT_EQ("//xx \n", Tok.getText());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 18), Tok.getLoc());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 18), Tok.getCommentRange().getStart());
  ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
  ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia);
  ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);

  L.lex(Tok, LeadingTrivia, TrailingTrivia);
  ASSERT_EQ(tok::comment, Tok.getKind());
  ASSERT_EQ("/* x */", Tok.getText());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 24), Tok.getLoc());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 24), Tok.getCommentRange().getStart());
  ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
  ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia);
  ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);

  L.lex(Tok, LeadingTrivia, TrailingTrivia);
  ASSERT_EQ(tok::eof, Tok.getKind());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 31), Tok.getLoc());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 31), Tok.getCommentRange().getStart());
  ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
  ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia);
  ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
}

TEST_F(LexerTest, BOMAttachCommentNoTrivia) {
  const char *Source = "\xEF\xBB\xBF" "// comment\naaa //xx \n/* x */";

  LangOptions LangOpts;
  SourceManager SourceMgr;
  unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source));

  Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false,
          HashbangMode::Disallowed, CommentRetentionMode::AttachToNextToken,
          TriviaRetentionMode::WithoutTrivia);

  Token Tok;
  syntax::Trivia LeadingTrivia, TrailingTrivia;

  L.lex(Tok, LeadingTrivia, TrailingTrivia);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("aaa", Tok.getText());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getLoc());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 3), Tok.getCommentRange().getStart());
  ASSERT_EQ(10u, Tok.getCommentRange().getByteLength());
  ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia);
  ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);

  L.lex(Tok, LeadingTrivia, TrailingTrivia);
  ASSERT_EQ(tok::eof, Tok.getKind());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 31), Tok.getLoc());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 18), Tok.getCommentRange().getStart());
  ASSERT_EQ(13u, Tok.getCommentRange().getByteLength());
  ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia);
  ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
}

TEST_F(LexerTest, BOMNoCommentTrivia) {
  const char *Source = "\xEF\xBB\xBF" "// comment\naaa //xx \n/* x */";

  LangOptions LangOpts;
  SourceManager SourceMgr;
  unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source));

  Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false,
          HashbangMode::Disallowed, CommentRetentionMode::None,
          TriviaRetentionMode::WithTrivia);

  Token Tok;
  syntax::Trivia LeadingTrivia, TrailingTrivia;

  L.lex(Tok, LeadingTrivia, TrailingTrivia);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("aaa", Tok.getText());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getLoc());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getCommentRange().getStart());
  ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
  ASSERT_EQ((syntax::Trivia{{
    syntax::TriviaPiece::garbageText("\xEF\xBB\xBF"),
    syntax::TriviaPiece::lineComment("// comment"),
    syntax::TriviaPiece::newlines(1)
  }}), LeadingTrivia);
  ASSERT_EQ((syntax::Trivia{{
    syntax::TriviaPiece::spaces(1)
  }}), TrailingTrivia);

  L.lex(Tok, LeadingTrivia, TrailingTrivia);
  ASSERT_EQ(tok::eof, Tok.getKind());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 31), Tok.getLoc());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 31), Tok.getCommentRange().getStart());
  ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
  ASSERT_EQ((syntax::Trivia{{
    syntax::TriviaPiece::lineComment("//xx "),
    syntax::TriviaPiece::newlines(1),
    syntax::TriviaPiece::blockComment("/* x */")
  }}), LeadingTrivia);
  ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
}

TEST_F(LexerTest, BOMAttachCommentTrivia) {
  const char *Source = "\xEF\xBB\xBF" "// comment\naaa //xx \n/* x */";

  LangOptions LangOpts;
  SourceManager SourceMgr;
  unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source));

  Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false,
          HashbangMode::Disallowed, CommentRetentionMode::AttachToNextToken,
          TriviaRetentionMode::WithTrivia);

  Token Tok;
  syntax::Trivia LeadingTrivia, TrailingTrivia;

  L.lex(Tok, LeadingTrivia, TrailingTrivia);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("aaa", Tok.getText());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getLoc());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 3), Tok.getCommentRange().getStart());
  ASSERT_EQ(10u, Tok.getCommentRange().getByteLength());
  ASSERT_EQ((syntax::Trivia{{
    syntax::TriviaPiece::garbageText("\xEF\xBB\xBF"),
    syntax::TriviaPiece::lineComment("// comment"),
    syntax::TriviaPiece::newlines(1)
  }}), LeadingTrivia);
  ASSERT_EQ((syntax::Trivia{{
    syntax::TriviaPiece::spaces(1)
  }}), TrailingTrivia);

  L.lex(Tok, LeadingTrivia, TrailingTrivia);
  ASSERT_EQ(tok::eof, Tok.getKind());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 31), Tok.getLoc());
  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 18), Tok.getCommentRange().getStart());
  ASSERT_EQ(13u, Tok.getCommentRange().getByteLength());
  ASSERT_EQ((syntax::Trivia{{
    syntax::TriviaPiece::lineComment("//xx "),
    syntax::TriviaPiece::newlines(1),
    syntax::TriviaPiece::blockComment("/* x */")
  }}), LeadingTrivia);
  ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
}

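// The Restore* tests check that saving a state with
// getStateForBeginningOfToken() and later calling restoreState() replays the
// same token stream, including the isAtStartOfLine flag and code-completion
// behavior.
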
TEST_F(LexerTest, RestoreBasic) {
  const char *Source = "aaa \t\0 bbb ccc";

  LangOptions LangOpts;
  SourceManager SourceMgr;
  unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source, 14));

  Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false);

  Token Tok;

  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("aaa", Tok.getText());
  ASSERT_TRUE(Tok.isAtStartOfLine());

  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("bbb", Tok.getText());
  ASSERT_FALSE(Tok.isAtStartOfLine());

  LexerState S = L.getStateForBeginningOfToken(Tok);

  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("ccc", Tok.getText());
  ASSERT_FALSE(Tok.isAtStartOfLine());

  L.lex(Tok);
  ASSERT_EQ(tok::eof, Tok.getKind());

  L.restoreState(S);

  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("bbb", Tok.getText());
  ASSERT_FALSE(Tok.isAtStartOfLine());

  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("ccc", Tok.getText());
  ASSERT_FALSE(Tok.isAtStartOfLine());

  L.lex(Tok);
  ASSERT_EQ(tok::eof, Tok.getKind());
}

TEST_F(LexerTest, RestoreNewlineFlag) {
  const char *Source = "aaa \n \0\tbbb \nccc";

  LangOptions LangOpts;
  SourceManager SourceMgr;
  unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source, 16));

  Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false);

  Token Tok;

  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("aaa", Tok.getText());
  ASSERT_TRUE(Tok.isAtStartOfLine());

  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("bbb", Tok.getText());
  ASSERT_TRUE(Tok.isAtStartOfLine());

  LexerState S = L.getStateForBeginningOfToken(Tok);

  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("ccc", Tok.getText());
  ASSERT_TRUE(Tok.isAtStartOfLine());

  L.lex(Tok);
  ASSERT_EQ(tok::eof, Tok.getKind());

  L.restoreState(S);

  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("bbb", Tok.getText());
  ASSERT_TRUE(Tok.isAtStartOfLine());

  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("ccc", Tok.getText());
  ASSERT_TRUE(Tok.isAtStartOfLine());

  L.lex(Tok);
  ASSERT_EQ(tok::eof, Tok.getKind());
}

TEST_F(LexerTest, RestoreStopAtCodeCompletion) {
  const char *Source = "aaa \n \0\tbbb \nccc";

  LangOptions LangOpts;
  SourceManager SourceMgr;
  unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source, 16));
  SourceMgr.setCodeCompletionPoint(BufferID, 6);

  Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false);

  Token Tok;

  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("aaa", Tok.getText());
  ASSERT_TRUE(Tok.isAtStartOfLine());

  L.lex(Tok);
  ASSERT_EQ(tok::code_complete, Tok.getKind());
  ASSERT_TRUE(Tok.isAtStartOfLine());

  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("bbb", Tok.getText());
  ASSERT_FALSE(Tok.isAtStartOfLine());

  LexerState S = L.getStateForBeginningOfToken(Tok);

  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("ccc", Tok.getText());
  ASSERT_TRUE(Tok.isAtStartOfLine());

  L.lex(Tok);
  ASSERT_EQ(tok::eof, Tok.getKind());

  L.restoreState(S);

  // Ensure that we don't get tok::code_complete here. We saved the lexer
  // position after it, so we should not be getting it.

  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("bbb", Tok.getText());
  ASSERT_FALSE(Tok.isAtStartOfLine());

  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("ccc", Tok.getText());
  ASSERT_TRUE(Tok.isAtStartOfLine());

  L.lex(Tok);
  ASSERT_EQ(tok::eof, Tok.getKind());
}

TEST_F(LexerTest, getLocForStartOfToken) {
  const char *Source = "aaa \n \tbbb \"hello\" \"-\\(val)-\"";

  unsigned BufferID = SourceMgr.addMemBufferCopy(Source);

  // First is character offset, second is its token offset.
  unsigned Offs[][2] =
    { {1, 0}, {2, 0}, {3, 3}, {4, 4}, {6, 6}, {9, 7}, {14, 11},
      // interpolated string
      {20, 19}, {23, 23}, {24, 23}, {25, 23}, {26, 26}, {27, 19} };

  for (auto Pair : Offs) {
    ASSERT_EQ(Lexer::getLocForStartOfToken(SourceMgr, BufferID, Pair[0]),
              SourceMgr.getLocForOffset(BufferID, Pair[1]));
  }
}

TEST_F(LexerTest, getLocForStartOfTokenWithCustomSourceLocation) {
  const char *Source =
    "aaa \n"
    // This next line is exactly 50 bytes to make it easy to compare with the
    // previous test.
    "#sourceLocation(file: \"custom-50.swuft\", line: 9)\n"
    " \tbbb \"hello\" \"-\\(val)-\"";

  unsigned BufferID = SourceMgr.addMemBufferCopy(Source);

  // First is character offset, second is its token offset.
  unsigned Offs[][2] =
    { {1, 0}, {2, 0}, {3, 3}, {4, 4},
      {56, 56}, {59, 57}, {64, 61},
      // interpolated string
      {70, 69}, {73, 73}, {74, 73}, {75, 73}, {76, 76}, {77, 69} };

  for (auto Pair : Offs) {
    ASSERT_EQ(Lexer::getLocForStartOfToken(SourceMgr, BufferID, Pair[0]),
              SourceMgr.getLocForOffset(BufferID, Pair[1]));
  }
}

TEST_F(LexerTest, NestedSubLexers) {
  const char *Source = "aaa0 bbb1 ccc2 ddd3 eee4 fff5 ggg6";

  unsigned BufferID = SourceMgr.addMemBufferCopy(Source);

  Lexer Primary(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr,
                /*InSILMode=*/false);
  std::vector<Token> TokensPrimary;
  do {
    TokensPrimary.emplace_back();
    Primary.lex(TokensPrimary.back());
  } while (TokensPrimary.back().isNot(tok::eof));
  ASSERT_EQ(8U, TokensPrimary.size());
  ASSERT_EQ(tok::eof, TokensPrimary.back().getKind());

  Lexer Sub1(Primary, Primary.getStateForBeginningOfToken(TokensPrimary[1]),
             Primary.getStateForBeginningOfToken(*(TokensPrimary.end() - 2)));
  std::vector<Token> TokensSub1;
  do {
    TokensSub1.emplace_back();
    Sub1.lex(TokensSub1.back());
  } while (TokensSub1.back().isNot(tok::eof));
  ASSERT_EQ(6U, TokensSub1.size());
  ASSERT_EQ("bbb1", TokensSub1.front().getText());
  ASSERT_EQ("fff5", (TokensSub1.end() - 2)->getText());
  ASSERT_EQ(tok::eof, TokensSub1.back().getKind());
  ASSERT_EQ("ggg6", TokensSub1.back().getText());

  Lexer Sub2(Sub1, Sub1.getStateForBeginningOfToken(TokensSub1[1]),
             Sub1.getStateForBeginningOfToken(*(TokensSub1.end() - 2)));
  std::vector<Token> TokensSub2;
  do {
    TokensSub2.emplace_back();
    Sub2.lex(TokensSub2.back());
  } while (TokensSub2.back().isNot(tok::eof));
  ASSERT_EQ(4U, TokensSub2.size());
  ASSERT_EQ("ccc2", TokensSub2.front().getText());
  ASSERT_EQ("eee4", (TokensSub2.end() - 2)->getText());
  ASSERT_EQ(tok::eof, TokensSub2.back().getKind());
  ASSERT_EQ("fff5", TokensSub2.back().getText());
}

TEST_F(LexerTest, TokenizePlaceholder) {
  const char *Source = "aa <#one#> bb <# two #>";
  std::vector<tok> ExpectedTokens{
    tok::identifier, tok::identifier, tok::identifier, tok::identifier
  };
  std::vector<Token> Toks = checkLex(Source, ExpectedTokens);
  EXPECT_EQ("aa", Toks[0].getText());
  EXPECT_EQ("<#one#>", Toks[1].getText());
  EXPECT_EQ("bb", Toks[2].getText());
  EXPECT_EQ("<# two #>", Toks[3].getText());
}

TEST_F(LexerTest, NoPlaceholder) {
  auto checkTok = [&](StringRef Source) {
    unsigned BufID = SourceMgr.addMemBufferCopy(Source);
    std::vector<Token> Toks = tokenize(LangOpts, SourceMgr, BufID, 0, 0, /*Diags=*/nullptr, false);
    ASSERT_FALSE(Toks.empty());
    EXPECT_NE(tok::identifier, Toks[0].getKind());
  };
  checkTok("<#");
  checkTok("<#a#");
  checkTok("<#a\n#>");
  checkTok("< #a#>");
}

TEST_F(LexerTest, NestedPlaceholder) {
  const char *Source = "<#<#aa#>#>";
  std::vector<tok> ExpectedTokens{
    tok::oper_prefix, tok::pound, tok::identifier, tok::pound, tok::oper_postfix
  };
  std::vector<Token> Toks = checkLex(Source, ExpectedTokens);
  EXPECT_EQ("<#aa#>", Toks[2].getText());
}

class StringCaptureDiagnosticConsumer : public DiagnosticConsumer {
public:
  virtual void handleDiagnostic(SourceManager &SM, SourceLoc Loc,
                                DiagnosticKind Kind, StringRef FormatString,
                                ArrayRef<DiagnosticArgument> FormatArgs,
                                const swift::DiagnosticInfo &Info) override {
    std::string DiagMsg;
    llvm::raw_string_ostream DiagOS(DiagMsg);
    DiagnosticEngine::formatDiagnosticText(DiagOS, FormatString, FormatArgs);
    auto LC = SM.getLineAndColumn(Loc);
    std::ostringstream StrOS;
    StrOS << LC.first << ", " << LC.second << ": " << DiagOS.str();
    messages.push_back(StrOS.str());
  }

  std::vector<std::string> messages;
};

bool containsPrefix(const std::vector<std::string> &strs,
                    const std::string &prefix) {
  for (auto &str : strs) {
    if (StringRef(str).startswith(StringRef(prefix))) {
      return true;
    }
  }
  return false;
}

TEST_F(LexerTest, DiagnoseEmbeddedNul) {
  const char Source[] = " \0 \0 aaa \0 \0 bbb";
  size_t SourceLen = sizeof(Source) - 1;

  LangOptions LangOpts;
  SourceManager SourceMgr;
  unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source, SourceLen));

  StringCaptureDiagnosticConsumer DiagConsumer;
  DiagnosticEngine Diags(SourceMgr);
  Diags.addConsumer(DiagConsumer);

  Lexer L(LangOpts, SourceMgr, BufferID, &Diags,
          /*InSILMode=*/false, HashbangMode::Disallowed,
          CommentRetentionMode::None, TriviaRetentionMode::WithTrivia);

  ASSERT_TRUE(containsPrefix(DiagConsumer.messages,
                             "1, 2: nul character embedded in middle of file"));
  ASSERT_TRUE(containsPrefix(DiagConsumer.messages,
                             "1, 4: nul character embedded in middle of file"));
}

TEST_F(LexerTest, DiagnoseEmbeddedNulOffset) {
  const char Source[] = " \0 \0 aaa \0 \0 bbb";
  size_t SourceLen = sizeof(Source) - 1;

  LangOptions LangOpts;
  SourceManager SourceMgr;
  unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source, SourceLen));

  StringCaptureDiagnosticConsumer DiagConsumer;
  DiagnosticEngine Diags(SourceMgr);
  Diags.addConsumer(DiagConsumer);

  Lexer L(LangOpts, SourceMgr, BufferID, &Diags,
          /*InSILMode=*/false, HashbangMode::Disallowed,
          CommentRetentionMode::None, TriviaRetentionMode::WithTrivia,
          /*Offset=*/5, /*EndOffset=*/SourceLen);

  ASSERT_FALSE(containsPrefix(
      DiagConsumer.messages, "1, 2: nul character embedded in middle of file"));
  ASSERT_FALSE(containsPrefix(
      DiagConsumer.messages, "1, 4: nul character embedded in middle of file"));
}

#if HAS_MMAP

// This test requires mmap because llvm::sys::Memory doesn't support protecting
// pages to have no permissions.
TEST_F(LexerTest, EncodedStringSegmentPastTheEnd) {
  size_t PageSize = llvm::sys::Process::getPageSize();

  // Map two pages, make only the first readable/writable, and place each input
  // flush against the page boundary below. If string segment decoding ever
  // reads past the end of the segment again (rdar://problem/44306756), the
  // access lands on the PROT_NONE page and crashes instead of passing
  // silently.
  void *FirstPage = mmap(/*addr*/nullptr, PageSize * 2, PROT_NONE,
                         MAP_PRIVATE | MAP_ANON, /*fd*/-1, /*offset*/0);
  SWIFT_DEFER { (void)munmap(FirstPage, PageSize * 2); };
  ASSERT_NE(FirstPage, MAP_FAILED);
  int ProtectResult = mprotect(FirstPage, PageSize, PROT_READ | PROT_WRITE);
  ASSERT_EQ(ProtectResult, 0);

  auto check = [FirstPage, PageSize](StringRef Input, StringRef Expected) {
    char *StartPtr = static_cast<char *>(FirstPage) + PageSize - Input.size();
    memcpy(StartPtr, Input.data(), Input.size());

    SmallString<64> Buffer;
    StringRef Escaped = Lexer::getEncodedStringSegment({StartPtr, Input.size()},
                                                       Buffer);
    EXPECT_EQ(Escaped, Expected);
  };

  check("needs escaping\\r",
        "needs escaping\r");
  check("does not need escaping",
        "does not need escaping");
  check("invalid escape at the end \\",
        "invalid escape at the end ");
}

#endif // HAS_MMAP