[Lexer] Introduce Lexer::getLocForStartOfToken() that returns the location at the start of the token that a given offset points to.

Swift SVN r8281
This commit is contained in:
Argyrios Kyrtzidis
2013-09-16 18:41:16 +00:00
parent e7af4d6c72
commit 5db368ce7b
4 changed files with 116 additions and 2 deletions

View File

@@ -110,6 +110,11 @@ public:
/// \brief Returns the distance in bytes between the given source locations. /// \brief Returns the distance in bytes between the given source locations.
unsigned getByteDistance(SourceLoc Start, SourceLoc End) const; unsigned getByteDistance(SourceLoc Start, SourceLoc End) const;
/// Converts a byte offset within the specified buffer into a SourceLoc by
/// advancing the buffer's start location by \p Offset.
SourceLoc getLocForOffset(unsigned BufferID, unsigned Offset) const {
  auto BufferStart = getLocForBufferStart(BufferID);
  return BufferStart.getAdvancedLoc(Offset);
}
std::pair<unsigned, unsigned> getLineAndColumn(SourceLoc Loc, std::pair<unsigned, unsigned> getLineAndColumn(SourceLoc Loc,
int BufferID = -1) const { int BufferID = -1) const {
assert(Loc.isValid()); assert(Loc.isValid());

View File

@@ -192,6 +192,14 @@ public:
/// \param Loc The source location of the beginning of a token. /// \param Loc The source location of the beginning of a token.
static SourceLoc getLocForEndOfToken(SourceManager &SM, SourceLoc Loc); static SourceLoc getLocForEndOfToken(SourceManager &SM, SourceLoc Loc);
/// Return the start location of the token that the offset in the given buffer
/// points to.
///
/// If the offset points to whitespace (space, tab, '\n' or '\r') the source
/// location will point to the exact offset.
///
/// \param SM The source manager used to look up the buffer and to build the
/// resulting location.
/// \param BufferID Identifies the buffer to inspect.
/// \param Offset Byte offset from the start of that buffer.
///
/// \returns The token start location, or a default-constructed SourceLoc when
/// no memory buffer is found for \p BufferID or \p Offset is greater than the
/// buffer size.
static SourceLoc getLocForStartOfToken(SourceManager &SM, unsigned BufferID,
unsigned Offset);
/// \brief Determines if the given string is a valid non-operator /// \brief Determines if the given string is a valid non-operator
/// identifier. /// identifier.
static bool isIdentifier(StringRef identifier); static bool isIdentifier(StringRef identifier);

View File

@@ -1486,3 +1486,88 @@ SourceLoc Lexer::getLocForEndOfToken(SourceManager &SM, SourceLoc Loc) {
return Loc.getAdvancedLoc(Length); return Loc.getAdvancedLoc(Length);
} }
/// Lexes part of a buffer to find the start of the token covering \p Offset.
///
/// A Lexer is created for \p BufferID with \p BufferStart and \p BufferEnd
/// (presumably the offsets delimiting the range to lex -- confirm against the
/// Lexer constructor) and tokens are pulled from it until one of the
/// following happens:
///  - a token starts after \p Offset: the offset was skipped over, so the
///    location for the raw offset is returned;
///  - a token's extent contains \p Offset: that token's location is returned,
///    except for a string literal whose interpolated expression segment
///    contains the offset, in which case only that segment is re-lexed;
///  - the eof token is lexed without a match: the location for the raw offset
///    is returned.
///
/// \param InInterpolatedString True when this call is the re-lex of an
/// interpolated expression segment. It is only used to assert that no string
/// literal containing \p Offset is found during such a re-lex.
static SourceLoc getLocForStartOfTokenInBuf(SourceManager &SM,
                                            unsigned BufferID,
                                            unsigned Offset,
                                            unsigned BufferStart,
                                            unsigned BufferEnd,
                                            bool InInterpolatedString) {
  Lexer Relexer(SM, BufferID, nullptr, /*InSILMode=*/false,
                /*KeepComments=*/false, BufferStart, BufferEnd);

  Token Current;
  for (;;) {
    Relexer.lex(Current);

    const unsigned TokBegin =
        SM.getLocOffsetInBuffer(Current.getLoc(), BufferID);

    // The token begins beyond the offset, so the offset was not inside any
    // token that was lexed.
    if (TokBegin > Offset)
      break;

    // The token ends at or before the offset: keep lexing unless this was the
    // last token.
    if (Offset >= TokBegin + Current.getLength()) {
      if (Current.is(tok::eof))
        break;
      continue;
    }

    // From here on the current token covers the offset.
    if (Current.isNot(tok::string_literal))
      return Current.getLoc();

    assert(!InInterpolatedString);
    SmallVector<Lexer::StringSegment, 4> Pieces;
    Lexer::getStringLiteralSegments(Current, Pieces, /*Diags=*/0);
    for (auto &Piece : Pieces) {
      const unsigned PieceBegin = SM.getLocOffsetInBuffer(Piece.Loc, BufferID);
      const unsigned PieceEnd = PieceBegin + Piece.Length;

      // This segment starts past the offset; stop looking at segments.
      if (PieceBegin > Offset)
        break;

      // The offset lies inside an interpolated expression: lex just that
      // segment to find the token within it.
      const bool IsExprPiece = Piece.Kind == Lexer::StringSegment::Expr;
      if (IsExprPiece && Offset < PieceEnd)
        return getLocForStartOfTokenInBuf(SM, BufferID, Offset,
                                          /*BufferStart=*/PieceBegin,
                                          /*BufferEnd=*/PieceEnd,
                                          /*InInterpolatedString=*/true);
    }

    // The offset is in the string literal itself, not in an expression.
    return Current.getLoc();
  }

  // No token contained the offset; hand back the location of the offset.
  return SM.getLocForOffset(BufferID, Offset);
}
/// Returns the start location of the token that \p Offset in buffer
/// \p BufferID points to.
///
/// A default-constructed SourceLoc is returned when no memory buffer is found
/// for \p BufferID or when \p Offset is greater than the buffer size. When the
/// byte at \p Offset is a space, tab, '\n' or '\r', the location of the offset
/// itself is returned. Otherwise lexing restarts from the beginning of the
/// line containing the offset (or from the buffer start).
SourceLoc Lexer::getLocForStartOfToken(SourceManager &SM, unsigned BufferID,
                                       unsigned Offset) {
  const llvm::MemoryBuffer *Contents = SM->getMemoryBuffer(BufferID);
  if (Contents == nullptr)
    return SourceLoc();

  StringRef Text = Contents->getBuffer();
  if (Offset > Text.size())
    return SourceLoc();

  const char *TextBegin = Text.data();
  const char *Target = TextBegin + Offset;

  // NOTE(review): Offset == Text.size() is accepted above, so this reads the
  // byte just past the text -- presumably the buffer's terminator; confirm.
  switch (*Target) {
  case '\n':
  case '\r':
  case ' ':
  case '\t':
    // Whitespace is not part of a token; report the offset itself.
    return SM.getLocForOffset(BufferID, Offset);
  default:
    break;
  }

  // Walk backwards to the character following the previous line break, or to
  // the start of the buffer if there is none. Lexing resumes from there.
  const char *LineBegin = Target;
  for (; LineBegin != TextBegin; --LineBegin) {
    if (*LineBegin == '\n' || *LineBegin == '\r') {
      ++LineBegin;
      break;
    }
  }

  return getLocForStartOfTokenInBuf(SM, BufferID, Offset,
                                    /*BufferStart=*/LineBegin - TextBegin,
                                    /*BufferEnd=*/Text.size(),
                                    /*InInterpolatedString=*/false);
}

View File

@@ -9,9 +9,8 @@ using namespace llvm;
// The test fixture. // The test fixture.
class LexerTest : public ::testing::Test { class LexerTest : public ::testing::Test {
SourceManager SourceMgr;
public: public:
SourceManager SourceMgr;
std::vector<Token> tokenizeAndKeepEOF(unsigned BufferID) { std::vector<Token> tokenizeAndKeepEOF(unsigned BufferID) {
Lexer L(SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false); Lexer L(SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false);
@@ -257,3 +256,20 @@ TEST_F(LexerTest, RestoreStopAtCodeCompletion) {
ASSERT_EQ(tok::eof, Tok.getKind()); ASSERT_EQ(tok::eof, Tok.getKind());
} }
// Checks Lexer::getLocForStartOfToken() for offsets inside identifiers,
// whitespace, a plain string literal and an interpolated string literal.
TEST_F(LexerTest, getLocForStartOfToken) {
  const char *Source = "aaa \n \tbbb \"hello\" \"-\\(val)-\"";

  MemoryBuffer *Buf = MemoryBuffer::getMemBuffer(Source);
  unsigned BufferID = SourceMgr->AddNewSourceBuffer(Buf, llvm::SMLoc());

  // Each case pairs a character offset with the offset at which the token
  // containing that character is expected to start.
  struct OffsetCase {
    unsigned CharOffset;
    unsigned TokenOffset;
  };
  const OffsetCase Cases[] = {
    {1, 0}, {2, 0}, {3, 3}, {4, 4}, {6, 6}, {9, 7}, {14, 11},
    // interpolated string
    {20, 19}, {23, 23}, {24, 23}, {25, 23}, {26, 26}, {27, 19}
  };

  for (const OffsetCase &Case : Cases) {
    ASSERT_EQ(Lexer::getLocForStartOfToken(SourceMgr, BufferID,
                                           Case.CharOffset),
              SourceMgr.getLocForOffset(BufferID, Case.TokenOffset));
  }
}