[Lexer] Introduce Lexer::getLocForStartOfToken() that returns the location at the start of the token that a given offset points to.

Swift SVN r8281
2025-12-14 20:36:38 +01:00 · 2013-09-16 18:41:16 +00:00
parent e7af4d6c72
commit 5db368ce7b
4 changed files with 116 additions and 2 deletions
--- a/include/swift/Basic/SourceManager.h
+++ b/include/swift/Basic/SourceManager.h
@@ -110,6 +110,11 @@ public:
  /// \brief Returns the distance in bytes between the given source locations.
  unsigned getByteDistance(SourceLoc Start, SourceLoc End) const;

+  /// Returns the SourceLoc for the byte offset in the specified buffer.
+  SourceLoc getLocForOffset(unsigned BufferID, unsigned Offset) const {
+    return getLocForBufferStart(BufferID).getAdvancedLoc(Offset);
+  }
+
  std::pair<unsigned, unsigned> getLineAndColumn(SourceLoc Loc,
                                                 int BufferID = -1) const {
    assert(Loc.isValid());
--- a/include/swift/Parse/Lexer.h
+++ b/include/swift/Parse/Lexer.h
@@ -192,6 +192,14 @@ public:
  /// \param Loc The source location of the beginning of a token.
  static SourceLoc getLocForEndOfToken(SourceManager &SM, SourceLoc Loc);

+  /// Return the start location of the token that the offset in the given buffer
+  /// points to.
+  ///
+  /// If the offset points to whitespace the source location will point to the
+  /// exact offset.
+  static SourceLoc getLocForStartOfToken(SourceManager &SM, unsigned BufferID,
+                                         unsigned Offset);
+
  /// \brief Determines if the given string is a valid non-operator
  /// identifier.
  static bool isIdentifier(StringRef identifier);
--- a/lib/Parse/Lexer.cpp
+++ b/lib/Parse/Lexer.cpp
@@ -1486,3 +1486,88 @@ SourceLoc Lexer::getLocForEndOfToken(SourceManager &SM, SourceLoc Loc) {
  return Loc.getAdvancedLoc(Length);
 }

+static SourceLoc getLocForStartOfTokenInBuf(SourceManager &SM,
+                                            unsigned BufferID,
+                                            unsigned Offset,
+                                            unsigned BufferStart,
+                                            unsigned BufferEnd,
+                                            bool InInterpolatedString) {
+  Lexer L(SM, BufferID, nullptr, /*InSILMode=*/false, /*KeepComments=*/false,
+          BufferStart, BufferEnd);
+
+  // Lex tokens until we find the token that contains the source location.
+  Token Tok;
+  do {
+    L.lex(Tok);
+
+    unsigned TokOffs = SM.getLocOffsetInBuffer(Tok.getLoc(), BufferID);
+    if (TokOffs > Offset) {
+      // We ended up skipping over the source location entirely, which means
+      // that it points into whitespace. We are done here.
+      break;
+    }
+
+    if (Offset < TokOffs+Tok.getLength()) {
+      // Current token encompasses our source location.
+
+      if (Tok.is(tok::string_literal)) {
+        assert(!InInterpolatedString);
+        SmallVector<Lexer::StringSegment, 4> Segments;
+        Lexer::getStringLiteralSegments(Tok, Segments, /*Diags=*/0);
+        for (auto &Seg : Segments) {
+          unsigned SegOffs = SM.getLocOffsetInBuffer(Seg.Loc, BufferID);
+          unsigned SegEnd = SegOffs+Seg.Length;
+          if (SegOffs > Offset)
+            break;
+
+          // If the offset is inside an interpolated expr segment, re-lex.
+          if (Seg.Kind == Lexer::StringSegment::Expr && Offset < SegEnd)
+            return getLocForStartOfTokenInBuf(SM, BufferID, Offset,
+                                              /*BufferStart=*/SegOffs,
+                                              /*BufferEnd=*/SegEnd,
+                                              /*InInterpolatedString=*/true);
+        }
+      }
+
+      return Tok.getLoc();
+    }
+  } while (Tok.isNot(tok::eof));
+
+  // We've passed our source location; just return the original source location.
+  return SM.getLocForOffset(BufferID, Offset);
+}
+
+SourceLoc Lexer::getLocForStartOfToken(SourceManager &SM, unsigned BufferID,
+                                       unsigned Offset) {
+  const llvm::MemoryBuffer *MemBuffer = SM->getMemoryBuffer(BufferID);
+  if (!MemBuffer)
+    return SourceLoc();
+  StringRef Buffer = MemBuffer->getBuffer();
+
+  const char *BufStart = Buffer.data();
+  if (Offset > Buffer.size())
+    return SourceLoc();
+
+  const char *StrData = BufStart+Offset;
+  // If it points to whitespace return the SourceLoc for it.
+  if (StrData[0] == '\n' || StrData[0] == '\r' ||
+      StrData[0] == ' ' || StrData[0] == '\t')
+    return SM.getLocForOffset(BufferID, Offset);
+
+  // Back up from the current location until we hit the beginning of a line
+  // (or the buffer). We'll relex from that point.
+  const char *LexStart = StrData;
+  while (LexStart != BufStart) {
+    if (LexStart[0] == '\n' || LexStart[0] == '\r') {
+      ++LexStart;
+      break;
+    }
+
+    --LexStart;
+  }
+
+  return getLocForStartOfTokenInBuf(SM, BufferID, Offset,
+                                    /*BufferStart=*/LexStart-BufStart,
+                                    /*BufferEnd=*/Buffer.size(),
+                                    /*InInterpolatedString=*/false);
+}
--- a/unittests/Parse/LexerTests.cpp
+++ b/unittests/Parse/LexerTests.cpp
@@ -9,9 +9,8 @@ using namespace llvm;

 // The test fixture.
 class LexerTest : public ::testing::Test {
-  SourceManager SourceMgr;
-
 public:
+  SourceManager SourceMgr;

  std::vector<Token> tokenizeAndKeepEOF(unsigned BufferID) {
    Lexer L(SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false);
@@ -257,3 +256,20 @@ TEST_F(LexerTest, RestoreStopAtCodeCompletion) {
  ASSERT_EQ(tok::eof, Tok.getKind());
 }

+TEST_F(LexerTest, getLocForStartOfToken) {
+  const char *Source = "aaa \n \tbbb \"hello\" \"-\\(val)-\"";
+
+  MemoryBuffer *Buf = MemoryBuffer::getMemBuffer(Source);
+  unsigned BufferID = SourceMgr->AddNewSourceBuffer(Buf, llvm::SMLoc());
+
+  // First is character offset, second is its token offset.
+  unsigned Offs[][2] =
+    { {1, 0}, {2, 0}, {3, 3}, {4, 4}, {6, 6}, {9, 7}, {14, 11},
+      // interpolated string
+      {20, 19}, {23, 23}, {24, 23}, {25, 23}, {26, 26}, {27, 19} };
+
+  for (auto Pair : Offs) {
+    ASSERT_EQ(Lexer::getLocForStartOfToken(SourceMgr, BufferID, Pair[0]),
+              SourceMgr.getLocForOffset(BufferID, Pair[1]));
+  }
+}