[sourcekit] Merge SourceKit into the Swift repo.

The code goes into its own sub-tree under 'tools' but tests go under 'test', so that running 'check-swift' will also run all the SourceKit tests. SourceKit is disabled on non-darwin platforms.
2025-12-21 12:14:44 +01:00 · 2015-11-05 01:06:19 -08:00
parent 598a129b8e
commit 8ff6a98a99
379 changed files with 51992 additions and 1 deletions
--- a/tools/SourceKit/lib/Support/FuzzyStringMatcher.cpp
+++ b/tools/SourceKit/lib/Support/FuzzyStringMatcher.cpp
@@ -0,0 +1,505 @@
+#include "SourceKit/Support/FuzzyStringMatcher.h"
+#include "clang/Basic/CharInfo.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallString.h"
+
+using namespace SourceKit;
+using clang::toUppercase;
+using clang::toLowercase;
+using clang::isUppercase;
+using clang::isLowercase;
+
+FuzzyStringMatcher::FuzzyStringMatcher(StringRef pattern_)
+    : pattern(pattern_), charactersInPattern(1 << (sizeof(char) * 8)) {
+  lowercasePattern.reserve(pattern.size());
+  unsigned upperCharCount = 0;
+  for (char c : pattern) {
+    char lower = toLowercase(c);
+    upperCharCount += (c == lower) ? 0 : 1;
+    lowercasePattern.push_back(lower);
+    charactersInPattern.set(static_cast<unsigned char>(lower));
+    charactersInPattern.set(static_cast<unsigned char>(toUppercase(c)));
+  }
+  assert(pattern.size() == lowercasePattern.size());
+
+  // FIXME: pull out the magic constants.
+  // This depends on the inner details of the matching algorithm and  will need
+  // to be updated if we substantially alter it.
+  if (pattern.size() == 1) {
+    maxScore = 3.0 +  // uppercase match
+               0.001; // size bonus
+  } else {
+    maxScore = 0.25 +                           // percent match bonus
+               2.5 +                            // match at start bonus
+               pattern.size() * pattern.size(); // max run length score
+    if (upperCharCount)                         // max uppercase match score
+      maxScore += (upperCharCount + 1) * (upperCharCount + 1);
+    maxScore *= 1.1; // exact match bonus
+  }
+}
+
+bool FuzzyStringMatcher::matchesCandidate(StringRef candidate) const {
+  unsigned patternLength = pattern.size();
+  unsigned candidateLength = candidate.size();
+  if (patternLength > candidateLength)
+    return false;
+
+  // Do all of the pattern characters match the candidate in order?
+  unsigned pidx = 0, cidx = 0;
+  while (pidx < patternLength && cidx < candidateLength) {
+    char c = candidate[cidx];
+    char p = lowercasePattern[pidx];
+    if (p == c || p == toLowercase(c))
+      ++pidx;
+    ++cidx;
+  }
+
+  return pidx == patternLength;
+}
+
+static bool isTokenizingChar(char c) {
+  switch (c) {
+  case '/':
+  case '.':
+  case '_':
+  case '+':
+  case '-':
+    return true;
+  default:
+    return false;
+  }
+}
+
+namespace {
+/// A simple index range.
+struct Range {
+  unsigned location;
+  unsigned length;
+};
+} // end anonymous namespace
+
+static void
+populateTokenTable(SmallVectorImpl<Range> &tokens,
+                   llvm::MutableArrayRef<unsigned> characterToTokenIndex,
+                   StringRef candidate) {
+  unsigned start = 0;
+  characterToTokenIndex[0] = 0;
+
+  for (unsigned cidx = 1; cidx < candidate.size(); ++cidx) {
+    char current = candidate[cidx];
+    char prev = candidate[cidx - 1];
+
+    // Is this a special tokenizing character like '_', or the start of a camel
+    // case word?  The uppercase character should start a new token.
+    if (isTokenizingChar(prev) ||
+        (isUppercase(current) && !isUppercase(prev)) ||
+        (clang::isDigit(current) && !clang::isDigit(prev))) {
+      tokens.push_back({start, cidx - start});
+      start = cidx;
+
+
+    } else if (isLowercase(current) && isUppercase(prev) && start != cidx - 1) {
+      // Or is this the end of a run of uppercase characters?
+      // E.g. in NSWindow, the 'W' should start a new token.
+      tokens.push_back({start, cidx - start - 1});
+      characterToTokenIndex[cidx - 1] = tokens.size();
+      start = cidx - 1;
+    }
+    characterToTokenIndex[cidx] = tokens.size();
+  }
+
+  tokens.push_back({start, static_cast<unsigned>(candidate.size() - start)});
+}
+
+static constexpr unsigned notFound = ~0U;
+
+namespace {
+/// The candidate-specific matching data and algorithms.
+struct CandidateSpecificMatcher {
+  // The following StringRefs are owned by FuzzyStringMatcher and must outlive
+  // this object.
+  StringRef pattern;
+  StringRef lowercasePattern;
+  StringRef candidate;
+  SmallVector<char, 128> lowercaseCandidate;
+  SmallVector<unsigned, 128> jumpTable; ///< The next matching character index.
+  SmallVector<Range, 128> tokens; ///< Tokenized ranges from the candidate.
+  SmallVector<unsigned, 128> characterToTokenIndex;
+  SmallVector<Range, 128> runs;
+
+  CandidateSpecificMatcher(StringRef pattern, StringRef lowercasePattern,
+                           StringRef candidate,
+                           const llvm::BitVector &charactersInPattern,
+                           unsigned &firstPatternPos);
+
+  /// Calculates the candidate's score, matching the candidate from
+  /// \p firstPatternPos or later.
+  ///
+  /// This drives drives scoreCandidateTrial by trying the possible matches.
+  double scoreCandidate(unsigned firstPatternPos);
+
+  /// Calculates the candidate's score, matching the candidate from
+  /// exactly \p firstPatternPos.
+  double scoreCandidateTrial(unsigned firstPatternPos);
+};
+} // end anonymous namespace
+
+double FuzzyStringMatcher::scoreCandidate(StringRef candidate) const {
+  double finalScore = 0.0;
+  if (candidate.empty() || pattern.empty() || candidate.size() < pattern.size())
+    return finalScore;
+
+  // Single character pattern matching should be simple and fast.  Just look at
+  // the first character.
+  if (pattern.size() == 1) {
+    char c = candidate[0];
+    if (c == pattern[0] && isUppercase(c)) {
+      finalScore = 3.0;
+    } else if (toLowercase(c) == lowercasePattern[0]) {
+      finalScore = 2.0;
+    }
+
+    // Make sure shorter results come first;
+    if (finalScore)
+      finalScore += (1 / static_cast<double>(candidate.size())) * (1 / 1000.0);
+
+    if (normalize)
+      finalScore /= maxScore;
+
+    return finalScore;
+  }
+
+  // FIXME: path separators would be handled here, jumping straight to the last
+  // component if the pattern doesn't contain a separator.
+
+  unsigned firstPatternPos = 0;
+  CandidateSpecificMatcher CSM(pattern, lowercasePattern, candidate,
+                               charactersInPattern, firstPatternPos);
+  finalScore = CSM.scoreCandidate(firstPatternPos);
+
+  if (normalize)
+    finalScore /= maxScore;
+
+  return finalScore;
+}
+
+CandidateSpecificMatcher::CandidateSpecificMatcher(
+    StringRef pattern, StringRef lowercasePattern, StringRef candidate,
+    const llvm::BitVector &charactersInPattern, unsigned &firstPatternPos)
+    : pattern(pattern), lowercasePattern(lowercasePattern),
+      candidate(candidate) {
+
+  assert(!pattern.empty() && pattern.size() <= candidate.size());
+  assert(pattern.size() == lowercasePattern.size());
+
+  // Build a table that points at the next pattern character so we skip
+  // through candidate faster.
+  unsigned candidateLength = candidate.size();
+  jumpTable.resize(candidateLength);
+  lowercaseCandidate.resize(candidateLength);
+  unsigned lastPatternPos = notFound;
+  for (unsigned cidx = candidateLength - 1;; --cidx) {
+    char c = candidate[cidx];
+    lowercaseCandidate[cidx] = toLowercase(c);
+    jumpTable[cidx] = lastPatternPos;
+    if (charactersInPattern[static_cast<unsigned char>(c)])
+      lastPatternPos = cidx;
+
+    if (!cidx)
+      break;
+  }
+  firstPatternPos = lastPatternPos;
+
+  // Build the token table.
+  characterToTokenIndex.resize(candidate.size());
+  populateTokenTable(tokens, characterToTokenIndex, candidate);
+}
+
+double CandidateSpecificMatcher::scoreCandidate(unsigned firstPatternPos) {
+  double finalScore = 0.0;
+
+  // The outer matching loop. We run multiple trials so that "a_b_c_abc"
+  // matching "abc" is matched on the "abc" part instead of the "a_b_c" part.
+  while (firstPatternPos != notFound) {
+    // Quickly skip to the first character that matches. We need
+    // the loop in case the first pattern-character in the
+    // candidate is not the first character in the pattern.
+    while (firstPatternPos != notFound) {
+      if (lowercasePattern[0] == lowercaseCandidate[firstPatternPos])
+        break;
+      firstPatternPos = jumpTable[firstPatternPos];
+    }
+    if (firstPatternPos == notFound)
+      break;
+
+    double trialScore = scoreCandidateTrial(firstPatternPos);
+
+    if (trialScore > finalScore) {
+      finalScore = trialScore;
+      // FIXME: update output ranges, if necessary
+    }
+
+    firstPatternPos = jumpTable[firstPatternPos];
+  }
+
+  return finalScore;
+}
+
+static double scoreRun(unsigned runStart, unsigned runLength,
+                       unsigned prevTokenStart, unsigned tokenIndex,
+                       unsigned uppercaseMatches, bool isTokenizingChar) {
+  if (runLength == 0)
+    return 0.0;
+
+  // We really don't like not matching at token starts, but if it's a long match
+  // give some credit.
+  if (runStart != prevTokenStart && !isTokenizingChar) {
+    if (runLength < 5) {
+      return (runLength < 3) ? 0.0 : runLength;
+    }
+
+    // For really long matches, we'll give a high score.  Pretend it's a bit
+    // shorter.
+    runLength -= 2;
+  }
+
+  // Bonus if the match is the first or second token.
+  double prefixBonus = (runStart == 0) ? 2.5 : ((tokenIndex < 2) ? 1.0 : 0.0);
+  double uppercaseBonus =
+      uppercaseMatches ? (uppercaseMatches + 1) * (uppercaseMatches + 1) : 0.0;
+
+  return (runLength * runLength) + uppercaseBonus + prefixBonus;
+}
+
+double
+CandidateSpecificMatcher::scoreCandidateTrial(unsigned firstPatternPos) {
+  double trialScore = 0.0; /// We run multiple trials so that "a_b_c_abc"
+                           /// matching "abc" is matched on the "abc" part
+                           /// instead of the "a_b_c" part.
+  unsigned uppercaseMatches = 0;
+  unsigned cidx = firstPatternPos;
+  unsigned pidx = 0;
+  unsigned runLength = 0;
+  unsigned runStart = cidx;
+  unsigned nonTokenRuns = 0;
+  unsigned camelCaseLen = 0;
+  unsigned camelCaseLastToken = 0;
+  double camelCaseStartBonus = 0.0;
+  unsigned camelCaseSkips = 0;
+
+  unsigned patternLength = pattern.size();
+  unsigned candidateLength = candidate.size();
+
+  while (pidx < patternLength && cidx < candidateLength) {
+    char lowerPatternChar = lowercasePattern[pidx];
+    char lowerCandidateChar = lowercaseCandidate[cidx];
+    unsigned nextCidx = jumpTable[cidx];
+    bool matched = lowerPatternChar == lowerCandidateChar;
+    if (matched) {
+      if (isUppercase(pattern[pidx]) && isUppercase(candidate[cidx])) {
+        ++uppercaseMatches;
+      }
+
+      ++runLength;
+      ++pidx;
+      if (pidx < patternLength)
+        lowerPatternChar = lowercasePattern[pidx];
+    }
+
+    // If we're skipping forward and were running, the run ended.
+    if (((cidx + 1) != nextCidx) || !matched) {
+      if (runLength) {
+        double runValue =
+            scoreRun(runStart, runLength,
+                     tokens[characterToTokenIndex[runStart]].location,
+                     characterToTokenIndex[runStart], uppercaseMatches,
+                     isTokenizingChar(candidate[runStart]));
+
+        // If it's a poor match in the middle of a token, see if the next char
+        // starts a token and also matches. If so, use it.
+        if (runLength == 1 && pidx > 1 && runValue == 0.0 &&
+            nextCidx != notFound &&
+            characterToTokenIndex[runStart] < tokens.size() - 1) {
+          bool foundIt = false;
+          unsigned charToCheck = matched ? nextCidx : cidx;
+          while (charToCheck != notFound) {
+            if (tokens[characterToTokenIndex[charToCheck]].location ==
+                    charToCheck &&
+                lowercasePattern[pidx - 1] == lowercaseCandidate[charToCheck]) {
+              foundIt = true;
+              break;
+            }
+            charToCheck = jumpTable[charToCheck];
+          }
+
+          if (foundIt) {
+            --pidx;
+            lowerPatternChar = lowercasePattern[pidx];
+            runStart = cidx = charToCheck;
+            runLength = 0;
+            continue;
+          }
+        }
+
+        // We really don't like matches that don't start at a token.
+        if (runValue == 0.0) {
+          ++nonTokenRuns;
+
+        } else {
+          unsigned tokenIndex = characterToTokenIndex[runStart];
+          if (runStart == tokens[tokenIndex].location ||
+              isTokenizingChar(lowerCandidateChar)) {
+            camelCaseLen += runLength;
+
+            // Bonus for matching the beginning of the candidate.
+            if (tokenIndex <= 1) {
+              camelCaseStartBonus = 2.0;
+
+              // Penalty for skipping a token.
+            } else if (tokenIndex != camelCaseLastToken + 1) {
+              camelCaseSkips += tokenIndex - camelCaseLastToken - 1;
+            }
+
+            camelCaseLastToken = tokenIndex;
+
+            if (isTokenizingChar(lowerCandidateChar) && runLength == 1) {
+              --camelCaseLastToken;
+            }
+          }
+        }
+
+        // Accumulate run and reset for next run.
+        trialScore += runValue;
+        runs.push_back({runStart, runLength});
+        uppercaseMatches = 0;
+        runLength = 0;
+      }
+
+      runStart = nextCidx;
+    }
+
+    cidx = nextCidx;
+  }
+
+  // The trial is done, did we find a match?
+  // FIXME: this can happen spuriously in foo => ufDownOb.
+  if (pidx != patternLength)
+    return 0.0;
+
+  // Okay, we found a match.
+
+  // FIXME: this code is largely duplicated with the previous block. There are
+  // some subtle differences that can be seen if you try to remove this one and
+  // check for pidx == patternLength for the other block.
+  if (runLength) {
+    double runValue = scoreRun(
+        runStart, runLength, tokens[characterToTokenIndex[runStart]].location,
+        characterToTokenIndex[runStart], uppercaseMatches,
+        isTokenizingChar(candidate[runStart]));
+
+    // If it's a poor match in the middle of a token, see if the next char
+    // starts a token and also matches. If so, use it.
+    if (runLength == 1 && runValue == 0.0) {
+      unsigned nextCidx = jumpTable[runStart];
+      if (nextCidx != notFound &&
+          characterToTokenIndex[runStart] < tokens.size() - 1) {
+        bool foundIt = false;
+        while (nextCidx != notFound) {
+          if (tokens[characterToTokenIndex[nextCidx]].location == nextCidx &&
+              lowercasePattern[pidx - 1] == lowercaseCandidate[nextCidx]) {
+            foundIt = true;
+            break;
+          }
+          nextCidx = jumpTable[nextCidx];
+        }
+
+        if (foundIt) {
+          runStart = nextCidx;
+          uppercaseMatches +=
+              (isUppercase(pattern[pidx - 1]) &&
+               isUppercase(candidate[runStart])) ? 1 : 0;
+          runValue = scoreRun(runStart, runLength,
+                              tokens[characterToTokenIndex[runStart]].location,
+                              characterToTokenIndex[runStart], uppercaseMatches,
+                              isTokenizingChar(candidate[runStart]));
+        }
+      }
+    }
+
+    // We really don't like matches that don't start at a token.
+    if (runValue == 0.0) {
+      ++nonTokenRuns;
+
+    } else {
+      unsigned tokenIndex = characterToTokenIndex[runStart];
+      if (runStart == tokens[tokenIndex].location ||
+          isTokenizingChar(lowercaseCandidate[runStart])) {
+        camelCaseLen += runLength;
+
+        if (tokenIndex <= 1) {
+          // Bonus for matching the beginning of the candidate.
+          camelCaseStartBonus = 2.0;
+        } else if (tokenIndex != camelCaseLastToken + 1) {
+          // Penalty for skipping a token.
+          camelCaseSkips += tokenIndex - camelCaseLastToken - 1;
+        }
+      }
+    }
+
+    // Accumulate run.
+    trialScore += runValue;
+    runs.push_back({runStart, runLength});
+  }
+
+  // Unless there were bad matches, prefer camel case matches.
+  if (nonTokenRuns == 0 && camelCaseSkips < 3) {
+    double camelCaseScore = (camelCaseLen * camelCaseLen) + camelCaseStartBonus;
+    if (camelCaseSkips == 1) {
+      camelCaseScore *= 0.9;
+    } else if (camelCaseSkips == 2) {
+      camelCaseScore *= 0.8;
+    }
+
+    if (trialScore < camelCaseScore) {
+      // Camel case matched better.
+      trialScore = camelCaseScore;
+    }
+  }
+
+  // FIXME: using the range up to a dot is silly when candidate isn't a file.
+  auto dotLoc = candidate.find_last_of('.');
+  unsigned baseNameLength =
+      dotLoc != StringRef::npos && dotLoc > 1 ? dotLoc : candidateLength;
+
+  // FIXME: file type bonus if we're checking a file path.
+
+  // Add a bit for the percentage of the candidate matched.
+  trialScore += patternLength / static_cast<double>(baseNameLength) * 0.25;
+
+  // Exact matches are even better.
+  if (patternLength >= baseNameLength && runs.size() > 0 &&
+      runs[0].location == 0) {
+    trialScore *= 1.1;
+  }
+
+  // FIXME: popular/unpopular API.
+
+  // We really don't like matches that don't start at a token.
+  switch (nonTokenRuns) {
+  case 0:
+    break;
+  case 1:
+    trialScore *= 0.8;
+    break;
+  case 2:
+    trialScore *= 0.5;
+    break;
+  default:
+    trialScore *= 0.33;
+    break;
+  }
+
+  // FIXME: matched ranges output.
+
+  return trialScore;
+}