mirror of
https://github.com/apple/swift.git
synced 2025-12-21 12:14:44 +01:00
[sourcekit] Merge SourceKit into the Swift repo.
The code goes into its own sub-tree under 'tools' but tests go under 'test', so that running 'check-swift' will also run all the SourceKit tests. SourceKit is disabled on non-darwin platforms.
This commit is contained in:
505
tools/SourceKit/lib/Support/FuzzyStringMatcher.cpp
Normal file
505
tools/SourceKit/lib/Support/FuzzyStringMatcher.cpp
Normal file
@@ -0,0 +1,505 @@
|
||||
#include "SourceKit/Support/FuzzyStringMatcher.h"
|
||||
#include "clang/Basic/CharInfo.h"
|
||||
#include "llvm/ADT/ArrayRef.h"
|
||||
#include "llvm/ADT/SmallString.h"
|
||||
|
||||
using namespace SourceKit;
|
||||
using clang::toUppercase;
|
||||
using clang::toLowercase;
|
||||
using clang::isUppercase;
|
||||
using clang::isLowercase;
|
||||
|
||||
/// Precomputes the per-pattern data used by every later match: the lowercased
/// pattern, the set of characters (in both cases) that occur in the pattern,
/// and the maximum score used when normalizing.
FuzzyStringMatcher::FuzzyStringMatcher(StringRef pattern_)
    : pattern(pattern_), charactersInPattern(1 << (sizeof(char) * 8)) {
  unsigned numUppercase = 0;
  lowercasePattern.reserve(pattern.size());
  for (char original : pattern) {
    const char folded = toLowercase(original);
    // A character that changes when lowercased counts as uppercase.
    if (folded != original)
      ++numUppercase;
    lowercasePattern.push_back(folded);
    // Record both cases so candidate characters can be tested directly.
    charactersInPattern.set(static_cast<unsigned char>(folded));
    charactersInPattern.set(static_cast<unsigned char>(toUppercase(original)));
  }
  assert(pattern.size() == lowercasePattern.size());

  // FIXME: pull out the magic constants.
  // The values below mirror the inner details of the matching algorithm and
  // will need to be updated if we substantially alter it.
  if (pattern.size() == 1) {
    maxScore = 3.0 +  // uppercase match
               0.001; // size bonus
    return;
  }

  maxScore = 0.25 +                           // percent match bonus
             2.5 +                            // match at start bonus
             pattern.size() * pattern.size(); // max run length score
  if (numUppercase) // max uppercase match score
    maxScore += (numUppercase + 1) * (numUppercase + 1);
  maxScore *= 1.1; // exact match bonus
}
|
||||
|
||||
/// Returns true if every pattern character appears in \p candidate, in order
/// and ignoring case (i.e. the pattern is a case-insensitive subsequence).
bool FuzzyStringMatcher::matchesCandidate(StringRef candidate) const {
  const unsigned numPattern = pattern.size();
  const unsigned numCandidate = candidate.size();
  // A candidate shorter than the pattern can never contain it.
  if (numCandidate < numPattern)
    return false;

  // Walk the candidate once, consuming a pattern character on each hit.
  unsigned consumed = 0;
  for (unsigned pos = 0; pos < numCandidate && consumed < numPattern; ++pos) {
    const char ch = candidate[pos];
    const char wanted = lowercasePattern[consumed];
    if (wanted == ch || wanted == toLowercase(ch))
      ++consumed;
  }

  return consumed == numPattern;
}
|
||||
|
||||
/// Returns true for the separator characters that split a candidate into
/// tokens: '/', '.', '_', '+' and '-'.
static bool isTokenizingChar(char c) {
  return c == '/' || c == '.' || c == '_' || c == '+' || c == '-';
}
|
||||
|
||||
namespace {

/// An index range described by its start and its extent.
///
/// Kept as a plain aggregate so it can be brace-initialized as
/// {location, length}.
struct Range {
  unsigned location; ///< Index of the first element in the range.
  unsigned length;   ///< Number of elements covered by the range.
};

} // end anonymous namespace
|
||||
|
||||
static void
|
||||
populateTokenTable(SmallVectorImpl<Range> &tokens,
|
||||
llvm::MutableArrayRef<unsigned> characterToTokenIndex,
|
||||
StringRef candidate) {
|
||||
unsigned start = 0;
|
||||
characterToTokenIndex[0] = 0;
|
||||
|
||||
for (unsigned cidx = 1; cidx < candidate.size(); ++cidx) {
|
||||
char current = candidate[cidx];
|
||||
char prev = candidate[cidx - 1];
|
||||
|
||||
// Is this a special tokenizing character like '_', or the start of a camel
|
||||
// case word? The uppercase character should start a new token.
|
||||
if (isTokenizingChar(prev) ||
|
||||
(isUppercase(current) && !isUppercase(prev)) ||
|
||||
(clang::isDigit(current) && !clang::isDigit(prev))) {
|
||||
tokens.push_back({start, cidx - start});
|
||||
start = cidx;
|
||||
|
||||
|
||||
} else if (isLowercase(current) && isUppercase(prev) && start != cidx - 1) {
|
||||
// Or is this the end of a run of uppercase characters?
|
||||
// E.g. in NSWindow, the 'W' should start a new token.
|
||||
tokens.push_back({start, cidx - start - 1});
|
||||
characterToTokenIndex[cidx - 1] = tokens.size();
|
||||
start = cidx - 1;
|
||||
}
|
||||
characterToTokenIndex[cidx] = tokens.size();
|
||||
}
|
||||
|
||||
tokens.push_back({start, static_cast<unsigned>(candidate.size() - start)});
|
||||
}
|
||||
|
||||
/// Sentinel index meaning "no such position". It terminates the jump-table
/// chain of candidate positions and marks the absence of a first match.
static constexpr unsigned notFound = ~0U;
|
||||
|
||||
namespace {
|
||||
/// The candidate-specific matching data and algorithms.
|
||||
struct CandidateSpecificMatcher {
|
||||
// The following StringRefs are owned by FuzzyStringMatcher and must outlive
|
||||
// this object.
|
||||
StringRef pattern;
|
||||
StringRef lowercasePattern;
|
||||
StringRef candidate;
|
||||
SmallVector<char, 128> lowercaseCandidate;
|
||||
SmallVector<unsigned, 128> jumpTable; ///< The next matching character index.
|
||||
SmallVector<Range, 128> tokens; ///< Tokenized ranges from the candidate.
|
||||
SmallVector<unsigned, 128> characterToTokenIndex;
|
||||
SmallVector<Range, 128> runs;
|
||||
|
||||
CandidateSpecificMatcher(StringRef pattern, StringRef lowercasePattern,
|
||||
StringRef candidate,
|
||||
const llvm::BitVector &charactersInPattern,
|
||||
unsigned &firstPatternPos);
|
||||
|
||||
/// Calculates the candidate's score, matching the candidate from
|
||||
/// \p firstPatternPos or later.
|
||||
///
|
||||
/// This drives drives scoreCandidateTrial by trying the possible matches.
|
||||
double scoreCandidate(unsigned firstPatternPos);
|
||||
|
||||
/// Calculates the candidate's score, matching the candidate from
|
||||
/// exactly \p firstPatternPos.
|
||||
double scoreCandidateTrial(unsigned firstPatternPos);
|
||||
};
|
||||
} // end anonymous namespace
|
||||
|
||||
/// Scores how well \p candidate matches the pattern; higher is better and 0
/// means no match. When \c normalize is set the score is divided by
/// \c maxScore.
double FuzzyStringMatcher::scoreCandidate(StringRef candidate) const {
  // Nothing to match, or the candidate is too short to contain the pattern.
  if (candidate.empty() || pattern.empty() || candidate.size() < pattern.size())
    return 0.0;

  double score = 0.0;
  if (pattern.size() == 1) {
    // Single character pattern matching should be simple and fast. Just look
    // at the first character.
    const char first = candidate[0];
    if (isUppercase(first) && first == pattern[0])
      score = 3.0;
    else if (lowercasePattern[0] == toLowercase(first))
      score = 2.0;

    // Make sure shorter results come first.
    if (score != 0.0)
      score += (1 / static_cast<double>(candidate.size())) * (1 / 1000.0);
  } else {
    // FIXME: path separators would be handled here, jumping straight to the
    // last component if the pattern doesn't contain a separator.
    unsigned firstPatternPos = 0;
    CandidateSpecificMatcher CSM(pattern, lowercasePattern, candidate,
                                 charactersInPattern, firstPatternPos);
    score = CSM.scoreCandidate(firstPatternPos);
  }

  return normalize ? score / maxScore : score;
}
|
||||
|
||||
/// Prepares the per-candidate tables: the lowercased candidate, the jump
/// table linking positions whose character occurs in the pattern, and the
/// token table. \p firstPatternPos receives the head of the jump-table chain
/// (or notFound when no candidate character occurs in the pattern).
CandidateSpecificMatcher::CandidateSpecificMatcher(
    StringRef pattern, StringRef lowercasePattern, StringRef candidate,
    const llvm::BitVector &charactersInPattern, unsigned &firstPatternPos)
    : pattern(pattern), lowercasePattern(lowercasePattern),
      candidate(candidate) {

  assert(!pattern.empty() && pattern.size() <= candidate.size());
  assert(pattern.size() == lowercasePattern.size());

  const unsigned numChars = candidate.size();
  jumpTable.resize(numChars);
  lowercaseCandidate.resize(numChars);

  // Scan right-to-left so each slot can point at the nearest later position
  // holding a pattern character; this lets matching skip through the
  // candidate faster.
  unsigned nextHit = notFound;
  for (unsigned idx = numChars; idx-- > 0;) {
    const char ch = candidate[idx];
    lowercaseCandidate[idx] = toLowercase(ch);
    jumpTable[idx] = nextHit;
    if (charactersInPattern[static_cast<unsigned char>(ch)])
      nextHit = idx;
  }
  firstPatternPos = nextHit;

  // Build the token table.
  characterToTokenIndex.resize(candidate.size());
  populateTokenTable(tokens, characterToTokenIndex, candidate);
}
|
||||
|
||||
double CandidateSpecificMatcher::scoreCandidate(unsigned firstPatternPos) {
|
||||
double finalScore = 0.0;
|
||||
|
||||
// The outer matching loop. We run multiple trials so that "a_b_c_abc"
|
||||
// matching "abc" is matched on the "abc" part instead of the "a_b_c" part.
|
||||
while (firstPatternPos != notFound) {
|
||||
// Quickly skip to the first character that matches. We need
|
||||
// the loop in case the first pattern-character in the
|
||||
// candidate is not the first character in the pattern.
|
||||
while (firstPatternPos != notFound) {
|
||||
if (lowercasePattern[0] == lowercaseCandidate[firstPatternPos])
|
||||
break;
|
||||
firstPatternPos = jumpTable[firstPatternPos];
|
||||
}
|
||||
if (firstPatternPos == notFound)
|
||||
break;
|
||||
|
||||
double trialScore = scoreCandidateTrial(firstPatternPos);
|
||||
|
||||
if (trialScore > finalScore) {
|
||||
finalScore = trialScore;
|
||||
// FIXME: update output ranges, if necessary
|
||||
}
|
||||
|
||||
firstPatternPos = jumpTable[firstPatternPos];
|
||||
}
|
||||
|
||||
return finalScore;
|
||||
}
|
||||
|
||||
/// Scores a single run of consecutively matched characters.
///
/// \param runStart candidate index where the run begins.
/// \param runLength number of matched characters in the run.
/// \param prevTokenStart start index of the token containing \p runStart.
/// \param tokenIndex index of that token.
/// \param uppercaseMatches number of uppercase-to-uppercase matches in the run.
/// \param isTokenizingChar whether the run begins on a tokenizing character.
/// \returns the run's score; 0 for an empty run or a short mid-token run.
static double scoreRun(unsigned runStart, unsigned runLength,
                       unsigned prevTokenStart, unsigned tokenIndex,
                       unsigned uppercaseMatches, bool isTokenizingChar) {
  if (runLength == 0)
    return 0.0;

  // We really don't like not matching at token starts, but if it's a long
  // match give some credit.
  const bool startsMidToken = runStart != prevTokenStart && !isTokenizingChar;
  if (startsMidToken) {
    if (runLength < 3)
      return 0.0;
    if (runLength < 5)
      return runLength;

    // For really long matches, we'll give a high score. Pretend it's a bit
    // shorter.
    runLength -= 2;
  }

  // Bonus if the match is the first or second token.
  double prefixBonus = 0.0;
  if (runStart == 0)
    prefixBonus = 2.5;
  else if (tokenIndex < 2)
    prefixBonus = 1.0;

  double uppercaseBonus = 0.0;
  if (uppercaseMatches)
    uppercaseBonus = (uppercaseMatches + 1) * (uppercaseMatches + 1);

  return (runLength * runLength) + uppercaseBonus + prefixBonus;
}
|
||||
|
||||
double
|
||||
CandidateSpecificMatcher::scoreCandidateTrial(unsigned firstPatternPos) {
|
||||
double trialScore = 0.0; /// We run multiple trials so that "a_b_c_abc"
|
||||
/// matching "abc" is matched on the "abc" part
|
||||
/// instead of the "a_b_c" part.
|
||||
unsigned uppercaseMatches = 0;
|
||||
unsigned cidx = firstPatternPos;
|
||||
unsigned pidx = 0;
|
||||
unsigned runLength = 0;
|
||||
unsigned runStart = cidx;
|
||||
unsigned nonTokenRuns = 0;
|
||||
unsigned camelCaseLen = 0;
|
||||
unsigned camelCaseLastToken = 0;
|
||||
double camelCaseStartBonus = 0.0;
|
||||
unsigned camelCaseSkips = 0;
|
||||
|
||||
unsigned patternLength = pattern.size();
|
||||
unsigned candidateLength = candidate.size();
|
||||
|
||||
while (pidx < patternLength && cidx < candidateLength) {
|
||||
char lowerPatternChar = lowercasePattern[pidx];
|
||||
char lowerCandidateChar = lowercaseCandidate[cidx];
|
||||
unsigned nextCidx = jumpTable[cidx];
|
||||
bool matched = lowerPatternChar == lowerCandidateChar;
|
||||
if (matched) {
|
||||
if (isUppercase(pattern[pidx]) && isUppercase(candidate[cidx])) {
|
||||
++uppercaseMatches;
|
||||
}
|
||||
|
||||
++runLength;
|
||||
++pidx;
|
||||
if (pidx < patternLength)
|
||||
lowerPatternChar = lowercasePattern[pidx];
|
||||
}
|
||||
|
||||
// If we're skipping forward and were running, the run ended.
|
||||
if (((cidx + 1) != nextCidx) || !matched) {
|
||||
if (runLength) {
|
||||
double runValue =
|
||||
scoreRun(runStart, runLength,
|
||||
tokens[characterToTokenIndex[runStart]].location,
|
||||
characterToTokenIndex[runStart], uppercaseMatches,
|
||||
isTokenizingChar(candidate[runStart]));
|
||||
|
||||
// If it's a poor match in the middle of a token, see if the next char
|
||||
// starts a token and also matches. If so, use it.
|
||||
if (runLength == 1 && pidx > 1 && runValue == 0.0 &&
|
||||
nextCidx != notFound &&
|
||||
characterToTokenIndex[runStart] < tokens.size() - 1) {
|
||||
bool foundIt = false;
|
||||
unsigned charToCheck = matched ? nextCidx : cidx;
|
||||
while (charToCheck != notFound) {
|
||||
if (tokens[characterToTokenIndex[charToCheck]].location ==
|
||||
charToCheck &&
|
||||
lowercasePattern[pidx - 1] == lowercaseCandidate[charToCheck]) {
|
||||
foundIt = true;
|
||||
break;
|
||||
}
|
||||
charToCheck = jumpTable[charToCheck];
|
||||
}
|
||||
|
||||
if (foundIt) {
|
||||
--pidx;
|
||||
lowerPatternChar = lowercasePattern[pidx];
|
||||
runStart = cidx = charToCheck;
|
||||
runLength = 0;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// We really don't like matches that don't start at a token.
|
||||
if (runValue == 0.0) {
|
||||
++nonTokenRuns;
|
||||
|
||||
} else {
|
||||
unsigned tokenIndex = characterToTokenIndex[runStart];
|
||||
if (runStart == tokens[tokenIndex].location ||
|
||||
isTokenizingChar(lowerCandidateChar)) {
|
||||
camelCaseLen += runLength;
|
||||
|
||||
// Bonus for matching the beginning of the candidate.
|
||||
if (tokenIndex <= 1) {
|
||||
camelCaseStartBonus = 2.0;
|
||||
|
||||
// Penalty for skipping a token.
|
||||
} else if (tokenIndex != camelCaseLastToken + 1) {
|
||||
camelCaseSkips += tokenIndex - camelCaseLastToken - 1;
|
||||
}
|
||||
|
||||
camelCaseLastToken = tokenIndex;
|
||||
|
||||
if (isTokenizingChar(lowerCandidateChar) && runLength == 1) {
|
||||
--camelCaseLastToken;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Accumulate run and reset for next run.
|
||||
trialScore += runValue;
|
||||
runs.push_back({runStart, runLength});
|
||||
uppercaseMatches = 0;
|
||||
runLength = 0;
|
||||
}
|
||||
|
||||
runStart = nextCidx;
|
||||
}
|
||||
|
||||
cidx = nextCidx;
|
||||
}
|
||||
|
||||
// The trial is done, did we find a match?
|
||||
// FIXME: this can happen spuriously in foo => ufDownOb.
|
||||
if (pidx != patternLength)
|
||||
return 0.0;
|
||||
|
||||
// Okay, we found a match.
|
||||
|
||||
// FIXME: this code is largely duplicated with the previous block. There are
|
||||
// some subtle differences that can be seen if you try to remove this one and
|
||||
// check for pidx == patternLength for the other block.
|
||||
if (runLength) {
|
||||
double runValue = scoreRun(
|
||||
runStart, runLength, tokens[characterToTokenIndex[runStart]].location,
|
||||
characterToTokenIndex[runStart], uppercaseMatches,
|
||||
isTokenizingChar(candidate[runStart]));
|
||||
|
||||
// If it's a poor match in the middle of a token, see if the next char
|
||||
// starts a token and also matches. If so, use it.
|
||||
if (runLength == 1 && runValue == 0.0) {
|
||||
unsigned nextCidx = jumpTable[runStart];
|
||||
if (nextCidx != notFound &&
|
||||
characterToTokenIndex[runStart] < tokens.size() - 1) {
|
||||
bool foundIt = false;
|
||||
while (nextCidx != notFound) {
|
||||
if (tokens[characterToTokenIndex[nextCidx]].location == nextCidx &&
|
||||
lowercasePattern[pidx - 1] == lowercaseCandidate[nextCidx]) {
|
||||
foundIt = true;
|
||||
break;
|
||||
}
|
||||
nextCidx = jumpTable[nextCidx];
|
||||
}
|
||||
|
||||
if (foundIt) {
|
||||
runStart = nextCidx;
|
||||
uppercaseMatches +=
|
||||
(isUppercase(pattern[pidx - 1]) &&
|
||||
isUppercase(candidate[runStart])) ? 1 : 0;
|
||||
runValue = scoreRun(runStart, runLength,
|
||||
tokens[characterToTokenIndex[runStart]].location,
|
||||
characterToTokenIndex[runStart], uppercaseMatches,
|
||||
isTokenizingChar(candidate[runStart]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We really don't like matches that don't start at a token.
|
||||
if (runValue == 0.0) {
|
||||
++nonTokenRuns;
|
||||
|
||||
} else {
|
||||
unsigned tokenIndex = characterToTokenIndex[runStart];
|
||||
if (runStart == tokens[tokenIndex].location ||
|
||||
isTokenizingChar(lowercaseCandidate[runStart])) {
|
||||
camelCaseLen += runLength;
|
||||
|
||||
if (tokenIndex <= 1) {
|
||||
// Bonus for matching the beginning of the candidate.
|
||||
camelCaseStartBonus = 2.0;
|
||||
} else if (tokenIndex != camelCaseLastToken + 1) {
|
||||
// Penalty for skipping a token.
|
||||
camelCaseSkips += tokenIndex - camelCaseLastToken - 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Accumulate run.
|
||||
trialScore += runValue;
|
||||
runs.push_back({runStart, runLength});
|
||||
}
|
||||
|
||||
// Unless there were bad matches, prefer camel case matches.
|
||||
if (nonTokenRuns == 0 && camelCaseSkips < 3) {
|
||||
double camelCaseScore = (camelCaseLen * camelCaseLen) + camelCaseStartBonus;
|
||||
if (camelCaseSkips == 1) {
|
||||
camelCaseScore *= 0.9;
|
||||
} else if (camelCaseSkips == 2) {
|
||||
camelCaseScore *= 0.8;
|
||||
}
|
||||
|
||||
if (trialScore < camelCaseScore) {
|
||||
// Camel case matched better.
|
||||
trialScore = camelCaseScore;
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: using the range up to a dot is silly when candidate isn't a file.
|
||||
auto dotLoc = candidate.find_last_of('.');
|
||||
unsigned baseNameLength =
|
||||
dotLoc != StringRef::npos && dotLoc > 1 ? dotLoc : candidateLength;
|
||||
|
||||
// FIXME: file type bonus if we're checking a file path.
|
||||
|
||||
// Add a bit for the percentage of the candidate matched.
|
||||
trialScore += patternLength / static_cast<double>(baseNameLength) * 0.25;
|
||||
|
||||
// Exact matches are even better.
|
||||
if (patternLength >= baseNameLength && runs.size() > 0 &&
|
||||
runs[0].location == 0) {
|
||||
trialScore *= 1.1;
|
||||
}
|
||||
|
||||
// FIXME: popular/unpopular API.
|
||||
|
||||
// We really don't like matches that don't start at a token.
|
||||
switch (nonTokenRuns) {
|
||||
case 0:
|
||||
break;
|
||||
case 1:
|
||||
trialScore *= 0.8;
|
||||
break;
|
||||
case 2:
|
||||
trialScore *= 0.5;
|
||||
break;
|
||||
default:
|
||||
trialScore *= 0.33;
|
||||
break;
|
||||
}
|
||||
|
||||
// FIXME: matched ranges output.
|
||||
|
||||
return trialScore;
|
||||
}
|
||||
Reference in New Issue
Block a user