Files
sourcekit-lsp/Sources/CompletionScoring/Text/MatchCollator.swift
Alex Hoppen 6074529dac Manually fix swift-format lint rules
- I think there are valid calls with mixing inline and trailing closures (notably `Debouncer`), so I’m considering whether we should disable that rule.
- The `forEach` rule is a little annoying because we have `forEach` on `SKDResponseArray`. But it caught two cases of using `forEach` on arrays, so I think it’s worth keeping.
2025-08-11 09:31:09 +02:00

548 lines
21 KiB
Swift

//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2024 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
import Foundation
/// Aggregates functionality to support the `MatchCollator.selectBestMatches(for:from:in:)` function, which sorts and
/// selects the best matches from a list, applying the `.thorough` scoring function while being conscience of it's expense.
package struct MatchCollator {
private var originalMatches: UnsafeBufferPointer<Match>
private var rescoredMatches: UnsafeMutableBufferPointer<RescoredMatch>
private let batches: UnsafeBufferPointer<CandidateBatch.UnsafeStorage>
private let groupScores: UnsafeMutableBufferPointer<Double>
private let influencers: InfluencingIdentifiers
private let patternUTF8Length: Int
private let tieBreaker: UnsafePointer<TieBreaker>
private let maximumNumberOfItemsForExpensiveSelection: Int
package static let defaultMaximumNumberOfItemsForExpensiveSelection = 100
private init(
originalMatches: UnsafeBufferPointer<Match>,
rescoredMatches: UnsafeMutableBufferPointer<MatchCollator.RescoredMatch>,
batches: UnsafeBufferPointer<CandidateBatch.UnsafeStorage>,
groupScores: UnsafeMutableBufferPointer<Double>,
influencers: InfluencingIdentifiers,
patternUTF8Length: Int,
orderingTiesBy tieBreaker: UnsafePointer<TieBreaker>,
maximumNumberOfItemsForExpensiveSelection: Int
) {
for match in originalMatches {
precondition(batches.indices.contains(match.batchIndex))
precondition(batches[match.batchIndex].indices.contains(match.candidateIndex))
}
self.originalMatches = originalMatches
self.rescoredMatches = rescoredMatches
self.batches = batches
self.groupScores = groupScores
self.influencers = influencers
self.patternUTF8Length = patternUTF8Length
self.tieBreaker = tieBreaker
self.maximumNumberOfItemsForExpensiveSelection = maximumNumberOfItemsForExpensiveSelection
}
private static func withUnsafeMatchCollator<R>(
matches originalMatches: [Match],
batches: [CandidateBatch],
influencingTokenizedIdentifiers: [[String]],
patternUTF8Length: Int,
orderingTiesBy tieBreakerBody: (_ lhs: Match, _ rhs: Match) -> Bool,
maximumNumberOfItemsForExpensiveSelection: Int,
body: (inout MatchCollator) -> R
) -> R {
let rescoredMatches = UnsafeMutableBufferPointer<RescoredMatch>.allocate(capacity: originalMatches.count)
defer { rescoredMatches.deinitializeAllAndDeallocate() }
let groupScores = UnsafeMutableBufferPointer<Double>.allocate(capacity: originalMatches.count)
defer { groupScores.deinitializeAllAndDeallocate() }
for (matchIndex, originalMatch) in originalMatches.enumerated() {
rescoredMatches.initialize(
index: matchIndex,
to: RescoredMatch(
originalMatchIndex: matchIndex,
textIndex: TextIndex(batch: originalMatch.batchIndex, candidate: originalMatch.candidateIndex),
denseGroupID: nil,
individualScore: originalMatch.score,
groupScore: -Double.infinity,
falseStarts: 0
)
)
}
assignDenseGroupId(to: rescoredMatches, from: originalMatches, batchCount: batches.count)
return withoutActuallyEscaping(tieBreakerBody) { tieBreakerBody in
var tieBreaker = TieBreaker(tieBreakerBody)
return withExtendedLifetime(tieBreaker) {
InfluencingIdentifiers.withUnsafeInfluencingTokenizedIdentifiers(influencingTokenizedIdentifiers) {
influencers in
originalMatches.withUnsafeBufferPointer { originalMatches in
CandidateBatch.withUnsafeStorages(batches) { batchStorages in
var collator = Self(
originalMatches: originalMatches,
rescoredMatches: rescoredMatches,
batches: batchStorages,
groupScores: groupScores,
influencers: influencers,
patternUTF8Length: patternUTF8Length,
orderingTiesBy: &tieBreaker,
maximumNumberOfItemsForExpensiveSelection: maximumNumberOfItemsForExpensiveSelection
)
return body(&collator)
}
}
}
}
}
}
/// This allows us to only take the dictionary hit one time, so that we don't have to do repeated dictionary lookups
/// as we lookup groupIDs and map them to group scores.
private static func assignDenseGroupId(
to rescoredMatches: UnsafeMutableBufferPointer<RescoredMatch>,
from originalMatches: [Match],
batchCount: Int
) {
typealias SparseGroupID = Int
typealias DenseGroupID = Int
let initialDictionaryCapacity = (batchCount > 0) ? originalMatches.count / batchCount : 0
var batchAssignments: [[SparseGroupID: DenseGroupID]] = Array(
repeating: Dictionary(capacity: initialDictionaryCapacity),
count: batchCount
)
var nextDenseID = 0
for (matchIndex, match) in originalMatches.enumerated() {
if let sparseID = match.groupID {
rescoredMatches[matchIndex].denseGroupID = batchAssignments[match.batchIndex][sparseID].lazyInitialize {
let denseID = nextDenseID
nextDenseID += 1
return denseID
}
}
}
}
private mutating func selectBestFastScoredMatchesForThoroughScoring() {
if rescoredMatches.count > maximumNumberOfItemsForExpensiveSelection {
rescoredMatches.selectTopKAndTruncate(maximumNumberOfItemsForExpensiveSelection) { lhs, rhs in
(lhs.groupScore >? rhs.groupScore) ?? (lhs.individualScore > rhs.individualScore)
}
}
}
private func refreshGroupScores() {
// We call this the first time without initializing the Double values.
groupScores.setAll(to: -.infinity)
for match in rescoredMatches {
if let denseGroupID = match.denseGroupID {
groupScores[denseGroupID] = max(groupScores[denseGroupID], match.individualScore.value)
}
}
for (index, match) in rescoredMatches.enumerated() {
if let denseGroupID = match.denseGroupID {
rescoredMatches[index].groupScore = groupScores[denseGroupID]
} else {
rescoredMatches[index].groupScore = rescoredMatches[index].individualScore.value
}
}
}
private func unsafeBytes(at textIndex: TextIndex) -> CandidateBatch.UTF8Bytes {
return batches[textIndex.batch].bytes(at: textIndex.candidate)
}
mutating func thoroughlyRescore(pattern: Pattern) {
// `nonisolated(unsafe)` is fine because every iteration accesses a different index of `batches`.
nonisolated(unsafe) let batches = batches
// `nonisolated(unsafe)` is fine because every iteration accesses a disjunct set of indices of `rescoredMatches`.
nonisolated(unsafe) let rescoredMatches = rescoredMatches
let pattern = pattern
rescoredMatches.slicedConcurrentForEachSliceRange { sliceRange in
UnsafeStackAllocator.withUnsafeStackAllocator { allocator in
for matchIndex in sliceRange {
let textIndex = rescoredMatches[matchIndex].textIndex
let (candidateBytes, candidateContentType) = batches[textIndex.batch]
.candidateContent(at: textIndex.candidate)
let textScore = pattern.score(
candidate: candidateBytes,
contentType: candidateContentType,
precision: .thorough,
allocator: &allocator
)
rescoredMatches[matchIndex].individualScore.textComponent = textScore.value
rescoredMatches[matchIndex].falseStarts = textScore.falseStarts
}
}
}
}
/// Generated and validated by `MatchCollatorTests.testMinimumTextCutoff()`
package static let bestRejectedTextScoreByPatternLength: [Double] = [
0.0,
0.0,
2.900400881379344,
2.900400881379344,
2.900400881379344,
2.900400881379344,
2.900400881379344,
2.900400881379344,
2.900400881379344,
2.900400881379344,
2.900400881379344,
]
private var cutoffRatio: Double {
// |
// 0.67 | ____________________
// | /
// |/
// +------------------------
// 4
let fullCutoffRatio = (2.0 / 3.0)
let weight = min(max(Double(patternUTF8Length), 1.0) / 4.0, 1.0)
return fullCutoffRatio * weight
}
private static let maxInfluenceBonus = 0.10
private static let maxFalseStarts = 2
private var bestRejectedTextScore: Double {
let bestRejectedTextScoreByPatternLength = Self.bestRejectedTextScoreByPatternLength
let inBounds = bestRejectedTextScoreByPatternLength.indices.contains(patternUTF8Length)
return
(inBounds
? bestRejectedTextScoreByPatternLength[patternUTF8Length] : bestRejectedTextScoreByPatternLength.last)
?? 0
}
private mutating func selectBestThoroughlyScoredMatches() {
if let bestThoroughlyScoredMatch = rescoredMatches.max(by: \.individualScore) {
let topMatchFalseStarts = bestThoroughlyScoredMatch.falseStarts
let compositeCutoff = self.cutoffRatio * bestThoroughlyScoredMatch.individualScore.value
let semanticCutoffForTokenFalseStartsExemption =
bestThoroughlyScoredMatch.individualScore.semanticComponent / 3.0
let bestRejectedTextScore = bestRejectedTextScore
let maxAllowedFalseStarts = Self.maxFalseStarts
rescoredMatches.removeAndTruncateWhere { candidate in
let overcomesTextCutoff = candidate.individualScore.textComponent > bestRejectedTextScore
let overcomesFalseStartCutoff = candidate.falseStarts <= maxAllowedFalseStarts
let acceptedByCompositeScore = candidate.individualScore.value >= compositeCutoff
let acceptedByTokenFalseStarts =
(candidate.falseStarts <= topMatchFalseStarts)
&& (candidate.individualScore.semanticComponent >= semanticCutoffForTokenFalseStartsExemption)
let keep =
overcomesTextCutoff && overcomesFalseStartCutoff
&& (acceptedByCompositeScore || acceptedByTokenFalseStarts)
return !keep
}
}
}
private mutating func selectBestFastScoredMatches() {
if let bestSemanticScore = rescoredMatches.max(of: { candidate in candidate.individualScore.semanticComponent }) {
let minimumSemanticScore = bestSemanticScore * cutoffRatio
rescoredMatches.removeAndTruncateWhere { candidate in
candidate.individualScore.semanticComponent < minimumSemanticScore
}
}
}
private mutating func applyInfluence() {
let influencers = self.influencers
let maxInfluenceBonus = Self.maxInfluenceBonus
if influencers.hasContent && (maxInfluenceBonus != 0.0) {
// `nonisolated(unsafe)` is fine because every iteration accesses a disjoint set of indices in `rescoredMatches`.
nonisolated(unsafe) let rescoredMatches = rescoredMatches
// `nonisolated(unsafe)` is fine because `batches` is not modified
nonisolated(unsafe) let batches = batches
rescoredMatches.slicedConcurrentForEachSliceRange { sliceRange in
UnsafeStackAllocator.withUnsafeStackAllocator { allocator in
for matchIndex in sliceRange {
let textIndex = rescoredMatches[matchIndex].textIndex
let candidate = batches[textIndex.batch].candidate(at: textIndex.candidate)
let percentOfInfluenceBonus = influencers.score(candidate: candidate, allocator: &allocator)
let textCoefficient = (percentOfInfluenceBonus * maxInfluenceBonus) + 1.0
rescoredMatches[matchIndex].individualScore.textComponent *= textCoefficient
}
}
}
refreshGroupScores()
}
}
private func lessThan(_ lhs: RescoredMatch, _ rhs: RescoredMatch) -> Bool {
if let definitiveGroupScoreComparison = lhs.groupScore >? rhs.groupScore {
return definitiveGroupScoreComparison
// Only compare `individualScore` within the same group, or among items that have no group.
// Otherwise when the group score ties, we would interleave the members of the tying groups.
} else if lhs.denseGroupID == rhs.denseGroupID,
let definitiveIndividualScoreComparison = lhs.individualScore.value >? rhs.individualScore.value
{
return definitiveIndividualScoreComparison
} else {
let lhsBytes = unsafeBytes(at: lhs.textIndex)
let rhsBytes = unsafeBytes(at: rhs.textIndex)
switch compareBytes(lhsBytes, rhsBytes) {
case .ascending:
return true
case .descending:
return false
case .same:
if rescoredMatches.count <= maximumNumberOfItemsForExpensiveSelection {
let lhsOriginal = originalMatches[lhs.originalMatchIndex]
let rhsOriginal = originalMatches[rhs.originalMatchIndex]
if tieBreaker.pointee.lessThan(lhsOriginal, rhsOriginal) {
return true
} else if tieBreaker.pointee.lessThan(rhsOriginal, lhsOriginal) {
return false
}
}
return (lhs.originalMatchIndex < rhs.originalMatchIndex)
}
}
}
private mutating func sort() {
rescoredMatches.sort(by: lessThan)
}
private mutating func selectBestMatches(pattern: Pattern) -> Selection {
refreshGroupScores()
let precision: Pattern.Precision
if pattern.typedEnoughForThoroughScoring
|| (rescoredMatches.count <= maximumNumberOfItemsForExpensiveSelection)
{
selectBestFastScoredMatchesForThoroughScoring()
thoroughlyRescore(pattern: pattern)
refreshGroupScores()
selectBestThoroughlyScoredMatches()
precision = .thorough
} else {
selectBestFastScoredMatches()
precision = .fast
}
applyInfluence()
sort()
return Selection(
precision: precision,
matches: rescoredMatches.map { match in
originalMatches[match.originalMatchIndex]
}
)
}
/// Uses heuristics to cull matches, and then apply the expensive `.thorough` scoring function.
///
/// Returns the results stably ordered by score, then text.
package static func selectBestMatches(
_ matches: [Match],
from batches: [CandidateBatch],
for pattern: Pattern,
influencingTokenizedIdentifiers: [[String]],
orderingTiesBy tieBreaker: (_ lhs: Match, _ rhs: Match) -> Bool,
maximumNumberOfItemsForExpensiveSelection: Int
) -> Selection {
withUnsafeMatchCollator(
matches: matches,
batches: batches,
influencingTokenizedIdentifiers: influencingTokenizedIdentifiers,
patternUTF8Length: pattern.patternUTF8Length,
orderingTiesBy: tieBreaker,
maximumNumberOfItemsForExpensiveSelection: maximumNumberOfItemsForExpensiveSelection
) { collator in
collator.selectBestMatches(pattern: pattern)
}
}
/// Short for `selectBestMatches(_:from:for:influencingTokenizedIdentifiers:orderingTiesBy:).matches`
package static func selectBestMatches(
for pattern: Pattern,
from matches: [Match],
in batches: [CandidateBatch],
influencingTokenizedIdentifiers: [[String]],
orderingTiesBy tieBreaker: (_ lhs: Match, _ rhs: Match) -> Bool,
maximumNumberOfItemsForExpensiveSelection: Int = Self.defaultMaximumNumberOfItemsForExpensiveSelection
) -> [Match] {
return selectBestMatches(
matches,
from: batches,
for: pattern,
influencingTokenizedIdentifiers: influencingTokenizedIdentifiers,
orderingTiesBy: tieBreaker,
maximumNumberOfItemsForExpensiveSelection: maximumNumberOfItemsForExpensiveSelection
).matches
}
/// Split identifiers into constituent subwords. For example "documentDownload" becomes ["document", "Download"]
/// - Parameters:
/// - identifiers: Strings from the program source, like "documentDownload"
/// - filterLowSignalTokens: When true, removes common tokens that would falsely signal influence, like "from".
/// - Returns: A value suitable for use with the `influencingTokenizedIdentifiers:` parameter of `selectBestMatches()`.
package static func tokenize(
influencingTokenizedIdentifiers identifiers: [String],
filterLowSignalTokens: Bool
) -> [[String]] {
identifiers.map { identifier in
tokenize(influencingTokenizedIdentifier: identifier, filterLowSignalTokens: filterLowSignalTokens)
}
}
/// Only package so that we can performance test this
package static func performanceTest_influenceScores(
for batches: [CandidateBatch],
influencingTokenizedIdentifiers: [[String]],
iterations: Int
) -> Double {
let matches = batches.enumerated().flatMap { batchIndex, batch in
(0..<batch.count).map { candidateIndex in
Match(
batchIndex: batchIndex,
candidateIndex: candidateIndex,
groupID: nil,
score: CompletionScore(textComponent: 1, semanticComponent: 1)
)
}
}
return MatchCollator.withUnsafeMatchCollator(
matches: matches,
batches: batches,
influencingTokenizedIdentifiers: influencingTokenizedIdentifiers,
patternUTF8Length: 0,
orderingTiesBy: { _, _ in false },
maximumNumberOfItemsForExpensiveSelection: Self.defaultMaximumNumberOfItemsForExpensiveSelection,
body: { collator in
return (0..<iterations).reduce(0) { accumulation, _ in
collator.applyInfluence()
return collator.rescoredMatches.reduce(accumulation) { accumulation, match in
accumulation + match.individualScore.value
}
}
}
)
}
package static func tokenize(
influencingTokenizedIdentifier identifier: String,
filterLowSignalTokens: Bool
)
-> [String]
{
var tokens: [String] = []
identifier.withUncachedUTF8Bytes { identifierBytes in
UnsafeStackAllocator.withUnsafeStackAllocator { allocator in
var tokenization = Pattern.Tokenization.allocate(
mixedcaseBytes: identifierBytes,
contentType: .codeCompletionSymbol,
allocator: &allocator
); defer { tokenization.deallocate(allocator: &allocator) }
tokenization.enumerate { tokenRange in
if let token = String(bytes: identifierBytes[tokenRange], encoding: .utf8) {
tokens.append(token)
}
}
}
}
if filterLowSignalTokens {
let minimumLength = 4 // Shorter tokens appear too much to be useful: (in, on, a, the)
let ignoredTokens: Set = ["from", "with"]
tokens.removeAll { token in
return (token.count < minimumLength) || ignoredTokens.contains(token.lowercased())
}
}
return tokens
}
}
extension MatchCollator {
fileprivate struct TextIndex {
var batch: Int
var candidate: Int
}
}
extension MatchCollator {
fileprivate struct RescoredMatch {
var originalMatchIndex: Int
var textIndex: TextIndex
var denseGroupID: Int?
var individualScore: CompletionScore
var groupScore: Double
var falseStarts: Int
}
}
extension MatchCollator {
/// A wrapper to allow taking an unsafe pointer to a closure.
fileprivate final class TieBreaker {
var lessThan: (_ lhs: Match, _ rhs: Match) -> Bool
init(_ lessThan: @escaping (_ lhs: Match, _ rhs: Match) -> Bool) {
self.lessThan = lessThan
}
}
}
extension MatchCollator.RescoredMatch: CustomStringConvertible {
var description: String {
func format(_ value: Double) -> String {
String(format: "%0.3f", value)
}
return
"""
RescoredMatch(\
idx: \(originalMatchIndex), \
gid: \(denseGroupID?.description ?? "_"), \
score.t: \(format(individualScore.textComponent)), \
score.s: \(format(individualScore.semanticComponent)), \
groupScore: \(format(groupScore)), \
falseStarts: \(falseStarts)\
)
"""
}
}
fileprivate extension Pattern {
var typedEnoughForThoroughScoring: Bool {
patternUTF8Length >= MatchCollator.minimumPatternLengthToAlwaysRescoreWithThoroughPrecision
}
}
// Deprecated Entry Points
extension MatchCollator {
@available(*, deprecated, renamed: "selectBestMatches(for:from:in:influencingTokenizedIdentifiers:orderingTiesBy:)")
package static func selectBestMatches(
for pattern: Pattern,
from matches: [Match],
in batches: [CandidateBatch],
influencingTokenizedIdentifiers: [[String]]
) -> [Match] {
selectBestMatches(
for: pattern,
from: matches,
in: batches,
influencingTokenizedIdentifiers: influencingTokenizedIdentifiers,
orderingTiesBy: { _, _ in false }
)
}
@available(
*,
deprecated,
message:
"Use the MatchCollator.Selection.precision value returned from selectBestMatches(...) to choose between fast and thorough matched text ranges."
)
package static var bestMatchesThoroughScanningMinimumPatternLength: Int {
minimumPatternLengthToAlwaysRescoreWithThoroughPrecision
}
}
extension MatchCollator {
package static let minimumPatternLengthToAlwaysRescoreWithThoroughPrecision = 2
}