//===----------------------------------------------------------------------===// // // This source file is part of the Swift.org open source project // // Copyright (c) 2024 Apple Inc. and the Swift project authors // Licensed under Apache License v2.0 with Runtime Library Exception // // See https://swift.org/LICENSE.txt for license information // See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors // //===----------------------------------------------------------------------===// import Foundation /// The pattern that the user has typed and which should be used to cull and score results. package final class Pattern: Sendable { package typealias ContentType = Candidate.ContentType package typealias UTF8Bytes = UnsafeBufferPointer package typealias UTF8ByteRange = Range package enum Precision { case fast case thorough } package let text: String /// These all have pattern in the name to avoid confusion with their candidate counterparts in functions operating on both. // `nonisolated(unsafe)` is fine because the underlying buffer is never modified. private nonisolated(unsafe) let patternMixedcaseBytes: UTF8Bytes private nonisolated(unsafe) let patternLowercaseBytes: UTF8Bytes private let patternRejectionFilter: RejectionFilter /// For each byte in `text`, a rejection filter that contains all the characters occurring after or at that offset. /// /// This way when we have already matched the first 4 bytes, we can check which characters occur from byte 5 onwards /// and check that they appear in the candidate's remaining text. private let patternSuccessiveRejectionFilters: [RejectionFilter] private let patternHasMixedcase: Bool internal var patternUTF8Length: Int { patternMixedcaseBytes.count } package init(text: String) { self.text = text let mixedcaseBytes = Array(text.utf8) let lowercaseBytes = mixedcaseBytes.map(\.lowercasedUTF8Byte) self.patternMixedcaseBytes = UnsafeBufferPointer.allocate(copyOf: mixedcaseBytes) self.patternLowercaseBytes = UnsafeBufferPointer.allocate(copyOf: lowercaseBytes) self.patternRejectionFilter = .init(lowercaseBytes: lowercaseBytes) self.patternHasMixedcase = lowercaseBytes != mixedcaseBytes self.patternSuccessiveRejectionFilters = UnsafeStackAllocator.withUnsafeStackAllocator { allocator in return lowercaseBytes.withUnsafeBufferPointer { lowercaseBytes in var rejectionFilters = Self.allocateSuccessiveRejectionFilters( lowercaseBytes: lowercaseBytes, allocator: &allocator ) defer { allocator.deallocate(&rejectionFilters) } return Array(rejectionFilters) } } } deinit { patternMixedcaseBytes.deallocate() patternLowercaseBytes.deallocate() } /// Perform an insensitive greedy match and return the location where the greedy match stared if it succeeded. /// If the match was not successful, return `nil`. /// /// Future searches used during scoring can use this location to jump past all of the initial bytes that don't match /// anything. An empty pattern matches with location 0. private func matchLocation(candidate: Candidate) -> Int? { if RejectionFilter.match(pattern: patternRejectionFilter, candidate: candidate.rejectionFilter) == .maybe { let cBytes = candidate.bytes let pCount = patternLowercaseBytes.count let cCount = cBytes.count var pRemaining = pCount var cRemaining = cCount if (pRemaining > 0) && (cRemaining > 0) && (pRemaining <= cRemaining) { let pFirst = patternLowercaseBytes[0] let cStart = cBytes.firstIndex { cByte in cByte.lowercasedUTF8Byte == pFirst } if let cStart = cStart { cRemaining -= cStart cRemaining -= 1 pRemaining -= 1 // While we're not at the end and the remaining pattern is as short or shorter than the remaining candidate. while (pRemaining > 0) && (cRemaining > 0) && (pRemaining <= cRemaining) { let cIdx = cCount - cRemaining let pIdx = pCount - pRemaining if cBytes[cIdx].lowercasedUTF8Byte == patternLowercaseBytes[pIdx] { pRemaining -= 1 } cRemaining -= 1 } return (pRemaining == 0) ? cStart : nil } } return pCount == 0 ? 0 : nil } return nil } package func matches(candidate: Candidate) -> Bool { matchLocation(candidate: candidate) != nil } package func score(candidate: Candidate, precision: Precision) -> Double { score(candidate: candidate.bytes, contentType: candidate.contentType, precision: precision) } package func score( candidate candidateMixedcaseBytes: UTF8Bytes, contentType: ContentType, precision: Precision ) -> Double { UnsafeStackAllocator.withUnsafeStackAllocator { allocator in score( candidate: candidateMixedcaseBytes, contentType: contentType, precision: precision, allocator: &allocator ) .value } } package struct TextScore: Comparable { package var value: Double package var falseStarts: Int static let worstPossibleScore = TextScore(value: -Double.infinity, falseStarts: Int.max) package init(value: Double, falseStarts: Int) { self.value = value self.falseStarts = falseStarts } package static func < (_ lhs: Self, _ rhs: Self) -> Bool { return (lhs.value rhs.falseStarts) } fileprivate static let emptyMatchScore = TextScore(value: 1.0, falseStarts: 0) fileprivate static let noMatchScore = TextScore(value: 0.0, falseStarts: 0) } internal func matchAndScore( candidate: Candidate, precision: Precision, allocator: inout UnsafeStackAllocator ) -> TextScore? { return matchLocation(candidate: candidate).map { firstMatchingLowercaseByteIndex in if patternLowercaseBytes.hasContent { var indexedCandidate = IndexedCandidate.allocate( referencing: candidate.bytes, patternByteCount: patternLowercaseBytes.count, firstMatchingLowercaseByteIndex: firstMatchingLowercaseByteIndex, contentType: candidate.contentType, allocator: &allocator ) defer { indexedCandidate.deallocate(allocator: &allocator) } return score( candidate: &indexedCandidate, precision: precision, captureMatchingRanges: false, allocator: &allocator ) } else { return .emptyMatchScore } } } package func score( candidate candidateMixedcaseBytes: UTF8Bytes, contentType: ContentType, precision: Precision, allocator: inout UnsafeStackAllocator ) -> TextScore { if patternLowercaseBytes.hasContent { var candidate = IndexedCandidate.allocate( referencing: candidateMixedcaseBytes, patternByteCount: patternLowercaseBytes.count, firstMatchingLowercaseByteIndex: nil, contentType: contentType, allocator: &allocator ) defer { candidate.deallocate(allocator: &allocator) } return score( candidate: &candidate, precision: precision, captureMatchingRanges: false, allocator: &allocator ) } else { return .emptyMatchScore } } package func score( candidate candidateMixedcaseBytes: UTF8Bytes, contentType: ContentType, precision: Precision, captureMatchingRanges: Bool, ranges: inout [UTF8ByteRange] ) -> Double { UnsafeStackAllocator.withUnsafeStackAllocator { allocator in if patternLowercaseBytes.hasContent { var candidate = IndexedCandidate.allocate( referencing: candidateMixedcaseBytes, patternByteCount: patternLowercaseBytes.count, firstMatchingLowercaseByteIndex: nil, contentType: contentType, allocator: &allocator ) defer { candidate.deallocate(allocator: &allocator) } let score = score( candidate: &candidate, precision: precision, captureMatchingRanges: captureMatchingRanges, allocator: &allocator ) if captureMatchingRanges { ranges = Array(candidate.matchedRanges) } return score.value } else { return TextScore.emptyMatchScore.value } } } private func score( candidate: inout IndexedCandidate, precision: Precision, captureMatchingRanges: Bool, allocator: inout UnsafeStackAllocator ) -> TextScore { switch precision { case .fast: let matchStyle = populateMatchingRangesForFastScoring(&candidate) return singleScore(candidate: candidate, precision: precision, matchStyle: matchStyle) case .thorough: let budget = 5000 return bestScore( candidate: &candidate, budget: budget, captureMatchingRanges: captureMatchingRanges, allocator: &allocator ) } } private func eligibleForAcronymMatch(candidate: IndexedCandidate) -> Bool { return (patternLowercaseBytes.count >= 3) && candidate.contentType.isEligibleForAcronymMatch && candidate.tokenization.hasNonUppercaseNonDelimiterBytes } private enum MatchStyle: CaseIterable { case lowercaseContinuous case mixedcaseContinuous case mixedcaseGreedy case lowercaseGreedy case acronym } /// Looks for matches like `tamic` against `[t]ranslates[A]utoresizing[M]ask[I]nto[C]onstraints.` /// Or `MOC` against `NS[M]anaged[O]bject[C]ontext`. /// Accomplishes this by doing a greedy match over the acronym characters, which are the first characters of each token. private func populateMatchingRangesForAcronymMatch(_ candidate: inout IndexedCandidate) -> Bool { let tokenization = candidate.tokenization if eligibleForAcronymMatch(candidate: candidate) && tokenization.tokens.hasContent { let candidateLowercaseBytes = candidate.lowercaseBytes let allowFullMatchesBeginningWithTokenIndex = candidate.contentType.acronymMatchAllowsMultiCharacterMatchesAfterBaseName ? candidate.tokenization.firstNonBaseNameTokenIndex : candidate.tokenization.tokenCount var tidx = 0 let tcnt = tokenization.tokens.count var cidx = 0 let ccnt = candidateLowercaseBytes.count var pidx = 0 let pcnt = patternLowercaseBytes.count var matches = true while (cidx < ccnt) && (pidx < pcnt) && (tidx < tcnt) && matches { let token = tokenization.tokens[tidx] var tcidx = 0 let tccnt = (token.allUppercase || (tidx >= allowFullMatchesBeginningWithTokenIndex)) ? token.length : 1 let initialCidx = cidx while (tcidx < tccnt) && (cidx < ccnt) && (pidx < pcnt) { if candidateLowercaseBytes[cidx] == patternLowercaseBytes[pidx] { tcidx += 1 cidx += 1 pidx += 1 } else { break } } matches = (tcidx > 0) // Matched any characters in this token // Allow skipping the first token if it's all uppercase || ((tidx == 0) && token.allUppercase) // Allow skipping single character delemiters || ((token.length == 1) && candidateLowercaseBytes[cidx].isDelimiter) if (tcidx > 0) { candidate.matchedRanges.append(initialCidx.., startOffset: Int, candidateBytes: UTF8Bytes, patternBytes: UTF8Bytes ) -> Bool { if let contiguousMatch = candidateBytes.rangeOf(bytes: patternBytes, startOffset: startOffset) { ranges.append(contiguousMatch) return true } return false } /// Returns `true` if the pattern matches the candidate according to the rules of the `matchStyle`. /// When successful, the matched ranges will be added to `ranges`. Otherwise `ranges` will be cleared. private func populateMatchingRanges(_ candidate: inout IndexedCandidate, matchStyle: MatchStyle) -> Bool { let startOffset = candidate.firstMatchingLowercaseByteIndex ?? 0 switch matchStyle { case .lowercaseContinuous: return Self.populateMatchingContinuousRanges( &candidate.matchedRanges, startOffset: startOffset, candidateBytes: candidate.lowercaseBytes, patternBytes: patternLowercaseBytes ) case .mixedcaseContinuous: return Self.populateMatchingContinuousRanges( &candidate.matchedRanges, startOffset: startOffset, candidateBytes: candidate.mixedcaseBytes, patternBytes: patternMixedcaseBytes ) case .acronym: return populateMatchingRangesForAcronymMatch(&candidate) case .mixedcaseGreedy: return Self.populateGreedyMatchingRanges( &candidate.matchedRanges, startOffset: startOffset, candidateBytes: candidate.mixedcaseBytes, patternBytes: patternMixedcaseBytes ) case .lowercaseGreedy: return Self.populateGreedyMatchingRanges( &candidate.matchedRanges, startOffset: startOffset, candidateBytes: candidate.lowercaseBytes, patternBytes: patternLowercaseBytes ) } } /// Tries a fixed set of strategies in most to least desirable order for matching the candidate to the pattern. /// /// Returns the first strategy that matches. /// Generally this match will produce the highest score of the considered strategies, but this isn't known, and a /// higher scoring match could be found by the `.thorough` search. /// For example, the highest priority strategy is a case matched contiguous sequence. If you search for "name" in /// "filenames(name:)", this code will select the first one, but the second occurrence will score higher since it's a /// complete token. /// The fast scoring just needs to get the result into the second round though where `.thorough` will find the better /// match. private func populateMatchingRangesForFastScoring(_ candidate: inout IndexedCandidate) -> MatchStyle? { let startOffset = candidate.firstMatchingLowercaseByteIndex ?? 0 if Self.populateMatchingContinuousRanges( &candidate.matchedRanges, startOffset: startOffset, candidateBytes: candidate.lowercaseBytes, patternBytes: patternLowercaseBytes ) { return .lowercaseContinuous } else if populateMatchingRangesForAcronymMatch(&candidate) { return .acronym } else if Self.populateGreedyMatchingRanges( &candidate.matchedRanges, startOffset: startOffset, candidateBytes: candidate.mixedcaseBytes, patternBytes: patternMixedcaseBytes ) { return .mixedcaseGreedy } else if Self.populateGreedyMatchingRanges( &candidate.matchedRanges, startOffset: startOffset, candidateBytes: candidate.lowercaseBytes, patternBytes: patternLowercaseBytes ) { return .lowercaseGreedy } return nil } /// Tries to match `patternBytes` against `candidateBytes` by greedily matching the first occurrence of a character /// it finds, ignoring tokenization. For example, "filename" matches both "file(named:)" and "decoyfileadecoynamedecoy". /// /// If a successful match was found, returns `true` and populates `ranges` with the matched ranges. /// Otherwise `ranges` is cleared since it's used as scratch space. private static func populateGreedyMatchingRanges( _ ranges: inout UnsafeStackArray, startOffset: Int, candidateBytes: UTF8Bytes, patternBytes: UTF8Bytes ) -> Bool { var cidx = startOffset, pidx = 0 var currentlyMatching = false while (cidx < candidateBytes.count) && (pidx < patternBytes.count) { if candidateBytes[cidx] == patternBytes[pidx] { pidx += 1 if !currentlyMatching { currentlyMatching = true ranges.append(cidx ..+ 0) } ranges[ranges.count - 1].extend(upperBoundBy: 1) } else { currentlyMatching = false } cidx += 1 } let matched = pidx == patternBytes.count if !matched { ranges.removeAll() } return matched } private func singleScore(candidate: IndexedCandidate, precision: Precision, matchStyle: MatchStyle?) -> TextScore { func ratio(_ lhs: Int, _ rhs: Int) -> Double { return Double(lhs) / Double(rhs) } var patternCharactersRemaining = patternMixedcaseBytes.count let matchedRanges = candidate.matchedRanges if let firstMatchedRange = matchedRanges.first { let candidateMixedcaseBytes = candidate.mixedcaseBytes let tokenization = candidate.tokenization let prefixMatchBonusValue = candidate.contentType.prefixMatchBonus let contentAfterBasenameIsTrivial = candidate.contentType.contentAfterBasenameIsTrivial let leadingCaseMatchableCount = contentAfterBasenameIsTrivial ? tokenization.baseNameLength : candidateMixedcaseBytes.count var score = 0.0 var falseStarts = 0 var uppercaseMatches = 0 var uppercaseMismatches = 0 var anyCaseMatches = 0 var isPrefixUppercaseMatch = false do { var pidx = 0 for matchedRange in matchedRanges { for cidx in matchedRange { let candidateCharacter = candidateMixedcaseBytes[cidx] if cidx < leadingCaseMatchableCount { if candidateCharacter == patternMixedcaseBytes[pidx] { /// Check for case match. uppercaseMatches += candidateCharacter.isUppercase ? 1 : 0 isPrefixUppercaseMatch = isPrefixUppercaseMatch || (candidateCharacter.isUppercase && (cidx == 0)) anyCaseMatches += 1 } else { uppercaseMismatches += 1 } } pidx += 1 } } } var badShortMatches = 0 var incompletelyMatchedTokens = 0 var allRunsStartOnWordStartOrUppercaseLetter = true for range in matchedRanges { var position = range.lowerBound var remainingCharacters = range.length var matchedTokenPrefix = false repeat { let tokenIndex = tokenization.byteTokenAddresses[position].tokenIndex let tokenLength = tokenization.tokens[tokenIndex].length let positionInToken = tokenization.byteTokenAddresses[position].indexInToken let tokenCharactersRemaining = (tokenLength - positionInToken) let coveredCharacters = (remainingCharacters > tokenCharactersRemaining) ? tokenCharactersRemaining : remainingCharacters let coveredWholeToken = (coveredCharacters == tokenLength) incompletelyMatchedTokens += coveredWholeToken ? 0 : 1 let laterMatchesExist = (coveredCharacters < patternCharactersRemaining) let incompleteMatch = (!coveredWholeToken && laterMatchesExist) if incompleteMatch || (positionInToken != 0) { falseStarts += 1 } if incompleteMatch && (coveredCharacters <= 2) { badShortMatches += 1 } if positionInToken == 0 { matchedTokenPrefix = true } else if !candidateMixedcaseBytes[position].isUppercase { allRunsStartOnWordStartOrUppercaseLetter = false } patternCharactersRemaining -= coveredCharacters remainingCharacters -= coveredCharacters position += coveredCharacters } while (remainingCharacters > 0) if (range.length > 1) || matchedTokenPrefix { score += pow(Double(range.length), 1.5) } } // This is for cases like an autogenerated member-wise initializer of a huge struct matching everything. // If they only matched within the arguments, and it's a huge symbol, it's a false start. if (firstMatchedRange.lowerBound > tokenization.baseNameLength) && (candidateMixedcaseBytes.count > 256) { falseStarts += 1 score *= 0.75 } if (matchStyle == .acronym) { badShortMatches = 0 falseStarts = 0 } if matchedRanges.only?.length == candidateMixedcaseBytes.count { score *= candidate.contentType.fullMatchBonus } else if matchedRanges.only == 0.. 1) && (matchStyle != .acronym) { score /= 2 } } return TextScore(value: score, falseStarts: falseStarts) } else { return .noMatchScore } } private static func allocateSuccessiveRejectionFilters( lowercaseBytes: UTF8Bytes, allocator: inout UnsafeStackAllocator ) -> UnsafeStackArray { var filters = allocator.allocateUnsafeArray(of: RejectionFilter.self, maximumCapacity: lowercaseBytes.count) filters.initializeWithContainedGarbage() var idx = lowercaseBytes.count - 1 var accumulated = RejectionFilter.empty while idx >= 0 { accumulated.formUnion(lowercaseByte: lowercaseBytes[idx]) filters[idx] = accumulated idx -= 1 } return filters } } extension Pattern { package struct ByteTokenAddress { package var tokenIndex = 0 package var indexInToken = 0 } /// A token is a single conceptual name piece of an identifier, usually separated by camel case or underscores. /// /// For example `doFancyStuff` is divided into 3 tokens: `do`, `Fancy`, and `Stuff`. package struct Token { var storage: UInt init(length: Int, allUppercase: Bool) { let bit: UInt = (allUppercase ? 1 : 0) << (Int.bitWidth - 1) storage = UInt(bitPattern: length) | bit } package var length: Int { Int(bitPattern: UInt(storage & ~(1 << (Int.bitWidth - 1)))) } package var allUppercase: Bool { (storage & (1 << (Int.bitWidth - 1))) != 0 } } package struct Tokenization { package private(set) var baseNameLength: Int package private(set) var hasNonUppercaseNonDelimiterBytes: Bool package private(set) var tokens: UnsafeStackArray package private(set) var byteTokenAddresses: UnsafeStackArray var tokenCount: Int { tokens.count } var byteCount: Int { byteTokenAddresses.count } private enum CharacterClass: Equatable { case uppercase case delimiter case other } // `nonisolated(unsafe)` is fine because the underlying buffer is never mutated or deallocated. private nonisolated(unsafe) static let characterClasses: UnsafePointer = { let array: [CharacterClass] = (0...255).map { (character: UInt8) in if character.isDelimiter { return .delimiter } else if character.isUppercase { return .uppercase } else { return .other } } return UnsafeBufferPointer.allocate(copyOf: array).baseAddress! }() // name -> [name] // myName -> [my][Name] // File.h -> [File][.][h] // NSOpenGLView -> [NS][Open][GL][View] // NSURL -> [NSURL] private init(mixedcaseBytes: UTF8Bytes, contentType: ContentType, allocator: inout UnsafeStackAllocator) { let byteCount = mixedcaseBytes.count let maxTokenCount = byteCount let baseNameMatchesLast = (contentType.baseNameAffinity == .last) let baseNameSeparator = contentType.baseNameSeparator byteTokenAddresses = allocator.allocateUnsafeArray(of: ByteTokenAddress.self, maximumCapacity: byteCount) tokens = allocator.allocateUnsafeArray(of: Token.self, maximumCapacity: maxTokenCount) baseNameLength = -1 var baseNameLength: Int? = nil if byteCount > 1 { let mixedcaseBytes = mixedcaseBytes.baseAddress! let characterClasses = Self.characterClasses let endMixedCaseBytes = mixedcaseBytes + byteCount func characterClass(at pointer: UnsafePointer) -> CharacterClass { if pointer != endMixedCaseBytes { return characterClasses[Int(pointer.pointee)] } else { return .delimiter } } var previous = characterClass(at: mixedcaseBytes) var current = characterClass(at: mixedcaseBytes + 1) var token = (index: 0, length: 1, isAllUppercase: (previous == .uppercase)) hasNonUppercaseNonDelimiterBytes = (previous == .other) tokens.initializeWithContainedGarbage() byteTokenAddresses.initializeWithContainedGarbage() var nextByteTokenAddress = byteTokenAddresses.base nextByteTokenAddress.pointee = .init(tokenIndex: 0, indexInToken: 0) var nextBytePointer = mixedcaseBytes + 1 while nextBytePointer != endMixedCaseBytes { nextBytePointer += 1 let next = characterClass(at: nextBytePointer) let currentIsUppercase = (current == .uppercase) let tokenizeBeforeCurrentCharacter = (currentIsUppercase && ((previous == .other) || (next == .other))) || (current == .delimiter) || (previous == .delimiter) if tokenizeBeforeCurrentCharacter { let anyOtherCase = !(token.isAllUppercase || (previous == .delimiter)) hasNonUppercaseNonDelimiterBytes = hasNonUppercaseNonDelimiterBytes || anyOtherCase tokens[token.index] = .init(length: token.length, allUppercase: token.isAllUppercase) token.isAllUppercase = true token.length = 0 token.index += 1 let lookBack = nextBytePointer - 2 if lookBack.pointee == baseNameSeparator { if baseNameLength == nil || baseNameMatchesLast { baseNameLength = mixedcaseBytes.distance(to: lookBack) } } } token.isAllUppercase = token.isAllUppercase && currentIsUppercase nextByteTokenAddress += 1 nextByteTokenAddress.pointee.tokenIndex = token.index nextByteTokenAddress.pointee.indexInToken = token.length token.length += 1 previous = current current = next } let anyOtherCase = !(token.isAllUppercase || (previous == .delimiter)) hasNonUppercaseNonDelimiterBytes = hasNonUppercaseNonDelimiterBytes || anyOtherCase tokens[token.index] = .init(length: token.length, allUppercase: token.isAllUppercase) tokens.truncateLeavingGarbage(to: token.index + 1) } else if byteCount == 1 { let characterClass = Self.characterClasses[Int(mixedcaseBytes[0])] tokens.append(.init(length: 1, allUppercase: characterClass == .uppercase)) byteTokenAddresses.append(.init(tokenIndex: 0, indexInToken: 0)) hasNonUppercaseNonDelimiterBytes = (characterClass == .other) } else { hasNonUppercaseNonDelimiterBytes = false } self.baseNameLength = baseNameLength ?? byteCount } func enumerate(body: (Range) -> ()) { var position = 0 for token in tokens { body(position ..+ token.length) position += token.length } } func anySatisfy(predicate: (Range) -> Bool) -> Bool { var position = 0 for token in tokens { if predicate(position ..+ token.length) { return true } position += token.length } return false } package mutating func deallocate(allocator: inout UnsafeStackAllocator) { allocator.deallocate(&tokens) allocator.deallocate(&byteTokenAddresses) } package static func allocate( mixedcaseBytes: UTF8Bytes, contentType: ContentType, allocator: inout UnsafeStackAllocator ) -> Tokenization { Tokenization(mixedcaseBytes: mixedcaseBytes, contentType: contentType, allocator: &allocator) } var firstNonBaseNameTokenIndex: Int { (byteCount == baseNameLength) ? tokenCount : byteTokenAddresses[baseNameLength].tokenIndex } } } extension Pattern.UTF8Bytes { fileprivate func allocateLowercaseBytes(allocator: inout UnsafeStackAllocator) -> Self { let lowercaseBytes = allocator.allocateBuffer(of: UTF8Byte.self, count: count) for index in indices { lowercaseBytes[index] = self[index].lowercasedUTF8Byte } return UnsafeBufferPointer(lowercaseBytes) } } extension Pattern { fileprivate struct IndexedCandidate { var mixedcaseBytes: UTF8Bytes var lowercaseBytes: UTF8Bytes var contentType: ContentType var tokenization: Tokenization var matchedRanges: UnsafeStackArray var firstMatchingLowercaseByteIndex: Int? static func allocate( referencing mixedcaseBytes: UTF8Bytes, patternByteCount: Int, firstMatchingLowercaseByteIndex: Int?, contentType: ContentType, allocator: inout UnsafeStackAllocator ) -> Self { let lowercaseBytes = mixedcaseBytes.allocateLowercaseBytes(allocator: &allocator) let tokenization = Tokenization.allocate( mixedcaseBytes: mixedcaseBytes, contentType: contentType, allocator: &allocator ) let matchedRanges = allocator.allocateUnsafeArray(of: UTF8ByteRange.self, maximumCapacity: patternByteCount) return Self( mixedcaseBytes: mixedcaseBytes, lowercaseBytes: lowercaseBytes, contentType: contentType, tokenization: tokenization, matchedRanges: matchedRanges, firstMatchingLowercaseByteIndex: firstMatchingLowercaseByteIndex ) } mutating func deallocate(allocator: inout UnsafeStackAllocator) { allocator.deallocate(&matchedRanges) tokenization.deallocate(allocator: &allocator) allocator.deallocate(&lowercaseBytes) } /// Create a new stack array such that for every offset `i` in this pattern `result[i]` points to the start offset /// of the next token. If there are no more valid start locations `result[i]` points to one character after the end /// of the candidate. /// Valid start locations are the start of the next token that begins with a byte that's in the pattern. /// /// Examples: /// ------------------------------------------------------------------------------------------------------- /// Candidate | Pattern | Result /// ------------------------------------------------------------------------------------------------------- /// "doSomeWork" | "SWork" | `[2, 2, 6, 6, 6, 6, 10, 10, 10, 10]` /// "fn(one:two:three:four:)" | "tf" | `[7,7,7,7,7,7,7,11,11,11,11,17,17,17,17,17,17,23,23,23,23,23,23]` /// fileprivate func allocateNextSearchStarts( allocator: inout UnsafeStackAllocator, patternRejectionFilter: RejectionFilter ) -> UnsafeStackArray { var nextSearchStarts = allocator.allocateUnsafeArray(of: Int.self, maximumCapacity: mixedcaseBytes.count) var nextStart = mixedcaseBytes.count let byteTokenAddresses = tokenization.byteTokenAddresses nextSearchStarts.initializeWithContainedGarbage() for cidx in mixedcaseBytes.indices.reversed() { nextSearchStarts[cidx] = nextStart let isTokenStart = byteTokenAddresses[cidx].indexInToken == 0 if isTokenStart && patternRejectionFilter.contains(candidateByte: mixedcaseBytes[cidx]) == .maybe { nextStart = cidx } } return nextSearchStarts } var looksLikeAType: Bool { (tokenization.hasNonUppercaseNonDelimiterBytes && tokenization.baseNameLength == mixedcaseBytes.count) } } } // MARK: - Best Score Search - extension Pattern { private struct Location { var pattern: Int var candidate: Int func nextByMatching() -> Self { Location(pattern: pattern + 1, candidate: candidate + 1) } func nextBySkipping(validStartingLocations: UnsafeStackArray) -> Self { Location(pattern: pattern, candidate: validStartingLocations[candidate]) } static let start = Self(pattern: 0, candidate: 0) } private enum RestoredRange { case restore(UTF8ByteRange) case unwind case none func restore(ranges: inout UnsafeStackArray) { switch self { case .restore(let lastRange): ranges[ranges.count - 1] = lastRange case .unwind: ranges.removeLast() case .none: break } } } private struct Step { var location: Location var restoredRange: RestoredRange } private struct Context { var pattern: UTF8Bytes var candidate: UTF8Bytes var location = Location.start var patternBytesRemaining: Int { pattern.count - location.pattern } var candidateBytesRemaining: Int { candidate.count - location.candidate } var enoughCandidateBytesRemain: Bool { patternBytesRemaining <= candidateBytesRemaining } var isCompleteMatch: Bool { return patternBytesRemaining == 0 } var isCharacterMatch: Bool { pattern[location.pattern] == candidate[location.candidate] } } private struct Buffers { var steps: UnsafeStackArray var bestRangeSnapshot: UnsafeStackArray var validMatchStartingLocations: UnsafeStackArray /// For each byte the candidate's text, a rejection filter that contains all the characters occurring after or /// at that offset. /// /// This way when we have already matched the first 4 bytes, we can check which characters occur from byte 5 /// onwards and check that they appear in the candidate's remaining text. var candidateSuccessiveRejectionFilters: UnsafeStackArray static func allocate( patternLowercaseBytes: UTF8Bytes, candidate: IndexedCandidate, patternRejectionFilter: RejectionFilter, allocator: inout UnsafeStackAllocator ) -> Self { let steps = allocator.allocateUnsafeArray(of: Step.self, maximumCapacity: patternLowercaseBytes.count + 1) let bestRangeSnapshot = allocator.allocateUnsafeArray( of: UTF8ByteRange.self, maximumCapacity: patternLowercaseBytes.count ) let validMatchStartingLocations = candidate.allocateNextSearchStarts( allocator: &allocator, patternRejectionFilter: patternRejectionFilter ) let candidateSuccessiveRejectionFilters = Pattern.allocateSuccessiveRejectionFilters( lowercaseBytes: candidate.lowercaseBytes, allocator: &allocator ) return Buffers( steps: steps, bestRangeSnapshot: bestRangeSnapshot, validMatchStartingLocations: validMatchStartingLocations, candidateSuccessiveRejectionFilters: candidateSuccessiveRejectionFilters ) } mutating func deallocate(allocator: inout UnsafeStackAllocator) { allocator.deallocate(&candidateSuccessiveRejectionFilters) allocator.deallocate(&validMatchStartingLocations) allocator.deallocate(&bestRangeSnapshot) allocator.deallocate(&steps) } } private struct ExecutionLimit { private(set) var remainingCycles: Int mutating func permitCycle() -> Bool { if remainingCycles == 0 { return false } else { remainingCycles -= 1 return true } } } /// Exhaustively searches for ways to match the pattern to the candidate, scoring each one, and then returning the best score. /// If `captureMatchingRanges` is `true`, and a match is found, writes the matching ranges out to `matchedRangesStorage`. fileprivate func bestScore( candidate: inout IndexedCandidate, budget maxSteps: Int, captureMatchingRanges: Bool, allocator: inout UnsafeStackAllocator ) -> TextScore { var context = Context(pattern: patternLowercaseBytes, candidate: candidate.lowercaseBytes) var bestScore: TextScore? = nil // Find the best score by visiting all possible scorings. // So given the pattern "down" and candidate "documentDocument", enumerate these matches: // - [do]cumentDo[wn]load // - [d]ocumentD[own]load // - document[Down]load // Score each one, and choose the best. // // While recursion would be easier, use a manual stack because in practice we run out of stack space. // // Shortcuts: // * Stop if there are more character remaining to be matched than candidates to match. // * Keep a list of successive reject filters for the candidate and pattern, so that if the remaining pattern // characters are known not to exist in the rest of the candidate, we can stop early. // * Each time we step forward after a failed match in a candidate, move up to the next token, not next character. // * Impose a budget of several thousand iterations so that we back off if things are going to hit the exponential worst case if patternMixedcaseBytes.hasContent && context.enoughCandidateBytesRemain { var buffers = Buffers.allocate( patternLowercaseBytes: patternLowercaseBytes, candidate: candidate, patternRejectionFilter: patternRejectionFilter, allocator: &allocator ) defer { buffers.deallocate(allocator: &allocator) } var executionLimit = ExecutionLimit(remainingCycles: maxSteps) buffers.steps.push(Step(location: .start, restoredRange: .none)) while executionLimit.permitCycle(), let step = buffers.steps.popLast() { context.location = step.location step.restoredRange.restore(ranges: &candidate.matchedRanges) if context.isCompleteMatch { let newScore = singleScore(candidate: candidate, precision: .thorough, matchStyle: nil) accumulate( score: newScore, into: &bestScore, matchedRanges: candidate.matchedRanges, captureMatchingRanges: captureMatchingRanges, matchedRangesStorage: &buffers.bestRangeSnapshot ) } else if context.enoughCandidateBytesRemain { if RejectionFilter.match( pattern: patternSuccessiveRejectionFilters[context.location.pattern], candidate: buffers.candidateSuccessiveRejectionFilters[context.location.candidate] ) == .maybe { if context.isCharacterMatch { let extending = candidate.matchedRanges.last?.upperBound == context.location.candidate let restoredRange: RestoredRange if extending { let lastIndex = candidate.matchedRanges.count - 1 restoredRange = .restore(candidate.matchedRanges[lastIndex]) candidate.matchedRanges[lastIndex].extend(upperBoundBy: 1) } else { restoredRange = .unwind candidate.matchedRanges.append(context.location.candidate ..+ 1) } buffers.steps.push( Step( location: context.location.nextBySkipping( validStartingLocations: buffers.validMatchStartingLocations ), restoredRange: restoredRange ) ) buffers.steps.push(Step(location: context.location.nextByMatching(), restoredRange: .none)) } else { buffers.steps.push( Step( location: context.location.nextBySkipping( validStartingLocations: buffers.validMatchStartingLocations ), restoredRange: .none ) ) } } } } // We need to consider the special cases the `.fast` search scans for so that a `.fast` score can't be better than // a .`thorough` score. // For example, they could match 30 character that weren't on a token boundary, which we would have skipped above. // Or we could have ran out of time searching and missed a contiguous match. for matchStyle in MatchStyle.allCases { candidate.matchedRanges.removeAll() if populateMatchingRanges(&candidate, matchStyle: matchStyle) { let newScore = singleScore(candidate: candidate, precision: .thorough, matchStyle: matchStyle) accumulate( score: newScore, into: &bestScore, matchedRanges: candidate.matchedRanges, captureMatchingRanges: captureMatchingRanges, matchedRangesStorage: &buffers.bestRangeSnapshot ) } } candidate.matchedRanges.removeAll() candidate.matchedRanges.append(contentsOf: buffers.bestRangeSnapshot) } return bestScore ?? .noMatchScore } private func accumulate( score newScore: TextScore, into bestScore: inout TextScore?, matchedRanges: UnsafeStackArray, captureMatchingRanges: Bool, matchedRangesStorage bestMatchedRangesStorage: inout UnsafeStackArray ) { if newScore > (bestScore ?? .worstPossibleScore) { bestScore = newScore if captureMatchingRanges { bestMatchedRangesStorage.removeAll() bestMatchedRangesStorage.append(contentsOf: matchedRanges) } } } } extension Pattern { package func test_searchStart(candidate: Candidate, contentType: ContentType) -> [Int] { UnsafeStackAllocator.withUnsafeStackAllocator { allocator in var indexedCandidate = IndexedCandidate.allocate( referencing: candidate.bytes, patternByteCount: patternLowercaseBytes.count, firstMatchingLowercaseByteIndex: nil, contentType: contentType, allocator: &allocator ) defer { indexedCandidate.deallocate(allocator: &allocator) } var nextSearchStarts = indexedCandidate.allocateNextSearchStarts( allocator: &allocator, patternRejectionFilter: patternRejectionFilter ) defer { allocator.deallocate(&nextSearchStarts) } return Array(nextSearchStarts) } } package func testPerformance_tokenizing(batch: CandidateBatch, contentType: ContentType) -> Int { UnsafeStackAllocator.withUnsafeStackAllocator { allocator in var all = 0 batch.enumerate { candidate in var indexedCandidate = IndexedCandidate.allocate( referencing: candidate.bytes, patternByteCount: patternLowercaseBytes.count, firstMatchingLowercaseByteIndex: nil, contentType: contentType, allocator: &allocator ) defer { indexedCandidate.deallocate(allocator: &allocator) } all &= indexedCandidate.tokenization.tokenCount } return all } } } extension Candidate.ContentType { fileprivate var prefixMatchBonus: Double { switch self { case .codeCompletionSymbol: return 2.00 case .fileName, .projectSymbol: return 1.05 case .unknown: return 2.00 } } fileprivate var fullMatchBonus: Double { switch self { case .codeCompletionSymbol: return 1.00 case .fileName, .projectSymbol: return 1.50 case .unknown: return 1.00 } } fileprivate var fullBaseNameMatchBonus: Double { switch self { case .codeCompletionSymbol: return 1.00 case .fileName, .projectSymbol: return 1.50 case .unknown: return 1.00 } } fileprivate var baseNameAffinity: BaseNameAffinity { switch self { case .codeCompletionSymbol, .projectSymbol: return .first case .fileName: return .last case .unknown: return .last } } fileprivate var baseNameSeparator: UTF8Byte { switch self { case .codeCompletionSymbol, .projectSymbol: return .cLeftParentheses case .fileName: return .cPeriod case .unknown: return 0 } } fileprivate var isEligibleForAcronymMatch: Bool { switch self { case .codeCompletionSymbol, .fileName, .projectSymbol: return true case .unknown: return false } } fileprivate var acronymMatchAllowsMultiCharacterMatchesAfterBaseName: Bool { switch self { // You can't do a acronym match into function arguments case .codeCompletionSymbol, .projectSymbol, .unknown: return false case .fileName: return true } } fileprivate var acronymMatchMustBeInBaseName: Bool { switch self { // You can't do a acronym match into function arguments case .codeCompletionSymbol, .projectSymbol: return true case .fileName, .unknown: return false } } fileprivate var contentAfterBasenameIsTrivial: Bool { switch self { case .codeCompletionSymbol, .projectSymbol: return false case .fileName: return true case .unknown: return false } } fileprivate var isEligibleForTypeNameOverLocalVariableModifier: Bool { switch self { case .codeCompletionSymbol: return true case .fileName, .projectSymbol, .unknown: return false } } fileprivate enum BaseNameAffinity { case first case last } } extension Pattern { @available(*, deprecated, message: "Pass a contentType") package func score( candidate: UTF8Bytes, precision: Precision, captureMatchingRanges: Bool, ranges: inout [UTF8ByteRange] ) -> Double { score( candidate: candidate, contentType: .codeCompletionSymbol, precision: precision, captureMatchingRanges: captureMatchingRanges, ranges: &ranges ) } @available(*, deprecated, message: "Pass a contentType") package func score(candidate: UTF8Bytes, precision: Precision) -> Double { score(candidate: candidate, contentType: .codeCompletionSymbol, precision: precision) } }