mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
Assign some previously reserved bits in String.Index and _StringObject to keep track of their associated storage encoding (either UTF-8 or UTF-16). None of these bits will be reliably set in processes that load binaries compiled with older stdlib releases, but when they do end up getting set, we can use them opportunistically to more reliably detect cases where an index is applied on a string with a mismatching encoding. As more and more code gets recompiled with 5.7+, the stdlib will gradually become able to detect such issues with complete accuracy. Code that misuses indices this way was always considered broken; however, String wasn’t able to reliably detect these runtime errors before. As a result, I expect there is a large amount of broken code out there that keeps using bridged Cocoa String indices (UTF-16) after a mutation turns them into native UTF-8 strings. For that reason, instead of trapping, this commit silently corrects the issue, transcoding the offsets into the correct encoding. It would probably be a good idea to also emit a runtime warning in addition to recovering from the error. This would generate some noise that would gently nudge folks to fix their code. rdar://89369680
392 lines
14 KiB
Swift
392 lines
14 KiB
Swift
//===--- StringCharacterView.swift - String's Collection of Characters ----===//
|
|
//
|
|
// This source file is part of the Swift.org open source project
|
|
//
|
|
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
|
|
// Licensed under Apache License v2.0 with Runtime Library Exception
|
|
//
|
|
// See https://swift.org/LICENSE.txt for license information
|
|
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// String is a BidirectionalCollection of Characters; this file implements
// that conformance along with the character iterator.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// FIXME(ABI)#70 : The character string view should have a custom iterator type
|
|
// to allow performance optimizations of linear traversals.
|
|
|
|
import SwiftShims
|
|
|
|
extension String: BidirectionalCollection {
|
|
public typealias SubSequence = Substring
|
|
public typealias Element = Character
|
|
|
|
/// The position of the first character in a nonempty string.
///
/// For an empty string, `startIndex` and `endIndex` are equal.
@inlinable @inline(__always)
public var startIndex: Index {
  _guts.startIndex
}
|
|
|
|
/// A string's "past the end" position---that is, the position one greater
/// than the last valid subscript argument.
///
/// For an empty string, `endIndex` and `startIndex` are equal.
@inlinable @inline(__always)
public var endIndex: Index {
  _guts.endIndex
}
|
|
|
|
/// The number of characters in a string.
///
/// The value is obtained by measuring the distance from `startIndex` to
/// `endIndex`.
@inline(__always)
public var count: Int {
  distance(from: startIndex, to: endIndex)
}
|
|
|
|
/// Returns the position immediately after the given index.
///
/// The index is first converted to this string's encoding if necessary,
/// bounds checked, and rounded down to a scalar boundary before advancing.
///
/// - Parameter i: A valid index of the collection. `i` must be less than
///   `endIndex`.
/// - Returns: The index value immediately after `i`.
public func index(after i: Index) -> Index {
  let position = _guts.ensureMatchingEncoding(i)
  _precondition(position < endIndex, "String index is out of bounds")
  let aligned = _guts.scalarAlign(position)
  return _guts.markEncoding(_uncheckedIndex(after: aligned))
}
|
|
|
|
/// A variant of `index(after:)` that performs no validation. The caller must
/// guarantee that the given index:
///
/// - has the right encoding,
/// - is within bounds, and
/// - is scalar aligned.
///
/// The encoding of the returned index is not marked.
internal func _uncheckedIndex(after i: Index) -> Index {
  // FIXME: Unlike `index(before:)`, this function may return incorrect
  // results if `i` isn't on a grapheme cluster boundary. (The grapheme
  // breaking algorithm assumes we start on a break when we go forward.)
  _internalInvariant(_guts.hasMatchingEncoding(i))
  _internalInvariant(i < endIndex)
  _internalInvariant(i._isScalarAligned)

  // TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc.

  // Step over the character that begins at `i`.
  let targetOffset = i._encodedOffset &+ _characterStride(startingAt: i)

  // Measure the character that begins at the new position, so that its
  // stride can be handed to the resulting index.
  let target = Index(_encodedOffset: targetOffset)._scalarAligned
  let targetStride = _characterStride(startingAt: target)

  return Index(
    encodedOffset: targetOffset, characterStride: targetStride
  )._scalarAligned
}
|
|
|
|
/// Returns the position immediately before the given index.
///
/// The index is first converted to this string's encoding if necessary and
/// rounded down to a scalar boundary before stepping backwards.
///
/// - Parameter i: A valid index of the collection. `i` must be greater than
///   `startIndex`.
/// - Returns: The index value immediately before `i`.
public func index(before i: Index) -> Index {
  // A single mutable local is used (as in `index(_:offsetBy:)`) rather than
  // declaring `let i` twice in the same scope.
  var i = _guts.ensureMatchingEncoding(i)

  // Note: bounds checking in `index(before:)` is tricky as scalar aligning an
  // index may need to access storage, but it may also move it closer towards
  // the `startIndex`. Therefore, we must check against the `endIndex` before
  // aligning, but we need to delay the `i > startIndex` check until after.
  _precondition(i <= endIndex, "String index is out of bounds")
  i = _guts.scalarAlign(i)
  _precondition(i > startIndex, "String index is out of bounds")

  // `i` is already scalar aligned at this point, so it is passed on directly.
  let r = _uncheckedIndex(before: i)
  return _guts.markEncoding(r)
}
|
|
|
|
/// A variant of `index(before:)` that performs no validation. The caller must
/// guarantee that the given index:
///
/// - has the right encoding,
/// - is within bounds, and
/// - is scalar aligned.
///
/// The encoding of the returned index is not marked.
internal func _uncheckedIndex(before i: Index) -> Index {
  _internalInvariant(_guts.hasMatchingEncoding(i))
  _internalInvariant(i > startIndex && i <= endIndex)
  _internalInvariant(i._isScalarAligned)

  // TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc.

  // The stride of the character ending at `i` is both the amount to step
  // back by and the stride handed to the resulting index.
  let length = _characterStride(endingAt: i)
  let previous = Index(
    encodedOffset: i._encodedOffset &- length, characterStride: length)
  return previous._scalarAligned
}
|
|
|
|
/// Returns an index that is the specified distance from the given index.
///
/// The following example obtains an index advanced four positions from a
/// string's starting index and then prints the character at that position.
///
///     let s = "Swift"
///     let i = s.index(s.startIndex, offsetBy: 4)
///     print(s[i])
///     // Prints "t"
///
/// The value passed as `distance` must not offset `i` beyond the bounds of
/// the collection.
///
/// - Parameters:
///   - i: A valid index of the collection.
///   - distance: The distance to offset `i`.
/// - Returns: An index offset by `distance` from the index `i`. If
///   `distance` is positive, this is the same value as the result of
///   `distance` calls to `index(after:)`. If `distance` is negative, this
///   is the same value as the result of `abs(distance)` calls to
///   `index(before:)`.
/// - Complexity: O(*n*), where *n* is the absolute value of `distance`.
public func index(_ i: Index, offsetBy distance: Int) -> Index {
  // Note: in Swift 5.6 and below, this method used to be inlinable,
  // forwarding to `_index(_:offsetBy:)`.

  // TODO: known-ASCII and single-scalar-grapheme fast path, etc.

  let validated = _guts.ensureMatchingEncoding(i)
  _precondition(validated >= startIndex && validated <= endIndex,
    "String index is out of bounds")
  var current = _guts.scalarAlign(validated)

  // Count the remaining steps towards zero; at most one of the two loops
  // below executes, depending on the sign of `distance`.
  var remaining = distance
  while remaining > 0 {
    _precondition(current < endIndex, "String index is out of bounds")
    current = _uncheckedIndex(after: current)
    remaining -= 1
  }
  while remaining < 0 {
    _precondition(current > startIndex, "String index is out of bounds")
    current = _uncheckedIndex(before: current)
    remaining += 1
  }
  return _guts.markEncoding(current)
}
|
|
|
|
/// Returns an index that is the specified distance from the given index,
/// unless that distance is beyond a given limiting index.
///
/// The following example obtains an index advanced four positions from a
/// string's starting index and then prints the character at that position.
/// The operation doesn't require going beyond the limiting `s.endIndex`
/// value, so it succeeds.
///
///     let s = "Swift"
///     if let i = s.index(s.startIndex, offsetBy: 4, limitedBy: s.endIndex) {
///         print(s[i])
///     }
///     // Prints "t"
///
/// The next example attempts to retrieve an index six positions from
/// `s.startIndex` but fails, because that distance is beyond the index
/// passed as `limit`.
///
///     let j = s.index(s.startIndex, offsetBy: 6, limitedBy: s.endIndex)
///     print(j)
///     // Prints "nil"
///
/// The value passed as `distance` must not offset `i` beyond the bounds of
/// the collection, unless the index passed as `limit` prevents offsetting
/// beyond those bounds.
///
/// - Parameters:
///   - i: A valid index of the collection.
///   - distance: The distance to offset `i`.
///   - limit: A valid index of the collection to use as a limit. If
///     `distance > 0`, a limit that is less than `i` has no effect.
///     Likewise, if `distance < 0`, a limit that is greater than `i` has no
///     effect.
/// - Returns: An index offset by `distance` from the index `i`, unless that
///   index would be beyond `limit` in the direction of movement. In that
///   case, the method returns `nil`.
///
/// - Complexity: O(*n*), where *n* is the absolute value of `distance`.
public func index(
  _ i: Index, offsetBy distance: Int, limitedBy limit: Index
) -> Index? {
  // Note: In Swift 5.6 and below, this function used to be inlinable,
  // forwarding to `BidirectionalCollection._index(_:offsetBy:limitedBy:)`.
  // Unfortunately, that approach isn't compatible with SE-0180, as it doesn't
  // support cases where `i` or `limit` aren't character aligned.

  // TODO: known-ASCII and single-scalar-grapheme fast path, etc.

  // Per SE-0180, `i` and `limit` are allowed to fall in between grapheme
  // breaks, in which case this function must still terminate without trapping
  // and return a result that makes sense.

  // Note: `limit` is intentionally not scalar aligned to ensure our behavior
  // exactly matches the documentation above.
  let limit = _guts.ensureMatchingEncoding(limit)

  let validated = _guts.ensureMatchingEncoding(i)
  _precondition(validated >= startIndex && validated <= endIndex,
    "String index is out of bounds")
  let start = _guts.scalarAlign(validated)

  var current = start
  var remaining = distance

  if distance >= 0 {
    // A limit lying before the starting position has no effect when moving
    // forward.
    let limitApplies = limit >= start
    while remaining > 0 {
      if limitApplies && current >= limit { return nil }
      _precondition(current < endIndex, "String index is out of bounds")
      current = _uncheckedIndex(after: current)
      remaining -= 1
    }
    // The final step may have moved past a limit that isn't character
    // aligned.
    if limitApplies && current > limit { return nil }
  } else {
    // A limit lying after the starting position has no effect when moving
    // backward.
    let limitApplies = limit <= start
    while remaining < 0 {
      if limitApplies && current <= limit { return nil }
      _precondition(current > startIndex, "String index is out of bounds")
      current = _uncheckedIndex(before: current)
      remaining += 1
    }
    // The final step may have moved past a limit that isn't character
    // aligned.
    if limitApplies && current < limit { return nil }
  }
  return _guts.markEncoding(current)
}
|
|
|
|
/// Returns the distance between two indices.
///
/// - Parameters:
///   - start: A valid index of the collection.
///   - end: Another valid index of the collection. If `end` is equal to
///     `start`, the result is zero.
/// - Returns: The distance between `start` and `end`.
///
/// - Complexity: O(*n*), where *n* is the resulting distance.
public func distance(from start: Index, to end: Index) -> Int {
  // Note: In Swift 5.6 and below, this function used to be inlinable,
  // forwarding to `BidirectionalCollection._distance(from:to:)`.

  // FIXME: Due to the `index(after:)` problem above, this function doesn't
  // always return consistent results when the given indices fall between
  // grapheme breaks -- swapping `start` and `end` may change the magnitude of
  // the result.

  let checkedStart = _guts.ensureMatchingEncoding(start)
  let checkedEnd = _guts.ensureMatchingEncoding(end)

  _precondition(
    checkedStart >= startIndex && checkedStart <= endIndex &&
    checkedEnd >= startIndex && checkedEnd <= endIndex,
    "String index is out of bounds")

  var cursor = _guts.scalarAlign(checkedStart)
  let target = _guts.scalarAlign(checkedEnd)

  // TODO: known-ASCII and single-scalar-grapheme fast path, etc.

  // Per SE-0180, `start` and `end` are allowed to fall in between grapheme
  // breaks, in which case this function must still terminate without trapping
  // and return a result that makes sense.

  var steps = 0
  if cursor < target {
    // Note: the loop condition is `<` rather than `!=`.
    repeat {
      steps += 1
      cursor = _uncheckedIndex(after: cursor)
    } while cursor < target
  } else if cursor > target {
    // Note: the loop condition is `>` rather than `!=`.
    repeat {
      steps -= 1
      cursor = _uncheckedIndex(before: cursor)
    } while cursor > target
  }
  return steps
}
|
|
|
|
/// Accesses the character at the given position.
///
/// You can use the same indices for subscripting a string and its substring.
/// For example, this code finds the first letter after the first space:
///
///     let str = "Greetings, friend! How are you?"
///     let firstSpace = str.firstIndex(of: " ") ?? str.endIndex
///     let substr = str[firstSpace...]
///     if let nextCapital = substr.firstIndex(where: { $0 >= "A" && $0 <= "Z" }) {
///         print("Capital after a space: \(str[nextCapital])")
///     }
///     // Prints "Capital after a space: H"
///
/// - Parameter i: A valid index of the string. `i` must be less than the
///   string's end index.
@inlinable @inline(__always)
// FIXME(lorentey): Consider removing these attributes. If `index(after:)`
// isn't inlinable, does it really matter if this one is? (Potential
// _guts-related optimizations notwithstanding.) `subscript` being inlinable
// forces a bunch of new additions to be _aEIC, even though they ought to be
// internal.
public subscript(i: Index) -> Character {
  let checked = _guts.ensureMatchingEncoding(i)
  _boundsCheck(checked)
  let aligned = _guts.scalarAlign(checked)

  // The character occupies the offsets from `lower` up to `upper`.
  let length = _characterStride(startingAt: aligned)
  let lower = aligned._encodedOffset
  let upper = lower &+ length
  return _guts.errorCorrectedCharacter(startingAt: lower, endingAt: upper)
}
|
|
|
|
/// Returns the stride of the character starting at the given scalar-aligned
/// index: the stride stored in the index if it has one, zero at `endIndex`,
/// and otherwise the value measured by `_guts`.
@inlinable @inline(__always)
internal func _characterStride(startingAt i: Index) -> Int {
  _internalInvariant_5_1(i._isScalarAligned)

  // Fast check if it's already been measured, otherwise check resiliently
  if let measured = i.characterStride {
    return measured
  }

  guard i != endIndex else { return 0 }

  return _guts._opaqueCharacterStride(startingAt: i._encodedOffset)
}
|
|
|
|
/// Returns the stride of the character ending at the given scalar-aligned
/// index: zero at `startIndex`, and otherwise the value measured by `_guts`.
@inlinable @inline(__always)
internal func _characterStride(endingAt i: Index) -> Int {
  _internalInvariant_5_1(i._isScalarAligned)

  guard i != startIndex else { return 0 }

  return _guts._opaqueCharacterStride(endingAt: i._encodedOffset)
}
|
|
}
|
|
|
|
extension String {
|
|
/// An iterator that produces the characters of a string, one at a time.
@frozen
public struct Iterator: IteratorProtocol, Sendable {
  // The storage being iterated.
  @usableFromInline
  internal var _guts: _StringGuts

  // Offset at which the next character to be returned starts.
  @usableFromInline
  internal var _position: Int = 0

  // Offset at which iteration stops; initialized from `guts.count`.
  @usableFromInline
  internal var _end: Int

  @inlinable
  internal init(_ guts: _StringGuts) {
    self._guts = guts
    self._end = guts.count
  }

  /// Returns the next character and advances past it, or returns `nil` once
  /// the end offset has been reached.
  @inlinable
  public mutating func next() -> Character? {
    guard _fastPath(_position < _end) else { return nil }

    let lower = _position
    let upper = lower &+ _guts._opaqueCharacterStride(startingAt: lower)
    // Advance only after the character has been produced.
    defer { _position = upper }
    return _guts.errorCorrectedCharacter(startingAt: lower, endingAt: upper)
  }
}
|
|
|
|
/// Returns an iterator over the characters of this string.
@inlinable
public __consuming func makeIterator() -> Iterator {
  Iterator(_guts)
}
|
|
}
|
|
|