Files
swift-mirror/stdlib/public/core/StringGutsSlice.swift
Karoy Lorentey 6e18955f90 [stdlib] Add bookkeeping to keep track of the encoding of strings and indices
Assign some previously reserved bits in String.Index and _StringObject to keep track of their associated storage encoding (either UTF-8 or UTF-16).

None of these bits will be reliably set in processes that load binaries compiled with older stdlib releases, but when they do end up getting set, we can use them opportunistically to more reliably detect cases where an index is applied on a string with a mismatching encoding.

As more and more code gets recompiled with 5.7+, the stdlib will gradually become able to detect such issues with complete accuracy.

Code that misuses indices this way was always considered broken; however, String wasn’t able to reliably detect these runtime errors before. As a result, I expect there is a large amount of broken code out there that keeps using indices obtained from bridged Cocoa strings (UTF-16) after a mutation has turned those strings into native UTF-8 strings. Therefore, instead of trapping, this commit silently corrects the issue by transcoding the offsets into the correct encoding.

It would probably be a good idea to also emit a runtime warning in addition to recovering from the error. This would generate some noise that would gently nudge folks to fix their code.

rdar://89369680
2022-03-24 20:59:59 -07:00

106 lines
3.0 KiB
Swift

//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2018 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
// TODO(String performance): Unfortunately, this slice struct seems to add
// overhead. We may want to wean ourselves off of this and have all users just
// also store a range.
// A `_StringGuts` value paired with a range of encoded offsets into it. Gives
// String and Substring one shared representation for comparison, hashing, and
// RRC.
internal struct _StringGutsSlice {
  // The guts that this slice views.
  internal var _guts: _StringGuts

  // The viewed region of `_guts`, expressed as encoded offsets.
  internal var _offsetRange: Range<Int>

  // Creates a slice that spans all of `guts`, i.e. offsets `0..<guts.count`.
  @inline(__always)
  internal init(_ guts: _StringGuts) {
    _guts = guts
    _offsetRange = 0 ..< guts.count
  }

  // Creates a slice of `guts` restricted to `offsetRange`.
  //
  // Both bounds are expected to lie on Unicode scalar boundaries; this is
  // checked through `_internalInvariant`.
  @inline(__always)
  internal init(_ guts: _StringGuts, _ offsetRange: Range<Int>) {
    _internalInvariant(
      guts.isOnUnicodeScalarBoundary(offsetRange.lowerBound)
      && guts.isOnUnicodeScalarBoundary(offsetRange.upperBound))
    _offsetRange = offsetRange
    _guts = guts
  }

  // Offset at which the slice begins (lower bound of `_offsetRange`).
  @inlinable
  internal var start: Int {
    @inline(__always) get { _offsetRange.lowerBound }
  }

  // Offset at which the slice ends (upper bound of `_offsetRange`).
  @inlinable
  internal var end: Int {
    @inline(__always) get { _offsetRange.upperBound }
  }

  // Number of offsets covered by `_offsetRange`.
  @inlinable
  internal var count: Int {
    @inline(__always) get { _offsetRange.count }
  }

  // Forwarded from the underlying guts; describes `_guts` as a whole rather
  // than only the sliced region.
  @inlinable
  internal var isNFCFastUTF8: Bool {
    @inline(__always) get { _guts.isNFCFastUTF8 }
  }

  // Forwarded from the underlying guts; describes `_guts` as a whole rather
  // than only the sliced region.
  @inlinable
  internal var isASCII: Bool {
    @inline(__always) get { _guts.isASCII }
  }

  // Forwarded from the underlying guts.
  @inlinable
  internal var isFastUTF8: Bool {
    @inline(__always) get { _guts.isFastUTF8 }
  }

  // The slice's length in UTF-8 code units.
  //
  // When the guts report fast UTF-8, the length of `_offsetRange` is returned
  // directly. Otherwise the count is obtained from the UTF-8 view of a
  // Substring built from this slice.
  internal var utf8Count: Int {
    @inline(__always) get {
      guard _fastPath(self.isFastUTF8) else {
        return Substring(self).utf8.count
      }
      return _offsetRange.count
    }
  }

  // The slice's bounds as `String.Index` values. Each index is created from
  // the corresponding encoded offset and marked as scalar aligned.
  @inlinable
  internal var range: Range<String.Index> {
    @inline(__always) get {
      let lowerIndex =
        String.Index(_encodedOffset: _offsetRange.lowerBound)._scalarAligned
      let upperIndex =
        String.Index(_encodedOffset: _offsetRange.upperBound)._scalarAligned
      return Range(_uncheckedBounds: (lowerIndex, upperIndex))
    }
  }

  // Invokes `body` through the guts' `withFastUTF8(range:_:)`, passing this
  // slice's offset range, and returns (or rethrows) whatever `body` produces.
  @inline(__always)
  internal func withFastUTF8<R>(
    _ body: (UnsafeBufferPointer<UInt8>) throws -> R
  ) rethrows -> R {
    return try _guts.withFastUTF8(range: _offsetRange, body)
  }

  // Decodes the scalar starting at `index` with the guts' error-correcting
  // decoder. If the decoded scalar would extend past this slice's upper
  // bound, the replacement character with a length of 1 is returned instead.
  @_effects(releasenone)
  internal func foreignErrorCorrectedScalar(
    startingAt index: String.Index
  ) -> (Unicode.Scalar, scalarLength: Int) {
    let (decodedScalar, decodedLength) =
      _guts.foreignErrorCorrectedScalar(startingAt: index)
    let scalarEnd = index.encoded(offsetBy: decodedLength)
    if _slowPath(scalarEnd > range.upperBound) {
      // The scalar straddles the end of the slice.
      return (Unicode.Scalar._replacementCharacter, 1)
    }
    return (decodedScalar, decodedLength)
  }
}