mirror of
https://github.com/apple/swift.git
synced 2025-12-21 12:14:44 +01:00
Merge pull request #41417 from lorentey/the-horror-of-se-0180
[stdlib] Fix String indexing edge cases, anomalies & validation bugs
This commit is contained in:
@@ -287,11 +287,106 @@ extension _StringGuts {
|
||||
|
||||
@inlinable @inline(__always)
|
||||
internal var startIndex: String.Index {
|
||||
return Index(_encodedOffset: 0)._scalarAligned
|
||||
// The start index is always `Character` aligned.
|
||||
Index(_encodedOffset: 0)._characterAligned._encodingIndependent
|
||||
}
|
||||
|
||||
@inlinable @inline(__always)
|
||||
internal var endIndex: String.Index {
|
||||
return Index(_encodedOffset: self.count)._scalarAligned
|
||||
// The end index is always `Character` aligned.
|
||||
markEncoding(Index(_encodedOffset: self.count)._characterAligned)
|
||||
}
|
||||
}
|
||||
|
||||
// Encoding
|
||||
extension _StringGuts {
|
||||
/// Returns whether this string has a UTF-8 storage representation.
|
||||
///
|
||||
/// This always returns a value corresponding to the string's actual encoding.
|
||||
@_alwaysEmitIntoClient
|
||||
@inline(__always)
|
||||
internal var isUTF8: Bool { _object.isUTF8 }
|
||||
|
||||
/// Returns whether this string has a UTF-16 storage representation.
|
||||
///
|
||||
/// This always returns a value corresponding to the string's actual encoding.
|
||||
@_alwaysEmitIntoClient
|
||||
@inline(__always)
|
||||
internal var isUTF16: Bool { _object.isUTF16 }
|
||||
|
||||
@_alwaysEmitIntoClient // Swift 5.7
|
||||
@inline(__always)
|
||||
internal func markEncoding(_ i: String.Index) -> String.Index {
|
||||
isUTF8 ? i._knownUTF8 : i._knownUTF16
|
||||
}
|
||||
|
||||
/// Returns true if the encoding of the given index isn't known to be in
|
||||
/// conflict with this string's encoding.
|
||||
///
|
||||
/// If the index was created by code that was built on a stdlib below 5.7,
|
||||
/// then this check may incorrectly return true on a mismatching index, but it
|
||||
/// is guaranteed to never incorrectly return false. If all loaded binaries
|
||||
/// were built in 5.7+, then this method is guaranteed to always return the
|
||||
/// correct value.
|
||||
@_alwaysEmitIntoClient @inline(__always)
|
||||
internal func hasMatchingEncoding(_ i: String.Index) -> Bool {
|
||||
i._hasMatchingEncoding(isUTF8: isUTF8)
|
||||
}
|
||||
|
||||
/// Return an index whose encoding can be assumed to match that of `self`.
|
||||
///
|
||||
/// Detecting an encoding mismatch isn't always possible -- older binaries did
|
||||
/// not set the flags that this method relies on. However, false positives
|
||||
/// cannot happen: if this method detects a mismatch, then it is guaranteed to
|
||||
/// be a real one.
|
||||
@_alwaysEmitIntoClient
|
||||
@inline(__always)
|
||||
internal func ensureMatchingEncoding(_ i: String.Index) -> String.Index {
|
||||
if _fastPath(hasMatchingEncoding(i)) { return i }
|
||||
return _slowEnsureMatchingEncoding(i)
|
||||
}
|
||||
|
||||
@_alwaysEmitIntoClient
|
||||
@inline(never)
|
||||
@_effects(releasenone)
|
||||
internal func _slowEnsureMatchingEncoding(_ i: String.Index) -> String.Index {
|
||||
guard isUTF8 else {
|
||||
// Attempt to use an UTF-8 index on a UTF-16 string. Strings don't usually
|
||||
// get converted to UTF-16 storage, so it seems okay to trap in this case
|
||||
// -- the index most likely comes from an unrelated string. (Trapping here
|
||||
// may still turn out to affect binary compatibility with broken code in
|
||||
// existing binaries running with new stdlibs. If so, we can replace this
|
||||
// with the same transcoding hack as in the UTF-16->8 case below.)
|
||||
//
|
||||
// Note that this trap is not guaranteed to trigger when the process
|
||||
// includes client binaries compiled with a previous Swift release.
|
||||
// (`i._canBeUTF16` can sometimes return true in that case even if the
|
||||
// index actually came from an UTF-8 string.) However, the trap will still
|
||||
// often trigger in this case, as long as the index was initialized by
|
||||
// code that was compiled with 5.7+.
|
||||
//
|
||||
// This trap can never trigger on OSes that have stdlibs <= 5.6, because
|
||||
// those versions never set the `isKnownUTF16` flag in `_StringObject`.
|
||||
_preconditionFailure("Invalid string index")
|
||||
}
|
||||
// Attempt to use an UTF-16 index on a UTF-8 string.
|
||||
//
|
||||
// This can happen if `self` was originally verbatim-bridged, and someone
|
||||
// mistakenly attempts to keep using an old index after a mutation. This is
|
||||
// technically an error, but trapping here would trigger a lot of broken
|
||||
// code that previously happened to work "fine" on e.g. ASCII strings.
|
||||
// Instead, attempt to convert the offset to UTF-8 code units by transcoding
|
||||
// the string. This can be slow, but it often results in a usable index,
|
||||
// even if non-ASCII characters are present. (UTF-16 breadcrumbs help reduce
|
||||
// the severity of the slowdown.)
|
||||
|
||||
// FIXME: Consider emitting a runtime warning here.
|
||||
// FIXME: Consider performing a linked-on-or-after check & trapping if the
|
||||
// client executable was built on some particular future Swift release.
|
||||
let utf16 = String(self).utf16
|
||||
let base = utf16.index(utf16.startIndex, offsetBy: i._encodedOffset)
|
||||
if i.transcodedOffset == 0 { return base }
|
||||
return base.encoded(offsetBy: i.transcodedOffset)._knownUTF8
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user