Merge pull request #41417 from lorentey/the-horror-of-se-0180

[stdlib] Fix String indexing edge cases, anomalies & validation bugs
This commit is contained in:
Karoy Lorentey
2022-04-14 14:08:53 -07:00
committed by GitHub
25 changed files with 3400 additions and 761 deletions

View File

@@ -287,11 +287,106 @@ extension _StringGuts {
@inlinable @inline(__always)
internal var startIndex: String.Index {
return Index(_encodedOffset: 0)._scalarAligned
// The start index is always `Character` aligned.
Index(_encodedOffset: 0)._characterAligned._encodingIndependent
}
@inlinable @inline(__always)
internal var endIndex: String.Index {
return Index(_encodedOffset: self.count)._scalarAligned
// The end index is always `Character` aligned.
markEncoding(Index(_encodedOffset: self.count)._characterAligned)
}
}
// Encoding
extension _StringGuts {
/// Returns whether this string has a UTF-8 storage representation.
///
/// This always returns a value corresponding to the string's actual encoding.
@_alwaysEmitIntoClient
@inline(__always)
internal var isUTF8: Bool { _object.isUTF8 }
/// Returns whether this string has a UTF-16 storage representation.
///
/// This always returns a value corresponding to the string's actual encoding.
@_alwaysEmitIntoClient
@inline(__always)
internal var isUTF16: Bool { _object.isUTF16 }
@_alwaysEmitIntoClient // Swift 5.7
@inline(__always)
internal func markEncoding(_ i: String.Index) -> String.Index {
isUTF8 ? i._knownUTF8 : i._knownUTF16
}
/// Returns true if the encoding of the given index isn't known to be in
/// conflict with this string's encoding.
///
/// If the index was created by code that was built on a stdlib below 5.7,
/// then this check may incorrectly return true on a mismatching index, but it
/// is guaranteed to never incorrectly return false. If all loaded binaries
/// were built in 5.7+, then this method is guaranteed to always return the
/// correct value.
@_alwaysEmitIntoClient @inline(__always)
internal func hasMatchingEncoding(_ i: String.Index) -> Bool {
i._hasMatchingEncoding(isUTF8: isUTF8)
}
/// Return an index whose encoding can be assumed to match that of `self`.
///
/// Detecting an encoding mismatch isn't always possible -- older binaries did
/// not set the flags that this method relies on. However, false positives
/// cannot happen: if this method detects a mismatch, then it is guaranteed to
/// be a real one.
@_alwaysEmitIntoClient
@inline(__always)
internal func ensureMatchingEncoding(_ i: String.Index) -> String.Index {
if _fastPath(hasMatchingEncoding(i)) { return i }
return _slowEnsureMatchingEncoding(i)
}
@_alwaysEmitIntoClient
@inline(never)
@_effects(releasenone)
internal func _slowEnsureMatchingEncoding(_ i: String.Index) -> String.Index {
guard isUTF8 else {
// Attempt to use an UTF-8 index on a UTF-16 string. Strings don't usually
// get converted to UTF-16 storage, so it seems okay to trap in this case
// -- the index most likely comes from an unrelated string. (Trapping here
// may still turn out to affect binary compatibility with broken code in
// existing binaries running with new stdlibs. If so, we can replace this
// with the same transcoding hack as in the UTF-16->8 case below.)
//
// Note that this trap is not guaranteed to trigger when the process
// includes client binaries compiled with a previous Swift release.
// (`i._canBeUTF16` can sometimes return true in that case even if the
// index actually came from an UTF-8 string.) However, the trap will still
// often trigger in this case, as long as the index was initialized by
// code that was compiled with 5.7+.
//
// This trap can never trigger on OSes that have stdlibs <= 5.6, because
// those versions never set the `isKnownUTF16` flag in `_StringObject`.
_preconditionFailure("Invalid string index")
}
// Attempt to use an UTF-16 index on a UTF-8 string.
//
// This can happen if `self` was originally verbatim-bridged, and someone
// mistakenly attempts to keep using an old index after a mutation. This is
// technically an error, but trapping here would trigger a lot of broken
// code that previously happened to work "fine" on e.g. ASCII strings.
// Instead, attempt to convert the offset to UTF-8 code units by transcoding
// the string. This can be slow, but it often results in a usable index,
// even if non-ASCII characters are present. (UTF-16 breadcrumbs help reduce
// the severity of the slowdown.)
// FIXME: Consider emitting a runtime warning here.
// FIXME: Consider performing a linked-on-or-after check & trapping if the
// client executable was built on some particular future Swift release.
let utf16 = String(self).utf16
let base = utf16.index(utf16.startIndex, offsetBy: i._encodedOffset)
if i.transcodedOffset == 0 { return base }
return base.encoded(offsetBy: i.transcodedOffset)._knownUTF8
}
}