mirror of
https://github.com/apple/swift.git
synced 2025-12-21 12:14:44 +01:00
[String] Scalar-alignment bug fixes.
Fixes a general category (pun intended) of scalar-alignment bugs
surrounding exchanging non-scalar-aligned indices between views and
for slicing.

SE-0180 unifies the Index type of String and all its views and allows
non-scalar-aligned indices to be used across views. In order to
guarantee behavior, we often have to check and perform scalar
alignment. To speed up these checks, we allocate a bit denoting
known-to-be-aligned, so that the alignment check can skip the load.

The table below shows which views need to check for alignment before
they can operate, and whether the indices they produce are aligned.

┌───────────────╥────────────────────┬──────────────────────────┐
│ View          ║ Requires Alignment │ Produces Aligned Indices │
╞═══════════════╬════════════════════╪══════════════════════════╡
│ Native UTF8   ║ no                 │ no                       │
├───────────────╫────────────────────┼──────────────────────────┤
│ Native UTF16  ║ yes                │ no                       │
╞═══════════════╬════════════════════╪══════════════════════════╡
│ Foreign UTF8  ║ yes                │ no                       │
├───────────────╫────────────────────┼──────────────────────────┤
│ Foreign UTF16 ║ no                 │ no                       │
╞═══════════════╬════════════════════╪══════════════════════════╡
│ UnicodeScalar ║ yes                │ yes                      │
├───────────────╫────────────────────┼──────────────────────────┤
│ Character     ║ yes                │ yes                      │
└───────────────╨────────────────────┴──────────────────────────┘

The "requires alignment" column applies to any operation taking a
String.Index that's not defined entirely in terms of other operations
taking a String.Index. These include:

* index(after:)
* index(before:)
* subscript
* distance(from:to:) (since `to` is compared against directly)
* UTF16View._nativeGetOffset(for:)
This commit is contained in:
@@ -139,40 +139,46 @@ extension String.UTF16View: BidirectionalCollection {
|
||||
public var endIndex: Index { return _guts.endIndex }
|
||||
|
||||
@inlinable @inline(__always)
|
||||
public func index(after i: Index) -> Index {
|
||||
if _slowPath(_guts.isForeign) { return _foreignIndex(after: i) }
|
||||
if _guts.isASCII { return i.nextEncoded }
|
||||
public func index(after idx: Index) -> Index {
|
||||
if _slowPath(_guts.isForeign) { return _foreignIndex(after: idx) }
|
||||
if _guts.isASCII { return idx.nextEncoded }
|
||||
|
||||
// For a BMP scalar (1-3 UTF-8 code units), advance past it. For a non-BMP
|
||||
// scalar, use a transcoded offset first.
|
||||
let len = _guts.fastUTF8ScalarLength(startingAt: i._encodedOffset)
|
||||
if len == 4 && i.transcodedOffset == 0 {
|
||||
return i.nextTranscoded
|
||||
|
||||
// TODO: do the ugly if non-transcoded make sure to scalar align thing...
|
||||
// Also, can we just jump ahead 4 is transcoded is 1?
|
||||
|
||||
let idx = _utf16AlignNativeIndex(idx)
|
||||
let len = _guts.fastUTF8ScalarLength(startingAt: idx._encodedOffset)
|
||||
if len == 4 && idx.transcodedOffset == 0 {
|
||||
return idx.nextTranscoded
|
||||
}
|
||||
return i.strippingTranscoding.encoded(offsetBy: len)
|
||||
return idx.strippingTranscoding.encoded(offsetBy: len).aligned
|
||||
}
|
||||
|
||||
@inlinable @inline(__always)
|
||||
public func index(before i: Index) -> Index {
|
||||
precondition(!i.isZeroPosition)
|
||||
if _slowPath(_guts.isForeign) { return _foreignIndex(before: i) }
|
||||
if _guts.isASCII { return i.priorEncoded }
|
||||
public func index(before idx: Index) -> Index {
|
||||
precondition(!idx.isZeroPosition)
|
||||
if _slowPath(_guts.isForeign) { return _foreignIndex(before: idx) }
|
||||
if _guts.isASCII { return idx.priorEncoded }
|
||||
|
||||
if i.transcodedOffset != 0 {
|
||||
_internalInvariant(i.transcodedOffset == 1)
|
||||
return i.strippingTranscoding
|
||||
if idx.transcodedOffset != 0 {
|
||||
_internalInvariant(idx.transcodedOffset == 1)
|
||||
return idx.strippingTranscoding
|
||||
}
|
||||
|
||||
let len = _guts.fastUTF8ScalarLength(endingAt: i._encodedOffset)
|
||||
let idx = _utf16AlignNativeIndex(idx)
|
||||
let len = _guts.fastUTF8ScalarLength(endingAt: idx._encodedOffset)
|
||||
if len == 4 {
|
||||
// 2 UTF-16 code units comprise this scalar; advance to the beginning and
|
||||
// start mid-scalar transcoding
|
||||
return i.encoded(offsetBy: -len).nextTranscoded
|
||||
return idx.encoded(offsetBy: -len).nextTranscoded
|
||||
}
|
||||
|
||||
// Single UTF-16 code unit
|
||||
_internalInvariant((1...3) ~= len)
|
||||
return i.encoded(offsetBy: -len)
|
||||
return idx.encoded(offsetBy: -len).aligned
|
||||
}
|
||||
|
||||
public func index(_ i: Index, offsetBy n: Int) -> Index {
|
||||
@@ -239,19 +245,16 @@ extension String.UTF16View: BidirectionalCollection {
|
||||
/// - Parameter position: A valid index of the view. `position` must be
|
||||
/// less than the view's end index.
|
||||
@inlinable @inline(__always)
|
||||
public subscript(i: Index) -> UTF16.CodeUnit {
|
||||
String(_guts)._boundsCheck(i)
|
||||
public subscript(idx: Index) -> UTF16.CodeUnit {
|
||||
String(_guts)._boundsCheck(idx)
|
||||
|
||||
if _fastPath(_guts.isFastUTF8) {
|
||||
let scalar = _guts.fastUTF8Scalar(
|
||||
startingAt: _guts.scalarAlign(i)._encodedOffset)
|
||||
if scalar.value <= 0xFFFF {
|
||||
return UInt16(truncatingIfNeeded: scalar.value)
|
||||
}
|
||||
return scalar.utf16[i.transcodedOffset]
|
||||
startingAt: _guts.scalarAlign(idx)._encodedOffset)
|
||||
return scalar.utf16[idx.transcodedOffset]
|
||||
}
|
||||
|
||||
return _foreignSubscript(position: i)
|
||||
return _foreignSubscript(position: idx)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -426,27 +429,30 @@ extension String.UTF16View {
|
||||
@_effects(releasenone)
|
||||
internal func _foreignIndex(after i: Index) -> Index {
|
||||
_internalInvariant(_guts.isForeign)
|
||||
return i.nextEncoded
|
||||
return i.strippingTranscoding.nextEncoded
|
||||
}
|
||||
|
||||
@usableFromInline @inline(never)
|
||||
@_effects(releasenone)
|
||||
internal func _foreignIndex(before i: Index) -> Index {
|
||||
_internalInvariant(_guts.isForeign)
|
||||
return i.priorEncoded
|
||||
return i.strippingTranscoding.priorEncoded
|
||||
}
|
||||
|
||||
@usableFromInline @inline(never)
|
||||
@_effects(releasenone)
|
||||
internal func _foreignSubscript(position i: Index) -> UTF16.CodeUnit {
|
||||
_internalInvariant(_guts.isForeign)
|
||||
return _guts.foreignErrorCorrectedUTF16CodeUnit(at: i)
|
||||
return _guts.foreignErrorCorrectedUTF16CodeUnit(at: i.strippingTranscoding)
|
||||
}
|
||||
|
||||
@usableFromInline @inline(never)
|
||||
@_effects(releasenone)
|
||||
internal func _foreignDistance(from start: Index, to end: Index) -> Int {
|
||||
_internalInvariant(_guts.isForeign)
|
||||
|
||||
// Ignore transcoded offsets, i.e. scalar align if-and-only-if from a
|
||||
// transcoded view
|
||||
return end._encodedOffset - start._encodedOffset
|
||||
}
|
||||
|
||||
@@ -460,14 +466,14 @@ extension String.UTF16View {
|
||||
if n > 0 ? l >= 0 && l < n : l <= 0 && n < l {
|
||||
return nil
|
||||
}
|
||||
return i.encoded(offsetBy: n)
|
||||
return i.strippingTranscoding.encoded(offsetBy: n)
|
||||
}
|
||||
|
||||
@usableFromInline @inline(never)
|
||||
@_effects(releasenone)
|
||||
internal func _foreignIndex(_ i: Index, offsetBy n: Int) -> Index {
|
||||
_internalInvariant(_guts.isForeign)
|
||||
return i.encoded(offsetBy: n)
|
||||
return i.strippingTranscoding.encoded(offsetBy: n)
|
||||
}
|
||||
|
||||
@usableFromInline @inline(never)
|
||||
@@ -476,6 +482,19 @@ extension String.UTF16View {
|
||||
_internalInvariant(_guts.isForeign)
|
||||
return endIndex._encodedOffset - startIndex._encodedOffset
|
||||
}
|
||||
|
||||
// Align a native UTF-8 index to a valid UTF-16 position. If there is a
|
||||
// transcoded offset already, this is already a valid UTF-16 position
|
||||
// (referring to the second surrogate) and returns `idx`. Otherwise, this will
|
||||
// scalar-align the index. This is needed because we may be passed a
|
||||
// non-scalar-aligned index from the UTF8View.
|
||||
@_alwaysEmitIntoClient // Swift 5.1
|
||||
@inline(__always)
|
||||
internal func _utf16AlignNativeIndex(_ idx: String.Index) -> String.Index {
|
||||
_internalInvariant(!_guts.isForeign)
|
||||
guard idx.transcodedOffset == 0 else { return idx }
|
||||
return _guts.scalarAlign(idx)
|
||||
}
|
||||
}
|
||||
|
||||
extension String.Index {
|
||||
@@ -506,6 +525,7 @@ extension String.UTF16View {
|
||||
return idx._encodedOffset
|
||||
}
|
||||
|
||||
let idx = _utf16AlignNativeIndex(idx)
|
||||
if idx._encodedOffset < _shortHeuristic || !_guts.hasBreadcrumbs {
|
||||
return _distance(from: startIndex, to: idx)
|
||||
}
|
||||
@@ -567,22 +587,27 @@ extension String.UTF16View {
|
||||
_internalInvariant(utf16Len == 2)
|
||||
return Index(encodedOffset: readIdx, transcodedOffset: 1)
|
||||
}
|
||||
return Index(_encodedOffset: readIdx &+ len)
|
||||
return Index(_encodedOffset: readIdx &+ len).aligned
|
||||
}
|
||||
|
||||
readIdx &+= len
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
extension String {
|
||||
@usableFromInline // @testable
|
||||
internal func _nativeCopyUTF16CodeUnits(
|
||||
// Copy (i.e. transcode to UTF-16) our contents into a buffer. `alignedRange`
|
||||
// means that the indices are part of the UTF16View.indices -- they are either
|
||||
// scalar-aligned or transcoded (e.g. derived from the UTF-16 view). They do
|
||||
// not need to go through an alignment check.
|
||||
internal func _nativeCopy(
|
||||
into buffer: UnsafeMutableBufferPointer<UInt16>,
|
||||
range: Range<String.Index>
|
||||
alignedRange range: Range<String.Index>
|
||||
) {
|
||||
_internalInvariant(_guts.isFastUTF8)
|
||||
_internalInvariant(
|
||||
range.lowerBound == _utf16AlignNativeIndex(range.lowerBound))
|
||||
_internalInvariant(
|
||||
range.upperBound == _utf16AlignNativeIndex(range.upperBound))
|
||||
|
||||
if _slowPath(range.isEmpty) { return }
|
||||
|
||||
@@ -637,8 +662,6 @@ extension String {
|
||||
writeIdx &+= 1
|
||||
}
|
||||
_internalInvariant(writeIdx <= writeEnd)
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user