[String] Scalar-alignment bug fixes.

Fixes a general category (pun intended) of scalar-alignment bugs that
arise when non-scalar-aligned indices are exchanged between views or
used for slicing.

SE-0180 unifies the Index type of String and all its views and allows
non-scalar-aligned indices to be used across views. In order to
guarantee behavior, we often have to check and perform scalar
alignment. To speed up these checks, we allocate a bit denoting
known-to-be-aligned, so that the alignment check can skip the
load. The table below shows which views need to check for alignment
before they can operate, and whether the indices they produce are aligned.

┌───────────────╥────────────────────┬──────────────────────────┐
│ View          ║ Requires Alignment │ Produces Aligned Indices │
╞═══════════════╬════════════════════╪══════════════════════════╡
│ Native UTF8   ║ no                 │ no                       │
├───────────────╫────────────────────┼──────────────────────────┤
│ Native UTF16  ║ yes                │ no                       │
╞═══════════════╬════════════════════╪══════════════════════════╡
│ Foreign UTF8  ║ yes                │ no                       │
├───────────────╫────────────────────┼──────────────────────────┤
│ Foreign UTF16 ║ no                 │ no                       │
╞═══════════════╬════════════════════╪══════════════════════════╡
│ UnicodeScalar ║ yes                │ yes                      │
├───────────────╫────────────────────┼──────────────────────────┤
│ Character     ║ yes                │ yes                      │
└───────────────╨────────────────────┴──────────────────────────┘

The "Requires Alignment" column applies to any operation taking a
String.Index that is not defined entirely in terms of other operations
taking a String.Index. These include:

* index(after:)
* index(before:)
* subscript
* distance(from:to:) (since `to` is compared against directly)
* UTF16View._nativeGetOffset(for:)
This commit is contained in:
Michael Ilseman
2019-06-24 16:51:54 -07:00
parent 93d65fc9c3
commit 4cd1e812b7
15 changed files with 545 additions and 164 deletions

View File

@@ -139,40 +139,46 @@ extension String.UTF16View: BidirectionalCollection {
public var endIndex: Index { return _guts.endIndex }
@inlinable @inline(__always)
public func index(after i: Index) -> Index {
if _slowPath(_guts.isForeign) { return _foreignIndex(after: i) }
if _guts.isASCII { return i.nextEncoded }
public func index(after idx: Index) -> Index {
if _slowPath(_guts.isForeign) { return _foreignIndex(after: idx) }
if _guts.isASCII { return idx.nextEncoded }
// For a BMP scalar (1-3 UTF-8 code units), advance past it. For a non-BMP
// scalar, use a transcoded offset first.
let len = _guts.fastUTF8ScalarLength(startingAt: i._encodedOffset)
if len == 4 && i.transcodedOffset == 0 {
return i.nextTranscoded
// TODO: do the ugly "if non-transcoded, make sure to scalar-align" thing...
// Also, can we just jump ahead 4 if transcoded is 1?
let idx = _utf16AlignNativeIndex(idx)
let len = _guts.fastUTF8ScalarLength(startingAt: idx._encodedOffset)
if len == 4 && idx.transcodedOffset == 0 {
return idx.nextTranscoded
}
return i.strippingTranscoding.encoded(offsetBy: len)
return idx.strippingTranscoding.encoded(offsetBy: len).aligned
}
@inlinable @inline(__always)
public func index(before i: Index) -> Index {
precondition(!i.isZeroPosition)
if _slowPath(_guts.isForeign) { return _foreignIndex(before: i) }
if _guts.isASCII { return i.priorEncoded }
public func index(before idx: Index) -> Index {
precondition(!idx.isZeroPosition)
if _slowPath(_guts.isForeign) { return _foreignIndex(before: idx) }
if _guts.isASCII { return idx.priorEncoded }
if i.transcodedOffset != 0 {
_internalInvariant(i.transcodedOffset == 1)
return i.strippingTranscoding
if idx.transcodedOffset != 0 {
_internalInvariant(idx.transcodedOffset == 1)
return idx.strippingTranscoding
}
let len = _guts.fastUTF8ScalarLength(endingAt: i._encodedOffset)
let idx = _utf16AlignNativeIndex(idx)
let len = _guts.fastUTF8ScalarLength(endingAt: idx._encodedOffset)
if len == 4 {
// 2 UTF-16 code units comprise this scalar; advance to the beginning and
// start mid-scalar transcoding
return i.encoded(offsetBy: -len).nextTranscoded
return idx.encoded(offsetBy: -len).nextTranscoded
}
// Single UTF-16 code unit
_internalInvariant((1...3) ~= len)
return i.encoded(offsetBy: -len)
return idx.encoded(offsetBy: -len).aligned
}
public func index(_ i: Index, offsetBy n: Int) -> Index {
@@ -239,19 +245,16 @@ extension String.UTF16View: BidirectionalCollection {
/// - Parameter position: A valid index of the view. `position` must be
/// less than the view's end index.
@inlinable @inline(__always)
public subscript(i: Index) -> UTF16.CodeUnit {
String(_guts)._boundsCheck(i)
public subscript(idx: Index) -> UTF16.CodeUnit {
String(_guts)._boundsCheck(idx)
if _fastPath(_guts.isFastUTF8) {
let scalar = _guts.fastUTF8Scalar(
startingAt: _guts.scalarAlign(i)._encodedOffset)
if scalar.value <= 0xFFFF {
return UInt16(truncatingIfNeeded: scalar.value)
}
return scalar.utf16[i.transcodedOffset]
startingAt: _guts.scalarAlign(idx)._encodedOffset)
return scalar.utf16[idx.transcodedOffset]
}
return _foreignSubscript(position: i)
return _foreignSubscript(position: idx)
}
}
@@ -426,27 +429,30 @@ extension String.UTF16View {
@_effects(releasenone)
internal func _foreignIndex(after i: Index) -> Index {
_internalInvariant(_guts.isForeign)
return i.nextEncoded
return i.strippingTranscoding.nextEncoded
}
@usableFromInline @inline(never)
@_effects(releasenone)
internal func _foreignIndex(before i: Index) -> Index {
_internalInvariant(_guts.isForeign)
return i.priorEncoded
return i.strippingTranscoding.priorEncoded
}
@usableFromInline @inline(never)
@_effects(releasenone)
internal func _foreignSubscript(position i: Index) -> UTF16.CodeUnit {
_internalInvariant(_guts.isForeign)
return _guts.foreignErrorCorrectedUTF16CodeUnit(at: i)
return _guts.foreignErrorCorrectedUTF16CodeUnit(at: i.strippingTranscoding)
}
@usableFromInline @inline(never)
@_effects(releasenone)
internal func _foreignDistance(from start: Index, to end: Index) -> Int {
  _internalInvariant(_guts.isForeign)

  // Only the encoded offsets participate in the result; any transcoded
  // offset carried by either index is ignored. In other words, an index is
  // treated as scalar-aligned if-and-only-if it came from a transcoded view.
  let upperOffset = end._encodedOffset
  let lowerOffset = start._encodedOffset
  return upperOffset - lowerOffset
}
@@ -460,14 +466,14 @@ extension String.UTF16View {
if n > 0 ? l >= 0 && l < n : l <= 0 && n < l {
return nil
}
return i.encoded(offsetBy: n)
return i.strippingTranscoding.encoded(offsetBy: n)
}
@usableFromInline @inline(never)
@_effects(releasenone)
internal func _foreignIndex(_ i: Index, offsetBy n: Int) -> Index {
_internalInvariant(_guts.isForeign)
return i.encoded(offsetBy: n)
return i.strippingTranscoding.encoded(offsetBy: n)
}
@usableFromInline @inline(never)
@@ -476,6 +482,19 @@ extension String.UTF16View {
_internalInvariant(_guts.isForeign)
return endIndex._encodedOffset - startIndex._encodedOffset
}
// Produce a valid UTF-16 position from a native UTF-8 index.
//
// An index that already carries a transcoded offset is a valid UTF-16
// position as-is (it refers to the second surrogate), so it is handed back
// unchanged. Any other index is scalar-aligned first, because the caller may
// have obtained a non-scalar-aligned index from the UTF8View.
@_alwaysEmitIntoClient // Swift 5.1
@inline(__always)
internal func _utf16AlignNativeIndex(_ idx: String.Index) -> String.Index {
  _internalInvariant(!_guts.isForeign)
  if idx.transcodedOffset != 0 {
    // Mid-scalar UTF-16 position: nothing to align.
    return idx
  }
  return _guts.scalarAlign(idx)
}
}
extension String.Index {
@@ -506,6 +525,7 @@ extension String.UTF16View {
return idx._encodedOffset
}
let idx = _utf16AlignNativeIndex(idx)
if idx._encodedOffset < _shortHeuristic || !_guts.hasBreadcrumbs {
return _distance(from: startIndex, to: idx)
}
@@ -567,22 +587,27 @@ extension String.UTF16View {
_internalInvariant(utf16Len == 2)
return Index(encodedOffset: readIdx, transcodedOffset: 1)
}
return Index(_encodedOffset: readIdx &+ len)
return Index(_encodedOffset: readIdx &+ len).aligned
}
readIdx &+= len
}
}
}
}
extension String {
@usableFromInline // @testable
internal func _nativeCopyUTF16CodeUnits(
// Copy (i.e. transcode to UTF-16) our contents into a buffer. `alignedRange`
// means that the indices are part of the UTF16View.indices -- they are either
// scalar-aligned or transcoded (e.g. derived from the UTF-16 view). They do
// not need to go through an alignment check.
internal func _nativeCopy(
into buffer: UnsafeMutableBufferPointer<UInt16>,
range: Range<String.Index>
alignedRange range: Range<String.Index>
) {
_internalInvariant(_guts.isFastUTF8)
_internalInvariant(
range.lowerBound == _utf16AlignNativeIndex(range.lowerBound))
_internalInvariant(
range.upperBound == _utf16AlignNativeIndex(range.upperBound))
if _slowPath(range.isEmpty) { return }
@@ -637,8 +662,6 @@ extension String {
writeIdx &+= 1
}
_internalInvariant(writeIdx <= writeEnd)
}
}
}