Files
swift-mirror/stdlib/public/core/UnicodeHelpers.swift
Michael Ilseman e01a294da6 [stdlib] Introduce _invariantCheck_5_1 for 5.1 and later assertions.
Inlinable and non-inlinable code can cause 5.1 code to intermix with
5.0 code on older OSes. Some (weak) invariants for 5.1 should only be
checked when the OS's code is 5.1 or later, which is the purpose of
_invariantCheck_5_1.

Applied to String.Index._isScalarAligned, which is a new bit
introduced in 5.1 from one of the reserved bits from 5.0. The bit is
set when the index is proven to be scalar aligned, and we want to
assert on this liberally in contexts where we expect it to be
so. However, older OSes might not set this bit when doing scalar
aligning, depending on exactly what got inlined where/when.
2019-07-12 15:58:27 -07:00

436 lines
14 KiB
Swift

//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
//
// Low-level helper functions and utilities for interpreting Unicode
//
@inlinable
@inline(__always)
internal func _decodeUTF8(_ x: UInt8) -> Unicode.Scalar {
_internalInvariant(UTF8.isASCII(x))
return Unicode.Scalar(_unchecked: UInt32(x))
}
@inlinable
@inline(__always)
internal func _decodeUTF8(_ x: UInt8, _ y: UInt8) -> Unicode.Scalar {
_internalInvariant(_utf8ScalarLength(x) == 2)
_internalInvariant(UTF8.isContinuation(y))
let x = UInt32(x)
let value = ((x & 0b0001_1111) &<< 6) | _continuationPayload(y)
return Unicode.Scalar(_unchecked: value)
}
@inlinable
@inline(__always)
internal func _decodeUTF8(
_ x: UInt8, _ y: UInt8, _ z: UInt8
) -> Unicode.Scalar {
_internalInvariant(_utf8ScalarLength(x) == 3)
_internalInvariant(UTF8.isContinuation(y) && UTF8.isContinuation(z))
let x = UInt32(x)
let value = ((x & 0b0000_1111) &<< 12)
| (_continuationPayload(y) &<< 6)
| _continuationPayload(z)
return Unicode.Scalar(_unchecked: value)
}
@inlinable
@inline(__always)
internal func _decodeUTF8(
_ x: UInt8, _ y: UInt8, _ z: UInt8, _ w: UInt8
) -> Unicode.Scalar {
_internalInvariant(_utf8ScalarLength(x) == 4)
_internalInvariant(
UTF8.isContinuation(y) && UTF8.isContinuation(z)
&& UTF8.isContinuation(w))
let x = UInt32(x)
let value = ((x & 0b0000_1111) &<< 18)
| (_continuationPayload(y) &<< 12)
| (_continuationPayload(z) &<< 6)
| _continuationPayload(w)
return Unicode.Scalar(_unchecked: value)
}
internal func _decodeScalar(
_ utf16: UnsafeBufferPointer<UInt16>, startingAt i: Int
) -> (Unicode.Scalar, scalarLength: Int) {
let high = utf16[i]
if i + 1 >= utf16.count {
_internalInvariant(!UTF16.isLeadSurrogate(high))
_internalInvariant(!UTF16.isTrailSurrogate(high))
return (Unicode.Scalar(_unchecked: UInt32(high)), 1)
}
if !UTF16.isLeadSurrogate(high) {
_internalInvariant(!UTF16.isTrailSurrogate(high))
return (Unicode.Scalar(_unchecked: UInt32(high)), 1)
}
let low = utf16[i+1]
_internalInvariant(UTF16.isLeadSurrogate(high))
_internalInvariant(UTF16.isTrailSurrogate(low))
return (UTF16._decodeSurrogates(high, low), 2)
}
@inlinable
internal func _decodeScalar(
_ utf8: UnsafeBufferPointer<UInt8>, startingAt i: Int
) -> (Unicode.Scalar, scalarLength: Int) {
let cu0 = utf8[_unchecked: i]
let len = _utf8ScalarLength(cu0)
switch len {
case 1: return (_decodeUTF8(cu0), len)
case 2: return (_decodeUTF8(cu0, utf8[_unchecked: i &+ 1]), len)
case 3: return (_decodeUTF8(
cu0, utf8[_unchecked: i &+ 1], utf8[_unchecked: i &+ 2]), len)
case 4:
return (_decodeUTF8(
cu0,
utf8[_unchecked: i &+ 1],
utf8[_unchecked: i &+ 2],
utf8[_unchecked: i &+ 3]),
len)
default: Builtin.unreachable()
}
}
@inlinable
internal func _decodeScalar(
_ utf8: UnsafeBufferPointer<UInt8>, endingAt i: Int
) -> (Unicode.Scalar, scalarLength: Int) {
let len = _utf8ScalarLength(utf8, endingAt: i)
let (scalar, scalarLen) = _decodeScalar(utf8, startingAt: i &- len)
_internalInvariant(len == scalarLen)
return (scalar, len)
}
@inlinable @inline(__always)
internal func _utf8ScalarLength(_ x: UInt8) -> Int {
_internalInvariant(!UTF8.isContinuation(x))
if UTF8.isASCII(x) { return 1 }
// TODO(String micro-performance): check codegen
return (~x).leadingZeroBitCount
}
@inlinable @inline(__always)
internal func _utf8ScalarLength(
_ utf8: UnsafeBufferPointer<UInt8>, endingAt i: Int
) -> Int {
var len = 1
while UTF8.isContinuation(utf8[_unchecked: i &- len]) {
len &+= 1
}
_internalInvariant(len == _utf8ScalarLength(utf8[i &- len]))
return len
}
@inlinable
@inline(__always)
internal func _continuationPayload(_ x: UInt8) -> UInt32 {
return UInt32(x & 0x3F)
}
@inlinable
internal func _scalarAlign(
_ utf8: UnsafeBufferPointer<UInt8>, _ idx: Int
) -> Int {
guard _fastPath(idx != utf8.count) else { return idx }
var i = idx
while _slowPath(UTF8.isContinuation(utf8[_unchecked: i])) {
i &-= 1
_internalInvariant(i >= 0,
"Malformed contents: starts with continuation byte")
}
return i
}
//
// Scalar helpers
//
extension _StringGuts {
@inlinable
@inline(__always) // fast-path: fold common fastUTF8 check
internal func scalarAlign(_ idx: Index) -> Index {
let result: String.Index
if _fastPath(idx._isScalarAligned) {
result = idx
} else {
// TODO(String performance): isASCII check
result = scalarAlignSlow(idx)
}
_internalInvariant(isOnUnicodeScalarBoundary(result),
"Alignment bit is set for non-aligned index")
_internalInvariant_5_1(result._isScalarAligned)
return result
}
@inline(never) // slow-path
@_alwaysEmitIntoClient // Swift 5.1
internal func scalarAlignSlow(_ idx: Index) -> Index {
_internalInvariant_5_1(!idx._isScalarAligned)
if _slowPath(idx.transcodedOffset != 0 || idx._encodedOffset == 0) {
// Transcoded index offsets are already scalar aligned
return String.Index(_encodedOffset: idx._encodedOffset)._scalarAligned
}
if _slowPath(self.isForeign) {
let foreignIdx = foreignScalarAlign(idx)
_internalInvariant_5_1(foreignIdx._isScalarAligned)
return foreignIdx
}
return String.Index(_encodedOffset:
self.withFastUTF8 { _scalarAlign($0, idx._encodedOffset) }
)._scalarAligned
}
@inlinable
internal func fastUTF8ScalarLength(startingAt i: Int) -> Int {
_internalInvariant(isFastUTF8)
let len = _utf8ScalarLength(self.withFastUTF8 { $0[i] })
_internalInvariant((1...4) ~= len)
return len
}
@inlinable
internal func fastUTF8ScalarLength(endingAt i: Int) -> Int {
_internalInvariant(isFastUTF8)
return self.withFastUTF8 { utf8 in
_internalInvariant(i == utf8.count || !UTF8.isContinuation(utf8[i]))
var len = 1
while UTF8.isContinuation(utf8[i &- len]) {
_internalInvariant(i &- len > 0)
len += 1
}
_internalInvariant(len <= 4)
return len
}
}
@inlinable
internal func fastUTF8Scalar(startingAt i: Int) -> Unicode.Scalar {
_internalInvariant(isFastUTF8)
return self.withFastUTF8 { _decodeScalar($0, startingAt: i).0 }
}
@usableFromInline
@_effects(releasenone)
internal func isOnUnicodeScalarBoundary(_ i: String.Index) -> Bool {
// TODO(String micro-performance): check isASCII
// Beginning and end are always scalar aligned; mid-scalar never is
guard i.transcodedOffset == 0 else { return false }
if i == self.startIndex || i == self.endIndex { return true }
if _fastPath(isFastUTF8) {
return self.withFastUTF8 {
return !UTF8.isContinuation($0[i._encodedOffset])
}
}
return i == foreignScalarAlign(i)
}
}
//
// Error-correcting helpers (U+FFFD for unpaired surrogates) for accessing
// contents of foreign strings
//
extension _StringGuts {
@_effects(releasenone)
private func _getForeignCodeUnit(at i: Int) -> UInt16 {
#if _runtime(_ObjC)
// Currently, foreign means NSString
return _cocoaStringSubscript(_object.cocoaObject, i)
#else
fatalError("No foreign strings on Linux in this version of Swift")
#endif
}
@usableFromInline
@_effects(releasenone)
internal func foreignErrorCorrectedScalar(
startingAt idx: String.Index
) -> (Unicode.Scalar, scalarLength: Int) {
_internalInvariant(idx.transcodedOffset == 0)
_internalInvariant(idx._encodedOffset < self.count)
let start = idx._encodedOffset
let leading = _getForeignCodeUnit(at: start)
if _fastPath(!UTF16.isSurrogate(leading)) {
return (Unicode.Scalar(_unchecked: UInt32(leading)), 1)
}
// Validate foreign strings on-read: trailing surrogates are invalid,
// leading need following trailing
//
// TODO(String performance): Consider having a valid performance flag
// available to check, and assert it's not set in the condition here.
let nextOffset = start &+ 1
if _slowPath(UTF16.isTrailSurrogate(leading) || nextOffset == self.count) {
return (Unicode.Scalar._replacementCharacter, 1)
}
let trailing = _getForeignCodeUnit(at: nextOffset)
if _slowPath(!UTF16.isTrailSurrogate(trailing)) {
return (Unicode.Scalar._replacementCharacter, 1)
}
return (UTF16._decodeSurrogates(leading, trailing), 2)
}
@_effects(releasenone)
internal func foreignErrorCorrectedScalar(
endingAt idx: String.Index
) -> (Unicode.Scalar, scalarLength: Int) {
_internalInvariant(idx.transcodedOffset == 0)
_internalInvariant(idx._encodedOffset <= self.count)
_internalInvariant(idx._encodedOffset > 0)
let end = idx._encodedOffset
let trailing = _getForeignCodeUnit(at: end &- 1)
if _fastPath(!UTF16.isSurrogate(trailing)) {
return (Unicode.Scalar(_unchecked: UInt32(trailing)), 1)
}
// Validate foreign strings on-read: trailing surrogates are invalid,
// leading need following trailing
//
// TODO(String performance): Consider having a valid performance flag
// available to check, and assert it's not set in the condition here.
let priorOffset = end &- 2
if _slowPath(UTF16.isLeadSurrogate(trailing) || priorOffset < 0) {
return (Unicode.Scalar._replacementCharacter, 1)
}
let leading = _getForeignCodeUnit(at: priorOffset)
if _slowPath(!UTF16.isLeadSurrogate(leading)) {
return (Unicode.Scalar._replacementCharacter, 1)
}
return (UTF16._decodeSurrogates(leading, trailing), 2)
}
@_effects(releasenone)
internal func foreignErrorCorrectedUTF16CodeUnit(
at idx: String.Index
) -> UInt16 {
_internalInvariant(idx.transcodedOffset == 0)
_internalInvariant(idx._encodedOffset < self.count)
let start = idx._encodedOffset
let cu = _getForeignCodeUnit(at: start)
if _fastPath(!UTF16.isSurrogate(cu)) {
return cu
}
// Validate foreign strings on-read: trailing surrogates are invalid,
// leading need following trailing
//
// TODO(String performance): Consider having a valid performance flag
// available to check, and assert it's not set in the condition here.
if UTF16.isLeadSurrogate(cu) {
let nextOffset = start &+ 1
guard nextOffset < self.count,
UTF16.isTrailSurrogate(_getForeignCodeUnit(at: nextOffset))
else { return UTF16._replacementCodeUnit }
} else {
let priorOffset = start &- 1
guard priorOffset >= 0,
UTF16.isLeadSurrogate(_getForeignCodeUnit(at: priorOffset))
else { return UTF16._replacementCodeUnit }
}
return cu
}
@usableFromInline @inline(never) // slow-path
@_effects(releasenone)
internal func foreignScalarAlign(_ idx: Index) -> Index {
guard idx._encodedOffset != self.count else { return idx._scalarAligned }
_internalInvariant(idx._encodedOffset < self.count)
let ecCU = foreignErrorCorrectedUTF16CodeUnit(at: idx)
if _fastPath(!UTF16.isTrailSurrogate(ecCU)) {
return idx._scalarAligned
}
_internalInvariant(idx._encodedOffset > 0,
"Error-correction shouldn't give trailing surrogate at position zero")
return String.Index(_encodedOffset: idx._encodedOffset &- 1)._scalarAligned
}
@usableFromInline @inline(never)
@_effects(releasenone)
internal func foreignErrorCorrectedGrapheme(
startingAt start: Int, endingAt end: Int
) -> Character {
#if _runtime(_ObjC)
_internalInvariant(self.isForeign)
// Both a fast-path for single-code-unit graphemes and validation:
// ICU treats isolated surrogates as isolated graphemes
let count = end &- start
if start &- end == 1 {
return Character(String(self.foreignErrorCorrectedScalar(
startingAt: String.Index(_encodedOffset: start)
).0))
}
// TODO(String performance): Stack buffer if small enough
var cus = Array<UInt16>(repeating: 0, count: count)
cus.withUnsafeMutableBufferPointer {
_cocoaStringCopyCharacters(
from: self._object.cocoaObject,
range: start..<end,
into: $0.baseAddress._unsafelyUnwrappedUnchecked)
}
return cus.withUnsafeBufferPointer {
return Character(String._uncheckedFromUTF16($0))
}
#else
fatalError("No foreign strings on Linux in this version of Swift")
#endif
}
}
// Higher level aggregate operations. These should only be called when the
// result is the sole operation done by a caller, otherwise it's always more
// efficient to use `withFastUTF8` in the caller.
extension _StringGuts {
@inlinable @inline(__always)
internal func errorCorrectedScalar(
startingAt i: Int
) -> (Unicode.Scalar, scalarLength: Int) {
if _fastPath(isFastUTF8) {
return withFastUTF8 { _decodeScalar($0, startingAt: i) }
}
return foreignErrorCorrectedScalar(
startingAt: String.Index(_encodedOffset: i))
}
@inlinable @inline(__always)
internal func errorCorrectedCharacter(
startingAt start: Int, endingAt end: Int
) -> Character {
if _fastPath(isFastUTF8) {
return withFastUTF8(range: start..<end) { utf8 in
return Character(unchecked: String._uncheckedFromUTF8(utf8))
}
}
return foreignErrorCorrectedGrapheme(startingAt: start, endingAt: end)
}
}