//===----------------------------------------------------------------------===// // // This source file is part of the Swift.org open source project // // Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors // Licensed under Apache License v2.0 with Runtime Library Exception // // See https://swift.org/LICENSE.txt for license information // See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors // //===----------------------------------------------------------------------===// // // Low-level helper functions and utilities for interpreting Unicode // @inlinable @inline(__always) internal func _decodeUTF8(_ x: UInt8) -> Unicode.Scalar { _internalInvariant(UTF8.isASCII(x)) return Unicode.Scalar(_unchecked: UInt32(x)) } @inlinable @inline(__always) internal func _decodeUTF8(_ x: UInt8, _ y: UInt8) -> Unicode.Scalar { _internalInvariant(_utf8ScalarLength(x) == 2) _internalInvariant(UTF8.isContinuation(y)) let x = UInt32(x) let value = ((x & 0b0001_1111) &<< 6) | _continuationPayload(y) return Unicode.Scalar(_unchecked: value) } @inlinable @inline(__always) internal func _decodeUTF8( _ x: UInt8, _ y: UInt8, _ z: UInt8 ) -> Unicode.Scalar { _internalInvariant(_utf8ScalarLength(x) == 3) _internalInvariant(UTF8.isContinuation(y) && UTF8.isContinuation(z)) let x = UInt32(x) let value = ((x & 0b0000_1111) &<< 12) | (_continuationPayload(y) &<< 6) | _continuationPayload(z) return Unicode.Scalar(_unchecked: value) } @inlinable @inline(__always) internal func _decodeUTF8( _ x: UInt8, _ y: UInt8, _ z: UInt8, _ w: UInt8 ) -> Unicode.Scalar { _internalInvariant(_utf8ScalarLength(x) == 4) _internalInvariant( UTF8.isContinuation(y) && UTF8.isContinuation(z) && UTF8.isContinuation(w)) let x = UInt32(x) let value = ((x & 0b0000_1111) &<< 18) | (_continuationPayload(y) &<< 12) | (_continuationPayload(z) &<< 6) | _continuationPayload(w) return Unicode.Scalar(_unchecked: value) } @inlinable @inline(__always) internal func _decodeScalar( _ utf8: UnsafeBufferPointer, startingAt i: Int ) -> (Unicode.Scalar, scalarLength: Int) { let cu0 = unsafe utf8[_unchecked: i] let len = _utf8ScalarLength(cu0) switch len { case 1: return (_decodeUTF8(cu0), len) case 2: return unsafe (_decodeUTF8(cu0, utf8[_unchecked: i &+ 1]), len) case 3: return unsafe (_decodeUTF8( cu0, utf8[_unchecked: i &+ 1], utf8[_unchecked: i &+ 2]), len) case 4: return unsafe (_decodeUTF8( cu0, utf8[_unchecked: i &+ 1], utf8[_unchecked: i &+ 2], utf8[_unchecked: i &+ 3]), len) default: Builtin.unreachable() } } @inlinable internal func _decodeScalar( _ utf8: UnsafeBufferPointer, endingAt i: Int ) -> (Unicode.Scalar, scalarLength: Int) { let len = unsafe _utf8ScalarLength(utf8, endingAt: i) let (scalar, scalarLen) = unsafe _decodeScalar(utf8, startingAt: i &- len) _internalInvariant(len == scalarLen) return (scalar, len) } @inlinable @inline(__always) internal func _utf8ScalarLength(_ x: UInt8) -> Int { _internalInvariant(!UTF8.isContinuation(x)) if UTF8.isASCII(x) { return 1 } // TODO(String micro-performance): check codegen return (~x).leadingZeroBitCount } @inlinable @inline(__always) internal func _utf8ScalarLength( _ utf8: UnsafeBufferPointer, endingAt i: Int ) -> Int { var len = 1 while unsafe UTF8.isContinuation(utf8[_unchecked: i &- len]) { len &+= 1 } unsafe _internalInvariant(len == _utf8ScalarLength(utf8[i &- len])) return len } @inlinable @inline(__always) internal func _continuationPayload(_ x: UInt8) -> UInt32 { return UInt32(x & 0x3F) } @inlinable internal func _scalarAlign( _ utf8: UnsafeBufferPointer, _ idx: Int ) -> Int { guard _fastPath(idx != utf8.count) else { return idx } var i = idx while unsafe _slowPath(UTF8.isContinuation(utf8[_unchecked: i])) { i &-= 1 _internalInvariant(i >= 0, "Malformed contents: starts with continuation byte") } return i } // // Scalar helpers // extension _StringGuts { @inlinable @inline(__always) // fast-path: fold common fastUTF8 check internal func scalarAlign(_ idx: Index) -> Index { let result: String.Index if _fastPath(idx._isScalarAligned) { result = idx } else { // TODO(String performance): isASCII check result = scalarAlignSlow(idx)._scalarAligned._copyingEncoding(from: idx) } _internalInvariant(isOnUnicodeScalarBoundary(result), "Alignment bit is set for non-aligned index") _internalInvariant_5_1(result._isScalarAligned) return result } @inline(never) // slow-path @_alwaysEmitIntoClient // Swift 5.1 @_effects(releasenone) internal func scalarAlignSlow(_ idx: Index) -> Index { _internalInvariant_5_1(!idx._isScalarAligned) if _slowPath(idx.transcodedOffset != 0 || idx._encodedOffset == 0) { // Transcoded index offsets are already scalar aligned return String.Index(_encodedOffset: idx._encodedOffset) } if _slowPath(self.isForeign) { // In 5.1 this check was added to foreignScalarAlign, but when this is // emitted into a client that then runs against a 5.0 stdlib, it calls // a version of foreignScalarAlign that doesn't check for this, which // ends up asking CFString for its endIndex'th character, which throws // an exception. So we duplicate the check here for back deployment. guard idx._encodedOffset != self.count else { return idx } let foreignIdx = foreignScalarAlign(idx) _internalInvariant_5_1(foreignIdx._isScalarAligned) return foreignIdx } return unsafe String.Index(_encodedOffset: self.withFastUTF8 { unsafe _scalarAlign($0, idx._encodedOffset) } ) } @inlinable internal func fastUTF8ScalarLength(startingAt i: Int) -> Int { _internalInvariant(isFastUTF8) let len = unsafe _utf8ScalarLength(self.withFastUTF8 { unsafe $0[_unchecked: i] }) _internalInvariant((1...4) ~= len) return len } @inlinable internal func fastUTF8ScalarLength(endingAt i: Int) -> Int { _internalInvariant(isFastUTF8) return unsafe self.withFastUTF8 { utf8 in unsafe _internalInvariant(i == utf8.count || !UTF8.isContinuation(utf8[i])) var len = 1 while unsafe UTF8.isContinuation(utf8[i &- len]) { _internalInvariant(i &- len > 0) len += 1 } _internalInvariant(len <= 4) return len } } @inlinable @inline(__always) internal func fastUTF8Scalar(startingAt i: Int) -> Unicode.Scalar { _internalInvariant(isFastUTF8) return unsafe self.withFastUTF8 { unsafe _decodeScalar($0, startingAt: i).0 } } @_alwaysEmitIntoClient @inline(__always) internal func isOnUnicodeScalarBoundary(_ offset: Int) -> Bool { isOnUnicodeScalarBoundary(String.Index(_encodedOffset: offset)) } @usableFromInline @_effects(releasenone) internal func isOnUnicodeScalarBoundary(_ i: String.Index) -> Bool { _internalInvariant(i._encodedOffset <= count) // TODO(String micro-performance): check isASCII // Beginning and end are always scalar aligned; mid-scalar never is guard i.transcodedOffset == 0 else { return false } if i == self.startIndex || i == self.endIndex { return true } if _fastPath(isFastUTF8) { return unsafe self.withFastUTF8 { return unsafe !UTF8.isContinuation($0[_unchecked: i._encodedOffset]) } } return i == foreignScalarAlign(i) } } // // Error-correcting helpers (U+FFFD for unpaired surrogates) for accessing // contents of foreign strings // extension _StringGuts { @_effects(releasenone) private func _getForeignCodeUnit(at i: Int) -> UInt16 { #if _runtime(_ObjC) // Currently, foreign means NSString return _object.withCocoaObject { _cocoaStringSubscript($0, i) } #else fatalError("No foreign strings on Linux in this version of Swift") #endif } @usableFromInline @_effects(releasenone) internal func foreignErrorCorrectedScalar( startingAt idx: String.Index ) -> (Unicode.Scalar, scalarLength: Int) { _internalInvariant(idx.transcodedOffset == 0) _internalInvariant(idx._encodedOffset < self.count) let start = idx._encodedOffset let leading = _getForeignCodeUnit(at: start) if _fastPath(!UTF16.isSurrogate(leading)) { return (Unicode.Scalar(_unchecked: UInt32(leading)), 1) } // Validate foreign strings on-read: trailing surrogates are invalid, // leading need following trailing // // TODO(String performance): Consider having a valid performance flag // available to check, and assert it's not set in the condition here. let nextOffset = start &+ 1 if _slowPath(UTF16.isTrailSurrogate(leading) || nextOffset == self.count) { return (Unicode.Scalar._replacementCharacter, 1) } let trailing = _getForeignCodeUnit(at: nextOffset) if _slowPath(!UTF16.isTrailSurrogate(trailing)) { return (Unicode.Scalar._replacementCharacter, 1) } return (UTF16._decodeSurrogates(leading, trailing), 2) } @_effects(releasenone) internal func foreignErrorCorrectedScalar( endingAt idx: String.Index ) -> (Unicode.Scalar, scalarLength: Int) { _internalInvariant(idx.transcodedOffset == 0) _internalInvariant(idx._encodedOffset <= self.count) _internalInvariant(idx._encodedOffset > 0) let end = idx._encodedOffset let trailing = _getForeignCodeUnit(at: end &- 1) if _fastPath(!UTF16.isSurrogate(trailing)) { return (Unicode.Scalar(_unchecked: UInt32(trailing)), 1) } // Validate foreign strings on-read: trailing surrogates are invalid, // leading need following trailing // // TODO(String performance): Consider having a valid performance flag // available to check, and assert it's not set in the condition here. let priorOffset = end &- 2 if _slowPath(UTF16.isLeadSurrogate(trailing) || priorOffset < 0) { return (Unicode.Scalar._replacementCharacter, 1) } let leading = _getForeignCodeUnit(at: priorOffset) if _slowPath(!UTF16.isLeadSurrogate(leading)) { return (Unicode.Scalar._replacementCharacter, 1) } return (UTF16._decodeSurrogates(leading, trailing), 2) } @_effects(releasenone) internal func foreignErrorCorrectedUTF16CodeUnit( at idx: String.Index ) -> UInt16 { _internalInvariant(idx.transcodedOffset == 0) _internalInvariant(idx._encodedOffset < self.count) let start = idx._encodedOffset let cu = _getForeignCodeUnit(at: start) if _fastPath(!UTF16.isSurrogate(cu)) { return cu } // Validate foreign strings on-read: trailing surrogates are invalid, // leading need following trailing // // TODO(String performance): Consider having a valid performance flag // available to check, and assert it's not set in the condition here. if UTF16.isLeadSurrogate(cu) { let nextOffset = start &+ 1 guard nextOffset < self.count, UTF16.isTrailSurrogate(_getForeignCodeUnit(at: nextOffset)) else { return UTF16._replacementCodeUnit } } else { let priorOffset = start &- 1 guard priorOffset >= 0, UTF16.isLeadSurrogate(_getForeignCodeUnit(at: priorOffset)) else { return UTF16._replacementCodeUnit } } return cu } @usableFromInline @inline(never) // slow-path @_effects(releasenone) internal func foreignScalarAlign(_ idx: Index) -> Index { guard idx._encodedOffset != self.count else { return idx._scalarAligned } _internalInvariant(idx._encodedOffset < self.count) let ecCU = foreignErrorCorrectedUTF16CodeUnit(at: idx) if _fastPath(!UTF16.isTrailSurrogate(ecCU)) { return idx._scalarAligned } _internalInvariant(idx._encodedOffset > 0, "Error-correction shouldn't give trailing surrogate at position zero") return String.Index(_encodedOffset: idx._encodedOffset &- 1)._scalarAligned } @usableFromInline @inline(never) @_effects(releasenone) internal func foreignErrorCorrectedGrapheme( startingAt start: Int, endingAt end: Int ) -> Character { #if _runtime(_ObjC) _internalInvariant(self.isForeign) // Both a fast-path for single-code-unit graphemes and validation: // treat isolated surrogates as isolated graphemes let count = end &- start if start &- end == 1 { return Character(String(self.foreignErrorCorrectedScalar( startingAt: String.Index(_encodedOffset: start) ).0)) } return unsafe withUnsafeTemporaryAllocation( of: UInt16.self, capacity: count ) { buffer in self._object.withCocoaObject { unsafe _cocoaStringCopyCharacters( from: $0, range: start.. (Unicode.Scalar, scalarLength: Int) { if _fastPath(isFastUTF8) { return unsafe withFastUTF8 { unsafe _decodeScalar($0, startingAt: i) } } return foreignErrorCorrectedScalar( startingAt: String.Index(_encodedOffset: i)) } @inlinable @inline(__always) internal func errorCorrectedCharacter( startingAt start: Int, endingAt end: Int ) -> Character { if _fastPath(isFastUTF8) { return unsafe withFastUTF8(range: start..