diff --git a/stdlib/public/SDK/Foundation/ExtraStringAPIs.swift b/stdlib/public/SDK/Foundation/ExtraStringAPIs.swift index 0244b27aa71..6f143324299 100644 --- a/stdlib/public/SDK/Foundation/ExtraStringAPIs.swift +++ b/stdlib/public/SDK/Foundation/ExtraStringAPIs.swift @@ -16,7 +16,7 @@ extension String.UTF16View.Index { @available(swift, obsoleted: 4.0) public init(_ offset: Int) { _precondition(offset >= 0, "Negative UTF16 index offset not allowed") - self.init(_offset: offset) + self.init(encodedOffset: offset) } @available(swift, deprecated: 3.2) diff --git a/stdlib/public/core/StringCharacterView.swift b/stdlib/public/core/StringCharacterView.swift index db48e4aff86..22e62073b82 100644 --- a/stdlib/public/core/StringCharacterView.swift +++ b/stdlib/public/core/StringCharacterView.swift @@ -221,18 +221,18 @@ extension String._CharacterView : BidirectionalCollection { /// this view's `_baseOffset`. @inlinable // FIXME(sil-serialize-all) internal func _toBaseIndex(_ index: Index) -> Index { - return Index( - encodedOffset: index.encodedOffset - _baseOffset, - index._cache) + var ret = index + ret._codeUnitOffset -= _baseOffset + return ret } /// Translates an index in the underlying base string into a view index using /// this view's `_baseOffset`. @inlinable // FIXME(sil-serialize-all) internal func _toViewIndex(_ index: Index) -> Index { - return Index( - encodedOffset: index.encodedOffset + _baseOffset, - index._cache) + var ret = index + ret._codeUnitOffset += _baseOffset + return ret } /// The position of the first character in a nonempty character view. diff --git a/stdlib/public/core/StringGraphemeBreaking.swift b/stdlib/public/core/StringGraphemeBreaking.swift index 50a3567f24a..9a021ad5873 100644 --- a/stdlib/public/core/StringGraphemeBreaking.swift +++ b/stdlib/public/core/StringGraphemeBreaking.swift @@ -18,29 +18,10 @@ internal var _CR: UInt8 { return 0x0d } @inlinable // FIXME(sil-serialize-all) internal var _LF: UInt8 { return 0x0a } -extension String.Index { - @inlinable // FIXME(sil-serialize-all) - internal init(encodedOffset: Int, characterStride stride: Int) { - if _slowPath(stride == 0 || stride > UInt16.max) { - // Don't store a 0 stride for the endIndex - // or a truncated stride for an overlong grapheme cluster. - self.init(encodedOffset: encodedOffset) - return - } - self.init( - encodedOffset: encodedOffset, - .character(stride: UInt16(truncatingIfNeeded: stride))) - } -} - extension _StringVariant { @inlinable internal func _stride(at i: String.Index) -> Int { - if case .character(let stride) = i._cache { - // TODO: should _fastPath the case somehow - _sanityCheck(stride > 0) - return Int(stride) - } + if let stride = i.characterStride { return stride } return characterStride(atOffset: i.encodedOffset) } diff --git a/stdlib/public/core/StringIndex.swift b/stdlib/public/core/StringIndex.swift index f4a920d534d..2d8f768a6f8 100644 --- a/stdlib/public/core/StringIndex.swift +++ b/stdlib/public/core/StringIndex.swift @@ -13,45 +13,66 @@ extension String { /// A position of a character or code unit in a string. @_fixed_layout // FIXME(sil-serialize-all) public struct Index { - @usableFromInline // FIXME(sil-serialize-all) - internal var _compoundOffset : UInt64 - @usableFromInline - internal var _cache: _Cache - internal typealias _UTF8Buffer = UTF8.EncodedScalar - @_frozen // FIXME(sil-serialize-all) + + @usableFromInline // FIXME(sil-serialize-all) + internal var _codeUnitOffset: Int + @usableFromInline - internal enum _Cache { - case none - case utf8(buffer: _UTF8Buffer) - case character(stride: UInt16) - } + internal var _utf8Buffer = _UTF8Buffer() + + @usableFromInline + internal var _graphemeStrideCache: UInt16 = 0 + + @usableFromInline + internal var _transcodedOffset: Int8 = 0 } } /// Convenience accessors -extension String.Index._Cache { +extension String.Index { @inlinable // FIXME(sil-serialize-all) - internal var utf8: String.Index._UTF8Buffer? { - if case .utf8(let r) = self { return r } else { return nil } + internal var utf8Buffer: String.Index._UTF8Buffer? { + guard !_utf8Buffer.isEmpty else { return nil } + return _utf8Buffer } + @inlinable // FIXME(sil-serialize-all) - internal var character: UInt16? { - if case .character(let r) = self { return r } else { return nil } + internal var characterStride: Int? { + guard _graphemeStrideCache > 0 else { return nil } + return Int(truncatingIfNeeded: _graphemeStrideCache) + } + + // TODO: Probably worth carving a bit for, or maybe a isSubScalar bit... + @inlinable // FIXME(sil-serialize-all) + internal var isUTF8: Bool { + return self.utf8Buffer != nil || self.transcodedOffset > 0 } } extension String.Index : Equatable { + // A combined code unit and transcoded offset, for comparison purposes + @inlinable // FIXME(sil-serialize-all) + internal var _orderingValue: UInt64 { + let cuOffset = UInt64(truncatingIfNeeded: _codeUnitOffset) + _sanityCheck( + cuOffset & 0xFFFF_0000_0000_0000 == 0, "String length capped at 48bits") + let transOffset = UInt64(truncatingIfNeeded: _transcodedOffset) + _sanityCheck(transOffset <= 4, "UTF-8 max transcoding is 4 code units") + + return cuOffset &<< 2 | transOffset + } + @inlinable // FIXME(sil-serialize-all) public static func == (lhs: String.Index, rhs: String.Index) -> Bool { - return lhs._compoundOffset == rhs._compoundOffset + return lhs._orderingValue == rhs._orderingValue } } extension String.Index : Comparable { @inlinable // FIXME(sil-serialize-all) public static func < (lhs: String.Index, rhs: String.Index) -> Bool { - return lhs._compoundOffset < rhs._compoundOffset + return lhs._orderingValue < rhs._orderingValue } } @@ -63,56 +84,47 @@ extension String.Index : Hashable { /// of this instance. @inlinable // FIXME(sil-serialize-all) public func hash(into hasher: inout Hasher) { - hasher.combine(_compoundOffset) + hasher.combine(_orderingValue) } } extension String.Index { - internal typealias _Self = String.Index - /// Creates a new index at the specified UTF-16 offset. /// /// - Parameter offset: An offset in UTF-16 code units. @inlinable // FIXME(sil-serialize-all) public init(encodedOffset offset: Int) { - _compoundOffset = UInt64(offset << _Self._strideBits) - _cache = .none + self._codeUnitOffset = offset } @inlinable // FIXME(sil-serialize-all) - internal init(encodedOffset o: Int, transcodedOffset: Int = 0, _ c: _Cache) { - _compoundOffset = UInt64(o << _Self._strideBits | transcodedOffset) - _cache = c + internal init( + encodedOffset offset: Int, transcodedOffset: Int, buffer: _UTF8Buffer + ) { + _sanityCheck(transcodedOffset < Int8.max && transcodedOffset > Int8.min) + self._codeUnitOffset = offset + self._transcodedOffset = Int8(truncatingIfNeeded: transcodedOffset) + self._utf8Buffer = buffer } - - @inlinable // FIXME(sil-serialize-all) - internal static var _strideBits : Int { return 2 } - @inlinable // FIXME(sil-serialize-all) - internal static var _mask : UInt64 { return (1 &<< _Self._strideBits) &- 1 } - - @inlinable // FIXME(sil-serialize-all) - internal mutating func _setEncodedOffset(_ x: Int) { - _compoundOffset = UInt64(x << _Self._strideBits) + + @inlinable + internal init(encodedOffset: Int, characterStride: Int) { + self._codeUnitOffset = encodedOffset + if characterStride < UInt16.max { + self._graphemeStrideCache = UInt16(truncatingIfNeeded: characterStride) + } } /// The offset into a string's UTF-16 encoding for this index. @inlinable // FIXME(sil-serialize-all) public var encodedOffset : Int { - return Int(_compoundOffset >> _Self._strideBits) + return _codeUnitOffset } /// The offset of this index within whatever encoding this is being viewed as @inlinable // FIXME(sil-serialize-all) - internal var _transcodedOffset : Int { - get { - return Int(_compoundOffset & _Self._mask) - } - set { - let extended = UInt64(newValue) - _sanityCheck(extended <= _Self._mask) - _compoundOffset &= ~_Self._mask - _compoundOffset |= extended - } + internal var transcodedOffset: Int { + return Int(truncatingIfNeeded: _transcodedOffset) } } @@ -130,8 +142,8 @@ extension String.Index { @available(swift, deprecated: 3.2) @available(swift, obsoleted: 4.0) public // SPI(Foundation) - init(_offset: Int) { - self.init(encodedOffset: _offset) + init(_codeUnitOffset: Int) { + self.init(encodedOffset: _codeUnitOffset) } @inlinable // FIXME(sil-serialize-all) diff --git a/stdlib/public/core/StringRangeReplaceableCollection.swift b/stdlib/public/core/StringRangeReplaceableCollection.swift index fe404b2b493..000ac083d62 100644 --- a/stdlib/public/core/StringRangeReplaceableCollection.swift +++ b/stdlib/public/core/StringRangeReplaceableCollection.swift @@ -455,11 +455,8 @@ extension String { @inlinable // FIXME(sil-serialize-all) internal func _stride(of i: Index) -> Int { - if case .character(let stride) = i._cache { - // TODO: should _fastPath the case somehow - _sanityCheck(stride > 0) - return Int(stride) - } + if let stride = i.characterStride { return stride } + let offset = i.encodedOffset return _visitGuts(_guts, args: offset, ascii: { ascii, offset in diff --git a/stdlib/public/core/StringUTF16.swift b/stdlib/public/core/StringUTF16.swift index 5a97108a529..d5a1b52d3f2 100644 --- a/stdlib/public/core/StringUTF16.swift +++ b/stdlib/public/core/StringUTF16.swift @@ -322,9 +322,9 @@ extension String { // there is no owner and elements are dropped from the end. let wholeString = String(utf16._guts) guard - let start = UTF16Index(_offset: utf16._offset) + let start = UTF16Index(encodedOffset: utf16._offset) .samePosition(in: wholeString), - let end = UTF16Index(_offset: utf16._offset + utf16._length) + let end = UTF16Index(encodedOffset: utf16._offset + utf16._length) .samePosition(in: wholeString) else { diff --git a/stdlib/public/core/StringUTF8.swift b/stdlib/public/core/StringUTF8.swift index bb80b26e547..ecd789cb27a 100644 --- a/stdlib/public/core/StringUTF8.swift +++ b/stdlib/public/core/StringUTF8.swift @@ -187,7 +187,7 @@ extension String { return UTF8View._fillBuffer(from: &i)} ) - return Index(encodedOffset: n, .utf8(buffer: buffer)) + return Index(encodedOffset: n, transcodedOffset: 0, buffer: buffer) } @inline(__always) @@ -241,12 +241,12 @@ extension String { var j = i // Ensure j's cache is utf8 - if _slowPath(j._cache.utf8 == nil) { + if _slowPath(j.utf8Buffer == nil) { j = _nonASCIIIndex(atEncodedOffset: j.encodedOffset) precondition(j != endIndex, "Index out of bounds") } - let buffer = j._cache.utf8._unsafelyUnwrappedUnchecked + let buffer = j.utf8Buffer._unsafelyUnwrappedUnchecked var scalarLength16 = 1 let b0 = buffer.first._unsafelyUnwrappedUnchecked @@ -258,13 +258,13 @@ extension String { } else { // Number of bytes consumed in this scalar - let n8 = j._transcodedOffset + 1 + let n8 = j.transcodedOffset + 1 // If we haven't reached a scalar boundary... if _fastPath(n8 < leading1s) { // Advance to the next position in this scalar return Index( encodedOffset: j.encodedOffset, - transcodedOffset: n8, .utf8(buffer: buffer)) + transcodedOffset: n8, buffer: buffer) } // We reached a scalar boundary; compute the underlying utf16's width // based on the number of utf8 code units @@ -275,7 +275,8 @@ extension String { if _fastPath(!nextBuffer.isEmpty) { return Index( encodedOffset: j.encodedOffset + scalarLength16, - .utf8(buffer: nextBuffer)) + transcodedOffset: 0, + buffer: nextBuffer) } // If nothing left in the buffer, refill it. return _nonASCIIIndex(atEncodedOffset: j.encodedOffset + scalarLength16) @@ -296,11 +297,12 @@ extension String { @usableFromInline internal func _nonASCIIIndex(before i: Index) -> Index { _sanityCheck(!_guts._isASCIIOrSmallASCII) - if i._transcodedOffset != 0 { - _sanityCheck(i._cache.utf8 != nil) - var r = i - r._compoundOffset = r._compoundOffset &- 1 - return r + if i.transcodedOffset != 0 { + _sanityCheck(i.utf8Buffer != nil) + return Index( + encodedOffset: i.encodedOffset, + transcodedOffset: i.transcodedOffset &- 1, + buffer: i.utf8Buffer._unsafelyUnwrappedUnchecked) } // Handle the scalar boundary the same way as the not-a-utf8-index case. @@ -312,8 +314,7 @@ extension String { return Index( encodedOffset: i.encodedOffset &- (u8.count < 4 ? 1 : 2), transcodedOffset: u8.count &- 1, - .utf8(buffer: String.Index._UTF8Buffer(u8)) - ) + buffer: String.Index._UTF8Buffer(u8)) } @inlinable // FIXME(sil-serialize-all) @@ -338,7 +339,7 @@ extension String { start = j end = i } - let countAbs = end._transcodedOffset - start._transcodedOffset + let countAbs = end.transcodedOffset - start.transcodedOffset + _gutsNonASCIIUTF8Count(start.encodedOffset.. Int { } extension String.UTF8View { - @usableFromInline - internal static func _count(fromUTF16 source: Source) -> Int - where Source.Element == Unicode.UTF16.CodeUnit - { + internal static func _count( + fromUTF16 source: Source + ) -> Int where Source.Element == Unicode.UTF16.CodeUnit { var result = 0 var prev: Unicode.UTF16.CodeUnit = 0 for u in source { @@ -702,17 +702,14 @@ extension String.UTF8View.Index { /// - sourcePosition: A position in a `String` or one of its views. /// - target: The `UTF8View` in which to find the new position. @inlinable // FIXME(sil-serialize-all) - public init?(_ sourcePosition: String.Index, within target: String.UTF8View) { - switch sourcePosition._cache { - case .utf8: - self.init(encodedOffset: sourcePosition.encodedOffset, - transcodedOffset:sourcePosition._transcodedOffset, sourcePosition._cache) - - default: - guard String.UnicodeScalarView(target._guts)._isOnUnicodeScalarBoundary( - sourcePosition) else { return nil } - self.init(encodedOffset: sourcePosition.encodedOffset) + public init?(_ idx: String.Index, within target: String.UTF8View) { + guard idx.isUTF8 || + String.UnicodeScalarView(target._guts)._isOnUnicodeScalarBoundary(idx) + else { + return nil } + + self = idx } } @@ -794,23 +791,23 @@ extension String.UTF8View { r.upperBound.encodedOffset == _guts.count) || r.upperBound.samePosition(in: wholeString) == nil) - if r.upperBound._transcodedOffset == 0 { + if r.upperBound.transcodedOffset == 0 { return String.UTF8View( _guts._extractSlice( r.lowerBound.encodedOffset..( - _ view: View, for string: String, - stackTrace: SourceLocStack = SourceLocStack(), - showFrame: Bool = true, - file: String = #file, line: UInt = #line) { - - var stackTrace = stackTrace.pushIf(showFrame, file: file, line: line) - - let count = view.count - func expect(_ i: Int, - file: String = #file, line: UInt = #line - ) { - expectEqual(count, i, "for String: \(string)", - stackTrace: stackTrace.pushIf(showFrame, file: file, line: line), - showFrame: false) - } - - - let reversedView = view.reversed() - - expect(Array(view).count) - expect(view.indices.count) - expect(view.indices.reversed().count) - expect(reversedView.indices.count) - expect(view.distance(from: view.startIndex, to: view.endIndex)) - expect(reversedView.distance( - from: reversedView.startIndex, to: reversedView.endIndex)) +enum SimpleString: String { + case smallASCII = "abcdefg" + case smallUnicode = "abéÏ𓀀" + case largeASCII = "012345678901234567890" + case largeUnicode = "abéÏ012345678901234567890𓀀" + case emoji = "😀😃🤢🤮👩🏿‍🎤🧛🏻‍♂️🧛🏻‍♂️👩‍👩‍👦‍👦" } -StringIndexTests.test("views") { +let simpleStrings: [String] = [ + SimpleString.smallASCII.rawValue, + SimpleString.smallUnicode.rawValue, + SimpleString.largeASCII.rawValue, + SimpleString.largeUnicode.rawValue, + SimpleString.emoji.rawValue, +] + +StringIndexTests.test("basic sanity checks") { + for s in simpleStrings { + let utf8 = Array(s.utf8) + let subUTF8 = Array(s[...].utf8) + let utf16 = Array(s.utf16) + let subUTF16 = Array(s[...].utf16) + let utf32 = Array(s.unicodeScalars.map { $0.value }) + let subUTF32 = Array(s[...].unicodeScalars.map { $0.value }) + + expectEqual(s, String(decoding: utf8, as: UTF8.self)) + expectEqual(s, String(decoding: subUTF8, as: UTF8.self)) + expectEqual(s, String(decoding: utf16, as: UTF16.self)) + expectEqual(s, String(decoding: subUTF16, as: UTF16.self)) + expectEqual(s, String(decoding: utf32, as: UTF32.self)) + expectEqual(s, String(decoding: subUTF32, as: UTF32.self)) + } +} + +StringIndexTests.test("view counts") { + func validateViewCount( + _ view: View, for string: String, + stackTrace: SourceLocStack = SourceLocStack(), + showFrame: Bool = true, + file: String = #file, line: UInt = #line + ) where View.Element: Equatable, View.Index == String.Index { + + var stackTrace = stackTrace.pushIf(showFrame, file: file, line: line) + + let count = view.count + func expect(_ i: Int, + file: String = #file, line: UInt = #line + ) { + expectEqual(count, i, "for String: \(string)", + stackTrace: stackTrace.pushIf(showFrame, file: file, line: line), + showFrame: false) + } + + let reversedView = view.reversed() + + expect(Array(view).count) + expect(view.indices.count) + expect(view.indices.reversed().count) + expect(reversedView.indices.count) + expect(view.distance(from: view.startIndex, to: view.endIndex)) + expect(reversedView.distance( + from: reversedView.startIndex, to: reversedView.endIndex)) + + // Access the elements from the indices + expectEqual(Array(view), view.indices.map { view[$0] }) + expectEqual( + Array(reversedView), reversedView.indices.map { reversedView[$0] }) + + let indicesArray = Array(view.indices) + for i in 0..= 0b1100_0000 { + expectEqual(idx, idx.samePosition(in: s.unicodeScalars)) + expectEqual(idx, idx.samePosition(in: s.utf16)) + + // ASCII + if char <= 0x7F { + expectEqual(UInt16(char), s.utf16[idx]) + expectEqual(UInt32(char), s.unicodeScalars[idx].value) + } + } else { + // Continuation code unit + assert(char & 0b1100_0000 == 0b1000_0000) + expectNil(idx.samePosition(in: s)) + expectNil(idx.samePosition(in: s.utf16)) + expectNil(idx.samePosition(in: s.unicodeScalars)) + } + } + } + + for s in simpleStrings { + validateIndices(s) + } +} + runAllTests() \ No newline at end of file