mirror of
https://github.com/apple/swift.git
synced 2025-12-21 12:14:44 +01:00
[string] Clean up String.UTF8View
Extract slow paths into non-inlinable functions so that the fast paths can be faster and we don't pay for the large code bloat of the Unicode parsers. Some tests were proactively extended to cover the UTF8View of multiple kinds of Strings.
This commit is contained in:
committed by
Michael Ilseman
parent
60b6789206
commit
00e214ec50
@@ -123,8 +123,15 @@ extension _StringGuts {
|
||||
@inlinable
|
||||
public // @testable
|
||||
var isASCII: Bool {
|
||||
// FIXME: Currently used to sometimes mean contiguous ASCII
|
||||
return _object.isContiguousASCII
|
||||
@inline(__always) get { return _object.isContiguousASCII }
|
||||
}
|
||||
|
||||
/// Whether these guts satisfy `isASCII`, or are a small string whose
/// `_smallUTF8String` reports `isASCII`.
@inlinable
|
||||
internal
|
||||
var _isASCIIOrSmallASCII: Bool {
|
||||
@inline(__always) get {
|
||||
// `&&` binds tighter than `||`, so this reads as
// `isASCII || (_isSmall && _smallUTF8String.isASCII)`; the small-string
// check is only evaluated when `isASCII` is false.
return isASCII || _isSmall && _smallUTF8String.isASCII
|
||||
}
|
||||
}
|
||||
|
||||
@inlinable
|
||||
|
||||
@@ -134,8 +134,15 @@ extension String {
|
||||
/// If the UTF-8 view is empty, `startIndex` is equal to `endIndex`.
|
||||
@inlinable // FIXME(sil-serialize-all)
|
||||
public var startIndex: Index {
|
||||
let r = _index(atEncodedOffset: _guts.startIndex)
|
||||
if _legacyOffsets.start == 0 { return r }
|
||||
let r: Index
|
||||
if _fastPath(_guts._isASCIIOrSmallASCII) {
|
||||
r = Index(encodedOffset: 0)
|
||||
} else {
|
||||
r = _nonASCIIIndex(atEncodedOffset: 0)
|
||||
}
|
||||
_sanityCheck(r.encodedOffset == 0)
|
||||
if _fastPath(_legacyOffsets.start == 0) { return r }
|
||||
|
||||
return index(r, offsetBy: numericCast(_legacyOffsets.start))
|
||||
}
|
||||
|
||||
@@ -160,32 +167,55 @@ extension String {
|
||||
}
|
||||
}
|
||||
|
||||
@inlinable // FIXME(sil-serialize-all)
|
||||
internal func _index(atEncodedOffset n: Int) -> Index {
|
||||
if _fastPath(_guts.isASCII) { return Index(encodedOffset: n) }
|
||||
/// Out-of-line (never inlined) path that builds an index at encoded offset
/// `n` for guts that are not ASCII / small ASCII.
///
/// Returns `endIndex` when `n` equals `_guts.count`. Otherwise the result
/// carries a `.utf8` cache buffer filled by `_fillBuffer` from the code units
/// in `n..<count`.
@inline(never)
|
||||
@effects(releasenone)
|
||||
@usableFromInline
|
||||
internal func _nonASCIIIndex(atEncodedOffset n: Int) -> Index {
|
||||
// Callers are expected to have taken the ASCII fast path already.
_sanityCheck(!_guts._isASCIIOrSmallASCII)
|
||||
let count = _guts.count
|
||||
// Nothing left to transcode at the very end; hand back `endIndex` directly.
if n == count { return endIndex }
|
||||
let buffer: Index._UTF8Buffer = _visitGuts(
|
||||
_guts, range: (n..<count, performBoundsCheck: true),
|
||||
// The ASCII visitor is marked unreachable, consistent with the sanity
// check above; the `return` after it only satisfies the closure's type.
ascii: { _ in
|
||||
Builtin.unreachable()
|
||||
return Index._UTF8Buffer() },
|
||||
// Both remaining representations are iterated as UTF-16 code units and
// handed to the shared `_fillBuffer` helper.
utf16: { utf16 in
|
||||
var i = utf16.makeIterator()
|
||||
return UTF8View._fillBuffer(from: &i) },
|
||||
opaque: { opaque in
|
||||
var i = opaque.makeIterator()
|
||||
return UTF8View._fillBuffer(from: &i)}
|
||||
)
|
||||
|
||||
return Index(encodedOffset: n, .utf8(buffer: buffer))
|
||||
}
|
||||
|
||||
@inline(__always)
|
||||
internal
|
||||
static func _fillBuffer<Iter: IteratorProtocol>(
|
||||
from i: inout Iter
|
||||
) -> Index._UTF8Buffer where Iter.Element == UInt16 {
|
||||
var p = UTF16.ForwardParser()
|
||||
var i = _guts.makeIterator(in: n..<count)
|
||||
var buffer = Index._UTF8Buffer()
|
||||
Loop:
|
||||
while true {
|
||||
switch p.parseScalar(from: &i) {
|
||||
case .valid(let u16):
|
||||
let u8 = Unicode.UTF8.transcode(u16, from: Unicode.UTF16.self)
|
||||
._unsafelyUnwrappedUnchecked
|
||||
if buffer.count + u8.count > buffer.capacity { break Loop }
|
||||
if buffer.count + u8.count > buffer.capacity {
|
||||
return buffer
|
||||
}
|
||||
buffer.append(contentsOf: u8)
|
||||
case .error:
|
||||
let u8 = Unicode.UTF8.encodedReplacementCharacter
|
||||
if buffer.count + u8.count > buffer.capacity { break Loop }
|
||||
if buffer.count + u8.count > buffer.capacity {
|
||||
return buffer
|
||||
}
|
||||
buffer.append(contentsOf: u8)
|
||||
case .emptyInput:
|
||||
break Loop
|
||||
return buffer
|
||||
}
|
||||
}
|
||||
return Index(encodedOffset: n, .utf8(buffer: buffer))
|
||||
}
|
||||
|
||||
/// Returns the next consecutive position after `i`.
|
||||
@@ -194,16 +224,25 @@ extension String {
|
||||
@inlinable // FIXME(sil-serialize-all)
|
||||
@inline(__always)
|
||||
public func index(after i: Index) -> Index {
|
||||
if _fastPath(_guts.isASCII) {
|
||||
if _fastPath(_guts._isASCIIOrSmallASCII) {
|
||||
precondition(i.encodedOffset < _guts.count)
|
||||
return Index(encodedOffset: i.encodedOffset + 1)
|
||||
}
|
||||
|
||||
return _nonASCIIIndex(after: i)
|
||||
}
|
||||
|
||||
@inline(never)
|
||||
@effects(releasenone)
|
||||
@usableFromInline
|
||||
internal func _nonASCIIIndex(after i: Index) -> Index {
|
||||
_sanityCheck(!_guts._isASCIIOrSmallASCII)
|
||||
|
||||
var j = i
|
||||
|
||||
// Ensure j's cache is utf8
|
||||
if _slowPath(j._cache.utf8 == nil) {
|
||||
j = _index(atEncodedOffset: j.encodedOffset)
|
||||
j = _nonASCIIIndex(atEncodedOffset: j.encodedOffset)
|
||||
precondition(j != endIndex, "Index out of bounds")
|
||||
}
|
||||
|
||||
@@ -239,16 +278,24 @@ extension String {
|
||||
.utf8(buffer: nextBuffer))
|
||||
}
|
||||
// If nothing left in the buffer, refill it.
|
||||
return _index(atEncodedOffset: j.encodedOffset + scalarLength16)
|
||||
return _nonASCIIIndex(atEncodedOffset: j.encodedOffset + scalarLength16)
|
||||
}
|
||||
|
||||
@inlinable // FIXME(sil-serialize-all)
|
||||
public func index(before i: Index) -> Index {
|
||||
if _fastPath(_guts.isASCII) {
|
||||
if _fastPath(_guts._isASCIIOrSmallASCII) {
|
||||
precondition(i.encodedOffset > 0)
|
||||
return Index(encodedOffset: i.encodedOffset - 1)
|
||||
}
|
||||
|
||||
return _nonASCIIIndex(before: i)
|
||||
}
|
||||
|
||||
@inline(never)
|
||||
@effects(releasenone)
|
||||
@usableFromInline
|
||||
internal func _nonASCIIIndex(before i: Index) -> Index {
|
||||
_sanityCheck(!_guts._isASCIIOrSmallASCII)
|
||||
if i._transcodedOffset != 0 {
|
||||
_sanityCheck(i._cache.utf8 != nil)
|
||||
var r = i
|
||||
@@ -271,19 +318,29 @@ extension String {
|
||||
|
||||
@inlinable // FIXME(sil-serialize-all)
|
||||
public func distance(from i: Index, to j: Index) -> Int {
|
||||
if _fastPath(_guts.isASCII) {
|
||||
if _fastPath(_guts._isASCIIOrSmallASCII) {
|
||||
return j.encodedOffset - i.encodedOffset
|
||||
}
|
||||
return j >= i
|
||||
? _forwardDistance(from: i, to: j) : -_forwardDistance(from: j, to: i)
|
||||
return _nonASCIIDistance(from: i, to: j)
|
||||
}
|
||||
|
||||
@inlinable // FIXME(sil-serialize-all)
|
||||
@inline(__always)
|
||||
internal func _forwardDistance(from i: Index, to j: Index) -> Int {
|
||||
return j._transcodedOffset - i._transcodedOffset +
|
||||
String.UTF8View._count(fromUTF16: IteratorSequence(_guts.makeIterator(
|
||||
in: i.encodedOffset..<j.encodedOffset)))
|
||||
/// Out-of-line (never inlined) computation of the signed distance from `i`
/// to `j` for guts that are not ASCII / small ASCII.
///
/// The magnitude is the difference of the two indices' transcoded offsets
/// plus `_gutsNonASCIIUTF8Count` over the encoded-offset range between them;
/// the result is negated when `j` precedes `i`.
@inline(never)
|
||||
@effects(releasenone)
|
||||
@usableFromInline
|
||||
internal func _nonASCIIDistance(from i: Index, to j: Index) -> Int {
|
||||
let forwards = j >= i
|
||||
|
||||
// Order the endpoints so that the range formed below is never reversed.
let start, end: Index
|
||||
if forwards {
|
||||
start = i
|
||||
end = j
|
||||
} else {
|
||||
start = j
|
||||
end = i
|
||||
}
|
||||
// The `+ …` on the following line continues this expression.
let countAbs = end._transcodedOffset - start._transcodedOffset
|
||||
+ _gutsNonASCIIUTF8Count(start.encodedOffset..<end.encodedOffset)
|
||||
// Restore the sign for a backwards measurement.
return forwards ? countAbs : -countAbs
|
||||
}
|
||||
|
||||
/// Accesses the code unit at the given position.
|
||||
@@ -302,12 +359,25 @@ extension String {
|
||||
public subscript(position: Index) -> UTF8.CodeUnit {
|
||||
@inline(__always)
|
||||
get {
|
||||
if _fastPath(_guts.isASCII) {
|
||||
let ascii = _guts._unmanagedASCIIView
|
||||
if _fastPath(_guts._isASCIIOrSmallASCII) {
|
||||
let offset = position.encodedOffset
|
||||
_precondition(offset < ascii.count, "Index out of bounds")
|
||||
return ascii.buffer[position.encodedOffset]
|
||||
_precondition(offset < _guts.count, "Index out of bounds")
|
||||
|
||||
if _guts._isSmall {
|
||||
return _guts._smallUTF8String[offset]
|
||||
}
|
||||
return _guts._unmanagedASCIIView.buffer[offset]
|
||||
}
|
||||
|
||||
return _nonASCIISubscript(position: position)
|
||||
}
|
||||
}
|
||||
|
||||
@inline(never)
|
||||
@effects(releasenone)
|
||||
@usableFromInline
|
||||
internal func _nonASCIISubscript(position: Index) -> UTF8.CodeUnit {
|
||||
_sanityCheck(!_guts._isASCIIOrSmallASCII)
|
||||
var j = position
|
||||
while true {
|
||||
if case .utf8(let buffer) = j._cache {
|
||||
@@ -315,11 +385,10 @@ extension String {
|
||||
return buffer[
|
||||
buffer.index(buffer.startIndex, offsetBy: j._transcodedOffset)]
|
||||
}
|
||||
j = _index(atEncodedOffset: j.encodedOffset)
|
||||
j = _nonASCIIIndex(atEncodedOffset: j.encodedOffset)
|
||||
precondition(j < endIndex, "Index out of bounds")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@inlinable // FIXME(sil-serialize-all)
|
||||
public var description: String {
|
||||
@@ -545,21 +614,27 @@ extension String.UTF8View.Iterator : IteratorProtocol {
|
||||
}
|
||||
}
|
||||
|
||||
// Used to calculate a running count. For non-BMP scalars, it's important if the
|
||||
// prior code unit was a leading surrogate (validity).
//
// Returns the number of UTF-8 code units that the UTF-16 code unit `utf16CU`
// contributes to a running total, given the code unit `prev` that preceded
// it.
|
||||
internal func _utf8Count(_ utf16CU: UInt16, prev: UInt16) -> Int {
|
||||
switch utf16CU {
|
||||
case 0..<0x80: return 1
|
||||
case 0x80..<0x800: return 2
|
||||
// This range also contains the lead surrogates (0xD800..<0xDC00), which are
// therefore counted as 3 when first seen.
case 0x800..<0xDC00: return 3
|
||||
// Trail surrogate: after a lead surrogate (already counted as 3) it adds 1,
// giving 4 for the pair; otherwise it is counted as 3 on its own.
case 0xDC00..<0xE000: return UTF16.isLeadSurrogate(prev) ? 1 : 3
|
||||
default: return 3
|
||||
}
|
||||
}
|
||||
|
||||
extension String.UTF8View {
|
||||
@inlinable // FIXME(sil-serialize-all)
|
||||
@usableFromInline
|
||||
internal static func _count<Source: Sequence>(fromUTF16 source: Source) -> Int
|
||||
where Source.Element == Unicode.UTF16.CodeUnit
|
||||
{
|
||||
var result = 0
|
||||
var prev: Unicode.UTF16.CodeUnit = 0
|
||||
for u in source {
|
||||
switch u {
|
||||
case 0..<0x80: result += 1
|
||||
case 0x80..<0x800: result += 2
|
||||
case 0x800..<0xDC00: result += 3
|
||||
case 0xDC00..<0xE000: result += UTF16.isLeadSurrogate(prev) ? 1 : 3
|
||||
default: result += 3
|
||||
}
|
||||
result += _utf8Count(u, prev: prev)
|
||||
prev = u
|
||||
}
|
||||
return result
|
||||
@@ -567,11 +642,23 @@ extension String.UTF8View {
|
||||
|
||||
@inlinable // FIXME(sil-serialize-all)
|
||||
public var count: Int {
|
||||
if _fastPath(_guts.isASCII) { return _guts.count }
|
||||
return _visitGuts(_guts,
|
||||
let gutsCount = _guts.count
|
||||
if _fastPath(_guts._isASCIIOrSmallASCII) { return gutsCount }
|
||||
return _gutsNonASCIIUTF8Count(0..<gutsCount)
|
||||
}
|
||||
|
||||
@inline(never)
|
||||
@effects(releasenone)
|
||||
@usableFromInline
|
||||
internal func _gutsNonASCIIUTF8Count(
|
||||
_ range: Range<Int>
|
||||
) -> Int {
|
||||
_sanityCheck(!_guts._isASCIIOrSmallASCII)
|
||||
return _visitGuts(_guts, range: (range, performBoundsCheck: true),
|
||||
ascii: { ascii in return ascii.count },
|
||||
utf16: { utf16 in return String.UTF8View._count(fromUTF16: utf16) },
|
||||
opaque: { opaque in return String.UTF8View._count(fromUTF16: opaque) })
|
||||
opaque: { opaque in return String.UTF8View._count(fromUTF16: opaque) }
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -195,7 +195,15 @@ SubstringTests.test("Mutate Substring through unicodeScalars view") {
|
||||
}
|
||||
|
||||
SubstringTests.test("UTF8View") {
|
||||
let s = "abcdefg"
|
||||
let strs = [
|
||||
"abcdefg", // Small ASCII
|
||||
"abéÏ", // Small Unicode
|
||||
"012345678901234567890", // Large ASCII
|
||||
"abéÏ012345678901234567890", // Large Unicode
|
||||
]
|
||||
|
||||
for s in strs {
|
||||
let count = s.count
|
||||
let t = s.utf8.dropFirst(2)
|
||||
let u = t.dropFirst(2)
|
||||
|
||||
@@ -206,10 +214,11 @@ SubstringTests.test("UTF8View") {
|
||||
checkMatch(t, u, u.startIndex)
|
||||
checkMatch(t, u, u.index(after: u.startIndex))
|
||||
|
||||
expectEqual("", String(t.dropFirst(10))!)
|
||||
expectEqual("", String(t.dropLast(10))!)
|
||||
expectEqual("", String(u.dropFirst(10))!)
|
||||
expectEqual("", String(u.dropLast(10))!)
|
||||
expectEqual("", String(t.dropFirst(100))!)
|
||||
expectEqual("", String(t.dropLast(100))!)
|
||||
expectEqual("", String(u.dropFirst(100))!)
|
||||
expectEqual("", String(u.dropLast(100))!)
|
||||
}
|
||||
}
|
||||
|
||||
SubstringTests.test("Persistent Content") {
|
||||
|
||||
Reference in New Issue
Block a user