[string] Clean up String.UTF8View

Extract slow paths into non-inlinable functions so that the fast paths
stay small and fast, and so that callers don't pay the code-size cost of
inlining the Unicode parsers.

Some tests are proactively extended to exercise the UTF8View of multiple
kinds of Strings (small and large, ASCII and non-ASCII).
This commit is contained in:
Michael Ilseman
2018-05-10 11:09:31 -07:00
committed by Michael Ilseman
parent 60b6789206
commit 00e214ec50
3 changed files with 167 additions and 64 deletions

View File

@@ -123,8 +123,15 @@ extension _StringGuts {
@inlinable
public // @testable
var isASCII: Bool {
// FIXME: Currently used to sometimes mean contiguous ASCII
return _object.isContiguousASCII
@inline(__always) get { return _object.isContiguousASCII }
}
/// Whether the UTF-8 fast paths may treat these guts as ASCII.
///
/// True when `isASCII` holds, or when the guts are small and their
/// `_smallUTF8String` reports itself as ASCII. The small-string check is
/// only evaluated when `isASCII` is false.
@inlinable
internal
var _isASCIIOrSmallASCII: Bool {
  @inline(__always) get {
    if isASCII { return true }
    return _isSmall && _smallUTF8String.isASCII
  }
}
@inlinable

View File

@@ -134,8 +134,15 @@ extension String {
/// If the UTF-8 view is empty, `startIndex` is equal to `endIndex`.
@inlinable // FIXME(sil-serialize-all)
public var startIndex: Index {
let r = _index(atEncodedOffset: _guts.startIndex)
if _legacyOffsets.start == 0 { return r }
let r: Index
if _fastPath(_guts._isASCIIOrSmallASCII) {
r = Index(encodedOffset: 0)
} else {
r = _nonASCIIIndex(atEncodedOffset: 0)
}
_sanityCheck(r.encodedOffset == 0)
if _fastPath(_legacyOffsets.start == 0) { return r }
return index(r, offsetBy: numericCast(_legacyOffsets.start))
}
@@ -160,32 +167,55 @@ extension String {
}
}
@inlinable // FIXME(sil-serialize-all)
internal func _index(atEncodedOffset n: Int) -> Index {
if _fastPath(_guts.isASCII) { return Index(encodedOffset: n) }
/// Slow-path construction of a UTF-8 view index at encoded offset `n` for
/// guts that are not (small) ASCII.
///
/// - Parameter n: The encoded offset into the guts at which to build the
///   index.
/// - Returns: `endIndex` when `n` equals the guts' count; otherwise an index
///   at offset `n` whose cache is a UTF-8 buffer filled by transcoding the
///   code units in `n..<count`.
@inline(never)
@effects(releasenone)
@usableFromInline
internal func _nonASCIIIndex(atEncodedOffset n: Int) -> Index {
  _sanityCheck(!_guts._isASCIIOrSmallASCII)
  let gutsCount = _guts.count
  guard n != gutsCount else { return endIndex }

  // Transcode as many scalars as fit into the index's UTF-8 buffer, starting
  // at `n` and reading to the end of the guts.
  let remaining = n..<gutsCount
  let utf8Buffer: Index._UTF8Buffer = _visitGuts(
    _guts, range: (remaining, performBoundsCheck: true),
    ascii: { _ in
      // Marked unreachable: this function is only for non-ASCII guts.
      Builtin.unreachable()
      return Index._UTF8Buffer() },
    utf16: { codeUnits in
      var iterator = codeUnits.makeIterator()
      return UTF8View._fillBuffer(from: &iterator) },
    opaque: { codeUnits in
      var iterator = codeUnits.makeIterator()
      return UTF8View._fillBuffer(from: &iterator) }
  )
  return Index(encodedOffset: n, .utf8(buffer: utf8Buffer))
}
@inline(__always)
internal
static func _fillBuffer<Iter: IteratorProtocol>(
from i: inout Iter
) -> Index._UTF8Buffer where Iter.Element == UInt16 {
var p = UTF16.ForwardParser()
var i = _guts.makeIterator(in: n..<count)
var buffer = Index._UTF8Buffer()
Loop:
while true {
switch p.parseScalar(from: &i) {
case .valid(let u16):
let u8 = Unicode.UTF8.transcode(u16, from: Unicode.UTF16.self)
._unsafelyUnwrappedUnchecked
if buffer.count + u8.count > buffer.capacity { break Loop }
if buffer.count + u8.count > buffer.capacity {
return buffer
}
buffer.append(contentsOf: u8)
case .error:
let u8 = Unicode.UTF8.encodedReplacementCharacter
if buffer.count + u8.count > buffer.capacity { break Loop }
if buffer.count + u8.count > buffer.capacity {
return buffer
}
buffer.append(contentsOf: u8)
case .emptyInput:
break Loop
return buffer
}
}
return Index(encodedOffset: n, .utf8(buffer: buffer))
}
/// Returns the next consecutive position after `i`.
@@ -194,16 +224,25 @@ extension String {
@inlinable // FIXME(sil-serialize-all)
@inline(__always)
public func index(after i: Index) -> Index {
if _fastPath(_guts.isASCII) {
if _fastPath(_guts._isASCIIOrSmallASCII) {
precondition(i.encodedOffset < _guts.count)
return Index(encodedOffset: i.encodedOffset + 1)
}
return _nonASCIIIndex(after: i)
}
@inline(never)
@effects(releasenone)
@usableFromInline
internal func _nonASCIIIndex(after i: Index) -> Index {
_sanityCheck(!_guts._isASCIIOrSmallASCII)
var j = i
// Ensure j's cache is utf8
if _slowPath(j._cache.utf8 == nil) {
j = _index(atEncodedOffset: j.encodedOffset)
j = _nonASCIIIndex(atEncodedOffset: j.encodedOffset)
precondition(j != endIndex, "Index out of bounds")
}
@@ -239,16 +278,24 @@ extension String {
.utf8(buffer: nextBuffer))
}
// If nothing left in the buffer, refill it.
return _index(atEncodedOffset: j.encodedOffset + scalarLength16)
return _nonASCIIIndex(atEncodedOffset: j.encodedOffset + scalarLength16)
}
@inlinable // FIXME(sil-serialize-all)
public func index(before i: Index) -> Index {
if _fastPath(_guts.isASCII) {
if _fastPath(_guts._isASCIIOrSmallASCII) {
precondition(i.encodedOffset > 0)
return Index(encodedOffset: i.encodedOffset - 1)
}
return _nonASCIIIndex(before: i)
}
@inline(never)
@effects(releasenone)
@usableFromInline
internal func _nonASCIIIndex(before i: Index) -> Index {
_sanityCheck(!_guts._isASCIIOrSmallASCII)
if i._transcodedOffset != 0 {
_sanityCheck(i._cache.utf8 != nil)
var r = i
@@ -271,19 +318,29 @@ extension String {
@inlinable // FIXME(sil-serialize-all)
public func distance(from i: Index, to j: Index) -> Int {
if _fastPath(_guts.isASCII) {
if _fastPath(_guts._isASCIIOrSmallASCII) {
return j.encodedOffset - i.encodedOffset
}
return j >= i
? _forwardDistance(from: i, to: j) : -_forwardDistance(from: j, to: i)
return _nonASCIIDistance(from: i, to: j)
}
@inlinable // FIXME(sil-serialize-all)
@inline(__always)
internal func _forwardDistance(from i: Index, to j: Index) -> Int {
return j._transcodedOffset - i._transcodedOffset +
String.UTF8View._count(fromUTF16: IteratorSequence(_guts.makeIterator(
in: i.encodedOffset..<j.encodedOffset)))
/// Slow path of `distance(from:to:)` for views whose guts are not (small)
/// ASCII. Kept out of line so the inlinable fast path stays small.
///
/// - Parameters:
///   - i: The index to measure from.
///   - j: The index to measure to.
/// - Returns: The signed distance from `i` to `j`; negative when `j` orders
///   before `i`.
@inline(never)
@effects(releasenone)
@usableFromInline
internal func _nonASCIIDistance(from i: Index, to j: Index) -> Int {
  // Same contract as the other `_nonASCII…` slow paths: the caller must have
  // already ruled out the ASCII fast path.
  _sanityCheck(!_guts._isASCIIOrSmallASCII)

  // Always measure from the lower index to the higher one so the
  // encoded-offset range is well formed, then restore the sign at the end.
  let forwards = j >= i
  let (start, end) = forwards ? (i, j) : (j, i)

  // UTF-8 count of the whole code units between the two encoded offsets,
  // adjusted by how far each index sits inside its transcoded scalar.
  let countAbs = end._transcodedOffset - start._transcodedOffset
    + _gutsNonASCIIUTF8Count(start.encodedOffset..<end.encodedOffset)
  return forwards ? countAbs : -countAbs
}
/// Accesses the code unit at the given position.
@@ -302,12 +359,25 @@ extension String {
public subscript(position: Index) -> UTF8.CodeUnit {
@inline(__always)
get {
if _fastPath(_guts.isASCII) {
let ascii = _guts._unmanagedASCIIView
if _fastPath(_guts._isASCIIOrSmallASCII) {
let offset = position.encodedOffset
_precondition(offset < ascii.count, "Index out of bounds")
return ascii.buffer[position.encodedOffset]
_precondition(offset < _guts.count, "Index out of bounds")
if _guts._isSmall {
return _guts._smallUTF8String[offset]
}
return _guts._unmanagedASCIIView.buffer[offset]
}
return _nonASCIISubscript(position: position)
}
}
@inline(never)
@effects(releasenone)
@usableFromInline
internal func _nonASCIISubscript(position: Index) -> UTF8.CodeUnit {
_sanityCheck(!_guts._isASCIIOrSmallASCII)
var j = position
while true {
if case .utf8(let buffer) = j._cache {
@@ -315,11 +385,10 @@ extension String {
return buffer[
buffer.index(buffer.startIndex, offsetBy: j._transcodedOffset)]
}
j = _index(atEncodedOffset: j.encodedOffset)
j = _nonASCIIIndex(atEncodedOffset: j.encodedOffset)
precondition(j < endIndex, "Index out of bounds")
}
}
}
@inlinable // FIXME(sil-serialize-all)
public var description: String {
@@ -545,21 +614,27 @@ extension String.UTF8View.Iterator : IteratorProtocol {
}
}
// Returns how much the UTF-16 code unit `utf16CU` adds to a running count of
// UTF-8 code units. `prev` is the code unit immediately before it, which
// matters for non-BMP scalars: a trail surrogate that follows a lead
// surrogate adds only 1, because the lead surrogate has already been counted
// as 3 (4 in total for the pair). A trail surrogate without a preceding lead
// surrogate adds 3.
internal func _utf8Count(_ utf16CU: UInt16, prev: UInt16) -> Int {
  if utf16CU < 0x80 { return 1 }
  if utf16CU < 0x800 { return 2 }
  let isTrailSurrogate = utf16CU >= 0xDC00 && utf16CU < 0xE000
  if isTrailSurrogate && UTF16.isLeadSurrogate(prev) { return 1 }
  return 3
}
extension String.UTF8View {
@inlinable // FIXME(sil-serialize-all)
@usableFromInline
internal static func _count<Source: Sequence>(fromUTF16 source: Source) -> Int
where Source.Element == Unicode.UTF16.CodeUnit
{
var result = 0
var prev: Unicode.UTF16.CodeUnit = 0
for u in source {
switch u {
case 0..<0x80: result += 1
case 0x80..<0x800: result += 2
case 0x800..<0xDC00: result += 3
case 0xDC00..<0xE000: result += UTF16.isLeadSurrogate(prev) ? 1 : 3
default: result += 3
}
result += _utf8Count(u, prev: prev)
prev = u
}
return result
@@ -567,11 +642,23 @@ extension String.UTF8View {
@inlinable // FIXME(sil-serialize-all)
public var count: Int {
if _fastPath(_guts.isASCII) { return _guts.count }
return _visitGuts(_guts,
let gutsCount = _guts.count
if _fastPath(_guts._isASCIIOrSmallASCII) { return gutsCount }
return _gutsNonASCIIUTF8Count(0..<gutsCount)
}
@inline(never)
@effects(releasenone)
@usableFromInline
internal func _gutsNonASCIIUTF8Count(
_ range: Range<Int>
) -> Int {
_sanityCheck(!_guts._isASCIIOrSmallASCII)
return _visitGuts(_guts, range: (range, performBoundsCheck: true),
ascii: { ascii in return ascii.count },
utf16: { utf16 in return String.UTF8View._count(fromUTF16: utf16) },
opaque: { opaque in return String.UTF8View._count(fromUTF16: opaque) })
opaque: { opaque in return String.UTF8View._count(fromUTF16: opaque) }
)
}
}

View File

@@ -195,7 +195,15 @@ SubstringTests.test("Mutate Substring through unicodeScalars view") {
}
SubstringTests.test("UTF8View") {
let s = "abcdefg"
let strs = [
"abcdefg", // Small ASCII
"abéÏ", // Small Unicode
"012345678901234567890", // Large ASCII
"abéÏ012345678901234567890", // Large Unicode
]
for s in strs {
let count = s.count
let t = s.utf8.dropFirst(2)
let u = t.dropFirst(2)
@@ -206,10 +214,11 @@ SubstringTests.test("UTF8View") {
checkMatch(t, u, u.startIndex)
checkMatch(t, u, u.index(after: u.startIndex))
expectEqual("", String(t.dropFirst(10))!)
expectEqual("", String(t.dropLast(10))!)
expectEqual("", String(u.dropFirst(10))!)
expectEqual("", String(u.dropLast(10))!)
expectEqual("", String(t.dropFirst(100))!)
expectEqual("", String(t.dropLast(100))!)
expectEqual("", String(u.dropFirst(100))!)
expectEqual("", String(u.dropLast(100))!)
}
}
SubstringTests.test("Persistent Content") {