[stdlib] Rebuild String.Index for UTF8View

This commit is contained in:
Dave Abrahams
2017-06-05 17:12:10 -07:00
parent 576b8de64a
commit 283775ed1f
7 changed files with 151 additions and 218 deletions

View File

@@ -585,99 +585,6 @@ public func transcode<
}
}
/// Transcodes a run of UTF-16 code units from `input`, starting at
/// `startIndex`, into one packed chunk of UTF-8 code units.
///
/// Ill-formed input (a lone low surrogate, or a high surrogate that is not
/// followed by a low surrogate) is replaced with U+FFFD.
///
/// UTF-8 code units are stored starting at the least significant byte of the
/// chunk.  Transcoding stops at the end of `input`, when the chunk is full,
/// or when the next scalar's encoding does not fit in the space that is
/// left.  Every bit above the last stored code unit is set to 1.
///
/// - Returns: The index of the first code unit of `input` that was not
///   consumed, and the chunk holding the UTF-8 that was produced.
internal func _transcodeSomeUTF16AsUTF8<Input : Collection>(
  _ input: Input, _ startIndex: Input.Index
) -> (Input.Index, _StringCore._UTF8Chunk)
  where Input.Element == UInt16 {
  typealias Chunk = _StringCore._UTF8Chunk

  // UTF-8 encoding of U+FFFD (EF BF BD), first byte in the low-order
  // position to match the packing order of the chunk.
  let replacementBytes: UInt = 0xbdbfef

  let end = input.endIndex
  let capacity = MemoryLayout<Chunk>.size
  var chunk: Chunk = 0
  var written = 0
  var cursor = startIndex

  while cursor != end && written != capacity {
    let unit = UInt(input[cursor])
    let bitOffset = Chunk(written * 8)

    // ASCII: one code unit in, one byte out.  The loop condition already
    // guarantees there is room for a single byte.
    if _fastPath(unit <= 0x7f) {
      chunk |= Chunk(unit) &<< bitOffset
      written += 1
      cursor = input.index(cursor, offsetBy: 1)
      continue
    }

    // Everything else yields 2-4 bytes, assembled in `encoded` with the
    // first byte in the low-order position.
    var consumed: Input.IndexDistance = 1
    let encoded: UInt
    let byteCount: Int

    if _fastPath((unit &>> 11) != 0x1b) {
      // Not a surrogate (top five bits are not 11011), so this single code
      // unit is a complete scalar.
      if unit < 0x800 {
        // 110xxxxx 10xxxxxx
        let lead = unit &>> 6
        let tail = (unit & 0x3f) &<< 8
        encoded = 0x80_C0 | lead | tail
        byteCount = 2
      } else {
        // 1110xxxx 10xxxxxx 10xxxxxx
        let lead = unit &>> 12
        let middle = ((unit &>> 6) & 0x3f) &<< 8
        let tail = (unit & 0x3f) &<< 16
        encoded = 0x80_80_E0 | lead | middle | tail
        byteCount = 3
      }
    } else if _slowPath((unit &>> 10) == 0x37) {
      // A low surrogate (top six bits 110111) with no high surrogate before
      // it: ill-formed.
      encoded = replacementBytes
      byteCount = 3
    } else {
      // `unit` is a high surrogate; it is only valid when a low surrogate
      // follows immediately.
      let following = input.index(cursor, offsetBy: 1)
      if _slowPath(following == end) {
        // High surrogate at end of input: ill-formed.
        encoded = replacementBytes
        byteCount = 3
      } else {
        let trail = UInt(input[following])
        if _fastPath((trail &>> 10) == 0x37) {
          // Well-formed surrogate pair; rebuild the scalar value and emit
          // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
          let scalar = 0x10000 + (((unit & 0x03ff) &<< 10) | (trail & 0x03ff))
          let byte0 = scalar &>> 18
          let byte1 = ((scalar &>> 12) & 0x3f) &<< 8
          let byte2 = ((scalar &>> 6) & 0x3f) &<< 16
          let byte3 = (scalar & 0x3f) &<< 24
          encoded = 0x80_80_80_F0 | byte0 | byte1 | byte2 | byte3
          byteCount = 4
          consumed = 2
        } else {
          // High surrogate followed by something that is not a low
          // surrogate: ill-formed.  Only the high surrogate is consumed.
          encoded = replacementBytes
          byteCount = 3
        }
      }
    }

    // Never store part of a scalar: if it does not fit, leave it for the
    // caller to pick up from the returned index.
    guard written + byteCount <= capacity else {
      break
    }
    chunk |= numericCast(encoded) &<< bitOffset
    written += byteCount
    cursor = input.index(cursor, offsetBy: consumed)
  }

  // Fill every bit above the stored code units with 1s.  A completely full
  // chunk has nothing left to fill, so it is skipped.
  // FIXME: Annoying check, courtesy of <rdar://problem/16740169>
  if written < capacity {
    chunk |= ~0 &<< numericCast(written * 8)
  }
  return (cursor, chunk)
}
/// Instances of conforming types are used in internal `String`
/// representation.
public // @testable