mirror of
https://github.com/apple/swift.git
synced 2025-12-21 12:14:44 +01:00
stdlib/String: if we cannot get a contiguous data buffer out of NSString,
don't call into CoreFoundation to perform UTF-8 transcoding. CoreFoundation can replace ill-formed sequences with a single byte, which is not good enough to implement U+FFFD insertion. Instead, use the same transcoding routine as for the contiguous buffer. The transcoding routine is pulled out into a generic function that should be specialized and simplified for the case when the input is UnsafeArray; we should not be losing efficiency here. Fixes <rdar://problem/17297055> [unicode] println crashes when given string with unpaired surrogate. Swift SVN r19157
This commit is contained in:
@@ -43,104 +43,35 @@ extension _StringCore {
|
||||
size: numericCast(utf16Count))
|
||||
|
||||
return (i + utf16Count, result)
|
||||
}
|
||||
else {
|
||||
return _encodeSomeUTF16AsUTF8(i)
|
||||
} else if _fastPath(!_baseAddress._isNull) {
|
||||
return _encodeSomeContiguousUTF16AsUTF8(i)
|
||||
} else {
|
||||
return _encodeSomeNonContiguousUTF16AsUTF8(i)
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper for _encodeSomeUTF8, above. Handles the case where we
|
||||
/// don't have contiguous ASCII storage.
|
||||
func _encodeSomeUTF16AsUTF8(i: Int) -> (Int, UTF8Chunk) {
|
||||
/// Helper for `_encodeSomeUTF8`, above. Handles the case where the
|
||||
/// storage is contiguous UTF-16.
|
||||
func _encodeSomeContiguousUTF16AsUTF8(i: Int) -> (Int, UTF8Chunk) {
|
||||
_sanityCheck(elementWidth == 2)
|
||||
_sanityCheck(!_baseAddress._isNull)
|
||||
|
||||
if _fastPath(!_baseAddress._isNull) {
|
||||
let utf16Count = self.count
|
||||
let utf8Max = sizeof(UTF8Chunk.self)
|
||||
var result: UTF8Chunk = 0
|
||||
var utf8Count = 0
|
||||
var nextIndex = i
|
||||
while (nextIndex < utf16Count && utf8Count != utf8Max) {
|
||||
let u = UInt(startUTF16[nextIndex])
|
||||
let shift = UTF8Chunk(utf8Count * 8)
|
||||
var utf16Length = 1
|
||||
let storage = UnsafeArray(start: startUTF16, length: self.count)
|
||||
return _transcodeSomeUTF16AsUTF8(storage, i)
|
||||
}
|
||||
|
||||
if _fastPath(u <= 0x7f) {
|
||||
result |= UTF8Chunk(u) << shift
|
||||
++utf8Count
|
||||
}
|
||||
else {
|
||||
var scalarUtf8Length: Int
|
||||
var r: UInt
|
||||
if _fastPath((u >> 11) != 0b1101_1) {
|
||||
// Neither high-surrogate, nor low-surrogate -- sequence of 1 code
|
||||
// unit, decoding is trivial.
|
||||
if u < 0x800 {
|
||||
r = 0b10__00_0000__110__0_0000
|
||||
r |= u >> 6
|
||||
r |= (u & 0b11_1111) << 8
|
||||
scalarUtf8Length = 2
|
||||
}
|
||||
else {
|
||||
r = 0b10__00_0000__10__00_0000__1110__0000
|
||||
r |= u >> 12
|
||||
r |= ((u >> 6) & 0b11_1111) << 8
|
||||
r |= (u & 0b11_1111) << 16
|
||||
scalarUtf8Length = 3
|
||||
}
|
||||
}
|
||||
else {
|
||||
var unit0 = u
|
||||
if _slowPath((unit0 >> 10) == 0b1101_11) {
|
||||
// `unit0` is a low-surrogate. We have an ill-formed sequence.
|
||||
// Replace it with U+FFFD.
|
||||
r = 0xbdbfef
|
||||
scalarUtf8Length = 3
|
||||
} else if _slowPath(nextIndex + 1 == utf16Count) {
|
||||
// We have seen a high-surrogate and EOF, so we have an
|
||||
// ill-formed sequence. Replace it with U+FFFD.
|
||||
r = 0xbdbfef
|
||||
scalarUtf8Length = 3
|
||||
} else {
|
||||
let unit1 = UInt(startUTF16[nextIndex + 1])
|
||||
if _fastPath((unit1 >> 10) == 0b1101_11) {
|
||||
// `unit1` is a low-surrogate. We have a well-formed surrogate
|
||||
// pair.
|
||||
let v = 0x10000 + (((unit0 & 0x03ff) << 10) | (unit1 & 0x03ff))
|
||||
/// Helper for `_encodeSomeUTF8`, above. Handles the case where the
|
||||
/// storage is non-contiguous UTF-16.
|
||||
func _encodeSomeNonContiguousUTF16AsUTF8(i: Int) -> (Int, UTF8Chunk) {
|
||||
_sanityCheck(elementWidth == 2)
|
||||
_sanityCheck(_baseAddress._isNull)
|
||||
|
||||
r = 0b10__00_0000__10__00_0000__10__00_0000__1111_0__000
|
||||
r |= v >> 18
|
||||
r |= ((v >> 12) & 0b11_1111) << 8
|
||||
r |= ((v >> 6) & 0b11_1111) << 16
|
||||
r |= (v & 0b11_1111) << 24
|
||||
scalarUtf8Length = 4
|
||||
utf16Length = 2
|
||||
} else {
|
||||
// Otherwise, we have an ill-formed sequence. Replace it with
|
||||
// U+FFFD.
|
||||
r = 0xbdbfef
|
||||
scalarUtf8Length = 3
|
||||
}
|
||||
}
|
||||
}
|
||||
// Don't overrun the buffer
|
||||
if utf8Count + scalarUtf8Length > utf8Max {
|
||||
break
|
||||
}
|
||||
result |= numericCast(r) << shift
|
||||
utf8Count += scalarUtf8Length
|
||||
}
|
||||
nextIndex += utf16Length
|
||||
}
|
||||
// FIXME: Annoying check, courtesy of <rdar://problem/16740169>
|
||||
if utf8Count < sizeofValue(result) {
|
||||
result |= ~0 << numericCast(utf8Count * 8)
|
||||
}
|
||||
return (nextIndex, result)
|
||||
}
|
||||
else {
|
||||
return _cocoaStringEncodeSomeUTF8(target: self, position: i)
|
||||
let storage = _CollectionOf<Int, UInt16>(
|
||||
startIndex: 0, endIndex: self.count) {
|
||||
(i: Int) -> UInt16 in
|
||||
return _cocoaStringSubscript(target: self, position: i)
|
||||
}
|
||||
return _transcodeSomeUTF16AsUTF8(storage, i)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user