stdlib: fix a bunch of various Unicode issues, primarily in UTF-8 decoding

In UTF-8 decoder:
- implement U+FFFD insertion according to the recommendation given in the
  Unicode spec.  This required changing the decoder to become stateful, which
  significantly increased complexity due to the need to maintain an internal
  buffer.
- reject invalid code unit sequences properly instead of crashing
  (rdar://16767868)
- reject overlong sequences (rdar://16767911)

In stdlib:
- change APIs that assume that UTF decoding can never fail to account for
  the possibility of errors
- fix a bug in UnicodeScalarView that could cause a crash during backward
  iteration if U+8000 is present in the string
- allow noncharacters in UnicodeScalar.  They are explicitly allowed in the
  definition of "Unicode scalar" in the specification.  Disallowing noncharacters
  in UnicodeScalar prevents actually using these scalar values as internal
  special values during string processing, which is exactly the reason why they
  are reserved in the first place.
- fix a crash in String.fromCString() that could happen if it was passed a null
  pointer

In Lexer:
- allow noncharacters in string literals.  These Unicode scalar values are not
  allowed to be exchanged externally, but it is totally reasonable to have them
  in literals as long as they don't escape the program.  For example, using
  U+FFFF as a delimiter and then calling str.split("\uffff") is completely
  reasonable.

This is a lot of changes in a single commit; the primary reason why they are
lumped together is the need to change stdlib APIs to account for the
possibility of UTF decoding failure, and this has long-reaching effects
throughout stdlib where these APIs are used.


Swift SVN r19045
This commit is contained in:
Dmitri Hrybenko
2014-06-20 13:07:40 +00:00
parent 48d6a833a5
commit f370ca0746
19 changed files with 2283 additions and 232 deletions

View File

@@ -1023,7 +1023,7 @@ NSStringAPIs.run()
var CStringTests = TestCase("CStringTests")
// Returns a CString constructed from a null pointer, for use by the tests that
// exercise the null-string code paths.
// NOTE(review): this text is a rendered diff whose +/- markers were stripped;
// the two `return` lines below are presumably the removed and the added version
// of the same statement (the second spells out the pointee type) -- confirm
// against the real file.
func getNullCString() -> CString {
return CString(UnsafePointer.null())
return CString(UnsafePointer<CChar>.null())
}
func getASCIICString() -> (CString, dealloc: ()->()) {
@@ -1044,6 +1044,28 @@ func getNonASCIICString() -> (CString, dealloc: ()->()) {
return (CString(up), { up.dealloc(100) })
}
// Builds a NUL-terminated C string with the bytes 41 ED A0 80 41.
// The middle three bytes follow the UTF-8 three-byte pattern for U+D800, a
// surrogate code point, so the sequence is not well-formed UTF-8.
//
// Returns the CString together with a `dealloc` closure that releases the
// 100-element buffer backing it; the tests call the closure once they are done
// with the string.
func getIllFormedUTF8String1() -> (CString, dealloc: ()->()) {
var up = UnsafePointer<UInt8>.alloc(100)
up[0] = 0x41 // 'A'
up[1] = 0xed // ED A0 80: would decode to the surrogate U+D800
up[2] = 0xa0
up[3] = 0x80
up[4] = 0x41 // 'A'
up[5] = 0 // NUL terminator
return (CString(up), { up.dealloc(100) })
}
// Builds a NUL-terminated C string with the bytes 41 ED A0 81 41.
// Identical to getIllFormedUTF8String1() except for the last byte of the
// ill-formed sequence (0x81 instead of 0x80, i.e. the pattern for the
// surrogate U+D801), so the two strings differ only inside the ill-formed part.
//
// Returns the CString together with a `dealloc` closure that releases the
// 100-element buffer backing it.
func getIllFormedUTF8String2() -> (CString, dealloc: ()->()) {
var up = UnsafePointer<UInt8>.alloc(100)
up[0] = 0x41 // 'A'
up[1] = 0xed // ED A0 81: would decode to the surrogate U+D801
up[2] = 0xa0
up[3] = 0x81
up[4] = 0x41 // 'A'
up[5] = 0 // NUL terminator
return (CString(up), { up.dealloc(100) })
}
// Converts an array of UInt8 into an array of CChar by calling asSigned() on
// every element.  Lets the tests write expected C string contents as unsigned
// byte literals (e.g. 0xd0) and compare them against CChar arrays.
func asCCharArray(a: UInt8[]) -> CChar[] {
return a.map { $0.asSigned() }
}
@@ -1060,6 +1082,15 @@ CStringTests.test("init(_:)") {
var (s, dealloc) = getNonASCIICString()
dealloc()
}
if true {
var (s, dealloc) = getIllFormedUTF8String1()
dealloc()
}
}
// Constructs a CString from a default-initialized pointer to UInt8 and to
// Int8.  The results are discarded and nothing is asserted; presumably the
// point is only that both initializers exist and can be called -- confirm.
CStringTests.test("initFromSignedUnsigned") {
CString(UnsafePointer<UInt8>())
CString(UnsafePointer<Int8>())
}
CStringTests.test("convertFromLiterals") {
@@ -1085,6 +1116,11 @@ CStringTests.test("getLogicValue()") {
expectTrue(s.getLogicValue())
dealloc()
}
if true {
var (s, dealloc) = getIllFormedUTF8String1()
expectTrue(s.getLogicValue())
dealloc()
}
}
CStringTests.test("persist()") {
@@ -1102,32 +1138,163 @@ CStringTests.test("persist()") {
expectEqual(asCCharArray([ 0xd0, 0xb0, 0xd0, 0xb1, 0 ]), s.persist()!)
dealloc()
}
if true {
var (s, dealloc) = getIllFormedUTF8String1()
expectEqual(asCCharArray([ 0x41, 0xed, 0xa0, 0x80, 0x41, 0 ]), s.persist()!)
dealloc()
}
}
// Checks CString.debugDescription for a null, an ASCII, a non-ASCII and an
// ill-formed UTF-8 string.
// NOTE(review): this text is a rendered diff whose +/- markers were stripped;
// each adjacent `var ...` / `let ...` pair below is presumably the removed and
// the added version of the same declaration -- confirm against the real file.
CStringTests.test("debugDescription") {
if true {
var s = getNullCString()
let s = getNullCString()
expectEqual("<null C string>", s.debugDescription)
}
if true {
var (s, dealloc) = getASCIICString()
let (s, dealloc) = getASCIICString()
expectEqual("\"ab\"", s.debugDescription)
dealloc()
}
if true {
var (s, dealloc) = getNonASCIICString()
let (s, dealloc) = getNonASCIICString()
expectEqual("\"аб\"", s.debugDescription)
dealloc()
}
if true {
// Ill-formed input: the description is expected to carry an
// "<ill-formed UTF-8>" prefix and to show U+FFFD in place of each of the
// three bytes of the ill-formed sequence.
let (s, dealloc) = getIllFormedUTF8String1()
expectEqual("<ill-formed UTF-8>\"\u0041\ufffd\ufffd\ufffd\u0041\"",
s.debugDescription)
dealloc()
}
}
// Checks CString.hashValue: 0 for a null string, and otherwise the same value
// as the hashValue of the corresponding String literal.
CStringTests.test("hashValue") {
if true {
let s = getNullCString()
expectEqual(0, s.hashValue)
}
if true {
let (s, dealloc) = getASCIICString()
expectEqual("ab".hashValue, s.hashValue)
dealloc()
}
if true {
let (s, dealloc) = getNonASCIICString()
expectEqual("аб".hashValue, s.hashValue)
dealloc()
}
if true {
// For ill-formed input the expected hash is that of the String in which
// each byte of the ill-formed sequence has been replaced with U+FFFD.
let (s, dealloc) = getIllFormedUTF8String1()
expectEqual("\u0041\ufffd\ufffd\ufffd\u0041".hashValue,
s.hashValue)
dealloc()
}
}
// Checks `==` on CString, including on strings that contain ill-formed UTF-8.
// NOTE(review): this text is a rendered diff whose +/- markers were stripped;
// the six statements directly below (before the first `if true`) are
// presumably the removed, un-scoped version of the first `if true` block --
// confirm against the real file.
CStringTests.test("OperatorEquals") {
var (s1, dealloc1) = getASCIICString()
var (s2, dealloc2) = getNonASCIICString()
expectTrue(s1 == s1)
expectFalse(s1 == s2)
dealloc1()
dealloc2()
if true {
let (s1, dealloc1) = getASCIICString()
let (s2, dealloc2) = getNonASCIICString()
expectTrue(s1 == s1)
expectFalse(s1 == s2)
dealloc1()
dealloc2()
}
if true {
// Two separately allocated strings with identical ill-formed contents
// compare equal.
let (s1, dealloc1) = getIllFormedUTF8String1()
let (s2, dealloc2) = getIllFormedUTF8String1()
expectTrue(s1 == s2)
dealloc1()
dealloc2()
}
if true {
let (s1, dealloc1) = getIllFormedUTF8String1()
let (s2, dealloc2) = getIllFormedUTF8String2()
// This would return true if we were decoding UTF-8 and replacing ill-formed
// sequences with U+FFFD.
expectFalse(s1 == s2)
dealloc1()
dealloc2()
}
}
// Checks `<` on CString, including on strings that contain ill-formed UTF-8.
CStringTests.test("OperatorLess") {
if true {
let (s1, dealloc1) = getASCIICString()
let (s2, dealloc2) = getNonASCIICString()
expectFalse(s1 < s1)
expectTrue(s1 < s2)
dealloc1()
dealloc2()
}
if true {
// Two strings with identical ill-formed contents: neither is less than the
// other.
let (s1, dealloc1) = getIllFormedUTF8String1()
let (s2, dealloc2) = getIllFormedUTF8String1()
expectFalse(s1 < s2)
dealloc1()
dealloc2()
}
if true {
let (s1, dealloc1) = getIllFormedUTF8String1()
let (s2, dealloc2) = getIllFormedUTF8String2()
// This would return false if we were decoding UTF-8 and replacing ill-formed
// sequences with U+FFFD.
expectTrue(s1 < s2)
dealloc1()
dealloc2()
}
}
// Checks String.fromCString(): it yields the decoded string for well-formed
// input, and an empty optional both for a null C string and for input that
// contains ill-formed UTF-8.
CStringTests.test("String.fromCString") {
if true {
let s = getNullCString()
expectEmpty(String.fromCString(s))
}
if true {
let (s, dealloc) = getASCIICString()
expectOptionalEqual("ab", String.fromCString(s))
dealloc()
}
if true {
let (s, dealloc) = getNonASCIICString()
expectOptionalEqual("аб", String.fromCString(s))
dealloc()
}
if true {
// Ill-formed UTF-8 is rejected rather than repaired by this API.
let (s, dealloc) = getIllFormedUTF8String1()
expectEmpty(String.fromCString(s))
dealloc()
}
}
// Checks String.fromCStringRepairingIllFormedUTF8(), which returns a
// (result, hadError) pair.  Expected behavior exercised here:
// - null C string: empty result, hadError == false;
// - well-formed input: the decoded string, hadError == false;
// - ill-formed input: the string with U+FFFD substituted, hadError == true.
CStringTests.test("String.fromCStringRepairingIllFormedUTF8") {
if true {
let s = getNullCString()
let (result, hadError) = String.fromCStringRepairingIllFormedUTF8(s)
expectEmpty(result)
expectFalse(hadError)
}
if true {
let (s, dealloc) = getASCIICString()
let (result, hadError) = String.fromCStringRepairingIllFormedUTF8(s)
expectOptionalEqual("ab", result)
expectFalse(hadError)
dealloc()
}
if true {
let (s, dealloc) = getNonASCIICString()
let (result, hadError) = String.fromCStringRepairingIllFormedUTF8(s)
expectOptionalEqual("аб", result)
expectFalse(hadError)
dealloc()
}
if true {
// Each of the three bytes of the ill-formed sequence is expected to be
// replaced with its own U+FFFD.
let (s, dealloc) = getIllFormedUTF8String1()
let (result, hadError) = String.fromCStringRepairingIllFormedUTF8(s)
expectOptionalEqual("\u0041\ufffd\ufffd\ufffd\u0041", result)
expectTrue(hadError)
dealloc()
}
}
// Run the CStringTests test case.
CStringTests.run()