mirror of
https://github.com/apple/swift.git
synced 2025-12-21 12:14:44 +01:00
In UTF-8 decoder:
- implement U+FFFD insertion according to the recommendation given in the
Unicode spec. This required changing the decoder to become stateful, which
significantly increased complexity due to the need to maintain an internal
buffer.
- reject invalid code unit sequences properly instead of crashing rdar://16767868
- reject overlong sequences rdar://16767911
In stdlib:
- change APIs that assume that UTF decoding can never fail to account for
the possibility of errors
- fix a bug in UnicodeScalarView that could cause a crash during backward
iteration if U+8000 is present in the string
- allow noncharacters in UnicodeScalar. They are explicitly allowed in the
definition of "Unicode scalar" in the specification. Disallowing noncharacters
in UnicodeScalar prevents actually using these scalar values as internal
special values during string processing, which is exactly the reason why they
are reserved in the first place.
- fix a crash in String.fromCString() that could happen if it was passed a null
pointer
In Lexer:
- allow noncharacters in string literals. These Unicode scalar values are not
allowed to be exchanged externally, but it is totally reasonable to have them
in literals as long as they don't escape the program. For example, using
U+FFFF as a delimiter and then calling str.split("\uffff") is completely
reasonable.
This is a lot of changes in a single commit; the primary reason why they are
lumped together is the need to change stdlib APIs to account for the
possibility of UTF decoding failure, and this has far-reaching effects
throughout stdlib where these APIs are used.
Swift SVN r19045
251 lines
6.8 KiB
Swift
251 lines
6.8 KiB
Swift
//===----------------------------------------------------------------------===//
|
||
//
|
||
// This source file is part of the Swift.org open source project
|
||
//
|
||
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
|
||
// Licensed under Apache License v2.0 with Runtime Library Exception
|
||
//
|
||
// See http://swift.org/LICENSE.txt for license information
|
||
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
||
//
|
||
//===----------------------------------------------------------------------===//
|
||
// UnicodeScalar Type
|
||
//===----------------------------------------------------------------------===//
|
||
|
||
/// A single Unicode scalar value, kept in a builtin 32-bit integer.
struct UnicodeScalar : ExtendedGraphemeClusterLiteralConvertible {

  /// Underlying storage for the scalar value.
  var _value: Builtin.Int32

  /// The scalar value converted to `UInt32`.
  var value: UInt32 {
    return UInt32(_value)
  }

  /// Literal conversion hook: yields the scalar located at the start index of
  /// `value.unicodeScalars`.
  static func convertFromExtendedGraphemeClusterLiteral(
    value: String) -> UnicodeScalar {
    let scalars = value.unicodeScalars
    let first = scalars.startIndex
    return scalars[first]
  }

  /// Creates the scalar whose value is zero.
  init() {
    let zero = Int32(0)
    self._value = zero.value
  }

  /// Stores the given builtin value directly; no range checks are performed
  /// here.
  init(_ value : Builtin.Int32) {
    self._value = value
  }

  /// Creates a scalar from `v`, requiring that `v` is neither a surrogate
  /// code point nor greater than 0x10FFFF.
  init(_ v : UInt32) {
    // From Unicode 6.3.0:
    //
    //   D9.  Unicode codespace: A range of integers from 0 to 10FFFF.
    //
    //   D76. Unicode scalar value: Any Unicode code point except
    //        high-surrogate and low-surrogate code points.
    //
    // Consequently the set of Unicode scalar values consists of the ranges
    // 0 to D7FF and E000 to 10FFFF, inclusive.
    let isSurrogate = v >= 0xD800 && v <= 0xDFFF
    _precondition(!isSurrogate,
      "high- and low-surrogate code points are not valid Unicode scalar values")
    _precondition(v <= 0x10FFFF, "value is outside of Unicode codespace")

    self._value = v.value
  }

  /// Identity initializer.  Its purpose is to supply type context so that
  /// overloads taking 'String' and 'UnicodeScalar' can be told apart.
  init(_ v: UnicodeScalar) {
    self = v
  }

  /// Returns the textual escape sequence for this scalar.
  ///
  /// Backslash and both quote characters are backslash-escaped, printable
  /// scalars are returned as-is, and NUL, newline, carriage return and tab
  /// use their short escapes.  Any other scalar below 128 becomes "\xHH".
  /// Scalars from 128 upwards are returned unchanged unless `asASCII` is
  /// true, in which case they become "\uHHHH" (up to 0xFFFF) or
  /// "\UHHHHHHHH".
  func escape(#asASCII: Bool) -> String {
    // Renders the lowest four bits of `bits` as one uppercase hex digit.
    func hexDigit(bits: UInt32) -> String {
      let digit = bits & 15
      if digit >= 10 {
        return String(UnicodeScalar(digit-10+65)) // 65 = 'A'
      }
      return String(UnicodeScalar(digit+48)) // 48 = '0'
    }

    if self == "\\" { return "\\\\" }
    if self == "\'" { return "\\\'" }
    if self == "\"" { return "\\\"" }
    if isPrint() { return String(self) }
    if self == "\0" { return "\\0" }
    if self == "\n" { return "\\n" }
    if self == "\r" { return "\\r" }
    if self == "\t" { return "\\t" }

    let code = UInt32(self)

    if code < 128 {
      var text = "\\x"
      text += hexDigit(code >> 4)
      text += hexDigit(code)
      return text
    }

    if !asASCII {
      return String(self)
    }

    // FIXME: Type checker performance prohibits long chained "+"
    // expressions, so the digits are appended one statement at a time.
    if code <= 0xFFFF {
      var text = "\\u"
      text += hexDigit(code >> 12)
      text += hexDigit(code >> 8)
      text += hexDigit(code >> 4)
      text += hexDigit(code)
      return text
    }

    var text = "\\U"
    text += hexDigit(code >> 28)
    text += hexDigit(code >> 24)
    text += hexDigit(code >> 20)
    text += hexDigit(code >> 16)
    text += hexDigit(code >> 12)
    text += hexDigit(code >> 8)
    text += hexDigit(code >> 4)
    text += hexDigit(code)
    return text
  }

  /// \returns true if this is an ASCII character (code point 0 to 127
  /// inclusive).
  func isASCII() -> Bool {
    return value < 128
  }

  // FIXME: Locales make this interesting
  /// \returns true for the scalars "A"..."Z" and "a"..."z".
  func isAlpha() -> Bool {
    if self >= "A" && self <= "Z" {
      return true
    }
    return self >= "a" && self <= "z"
  }

  // FIXME: Locales make this interesting
  /// \returns true for the scalars "0"..."9".
  func isDigit() -> Bool {
    if self < "0" {
      return false
    }
    return self <= "9"
  }

  // FIXME: Locales make this interesting
  /// The scalar 32 code points lower when this one lies in "a"..."z" or in
  /// "à"..."þ" (excluding "÷"); otherwise the scalar itself.
  var uppercase : UnicodeScalar {
    var hasUpperForm = self >= "a" && self <= "z"
    if !hasUpperForm {
      hasUpperForm = self >= "à" && self <= "þ" && self != "÷"
    }
    if !hasUpperForm {
      return self
    }
    let shifted = UInt32(self) - 32
    return UnicodeScalar(shifted)
  }

  // FIXME: Locales make this interesting
  /// The scalar 32 code points higher when this one lies in "A"..."Z" or in
  /// "À"..."Þ" (excluding "×"); otherwise the scalar itself.
  var lowercase : UnicodeScalar {
    var hasLowerForm = self >= "A" && self <= "Z"
    if !hasLowerForm {
      hasLowerForm = self >= "À" && self <= "Þ" && self != "×"
    }
    if !hasLowerForm {
      return self
    }
    let shifted = UInt32(self) + 32
    return UnicodeScalar(shifted)
  }

  // FIXME: Locales make this interesting.
  /// \returns true for space, tab, newline, carriage return, "\x0B" and
  /// "\x0C".
  func isSpace() -> Bool {
    // FIXME: The constraint-based type checker goes painfully exponential
    // when this is written as one large expression, so each comparison is
    // kept in its own statement until the constraint solver is optimized.
    if self == " " { return true }
    if self == "\t" { return true }
    if self == "\n" { return true }
    if self == "\r" { return true }
    if self == "\x0B" { return true }
    return self == "\x0C"
  }
}
|
||
|
||
extension UnicodeScalar : Printable, DebugPrintable {
  /// The escaped scalar (non-ASCII scalars are left as-is) wrapped in double
  /// quotes.
  var description: String {
    let escaped = escape(asASCII: false)
    return "\"\(escaped)\""
  }

  /// The escaped scalar (ASCII-only escaping requested) wrapped in double
  /// quotes.
  var debugDescription: String {
    let escaped = escape(asASCII: true)
    return "\"\(escaped)\""
  }
}
|
||
|
||
extension UnicodeScalar : Hashable {
  /// The hash is simply the scalar's numeric value converted to `Int`.
  var hashValue: Int {
    let codePoint = self.value
    return Int(codePoint)
  }
}
|
||
|
||
extension UnicodeScalar {
  /// Creates a scalar from an `Int` by converting it to `UInt32` and
  /// forwarding to the `UInt32` initializer.
  init(_ v : Int) {
    let codePoint = UInt32(v)
    self = UnicodeScalar(codePoint)
  }
}
|
||
|
||
extension UInt8 {
  /// Creates a `UInt8` holding the value of `v`.  The value of `v` is
  /// required not to exceed `UInt8.max`.
  init(_ v : UnicodeScalar) {
    let codePoint = v.value
    _precondition(codePoint <= UInt32(UInt8.max),
      "Code point value does not fit into UInt8")
    self = UInt8(codePoint)
  }
}
|
||
extension UInt32 {
  /// Creates a `UInt32` equal to the numeric value of `v`.
  init(_ v : UnicodeScalar) {
    let codePoint: UInt32 = v.value
    self = codePoint
  }
}
|
||
extension UInt64 {
  /// Creates a `UInt64` from the numeric value of `v`.
  init(_ v : UnicodeScalar) {
    let codePoint = v.value
    self = UInt64(codePoint)
  }
}
|
||
|
||
/// Returns the signed distance between the numeric values of two scalars.
func - (lhs: UnicodeScalar, rhs: UnicodeScalar) -> Int {
  let minuend = Int(lhs.value)
  let subtrahend = Int(rhs.value)
  return minuend - subtrahend
}
|
||
|
||
/// Returns the scalar whose numeric value is `rhs` less than that of `lhs`.
func - (lhs: UnicodeScalar, rhs: Int) -> UnicodeScalar {
  let shifted = Int(lhs.value) - rhs
  return UnicodeScalar(shifted)
}
|
||
|
||
/// Returns the scalar whose numeric value is `rhs` more than that of `lhs`.
func + (lhs: UnicodeScalar, rhs: Int) -> UnicodeScalar {
  let shifted = Int(lhs.value) + rhs
  return UnicodeScalar(shifted)
}
|
||
|
||
/// Offset-first form of scalar addition: returns the scalar whose numeric
/// value is that of `rhs` plus `lhs`.
func + (lhs: Int, rhs: UnicodeScalar) -> UnicodeScalar {
  let shifted = Int(rhs.value) + lhs
  return UnicodeScalar(shifted)
}
|
||
|
||
/// Two scalars are equal exactly when their numeric values are equal.
func ==(lhs: UnicodeScalar, rhs: UnicodeScalar) -> Bool {
  let left = lhs.value
  let right = rhs.value
  return left == right
}
|
||
|
||
// Declares the Comparable conformance.  The extension body is empty; the
// comparison operators for UnicodeScalar are free functions in this file.
extension UnicodeScalar : Comparable {
}
|
||
|
||
/// Orders scalars by their numeric values.
func <(lhs: UnicodeScalar, rhs: UnicodeScalar) -> Bool {
  let left = lhs.value
  let right = rhs.value
  return left < right
}
|
||
|
||
extension UnicodeScalar {
  /// \returns true if the scalar lies between octal 040 and octal 176,
  /// inclusive.
  func isPrint() -> Bool {
    let lowest = UnicodeScalar(0o040)
    let highest = UnicodeScalar(0o176)
    if self < lowest {
      return false
    }
    return self <= highest
  }
}
|
||
|
||
/// Helpers that supply type context to steer type inference in code like::
///
///   var zero = _asUnicodeCodePoint("0")
///
/// This overload hands back the scalar's builtin 32-bit storage.
func _asUnicodeCodePoint(us: UnicodeScalar) -> Builtin.Int32 {
  let raw: Builtin.Int32 = us._value
  return raw
}
|
||
/// Returns the numeric value of `us` as a `UInt32`.
func _asUnicodeCodePoint(us: UnicodeScalar) -> UInt32 {
  let codePoint: UInt32 = us.value
  return codePoint
}
|
||
/// Converts `us` to a single UTF-16 code unit.  The scalar's value is
/// required not to exceed `UInt16.max`.
func _asUTF16CodeUnit(us: UnicodeScalar) -> UTF16.CodeUnit {
  let scalarValue = us.value
  _precondition(scalarValue <= UInt32(UInt16.max))
  return UTF16.CodeUnit(scalarValue)
}
|
||
|