Files
swift-mirror/stdlib/core/UnicodeScalar.swift
Dmitri Hrybenko f370ca0746 stdlib: fix a bunch of various Unicode issues, primarily in UTF-8 decoding
In UTF-8 decoder:
- implement U+FFFD insertion according to the recommendation given in the
  Unicode spec.  This required changing the decoder to become stateful, which
  significantly increased complexity due to the need to maintain an internal
  buffer.
- reject invalid code unit sequences properly instead of crashing rdar://16767868
- reject overlong sequences rdar://16767911

In stdlib:
- change APIs that assume that UTF decoding can never fail to account for
  possibility of errors
- fix a bug in UnicodeScalarView that could cause a crash during backward
  iteration if U+8000 is present in the string
- allow noncharacters in UnicodeScalar.  They are explicitly allowed in the
  definition of "Unicode scalar" in the specification.  Disallowing noncharacters
  in UnicodeScalar prevents actually using these scalar values as internal
  special values during string processing, which is exactly the reason why they
  are reserved in the first place.
- fix a crash in String.fromCString() that could happen if it was passed a null
  pointer

In Lexer:
- allow noncharacters in string literals.  These Unicode scalar values are not
  allowed to be exchanged externally, but it is totally reasonable to have them
  in literals as long as they don't escape the program.  For example, using
  U+FFFF as a delimiter and then calling str.split("\uffff") is completely
  reasonable.

This is a lot of changes in a single commit; the primary reason why they are
lumped together is the need to change stdlib APIs to account for the
possibility of UTF decoding failure, and this has long-reaching effects
throughout stdlib where these APIs are used.


Swift SVN r19045
2014-06-20 13:07:40 +00:00

251 lines
6.8 KiB
Swift
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See http://swift.org/LICENSE.txt for license information
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
// UnicodeScalar Type
//===----------------------------------------------------------------------===//
/// A single Unicode scalar value, stored as a 32-bit builtin integer.
///
/// Per the Unicode definitions quoted in `init(_: UInt32)`, a scalar value is
/// any code point in 0...10FFFF except the surrogate range D800...DFFF.
struct UnicodeScalar : ExtendedGraphemeClusterLiteralConvertible {
  /// Raw storage for the scalar value.
  var _value: Builtin.Int32

  /// The numeric value of this scalar, exposed as a `UInt32`.
  var value: UInt32 {
    return UInt32(_value)
  }

  /// Builds a scalar from an extended grapheme cluster literal by taking the
  /// first Unicode scalar of `value`.
  static func convertFromExtendedGraphemeClusterLiteral(
    value: String
  ) -> UnicodeScalar {
    let scalars = value.unicodeScalars
    return scalars[scalars.startIndex]
  }

  /// Creates the scalar whose value is zero.
  init() {
    self._value = Int32(0).value
  }

  /// Creates a scalar directly from raw storage.  No range checks are
  /// performed on `value` here.
  init(_ value: Builtin.Int32) {
    self._value = value
  }

  /// Creates a scalar from `v`, requiring (via `_precondition`) that `v` is
  /// not a surrogate code point and lies inside the Unicode codespace.
  init(_ v: UInt32) {
    // Unicode 6.3.0:
    //
    // D9. Unicode codespace: A range of integers from 0 to 10FFFF.
    //
    // D76. Unicode scalar value: Any Unicode code point except
    // high-surrogate and low-surrogate code points.
    //
    // * As a result of this definition, the set of Unicode scalar values
    // consists of the ranges 0 to D7FF and E000 to 10FFFF, inclusive.
    let isSurrogate = v >= 0xD800 && v <= 0xDFFF
    _precondition(!isSurrogate,
      "high- and low-surrogate code points are not valid Unicode scalar values")
    _precondition(v <= 0x10FFFF, "value is outside of Unicode codespace")
    self._value = v.value
  }

  /// Identity conversion.
  init(_ v: UnicodeScalar) {
    // This constructor allows one to provide necessary type context to
    // disambiguate between function overloads on 'String' and 'UnicodeScalar'.
    self = v
  }

  /// Returns a string-literal-style escaped spelling of this scalar.
  ///
  /// Backslash and both quote characters are always backslash-escaped.
  /// Printable ASCII is returned unchanged.  NUL, LF, CR and TAB use their
  /// short escapes, and any other scalar below 128 becomes `\x` followed by
  /// two hex digits.  Scalars at or above 128 are returned unchanged unless
  /// `asASCII` is true, in which case they become `\u` plus four hex digits
  /// (values up to FFFF) or `\U` plus eight hex digits.
  func escape(#asASCII: Bool) -> String {
    // Maps the low four bits of `v` to an uppercase hexadecimal digit.
    func lowNibbleAsHex(v: UInt32) -> String {
      let digit = v & 15
      // 48 = '0'.  55 = 'A' - 10, so digits 10...15 land on 'A'...'F'.
      let base: UInt32 = digit < 10 ? 48 : 55
      return String(UnicodeScalar(digit + base))
    }

    // These are printable, but must still be escaped, so they are handled
    // before the isPrint() check.
    if self == "\\" { return "\\\\" }
    if self == "\'" { return "\\\'" }
    if self == "\"" { return "\\\"" }

    if isPrint() { return String(self) }

    if self == "\0" { return "\\0" }
    if self == "\n" { return "\\n" }
    if self == "\r" { return "\\r" }
    if self == "\t" { return "\\t" }

    let code = value
    if code < 128 {
      var result = "\\x"
      result += lowNibbleAsHex(code >> 4)
      result += lowNibbleAsHex(code)
      return result
    }

    if !asASCII { return String(self) }

    if code <= 0xFFFF {
      var result = "\\u"
      result += lowNibbleAsHex(code >> 12)
      result += lowNibbleAsHex(code >> 8)
      result += lowNibbleAsHex(code >> 4)
      result += lowNibbleAsHex(code)
      return result
    }

    // FIXME: Type checker performance prohibits this from being a
    // single chained "+".
    var result = "\\U"
    result += lowNibbleAsHex(code >> 28)
    result += lowNibbleAsHex(code >> 24)
    result += lowNibbleAsHex(code >> 20)
    result += lowNibbleAsHex(code >> 16)
    result += lowNibbleAsHex(code >> 12)
    result += lowNibbleAsHex(code >> 8)
    result += lowNibbleAsHex(code >> 4)
    result += lowNibbleAsHex(code)
    return result
  }

  /// \returns true if this is an ASCII character (code point 0 to 127
  /// inclusive).
  func isASCII() -> Bool {
    return value < 128
  }

  /// \returns true if this scalar is in "A"..."Z" or "a"..."z".
  // FIXME: Locales make this interesting
  func isAlpha() -> Bool {
    if self >= "A" && self <= "Z" { return true }
    return self >= "a" && self <= "z"
  }

  /// \returns true if this scalar is in "0"..."9".
  // FIXME: Locales make this interesting
  func isDigit() -> Bool {
    if self < "0" { return false }
    return self <= "9"
  }

  /// The scalar 32 code points below this one when this scalar is in
  /// "a"..."z", or in "à"..."þ" excluding "÷"; otherwise this scalar itself.
  // FIXME: Locales make this interesting
  var uppercase : UnicodeScalar {
    var hasUppercaseForm = self >= "a" && self <= "z"
    if !hasUppercaseForm {
      hasUppercaseForm = self >= "à" && self <= "þ" && self != "÷"
    }
    return hasUppercaseForm ? UnicodeScalar(value - 32) : self
  }

  /// The scalar 32 code points above this one when this scalar is in
  /// "A"..."Z", or in "À"..."Þ" excluding "×"; otherwise this scalar itself.
  // FIXME: Locales make this interesting
  var lowercase : UnicodeScalar {
    var hasLowercaseForm = self >= "A" && self <= "Z"
    if !hasLowercaseForm {
      hasLowercaseForm = self >= "À" && self <= "Þ" && self != "×"
    }
    return hasLowercaseForm ? UnicodeScalar(value + 32) : self
  }

  /// \returns true if this scalar is space, tab, line feed, carriage return,
  /// or one of the scalars 0B and 0C.
  // FIXME: Locales make this interesting.
  func isSpace() -> Bool {
    // FIXME: The constraint-based type checker goes painfully exponential
    // when we turn this into one large expression. Break it up for now,
    // until we can optimize the constraint solver better.
    if self == " " { return true }
    if self == "\t" { return true }
    if self == "\n" { return true }
    if self == "\r" { return true }
    if self == "\x0B" { return true }
    return self == "\x0C"
  }
}
extension UnicodeScalar : Printable, DebugPrintable {
  /// The scalar escaped with `escape(asASCII: false)` and wrapped in double
  /// quotes.
  var description: String {
    let escaped = escape(asASCII: false)
    return "\"" + escaped + "\""
  }

  /// The scalar escaped with `escape(asASCII: true)` and wrapped in double
  /// quotes.
  var debugDescription: String {
    let escaped = escape(asASCII: true)
    return "\"" + escaped + "\""
  }
}
extension UnicodeScalar : Hashable {
  /// The scalar's numeric value converted to `Int`.
  var hashValue: Int {
    let codePoint = value
    return Int(codePoint)
  }
}
extension UnicodeScalar {
  /// Creates a scalar from an `Int` by first converting `v` to `UInt32` and
  /// then constructing the scalar from that `UInt32`.
  init(_ v: Int) {
    let codePoint = UInt32(v)
    self = UnicodeScalar(codePoint)
  }
}
extension UInt8 {
  /// Creates a `UInt8` holding the numeric value of the scalar `v`.
  ///
  /// Requires (via `_precondition`) that the value does not exceed
  /// `UInt8.max`.
  init(_ v: UnicodeScalar) {
    let codePoint = v.value
    _precondition(codePoint <= UInt32(UInt8.max),
      "Code point value does not fit into UInt8")
    self = UInt8(codePoint)
  }
}
extension UInt32 {
  /// Creates a `UInt32` equal to the numeric value of the scalar `v`.
  init(_ v: UnicodeScalar) {
    let codePoint: UInt32 = v.value
    self = codePoint
  }
}
extension UInt64 {
  /// Creates a `UInt64` from the numeric value of the scalar `v`.
  init(_ v: UnicodeScalar) {
    let codePoint = v.value
    self = UInt64(codePoint)
  }
}
/// Returns the signed distance between two scalars: the value of `lhs` minus
/// the value of `rhs`, computed in `Int`.
func - (lhs: UnicodeScalar, rhs: UnicodeScalar) -> Int {
  let minuend = Int(lhs.value)
  let subtrahend = Int(rhs.value)
  return minuend - subtrahend
}
/// Returns the scalar whose value is the value of `lhs` minus `rhs`.
///
/// The difference is computed in `Int` and then converted back to a
/// `UnicodeScalar`.
func - (lhs: UnicodeScalar, rhs: Int) -> UnicodeScalar {
  let shifted = Int(lhs.value) - rhs
  return UnicodeScalar(shifted)
}
/// Returns the scalar whose value is the value of `lhs` plus `rhs`.
///
/// The sum is computed in `Int` and then converted back to a
/// `UnicodeScalar`.
func + (lhs: UnicodeScalar, rhs: Int) -> UnicodeScalar {
  let shifted = Int(lhs.value) + rhs
  return UnicodeScalar(shifted)
}
/// Returns the scalar whose value is the value of `rhs` plus `lhs`; the
/// mirror image of `UnicodeScalar + Int`.
func + (lhs: Int, rhs: UnicodeScalar) -> UnicodeScalar {
  let shifted = Int(rhs.value) + lhs
  return UnicodeScalar(shifted)
}
/// Two scalars are equal exactly when their numeric values are equal.
func ==(lhs: UnicodeScalar, rhs: UnicodeScalar) -> Bool {
  let left = lhs.value
  let right = rhs.value
  return left == right
}
// Declares Comparable conformance.  The extension body is empty; the `==`
// and `<` operators on UnicodeScalar are free functions declared in this
// file.
extension UnicodeScalar : Comparable {
}
/// Orders scalars by their numeric values.
func <(lhs: UnicodeScalar, rhs: UnicodeScalar) -> Bool {
  let left = lhs.value
  let right = rhs.value
  return left < right
}
extension UnicodeScalar {
  /// \returns true if this scalar lies in the inclusive range 0x20 (space)
  /// through 0x7E ("~").
  func isPrint() -> Bool {
    let lowest = UnicodeScalar(0x20)   // 0o040
    let highest = UnicodeScalar(0x7E)  // 0o176
    return self >= lowest && self <= highest
  }
}
/// Helpers to provide type context to guide type inference in code like::
///
///   var zero = _asUnicodeCodePoint("0")
///
/// This overload returns the scalar's raw `Builtin.Int32` storage.
func _asUnicodeCodePoint(us: UnicodeScalar) -> Builtin.Int32 {
  let raw: Builtin.Int32 = us._value
  return raw
}
/// Returns the numeric value of `us` as a `UInt32`.  Passing a literal as the
/// argument gives it `UnicodeScalar` type context.
func _asUnicodeCodePoint(us: UnicodeScalar) -> UInt32 {
  let codePoint: UInt32 = us.value
  return codePoint
}
/// Converts `us` to a single UTF-16 code unit.
///
/// Requires (via `_precondition`) that the scalar's value is no greater than
/// `UInt16.max`; larger values cannot be represented in one
/// `UTF16.CodeUnit`.
func _asUTF16CodeUnit(us: UnicodeScalar) -> UTF16.CodeUnit {
  // The value is never mutated, so bind it with `let`.
  let codePoint = us.value
  // Carry a diagnostic message, like the other preconditions in this file.
  _precondition(codePoint <= UInt32(UInt16.max),
    "Unicode scalar value does not fit into a single UTF-16 code unit")
  return UTF16.CodeUnit(codePoint)
}