Files
swift-mirror/stdlib/core/Character.swift
Maxwell Swadling 3fd1eb23c4 [stdlib] Fixed typo in Character comment
Swift SVN r23847
2014-12-10 23:24:43 +00:00

334 lines
11 KiB
Swift

//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See http://swift.org/LICENSE.txt for license information
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
/// A sink that packs the UTF-8 code units it receives into one `UInt64`,
/// storing the first code unit in the least significant byte.
internal struct _SmallUTF8Sink : SinkType {
  /// The code units received so far, lowest byte first.
  var asInt: UInt64 = 0

  /// The bit offset at which the next code unit will be stored.
  var shift: UInt64 = 0

  /// Store `x` at the current bit offset, then advance the offset by one
  /// byte.
  mutating func put(x: UTF8.CodeUnit) {
    let positioned = UInt64(x) << shift
    asInt = asInt | positioned
    shift = shift + 8
  }
}
/// `Character` represents some Unicode grapheme cluster as
/// defined by a canonical, localized, or otherwise tailored
/// segmentation algorithm.
public struct Character :
  _BuiltinExtendedGraphemeClusterLiteralConvertible,
  ExtendedGraphemeClusterLiteralConvertible, Equatable, Hashable, Comparable {

  // Fundamentally, it is just a String, but it is optimized for the
  // common case where the UTF-8 representation fits in 63 bits.  The
  // remaining bit is used to discriminate between small and large
  // representations.  In the small representation, the unused bytes
  // are filled with 0xFF.
  //
  // If the grapheme cluster can be represented as Small, it
  // should be represented as such.
  internal enum Representation {
    case Large(_StringBuffer._Storage)
    case Small(Builtin.Int63)
  }

  /// Construct a `Character` containing just the given `scalar`.
  public init(_ scalar: UnicodeScalar) {
    var output = _SmallUTF8Sink()
    UTF8.encode(scalar, output: &output)
    // Fill every byte above the encoded code units with 0xFF padding.
    output.asInt |= (~0) << output.shift
    _representation = .Small(Builtin.trunc_Int64_Int63(output.asInt.value))
  }

  /// Create an instance from the builtin Unicode scalar literal `value`.
  ///
  /// The scalar is treated as a single UTF-32 code unit, turned into a
  /// `String`, and that string is then converted to a `Character`.
  @effects(readonly)
  public init(_builtinUnicodeScalarLiteral value: Builtin.Int32) {
    self = Character(
      String._fromWellFormedCodeUnitSequence(
        UTF32.self, input: CollectionOfOne(UInt32(value))))
  }

  /// Create an instance initialized to `value`.
  public init(unicodeScalarLiteral value: Character) {
    self = value
  }

  /// Create an instance from a builtin extended grapheme cluster literal.
  ///
  /// The arguments are forwarded unchanged to the corresponding `String`
  /// initializer, and the resulting string is converted to a `Character`.
  @effects(readonly)
  public init(
    _builtinExtendedGraphemeClusterLiteral start: Builtin.RawPointer,
    byteSize: Builtin.Word,
    isASCII: Builtin.Int1) {
    self = Character(
      String(
        _builtinExtendedGraphemeClusterLiteral: start,
        byteSize: byteSize,
        isASCII: isASCII))
  }

  /// Create an instance initialized to `value`.
  public init(extendedGraphemeClusterLiteral value: Character) {
    self = value
  }

  /// Create an instance from a single-character `String`.
  ///
  /// Requires: `s` contains exactly one extended grapheme cluster.
  public init(_ s: String) {
    // The small representation can accept up to 8 code units as long
    // as the last one is a continuation.  Since the high bit of the
    // last byte is used for the enum's discriminator, we have to
    // reconstruct it.  As a result, we can't store 0x7f in the final
    // byte, because we wouldn't be able to distinguish it from an
    // unused 0xFF byte.  Rather than trying to squeeze in other
    // one-byte code points there, we simplify decoding by banning
    // starting a code point in the last byte, and assuming that its
    // high bit is 1.
    _precondition(
      s._core.count != 0, "Can't form a Character from an empty String")
    _precondition(
      s.startIndex.successor() == s.endIndex,
      "Can't form a Character from a String containing more than one extended grapheme cluster")

    let (count, initialUTF8) = s._core._encodeSomeUTF8(0)
    // Notice that the result of sizeofValue() is a small non-zero number and
    // can't overflow when multiplied by 8.
    let bits = sizeofValue(initialUTF8) &* 8 &- 1
    if _fastPath(
      count == s._core.count && (initialUTF8 & (1 << numericCast(bits))) != 0) {
      // The whole string was encoded and the top bit is set, so the value
      // survives truncation to 63 bits (the top bit is restored on read).
      _representation = .Small(Builtin.trunc_Int64_Int63(initialUTF8.value))
    }
    else {
      // Reuse the string's native storage when the string's base address is
      // the start of that buffer.
      if let native = s._core.nativeBuffer {
        if native.start == UnsafeMutablePointer(s._core._baseAddress) {
          _representation = .Large(native._storage)
          return
        }
      }
      // Otherwise copy the contents into a fresh native string.
      var nativeString = ""
      nativeString.extend(s)
      _representation = .Large(nativeString._core.nativeBuffer!._storage)
    }
  }

  /// Return the index of the lowest byte that is 0xFF, or 8 if
  /// there is none
  static func _smallSize(value: UInt64) -> Int {
    var mask: UInt64 = 0xFF
    for i in 0..<8 {
      if (value & mask) == mask {
        return i
      }
      mask <<= 8
    }
    return 8
  }

  /// Return the 64-bit small value for `value`, with the top bit (dropped
  /// when the value was truncated to 63 bits) set back to 1.
  static func _smallValue(value: Builtin.Int63) -> UInt64 {
    return UInt64(Builtin.zext_Int63_Int64(value)) | (1<<63)
  }

  /// Return a generator over the UTF-8 code units packed into `u8`, lowest
  /// byte first, stopping at the first 0xFF byte.
  internal static func _makeSmallUTF8Generator(var u8: UInt64)
    -> GeneratorOf<UTF8.CodeUnit> {
    return GeneratorOf<UTF8.CodeUnit> {
      let result = UInt8(truncatingBitPattern: u8)
      if result == 0xFF {
        return nil
      }
      // Shift 0xFF in from the top rather than zeros.  A value holding the
      // maximum of 8 code units has no padding byte, so shifting in zeros
      // would yield 0x00 code units forever instead of terminating.
      u8 = (u8 >> 8) | 0xFF00000000000000
      return result
    }
  }

  /// A collection view of the UTF-8 code units packed into a small value.
  internal struct _SmallUTF8 : CollectionType {
    /// Create a view of the code units in `u8`; the count is the index of
    /// the lowest 0xFF byte, or 8 if there is none.
    init(_ u8: UInt64) {
      let count = Character._smallSize(u8)
      _sanityCheck(count <= 8, "Character with more than 8 UTF-8 code units")
      self.count = UInt16(count)
      self.data = u8
    }

    /// The position of the first element in a non-empty collection.
    ///
    /// Identical to `endIndex` in an empty collection.
    var startIndex: Int {
      return 0
    }

    /// The collection's "past the end" position.
    ///
    /// `endIndex` is not a valid argument to `subscript`, and is always
    /// reachable from `startIndex` by zero or more applications of
    /// `successor()`.
    var endIndex: Int {
      return Int(count)
    }

    /// Access the code unit at `position`.
    ///
    /// Requires: `position` is a valid position in `self` and
    /// `position != endIndex`.
    subscript(position: Int) -> UTF8.CodeUnit {
      _sanityCheck(position >= 0)
      _sanityCheck(position < Int(count))
      // Note: using unchecked arithmetic because overflow can not happen if
      // the above sanity checks hold.
      return UTF8.CodeUnit(
        truncatingBitPattern: data >> (UInt64(position) &* 8))
    }

    /// Return a *generator* over the elements of this *sequence*.
    ///
    /// Complexity: O(1)
    func generate() -> IndexingGenerator<_SmallUTF8> {
      return IndexingGenerator(self)
    }

    /// The number of UTF-8 code units stored in `data`.
    var count: UInt16
    /// The packed code units; the first code unit is in the lowest byte.
    var data: UInt64
  }

  /// A sink that packs UTF-16 code units into a `UInt64`.
  ///
  /// Each new code unit is placed in the lowest 16 bits after the earlier
  /// ones are shifted up, so the first code unit received ends up in the
  /// most significant occupied position.
  internal struct _SmallUTF16Sink : SinkType {
    mutating func put(x: UTF16.CodeUnit) {
      u16 = u16 << 16
      u16 = u16 | UInt64(x)
    }
    var u16: UInt64 = 0
  }

  /// A collection view of the UTF-16 code units obtained by transcoding the
  /// UTF-8 code units packed into a small value.
  struct _SmallUTF16 : CollectionType {
    /// Transcode the UTF-8 code units packed in `u8` to UTF-16 and store
    /// them.
    init(_ u8: UInt64) {
      let count = UTF16.measure(
        UTF8.self, input: Character._makeSmallUTF8Generator(u8),
        repairIllFormedSequences: true)!.0
      _sanityCheck(count <= 4, "Character with more than 4 UTF-16 code units")
      self.count = UInt16(count)
      var output = _SmallUTF16Sink()
      transcode(
        UTF8.self, UTF16.self, Character._makeSmallUTF8Generator(u8), &output,
        stopOnError: false)
      self.data = output.u16
    }

    /// The position of the first element in a non-empty collection.
    ///
    /// Identical to `endIndex` in an empty collection.
    var startIndex : Int {
      return 0
    }

    /// The collection's "past the end" position.
    ///
    /// `endIndex` is not a valid argument to `subscript`, and is always
    /// reachable from `startIndex` by zero or more applications of
    /// `successor()`.
    var endIndex : Int {
      return Int(count)
    }

    /// Access the code unit at `position`.
    ///
    /// Requires: `position` is a valid position in `self` and
    /// `position != endIndex`.
    subscript(position: Int) -> UTF16.CodeUnit {
      _sanityCheck(position >= 0)
      _sanityCheck(position < Int(count))
      // `_SmallUTF16Sink` leaves the first code unit in the highest occupied
      // 16 bits and the last one in the lowest, so index from the top.
      // Reading `data >> (position * 16)` would return the code units in
      // reverse order (e.g. the low surrogate before the high surrogate).
      //
      // Note: using unchecked arithmetic because overflow can not happen if
      // the above sanity checks hold.
      return UTF16.CodeUnit(
        truncatingBitPattern:
          data >> ((UInt64(count) &- UInt64(position) &- 1) &* 16))
    }

    /// Return a *generator* over the elements of this *sequence*.
    ///
    /// Complexity: O(1)
    func generate() -> IndexingGenerator<_SmallUTF16> {
      return IndexingGenerator(self)
    }

    /// The number of UTF-16 code units stored in `data`.
    var count: UInt16
    /// The packed code units; the last code unit is in the lowest 16 bits.
    var data: UInt64
  }

  /// The hash value.
  ///
  /// **Axiom:** `x == y` implies `x.hashValue == y.hashValue`
  ///
  /// **Note:** the hash value is not guaranteed to be stable across
  /// different invocations of the same program.  Do not persist the
  /// hash value across program runs.
  public var hashValue: Int {
    // FIXME(performance): constructing a temporary string is extremely
    // wasteful and inefficient.
    return String(self).hashValue
  }

  typealias UTF16View = String.UTF16View

  /// The UTF-16 view of a `String` built from this character.
  var utf16: UTF16View {
    return String(self).utf16
  }

  /// The underlying small or large storage of this character.
  internal var _representation: Representation
}
extension String {
  /// Construct an instance containing just the given `Character`.
  public init(_ c: Character) {
    switch c._representation {
    case let .Large(storage):
      // Wrap the character's string storage in a buffer and core.
      self = String(_StringCore(_StringBuffer(storage)))
    case let .Small(packedBits):
      // Restore the top bit of the packed value, then decode its UTF-8
      // code units into a string.
      let codeUnits = Character._SmallUTF8(Character._smallValue(packedBits))
      self = String._fromWellFormedCodeUnitSequence(
        UTF8.self, input: codeUnits)
    }
  }
}
/// The smallest `.Small` payload that denotes an ASCII character.
///
/// A `.Small` character keeps its UTF-8 code units in an Int63, with every
/// unused byte set to 0xFF.  An ASCII character therefore has 0xFF in all
/// bytes except the lowest, which holds the ASCII value.  Because
/// 0x7FFFFFFFFFFFFF80 or greater is an invalid UTF-8 sequence, a payload is
/// ASCII exactly when it is greater than or equal to 0x7FFFFFFFFFFFFF00.
internal var _minASCIICharReprBuiltin: Builtin.Int63 {
  @inline(__always) get {
    let threshold: Int64 = 0x7FFFFFFFFFFFFF00
    return Builtin.truncOrBitCast_Int64_Int63(threshold.value)
  }
}
/// Return `true` if `lhs` and `rhs` are equal.
///
/// When both characters are small and ASCII their payload bits are compared
/// directly; in every other case the comparison is made on `String`s built
/// from the characters.
public func ==(lhs: Character, rhs: Character) -> Bool {
  switch (lhs._representation, rhs._representation) {
  case let (.Small(left), .Small(right)):
    let bothASCII =
      Bool(Builtin.cmp_uge_Int63(left, _minASCIICharReprBuiltin))
      && Bool(Builtin.cmp_uge_Int63(right, _minASCIICharReprBuiltin))
    if bothASCII {
      return Bool(Builtin.cmp_eq_Int63(left, right))
    }
  default:
    break
  }
  // FIXME(performance): constructing two temporary strings is extremely
  // wasteful and inefficient.
  return String(lhs) == String(rhs)
}
/// Return `true` if `lhs` orders before `rhs`.
///
/// When both characters are small and ASCII their payload bits are compared
/// directly as unsigned integers; in every other case the comparison is made
/// on `String`s built from the characters.
public func <(lhs: Character, rhs: Character) -> Bool {
  switch (lhs._representation, rhs._representation) {
  case let (.Small(left), .Small(right)):
    // Note: This is consistent with Foundation but unicode incorrect.
    // See String._lessThanASCII.
    let bothASCII =
      Bool(Builtin.cmp_uge_Int63(left, _minASCIICharReprBuiltin))
      && Bool(Builtin.cmp_uge_Int63(right, _minASCIICharReprBuiltin))
    if bothASCII {
      return Bool(Builtin.cmp_ult_Int63(left, right))
    }
  default:
    break
  }
  // FIXME(performance): constructing two temporary strings is extremely
  // wasteful and inefficient.
  return String(lhs) < String(rhs)
}