//===----------------------------------------------------------------------===// // // This source file is part of the Swift.org open source project // // Copyright (c) 2014 - 2016 Apple Inc. and the Swift project authors // Licensed under Apache License v2.0 with Runtime Library Exception // // See https://swift.org/LICENSE.txt for license information // See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors // //===----------------------------------------------------------------------===// /// A single extended grapheme cluster, which approximates a user-perceived /// character. /// /// The `Character` type represents a character made up of one or more Unicode /// scalar values, grouped by a Unicode boundary algorithm. Generally, a /// `Character` instance matches what the reader of a string will perceive as /// a single character. The number of visible characters is generally the most /// natural way to count the length of a string. /// /// let greeting = "Hello! 🐥" /// print("Character count: \(greeting.characters.count)") /// // Prints "Character count: 8" /// /// Because each character in a string can be made up of one or more Unicode /// code points, the number of characters in a string may not match the length /// of the Unicode code point representation or the length of the string in a /// particular binary representation. /// /// print("Unicode code point count: \(greeting.unicodeScalars.count)") /// // Prints "Unicode code point count: 15" /// /// print("UTF-8 representation count: \(greeting.utf8.count)") /// // Prints "UTF-8 representation count: 18" /// /// Every `Character` instance is composed of one or more Unicode code points /// that are grouped together as an *extended grapheme cluster*. The way these /// code points are grouped is defined by a canonical, localized, or otherwise /// tailored Unicode segmentation algorithm. /// /// For example, a country's Unicode flag character is made up of two regional /// indicator code points that correspond to that country's ISO 3166-1 alpha-2 /// code. The alpha-2 code for The United States is "US", so its flag /// character is made up of the Unicode code points `"\u{1F1FA}"` (REGIONAL /// INDICATOR SYMBOL LETTER U) and `"\u{1F1F8}"` (REGIONAL INDICATOR SYMBOL /// LETTER S). When placed next to each other in a Swift string literal, these /// two code points are combined into a single grapheme cluster, represented /// by a `Character` instance in Swift. /// /// let usFlag: Character = "\u{1F1FA}\u{1F1F8}" /// print(usFlag) /// // Prints "🇺🇸" /// /// For more information about the Unicode terms used in this discussion, see /// the [Unicode.org glossary][glossary]. In particular, this discussion /// mentions [extended grapheme clusters][clusters] and [Unicode scalar /// values][scalars]. /// /// [glossary]: http://www.unicode.org/glossary/ /// [clusters]: http://www.unicode.org/glossary/#extended_grapheme_cluster /// [scalars]: http://www.unicode.org/glossary/#unicode_scalar_value public struct Character : _ExpressibleByBuiltinExtendedGraphemeClusterLiteral, ExpressibleByExtendedGraphemeClusterLiteral, Hashable { // Fundamentally, it is just a String, but it is optimized for the // common case where the UTF-8 representation fits in 63 bits. The // remaining bit is used to discriminate between small and large // representations. In the small representation, the unused bytes // are filled with 0xFF. // // If the grapheme cluster can be represented as `.small`, it // should be represented as such. @_versioned internal enum Representation { // A _StringBuffer whose first grapheme cluster is self. // NOTE: may be more than 1 Character long. case large(_StringBuffer._Storage) case small(Builtin.Int63) } /// Creates a character containing the given Unicode scalar value. /// /// - Parameter scalar: The Unicode scalar value to convert into a character. public init(_ scalar: UnicodeScalar) { var asInt: UInt64 = 0 var shift: UInt64 = 0 let output: (UTF8.CodeUnit) -> Void = { asInt |= UInt64($0) << shift shift += 8 } UTF8.encode(scalar, into: output) asInt |= (~0) << shift _representation = .small(Builtin.trunc_Int64_Int63(asInt._value)) } @effects(readonly) public init(_builtinUnicodeScalarLiteral value: Builtin.Int32) { self = Character( String._fromWellFormedCodeUnitSequence( UTF32.self, input: CollectionOfOne(UInt32(value)))) } /// Creates a character with the specified value. /// /// Do not call this initializer directly. It is used by the compiler when you /// use a string literal to initialize a `Character` instance. For example: /// /// let snowflake: Character = "❄︎" /// print(snowflake) /// // Prints "❄︎" /// /// The assignment to the `snowflake` constant calls this initializer behind /// the scenes. public init(unicodeScalarLiteral value: Character) { self = value } @effects(readonly) public init( _builtinExtendedGraphemeClusterLiteral start: Builtin.RawPointer, utf8CodeUnitCount: Builtin.Word, isASCII: Builtin.Int1 ) { self = Character( String( _builtinExtendedGraphemeClusterLiteral: start, utf8CodeUnitCount: utf8CodeUnitCount, isASCII: isASCII)) } /// Creates a character with the specified value. /// /// Do not call this initializer directly. It is used by the compiler when /// you use a string literal to initialize a `Character` instance. For /// example: /// /// let oBreve: Character = "o\u{306}" /// print(oBreve) /// // Prints "ŏ" /// /// The assignment to the `oBreve` constant calls this initializer behind the /// scenes. public init(extendedGraphemeClusterLiteral value: Character) { self = value } /// Creates a character from a single-character string. /// /// The following example creates a new character from the uppercase version /// of a string that only holds one character. /// /// let a = "a" /// let capitalA = Character(a.uppercased()) /// /// - Parameter s: The single-character string to convert to a `Character` /// instance. `s` must contain exactly one extended grapheme cluster. public init(_ s: String) { // The small representation can accept up to 8 code units as long // as the last one is a continuation. Since the high bit of the // last byte is used for the enum's discriminator, we have to // reconstruct it. As a result, we can't store 0x7f in the final // byte, because we wouldn't be able to distinguish it from an // unused 0xFF byte. Rather than trying to squeeze in other // one-byte code points there, we simplify decoding by banning // starting a code point in the last byte, and assuming that its // high bit is 1. _precondition( s._core.count != 0, "Can't form a Character from an empty String") _precondition( s.index(after: s.startIndex) == s.endIndex, "Can't form a Character from a String containing more than one extended grapheme cluster") let (count, initialUTF8) = s._core._encodeSomeUTF8(from: 0) // Notice that the result of sizeof() is a small non-zero number and can't // overflow when multiplied by 8. let bits = MemoryLayout.size(ofValue: initialUTF8) &* 8 &- 1 if _fastPath( count == s._core.count && (initialUTF8 & (1 << numericCast(bits))) != 0) { _representation = .small(Builtin.trunc_Int64_Int63(initialUTF8._value)) } else { if let native = s._core.nativeBuffer, native.start == s._core._baseAddress! { _representation = .large(native._storage) return } var nativeString = "" nativeString.append(s) _representation = .large(nativeString._core.nativeBuffer!._storage) } } /// Returns the index of the lowest byte that is 0xFF, or 8 if /// there is none. static func _smallSize(_ value: UInt64) -> Int { var mask: UInt64 = 0xFF for i in 0..<8 { if (value & mask) == mask { return i } mask <<= 8 } return 8 } static func _smallValue(_ value: Builtin.Int63) -> UInt64 { return UInt64(Builtin.zext_Int63_Int64(value)) | (1<<63) } internal struct _SmallUTF8 : RandomAccessCollection { typealias Indices = CountableRange var indices: CountableRange { return startIndex.. UTF8.CodeUnit { _sanityCheck(position >= 0) _sanityCheck(position < Int(count)) // Note: using unchecked arithmetic because overflow cannot happen if the // above sanity checks hold. return UTF8.CodeUnit( truncatingBitPattern: data >> (UInt64(position) &* 8)) } internal struct Iterator : IteratorProtocol { init(_ data: UInt64) { self._data = data } internal mutating func next() -> UInt8? { let result = UInt8(truncatingBitPattern: _data) if result == 0xFF { return nil } _data = (_data >> 8) | 0xFF00_0000_0000_0000 return result } internal var _data: UInt64 } internal func makeIterator() -> Iterator { return Iterator(data) } var count: UInt16 var data: UInt64 } struct _SmallUTF16 : RandomAccessCollection { typealias Indices = CountableRange init(_ u8: UInt64) { let count = UTF16.transcodedLength( of: _SmallUTF8(u8).makeIterator(), decodedAs: UTF8.self, repairingIllFormedSequences: true)!.0 _sanityCheck(count <= 4, "Character with more than 4 UTF-16 code units") self.count = UInt16(count) var u16: UInt64 = 0 let output: (UTF16.CodeUnit) -> Void = { u16 = u16 << 16 u16 = u16 | UInt64($0) } _ = transcode( _SmallUTF8(u8).makeIterator(), from: UTF8.self, to: UTF16.self, stoppingOnError: false, into: output) self.data = u16 } /// The position of the first element in a non-empty collection. /// /// In an empty collection, `startIndex == endIndex`. var startIndex: Int { return 0 } /// The collection's "past the end" position. /// /// `endIndex` is not a valid argument to `subscript`, and is always /// reachable from `startIndex` by zero or more applications of /// `successor()`. var endIndex: Int { return Int(count) } /// Access the code unit at `position`. /// /// - Precondition: `position` is a valid position in `self` and /// `position != endIndex`. subscript(position: Int) -> UTF16.CodeUnit { _sanityCheck(position >= 0) _sanityCheck(position < Int(count)) // Note: using unchecked arithmetic because overflow cannot happen if the // above sanity checks hold. return UTF16.CodeUnit(truncatingBitPattern: data >> ((UInt64(count) &- UInt64(position) &- 1) &* 16)) } var count: UInt16 var data: UInt64 } /// The character's hash value. /// /// Hash values are not guaranteed to be equal across different executions of /// your program. Do not save hash values to use during a future execution. public var hashValue: Int { // FIXME(performance): constructing a temporary string is extremely // wasteful and inefficient. return String(self).hashValue } typealias UTF16View = String.UTF16View var utf16: UTF16View { return String(self).utf16 } @_versioned internal var _representation: Representation } extension Character : CustomStringConvertible { public var description: String { return String(describing: self) } } extension Character : LosslessStringConvertible {} extension Character : CustomDebugStringConvertible { /// A textual representation of the character, suitable for debugging. public var debugDescription: String { return String(self).debugDescription } } extension String { /// Creates a string containing the given character. /// /// - Parameter c: The character to convert to a string. public init(_ c: Character) { switch c._representation { case let .small(_63bits): let value = Character._smallValue(_63bits) let smallUTF8 = Character._SmallUTF8(value) self = String._fromWellFormedCodeUnitSequence( UTF8.self, input: smallUTF8) case let .large(value): let buf = String(_StringCore(_StringBuffer(value))) self = buf[buf.startIndex.. Bool { switch (lhs._representation, rhs._representation) { case let (.small(lbits), .small(rbits)) where Bool(Builtin.cmp_uge_Int63(lbits, _minASCIICharReprBuiltin)) && Bool(Builtin.cmp_uge_Int63(rbits, _minASCIICharReprBuiltin)): return Bool(Builtin.cmp_eq_Int63(lbits, rbits)) default: // FIXME(performance): constructing two temporary strings is extremely // wasteful and inefficient. return String(lhs) == String(rhs) } } } extension Character : Comparable { public static func < (lhs: Character, rhs: Character) -> Bool { switch (lhs._representation, rhs._representation) { case let (.small(lbits), .small(rbits)) where // Note: This is consistent with Foundation but unicode incorrect. // See String._compareASCII. Bool(Builtin.cmp_uge_Int63(lbits, _minASCIICharReprBuiltin)) && Bool(Builtin.cmp_uge_Int63(rbits, _minASCIICharReprBuiltin)): return Bool(Builtin.cmp_ult_Int63(lbits, rbits)) default: // FIXME(performance): constructing two temporary strings is extremely // wasteful and inefficient. return String(lhs) < String(rhs) } } }