mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
Use UBreakIterators to perform grapheme breaking. This gives Unicode 9 grapheme breaking (e.g. family emoji) and provides a means to upgrade to future versions. It also serves as a model for how to serve up other advanced functionality in ICU to users. This has tricky performance implications. Some things are faster and a number of cases are slower. But, careful use of ICU can help mitigate and amortize these costs. In conjunction with more early detection of fast paths, overall grapheme breaking for the average user should be much faster than in Swift 3. NOTE: This is incomplete. It currently falls back on the legacy tries for some bridged strings. There are many potential directions for a general solution, but for now we'll be iteratively adding support for more and more special cases.
704 lines
24 KiB
Swift
704 lines
24 KiB
Swift
//===--- StringCharacterView.swift - String's Collection of Characters ----===//
|
||
//
|
||
// This source file is part of the Swift.org open source project
|
||
//
|
||
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
|
||
// Licensed under Apache License v2.0 with Runtime Library Exception
|
||
//
|
||
// See https://swift.org/LICENSE.txt for license information
|
||
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
||
//
|
||
//===----------------------------------------------------------------------===//
|
||
//
|
||
// String is-not-a Sequence or Collection, but it exposes a
|
||
// collection of characters.
|
||
//
|
||
//===----------------------------------------------------------------------===//
|
||
|
||
// FIXME(ABI)#70 : The character string view should have a custom iterator type to
|
||
// allow performance optimizations of linear traversals.
|
||
|
||
/// ASCII carriage return (U+000D). CR and LF are common special cases in
/// grapheme breaking logic.
internal let _CR: UInt8 = 0x0D

/// ASCII line feed (U+000A), the second half of the CR-LF special case.
internal let _LF: UInt8 = 0x0A
|
||
|
||
import SwiftShims
|
||
|
||
extension String {
|
||
/// A view of a string's contents as a collection of characters.
|
||
///
|
||
/// In Swift, every string provides a view of its contents as characters. In
|
||
/// this view, many individual characters---for example, "é", "김", and
|
||
/// "🇮🇳"---can be made up of multiple Unicode code points. These code points
|
||
/// are combined by Unicode's boundary algorithms into *extended grapheme
|
||
/// clusters*, represented by the `Character` type. Each element of a
|
||
/// `CharacterView` collection is a `Character` instance.
|
||
///
|
||
/// let flowers = "Flowers 💐"
|
||
/// for c in flowers.characters {
|
||
/// print(c)
|
||
/// }
|
||
/// // F
|
||
/// // l
|
||
/// // o
|
||
/// // w
|
||
/// // e
|
||
/// // r
|
||
/// // s
|
||
/// //
|
||
/// // 💐
|
||
///
|
||
/// You can convert a `String.CharacterView` instance back into a string
|
||
/// using the `String` type's `init(_:)` initializer.
|
||
///
|
||
/// let name = "Marie Curie"
|
||
/// if let firstSpace = name.characters.index(of: " ") {
|
||
/// let firstName = String(name.characters[..<firstSpace])
|
||
/// print(firstName)
|
||
/// }
|
||
/// // Prints "Marie"
|
||
public struct CharacterView {
  // The storage backing this view.
  internal var _core: _StringCore

  /// The offset of this view's `_core` from an original core. This works
  /// around the fact that `_StringCore` is always zero-indexed.
  /// `_coreOffset` should be subtracted from `UnicodeScalarIndex._position`
  /// before that value is used as a `_core` index.
  internal var _coreOffset: Int

  /// Creates a view of the given string.
  ///
  /// The resulting view uses the string's core with an offset of zero.
  public init(_ text: String) {
    self.init(text._core, coreOffset: 0)
  }

  // Memberwise-style initializer: stores the given core and offset as-is.
  public // @testable
  init(_ _core: _StringCore, coreOffset: Int = 0) {
    self._core = _core
    self._coreOffset = coreOffset
  }
}
|
||
|
||
/// A view of the string's contents as a collection of characters.
///
/// Reading this property wraps the string in a new `CharacterView`;
/// assigning to it replaces the string with one built from the new view.
public var characters: CharacterView {
  get { return CharacterView(self) }
  set { self = String(newValue) }
}
|
||
|
||
/// Applies the given closure to a mutable view of the string's characters.
|
||
///
|
||
/// Do not use the string that is the target of this method inside the
|
||
/// closure passed to `body`, as it may not have its correct value. Instead,
|
||
/// use the closure's `CharacterView` argument.
|
||
///
|
||
/// This example below uses the `withMutableCharacters(_:)` method to
|
||
/// truncate the string `str` at the first space and to return the remainder
|
||
/// of the string.
|
||
///
|
||
/// var str = "All this happened, more or less."
|
||
/// let afterSpace = str.withMutableCharacters { chars -> String.CharacterView in
|
||
/// if let i = chars.index(of: " ") {
|
||
/// let result = chars[chars.index(after: i)...]
|
||
/// chars.removeSubrange(i...)
|
||
/// return result
|
||
/// }
|
||
/// return String.CharacterView()
|
||
/// }
|
||
///
|
||
/// print(str)
|
||
/// // Prints "All"
|
||
/// print(String(afterSpace))
|
||
/// // Prints "this happened, more or less."
|
||
///
|
||
/// - Parameter body: A closure that takes a character view as its argument.
|
||
/// The `CharacterView` argument is valid only for the duration of the
|
||
/// closure's execution.
|
||
/// - Returns: The return value of the `body` closure, if any, is the return
|
||
/// value of this method.
|
||
public mutating func withMutableCharacters<R>(
  _ body: (inout CharacterView) -> R
) -> R {
  // Naively mutating self.characters forces multiple references to
  // exist at the point of mutation. Instead, temporarily move the
  // core of this string into a CharacterView, leaving the empty view's
  // core behind in `self` for the duration of `body`.
  var scratch = CharacterView("")
  swap(&_core, &scratch._core)
  // Move the (possibly mutated) core back into the string once `body`
  // has produced its result.
  defer { swap(&_core, &scratch._core) }
  return body(&scratch)
}
|
||
|
||
/// Creates a string from the given character view.
|
||
///
|
||
/// Use this initializer to recover a string after performing a collection
|
||
/// slicing operation on a string's character view.
|
||
///
|
||
/// let poem = """
|
||
/// 'Twas brillig, and the slithy toves /
|
||
/// Did gyre and gimbal in the wabe: /
|
||
/// All mimsy were the borogoves /
|
||
/// And the mome raths outgrabe.
|
||
/// """
|
||
/// let excerpt = String(poem.characters.prefix(22)) + "..."
|
||
/// print(excerpt)
|
||
/// // Prints "'Twas brillig, and the..."
|
||
///
|
||
/// - Parameter characters: A character view to convert to a string.
|
||
public init(_ characters: CharacterView) {
  // The string is built straight from the view's core; the view's
  // `_coreOffset` is not consulted here.
  let storage = characters._core
  self.init(storage)
}
|
||
}
|
||
|
||
/// `String.CharacterView` is a collection of `Character`.
|
||
extension String.CharacterView : BidirectionalCollection {
|
||
internal typealias UnicodeScalarView = String.UnicodeScalarView

/// The same contents as a Unicode scalar view, constructed from this view's
/// `_core` and `_coreOffset`.
internal var unicodeScalars: UnicodeScalarView {
  let scalars = UnicodeScalarView(_core, coreOffset: _coreOffset)
  return scalars
}
|
||
|
||
/// A position in a string's `CharacterView` instance.
|
||
///
|
||
/// You can convert between indices of the different string views by using
|
||
/// conversion initializers and the `samePosition(in:)` method overloads.
|
||
/// The following example finds the index of the first space in the string's
|
||
/// character view and then converts that to the same position in the UTF-8
|
||
/// view:
|
||
///
|
||
/// let hearts = "Hearts <3 ♥︎ 💘"
|
||
/// if let i = hearts.characters.index(of: " ") {
|
||
/// let j = i.samePosition(in: hearts.utf8)
|
||
/// print(Array(hearts.utf8[..<j]))
|
||
/// }
|
||
/// // Prints "[72, 101, 97, 114, 116, 115]"
|
||
public struct Index : Comparable, CustomPlaygroundQuickLookable {
  // The scalar-view position at which this character starts.
  internal let _base: UnicodeScalarView.Index

  /// The count of this extended grapheme cluster in UTF-16 code units.
  internal let _countUTF16: Int

  // Builds an index at `_base`, measuring the grapheme cluster that starts
  // there by asking the character view `c`.
  public // SPI(Foundation)
  init(_base: String.UnicodeScalarView.Index, in c: String.CharacterView) {
    self.init(
      _base: _base,
      _countUTF16: c._measureExtendedGraphemeClusterForward(from: _base))
  }

  // Builds an index from an already-known cluster length.
  internal init(_base: UnicodeScalarView.Index, _countUTF16: Int) {
    self._base = _base
    self._countUTF16 = _countUTF16
  }

  /// The integer offset of this index in UTF-16 code units.
  public // SPI(Foundation)
  var _utf16Index: Int { return _base._position }

  /// The one past end index for this extended grapheme cluster in Unicode
  /// scalars.
  internal var _endBase: UnicodeScalarView.Index {
    let onePastCluster = _base._position + _countUTF16
    return UnicodeScalarView.Index(_position: onePastCluster)
  }

  // Playgrounds display the index as its UTF-16 offset.
  public var customPlaygroundQuickLook: PlaygroundQuickLook {
    return .int(Int64(_base._position))
  }
}
|
||
|
||
/// The type used to express distances between indices of this view.
public typealias IndexDistance = Int
|
||
|
||
/// The position of the first character in a nonempty character view.
///
/// In an empty character view, `startIndex` is equal to `endIndex`.
public var startIndex: Index {
  // Building the index also measures the first grapheme cluster.
  let firstScalar = unicodeScalars.startIndex
  return Index(_base: firstScalar, in: self)
}
|
||
|
||
/// A character view's "past the end" position---that is, the position one
/// greater than the last valid subscript argument.
///
/// In an empty character view, `endIndex` is equal to `startIndex`.
public var endIndex: Index {
  let pastLastScalar = unicodeScalars.endIndex
  return Index(_base: pastLastScalar, in: self)
}
|
||
|
||
/// Returns the next consecutive position after `i`.
///
/// - Parameter i: A valid index of the view that is less than `endIndex`.
/// - Returns: The index of the character that starts where the character at
///   `i` ends.
/// - Precondition: The next position is valid.
public func index(after i: Index) -> Index {
  let scalars = unicodeScalars
  _precondition(i._base < scalars.endIndex,
    "cannot increment beyond endIndex")
  _precondition(i._base >= scalars.startIndex,
    "cannot increment invalid index")
  // The next character begins exactly where this cluster ends.
  let nextBase = i._endBase
  return Index(_base: nextBase, in: self)
}
|
||
|
||
/// Returns the previous consecutive position before `i`.
///
/// - Parameter i: A valid index of the view that is greater than
///   `startIndex`.
/// - Returns: The index of the character that ends where `i` begins.
/// - Precondition: The previous position is valid.
public func index(before i: Index) -> Index {
  let scalars = unicodeScalars
  _precondition(i._base > scalars.startIndex,
    "cannot decrement before startIndex")
  _precondition(i._base <= scalars.endIndex,
    "cannot decrement invalid index")
  // Measure the cluster that ends at `i`, then step back by its length.
  let clusterLength = _measureExtendedGraphemeClusterBackward(from: i._base)
  let clusterStart = UnicodeScalarView.Index(
    _position: i._utf16Index - clusterLength)
  return Index(_base: clusterStart, in: self)
}
|
||
|
||
/// Fast check for a (stable) grapheme break between two UInt16 code units.
///
/// - Parameters:
///   - lhs: The UTF-16 code unit before the candidate boundary.
///   - rhs: The UTF-16 code unit after the candidate boundary.
/// - Returns: `true` when this check establishes that there is a grapheme
///   break between `lhs` and `rhs`. `false` means the quick check could not
///   establish a break; the measuring functions in this file then fall
///   through to their slower paths.
@inline(__always)
internal static func _quickCheckGraphemeBreakBetween(
  _ lhs: UInt16, _ rhs: UInt16
) -> Bool {
  // With the exception of CR-LF, there is always a grapheme break between two
  // sub-0x300 code units. Only the exact pair "CR followed by LF" is
  // excluded: a CR followed by anything else, or an LF preceded by anything
  // else, still has a break, so those pairs must not be pushed onto the slow
  // path.
  if lhs < 0x300 && rhs < 0x300 {
    return !(lhs == UInt16(_CR) && rhs == UInt16(_LF))
  }

  return _internalExtraCheckGraphemeBreakBetween(lhs, rhs)
}
|
||
|
||
// A quick check helper to quickly perform extra grapheme-break-between
// checks that tightly integrate Unicode version-specific assumptions. Should
// never be inlined into user code, as it is version-specific.
//
// TODO: this is actually fine to inline into non-inlinable code
//
// Returns true only when both code units fall in one of the ranges listed
// below; any other pair yields false.
@inline(never) // @inline(resilient_only)
internal static func _internalExtraCheckGraphemeBreakBetween(
  _ lhs: UInt16, _ rhs: UInt16
) -> Bool {
  // Whether the given scalar, when it appears paired with another scalar
  // satisfying this property, has a grapheme break between it and the other
  // scalar.
  func hasBreakWhenPaired(_ unit: UInt16) -> Bool {
    // TODO: This doesn't generate optimal code, tune/re-write at a lower level.
    switch unit {
    // Unified CJK Han ideographs, common and some supplemental, amongst
    // others:
    //   0x3400-0xA4CF
    case 0x3400...0xa4cf:
      return true

    // Non-combining kana:
    //   0x3041-0x3096
    //   0x30A1-0x30FA
    //
    // TODO: may be faster to verify whether only 3099 and 309A don't have
    // this property, and compare not-equal rather than using two ranges.
    case 0x3041...0x3096, 0x30a1...0x30fa:
      return true

    // TODO: sub-300 check would also be valuable, e.g. when breaking at the
    // boundary between English embedded in Chinese.
    default:
      return false
    }
  }

  guard hasBreakWhenPaired(lhs) else { return false }
  return hasBreakWhenPaired(rhs)
}
|
||
|
||
// NOTE: don't make this function inlineable.  Grapheme cluster
// segmentation uses a completely different algorithm in Unicode 9.0.
//
/// Returns the length of the first extended grapheme cluster in UTF-16
/// code units.
///
/// - Parameter start: The scalar-view position at which the cluster begins.
///   Its `_position` is expressed in the original core's coordinates, i.e.
///   it still includes `_coreOffset`.
/// - Returns: `0` when `start` is the end of the view; otherwise the number
///   of UTF-16 code units in the cluster that begins at `start`.
@inline(never) // Don't remove, see above.
internal func _measureExtendedGraphemeClusterForward(
  from start: UnicodeScalarView.Index
) -> Int {
  let end = unicodeScalars.endIndex
  if start == end {
    return 0
  }

  // Our relative position (offset). If our _core is not a substring, this is
  // the same as start._position.
  let relativeOffset = start._position - _coreOffset

  // Grapheme breaking is much simpler if known ASCII
  if _core.isASCII {
    _onFastPath() // Please aggressively inline
    let asciiBuffer = _core.asciiBuffer._unsafelyUnwrappedUnchecked

    // With the exception of CR-LF, ASCII graphemes are single-scalar. Check
    // for that one exception.
    if _slowPath(
      asciiBuffer[relativeOffset] == _CR &&
      relativeOffset+1 < asciiBuffer.endIndex &&
      asciiBuffer[relativeOffset+1] == _LF
    ) {
      return 2
    }

    return 1
  } else {
    // TODO: Check for (potentially non-contiguous) ASCII NSStrings,
    // especially small tagged pointers.
  }

  let startIndexUTF16 = start._position

  // Last scalar is its own grapheme
  if (startIndexUTF16+1 == end._position) {
    return 1
  }

  // Perform a quick single-code-unit grapheme check
  if _core._baseAddress != nil {
    if String.CharacterView._quickCheckGraphemeBreakBetween(
      _core._nthContiguous(relativeOffset),
      _core._nthContiguous(relativeOffset+1)
    ) {
      return 1
    }
  } else {
    // TODO: Check for (potentially non-contiguous) UTF16 NSStrings,
    // especially small tagged pointers
  }

  if _core._baseAddress != nil {
    _onFastPath() // Please aggressively inline
    let breakIterator = _ThreadLocalStorage.getUBreakIterator(for: _core)
    let ubrkFollowing = __swift_stdlib_ubrk_following(
      breakIterator, Int32(relativeOffset)
    )
    // ubrk_following may return UBRK_DONE (-1). Treat that as the rest of the
    // string. The iterator is queried with core-relative offsets, so the
    // fallback must be core-relative as well: `end._position` still includes
    // `_coreOffset` and has to be rebased before it is compared against
    // `relativeOffset`.
    let relativeEnd = end._position - _coreOffset
    let nextPosition =
      ubrkFollowing == -1 ? relativeEnd : Int(ubrkFollowing)
    return nextPosition - relativeOffset
  } else {
    // TODO: See if we can get fast character contents.
  }

  // FIXME: Need to handle the general case correctly with Unicode 9+
  // semantics, as opposed to this legacy Unicode 8 path. This gets hit for
  // e.g. non-contiguous NSStrings. In such cases, there may be an alternative
  // CFString API available, or worst case we can map over it via UTextFuncs.

  return legacyGraphemeForward(
    start: start, end: end, startIndexUTF16: startIndexUTF16
  )
}
|
||
|
||
// Legacy forward measurement: walks scalars from `start` towards `end`,
// consulting the grapheme-cluster-break property trie and segmenter, and
// returns the distance in UTF-16 code units from `startIndexUTF16` to the
// position where the walk stopped.
@inline(never)
func legacyGraphemeForward(
  start: UnicodeScalarView.Index,
  end: UnicodeScalarView.Index,
  startIndexUTF16: Int
) -> Int {
  let scalars = unicodeScalars
  let breakProperties = _UnicodeGraphemeClusterBreakPropertyTrie()
  let segmenter = _UnicodeExtendedGraphemeClusterSegmenter()

  // Property of the scalar most recently absorbed into the cluster.
  var previousProperty = breakProperties.getPropertyRawValue(
    scalars[start].value)
  var cursor = scalars.index(after: start)

  // FIXME(performance): consider removing the `isBoundaryAfter` "fast path".
  // A branch that is hard to predict could be worse for performance than a
  // few loads from cache to fetch the next scalar's property.
  while cursor != end && !segmenter.isBoundaryAfter(previousProperty) {
    let currentProperty = breakProperties.getPropertyRawValue(
      scalars[cursor].value)
    if segmenter.isBoundary(previousProperty, currentProperty) {
      break
    }
    previousProperty = currentProperty
    cursor = scalars.index(after: cursor)
  }

  return cursor._position - startIndexUTF16
}
|
||
|
||
// NOTE: don't make this function inlineable.  Grapheme cluster
// segmentation uses a completely different algorithm in Unicode 9.0.
//
/// Returns the length of the previous extended grapheme cluster in UTF-16
/// code units.
///
/// - Parameter end: The scalar-view position just past the cluster to
///   measure. Its `_position` is expressed in the original core's
///   coordinates, i.e. it still includes `_coreOffset`.
/// - Returns: `0` when `end` is the start of the view; otherwise the number
///   of UTF-16 code units in the cluster that ends at `end`.
@inline(never) // Don't remove, see above.
internal func _measureExtendedGraphemeClusterBackward(
  from end: UnicodeScalarView.Index
) -> Int {
  let start = unicodeScalars.startIndex
  if start == end {
    return 0
  }

  // The relative position (offset) to the last code unit.
  let lastOffset = end._position - _coreOffset - 1
  // The relative position (offset) that is one-past-the-last
  let endOffset = lastOffset + 1

  // Grapheme breaking is much simpler if known ASCII
  if _core.isASCII {
    _onFastPath() // Please aggressively inline
    let asciiBuffer = _core.asciiBuffer._unsafelyUnwrappedUnchecked
    _sanityCheck(
      lastOffset >= asciiBuffer.startIndex,
      "should of been caught in earlier start-of-scalars check")

    // With the exception of CR-LF, ASCII graphemes are single-scalar. Check
    // for that one exception.
    if _slowPath(
      asciiBuffer[lastOffset] == _LF &&
      lastOffset-1 >= asciiBuffer.startIndex &&
      asciiBuffer[lastOffset-1] == _CR
    ) {
      return 2
    }

    return 1
  }

  let endIndexUTF16 = end._position

  // First scalar is its own grapheme
  if (endIndexUTF16-1 == start._position) {
    return 1
  }

  // Perform a quick single-code-unit grapheme check
  if _core._baseAddress != nil {
    if String.CharacterView._quickCheckGraphemeBreakBetween(
      _core._nthContiguous(lastOffset-1),
      _core._nthContiguous(lastOffset)
    ) {
      return 1
    }
  }

  if _core._baseAddress != nil {
    _onFastPath() // Please aggressively inline
    let breakIterator = _ThreadLocalStorage.getUBreakIterator(for: _core)
    let ubrkPreceding = __swift_stdlib_ubrk_preceding(
      breakIterator, Int32(endOffset)
    )
    // ubrk_preceding may return UBRK_DONE (-1). Treat that as the rest of the
    // string. The iterator is queried with core-relative offsets, so the
    // fallback must be core-relative as well: `start._position` still
    // includes `_coreOffset` and has to be rebased before it is subtracted
    // from `endOffset`.
    let relativeStart = start._position - _coreOffset
    let priorPosition =
      ubrkPreceding == -1 ? relativeStart : Int(ubrkPreceding)
    return endOffset - priorPosition
  } else {
    // TODO: See if we can get fast character contents.
  }

  // FIXME: Need to handle the general case correctly with Unicode 9+
  // semantics, as opposed to this legacy Unicode 8 path. This gets hit for
  // e.g. non-contiguous NSStrings. In such cases, there may be an alternative
  // CFString API available, or worst case we can map over it via UTextFuncs.

  return legacyGraphemeBackward(
    start: start, end: end, endIndexUTF16: endIndexUTF16
  )
}
|
||
|
||
// Legacy backward measurement: walks scalars from `end` back towards
// `start`, consulting the grapheme-cluster-break property trie and
// segmenter, and returns the distance in UTF-16 code units from the
// cluster's first scalar to `endIndexUTF16`.
@inline(never)
func legacyGraphemeBackward(
  start: UnicodeScalarView.Index,
  end: UnicodeScalarView.Index,
  endIndexUTF16: Int
) -> Int {
  let scalars = unicodeScalars
  let breakProperties = _UnicodeGraphemeClusterBreakPropertyTrie()
  let segmenter = _UnicodeExtendedGraphemeClusterSegmenter()

  // The earliest scalar known to belong to the cluster so far; the scalar
  // immediately before `end` always does.
  var clusterStart = scalars.index(before: end)
  var followingProperty = breakProperties.getPropertyRawValue(
    scalars[clusterStart].value)

  while clusterStart != start {
    // Look at the scalar just before the cluster and only extend the cluster
    // over it when there is no boundary between the two.
    let candidate = scalars.index(before: clusterStart)
    let candidateProperty = breakProperties.getPropertyRawValue(
      scalars[candidate].value)
    if segmenter.isBoundary(candidateProperty, followingProperty) {
      break
    }
    followingProperty = candidateProperty
    clusterStart = candidate
  }

  return endIndexUTF16 - clusterStart._position
}
|
||
|
||
/// Accesses the character at the given position.
|
||
///
|
||
/// The following example searches a string's character view for a capital
|
||
/// letter and then prints the character at the found index:
|
||
///
|
||
/// let greeting = "Hello, friend!"
|
||
/// if let i = greeting.characters.index(where: { "A"..."Z" ~= $0 }) {
|
||
/// print("First capital letter: \(greeting.characters[i])")
|
||
/// }
|
||
/// // Prints "First capital letter: H"
|
||
///
|
||
/// - Parameter position: A valid index of the character view. `position`
|
||
/// must be less than the view's end index.
|
||
public subscript(i: Index) -> Character {
  // For single-code-unit graphemes, we can construct a Character directly
  // from a single unicode scalar (if sub-surrogate).
  if i._countUTF16 == 1 {
    let offsetInCore = i._base._position - _coreOffset

    if _core.isASCII {
      let asciiBuffer = _core.asciiBuffer._unsafelyUnwrappedUnchecked
      // Bounds checks in an UnsafeBufferPointer (asciiBuffer) are only
      // performed in Debug mode, so they need to be duplicated here.
      // Falling back to the non-optimal behavior in the case they don't
      // pass.
      let isInBounds =
        offsetInCore >= asciiBuffer.startIndex &&
        offsetInCore < asciiBuffer.endIndex
      if isInBounds {
        let asciiUnit = asciiBuffer[offsetInCore]
        return Character(Unicode.Scalar(asciiUnit))
      }
    } else if _core._baseAddress != nil {
      let codeUnit = _core._nthContiguous(offsetInCore)
      // Only constructible if sub-surrogate
      if codeUnit < 0xd800 {
        let scalar = Unicode.Scalar(codeUnit)._unsafelyUnwrappedUnchecked
        return Character(scalar)
      }
    }
  }

  // General path: build the character from the scalars spanning the whole
  // grapheme cluster.
  let clusterScalars = unicodeScalars[i._base..<i._endBase]
  return Character(String(clusterScalars))
}
|
||
}
|
||
|
||
extension String.CharacterView : RangeReplaceableCollection {
|
||
/// Creates an empty character view.
///
/// The view is built from the empty string literal.
public init() {
  let emptyText = ""
  self.init(emptyText)
}
|
||
|
||
/// Replaces the characters within the specified bounds with the given
|
||
/// characters.
|
||
///
|
||
/// Invalidates all indices with respect to the string.
|
||
///
|
||
/// - Parameters:
|
||
/// - bounds: The range of characters to replace. The bounds of the range
|
||
/// must be valid indices of the character view.
|
||
/// - newElements: The new characters to add to the view.
|
||
///
|
||
/// - Complexity: O(*m*), where *m* is the combined length of the character
|
||
/// view and `newElements`. If the call to `replaceSubrange(_:with:)`
|
||
/// simply removes characters at the end of the view, the complexity is
|
||
/// O(*n*), where *n* is equal to `bounds.count`.
|
||
public mutating func replaceSubrange<C>(
  _ bounds: Range<Index>,
  with newElements: C
) where C : Collection, C.Element == Character {
  // Index positions include `_coreOffset`; subtract it to obtain offsets
  // usable with `_core`.
  let lowerOffset = bounds.lowerBound._base._position - _coreOffset
  let upperOffset = bounds.upperBound._base._position - _coreOffset
  let rawSubRange: Range<Int> = lowerOffset..<upperOffset

  // Feed the replacement to the core as one lazily flattened sequence of
  // UTF-16 code units.
  let replacementUnits = newElements.lazy.flatMap { $0.utf16 }
  _core.replaceSubrange(rawSubRange, with: replacementUnits)
}
|
||
|
||
/// Reserves enough space in the character view's underlying storage to store
|
||
/// the specified number of ASCII characters.
|
||
///
|
||
/// Because each element of a character view can require more than a single
|
||
/// ASCII character's worth of storage, additional allocation may be
|
||
/// necessary when adding characters to the character view after a call to
|
||
/// `reserveCapacity(_:)`.
|
||
///
|
||
/// - Parameter n: The minimum number of ASCII character's worth of storage
|
||
/// to allocate.
|
||
///
|
||
/// - Complexity: O(*n*), where *n* is the capacity being reserved.
|
||
public mutating func reserveCapacity(_ n: Int) {
  // The request is forwarded unchanged to the underlying core.
  let requestedCapacity = n
  _core.reserveCapacity(requestedCapacity)
}
|
||
|
||
/// Appends the given character to the character view.
///
/// - Parameter c: The character to append to the character view.
public mutating func append(_ c: Character) {
  // Small characters carry their payload inline; decode it to UTF-16 and
  // append the code units directly.
  if case .small(let payload) = c._representation {
    let smallBytes = Character._smallValue(payload)
    _core.append(contentsOf: Character._SmallUTF16(smallBytes))
    return
  }

  // Large representation: go through a `String` and append its core.
  _core.append(String(c)._core)
}
|
||
|
||
/// Appends the characters in the given sequence to the character view.
///
/// - Parameter newElements: A sequence of characters.
public mutating func append<S : Sequence>(contentsOf newElements: S)
  where S.Element == Character {
  // Reserve up front using the sequence's own lower bound on its length,
  // then append one character at a time.
  let capacityHint = _core.count + newElements.underestimatedCount
  reserveCapacity(capacityHint)
  for character in newElements {
    append(character)
  }
}
|
||
|
||
/// Creates a new character view containing the characters in the given
/// sequence.
///
/// - Parameter characters: A sequence of characters.
public init<S : Sequence>(_ characters: S)
  where S.Element == Character {
  // Start from an empty view, then append every character of the sequence.
  self.init()
  append(contentsOf: characters)
}
|
||
}
|
||
|
||
// Algorithms
|
||
extension String.CharacterView {
|
||
/// Accesses the characters in the given range.
|
||
///
|
||
/// The example below uses this subscript to access the characters up to, but
|
||
/// not including, the first comma (`","`) in the string.
|
||
///
|
||
/// let str = "All this happened, more or less."
|
||
/// let i = str.characters.index(of: ",")!
|
||
/// let substring = str.characters[str.characters.startIndex ..< i]
|
||
/// print(String(substring))
|
||
/// // Prints "All this happened"
|
||
///
|
||
/// - Complexity: O(*n*) if the underlying string is bridged from
|
||
/// Objective-C, where *n* is the length of the string; otherwise, O(1).
|
||
public subscript(bounds: Range<Index>) -> String.CharacterView {
  // Slice the scalar view over the same range, then wrap the resulting core.
  let lowerScalar = bounds.lowerBound._base
  let upperScalar = bounds.upperBound._base
  let slicedScalars = unicodeScalars[lowerScalar..<upperScalar]

  // The slice records the lower bound's position as its `_coreOffset`.
  return String.CharacterView(
    slicedScalars._core,
    coreOffset: lowerScalar._position)
}
|
||
}
|
||
|
||
// Unavailable stubs for renamed APIs. The `@available` attributes carry the
// replacement names; the bodies are never meant to execute.
extension String.CharacterView {
  @available(*, unavailable, renamed: "replaceSubrange")
  public mutating func replaceRange<C>(
    _ subRange: Range<Index>,
    with newElements: C
  ) where C : Collection, C.Element == Character {
    Builtin.unreachable()
  }

  @available(*, unavailable, renamed: "append(contentsOf:)")
  public mutating func appendContentsOf<S : Sequence>(_ newElements: S)
    where S.Element == Character {
    Builtin.unreachable()
  }
}
|