mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
459 lines
16 KiB
Swift
459 lines
16 KiB
Swift
//===----------------------------------------------------------------------===//
|
||
//
|
||
// This source file is part of the Swift.org open source project
|
||
//
|
||
// Copyright (c) 2025 Apple Inc. and the Swift project authors
|
||
// Licensed under Apache License v2.0 with Runtime Library Exception
|
||
//
|
||
// See https://swift.org/LICENSE.txt for license information
|
||
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
||
//
|
||
//===----------------------------------------------------------------------===//
|
||
|
||
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
  /// Creates an iterator that decodes this span's code units into
  /// `Unicode.Scalar`s.
  ///
  /// The returned iterator is subject to the same lifetime constraints as
  /// `self`.
  @lifetime(copy self)
  public func makeUnicodeScalarIterator() -> UnicodeScalarIterator {
    UnicodeScalarIterator(self)
  }

  /// An iterator over the `Unicode.Scalar` contents of a `UTF8Span`.
  ///
  ///     func printScalarValues(_ string: borrowing String) {
  ///       var iterator = string.utf8Span.makeUnicodeScalarIterator()
  ///       while let scalar = iterator.next() {
  ///         print(scalar.escaped(asASCII: true))
  ///       }
  ///     }
  ///
  ///     let string = "A🎉"
  ///     printScalarValues(string)
  ///     // Prints "A"
  ///     // Prints "\u{0001F389}"
  @frozen
  public struct UnicodeScalarIterator: ~Escapable {
    /// The UTF-8 content this iterator walks over.
    public let codeUnits: UTF8Span

    /// The byte offset at which the next scalar begins. This offset is
    /// always scalar-aligned.
    fileprivate(set)
    public var currentCodeUnitOffset: Int

    /// Creates an iterator over `codeUnits`, positioned at offset 0.
    @lifetime(copy codeUnits)
    public init(_ codeUnits: UTF8Span) {
      self.codeUnits = codeUnits
      self.currentCodeUnitOffset = 0
    }

    // Raw pointer vended by `codeUnits._start()`; the decoding helpers used
    // below are invoked on it together with a code unit offset.
    private var _start: UnsafeRawPointer {
      get { unsafe codeUnits._start() }
    }

    /// Decodes the scalar that begins at `currentCodeUnitOffset` and moves
    /// past it.
    ///
    /// On return, `currentCodeUnitOffset` is the end of the returned scalar,
    /// which is also where the next scalar starts.
    ///
    /// Returns `nil` if the iterator is at the end of the `UTF8Span`.
    ///
    /// - Complexity: O(1)
    @lifetime(self: copy self)
    public mutating func next() -> Unicode.Scalar? {
      if currentCodeUnitOffset >= codeUnits.count {
        return nil
      }
      _internalInvariant(
        codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))

      let (scalar, nextOffset) = unsafe _start._decodeScalar(
        startingAt: currentCodeUnitOffset)
      currentCodeUnitOffset = nextOffset
      return scalar
    }

    /// Decodes the scalar that ends at `currentCodeUnitOffset` and moves
    /// back over it.
    ///
    /// On return, `currentCodeUnitOffset` is the start of the returned
    /// scalar, which is also where the previous scalar ends.
    ///
    /// Returns `nil` if the iterator is at the start of the `UTF8Span`.
    ///
    /// - Complexity: O(1)
    @lifetime(self: copy self)
    public mutating func previous() -> Unicode.Scalar? {
      if currentCodeUnitOffset <= 0 {
        return nil
      }
      _internalInvariant(
        codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))

      let (scalar, scalarStart) = unsafe _start._decodeScalar(
        endingAt: currentCodeUnitOffset)
      currentCodeUnitOffset = scalarStart
      return scalar
    }

    /// Moves `currentCodeUnitOffset` to the end of the current scalar
    /// without decoding it.
    ///
    /// Returns the number of `Unicode.Scalar`s skipped over, which is 0 when
    /// the iterator is already at the end of the UTF8Span.
    ///
    /// - Complexity: O(1)
    @lifetime(self: copy self)
    public mutating func skipForward() -> Int {
      if currentCodeUnitOffset >= codeUnits.count {
        return 0
      }
      _internalInvariant(
        codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))

      let length = unsafe _start._scalarLength(
        startingAt: currentCodeUnitOffset)
      currentCodeUnitOffset = currentCodeUnitOffset &+ length
      return 1
    }

    /// Moves `currentCodeUnitOffset` to the end of the next `n` scalars
    /// without decoding them.
    ///
    /// Returns the number of `Unicode.Scalar`s skipped over, which is fewer
    /// than `n` if the end of the UTF8Span is reached first.
    ///
    /// - Complexity: O(n)
    @lifetime(self: copy self)
    public mutating func skipForward(by n: Int) -> Int {
      var skipped = 0
      while skipped < n {
        // A single-step skip of 0 means the end of the span was reached.
        if skipForward() == 0 {
          break
        }
        skipped += 1
      }
      return skipped
    }

    /// Moves `currentCodeUnitOffset` to the start of the previous scalar
    /// without decoding it.
    ///
    /// Returns the number of `Unicode.Scalar`s skipped over, which is 0 when
    /// the iterator is already at the start of the UTF8Span.
    ///
    /// - Complexity: O(1)
    @lifetime(self: copy self)
    public mutating func skipBack() -> Int {
      if currentCodeUnitOffset <= 0 {
        return 0
      }
      _internalInvariant(
        codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))

      let scalarStart = unsafe _start._previousScalarStart(
        currentCodeUnitOffset)
      currentCodeUnitOffset = scalarStart
      return 1
    }

    /// Moves `currentCodeUnitOffset` to the start of the previous `n`
    /// scalars without decoding them.
    ///
    /// Returns the number of `Unicode.Scalar`s skipped over, which is fewer
    /// than `n` if the start of the UTF8Span is reached first.
    ///
    /// - Complexity: O(n)
    @lifetime(self: copy self)
    public mutating func skipBack(by n: Int) -> Int {
      var skipped = 0
      while skipped < n {
        // A single-step skip of 0 means the start of the span was reached.
        if skipBack() == 0 {
          break
        }
        skipped += 1
      }
      return skipped
    }

    /// Repositions the iterator at the nearest scalar-aligned code unit
    /// offset `<= i`.
    ///
    ///     func printScalarAfterReset(_ string: borrowing String) {
    ///       var iterator = string.utf8Span.makeUnicodeScalarIterator()
    ///       iterator.reset(roundingBackwardsFrom: 8) // Position 8 is mid-emoji, rounds back to 6
    ///       if let scalar = iterator.next() {
    ///         print(scalar) // Prints "🌍" (emoji starts at byte 6)
    ///       }
    ///     }
    ///     let string = "Hello 🌍"
    ///     printScalarAfterReset(string)
    ///
    /// - Complexity: O(1)
    @lifetime(self: copy self)
    public mutating func reset(roundingBackwardsFrom i: Int) {
      let aligned = codeUnits._scalarAlignBackwards(i)
      currentCodeUnitOffset = aligned
    }

    /// Repositions the iterator at the nearest scalar-aligned code unit
    /// offset `>= i`.
    ///
    /// - Complexity: O(1)
    @lifetime(self: copy self)
    public mutating func reset(roundingForwardsFrom i: Int) {
      let aligned = codeUnits._scalarAlignForwards(i)
      currentCodeUnitOffset = aligned
    }

    // TODO: for below, verify that there is no path to UB, just garbage-data
    // or guaranteed trap!

    /// Repositions the iterator at `codeUnitOffset`, bypassing _all_ safety
    /// checks (including bounds checks).
    ///
    /// Note: This exists only for very specific, low-level use cases. If
    /// `codeUnitOffset` is not properly scalar-aligned, later operations
    /// such as `next()` can result in undefined behavior.
    ///
    /// For example, a regex engine could use this to backtrack to a
    /// previous position that is already known to be valid.
    ///
    /// - Complexity: O(1)
    @unsafe
    @lifetime(self: copy self)
    public mutating func reset(toUnchecked codeUnitOffset: Int) {
      _internalInvariant(
        codeUnits._isScalarAligned(unchecked: codeUnitOffset))
      currentCodeUnitOffset = codeUnitOffset
    }

    /// Returns a UTF8Span holding all the content before the iterator's
    /// current position.
    ///
    /// The resulting `UTF8Span` has the same lifetime constraints as `self`.
    ///
    /// - Complexity: O(1)
    @lifetime(copy self)
    public func prefix() -> UTF8Span {
      let bounds = 0..<currentCodeUnitOffset
      let head = codeUnits.span.extracting(bounds)
      return UTF8Span(
        _uncheckedAssumingValidUTF8: head,
        isKnownASCII: codeUnits.isKnownASCII,
        isKnownNFC: codeUnits.isKnownNFC)
    }

    /// Returns a UTF8Span holding all the content from the iterator's
    /// current position to the end.
    ///
    /// The resulting `UTF8Span` has the same lifetime constraints as `self`.
    ///
    /// - Complexity: O(1)
    @lifetime(copy self)
    public func suffix() -> UTF8Span {
      let bounds = currentCodeUnitOffset..<codeUnits.count
      let tail = codeUnits.span.extracting(bounds)
      return UTF8Span(
        _uncheckedAssumingValidUTF8: tail,
        isKnownASCII: codeUnits.isKnownASCII,
        isKnownNFC: codeUnits.isKnownNFC)
    }
  }
}
|
||
|
||
@available(SwiftStdlib 6.2, *)
@_unavailableInEmbedded
extension UTF8Span {
  /// Returns an iterator that will construct `Character`s from the underlying
  /// UTF-8 content.
  ///
  /// The resulting iterator has the same lifetime constraints as `self`.
  @lifetime(copy self)
  public func makeCharacterIterator() -> CharacterIterator {
    .init(self)
  }

  /// Iterate the `Character` contents of a `UTF8Span`.
  ///
  ///     func countCharacters(_ string: borrowing String) {
  ///       var iterator = string.utf8Span.makeCharacterIterator()
  ///       var count = 0
  ///       while let character = iterator.next() {
  ///         count += 1
  ///         print("Character \(count): \(character)")
  ///       }
  ///       print("Total: \(count) characters")
  ///     }
  ///
  ///     let string = "لاهور"
  ///     countCharacters(string)
  ///     // Prints "Character 1: ل"
  ///     // Prints "Character 2: ا"
  ///     // Prints "Character 3: ه"
  ///     // Prints "Character 4: و"
  ///     // Prints "Character 5: ر"
  ///     // Prints "Total: 5 characters"
  public struct CharacterIterator: ~Escapable {
    /// The UTF-8 content being iterated.
    public let codeUnits: UTF8Span

    /// The byte offset of the start of the next `Character`. This is always
    /// scalar-aligned. It is always `Character`-aligned relative to the last
    /// call to `reset` (or the start of the span if not called).
    fileprivate(set)
    public var currentCodeUnitOffset: Int

    /// Creates an iterator over `codeUnits`, positioned at offset 0.
    @lifetime(copy codeUnits)
    public init(_ codeUnits: UTF8Span) {
      self.codeUnits = codeUnits
      self.currentCodeUnitOffset = 0
    }

    // Raw pointer vended by `codeUnits._start()`; the decoding helpers used
    // below are invoked on it together with a code unit offset.
    private var _start: UnsafeRawPointer {
      unsafe codeUnits._start()
    }

    /// Return the `Character` starting at `currentCodeUnitOffset`. After the
    /// function returns, `currentCodeUnitOffset` holds the position at the
    /// end of the `Character`, which is also the start of the next
    /// `Character`.
    ///
    /// Returns `nil` if at the end of the `UTF8Span`.
    @lifetime(self: copy self)
    public mutating func next() -> Character? {
      guard currentCodeUnitOffset < codeUnits.count else { return nil }

      _internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))
      let (result, newPos) = unsafe _start._decodeCharacter(
        startingAt: currentCodeUnitOffset,
        limitedBy: codeUnits.count
      )
      self.currentCodeUnitOffset = newPos
      return result
    }

    /// Return the `Character` ending at `currentCodeUnitOffset`. After the
    /// function returns, `currentCodeUnitOffset` holds the position at the
    /// start of the returned `Character`, which is also the end of the
    /// previous `Character`.
    ///
    /// Returns `nil` if at the start of the `UTF8Span`.
    @lifetime(self: copy self)
    public mutating func previous() -> Character? {
      guard currentCodeUnitOffset > 0 else { return nil }

      _internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))
      let (result, newPos) = unsafe _start._decodeCharacter(
        endingAt: currentCodeUnitOffset,
        limitedBy: codeUnits.count)
      self.currentCodeUnitOffset = newPos
      return result
    }

    /// Advance `currentCodeUnitOffset` to the end of the current `Character`,
    /// without constructing it.
    ///
    /// Returns the number of `Character`s skipped over, which can be 0
    /// if at the end of the UTF8Span.
    @lifetime(self: copy self)
    public mutating func skipForward() -> Int {
      guard currentCodeUnitOffset < codeUnits.count else {
        return 0
      }

      _internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))

      self.currentCodeUnitOffset = unsafe _start._nextCharacterStart(currentCodeUnitOffset, limitedBy: codeUnits.count)
      return 1
    }

    /// Advance `currentCodeUnitOffset` to the end of `n` `Character`s, without
    /// constructing them.
    ///
    /// Returns the number of `Character`s skipped over, which can be
    /// fewer than `n` if at the end of the UTF8Span.
    @lifetime(self: copy self)
    public mutating func skipForward(by n: Int) -> Int {
      var numSkipped = 0
      // `skipForward()` returns 0 once the end of the span is reached.
      while numSkipped < n && skipForward() != 0 {
        numSkipped += 1
      }

      return numSkipped
    }

    /// Move `currentCodeUnitOffset` to the start of the previous `Character`,
    /// without constructing it.
    ///
    /// Returns the number of `Character`s skipped over, which can be 0
    /// if at the start of the UTF8Span.
    @lifetime(self: copy self)
    public mutating func skipBack() -> Int {
      guard currentCodeUnitOffset > 0 else {
        return 0
      }

      _internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))

      currentCodeUnitOffset = unsafe _start._previousCharacterStart(currentCodeUnitOffset, limitedBy: codeUnits.count)
      return 1
    }

    /// Move `currentCodeUnitOffset` to the start of the previous `n` `Character`s,
    /// without constructing them.
    ///
    /// Returns the number of `Character`s skipped over, which can be
    /// fewer than `n` if at the start of the UTF8Span.
    @lifetime(self: copy self)
    public mutating func skipBack(by n: Int) -> Int {
      var numSkipped = 0
      // `skipBack()` returns 0 once the start of the span is reached.
      while numSkipped < n && skipBack() != 0 {
        numSkipped += 1
      }

      return numSkipped
    }

    /// Reset to the nearest scalar-aligned code unit offset `<= i`.
    ///
    /// Note: The position is rounded to a scalar boundary, not to a
    /// `Character` boundary. Subsequent iteration treats the new position as
    /// `Character`-aligned, so resetting to a position that is
    /// scalar-aligned but not `Character`-aligned may give different results
    /// than iterating `Character`s from the start of the span.
    @lifetime(self: copy self)
    public mutating func reset(roundingBackwardsFrom i: Int) {
      self.currentCodeUnitOffset = codeUnits._scalarAlignBackwards(i)
    }

    /// Reset to the nearest scalar-aligned code unit offset `>= i`.
    ///
    /// Note: The position is rounded to a scalar boundary, not to a
    /// `Character` boundary. Subsequent iteration treats the new position as
    /// `Character`-aligned, so resetting to a position that is
    /// scalar-aligned but not `Character`-aligned may give different results
    /// than iterating `Character`s from the start of the span.
    @lifetime(self: copy self)
    public mutating func reset(roundingForwardsFrom i: Int) {
      self.currentCodeUnitOffset = codeUnits._scalarAlignForwards(i)
    }

    /// Reset this iterator to `codeUnitOffset`, skipping _all_ safety
    /// checks.
    ///
    /// Note: This is only for very specific, low-level use cases. If
    /// `codeUnitOffset` is not properly scalar-aligned, this function can
    /// result in undefined behavior when, e.g., `next()` is called.
    ///
    /// If `codeUnitOffset` is scalar-aligned, but not `Character`-aligned,
    /// you may get different results from running `Character` iteration.
    ///
    /// For example, this could be used by a regex engine to backtrack to a
    /// known-valid previous position.
    @unsafe
    @lifetime(self: copy self)
    public mutating func reset(toUnchecked codeUnitOffset: Int) {
      _internalInvariant(codeUnits._isScalarAligned(unchecked: codeUnitOffset))
      self.currentCodeUnitOffset = codeUnitOffset
    }

    /// Returns the UTF8Span containing all the content up to the iterator's
    /// current position.
    @lifetime(copy self)
    public func prefix() -> UTF8Span {
      let slice = codeUnits.span.extracting(0..<currentCodeUnitOffset)
      return UTF8Span(
        _uncheckedAssumingValidUTF8: slice,
        isKnownASCII: codeUnits.isKnownASCII,
        isKnownNFC: codeUnits.isKnownNFC)
    }

    /// Returns the UTF8Span containing all the content after the iterator's
    /// current position.
    @lifetime(copy self)
    public func suffix() -> UTF8Span {
      let slice = codeUnits.span.extracting(currentCodeUnitOffset..<codeUnits.count)
      return UTF8Span(
        _uncheckedAssumingValidUTF8: slice,
        isKnownASCII: codeUnits.isKnownASCII,
        isKnownNFC: codeUnits.isKnownNFC)
    }
  }
}
|
||
|
||
|