mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
Add support for UTF8Span Also, refactor validation and grapheme breaking
This commit is contained in:
@@ -208,6 +208,13 @@ add_library(swiftCore
|
||||
UnsafeRawPointer.swift
|
||||
UTFEncoding.swift
|
||||
UTF8.swift
|
||||
UTF8EncodingError.swift
|
||||
UTF8Span.swift
|
||||
UTF8SpanBits.swift
|
||||
UTF8SpanComparisons.swift
|
||||
UTF8SpanFundamentals.swift
|
||||
UTF8SpanInternalHelpers.swift
|
||||
UTF8SpanIterators.swift
|
||||
UTF16.swift
|
||||
UTF32.swift
|
||||
Unicode.swift # ORDER DEPENDENCY: must follow new unicode support
|
||||
|
||||
@@ -214,6 +214,13 @@ split_embedded_sources(
|
||||
EMBEDDED UnsafeRawPointer.swift
|
||||
EMBEDDED UTFEncoding.swift
|
||||
EMBEDDED UTF8.swift
|
||||
EMBEDDED UTF8EncodingError.swift
|
||||
EMBEDDED UTF8Span.swift
|
||||
EMBEDDED UTF8SpanBits.swift
|
||||
EMBEDDED UTF8SpanComparisons.swift
|
||||
EMBEDDED UTF8SpanFundamentals.swift
|
||||
EMBEDDED UTF8SpanInternalHelpers.swift
|
||||
EMBEDDED UTF8SpanIterators.swift
|
||||
EMBEDDED UTF16.swift
|
||||
EMBEDDED UTF32.swift
|
||||
EMBEDDED Unicode.swift # ORDER DEPENDENCY: must follow new unicode support
|
||||
|
||||
@@ -205,6 +205,15 @@
|
||||
"RawSpan.swift",
|
||||
"Span.swift"
|
||||
],
|
||||
"UTF8Span": [
|
||||
"UTF8EncodingError.swift",
|
||||
"UTF8Span.swift",
|
||||
"UTF8SpanBits.swift",
|
||||
"UTF8SpanComparisons.swift",
|
||||
"UTF8SpanFundamentals.swift",
|
||||
"UTF8SpanInternalHelpers.swift",
|
||||
"UTF8SpanIterators.swift"
|
||||
],
|
||||
"Protocols": [
|
||||
"CompilerProtocols.swift",
|
||||
"ShadowProtocols.swift"
|
||||
|
||||
@@ -1112,108 +1112,4 @@ extension String {
|
||||
}
|
||||
}
|
||||
|
||||
extension _StringGutsSlice {
  /// Performs one step of the NFC quick check for `scalar`.
  ///
  /// - Parameters:
  ///   - scalar: The scalar to examine.
  ///   - prevCCC: The `ccc` value recorded for the preceding scalar. It is
  ///     overwritten with this scalar's `ccc` only when the check passes.
  /// - Returns: `false` if `scalar` has a non-zero `ccc` that is lower than
  ///   `prevCCC`, or if its normalization data reports it as not NFC
  ///   quick-check; `true` otherwise.
  internal func _isScalarNFCQC(
    _ scalar: Unicode.Scalar,
    _ prevCCC: inout UInt8
  ) -> Bool {
    let data = Unicode._NormData(scalar, fastUpperbound: 0x300)
    let ccc = data.ccc

    // A non-zero ccc must never be smaller than the one that came before it.
    let isOutOfOrder = ccc != 0 && prevCCC > ccc
    guard !isOutOfOrder, data.isNFCQC else {
      return false
    }

    prevCCC = ccc
    return true
  }

  /// Calls `f` with each UTF-8 code unit of this slice in NFC.
  ///
  /// When the slice is flagged as NFC, or passes the quick check, its bytes
  /// are forwarded unchanged; otherwise the scalars are run through
  /// `_internalNFC` and re-encoded as UTF-8.
  ///
  /// - Parameter f: Receives one code unit at a time; any error it throws is
  ///   rethrown.
  internal func _withNFCCodeUnits(_ f: (UInt8) throws -> Void) rethrows {
    let slice = String(_guts)[range]

    // Fast path: If we're already NFC (or ASCII), there is nothing to do.
    if _fastPath(_guts.isNFC) {
      try slice.utf8.forEach(f)
      return
    }

    var prevCCC: UInt8 = 0
    var isNFCQC = true

    if _guts.isFastUTF8 {
      _fastNFCCheck(&isNFCQC, &prevCCC)

      // With fast UTF-8 available, read the bytes directly rather than going
      // through the UTF8 view on String.
      if isNFCQC {
        try unsafe withFastUTF8 { bytes in
          for unsafe byte in unsafe bytes {
            try f(byte)
          }
        }
        return
      }
    } else {
      // Stops at the first scalar that fails the quick check.
      isNFCQC = slice.unicodeScalars.allSatisfy {
        _isScalarNFCQC($0, &prevCCC)
      }

      if isNFCQC {
        try slice.utf8.forEach(f)
        return
      }
    }

    // Slow path: normalize, then hand out the re-encoded bytes.
    for scalar in slice.unicodeScalars._internalNFC {
      try scalar.withUTF8CodeUnits { units in
        for unsafe unit in unsafe units {
          try f(unit)
        }
      }
    }
  }

  /// Runs the NFC quick check over this slice's fast UTF-8 storage.
  ///
  /// - Parameters:
  ///   - isNFCQC: Set to `false` as soon as a scalar fails the check; left
  ///     untouched otherwise.
  ///   - prevCCC: Running `ccc` state, updated while scanning.
  internal func _fastNFCCheck(_ isNFCQC: inout Bool, _ prevCCC: inout UInt8) {
    unsafe withFastUTF8 { utf8 in
      var offset = 0

      while offset < utf8.count {
        let lead = unsafe utf8[offset]

        // A lead byte of 0xCC or above may encode a scalar at or above
        // U+0300, so it has to be decoded and checked properly.
        guard lead < 0xCC else {
          let (scalar, length) = unsafe _decodeScalar(utf8, startingAt: offset)

          guard _isScalarNFCQC(scalar, &prevCCC) else {
            isNFCQC = false
            return
          }

          offset &+= length
          continue
        }

        // Everything below U+0300 is already NFC: a lead byte under 0xC0 is
        // a single-byte (ASCII) scalar, anything else here is a two-byte
        // sequence.
        offset &+= (lead < 0xC0) ? 1 : 2

        // These scalars are treated as having a ccc of 0.
        prevCCC = 0
      }
    }
  }
}
|
||||
|
||||
@@ -97,7 +97,7 @@ internal func _stringCompareInternal(
|
||||
}
|
||||
|
||||
@_effects(readonly)
|
||||
private func _stringCompareFastUTF8(
|
||||
internal func _stringCompareFastUTF8(
|
||||
_ utf8Left: UnsafeBufferPointer<UInt8>,
|
||||
_ utf8Right: UnsafeBufferPointer<UInt8>,
|
||||
expecting: _StringComparisonResult,
|
||||
|
||||
@@ -117,7 +117,7 @@ extension String {
|
||||
return unsafe (String._uncheckedFromUTF8(
|
||||
input, asciiPreScanResult: extraInfo.isASCII
|
||||
), false)
|
||||
case .error(let initialRange):
|
||||
case .error(_, let initialRange):
|
||||
return unsafe (repairUTF8(input, firstKnownBrokenRange: initialRange), true)
|
||||
}
|
||||
}
|
||||
@@ -139,7 +139,7 @@ extension String {
|
||||
newIsASCII: info.isASCII
|
||||
)
|
||||
return result.asString
|
||||
case .error(let initialRange):
|
||||
case .error(_, let initialRange):
|
||||
defer { _fixLifetime(result) }
|
||||
//This could be optimized to use excess tail capacity
|
||||
return unsafe repairUTF8(result.codeUnits, firstKnownBrokenRange: initialRange)
|
||||
|
||||
@@ -13,14 +13,18 @@
|
||||
import SwiftShims
|
||||
|
||||
/// CR and LF are common special cases in grapheme breaking logic
|
||||
private var _CR: UInt8 { return 0x0d }
|
||||
private var _LF: UInt8 { return 0x0a }
|
||||
private var _CR: UInt8 { return 0x0D }
|
||||
private var _LF: UInt8 { return 0x0A }
|
||||
|
||||
internal func _hasGraphemeBreakBetween(
|
||||
/// Perform a quick-check to determine if there's a grapheme-break between two
|
||||
/// scalars, without consulting the data tables. Returns true if there
|
||||
/// definitely is a break, false if there definitely is none, and nil if a
|
||||
/// break couldn't be determined
|
||||
internal func _quickHasGraphemeBreakBetween(
|
||||
_ lhs: Unicode.Scalar, _ rhs: Unicode.Scalar
|
||||
) -> Bool {
|
||||
|
||||
// CR-LF is a special case: no break between these
|
||||
) -> Bool? {
|
||||
// GB3:
|
||||
// CR-LF is a special case: no break between these
|
||||
if lhs == Unicode.Scalar(_CR) && rhs == Unicode.Scalar(_LF) {
|
||||
return false
|
||||
}
|
||||
@@ -80,7 +84,10 @@ internal func _hasGraphemeBreakBetween(
|
||||
default: return false
|
||||
}
|
||||
}
|
||||
return hasBreakWhenPaired(lhs) && hasBreakWhenPaired(rhs)
|
||||
if hasBreakWhenPaired(lhs) && hasBreakWhenPaired(rhs) {
|
||||
return true
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
extension _StringGuts {
|
||||
@@ -513,6 +520,8 @@ extension Unicode {
|
||||
internal var _previous: Unicode.Scalar
|
||||
internal var _state: _GraphemeBreakingState
|
||||
|
||||
/// Refactoring TODO: should we use a quick check result?
|
||||
///
|
||||
/// Returns a non-nil value if it can be determined whether there is a
|
||||
/// grapheme break between `scalar1` and `scalar2` without knowing anything
|
||||
/// about the scalars that precede `scalar1`. This can optionally be used as
|
||||
@@ -523,13 +532,7 @@ extension Unicode {
|
||||
between scalar1: Unicode.Scalar,
|
||||
and scalar2: Unicode.Scalar
|
||||
) -> Bool? {
|
||||
if scalar1.value == 0xD, scalar2.value == 0xA {
|
||||
return false
|
||||
}
|
||||
if _hasGraphemeBreakBetween(scalar1, scalar2) {
|
||||
return true
|
||||
}
|
||||
return nil
|
||||
_quickHasGraphemeBreakBetween(scalar1, scalar2)
|
||||
}
|
||||
|
||||
/// Initialize a new character recognizer at the _start of text_ (sot)
|
||||
@@ -637,59 +640,76 @@ extension _StringGuts {
|
||||
nextScalar: (Int) -> (scalar: Unicode.Scalar, end: Int)?
|
||||
) -> Int {
|
||||
_internalInvariant(index < endIndex._encodedOffset)
|
||||
return _nextGraphemeClusterBoundary(startingAt: index, nextScalar: nextScalar)
|
||||
}
|
||||
}
|
||||
|
||||
// Note: If `index` isn't already on a boundary, then starting with an empty
|
||||
// state here sometimes leads to this method returning results that diverge
|
||||
// from the true breaks in the string.
|
||||
var state = _GraphemeBreakingState()
|
||||
var (scalar, index) = nextScalar(index)!
|
||||
internal func _nextGraphemeClusterBoundary(
|
||||
startingAt index: Int,
|
||||
nextScalar: (Int) -> (scalar: Unicode.Scalar, end: Int)?
|
||||
) -> Int {
|
||||
|
||||
while true {
|
||||
guard let (scalar2, nextIndex) = nextScalar(index) else { break }
|
||||
if state.shouldBreak(between: scalar, and: scalar2) {
|
||||
break
|
||||
}
|
||||
index = nextIndex
|
||||
scalar = scalar2
|
||||
// Note: If `index` isn't already on a boundary, then starting with an empty
|
||||
// state here sometimes leads to this method returning results that diverge
|
||||
// from the true breaks in the string.
|
||||
var state = _GraphemeBreakingState()
|
||||
var (scalar, index) = nextScalar(index)!
|
||||
|
||||
while true {
|
||||
guard let (scalar2, nextIndex) = nextScalar(index) else { break }
|
||||
if state.shouldBreak(between: scalar, and: scalar2) {
|
||||
break
|
||||
}
|
||||
|
||||
return index
|
||||
index = nextIndex
|
||||
scalar = scalar2
|
||||
}
|
||||
|
||||
// Returns the stride of the grapheme cluster ending at offset `index`.
|
||||
//
|
||||
// This method uses `previousScalar` to look back in the string as far as
|
||||
// necessary to find a correct grapheme cluster boundary, whether or not
|
||||
// `index` happens to be on a boundary itself.
|
||||
internal func previousBoundary(
|
||||
return index
|
||||
}
|
||||
|
||||
extension _StringGuts {
|
||||
fileprivate func previousBoundary(
|
||||
endingAt index: Int,
|
||||
previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
|
||||
) -> Int {
|
||||
// FIXME: This requires potentially arbitrary lookback in each iteration,
|
||||
// leading to quadratic behavior in some edge cases. Ideally lookback should
|
||||
// only be done once per cluster (or in the case of RI sequences, once per
|
||||
// flag sequence). One way to avoid most quadratic behavior is to replace
|
||||
// this implementation with a scheme that first searches backwards for a
|
||||
// safe point then iterates forward using the regular `shouldBreak` until we
|
||||
// reach `index`, as recommended in section 6.4 of TR#29.
|
||||
//
|
||||
// https://www.unicode.org/reports/tr29/#Random_Access
|
||||
|
||||
var (scalar2, index) = previousScalar(index)!
|
||||
|
||||
while true {
|
||||
guard let (scalar1, previousIndex) = previousScalar(index) else { break }
|
||||
if shouldBreakWithLookback(
|
||||
between: scalar1, and: scalar2, at: index, with: previousScalar
|
||||
) {
|
||||
break
|
||||
}
|
||||
index = previousIndex
|
||||
scalar2 = scalar1
|
||||
}
|
||||
|
||||
return index
|
||||
_previousGraphemeClusterBoundary(endingAt: index, previousScalar: previousScalar)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Returns the start offset of the grapheme cluster that ends at offset
// `index`.
//
// `previousScalar` is asked for the scalar ending at a given offset and yields
// that scalar together with its start offset, or nil when there is none. It is
// consulted as far back as necessary to find a correct grapheme cluster
// boundary, whether or not `index` happens to be on a boundary itself.
//
// Traps if there is no scalar ending at `index`.
internal func _previousGraphemeClusterBoundary(
  endingAt index: Int,
  previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
) -> Int {
  // FIXME: Each iteration may need arbitrary lookback, which makes some edge
  // cases quadratic. Ideally lookback should happen only once per cluster (or,
  // for RI sequences, once per flag sequence). Most of the quadratic behavior
  // could be avoided by first searching backwards for a safe point and then
  // iterating forward with the regular `shouldBreak` until `index` is reached,
  // as recommended in section 6.4 of TR#29.
  //
  // https://www.unicode.org/reports/tr29/#Random_Access

  // `trailing` is the right-hand scalar of the pair under test; `boundary` is
  // its start offset, i.e. the current candidate answer.
  var (trailing, boundary) = previousScalar(index)!

  // Keep absorbing preceding scalars until a break is found or the text runs
  // out.
  while let (leading, leadingStart) = previousScalar(boundary) {
    let breaksHere = _shouldBreakWithLookback(
      between: leading, and: trailing, at: boundary, with: previousScalar
    )
    if breaksHere {
      break
    }

    boundary = leadingStart
    trailing = leading
  }

  return boundary
}
|
||||
|
||||
extension _GraphemeBreakingState {
|
||||
@@ -708,13 +728,8 @@ extension _GraphemeBreakingState {
|
||||
between scalar1: Unicode.Scalar,
|
||||
and scalar2: Unicode.Scalar
|
||||
) -> Bool {
|
||||
// GB3
|
||||
if scalar1.value == 0xD, scalar2.value == 0xA {
|
||||
return false
|
||||
}
|
||||
|
||||
if _hasGraphemeBreakBetween(scalar1, scalar2) {
|
||||
return true
|
||||
if let result = _quickHasGraphemeBreakBetween(scalar1, scalar2) {
|
||||
return result
|
||||
}
|
||||
|
||||
let x = Unicode._GraphemeBreakProperty(from: scalar1)
|
||||
@@ -868,289 +883,282 @@ extension _GraphemeBreakingState {
|
||||
}
|
||||
}
|
||||
|
||||
extension _StringGuts {
|
||||
// Return true if there is an extended grapheme cluster boundary between two
|
||||
// scalars, with no previous knowledge about preceding scalars.
|
||||
//
|
||||
// This method looks back as far as it needs to determine the correct
|
||||
// placement of boundaries.
|
||||
//
|
||||
// This is based off of the Unicode Annex #29 for [Grapheme Cluster Boundary
|
||||
// Rules](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules).
|
||||
internal func shouldBreakWithLookback(
|
||||
between scalar1: Unicode.Scalar,
|
||||
and scalar2: Unicode.Scalar,
|
||||
at index: Int,
|
||||
with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
|
||||
) -> Bool {
|
||||
// GB3
|
||||
if scalar1.value == 0xD, scalar2.value == 0xA {
|
||||
return false
|
||||
}
|
||||
|
||||
if _hasGraphemeBreakBetween(scalar1, scalar2) {
|
||||
return true
|
||||
}
|
||||
|
||||
let x = Unicode._GraphemeBreakProperty(from: scalar1)
|
||||
let y = Unicode._GraphemeBreakProperty(from: scalar2)
|
||||
|
||||
switch (x, y) {
|
||||
|
||||
// Fast path: If we know our scalars have no properties the decision is
|
||||
// trivial and we don't need to crawl to the default statement.
|
||||
case (.any, .any):
|
||||
return true
|
||||
|
||||
// GB4
|
||||
case (.control, _):
|
||||
return true
|
||||
|
||||
// GB5
|
||||
case (_, .control):
|
||||
return true
|
||||
|
||||
// GB6
|
||||
case (.l, .l),
|
||||
(.l, .v),
|
||||
(.l, .lv),
|
||||
(.l, .lvt):
|
||||
return false
|
||||
|
||||
// GB7
|
||||
case (.lv, .v),
|
||||
(.v, .v),
|
||||
(.lv, .t),
|
||||
(.v, .t):
|
||||
return false
|
||||
|
||||
// GB8
|
||||
case (.lvt, .t),
|
||||
(.t, .t):
|
||||
return false
|
||||
|
||||
// GB9
|
||||
case (_, .extend),
|
||||
(_, .zwj):
|
||||
return false
|
||||
|
||||
// GB9a
|
||||
case (_, .spacingMark):
|
||||
return false
|
||||
|
||||
// GB9b
|
||||
case (.prepend, _):
|
||||
return false
|
||||
|
||||
// GB11
|
||||
case (.zwj, .extendedPictographic):
|
||||
return !checkIfInEmojiSequence(at: index, with: previousScalar)
|
||||
|
||||
// GB12 & GB13
|
||||
case (.regionalIndicator, .regionalIndicator):
|
||||
return countRIs(at: index, with: previousScalar)
|
||||
|
||||
// GB999
|
||||
default:
|
||||
// GB9c
|
||||
//
|
||||
// Check if our rhs is an InCB=Consonant first because we can more easily
|
||||
// exit out of this branch in most cases. Otherwise, this is a consonant.
|
||||
// Check that the lhs is an InCB=Extend or InCB=Linker (we have to check
|
||||
// if it's an .extend or .zwj first because _isInCBExtend assumes that it
|
||||
// is true).
|
||||
if scalar2._isInCBConsonant,
|
||||
(x == .extend || x == .zwj),
|
||||
(scalar1._isInCBExtend || scalar1._isInCBLinker) {
|
||||
return !checkIfInIndicSequence(at: index, with: previousScalar)
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
// Return true if there is an extended grapheme cluster boundary between two
|
||||
// scalars, with no previous knowledge about preceding scalars.
|
||||
//
|
||||
// This method looks back as far as it needs to determine the correct
|
||||
// placement of boundaries.
|
||||
//
|
||||
// This is based off of the Unicode Annex #29 for [Grapheme Cluster Boundary
|
||||
// Rules](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules).
|
||||
fileprivate func _shouldBreakWithLookback(
|
||||
between scalar1: Unicode.Scalar,
|
||||
and scalar2: Unicode.Scalar,
|
||||
at index: Int,
|
||||
with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
|
||||
) -> Bool {
|
||||
if let result = _quickHasGraphemeBreakBetween(scalar1, scalar2) {
|
||||
return result
|
||||
}
|
||||
|
||||
// When walking backwards, it's impossible to know whether we were in an emoji
|
||||
// sequence without walking further backwards. This walks the string backwards
|
||||
// enough until we figure out whether or not to break our
|
||||
// (.zwj, .extendedPictographic) question. For example:
|
||||
//
|
||||
// Scalar view #1:
|
||||
//
|
||||
// [.control, .zwj, .extendedPictographic]
|
||||
// ^
|
||||
// | = To determine whether or not we break here, we need
|
||||
// to see the previous scalar's grapheme property.
|
||||
// ^
|
||||
// | = This is neither .extendedPictographic nor .extend, thus we
|
||||
// were never in an emoji sequence, so break between the .zwj
|
||||
// and .extendedPictographic.
|
||||
//
|
||||
// Scalar view #2:
|
||||
//
|
||||
// [.extendedPictographic, .zwj, .extendedPictographic]
|
||||
// ^
|
||||
// | = Same as above, move backwards one to
|
||||
// view the previous scalar's property.
|
||||
// ^
|
||||
// | = This is an .extendedPictographic, so this indicates that
|
||||
// we are in an emoji sequence, so we should NOT break
|
||||
// between the .zwj and .extendedPictographic.
|
||||
//
|
||||
// Scalar view #3:
|
||||
//
|
||||
// [.extendedPictographic, .extend, .extend, .zwj, .extendedPictographic]
|
||||
// ^
|
||||
// | = Same as above
|
||||
// ^
|
||||
// | = This is an .extend which means
|
||||
// there is a potential emoji
|
||||
// sequence, walk further backwards
|
||||
// to find an .extendedPictographic.
|
||||
//
|
||||
// <-- = Another extend, go backwards more.
|
||||
// ^
|
||||
// | = We found our starting .extendedPictographic letting us
|
||||
// know that we are in an emoji sequence so our initial
|
||||
// break question is answered as NO.
|
||||
internal func checkIfInEmojiSequence(
|
||||
at index: Int,
|
||||
with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
|
||||
) -> Bool {
|
||||
guard var i = previousScalar(index)?.start else { return false }
|
||||
while let prev = previousScalar(i) {
|
||||
i = prev.start
|
||||
let gbp = Unicode._GraphemeBreakProperty(from: prev.scalar)
|
||||
let x = Unicode._GraphemeBreakProperty(from: scalar1)
|
||||
let y = Unicode._GraphemeBreakProperty(from: scalar2)
|
||||
|
||||
switch gbp {
|
||||
case .extend:
|
||||
continue
|
||||
case .extendedPictographic:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
switch (x, y) {
|
||||
|
||||
// Fast path: If we know our scalars have no properties the decision is
|
||||
// trivial and we don't need to crawl to the default statement.
|
||||
case (.any, .any):
|
||||
return true
|
||||
|
||||
// GB4
|
||||
case (.control, _):
|
||||
return true
|
||||
|
||||
// GB5
|
||||
case (_, .control):
|
||||
return true
|
||||
|
||||
// GB6
|
||||
case (.l, .l),
|
||||
(.l, .v),
|
||||
(.l, .lv),
|
||||
(.l, .lvt):
|
||||
return false
|
||||
}
|
||||
|
||||
// When walking backwards, it's impossible to know whether we break when we
|
||||
// see our first (InCB=Extend, InCB=Consonant) or (InCB=Linker, InCB=Consonant)
|
||||
// without walking further backwards. This walks the string backwards enough
|
||||
// until we figure out whether or not to break this indic sequence. For example:
|
||||
//
|
||||
// Scalar view #1:
|
||||
//
|
||||
// [InCB=Linker, InCB=Extend, InCB=Consonant]
|
||||
// ^
|
||||
// | = To be able to know whether or not to
|
||||
// break these two, we need to walk
|
||||
// backwards to determine if this is a
|
||||
// legitimate indic sequence.
|
||||
// ^
|
||||
// | = The scalar sequence ends without a starting InCB=Consonant,
|
||||
// so this is in fact not an indic sequence, so we can break the two.
|
||||
//
|
||||
// Scalar view #2:
|
||||
//
|
||||
// [InCB=Consonant, InCB=Linker, InCB=Extend, InCB=Consonant]
|
||||
// ^
|
||||
// | = Same as above
|
||||
// ^
|
||||
// | = This is a Linker, so we at least have seen
|
||||
// 1 to be able to return true if we see a
|
||||
// consonant later.
|
||||
// ^
|
||||
// | = Is a consonant and we've seen a linker, so this is a
|
||||
// legitimate indic sequence, so do NOT break the initial question.
|
||||
internal func checkIfInIndicSequence(
|
||||
at index: Int,
|
||||
with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
|
||||
) -> Bool {
|
||||
guard let p = previousScalar(index) else { return false }
|
||||
|
||||
var hasSeenInCBLinker = p.scalar._isInCBLinker
|
||||
var i = p.start
|
||||
|
||||
while let (scalar, prev) = previousScalar(i) {
|
||||
i = prev
|
||||
|
||||
if scalar._isInCBConsonant {
|
||||
return hasSeenInCBLinker
|
||||
}
|
||||
|
||||
let gbp = Unicode._GraphemeBreakProperty(from: scalar)
|
||||
|
||||
guard gbp == .extend || gbp == .zwj else {
|
||||
return false
|
||||
}
|
||||
|
||||
switch (scalar._isInCBExtend, scalar._isInCBLinker) {
|
||||
case (false, false):
|
||||
return false
|
||||
|
||||
case (false, true):
|
||||
hasSeenInCBLinker = true
|
||||
|
||||
case (true, false):
|
||||
continue
|
||||
|
||||
case (true, true):
|
||||
// This case should never happen, but if it does then just be cautious
|
||||
// and say this is invalid.
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// GB7
|
||||
case (.lv, .v),
|
||||
(.v, .v),
|
||||
(.lv, .t),
|
||||
(.v, .t):
|
||||
return false
|
||||
}
|
||||
|
||||
// When walking backwards, it's impossible to know whether we break when we
|
||||
// see our first (.regionalIndicator, .regionalIndicator) without walking
|
||||
// further backwards. This walks the string backwards enough until we figure
|
||||
// out whether or not to break these RIs. For example:
|
||||
//
|
||||
// Scalar view #1:
|
||||
//
|
||||
// [.control, .regionalIndicator, .regionalIndicator]
|
||||
// ^
|
||||
// | = To be able to know whether or not to
|
||||
// break these two, we need to walk
|
||||
// backwards to determine if there were
|
||||
// any previous .regionalIndicators in
|
||||
// a row.
|
||||
// ^
|
||||
// | = Not a .regionalIndicator, so our total riCount is 0 and 0 is
|
||||
// even thus we do not break.
|
||||
//
|
||||
// Scalar view #2:
|
||||
//
|
||||
// [.control, .regionalIndicator, .regionalIndicator, .regionalIndicator]
|
||||
// ^
|
||||
// | = Same as above
|
||||
// ^
|
||||
// | = This is a .regionalIndicator, so continue
|
||||
// walking backwards for more of them. riCount is
|
||||
// now equal to 1.
|
||||
// ^
|
||||
// | = Not a .regionalIndicator. riCount = 1 which is odd, so break
|
||||
// the last two .regionalIndicators.
|
||||
internal func countRIs(
|
||||
at index: Int,
|
||||
with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
|
||||
) -> Bool {
|
||||
guard let p = previousScalar(index) else { return false }
|
||||
var i = p.start
|
||||
var riCount = 0
|
||||
while let p = previousScalar(i) {
|
||||
i = p.start
|
||||
// GB8
|
||||
case (.lvt, .t),
|
||||
(.t, .t):
|
||||
return false
|
||||
|
||||
let gbp = Unicode._GraphemeBreakProperty(from: p.scalar)
|
||||
guard gbp == .regionalIndicator else {
|
||||
break
|
||||
}
|
||||
// GB9
|
||||
case (_, .extend),
|
||||
(_, .zwj):
|
||||
return false
|
||||
|
||||
riCount += 1
|
||||
// GB9a
|
||||
case (_, .spacingMark):
|
||||
return false
|
||||
|
||||
// GB9b
|
||||
case (.prepend, _):
|
||||
return false
|
||||
|
||||
// GB11
|
||||
case (.zwj, .extendedPictographic):
|
||||
return !_checkIfInEmojiSequence(at: index, with: previousScalar)
|
||||
|
||||
// GB12 & GB13
|
||||
case (.regionalIndicator, .regionalIndicator):
|
||||
return _countRIs(at: index, with: previousScalar)
|
||||
|
||||
// GB999
|
||||
default:
|
||||
// GB9c
|
||||
//
|
||||
// Check if our rhs is an InCB=Consonant first because we can more easily
|
||||
// exit out of this branch in most cases. Otherwise, this is a consonant.
|
||||
// Check that the lhs is an InCB=Extend or InCB=Linker (we have to check
|
||||
// if it's an .extend or .zwj first because _isInCBExtend assumes that it
|
||||
// is true).
|
||||
if scalar2._isInCBConsonant,
|
||||
(x == .extend || x == .zwj),
|
||||
(scalar1._isInCBExtend || scalar1._isInCBLinker) {
|
||||
return !_checkIfInIndicSequence(at: index, with: previousScalar)
|
||||
}
|
||||
return riCount & 1 != 0
|
||||
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// When walking backwards, it's impossible to know whether we were in an emoji
|
||||
// sequence without walking further backwards. This walks the string backwards
|
||||
// enough until we figure out whether or not to break our
|
||||
// (.zwj, .extendedPictographic) question. For example:
|
||||
//
|
||||
// Scalar view #1:
|
||||
//
|
||||
// [.control, .zwj, .extendedPictographic]
|
||||
// ^
|
||||
// | = To determine whether or not we break here, we need
|
||||
// to see the previous scalar's grapheme property.
|
||||
// ^
|
||||
// | = This is neither .extendedPictographic nor .extend, thus we
|
||||
// were never in an emoji sequence, so break between the .zwj
|
||||
// and .extendedPictographic.
|
||||
//
|
||||
// Scalar view #2:
|
||||
//
|
||||
// [.extendedPictographic, .zwj, .extendedPictographic]
|
||||
// ^
|
||||
// | = Same as above, move backwards one to
|
||||
// view the previous scalar's property.
|
||||
// ^
|
||||
// | = This is an .extendedPictographic, so this indicates that
|
||||
// we are in an emoji sequence, so we should NOT break
|
||||
// between the .zwj and .extendedPictographic.
|
||||
//
|
||||
// Scalar view #3:
|
||||
//
|
||||
// [.extendedPictographic, .extend, .extend, .zwj, .extendedPictographic]
|
||||
// ^
|
||||
// | = Same as above
|
||||
// ^
|
||||
// | = This is an .extend which means
|
||||
// there is a potential emoji
|
||||
// sequence, walk further backwards
|
||||
// to find an .extendedPictographic.
|
||||
//
|
||||
// <-- = Another extend, go backwards more.
|
||||
// ^
|
||||
// | = We found our starting .extendedPictographic letting us
|
||||
// know that we are in an emoji sequence so our initial
|
||||
// break question is answered as NO.
|
||||
fileprivate func _checkIfInEmojiSequence(
  at index: Int,
  with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
) -> Bool {
  // Step over the scalar that ends at `index`; only what precedes it decides
  // the answer. With nothing there at all, this is not an emoji sequence.
  guard let skipped = previousScalar(index) else { return false }
  var cursor = skipped.start

  while let (scalar, start) = previousScalar(cursor) {
    cursor = start

    let property = Unicode._GraphemeBreakProperty(from: scalar)

    // Any run of .extend scalars is skipped over.
    if property == .extend {
      continue
    }

    // The first scalar that is not .extend settles it: the sequence is an
    // emoji sequence only if that scalar is .extendedPictographic.
    return property == .extendedPictographic
  }

  // Ran out of text without finding an .extendedPictographic.
  return false
}
|
||||
|
||||
// When walking backwards, it's impossible to know whether we break when we
|
||||
// see our first (InCB=Extend, InCB=Consonant) or (InCB=Linker, InCB=Consonant)
|
||||
// without walking further backwards. This walks the string backwards enough
|
||||
// until we figure out whether or not to break this indic sequence. For example:
|
||||
//
|
||||
// Scalar view #1:
|
||||
//
|
||||
// [InCB=Linker, InCB=Extend, InCB=Consonant]
|
||||
// ^
|
||||
// | = To be able to know whether or not to
|
||||
// break these two, we need to walk
|
||||
// backwards to determine if this is a
|
||||
// legitimate indic sequence.
|
||||
// ^
|
||||
// | = The scalar sequence ends without a starting InCB=Consonant,
|
||||
// so this is in fact not an indic sequence, so we can break the two.
|
||||
//
|
||||
// Scalar view #2:
|
||||
//
|
||||
// [InCB=Consonant, InCB=Linker, InCB=Extend, InCB=Consonant]
|
||||
// ^
|
||||
// | = Same as above
|
||||
// ^
|
||||
// | = This is a Linker, so we at least have seen
|
||||
// 1 to be able to return true if we see a
|
||||
// consonant later.
|
||||
// ^
|
||||
// | = Is a consonant and we've seen a linker, so this is a
|
||||
// legitimate indic sequence, so do NOT break the initial question.
|
||||
fileprivate func _checkIfInIndicSequence(
  at index: Int,
  with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
) -> Bool {
  // The scalar ending at `index` is only inspected for being a linker; the
  // backwards walk starts from the scalar before it.
  guard let first = previousScalar(index) else { return false }

  var sawLinker = first.scalar._isInCBLinker
  var cursor = first.start

  while let (scalar, start) = previousScalar(cursor) {
    cursor = start

    // Reaching a consonant ends the walk: the sequence is legitimate only if
    // a linker was seen on the way here.
    if scalar._isInCBConsonant {
      return sawLinker
    }

    // Only .extend and .zwj scalars may sit between the consonants.
    let property = Unicode._GraphemeBreakProperty(from: scalar)
    if property != .extend && property != .zwj {
      return false
    }

    let isExtend = scalar._isInCBExtend
    let isLinker = scalar._isInCBLinker

    // Exactly one of the two must hold. Neither means the scalar does not
    // belong in such a sequence; both should never happen, and is cautiously
    // treated as invalid.
    guard isExtend != isLinker else {
      return false
    }

    if isLinker {
      sawLinker = true
    }
  }

  // Ran out of text without finding a starting consonant.
  return false
}
|
||||
|
||||
// When walking backwards, it's impossible to know whether we break when we
|
||||
// see our first (.regionalIndicator, .regionalIndicator) without walking
|
||||
// further backwards. This walks the string backwards enough until we figure
|
||||
// out whether or not to break these RIs. For example:
|
||||
//
|
||||
// Scalar view #1:
|
||||
//
|
||||
// [.control, .regionalIndicator, .regionalIndicator]
|
||||
// ^
|
||||
// | = To be able to know whether or not to
|
||||
// break these two, we need to walk
|
||||
// backwards to determine if there were
|
||||
// any previous .regionalIndicators in
|
||||
// a row.
|
||||
// ^
|
||||
// | = Not a .regionalIndicator, so our total riCount is 0 and 0 is
|
||||
// even thus we do not break.
|
||||
//
|
||||
// Scalar view #2:
|
||||
//
|
||||
// [.control, .regionalIndicator, .regionalIndicator, .regionalIndicator]
|
||||
// ^
|
||||
// | = Same as above
|
||||
// ^
|
||||
// | = This is a .regionalIndicator, so continue
|
||||
// walking backwards for more of them. riCount is
|
||||
// now equal to 1.
|
||||
// ^
|
||||
// | = Not a .regionalIndicator. riCount = 1 which is odd, so break
|
||||
// the last two .regionalIndicators.
|
||||
fileprivate func _countRIs(
  at index: Int,
  with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
) -> Bool {
  // Step over the scalar that ends at `index`; counting starts with the
  // scalar before it.
  guard let skipped = previousScalar(index) else { return false }

  var cursor = skipped.start
  var riCount = 0

  // Count the unbroken run of regional indicators, stopping at the first
  // scalar of any other kind or at the start of the text.
  while let (scalar, start) = previousScalar(cursor),
        Unicode._GraphemeBreakProperty(from: scalar) == .regionalIndicator {
    cursor = start
    riCount += 1
  }

  // True exactly when the run length is odd.
  return !riCount.isMultiple(of: 2)
}
|
||||
|
||||
@@ -52,3 +52,119 @@ extension UnsafeBufferPointer where Element == UInt8 {
|
||||
return unsafe !UTF8.isContinuation(self[offset])
|
||||
}
|
||||
}
|
||||
|
||||
/// Performs one step of the NFC quick check for `scalar`.
///
/// - Parameters:
///   - scalar: The scalar to examine.
///   - prevCCC: The `ccc` value recorded for the preceding scalar. It is
///     overwritten with this scalar's `ccc` only when the check passes.
/// - Returns: `false` if `scalar` has a non-zero `ccc` that is lower than
///   `prevCCC`, or if its normalization data reports it as not NFC
///   quick-check; `true` otherwise.
internal func _isScalarNFCQC(
  _ scalar: Unicode.Scalar,
  _ prevCCC: inout UInt8
) -> Bool {
  let data = Unicode._NormData(scalar, fastUpperbound: 0x300)
  let ccc = data.ccc

  // A non-zero ccc must never be smaller than the one that came before it.
  let isOutOfOrder = ccc != 0 && prevCCC > ccc
  guard !isOutOfOrder, data.isNFCQC else {
    return false
  }

  prevCCC = ccc
  return true
}
|
||||
|
||||
extension _StringGutsSlice {
  /// Feeds the UTF-8 code units of this slice's NFC form to `f`, in order.
  ///
  /// Content that is flagged as NFC, or that passes the NFC quick check, is
  /// forwarded unchanged. Anything else is run through `_internalNFC` and the
  /// normalized scalars are re-encoded.
  ///
  /// - Parameter f: Invoked once per code unit. An error thrown by `f` ends
  ///   the traversal and is rethrown.
  internal func _withNFCCodeUnits(_ f: (UInt8) throws -> Void) rethrows {
    let substring = String(_guts)[range]

    // Fast path: content already flagged NFC (or ASCII) needs no work.
    if _fastPath(_guts.isNFC) {
      try substring.utf8.forEach(f)
      return
    }

    var prevCCC: UInt8 = 0

    if _guts.isFastUTF8 {
      var isNFCQC = true
      _fastNFCCheck(&isNFCQC, &prevCCC)

      // The contiguous UTF-8 storage is available, so read it directly
      // rather than going through String's UTF8 view.
      if isNFCQC {
        try unsafe withFastUTF8 { utf8 in
          for unsafe byte in unsafe utf8 {
            try f(byte)
          }
        }
        return
      }
    } else {
      // `allSatisfy` stops at the first scalar that fails the quick check.
      let isNFCQC = substring.unicodeScalars.allSatisfy {
        _isScalarNFCQC($0, &prevCCC)
      }

      if isNFCQC {
        try substring.utf8.forEach(f)
        return
      }
    }

    // Quick check failed: normalize, then emit each scalar's UTF-8 encoding.
    for scalar in substring.unicodeScalars._internalNFC {
      try scalar.withUTF8CodeUnits { codeUnits in
        for unsafe byte in unsafe codeUnits {
          try f(byte)
        }
      }
    }
  }

  /// Runs `_nfcQuickCheck` over this slice's fast UTF-8 storage and stores
  /// the verdict in `isNFCQC`; `prevCCC` is threaded through to the check.
  internal func _fastNFCCheck(_ isNFCQC: inout Bool, _ prevCCC: inout UInt8) {
    unsafe withFastUTF8 { codeUnits in
      isNFCQC = unsafe _nfcQuickCheck(codeUnits, prevCCC: &prevCCC)
    }
  }
}
|
||||
|
||||
/// Runs the Unicode NFC quick check algorithm over `utf8`.
///
/// - Parameters:
///   - utf8: The code units to scan. The lead-byte shortcuts below rely on
///     the buffer starting on a scalar boundary and holding well-formed
///     UTF-8 (assumption; no validation happens here).
///   - prevCCC: The `ccc` carried from scalar to scalar. It is reset to 0 for
///     every scalar below U+0300 and otherwise updated by `_isScalarNFCQC`.
/// - Returns: `true` if every scalar passed the quick check, `false` as soon
///   as one scalar fails it.
internal func _nfcQuickCheck(
  _ utf8: UnsafeBufferPointer<UInt8>,
  prevCCC: inout UInt8
) -> Bool {
  var position = 0

  while position < utf8.count {
    let leadByte = unsafe utf8[position]

    if leadByte >= 0xCC {
      // U+0300 or above: decode the scalar and consult its normalization data.
      let (scalar, len) = unsafe _decodeScalar(utf8, startingAt: position)

      guard _isScalarNFCQC(scalar, &prevCCC) else {
        return false
      }

      position &+= len
    } else {
      // A lead byte below 0xCC encodes a scalar below U+0300, and everything
      // up to U+0300 is already NFC. Below 0xC0 the scalar occupies a single
      // byte; otherwise it is a two-byte sequence.
      position &+= leadByte < 0xC0 ? 1 : 2

      // Scalars in this range are treated as having a ccc of 0.
      prevCCC = 0
    }
  }

  return true
}
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@ private func _isNotOverlong_F0(_ x: UInt8) -> Bool {
|
||||
return (0x90...0xBF).contains(x)
|
||||
}
|
||||
|
||||
private func _isNotOverlong_F4(_ x: UInt8) -> Bool {
|
||||
private func _isNotInvalid_F4(_ x: UInt8) -> Bool {
|
||||
return UTF8.isContinuation(x) && x <= 0x8F
|
||||
}
|
||||
|
||||
@@ -26,7 +26,7 @@ private func _isNotOverlong_E0(_ x: UInt8) -> Bool {
|
||||
return (0xA0...0xBF).contains(x)
|
||||
}
|
||||
|
||||
private func _isNotOverlong_ED(_ x: UInt8) -> Bool {
|
||||
private func _isNotInvalid_ED(_ x: UInt8) -> Bool {
|
||||
return UTF8.isContinuation(x) && x <= 0x9F
|
||||
}
|
||||
|
||||
@@ -34,15 +34,82 @@ internal struct UTF8ExtraInfo: Equatable {
|
||||
public var isASCII: Bool
|
||||
}
|
||||
|
||||
/// Classifies a non-ASCII byte that cannot begin a multi-byte sequence.
///
/// - Parameter x: A byte that is `>= 0x80` and for which
///   `_isUTF8MultiByteLeading` is false (both checked as internal invariants).
/// - Returns: `.unexpectedContinuationByte` for `0x80...0xBF`,
///   `.overlongEncodingByte` for `0xC0` and `0xC1`, and
///   `.invalidNonSurrogateCodePointByte` for anything else.
@inline(never) // slow-path
private func _diagnoseInvalidUTF8MultiByteLeading(
  _ x: UInt8
) -> _UTF8EncodingErrorKind {
  _internalInvariant(x >= 0x80)
  _internalInvariant(!_isUTF8MultiByteLeading(x))

  if (0x80...0xBF).contains(x) {
    return .unexpectedContinuationByte
  }
  if x == 0xC0 || x == 0xC1 {
    return .overlongEncodingByte
  }

  // Under the invariants above, only bytes past 0xF4 are left.
  _internalInvariant(x > 0xF4)
  return .invalidNonSurrogateCodePointByte
}
|
||||
|
||||
internal enum UTF8ValidationResult {
|
||||
case success(UTF8ExtraInfo)
|
||||
case error(toBeReplaced: Range<Int>)
|
||||
case error(
|
||||
kind: _UTF8EncodingErrorKind, toBeReplaced: Range<Int>
|
||||
)
|
||||
}
|
||||
|
||||
// FIXME: refactor other parts of stdlib to avoid this dumb mirror enum
|
||||
//
|
||||
// Mirror of UTF8.ValidationError.Kind, available on 6.1
|
||||
internal struct _UTF8EncodingErrorKind: Error, Sendable, Hashable
|
||||
// TODO: embedded?, Codable
|
||||
, RawRepresentable {
|
||||
internal var rawValue: UInt8
|
||||
|
||||
@available(SwiftStdlib 6.2, *)
|
||||
internal var _publicKind: UTF8.ValidationError.Kind {
|
||||
.init(rawValue: self.rawValue)!
|
||||
}
|
||||
|
||||
@inlinable
|
||||
internal init(rawValue: UInt8) {
|
||||
self.rawValue = rawValue
|
||||
}
|
||||
|
||||
/// A continuation byte (`10xxxxxx`) outside of a multi-byte sequence
|
||||
@_alwaysEmitIntoClient
|
||||
internal static var unexpectedContinuationByte: Self {
|
||||
.init(rawValue: 0)
|
||||
}
|
||||
|
||||
/// A byte in a surrogate code point (`U+D800..U+DFFF`) sequence
|
||||
@_alwaysEmitIntoClient
|
||||
internal static var surrogateCodePointByte: Self {
|
||||
.init(rawValue: 1)
|
||||
}
|
||||
|
||||
/// A byte in an invalid, non-surrogate code point (`>U+10FFFF`) sequence
|
||||
@_alwaysEmitIntoClient
|
||||
internal static var invalidNonSurrogateCodePointByte: Self {
|
||||
.init(rawValue: 2)
|
||||
}
|
||||
|
||||
/// A byte in an overlong encoding sequence
|
||||
@_alwaysEmitIntoClient
|
||||
internal static var overlongEncodingByte: Self {
|
||||
.init(rawValue: 3)
|
||||
}
|
||||
|
||||
/// A multi-byte sequence that is the start of a valid multi-byte scalar
|
||||
/// but is cut off before ending correctly
|
||||
@_alwaysEmitIntoClient
|
||||
internal static var truncatedScalar: Self {
|
||||
.init(rawValue: 4)
|
||||
}
|
||||
}
|
||||
|
||||
extension UTF8ValidationResult: Equatable {}
|
||||
|
||||
private struct UTF8ValidationError: Error {}
|
||||
|
||||
internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationResult {
|
||||
if unsafe _allASCII(buf) {
|
||||
return .success(UTF8ExtraInfo(isASCII: true))
|
||||
@@ -51,12 +118,20 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationR
|
||||
var iter = unsafe buf.makeIterator()
|
||||
var lastValidIndex = buf.startIndex
|
||||
|
||||
@inline(__always) func guaranteeIn(_ f: (UInt8) -> Bool) throws(UTF8ValidationError) {
|
||||
guard let cu = unsafe iter.next() else { throw UTF8ValidationError() }
|
||||
guard f(cu) else { throw UTF8ValidationError() }
|
||||
@inline(__always) func guarantee(
|
||||
_ f: (UInt8) -> Bool,
|
||||
_ err: _UTF8EncodingErrorKind
|
||||
) throws(_UTF8EncodingErrorKind) {
|
||||
guard let cu = unsafe iter.next() else {
|
||||
throw .truncatedScalar
|
||||
}
|
||||
guard f(cu) else {
|
||||
throw err
|
||||
}
|
||||
}
|
||||
@inline(__always) func guaranteeContinuation() throws(UTF8ValidationError) {
|
||||
try guaranteeIn(UTF8.isContinuation)
|
||||
@inline(__always) func guaranteeContinuation(
|
||||
) throws(_UTF8EncodingErrorKind) {
|
||||
try guarantee(UTF8.isContinuation, .truncatedScalar)
|
||||
}
|
||||
|
||||
func _legacyInvalidLengthCalculation(_ _buffer: (_storage: UInt32, ())) -> Int {
|
||||
@@ -117,21 +192,40 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationR
|
||||
return unsafe _legacyNarrowIllegalRange(buf: buf[illegalRange])
|
||||
}
|
||||
|
||||
do {
|
||||
do throws(_UTF8EncodingErrorKind) {
|
||||
|
||||
/*
|
||||
The table of valid UTF-8 is:
|
||||
|
||||
╔════════════════════╦════════╦════════╦════════╦════════╗
|
||||
║ Scalar value ║ Byte 0 ║ Byte 1 ║ Byte 2 ║ Byte 3 ║
|
||||
╠════════════════════╬════════╬════════╬════════╬════════╣
|
||||
║ U+0000..U+007F ║ 00..7F ║ ║ ║ ║
|
||||
║ U+0080..U+07FF ║ C2..DF ║ Contin ║ ║ ║
|
||||
║ U+0800..U+0FFF ║ E0 ║ A0..BF ║ Contin ║ ║
|
||||
║ U+1000..U+CFFF ║ E1..EC ║ Contin ║ Contin ║ ║
|
||||
║ U+D000..U+D7FF ║ ED ║ 80..9F ║ Contin ║ ║
|
||||
║ U+E000..U+FFFF ║ EE..EF ║ Contin ║ Contin ║ ║
|
||||
║ U+10000..U+3FFFF ║ F0 ║ 90..BF ║ Contin ║ Contin ║
|
||||
║ U+40000..U+FFFFF ║ F1..F3 ║ Contin ║ Contin ║ Contin ║
|
||||
║ U+100000..U+10FFFF ║ F4 ║ 80..8F ║ Contin ║ Contin ║
|
||||
╚════════════════════╩════════╩════════╩════════╩════════╝
|
||||
|
||||
"Contin" is any continuation byte, i.e. 80..BF or 10xxxxxx
|
||||
*/
|
||||
var isASCII = true
|
||||
while let cu = unsafe iter.next() {
|
||||
if UTF8.isASCII(cu) { lastValidIndex &+= 1; continue }
|
||||
isASCII = false
|
||||
if _slowPath(!_isUTF8MultiByteLeading(cu)) {
|
||||
func fail() throws(UTF8ValidationError) { throw UTF8ValidationError() }
|
||||
try fail()
|
||||
throw _diagnoseInvalidUTF8MultiByteLeading(cu)
|
||||
}
|
||||
switch cu {
|
||||
case 0xC2...0xDF:
|
||||
try guaranteeContinuation()
|
||||
lastValidIndex &+= 2
|
||||
case 0xE0:
|
||||
try guaranteeIn(_isNotOverlong_E0)
|
||||
try guarantee(_isNotOverlong_E0, .overlongEncodingByte)
|
||||
try guaranteeContinuation()
|
||||
lastValidIndex &+= 3
|
||||
case 0xE1...0xEC:
|
||||
@@ -139,7 +233,7 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationR
|
||||
try guaranteeContinuation()
|
||||
lastValidIndex &+= 3
|
||||
case 0xED:
|
||||
try guaranteeIn(_isNotOverlong_ED)
|
||||
try guarantee(_isNotInvalid_ED, .surrogateCodePointByte)
|
||||
try guaranteeContinuation()
|
||||
lastValidIndex &+= 3
|
||||
case 0xEE...0xEF:
|
||||
@@ -147,7 +241,7 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationR
|
||||
try guaranteeContinuation()
|
||||
lastValidIndex &+= 3
|
||||
case 0xF0:
|
||||
try guaranteeIn(_isNotOverlong_F0)
|
||||
try guarantee(_isNotOverlong_F0, .overlongEncodingByte)
|
||||
try guaranteeContinuation()
|
||||
try guaranteeContinuation()
|
||||
lastValidIndex &+= 4
|
||||
@@ -157,7 +251,8 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationR
|
||||
try guaranteeContinuation()
|
||||
lastValidIndex &+= 4
|
||||
case 0xF4:
|
||||
try guaranteeIn(_isNotOverlong_F4)
|
||||
try guarantee(
|
||||
_isNotInvalid_F4, .invalidNonSurrogateCodePointByte)
|
||||
try guaranteeContinuation()
|
||||
try guaranteeContinuation()
|
||||
lastValidIndex &+= 4
|
||||
@@ -167,7 +262,9 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationR
|
||||
}
|
||||
return .success(UTF8ExtraInfo(isASCII: isASCII))
|
||||
} catch {
|
||||
return unsafe .error(toBeReplaced: findInvalidRange(buf[lastValidIndex...]))
|
||||
return unsafe .error(
|
||||
kind: error,
|
||||
toBeReplaced: findInvalidRange(buf[lastValidIndex...]))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -214,7 +311,7 @@ internal func repairUTF8(_ input: UnsafeBufferPointer<UInt8>, firstKnownBrokenRa
|
||||
case .success:
|
||||
unsafe result.appendInPlace(remainingInput, isASCII: false)
|
||||
return String(result)
|
||||
case .error(let newBrokenRange):
|
||||
case .error(_, let newBrokenRange):
|
||||
brokenRange = newBrokenRange
|
||||
}
|
||||
} while !remainingInput.isEmpty
|
||||
|
||||
261
stdlib/public/core/UTF8EncodingError.swift
Normal file
261
stdlib/public/core/UTF8EncodingError.swift
Normal file
@@ -0,0 +1,261 @@
|
||||
extension Unicode.UTF8 {
|
||||
/**
|
||||
|
||||
The kind and location of a UTF-8 encoding error.
|
||||
|
||||
Valid UTF-8 is represented by this table:
|
||||
|
||||
```
|
||||
╔════════════════════╦════════╦════════╦════════╦════════╗
|
||||
║ Scalar value ║ Byte 0 ║ Byte 1 ║ Byte 2 ║ Byte 3 ║
|
||||
╠════════════════════╬════════╬════════╬════════╬════════╣
|
||||
║ U+0000..U+007F ║ 00..7F ║ ║ ║ ║
|
||||
║ U+0080..U+07FF ║ C2..DF ║ 80..BF ║ ║ ║
|
||||
║ U+0800..U+0FFF ║ E0 ║ A0..BF ║ 80..BF ║ ║
|
||||
║ U+1000..U+CFFF ║ E1..EC ║ 80..BF ║ 80..BF ║ ║
|
||||
║ U+D000..U+D7FF ║ ED ║ 80..9F ║ 80..BF ║ ║
|
||||
║ U+E000..U+FFFF ║ EE..EF ║ 80..BF ║ 80..BF ║ ║
|
||||
║ U+10000..U+3FFFF ║ F0 ║ 90..BF ║ 80..BF ║ 80..BF ║
|
||||
║ U+40000..U+FFFFF ║ F1..F3 ║ 80..BF ║ 80..BF ║ 80..BF ║
|
||||
║ U+100000..U+10FFFF ║ F4 ║ 80..8F ║ 80..BF ║ 80..BF ║
|
||||
╚════════════════════╩════════╩════════╩════════╩════════╝
|
||||
```
|
||||
|
||||
### Classifying errors
|
||||
|
||||
An *unexpected continuation* is when a continuation byte (`10xxxxxx`) occurs
|
||||
in a position that should be the start of a new scalar value. Unexpected
|
||||
continuations can often occur when the input contains arbitrary data
|
||||
instead of textual content. An unexpected continuation at the start of
|
||||
input might mean that the input was not correctly sliced along scalar
|
||||
boundaries or that it does not contain UTF-8.
|
||||
|
||||
A *truncated scalar* is a multi-byte sequence that is the start of a valid
|
||||
multi-byte scalar but is cut off before ending correctly. A truncated
|
||||
scalar at the end of the input might mean that only part of the entire
|
||||
input was received.
|
||||
|
||||
A *surrogate code point* (`U+D800..U+DFFF`) is invalid UTF-8. Surrogate
|
||||
code points are used by UTF-16 to encode scalars in the supplementary
|
||||
planes. Their presence may mean the input was encoded in a different 8-bit
|
||||
encoding, such as CESU-8, WTF-8, or Java's Modified UTF-8.
|
||||
|
||||
An *invalid non-surrogate code point* is any code point higher than
|
||||
`U+10FFFF`. This can often occur when the input is arbitrary data instead
|
||||
of textual content.
|
||||
|
||||
An *overlong encoding* occurs when a scalar value that could have been
|
||||
encoded using fewer bytes is encoded in a longer byte sequence. Overlong
|
||||
encodings are invalid UTF-8 and can lead to security issues if not
|
||||
correctly detected:
|
||||
|
||||
- https://nvd.nist.gov/vuln/detail/CVE-2008-2938
|
||||
- https://nvd.nist.gov/vuln/detail/CVE-2000-0884
|
||||
|
||||
An overlong encoding of `NUL`, `0xC0 0x80`, is used in Java's Modified
|
||||
UTF-8 but is invalid UTF-8. Overlong encoding errors often catch attempts
|
||||
to bypass security measures.
|
||||
|
||||
### Reporting the range of the error
|
||||
|
||||
The range of the error reported follows the *Maximal subpart of an
|
||||
ill-formed subsequence* algorithm in which each error is either one byte
|
||||
long or ends before the first byte that is disallowed. See "U+FFFD
|
||||
Substitution of Maximal Subparts" in the Unicode Standard. Unicode started
|
||||
recommending this algorithm in version 6, and it has been adopted by the W3C.
|
||||
|
||||
The maximal subpart algorithm will produce a single multi-byte range for a
|
||||
truncated scalar (a multi-byte sequence that is the start of a valid
|
||||
multi-byte scalar but is cut off before ending correctly). For all other
|
||||
errors (including overlong encodings, surrogates, and invalid code
|
||||
points), it will produce an error per byte.
|
||||
|
||||
// FIXME: without a checkAllErrors, we don't have these classification distinctions, should we drop it, ensure we will do it, or what?
|
||||
|
||||
Since overlong encodings, surrogates, and invalid code points are erroneous
|
||||
by the second byte (at the latest), the above definition produces the same
|
||||
ranges as defining such a sequence as a truncated scalar error followed by
|
||||
unexpected continuation byte errors. The more semantically-rich
|
||||
classification is reported.
|
||||
|
||||
For example, a surrogate code point sequence `ED A0 80` will be reported
|
||||
as three `.surrogateCodePointByte` errors rather than a `.truncatedScalar`
|
||||
followed by two `.unexpectedContinuationByte` errors.
|
||||
|
||||
Other commonly reported error ranges can be constructed from this result.
|
||||
For example, PEP 383's error-per-byte can be constructed by mapping over
|
||||
the reported range. Similarly, constructing a single error for the longest
|
||||
invalid byte range can be constructed by joining adjacent error ranges.
|
||||
|
||||
```
|
||||
╔═════════════════╦══════╦═════╦═════╦═════╦═════╦═════╦═════╦══════╗
|
||||
║ ║ 61 ║ F1 ║ 80 ║ 80 ║ E1 ║ 80 ║ C2 ║ 62 ║
|
||||
╠═════════════════╬══════╬═════╬═════╬═════╬═════╬═════╬═════╬══════╣
|
||||
║ Longest range ║ U+61 ║ err ║ ║ ║ ║ ║ ║ U+62 ║
|
||||
║ Maximal subpart ║ U+61 ║ err ║ ║ ║ err ║ ║ err ║ U+62 ║
|
||||
║ Error per byte ║ U+61 ║ err ║ err ║ err ║ err ║ err ║ err ║ U+62 ║
|
||||
╚═════════════════╩══════╩═════╩═════╩═════╩═════╩═════╩═════╩══════╝
|
||||
```
|
||||
|
||||
*/
|
||||
@available(SwiftStdlib 6.2, *)
|
||||
@frozen
|
||||
public struct ValidationError: Error, Sendable, Hashable
|
||||
{
|
||||
/// The kind of encoding error
|
||||
public var kind: Unicode.UTF8.ValidationError.Kind
|
||||
|
||||
/// The range of offsets into our input containing the error
|
||||
public var byteOffsets: Range<Int>
|
||||
|
||||
@_alwaysEmitIntoClient
|
||||
public init(
|
||||
_ kind: Unicode.UTF8.ValidationError.Kind,
|
||||
_ byteOffsets: Range<Int>
|
||||
) {
|
||||
_precondition(byteOffsets.lowerBound >= 0)
|
||||
if kind == .truncatedScalar {
|
||||
_precondition(!byteOffsets.isEmpty)
|
||||
_precondition(byteOffsets.count < 4)
|
||||
} else {
|
||||
_precondition(byteOffsets.count == 1)
|
||||
}
|
||||
|
||||
self.kind = kind
|
||||
self.byteOffsets = byteOffsets
|
||||
}
|
||||
|
||||
@_alwaysEmitIntoClient
|
||||
public init(
|
||||
_ kind: Unicode.UTF8.ValidationError.Kind, at byteOffset: Int
|
||||
) {
|
||||
self.init(kind, byteOffset..<(byteOffset+1))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@available(SwiftStdlib 6.2, *)
|
||||
extension UTF8.ValidationError {
|
||||
/// The kind of encoding error encountered during validation
|
||||
@frozen
|
||||
public struct Kind: Error, Sendable, Hashable, RawRepresentable
|
||||
{
|
||||
public var rawValue: UInt8
|
||||
|
||||
@inlinable
|
||||
public init?(rawValue: UInt8) {
|
||||
guard rawValue <= 4 else { return nil }
|
||||
self.rawValue = rawValue
|
||||
}
|
||||
|
||||
/// A continuation byte (`10xxxxxx`) outside of a multi-byte sequence
|
||||
@_alwaysEmitIntoClient
|
||||
public static var unexpectedContinuationByte: Self {
|
||||
.init(rawValue: 0)!
|
||||
}
|
||||
|
||||
/// A byte in a surrogate code point (`U+D800..U+DFFF`) sequence
|
||||
@_alwaysEmitIntoClient
|
||||
public static var surrogateCodePointByte: Self {
|
||||
.init(rawValue: 1)!
|
||||
}
|
||||
|
||||
/// A byte in an invalid, non-surrogate code point (`>U+10FFFF`) sequence
|
||||
@_alwaysEmitIntoClient
|
||||
public static var invalidNonSurrogateCodePointByte: Self {
|
||||
.init(rawValue: 2)!
|
||||
}
|
||||
|
||||
/// A byte in an overlong encoding sequence
|
||||
@_alwaysEmitIntoClient
|
||||
public static var overlongEncodingByte: Self {
|
||||
.init(rawValue: 3)!
|
||||
}
|
||||
|
||||
/// A multi-byte sequence that is the start of a valid multi-byte scalar
|
||||
/// but is cut off before ending correctly
|
||||
@_alwaysEmitIntoClient
|
||||
public static var truncatedScalar: Self {
|
||||
.init(rawValue: 4)!
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@_unavailableInEmbedded
|
||||
@available(SwiftStdlib 6.2, *)
|
||||
extension UTF8.ValidationError.Kind: CustomStringConvertible {
|
||||
public var description: String {
|
||||
switch self {
|
||||
case .invalidNonSurrogateCodePointByte:
|
||||
".invalidNonSurrogateCodePointByte"
|
||||
case .overlongEncodingByte:
|
||||
".overlongEncodingByte"
|
||||
case .surrogateCodePointByte:
|
||||
".surrogateCodePointByte"
|
||||
case .truncatedScalar:
|
||||
".truncatedScalar"
|
||||
case .unexpectedContinuationByte:
|
||||
".unexpectedContinuationByte"
|
||||
default:
|
||||
fatalError("unreachable")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@_unavailableInEmbedded
|
||||
@available(SwiftStdlib 6.2, *)
|
||||
extension UTF8.ValidationError: CustomStringConvertible {
|
||||
public var description: String {
|
||||
"UTF8.ValidationError(\(kind), \(byteOffsets))"
|
||||
}
|
||||
}
|
||||
|
||||
extension UTF8 {
|
||||
@available(SwiftStdlib 6.2, *)
|
||||
@usableFromInline // for testing purposes
|
||||
internal static func _checkAllErrors(
|
||||
_ s: some Sequence<UInt8>
|
||||
) -> Array<UTF8.ValidationError> {
|
||||
// TODO: Span fast path
|
||||
// TODO: Fixed size buffer for non-contig inputs
|
||||
// TODO: Lifetime-dependent result variant
|
||||
let cus = Array(s)
|
||||
return unsafe cus.withUnsafeBytes {
|
||||
var bufPtr = unsafe $0
|
||||
var start = 0
|
||||
var errors: Array<UTF8.ValidationError> = []
|
||||
|
||||
// Remember the previous error, so that we can
|
||||
// apply it to subsequent bytes instead of reporting
|
||||
// just `.unexpectedContinuation`.
|
||||
var priorError: UTF8.ValidationError? = nil
|
||||
while true {
|
||||
do throws(UTF8.ValidationError) {
|
||||
_ = unsafe try bufPtr.baseAddress!._validateUTF8(limitedBy: bufPtr.count)
|
||||
return errors
|
||||
} catch {
|
||||
let adjustedRange =
|
||||
error.byteOffsets.lowerBound + start ..< error.byteOffsets.upperBound + start
|
||||
|
||||
let kind: UTF8.ValidationError.Kind
|
||||
if let prior = priorError,
|
||||
prior.byteOffsets.upperBound == adjustedRange.lowerBound,
|
||||
error.kind == .unexpectedContinuationByte
|
||||
{
|
||||
kind = prior.kind
|
||||
} else {
|
||||
kind = error.kind
|
||||
}
|
||||
let adjustedErr = UTF8.ValidationError(kind, adjustedRange)
|
||||
priorError = adjustedErr
|
||||
|
||||
let errEnd = error.byteOffsets.upperBound
|
||||
start += errEnd
|
||||
unsafe bufPtr = .init(rebasing: bufPtr[errEnd...])
|
||||
errors.append(adjustedErr)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
235
stdlib/public/core/UTF8Span.swift
Normal file
235
stdlib/public/core/UTF8Span.swift
Normal file
@@ -0,0 +1,235 @@
|
||||
// TODO: comment header
|
||||
|
||||
|
||||
/// TODO: docs
|
||||
@frozen
|
||||
@safe
|
||||
@available(SwiftStdlib 6.2, *)
|
||||
public struct UTF8Span: Copyable, ~Escapable, BitwiseCopyable {
|
||||
@usableFromInline
|
||||
internal var _unsafeBaseAddress: UnsafeRawPointer?
|
||||
|
||||
/*
|
||||
A bit-packed count and flags (such as isASCII)
|
||||
|
||||
╔═══════╦═════╦══════════╦═══════╗
|
||||
║ b63 ║ b62 ║ b61:56 ║ b55:0 ║
|
||||
╠═══════╬═════╬══════════╬═══════╣
|
||||
║ ASCII ║ NFC ║ reserved ║ count ║
|
||||
╚═══════╩═════╩══════════╩═══════╝
|
||||
|
||||
ASCII means the contents are known to be all-ASCII (<=0x7F).
|
||||
NFC means contents are known to be in normal form C for fast comparisons.
|
||||
*/
|
||||
@usableFromInline
|
||||
internal var _countAndFlags: UInt64
|
||||
|
||||
// @_alwaysEmitIntoClient
|
||||
@inline(__always)
|
||||
@lifetime(borrow start) // TODO: borrow or copy?
|
||||
internal init(
|
||||
_unsafeAssumingValidUTF8 start: borrowing UnsafeRawPointer,
|
||||
_countAndFlags: UInt64
|
||||
) {
|
||||
unsafe self._unsafeBaseAddress = copy start
|
||||
self._countAndFlags = _countAndFlags
|
||||
|
||||
_invariantCheck()
|
||||
}
|
||||
|
||||
/// Creates a UTF8Span, bypassing safety and security checks. The caller
|
||||
/// must guarantee that `codeUnits` contains validly-encoded UTF-8, or else
|
||||
/// undefined behavior may result upon use. If `isKnownASCII: true` is
|
||||
/// passed, the contents must be ASCII, or else undefined behavior may
|
||||
/// result upon use.
|
||||
@unsafe
|
||||
@lifetime(copy codeUnits)
|
||||
public init(
|
||||
unchecked codeUnits: Span<UInt8>,
|
||||
isKnownASCII: Bool = false
|
||||
) {
|
||||
self.init(
|
||||
_uncheckedAssumingValidUTF8: codeUnits,
|
||||
isKnownASCII: isKnownASCII,
|
||||
isKnownNFC: false
|
||||
)
|
||||
}
|
||||
|
||||
// FIXME: we need to make sure ALL API are nil safe, that is they
|
||||
// at least check the count first
|
||||
@_alwaysEmitIntoClient
|
||||
internal func _start() -> UnsafeRawPointer {
|
||||
unsafe _unsafeBaseAddress._unsafelyUnwrappedUnchecked
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: try to convert code to be ran on Span instead of URP
|
||||
|
||||
@available(SwiftStdlib 6.2, *)
|
||||
extension UTF8Span {
|
||||
/// Creates a UTF8Span containing `codeUnits`. Validates that the input is
|
||||
/// valid UTF-8, otherwise throws an error.
|
||||
///
|
||||
/// The resulting UTF8Span has the same lifetime constraints as `codeUnits`.
|
||||
@lifetime(copy codeUnits)
|
||||
public init(
|
||||
validating codeUnits: consuming Span<UInt8>
|
||||
) throws(UTF8.ValidationError) {
|
||||
try self.init(_validating: codeUnits)
|
||||
}
|
||||
|
||||
// TODO: this doesn't need to be underscored, I don't think
|
||||
@lifetime(copy codeUnits)
|
||||
internal init(
|
||||
_validating codeUnits: consuming Span<UInt8>
|
||||
) throws(UTF8.ValidationError) {
|
||||
guard let basePtr = unsafe codeUnits._pointer else {
|
||||
unsafe self._unsafeBaseAddress = nil
|
||||
self._countAndFlags = 0
|
||||
return
|
||||
}
|
||||
|
||||
let count = codeUnits._count
|
||||
let isASCII = unsafe try basePtr._validateUTF8(limitedBy: count)
|
||||
|
||||
unsafe self._unsafeBaseAddress = .init(basePtr)
|
||||
self._countAndFlags = UInt64(truncatingIfNeeded: count)
|
||||
if isASCII {
|
||||
_setIsASCII()
|
||||
}
|
||||
_internalInvariant(self.count == codeUnits.count)
|
||||
}
|
||||
|
||||
// TODO: SPI?
|
||||
@lifetime(copy codeUnits)
|
||||
internal init(
|
||||
_uncheckedAssumingValidUTF8 codeUnits: consuming Span<UInt8>,
|
||||
isKnownASCII: Bool,
|
||||
isKnownNFC: Bool
|
||||
) {
|
||||
guard let ptr = unsafe codeUnits._pointer else {
|
||||
unsafe self._unsafeBaseAddress = nil
|
||||
self._countAndFlags = 0
|
||||
return
|
||||
}
|
||||
|
||||
unsafe self._unsafeBaseAddress = ptr
|
||||
self._countAndFlags = UInt64(truncatingIfNeeded: codeUnits.count)
|
||||
if isKnownASCII {
|
||||
_setIsASCII()
|
||||
}
|
||||
if isKnownNFC {
|
||||
_setIsNFC()
|
||||
}
|
||||
_internalInvariant(self.count == codeUnits.count)
|
||||
}
|
||||
|
||||
// HACK: working around lack of internal plumbing work
|
||||
internal var _str: String { unsafe _start()._str(0..<count) }
|
||||
}
|
||||
|
||||
|
||||
// MARK: String
|
||||
|
||||
|
||||
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
  /// Calls a closure with a pointer to the viewed contiguous storage.
  ///
  /// The buffer pointer passed as an argument to `body` is valid only
  /// during the execution of `_withUnsafeBufferPointer(_:)`.
  /// Do not store or return the pointer for later use.
  ///
  /// A span that has no base address (one created from empty input) passes
  /// `body` an empty buffer whose `baseAddress` is `nil`.
  ///
  /// - Parameter body: A closure with an `UnsafeBufferPointer` parameter
  ///   that points to the viewed contiguous storage. If `body` has
  ///   a return value, that value is also used as the return value
  ///   for the `_withUnsafeBufferPointer(_:)` method. The closure's
  ///   parameter is valid only for the duration of its execution.
  /// - Returns: The return value of the `body` closure parameter.
  /// - Throws: Whatever `body` throws.
  @_alwaysEmitIntoClient
  borrowing public func _withUnsafeBufferPointer<
    E: Error, Result: ~Copyable //& ~Escapable
  >(
    _ body: (_ buffer: /*borrowing*/ UnsafeBufferPointer<UInt8>) throws(E) -> Result
  ) throws(E) -> Result {
    // The empty-input initializers store a nil base address with a zero
    // count. `_start()` unwraps the address without checking, so it must not
    // be reached in that state; hand `body` an empty buffer instead.
    guard unsafe _unsafeBaseAddress != nil else {
      return try unsafe body(UnsafeBufferPointer<UInt8>(start: nil, count: 0))
    }
    return try unsafe body(_start()._ubp(0..<count))
  }

  // TODO: withSpan or similar?
}
|
||||
|
||||
// MARK: Internals
|
||||
|
||||
@available(SwiftStdlib 6.2, *)
|
||||
extension UTF8Span {
|
||||
#if !INTERNAL_CHECKS_ENABLED
|
||||
@inline(__always) internal func _invariantCheck() {}
|
||||
#else
|
||||
@inline(never) @_effects(releasenone)
|
||||
internal func _invariantCheck() {
|
||||
// TODO: validate the UTF-8 as an assertion (and isASCII)
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@available(SwiftStdlib 6.2, *)
|
||||
extension UTF8Span {
|
||||
public var isEmpty: Bool {
|
||||
self.count == 0
|
||||
}
|
||||
|
||||
public var span: Span<UInt8> {
|
||||
@lifetime(copy self)
|
||||
get {
|
||||
unsafe Span(_unchecked: _unsafeBaseAddress, count: self.count)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
// TODO(toolchain): decide if we rebase on top of Guillaume's work
|
||||
extension String {
|
||||
|
||||
@available(SwiftStdlib 6.2, *)
|
||||
public init(copying codeUnits: UTF8Span) {
|
||||
let isASCII = codeUnits.isKnownASCII
|
||||
self = unsafe codeUnits._withUnsafeBufferPointer { bufPtr in
|
||||
unsafe String._uncheckedFromUTF8(bufPtr, isASCII: isASCII)
|
||||
}
|
||||
}
|
||||
|
||||
@available(SwiftStdlib 6.2, *)
|
||||
public var utf8Span: UTF8Span {
|
||||
@lifetime(borrow self)
|
||||
borrowing get {
|
||||
let isKnownASCII = _guts.isASCII
|
||||
let utf8 = self.utf8
|
||||
let span = utf8.span
|
||||
let result = unsafe UTF8Span(
|
||||
unchecked: span,
|
||||
isKnownASCII: isKnownASCII)
|
||||
return unsafe _overrideLifetime(result, borrowing: self)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
extension Substring {
|
||||
@available(SwiftStdlib 6.2, *)
|
||||
public var utf8Span: UTF8Span {
|
||||
@lifetime(borrow self)
|
||||
borrowing get {
|
||||
let isKnownASCII = base._guts.isASCII
|
||||
let utf8 = self.utf8
|
||||
let span = utf8.span
|
||||
let result = unsafe UTF8Span(
|
||||
unchecked: span,
|
||||
isKnownASCII: isKnownASCII)
|
||||
return unsafe _overrideLifetime(result, borrowing: self)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
126
stdlib/public/core/UTF8SpanBits.swift
Normal file
126
stdlib/public/core/UTF8SpanBits.swift
Normal file
@@ -0,0 +1,126 @@
|
||||
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
  /// Returns whether contents are known to be all-ASCII. A return value of
  /// `true` means that all code units are ASCII. A return value of `false`
  /// means there _may_ be non-ASCII content.
  ///
  /// ASCII-ness is checked and remembered during UTF-8 validation, so this
  /// is often equivalent to is-ASCII, but there are some situations where
  /// we might return `false` even when the content happens to be all-ASCII.
  ///
  /// For example, a UTF-8 span generated from a `String` that at some point
  /// contained non-ASCII content would report false for `isKnownASCII`, even
  /// if that String had subsequent mutation operations that removed any
  /// non-ASCII content.
  @_alwaysEmitIntoClient
  public var isKnownASCII: Bool {
    0 != _countAndFlags & Self._asciiBit
  }

  /// Do a scan checking for whether the contents are all-ASCII.
  ///
  /// Updates the `isKnownASCII` bit if contents are all-ASCII.
  ///
  /// - Returns: `true` if every code unit is ASCII (an empty span counts as
  ///   all-ASCII), otherwise `false`.
  @lifetime(self: copy self)
  public mutating func checkForASCII() -> Bool {
    if isKnownASCII { return true }

    // An empty span is trivially all-ASCII. Answering here also keeps a span
    // without a base address away from the unchecked pointer unwrap that the
    // buffer-based scan below goes through.
    if count == 0 {
      _setIsASCII()
      return true
    }

    let result = unsafe _withUnsafeBufferPointer {
      unsafe _allASCII($0)
    }
    if result {
      _setIsASCII()
    }
    return result
  }

  /// Returns whether the contents are known to be NFC. This is not
  /// always checked at initialization time and is set by `checkForNFC`.
  // TODO: should this be @_unavailableInEmbedded
  @_alwaysEmitIntoClient
  public var isKnownNFC: Bool {
    0 != _countAndFlags & Self._nfcBit
  }

  // Set the isKnownASCII bit to true (also isNFC)
  @_alwaysEmitIntoClient
  @lifetime(self: copy self)
  internal mutating func _setIsASCII() {
    self._countAndFlags |= Self._asciiBit | Self._nfcBit
  }

  // Set the isKnownNFC bit to true
  @_alwaysEmitIntoClient
  @lifetime(self: copy self)
  internal mutating func _setIsNFC() {
    self._countAndFlags |= Self._nfcBit
  }

  /// Do a scan checking for whether the contents are in Normal Form C.
  /// When the contents are in NFC, canonical equivalence checks are much
  /// faster.
  ///
  /// `quickCheck` will check for a subset of NFC contents using the
  /// NFCQuickCheck algorithm, which is faster than the full normalization
  /// algorithm. However, it cannot detect all NFC contents.
  ///
  /// Updates the `isKnownNFC` bit.
  ///
  /// - Parameter quickCheck: Pass `true` to run only the quick check; pass
  ///   `false` to compare the contents against their full normalization.
  /// - Returns: `true` if the contents were determined to be NFC (an empty
  ///   span counts as NFC), otherwise `false`.
  @_unavailableInEmbedded
  @lifetime(self: copy self)
  public mutating func checkForNFC(
    quickCheck: Bool
  ) -> Bool {
    if isKnownNFC { return true }

    // An empty span is trivially NFC. Answering here also keeps a span
    // without a base address away from `_start()`, which unwraps the address
    // without checking.
    if count == 0 {
      _setIsNFC()
      return true
    }

    if quickCheck {
      let result = unsafe _withUnsafeBufferPointer { utf8 in
        var prevCCC: UInt8 = 0
        return unsafe _nfcQuickCheck(utf8, prevCCC: &prevCCC)
      }
      if result {
        _setIsNFC()
      }
      return result
    }

    // TODO: use faster internal algorithm
    let normalized = _str._nfcCodeUnits
    guard unsafe _start()._urbp(
      0..<count
    ).elementsEqual(normalized) else {
      return false
    }

    _setIsNFC()
    return true
  }
}
|
||||
|
||||
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
  /// Flag bit in `_countAndFlags` recording that the contents are known to
  /// be all-ASCII: bit 63, i.e. `0x8000_0000_0000_0000`.
  @_alwaysEmitIntoClient @inline(__always)
  internal static var _asciiBit: UInt64 {
    1 << 63
  }

  /// Flag bit in `_countAndFlags` recording that the contents are known to
  /// be in NFC: bit 62, i.e. `0x4000_0000_0000_0000`.
  @_alwaysEmitIntoClient @inline(__always)
  internal static var _nfcBit: UInt64 {
    1 << 62
  }

  /// Mask selecting the count portion of `_countAndFlags`: the low 56 bits,
  /// i.e. `0x00FF_FFFF_FFFF_FFFF`.
  @_alwaysEmitIntoClient @inline(__always)
  internal static var _countMask: UInt64 {
    ~_flagsMask
  }

  /// Mask selecting the flags portion of `_countAndFlags`: the top byte.
  @_alwaysEmitIntoClient @inline(__always)
  internal static var _flagsMask: UInt64 {
    0xFF00_0000_0000_0000
  }

  /// The number of UTF-8 code units (bytes) in the span, read from the
  /// count portion of `_countAndFlags`.
  @_alwaysEmitIntoClient
  public var count: Int {
    let countBits = _countAndFlags & Self._countMask
    return Int(truncatingIfNeeded: countBits)
  }
}
|
||||
|
||||
|
||||
100
stdlib/public/core/UTF8SpanComparisons.swift
Normal file
100
stdlib/public/core/UTF8SpanComparisons.swift
Normal file
@@ -0,0 +1,100 @@
|
||||
// TODO: comment header
|
||||
|
||||
|
||||
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
  /// Whether this span has the same bytes as `other`.
  ///
  /// - Parameter other: The sequence of bytes to compare against.
  /// - Returns: `true` if both contain the same bytes in the same order and
  ///   have the same length.
  @_alwaysEmitIntoClient
  public func bytesEqual(to other: some Sequence<UInt8>) -> Bool {
    unsafe _withUnsafeBufferPointer { unsafe $0.elementsEqual(other) }
  }

  /// Whether this span has the same `Unicode.Scalar`s as `other`.
  ///
  /// - Parameter other: The sequence of scalars to compare against.
  /// - Returns: `true` if both contain the same scalars in the same order
  ///   and have the same number of scalars.
  @_alwaysEmitIntoClient
  public func unicodeScalarsEqual(
    to other: some Sequence<Unicode.Scalar>
  ) -> Bool {
    // TODO: We don't need to decode our code units, we can just match
    // against their scalars' encoded bytes

    var scalars = makeUnicodeScalarIterator()
    var otherScalars = other.makeIterator()
    while let s = scalars.next() {
      guard let otherS = otherScalars.next(), s == otherS else {
        return false
      }
    }
    // Our own iterator is exhausted at this point. The sequences are equal
    // only if `other` is exhausted too; otherwise `self` is merely a strict
    // prefix of `other`.
    return otherScalars.next() == nil
  }

  /// Whether this span has the same `Character`s as `other`.
  ///
  /// - Parameter other: The sequence of characters to compare against.
  /// - Returns: `true` if both contain the same characters in the same order
  ///   and have the same number of characters.
  @_unavailableInEmbedded
  @_alwaysEmitIntoClient
  public func charactersEqual(
    to other: some Sequence<Character>
  ) -> Bool {
    var chars = makeCharacterIterator()
    var otherChars = other.makeIterator()
    while let c = chars.next() {
      guard let otherC = otherChars.next(), c == otherC else {
        return false
      }
    }
    // Our own iterator is exhausted at this point. The sequences are equal
    // only if `other` is exhausted too; otherwise `self` is merely a strict
    // prefix of `other`.
    return otherChars.next() == nil
  }
}
|
||||
|
||||
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
  /// Whether `self` is equivalent to `other` under Unicode Canonical
  /// Equivalence.
  ///
  /// The comparison is told when both spans are already known to be NFC
  /// (`isKnownNFC`).
  public func isCanonicallyEquivalent(
    to other: UTF8Span
  ) -> Bool {
    let bothKnownNFC = isKnownNFC && other.isKnownNFC
    return unsafe _withUnsafeBufferPointer { lhs in
      unsafe other._withUnsafeBufferPointer { rhs in
        unsafe _stringCompareFastUTF8(
          lhs, rhs, expecting: .equal, bothNFC: bothKnownNFC)
      }
    }
  }

  /// Whether `self` orders less than `other` under Unicode Canonical
  /// Equivalence using normalized code-unit order (in NFC).
  ///
  /// The comparison is told when both spans are already known to be NFC
  /// (`isKnownNFC`).
  public func isCanonicallyLessThan(
    _ other: UTF8Span
  ) -> Bool {
    let bothKnownNFC = isKnownNFC && other.isKnownNFC
    return unsafe _withUnsafeBufferPointer { lhs in
      unsafe other._withUnsafeBufferPointer { rhs in
        unsafe _stringCompareFastUTF8(
          lhs, rhs, expecting: .less, bothNFC: bothKnownNFC)
      }
    }
  }
}
|
||||
|
||||
// // FIXME: remove
|
||||
// @available(SwiftStdlib 6.2, *)
|
||||
// extension UTF8Span {
|
||||
// public static func ~=(_ lhs: StaticString, _ rhs: UTF8Span) -> Bool {
|
||||
// return lhs.withUTF8Buffer { str in
|
||||
// rhs._withUnsafeBufferPointer { span in
|
||||
// str.elementsEqual(span)
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
|
||||
360
stdlib/public/core/UTF8SpanFundamentals.swift
Normal file
360
stdlib/public/core/UTF8SpanFundamentals.swift
Normal file
@@ -0,0 +1,360 @@
|
||||
// Core Scalar API
|
||||
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
  /// Whether offset `i` lies on a boundary between Unicode scalar values.
  ///
  /// Offsets `0` and `count` are always reported as aligned without touching
  /// memory.
  ///
  /// This function does not validate that `i` is within the span's bounds
  /// (only an internal invariant checks it); this is an unsafe operation.
  internal func _isScalarAligned(unchecked i: Int) -> Bool {
    guard i != 0 && i != count else { return true }
    _internalInvariant(_boundsCheck(i))
    return unsafe _start()._isScalarAligned(i)
  }

  /// Returns the start of the `Unicode.Scalar` ending at `i`: the scalar
  /// before the one starting at `i`, or the last scalar when `i` is the end
  /// of the span.
  ///
  /// `i` must be scalar-aligned. Traps if `i - 1` is out of bounds.
  internal func _previousScalarStart(_ i: Int) -> Int {
    precondition(_boundsCheck(i &- 1))
    return _previousScalarStart(unchecked: i)
  }

  /// Returns the start of the `Unicode.Scalar` ending at `i`: the scalar
  /// before the one starting at `i`, or the last scalar when `i` is the end
  /// of the span.
  ///
  /// `i` must be scalar-aligned; this is checked with a precondition.
  ///
  /// This function does not validate that `i` is within the span's bounds;
  /// this is an unsafe operation.
  internal func _previousScalarStart(unchecked i: Int) -> Int {
    _internalInvariant(_boundsCheck(i &- 1))
    precondition(_isScalarAligned(unchecked: i))
    return _previousScalarStart(uncheckedAssumingAligned: i)
  }

  /// Returns the start of the `Unicode.Scalar` ending at `i`: the scalar
  /// before the one starting at `i`, or the last scalar when `i` is the end
  /// of the span.
  ///
  /// `i` must be scalar-aligned.
  ///
  /// This function does not validate that `i` is within the span's bounds;
  /// this is an unsafe operation.
  ///
  /// This function does not validate that `i` is scalar-aligned; this is an
  /// unsafe operation if `i` isn't.
  internal func _previousScalarStart(
    uncheckedAssumingAligned i: Int
  ) -> Int {
    _internalInvariant(_boundsCheck(i &- 1))
    _internalInvariant(_isScalarAligned(unchecked: i))
    return unsafe _start()._previousScalarStart(i)
  }

  /// Decodes the `Unicode.Scalar` starting at `i`, returning it together
  /// with the start of the next scalar.
  ///
  /// `i` must be scalar-aligned.
  ///
  /// This function does not validate that `i` is within the span's bounds;
  /// this is an unsafe operation.
  ///
  /// This function does not validate that `i` is scalar-aligned; this is an
  /// unsafe operation if `i` isn't.
  internal func _decodeNextScalar(
    uncheckedAssumingAligned i: Int
  ) -> (Unicode.Scalar, nextScalarStart: Int) {
    _internalInvariant(_boundsCheck(i))
    _internalInvariant(_isScalarAligned(unchecked: i))
    return unsafe _start()._decodeScalar(startingAt: i)
  }

  /// Decodes the `Unicode.Scalar` ending at `i` (the previous scalar),
  /// returning it together with the start of that scalar.
  ///
  /// `i` must be scalar-aligned. Traps if `i - 1` is out of bounds.
  internal func _decodePreviousScalar(
    _ i: Int
  ) -> (Unicode.Scalar, previousScalarStart: Int) {
    precondition(_boundsCheck(i &- 1))
    return _decodePreviousScalar(unchecked: i)
  }

  /// Decodes the `Unicode.Scalar` ending at `i` (the previous scalar),
  /// returning it together with the start of that scalar.
  ///
  /// `i` must be scalar-aligned; this is checked with a precondition.
  ///
  /// This function does not validate that `i` is within the span's bounds;
  /// this is an unsafe operation.
  internal func _decodePreviousScalar(
    unchecked i: Int
  ) -> (Unicode.Scalar, previousScalarStart: Int) {
    _internalInvariant(_boundsCheck(i &- 1))
    precondition(_isScalarAligned(unchecked: i))
    return _decodePreviousScalar(uncheckedAssumingAligned: i)
  }

  /// Decodes the `Unicode.Scalar` ending at `i` (the previous scalar),
  /// returning it together with the start of that scalar.
  ///
  /// `i` must be scalar-aligned.
  ///
  /// This function does not validate that `i` is within the span's bounds;
  /// this is an unsafe operation.
  ///
  /// This function does not validate that `i` is scalar-aligned; this is an
  /// unsafe operation if `i` isn't.
  internal func _decodePreviousScalar(
    uncheckedAssumingAligned i: Int
  ) -> (Unicode.Scalar, previousScalarStart: Int) {
    _internalInvariant(_boundsCheck(i &- 1))
    _internalInvariant(_isScalarAligned(unchecked: i))
    return unsafe _start()._decodeScalar(endingAt: i)
  }
}
|
||||
|
||||
// Derived Scalar API
|
||||
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
  /// Find the nearest scalar-aligned position `<= i`.
  ///
  /// Offsets `0` and `count` are returned unchanged. Any other offset must
  /// be within the span's bounds; this is checked with a precondition.
  internal func _scalarAlignBackwards(_ i: Int) -> Int {
    if i == count || i == 0 { return i }

    precondition(_boundsCheck(i))
    return unsafe _start()._scalarAlign(i)
  }

  /// Find the nearest scalar-aligned position `>= i`.
  ///
  /// Offsets `0` and `count` are returned unchanged. Any other offset must
  /// be within the span's bounds; this is checked with a precondition,
  /// mirroring `_scalarAlignBackwards(_:)`, so an out-of-range offset traps
  /// instead of reading memory outside the span.
  internal func _scalarAlignForwards(_ i: Int) -> Int {
    if i == count || i == 0 { return i }

    precondition(_boundsCheck(i))
    // Starting from an in-bounds offset, the forward scan cannot run off the
    // end: `_isScalarAligned(unchecked:)` reports `count` as aligned without
    // loading a byte.
    // FIXME: implement directly
    return _scalarAlignForwards(unchecked: i)
  }

  /// Find the nearest scalar-aligned position `>= i`.
  ///
  /// This function does not validate that `i` is within the span's bounds;
  /// this is an unsafe operation.
  internal func _scalarAlignForwards(unchecked i: Int) -> Int {
    if i == count || i == 0 { return i }

    var i = i
    while _slowPath(!_isScalarAligned(unchecked: i)) {
      i &+= 1
    }
    return i
  }
}
|
||||
|
||||
// Core Character API
|
||||
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
  /// Returns the start of the next `Character` (i.e. grapheme cluster) after
  /// the one starting at `i`, or the end of the span if `i` denotes the
  /// final `Character`.
  ///
  /// `i` must be `Character`-aligned. Traps if `i` is out of bounds.
  internal func _nextCharacterStart(_ i: Int) -> Int {
    precondition(_boundsCheck(i))
    return _nextCharacterStart(unchecked: i)
  }

  /// Returns the start of the next `Character` (i.e. grapheme cluster) after
  /// the one starting at `i`, or the end of the span if `i` denotes the
  /// final `Character`.
  ///
  /// `i` must be `Character`-aligned; scalar alignment of `i` is checked
  /// with a precondition.
  ///
  /// This function does not validate that `i` is within the span's bounds;
  /// this is an unsafe operation.
  internal func _nextCharacterStart(unchecked i: Int) -> Int {
    _internalInvariant(_boundsCheck(i))
    precondition(_isScalarAligned(unchecked: i))
    return _nextCharacterStart(uncheckedAssumingAligned: i)
  }

  /// Returns the start of the next `Character` (i.e. grapheme cluster) after
  /// the one starting at `i`, or the end of the span if `i` denotes the
  /// final `Character`.
  ///
  /// `i` must be `Character`-aligned.
  ///
  /// This function does not validate that `i` is within the span's bounds;
  /// this is an unsafe operation.
  ///
  /// This function does not validate that `i` is `Character`-aligned; this
  /// is an unsafe operation if `i` isn't.
  internal func _nextCharacterStart(
    uncheckedAssumingAligned i: Int
  ) -> Int {
    _internalInvariant(_boundsCheck(i))
    _internalInvariant(_isScalarAligned(unchecked: i))
    return unsafe _start()._nextCharacterStart(i, limitedBy: count)
  }

  /// Returns the start of the `Character` (i.e. grapheme cluster) ending at
  /// `i`: the `Character` before the one starting at `i`, or the last
  /// `Character` if `i` is the end of the span.
  ///
  /// `i` must be `Character`-aligned. Traps if `i - 1` is out of bounds.
  internal func _previousCharacterStart(_ i: Int) -> Int {
    precondition(_boundsCheck(i &- 1))
    return _previousCharacterStart(unchecked: i)
  }

  /// Returns the start of the `Character` (i.e. grapheme cluster) ending at
  /// `i`: the `Character` before the one starting at `i`, or the last
  /// `Character` if `i` is the end of the span.
  ///
  /// `i` must be `Character`-aligned; scalar alignment of `i` is checked
  /// with a precondition.
  ///
  /// This function does not validate that `i` is within the span's bounds;
  /// this is an unsafe operation.
  internal func _previousCharacterStart(unchecked i: Int) -> Int {
    _internalInvariant(_boundsCheck(i &- 1))
    precondition(_isScalarAligned(unchecked: i))
    return _previousCharacterStart(uncheckedAssumingAligned: i)
  }

  /// Returns the start of the `Character` (i.e. grapheme cluster) ending at
  /// `i`: the `Character` before the one starting at `i`, or the last
  /// `Character` if `i` is the end of the span.
  ///
  /// `i` must be `Character`-aligned.
  ///
  /// This function does not validate that `i` is within the span's bounds;
  /// this is an unsafe operation.
  ///
  /// This function does not validate that `i` is `Character`-aligned; this
  /// is an unsafe operation if `i` isn't.
  internal func _previousCharacterStart(
    uncheckedAssumingAligned i: Int
  ) -> Int {
    _internalInvariant(_boundsCheck(i &- 1))
    _internalInvariant(_isScalarAligned(unchecked: i))
    return unsafe _start()._previousCharacterStart(i, limitedBy: count)
  }

  /// Decodes the `Character` starting at `i`. Returns it together with the
  /// start of the next `Character`.
  ///
  /// `i` must be `Character`-aligned. Traps if `i` is out of bounds.
  internal func _decodeNextCharacter(
    _ i: Int
  ) -> (Character, nextCharacterStart: Int) {
    precondition(_boundsCheck(i))
    return _decodeNextCharacter(unchecked: i)
  }

  /// Decodes the `Character` starting at `i`. Returns it together with the
  /// start of the next `Character`.
  ///
  /// `i` must be `Character`-aligned; scalar alignment of `i` is checked
  /// with a precondition.
  ///
  /// This function does not validate that `i` is within the span's bounds;
  /// this is an unsafe operation.
  internal func _decodeNextCharacter(
    unchecked i: Int
  ) -> (Character, nextCharacterStart: Int) {
    _internalInvariant(_boundsCheck(i))
    precondition(_isScalarAligned(unchecked: i))
    return _decodeNextCharacter(uncheckedAssumingAligned: i)
  }

  /// Decodes the `Character` starting at `i`. Returns it together with the
  /// start of the next `Character`.
  ///
  /// `i` must be `Character`-aligned.
  ///
  /// This function does not validate that `i` is within the span's bounds;
  /// this is an unsafe operation.
  ///
  /// This function does not validate that `i` is `Character`-aligned; this
  /// is an unsafe operation if `i` isn't.
  internal func _decodeNextCharacter(
    uncheckedAssumingAligned i: Int
  ) -> (Character, nextCharacterStart: Int) {
    _internalInvariant(_boundsCheck(i))
    _internalInvariant(_isScalarAligned(unchecked: i))
    return unsafe _start()._decodeCharacter(
      startingAt: i, limitedBy: count)
  }

  /// Decodes the `Character` (i.e. grapheme cluster) ending at `i`, i.e. the
  /// previous `Character`. Returns it together with the start of that
  /// `Character`.
  ///
  /// `i` must be `Character`-aligned. Traps if `i - 1` is out of bounds.
  internal func _decodePreviousCharacter(_ i: Int) -> (Character, Int) {
    precondition(_boundsCheck(i &- 1))
    return _decodePreviousCharacter(unchecked: i)
  }

  /// Decodes the `Character` (i.e. grapheme cluster) ending at `i`, i.e. the
  /// previous `Character`. Returns it together with the start of that
  /// `Character`.
  ///
  /// `i` must be `Character`-aligned; scalar alignment of `i` is checked
  /// with a precondition.
  ///
  /// This function does not validate that `i` is within the span's bounds;
  /// this is an unsafe operation.
  internal func _decodePreviousCharacter(
    unchecked i: Int
  ) -> (Character, Int) {
    _internalInvariant(_boundsCheck(i &- 1))
    precondition(_isScalarAligned(unchecked: i))
    return _decodePreviousCharacter(uncheckedAssumingAligned: i)
  }

  /// Decodes the `Character` (i.e. grapheme cluster) ending at `i`, i.e. the
  /// previous `Character`. Returns it together with the start of that
  /// `Character`.
  ///
  /// `i` must be `Character`-aligned.
  ///
  /// This function does not validate that `i` is within the span's bounds;
  /// this is an unsafe operation.
  ///
  /// This function does not validate that `i` is `Character`-aligned; this
  /// is an unsafe operation if `i` isn't.
  internal func _decodePreviousCharacter(
    uncheckedAssumingAligned i: Int
  ) -> (Character, Int) {
    _internalInvariant(_boundsCheck(i &- 1))
    _internalInvariant(_isScalarAligned(unchecked: i))
    return unsafe _start()._decodeCharacter(
      endingAt: i, limitedBy: count)
  }
}
|
||||
|
||||
// TODO: internal?
|
||||
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
  /// Whether `i` is a valid code unit offset, i.e. `0 <= i < count`.
  @_alwaysEmitIntoClient
  internal func _boundsCheck(_ i: Int) -> Bool {
    0 <= i && i < count
  }

  /// Whether `bounds` is in bounds.
  ///
  /// Implemented by checking that both the first offset (`lowerBound`) and
  /// the last offset (`upperBound - 1`) are valid code unit offsets.
  // NOTE(review): with this formulation an empty range is only accepted when
  // `lowerBound - 1` is also a valid offset (so `0..<0` is rejected) —
  // confirm that this is what callers expect.
  @_alwaysEmitIntoClient
  internal func _boundsCheck(_ bounds: Range<Int>) -> Bool {
    let first = bounds.lowerBound
    let last = bounds.upperBound &- 1
    return _boundsCheck(first) && _boundsCheck(last)
  }
}
|
||||
|
||||
// Future work: UTF-16 support when we get views
|
||||
|
||||
|
||||
179
stdlib/public/core/UTF8SpanInternalHelpers.swift
Normal file
179
stdlib/public/core/UTF8SpanInternalHelpers.swift
Normal file
@@ -0,0 +1,179 @@
|
||||
/*
|
||||
|
||||
Additional helpers built on stdlibDuplicates.swift
|
||||
|
||||
*/
|
||||
|
||||
// TODO: Should we update our unicode helpers file to call these instead?
|
||||
|
||||
// import Builtin
|
||||
|
||||
extension UnsafeRawPointer {
  /// Loads the byte at offset `i` from this pointer.
  ///
  /// `i` must be non-negative (checked only by an internal invariant).
  // @_alwaysEmitIntoClient
  internal func _loadByte(_ i: Int) -> UInt8 {
    _internalInvariant(i >= 0)
    return unsafe (self + i).loadUnaligned(as: UInt8.self)
  }

  /// Whether the byte at offset `i` is a UTF-8 continuation byte.
  // @_alwaysEmitIntoClient
  internal func _isUTF8Continuation(_ i: Int) -> Bool {
    unsafe UTF8.isContinuation(_loadByte(i))
  }

  /// Whether offset `i` is scalar-aligned, i.e. the byte at `i` is not a
  /// UTF-8 continuation byte.
  // @_alwaysEmitIntoClient
  internal func _isScalarAligned(_ i: Int) -> Bool {
    _internalInvariant(i >= 0)
    return unsafe !_isUTF8Continuation(i)
  }

  /// The encoded length, in bytes, of the scalar whose leading byte is at
  /// offset `i`.
  // @_alwaysEmitIntoClient
  internal func _scalarLength(startingAt i: Int) -> Int {
    unsafe _utf8ScalarLength(_loadByte(i))
  }

  /// Decodes the scalar whose leading byte is at offset `i`, returning it
  /// together with the offset just past its last byte.
  // NOTE: Adaptation of `_decodeScalar` to work on URP
  // @_alwaysEmitIntoClient
  internal func _decodeScalar(
    startingAt i: Int
  ) -> (Unicode.Scalar, nextScalarStart: Int) {
    let lead = unsafe _loadByte(i)
    let length = _utf8ScalarLength(lead)
    let nextStart = length &+ i

    // The trailing bytes are only loaded once the length says they exist.
    let scalar: Unicode.Scalar
    switch length {
    case 1:
      scalar = _decodeUTF8(lead)
    case 2:
      scalar = unsafe _decodeUTF8(lead, _loadByte(i &+ 1))
    case 3:
      scalar = unsafe _decodeUTF8(
        lead, _loadByte(i &+ 1), _loadByte(i &+ 2))
    case 4:
      scalar = unsafe _decodeUTF8(
        lead, _loadByte(i &+ 1), _loadByte(i &+ 2), _loadByte(i &+ 3))
    default:
      Builtin.unreachable()
    }
    return (scalar, nextStart)
  }

  /// Decodes the scalar that ends at offset `i`, returning it together with
  /// the offset of its leading byte.
  // @_alwaysEmitIntoClient
  internal func _decodeScalar(
    endingAt i: Int
  ) -> (Unicode.Scalar, previousScalarStart: Int) {
    // TODO: no need to double load the bytes...
    let scalarStart = unsafe _previousScalarStart(i)
    let scalar = unsafe _decodeScalar(startingAt: scalarStart).0
    return (scalar, scalarStart)
  }

  /// Returns the offset of the leading byte of the scalar that ends at
  /// offset `i`, found by stepping back over continuation bytes.
  // @_alwaysEmitIntoClient
  internal func _previousScalarStart(_ i: Int) -> Int {
    var start = i &- 1
    _internalInvariant(start >= 0)
    while unsafe _isUTF8Continuation(start) {
      start &-= 1
      _internalInvariant(start >= 0)
    }
    // The scalar found must end exactly at `i`.
    _internalInvariant(
      unsafe i == start + _utf8ScalarLength(_loadByte(start)))
    return start
  }

  /// Returns the nearest scalar-aligned offset `<= i`, found by stepping
  /// back while the byte at the current offset is a continuation byte.
  // @_alwaysEmitIntoClient
  internal func _scalarAlign(_ i: Int) -> Int {
    var aligned = i
    while _slowPath(unsafe !_isScalarAligned(aligned)) {
      aligned &-= 1
    }
    return aligned
  }
}
|
||||
|
||||
extension UnsafeRawPointer {
  // TODO: ASCII fast path wrappers around ufi functions

  // TODO: hook up to real grapheme breaking

  /// A raw buffer covering the byte offsets in `range`, relative to this
  /// pointer.
  internal func _urbp(_ range: Range<Int>) -> UnsafeRawBufferPointer {
    unsafe UnsafeRawBufferPointer(
      start: self + range.lowerBound, count: range.count)
  }

  /// A `UInt8` buffer covering the byte offsets in `range`, relative to this
  /// pointer.
  @_alwaysEmitIntoClient
  internal func _ubp(_ range: Range<Int>) -> UnsafeBufferPointer<UInt8> {
    unsafe UnsafeBufferPointer<UInt8>(
      start: UnsafePointer((self + range.lowerBound)._rawValue),
      count: range.count)
  }

  /// A `String` created by decoding the bytes in `range` as UTF-8.
  internal func _str(_ range: Range<Int>) -> String {
    unsafe String(decoding: _urbp(range), as: UTF8.self)
  }

  /// Returns the offset of the grapheme cluster boundary following the
  /// `Character` that starts at `i`. Scalars are only decoded at offsets
  /// below `end`.
  ///
  /// `i` must be within `0..<end` and scalar-aligned (checked only by
  /// internal invariants).
  // @usableFromInline
  internal func _nextCharacterStart(
    _ i: Int, limitedBy end: Int
  ) -> Int {
    _internalInvariant((0..<end).contains(i))
    _internalInvariant(unsafe _isScalarAligned(i))

    return _nextGraphemeClusterBoundary(startingAt: i) { offset in
      // Stop feeding scalars once the limit is reached.
      guard offset < end else { return nil }
      let (scalar, nextOffset) = unsafe _decodeScalar(startingAt: offset)
      return (scalar, nextOffset)
    }
  }

  /// Returns the offset of the grapheme cluster boundary preceding the
  /// `Character` that ends at `i`. Scalars are only decoded at offsets
  /// above `0`.
  ///
  /// `i` must satisfy `0 < i <= end` and be either `end` or scalar-aligned
  /// (checked only by internal invariants).
  // @usableFromInline
  internal func _previousCharacterStart(
    _ i: Int,
    limitedBy end: Int
  ) -> Int {
    _internalInvariant(i > 0 && i <= end)
    _internalInvariant(unsafe i == end || _isScalarAligned(i))

    return _previousGraphemeClusterBoundary(endingAt: i) { offset in
      // Stop feeding scalars once the start is reached.
      guard offset > 0 else { return nil }
      let (scalar, priorOffset) = unsafe _decodeScalar(endingAt: offset)
      return (scalar, priorOffset)
    }
  }

  /// Decodes the `Character` starting at `i`, returning it together with
  /// the start of the next `Character`.
  // @usableFromInline
  internal func _decodeCharacter(
    startingAt i: Int, limitedBy end: Int
  ) -> (Character, nextCharacterStart: Int) {
    let characterEnd = unsafe _nextCharacterStart(i, limitedBy: end)
    let character = unsafe Character(_str(i..<characterEnd))
    return (character, characterEnd)
  }

  /// Decodes the `Character` ending at `i`, returning it together with the
  /// start of that `Character`.
  ///
  /// Note: the second tuple element is labeled `nextCharacterStart`, but the
  /// value returned is the start offset of the decoded (previous)
  /// `Character`.
  // @usableFromInline
  internal func _decodeCharacter(
    endingAt i: Int,
    limitedBy end: Int
  ) -> (Character, nextCharacterStart: Int) {
    let characterStart = unsafe _previousCharacterStart(i, limitedBy: end)
    _internalInvariant(characterStart >= 0)

    let character = unsafe Character(_str(characterStart..<i))
    return (character, characterStart)
  }
}
|
||||
|
||||
@available(SwiftStdlib 6.2, *)
extension UnsafeRawPointer {
  /// Outcome of a UTF-8 validation: success (with whether the content was
  /// all-ASCII) or the range of an error.
  internal enum _UTF8ValidationResult {
    case success(isASCII: Bool)
    case error(_: Range<Int>)
  }

  /// Validates the bytes at offsets `0..<end` as UTF-8.
  ///
  /// - Parameter end: The number of bytes, starting at this pointer, to
  ///   validate.
  /// - Returns: Whether the validated content is all-ASCII.
  /// - Throws: `UTF8.ValidationError`, carrying the error kind and byte
  ///   range reported by validation, if the bytes are not valid UTF-8.
  // TODO: return more values
  internal func _validateUTF8(
    limitedBy end: Int
  ) throws(UTF8.ValidationError) -> Bool {
    let outcome = unsafe validateUTF8(_ubp(0..<end))
    switch outcome {
    case .success(let info):
      return info.isASCII
    case .error(let kind, let range):
      throw UTF8.ValidationError(kind._publicKind, range)
    }
  }
}
|
||||
391
stdlib/public/core/UTF8SpanIterators.swift
Normal file
391
stdlib/public/core/UTF8SpanIterators.swift
Normal file
@@ -0,0 +1,391 @@
|
||||
@available(SwiftStdlib 6.2, *)
|
||||
extension UTF8Span {
|
||||
/// Returns an iterator that will decode the code units into
|
||||
/// `Unicode.Scalar`s.
|
||||
///
|
||||
/// The resulting iterator has the same lifetime constraints as `self`.
|
||||
@lifetime(copy self)
|
||||
public func makeUnicodeScalarIterator() -> UnicodeScalarIterator {
|
||||
.init(self)
|
||||
}
|
||||
|
||||
/// Iterate the `Unicode.Scalar`s contents of a `UTF8Span`.
|
||||
///
|
||||
/// **TODO**: Examples
|
||||
@frozen
|
||||
public struct UnicodeScalarIterator: ~Escapable {
|
||||
public let codeUnits: UTF8Span
|
||||
|
||||
/// The byte offset of the start of the next scalar. This is
|
||||
/// always scalar-aligned.
|
||||
fileprivate(set)
|
||||
public var currentCodeUnitOffset: Int
|
||||
|
||||
@lifetime(copy codeUnits)
|
||||
public init(_ codeUnits: UTF8Span) {
|
||||
self.codeUnits = codeUnits
|
||||
self.currentCodeUnitOffset = 0
|
||||
}
|
||||
|
||||
private var _start: UnsafeRawPointer {
|
||||
unsafe codeUnits._start()
|
||||
}
|
||||
|
||||
/// Decode and return the scalar starting at `currentCodeUnitOffset`.
|
||||
/// After the function returns, `currentCodeUnitOffset` holds the
|
||||
/// position at the end of the returned scalar, which is also the start
|
||||
/// of the next scalar.
|
||||
///
|
||||
/// Returns `nil` if at the end of the `UTF8Span`.
|
||||
@lifetime(self: copy self)
|
||||
public mutating func next() -> Unicode.Scalar? {
|
||||
guard currentCodeUnitOffset < codeUnits.count else {
|
||||
return nil
|
||||
}
|
||||
|
||||
_internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))
|
||||
let (result, newPos) = unsafe _start._decodeScalar(startingAt: currentCodeUnitOffset)
|
||||
self.currentCodeUnitOffset = newPos
|
||||
return result
|
||||
}
|
||||
|
||||
/// Decode and return the scalar ending at `currentCodeUnitOffset`. After
|
||||
/// the function returns, `currentCodeUnitOffset` holds the position at
|
||||
/// the start of the returned scalar, which is also the end of the
|
||||
/// previous scalar.
|
||||
///
|
||||
/// Returns `nil` if at the start of the `UTF8Span`.
|
||||
@lifetime(self: copy self)
|
||||
public mutating func previous() -> Unicode.Scalar? {
|
||||
guard currentCodeUnitOffset > 0 else {
|
||||
return nil
|
||||
}
|
||||
|
||||
_internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))
|
||||
let (result, newPos) = unsafe _start._decodeScalar(endingAt: currentCodeUnitOffset)
|
||||
self.currentCodeUnitOffset = newPos
|
||||
return result
|
||||
}
|
||||
|
||||
|
||||
/// Advance `codeUnitOffset` to the end of the current scalar, without
|
||||
/// decoding it.
|
||||
///
|
||||
/// Returns the number of `Unicode.Scalar`s skipped over, which can be 0
|
||||
/// if at the end of the UTF8Span.
|
||||
@lifetime(self: copy self)
|
||||
public mutating func skipForward() -> Int {
|
||||
guard currentCodeUnitOffset < codeUnits.count else {
|
||||
return 0
|
||||
}
|
||||
|
||||
_internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))
|
||||
|
||||
currentCodeUnitOffset &+= unsafe _start._scalarLength(startingAt: currentCodeUnitOffset)
|
||||
return 1
|
||||
}
|
||||
|
||||
/// Advance `codeUnitOffset` to the end of `n` scalars, without decoding
|
||||
/// them.
|
||||
///
|
||||
/// Returns the number of `Unicode.Scalar`s skipped over, which can be
|
||||
/// fewer than `n` if at the end of the UTF8Span.
|
||||
@lifetime(self: copy self)
|
||||
public mutating func skipForward(by n: Int) -> Int {
|
||||
var numSkipped = 0
|
||||
while numSkipped < n && skipForward() != 0 {
|
||||
numSkipped += 1
|
||||
}
|
||||
|
||||
return numSkipped
|
||||
}
|
||||
|
||||
/// Move `codeUnitOffset` to the start of the previous scalar, without
|
||||
/// decoding it.
|
||||
///
|
||||
/// Returns the number of `Unicode.Scalar`s skipped over, which can be 0
|
||||
/// if at the start of the UTF8Span.
|
||||
@lifetime(self: copy self)
|
||||
public mutating func skipBack() -> Int {
|
||||
guard currentCodeUnitOffset > 0 else {
|
||||
return 0
|
||||
}
|
||||
|
||||
_internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))
|
||||
|
||||
currentCodeUnitOffset = unsafe _start._previousScalarStart(currentCodeUnitOffset)
|
||||
return 1
|
||||
}
|
||||
|
||||
/// Move `codeUnitOffset` to the start of the previous `n` scalars,
|
||||
/// without decoding them.
|
||||
///
|
||||
/// Returns the number of `Unicode.Scalar`s skipped over, which can be
|
||||
/// fewer than `n` if at the start of the UTF8Span.
|
||||
@lifetime(self: copy self)
|
||||
public mutating func skipBack(by n: Int) -> Int {
|
||||
var numSkipped = 0
|
||||
while numSkipped < n && skipBack() != 0 {
|
||||
numSkipped += 1
|
||||
}
|
||||
|
||||
return numSkipped
|
||||
}
|
||||
|
||||
/// Reset to the nearest scalar-aligned code unit offset `<= i`.
|
||||
///
|
||||
/// **TODO**: Example
|
||||
@lifetime(self: copy self)
|
||||
public mutating func reset(roundingBackwardsFrom i: Int) {
|
||||
self.currentCodeUnitOffset = codeUnits._scalarAlignBackwards(i)
|
||||
}
|
||||
|
||||
/// Reset to the nearest scalar-aligned code unit offset `>= i`.
|
||||
///
|
||||
/// **TODO**: Example
|
||||
@lifetime(self: copy self)
|
||||
public mutating func reset(roundingForwardsFrom i: Int) {
|
||||
self.currentCodeUnitOffset = codeUnits._scalarAlignForwards(i)
|
||||
}
|
||||
|
||||
/// Reset this iterator to `codeUnitOffset`, skipping _all_ safety
|
||||
/// checks (including bounds checks).
|
||||
///
|
||||
/// Note: This is only for very specific, low-level use cases. If
|
||||
/// `codeUnitOffset` is not properly scalar-aligned, this function can
|
||||
/// result in undefined behavior when, e.g., `next()` is called.
|
||||
///
|
||||
/// TODO: verify that we're not UB, just garabage-data or guaranteed
|
||||
/// trap!
|
||||
///
|
||||
/// For example, this could be used by a regex engine to backtrack to a
|
||||
/// known-valid previous position.
|
||||
///
|
||||
@unsafe
|
||||
@lifetime(self: copy self)
|
||||
public mutating func reset(toUnchecked codeUnitOffset: Int) {
|
||||
_internalInvariant(codeUnits._isScalarAligned(unchecked: codeUnitOffset))
|
||||
self.currentCodeUnitOffset = codeUnitOffset
|
||||
}
|
||||
|
||||
/// Returns a `UTF8Span` over the code units in
/// `0..<currentCodeUnitOffset`, i.e. everything before the iterator's
/// current position.
///
/// The `isKnownASCII` and `isKnownNFC` flags of `codeUnits` are carried
/// over to the result. The bytes are not re-validated.
///
/// The resultant `UTF8Span` has the same lifetime constraints as `self`.
@lifetime(copy self)
public func prefix() -> UTF8Span {
  let consumedRange = 0..<currentCodeUnitOffset
  let leadingBytes = codeUnits.span._extracting(consumedRange)
  return UTF8Span(
    _uncheckedAssumingValidUTF8: leadingBytes,
    isKnownASCII: codeUnits.isKnownASCII,
    isKnownNFC: codeUnits.isKnownNFC)
}
|
||||
|
||||
/// Returns a `UTF8Span` over the code units in
/// `currentCodeUnitOffset..<codeUnits.count`, i.e. everything from the
/// iterator's current position to the end.
///
/// The `isKnownASCII` and `isKnownNFC` flags of `codeUnits` are carried
/// over to the result. The bytes are not re-validated.
///
/// The resultant `UTF8Span` has the same lifetime constraints as `self`.
@lifetime(copy self)
public func suffix() -> UTF8Span {
  let remainingRange = currentCodeUnitOffset..<codeUnits.count
  let trailingBytes = codeUnits.span._extracting(remainingRange)
  return UTF8Span(
    _uncheckedAssumingValidUTF8: trailingBytes,
    isKnownASCII: codeUnits.isKnownASCII,
    isKnownNFC: codeUnits.isKnownNFC)
}
|
||||
}
|
||||
}
|
||||
|
||||
@available(SwiftStdlib 6.2, *)
@_unavailableInEmbedded
extension UTF8Span {
  /// Creates an iterator that constructs `Character`s from the underlying
  /// UTF-8 content, positioned at code unit offset 0.
  ///
  /// The resulting iterator has the same lifetime constraints as `self`.
  @lifetime(copy self)
  public func makeCharacterIterator() -> CharacterIterator {
    CharacterIterator(self)
  }

  /// Iterate the `Character` contents of a `UTF8Span`.
  ///
  /// **TODO**: Examples
  public struct CharacterIterator: ~Escapable {
    /// The span whose content is being iterated.
    public let codeUnits: UTF8Span

    /// The byte offset of the start of the next `Character`. This is always
    /// scalar-aligned. It is always `Character`-aligned relative to the last
    /// call to `reset` (or the start of the span if not called).
    fileprivate(set)
    public var currentCodeUnitOffset: Int

    /// Creates an iterator over `codeUnits`, starting at offset 0.
    @lifetime(copy codeUnits)
    public init(_ codeUnits: UTF8Span) {
      self.codeUnits = codeUnits
      self.currentCodeUnitOffset = 0
    }

    /// Raw pointer obtained from `codeUnits._start()`; the decoding helpers
    /// below are invoked on it.
    private var _start: UnsafeRawPointer {
      get {
        return unsafe codeUnits._start()
      }
    }

    /// Return the `Character` starting at `currentCodeUnitOffset`. After the
    /// function returns, `currentCodeUnitOffset` holds the position at the
    /// end of the `Character`, which is also the start of the next
    /// `Character`.
    ///
    /// Returns `nil` if at the end of the `UTF8Span`.
    @lifetime(self: copy self)
    public mutating func next() -> Character? {
      if currentCodeUnitOffset >= codeUnits.count { return nil }

      _internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))
      let (character, endOfCharacter) = unsafe _start._decodeCharacter(
        startingAt: currentCodeUnitOffset,
        limitedBy: codeUnits.count
      )
      currentCodeUnitOffset = endOfCharacter
      return character
    }

    /// Return the `Character` ending at `currentCodeUnitOffset`. After the
    /// function returns, `currentCodeUnitOffset` holds the position at the
    /// start of the returned `Character`, which is also the end of the
    /// previous `Character`.
    ///
    /// Returns `nil` if at the start of the `UTF8Span`.
    @lifetime(self: copy self)
    public mutating func previous() -> Character? {
      if currentCodeUnitOffset <= 0 { return nil }

      _internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))
      let (character, startOfCharacter) = unsafe _start._decodeCharacter(
        endingAt: currentCodeUnitOffset,
        limitedBy: codeUnits.count)
      currentCodeUnitOffset = startOfCharacter
      return character
    }

    /// Advance `currentCodeUnitOffset` to the end of the current
    /// `Character`, without constructing it.
    ///
    /// Returns the number of `Character`s skipped over, which can be 0
    /// if at the end of the UTF8Span.
    @lifetime(self: copy self)
    public mutating func skipForward() -> Int {
      if currentCodeUnitOffset >= codeUnits.count {
        return 0
      }

      _internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))

      let nextStart = unsafe _start._nextCharacterStart(
        currentCodeUnitOffset, limitedBy: codeUnits.count)
      currentCodeUnitOffset = nextStart
      return 1
    }

    /// Advance `currentCodeUnitOffset` to the end of `n` `Character`s,
    /// without constructing them.
    ///
    /// Returns the number of `Character`s skipped over, which can be
    /// fewer than `n` if at the end of the UTF8Span.
    @lifetime(self: copy self)
    public mutating func skipForward(by n: Int) -> Int {
      var skippedSoFar = 0
      while skippedSoFar < n {
        // A zero result from the single-step skip means the iterator did
        // not move, so stop early.
        guard skipForward() != 0 else { break }
        skippedSoFar += 1
      }
      return skippedSoFar
    }

    /// Move `currentCodeUnitOffset` to the start of the previous
    /// `Character`, without constructing it.
    ///
    /// Returns the number of `Character`s skipped over, which can be 0
    /// if at the start of the UTF8Span.
    @lifetime(self: copy self)
    public mutating func skipBack() -> Int {
      if currentCodeUnitOffset <= 0 {
        return 0
      }

      _internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))

      let previousStart = unsafe _start._previousCharacterStart(
        currentCodeUnitOffset, limitedBy: codeUnits.count)
      currentCodeUnitOffset = previousStart
      return 1
    }

    /// Move `currentCodeUnitOffset` to the start of the previous `n`
    /// `Character`s, without constructing them.
    ///
    /// Returns the number of `Character`s skipped over, which can be
    /// fewer than `n` if at the start of the UTF8Span.
    @lifetime(self: copy self)
    public mutating func skipBack(by n: Int) -> Int {
      var skippedSoFar = 0
      while skippedSoFar < n {
        // A zero result from the single-step skip means the iterator did
        // not move, so stop early.
        guard skipBack() != 0 else { break }
        skippedSoFar += 1
      }
      return skippedSoFar
    }

    /// Reset to the nearest scalar-aligned code unit offset `<= i`.
    ///
    /// The position comes from `codeUnits._scalarAlignBackwards(i)`; no
    /// grapheme-boundary search is performed here. `Character` iteration
    /// continues relative to this new position.
    @lifetime(self: copy self)
    public mutating func reset(roundingBackwardsFrom i: Int) {
      let alignedOffset = codeUnits._scalarAlignBackwards(i)
      currentCodeUnitOffset = alignedOffset
    }

    /// Reset to the nearest scalar-aligned code unit offset `>= i`.
    ///
    /// The position comes from `codeUnits._scalarAlignForwards(i)`; no
    /// grapheme-boundary search is performed here. `Character` iteration
    /// continues relative to this new position.
    @lifetime(self: copy self)
    public mutating func reset(roundingForwardsFrom i: Int) {
      let alignedOffset = codeUnits._scalarAlignForwards(i)
      currentCodeUnitOffset = alignedOffset
    }

    /// Reset this iterator to `codeUnitOffset`, skipping _all_ safety
    /// checks.
    ///
    /// Note: This is only for very specific, low-level use cases. If
    /// `codeUnitOffset` is not properly scalar-aligned, this function can
    /// result in undefined behavior when, e.g., `next()` is called.
    ///
    /// If `codeUnitOffset` is scalar-aligned, but not `Character`-aligned,
    /// you may get different results from running `Character` iteration.
    ///
    /// For example, this could be used by a regex engine to backtrack to a
    /// known-valid previous position.
    ///
    @unsafe
    @lifetime(self: copy self)
    public mutating func reset(toUnchecked codeUnitOffset: Int) {
      let target = codeUnitOffset
      _internalInvariant(codeUnits._isScalarAligned(unchecked: target))
      currentCodeUnitOffset = target
    }

    /// Returns the UTF8Span containing all the content up to the iterator's
    /// current position, i.e. the code units in `0..<currentCodeUnitOffset`.
    ///
    /// The `isKnownASCII` and `isKnownNFC` flags of `codeUnits` are carried
    /// over to the result.
    @lifetime(copy self)
    public func prefix() -> UTF8Span {
      let consumedRange = 0..<currentCodeUnitOffset
      let leadingBytes = codeUnits.span._extracting(consumedRange)
      return UTF8Span(
        _uncheckedAssumingValidUTF8: leadingBytes,
        isKnownASCII: codeUnits.isKnownASCII,
        isKnownNFC: codeUnits.isKnownNFC)
    }

    /// Returns the UTF8Span containing all the content after the iterator's
    /// current position, i.e. the code units in
    /// `currentCodeUnitOffset..<codeUnits.count`.
    ///
    /// The `isKnownASCII` and `isKnownNFC` flags of `codeUnits` are carried
    /// over to the result.
    @lifetime(copy self)
    public func suffix() -> UTF8Span {
      let remainingRange = currentCodeUnitOffset..<codeUnits.count
      let trailingBytes = codeUnits.span._extracting(remainingRange)
      return UTF8Span(
        _uncheckedAssumingValidUTF8: trailingBytes,
        isKnownASCII: codeUnits.isKnownASCII,
        isKnownNFC: codeUnits.isKnownNFC)
    }
  }
}
|
||||
|
||||
|
||||
@@ -5,8 +5,8 @@
|
||||
// RUN: c-index-test -read-diagnostics %t.dia > %t.deserialized_diagnostics.txt 2>&1
|
||||
// RUN: %FileCheck --input-file=%t.deserialized_diagnostics.txt %s
|
||||
|
||||
var x = String.init // expected-error{{ambiguous use of 'init'}}
|
||||
// CHECK: {{.*[/\\]}}serialized-diagnostics-prettyprint.swift:[[@LINE-1]]:16: error: ambiguous use of 'init'
|
||||
var x = String.init(_:) // expected-error{{ambiguous use of 'init(_:)'}}
|
||||
// CHECK: {{.*[/\\]}}serialized-diagnostics-prettyprint.swift:[[@LINE-1]]:16: error: ambiguous use of 'init(_:)'
|
||||
|
||||
// CHECK: Swift.String.init:2:19: note: found this candidate
|
||||
// CHECK: CONTENTS OF FILE Swift.String.init:
|
||||
|
||||
@@ -814,6 +814,133 @@ Added: _$ss7RawSpanVMa
|
||||
Added: _$ss7RawSpanVMn
|
||||
Added: _$ss7RawSpanVN
|
||||
|
||||
// SE-0464 UTF8Span
|
||||
Added: _$sSS7copyingSSs8UTF8SpanV_tcfC
|
||||
Added: _$sSS8utf8Spans04UTF8B0Vvg
|
||||
Added: _$sSS8utf8Spans04UTF8B0VvpMV
|
||||
Added: _$sSs8utf8Spans04UTF8B0Vvg
|
||||
Added: _$sSs8utf8Spans04UTF8B0VvpMV
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11byteOffsetsSnySiGvM
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11byteOffsetsSnySiGvg
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11byteOffsetsSnySiGvpMV
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11byteOffsetsSnySiGvs
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11descriptionSSvg
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11descriptionSSvpMV
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV2eeoiySbAF_AFtFZ
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV11descriptionSSvg
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV11descriptionSSvpMV
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV15truncatedScalarAHvpZMV
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV20overlongEncodingByteAHvpZMV
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV22surrogateCodePointByteAHvpZMV
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV26unexpectedContinuationByteAHvpZMV
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV32invalidNonSurrogateCodePointByteAHvpZMV
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValueAHSgs5UInt8V_tcfC
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValues5UInt8VvM
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValues5UInt8Vvg
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValues5UInt8VvpMV
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValues5UInt8Vvs
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVMa
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVMn
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVN
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSHsMc
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSHsWP
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSQsMc
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSQsWP
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSYsMc
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSYsWP
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVs0D0sMc
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVs0D0sWP
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVs23CustomStringConvertiblesMc
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVs23CustomStringConvertiblesWP
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4hash4intoys6HasherVz_tF
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4kindAF4KindVvM
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4kindAF4KindVvg
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4kindAF4KindVvpMV
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4kindAF4KindVvs
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV9hashValueSivg
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV9hashValueSivpMV
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVMa
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVMn
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVN
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVSHsMc
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVSHsWP
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVSQsMc
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVSQsWP
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVs0D0sMc
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVs0D0sWP
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVs23CustomStringConvertiblesMc
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVs23CustomStringConvertiblesWP
|
||||
Added: _$ss7UnicodeO4UTF8O15_checkAllErrorsySayAD15ValidationErrorVGxSTRzs5UInt8V7ElementRtzlFZ
|
||||
Added: _$ss8UTF8SpanV9unchecked12isKnownASCIIABs0B0Vys5UInt8VG_SbtcfC
|
||||
Added: _$ss8UTF8SpanV10_countMasks6UInt64VvpZMV
|
||||
Added: _$ss8UTF8SpanV10_flagsMasks6UInt64VvpZMV
|
||||
Added: _$ss8UTF8SpanV10isKnownNFCSbvpMV
|
||||
Added: _$ss8UTF8SpanV10validatingABs0B0Vys5UInt8VG_ts7UnicodeO0A0O15ValidationErrorVYKcfC
|
||||
Added: _$ss8UTF8SpanV11checkForNFC10quickCheckS2b_tF
|
||||
Added: _$ss8UTF8SpanV12isKnownASCIISbvpMV
|
||||
Added: _$ss8UTF8SpanV13checkForASCIISbyF
|
||||
Added: _$ss8UTF8SpanV14_countAndFlagss6UInt64VvM
|
||||
Added: _$ss8UTF8SpanV14_countAndFlagss6UInt64Vvg
|
||||
Added: _$ss8UTF8SpanV14_countAndFlagss6UInt64VvpMV
|
||||
Added: _$ss8UTF8SpanV14_countAndFlagss6UInt64Vvs
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV11skipForward2byS2i_tF
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV11skipForwardSiyF
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV21currentCodeUnitOffsetSivg
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV21currentCodeUnitOffsetSivpMV
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV4nextSJSgyF
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV5reset20roundingForwardsFromySi_tF
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV5reset21roundingBackwardsFromySi_tF
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV5reset11toUncheckedySi_tF
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV6prefixAByF
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV6suffixAByF
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV8previousSJSgyF
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV8skipBack2byS2i_tF
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV8skipBackSiyF
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV9codeUnitsABvg
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV9codeUnitsABvpMV
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorVMa
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorVMn
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorVN
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorVyAdBcfC
|
||||
Added: _$ss8UTF8SpanV18_unsafeBaseAddressSVSgvM
|
||||
Added: _$ss8UTF8SpanV18_unsafeBaseAddressSVSgvg
|
||||
Added: _$ss8UTF8SpanV18_unsafeBaseAddressSVSgvpMV
|
||||
Added: _$ss8UTF8SpanV18_unsafeBaseAddressSVSgvs
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV11skipForward2byS2i_tF
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV11skipForwardSiyF
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV21currentCodeUnitOffsetSivg
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV21currentCodeUnitOffsetSivpMV
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV4nexts0C0O0D0VSgyF
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV5reset20roundingForwardsFromySi_tF
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV5reset21roundingBackwardsFromySi_tF
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV5reset11toUncheckedySi_tF
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV6prefixAByF
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV6suffixAByF
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV8previouss0C0O0D0VSgyF
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV8skipBack2byS2i_tF
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV8skipBackSiyF
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV9codeUnitsABvg
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV9codeUnitsABvpMV
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorVMa
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorVMn
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorVN
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorVyAdBcfC
|
||||
Added: _$ss8UTF8SpanV21isCanonicallyLessThanySbABF
|
||||
Added: _$ss8UTF8SpanV21makeCharacterIteratorAB0dE0VyF
|
||||
Added: _$ss8UTF8SpanV23isCanonicallyEquivalent2toSbAB_tF
|
||||
Added: _$ss8UTF8SpanV25makeUnicodeScalarIteratorAB0deF0VyF
|
||||
Added: _$ss8UTF8SpanV4spans0B0Vys5UInt8VGvg
|
||||
Added: _$ss8UTF8SpanV4spans0B0Vys5UInt8VGvpMV
|
||||
Added: _$ss8UTF8SpanV5countSivpMV
|
||||
Added: _$ss8UTF8SpanV7_nfcBits6UInt64VvpZMV
|
||||
Added: _$ss8UTF8SpanV7isEmptySbvg
|
||||
Added: _$ss8UTF8SpanV7isEmptySbvpMV
|
||||
Added: _$ss8UTF8SpanV9_asciiBits6UInt64VvpZMV
|
||||
Added: _$ss8UTF8SpanVMa
|
||||
Added: _$ss8UTF8SpanVMn
|
||||
Added: _$ss8UTF8SpanVN
|
||||
|
||||
|
||||
// SE-0467 MutableSpan and MutableRawSpan
|
||||
Added: _$ss11MutableSpanVMa
|
||||
Added: _$ss11MutableSpanVMn
|
||||
|
||||
@@ -815,6 +815,133 @@ Added: _$ss7RawSpanVMa
|
||||
Added: _$ss7RawSpanVMn
|
||||
Added: _$ss7RawSpanVN
|
||||
|
||||
// SE-0464 UTF8Span
|
||||
Added: _$sSS7copyingSSs8UTF8SpanV_tcfC
|
||||
Added: _$sSS8utf8Spans04UTF8B0Vvg
|
||||
Added: _$sSS8utf8Spans04UTF8B0VvpMV
|
||||
Added: _$sSs8utf8Spans04UTF8B0Vvg
|
||||
Added: _$sSs8utf8Spans04UTF8B0VvpMV
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11byteOffsetsSnySiGvM
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11byteOffsetsSnySiGvg
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11byteOffsetsSnySiGvpMV
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11byteOffsetsSnySiGvs
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11descriptionSSvg
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11descriptionSSvpMV
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV2eeoiySbAF_AFtFZ
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV11descriptionSSvg
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV11descriptionSSvpMV
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV15truncatedScalarAHvpZMV
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV20overlongEncodingByteAHvpZMV
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV22surrogateCodePointByteAHvpZMV
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV26unexpectedContinuationByteAHvpZMV
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV32invalidNonSurrogateCodePointByteAHvpZMV
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValueAHSgs5UInt8V_tcfC
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValues5UInt8VvM
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValues5UInt8Vvg
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValues5UInt8VvpMV
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValues5UInt8Vvs
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVMa
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVMn
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVN
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSHsMc
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSHsWP
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSQsMc
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSQsWP
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSYsMc
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSYsWP
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVs0D0sMc
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVs0D0sWP
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVs23CustomStringConvertiblesMc
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVs23CustomStringConvertiblesWP
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4hash4intoys6HasherVz_tF
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4kindAF4KindVvM
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4kindAF4KindVvg
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4kindAF4KindVvpMV
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4kindAF4KindVvs
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV9hashValueSivg
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV9hashValueSivpMV
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVMa
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVMn
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVN
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVSHsMc
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVSHsWP
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVSQsMc
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVSQsWP
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVs0D0sMc
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVs0D0sWP
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVs23CustomStringConvertiblesMc
|
||||
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVs23CustomStringConvertiblesWP
|
||||
Added: _$ss7UnicodeO4UTF8O15_checkAllErrorsySayAD15ValidationErrorVGxSTRzs5UInt8V7ElementRtzlFZ
|
||||
Added: _$ss8UTF8SpanV9unchecked12isKnownASCIIABs0B0Vys5UInt8VG_SbtcfC
|
||||
Added: _$ss8UTF8SpanV10_countMasks6UInt64VvpZMV
|
||||
Added: _$ss8UTF8SpanV10_flagsMasks6UInt64VvpZMV
|
||||
Added: _$ss8UTF8SpanV10isKnownNFCSbvpMV
|
||||
Added: _$ss8UTF8SpanV10validatingABs0B0Vys5UInt8VG_ts7UnicodeO0A0O15ValidationErrorVYKcfC
|
||||
Added: _$ss8UTF8SpanV11checkForNFC10quickCheckS2b_tF
|
||||
Added: _$ss8UTF8SpanV12isKnownASCIISbvpMV
|
||||
Added: _$ss8UTF8SpanV13checkForASCIISbyF
|
||||
Added: _$ss8UTF8SpanV14_countAndFlagss6UInt64VvM
|
||||
Added: _$ss8UTF8SpanV14_countAndFlagss6UInt64Vvg
|
||||
Added: _$ss8UTF8SpanV14_countAndFlagss6UInt64VvpMV
|
||||
Added: _$ss8UTF8SpanV14_countAndFlagss6UInt64Vvs
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV11skipForward2byS2i_tF
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV11skipForwardSiyF
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV21currentCodeUnitOffsetSivg
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV21currentCodeUnitOffsetSivpMV
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV4nextSJSgyF
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV5reset20roundingForwardsFromySi_tF
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV5reset21roundingBackwardsFromySi_tF
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV5reset11toUncheckedySi_tF
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV6prefixAByF
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV6suffixAByF
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV8previousSJSgyF
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV8skipBack2byS2i_tF
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV8skipBackSiyF
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV9codeUnitsABvg
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorV9codeUnitsABvpMV
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorVMa
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorVMn
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorVN
|
||||
Added: _$ss8UTF8SpanV17CharacterIteratorVyAdBcfC
|
||||
Added: _$ss8UTF8SpanV18_unsafeBaseAddressSVSgvM
|
||||
Added: _$ss8UTF8SpanV18_unsafeBaseAddressSVSgvg
|
||||
Added: _$ss8UTF8SpanV18_unsafeBaseAddressSVSgvpMV
|
||||
Added: _$ss8UTF8SpanV18_unsafeBaseAddressSVSgvs
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV11skipForward2byS2i_tF
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV11skipForwardSiyF
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV21currentCodeUnitOffsetSivg
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV21currentCodeUnitOffsetSivpMV
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV4nexts0C0O0D0VSgyF
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV5reset20roundingForwardsFromySi_tF
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV5reset21roundingBackwardsFromySi_tF
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV5reset11toUncheckedySi_tF
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV6prefixAByF
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV6suffixAByF
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV8previouss0C0O0D0VSgyF
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV8skipBack2byS2i_tF
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV8skipBackSiyF
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV9codeUnitsABvg
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV9codeUnitsABvpMV
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorVMa
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorVMn
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorVN
|
||||
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorVyAdBcfC
|
||||
Added: _$ss8UTF8SpanV21isCanonicallyLessThanySbABF
|
||||
Added: _$ss8UTF8SpanV21makeCharacterIteratorAB0dE0VyF
|
||||
Added: _$ss8UTF8SpanV23isCanonicallyEquivalent2toSbAB_tF
|
||||
Added: _$ss8UTF8SpanV25makeUnicodeScalarIteratorAB0deF0VyF
|
||||
Added: _$ss8UTF8SpanV4spans0B0Vys5UInt8VGvg
|
||||
Added: _$ss8UTF8SpanV4spans0B0Vys5UInt8VGvpMV
|
||||
Added: _$ss8UTF8SpanV5countSivpMV
|
||||
Added: _$ss8UTF8SpanV7_nfcBits6UInt64VvpZMV
|
||||
Added: _$ss8UTF8SpanV7isEmptySbvg
|
||||
Added: _$ss8UTF8SpanV7isEmptySbvpMV
|
||||
Added: _$ss8UTF8SpanV9_asciiBits6UInt64VvpZMV
|
||||
Added: _$ss8UTF8SpanVMa
|
||||
Added: _$ss8UTF8SpanVMn
|
||||
Added: _$ss8UTF8SpanVN
|
||||
|
||||
|
||||
// SE-0467 MutableSpan and MutableRawSpan
|
||||
Added: _$ss11MutableSpanVMa
|
||||
Added: _$ss11MutableSpanVMn
|
||||
|
||||
295
test/stdlib/UTF8EncodingErrorTests.swift
Normal file
295
test/stdlib/UTF8EncodingErrorTests.swift
Normal file
@@ -0,0 +1,295 @@
|
||||
// RUN: %target-run-stdlib-swift %S/Inputs/
|
||||
|
||||
// REQUIRES: executable_test
|
||||
|
||||
// FIXME: this test is currently broken
|
||||
|
||||
import Swift
|
||||
import StdlibUnittest
|
||||
|
||||
var suite = TestSuite("UTF8.ValidationError")
|
||||
defer { runAllTests() }
|
||||
|
||||
@available(SwiftStdlib 6.2, *)
extension Array {
  /// Test helper: invokes `f` with a `Span` created from the buffer that
  /// `withUnsafeBufferPointer` provides, and returns (or rethrows) whatever
  /// `f` produces.
  func withSpan<R>(_ f: (Span<Element>) throws -> R) rethrows -> R {
    try withUnsafeBufferPointer { elements in
      try f(Span(_unsafeElements: elements))
    }
  }
}
|
||||
|
||||
|
||||
extension Range<Int> {
  /// Returns this range translated by `start`: both bounds have `start`
  /// added to them, so the length is unchanged.
  func _offset(by start: Int) -> Range<Int> {
    let shiftedLower = start + lowerBound
    let shiftedUpper = start + upperBound
    return shiftedLower..<shiftedUpper
  }
}
|
||||
|
||||
@available(SwiftStdlib 6.2, *)
private struct ValidationError {
  /// The error expected for this position when all errors are fetched.
  var error: UTF8.ValidationError

  // When fetching all errors, we'll get the error kind given. When
  // slicing in order to get the next error (e.g.
  // `UTF8Span.init(validating:)`), an entry whose `errorStart` is `false`
  // is expected to surface as `.unexpectedContinuationByte` instead.
  var errorStart: Bool

  /// Pairs an expected error with its `errorStart` flag.
  init(
    _ error: UTF8.ValidationError,
    errorStart: Bool
  ) {
    self.error = error
    self.errorStart = errorStart
  }

  /// Expected `.unexpectedContinuationByte` error at byte offset `i`.
  public static func unexpectedContinuationByte(
    at i: Int, errorStart: Bool = true
  ) -> Self {
    let expected = UTF8.ValidationError(.unexpectedContinuationByte, at: i)
    return .init(expected, errorStart: errorStart)
  }

  /// Expected `.surrogateCodePointByte` error at byte offset `i`.
  public static func surrogateCodePointByte(
    at i: Int, errorStart: Bool = true
  ) -> Self {
    let expected = UTF8.ValidationError(.surrogateCodePointByte, at: i)
    return .init(expected, errorStart: errorStart)
  }

  /// Expected `.invalidNonSurrogateCodePointByte` error at byte offset `i`.
  public static func invalidNonSurrogateCodePointByte(
    at i: Int, errorStart: Bool = true
  ) -> Self {
    let expected = UTF8.ValidationError(.invalidNonSurrogateCodePointByte, at: i)
    return .init(expected, errorStart: errorStart)
  }

  /// Expected `.overlongEncodingByte` error at byte offset `i`.
  public static func overlongEncodingByte(
    at i: Int, errorStart: Bool = true
  ) -> Self {
    let expected = UTF8.ValidationError(.overlongEncodingByte, at: i)
    return .init(expected, errorStart: errorStart)
  }

  /// Expected `.truncatedScalar` error covering the byte offsets `range`.
  public static func truncatedScalar(
    _ range: Range<Int>, errorStart: Bool = true
  ) -> Self {
    let expected = UTF8.ValidationError(.truncatedScalar, range)
    return .init(expected, errorStart: errorStart)
  }
}
|
||||
|
||||
|
||||
@available(SwiftStdlib 6.2, *)
private struct ValidationTestCase {
  /// The input bytes to validate.
  var bytes: [UInt8]

  // The expected errors, in order. Each entry carries an `errorStart` flag
  // that `fetchError(at:wasSliced:)` consults: when validating a slice,
  // entries that are not an error start are expected to be reported as
  // `.unexpectedContinuationByte`.
  var errors: [ValidationError]

  /// Source location of the test case, used as the base of the stack traces
  /// passed to the expectation helpers.
  var loc: SourceLocStack

  /// Creates a test case for `bytes` expecting `errors`, recording the
  /// caller's `file`/`line` for failure reporting.
  init(
    _ bytes: [UInt8],
    file: String = #file,
    line: UInt = #line,
    _ errors: [ValidationError]
  ) {
    self.bytes = bytes
    self.errors = errors
    self.loc = .init(SourceLoc(file, line))
  }

  /// Returns the expected error at index `i`.
  ///
  /// When `wasSliced` is true and the entry is not an error start, the
  /// expectation becomes `.unexpectedContinuationByte` over the same byte
  /// offsets; otherwise the stored error is returned unchanged.
  func fetchError(
    at i: Int, wasSliced: Bool
  ) -> UTF8.ValidationError {
    let expected = errors[i]
    guard wasSliced, !expected.errorStart else {
      return expected.error
    }
    return .init(.unexpectedContinuationByte, expected.error.byteOffsets)
  }

  /// `expectEqual` wrapper that attaches this test case's source location.
  func expect<T: Equatable>(
    _ lhs: T,
    _ rhs: T,
    file: String = #file,
    line: UInt = #line
  ) {
    let trace = loc.withCurrentLoc(file: file, line: line)
    expectEqual(lhs, rhs, stackTrace: trace)
  }

  /// `expectationFailure` wrapper that attaches this test case's source
  /// location.
  func fail(
    _ message: String,
    file: String = #file,
    line: UInt = #line
  ) {
    let trace = loc.with(SourceLoc(file, line))
    expectationFailure(message, trace: "", stackTrace: trace)
  }

  /// Test UTF8._checkAllErrors(), which matches directly against
  /// the provided expected-errors.
  func testAllErrors() {
    let caughtErrors = Array(UTF8._checkAllErrors(bytes))
    // Compare pairwise over the common prefix; the count check afterwards
    // reports any missing or surplus errors.
    for (i, caught) in zip(errors.indices, caughtErrors) {
      expect(fetchError(at: i, wasSliced: false), caught)
    }
    expect(caughtErrors.count, errors.count)
  }

  /// Test UTF8Span validation. Surface subsequent errors by slicing the
  /// input (which will convert the error-kind to .unexpectedContinuationByte)
  func testSpanSlicedErrors() {
    bytes.withSpan { span in
      guard !errors.isEmpty else {
        do throws(UTF8.ValidationError) {
          // No errors expected
          _ = try UTF8Span(validating: span)
        } catch {
          fail("Unexpected error: \(error)")
        }
        return
      }

      // Check every error, by slicing (which will change error classification
      // of continuation bytes in multi-byte errors to .unexpectedContinuation)
      var sliceStart = 0
      var nextExpected = 0
      validation: while true {
        do throws(UTF8.ValidationError) {
          _ = try UTF8Span(validating: span._extracting(sliceStart...))

          // The remaining slice validated cleanly, so every expected error
          // must already have been seen.
          if nextExpected != errors.endIndex {
            fail("Expected a thrown UTF-8 encoding error")
          }
          break validation
        } catch {
          guard nextExpected < errors.endIndex else {
            fail("Found unexpected subsequent error \(error)")
            break validation
          }

          let expectedError = fetchError(at: nextExpected, wasSliced: true)

          // The thrown offsets are relative to the slice; shift them back
          // to offsets within the whole input before comparing.
          let rebasedError = UTF8.ValidationError(
            error.kind,
            error.byteOffsets._offset(by: sliceStart)
          )
          expect(expectedError, rebasedError)

          // Resume validation just past the reported error.
          sliceStart = rebasedError.byteOffsets.upperBound
          nextExpected += 1
        }
      }

      // Rest of input should be error-free
      guard let tailStart = errors.last?.error.byteOffsets.upperBound,
            tailStart < bytes.count
      else { return }

      do throws(UTF8.ValidationError) {
        _ = try UTF8Span(validating: span._extracting(tailStart...))
      } catch {
        fail("Found subsequent error \(error)")
      }
    }
  }

  /// Runs the sliced-span validation test followed by the all-errors test.
  func run() {
    testSpanSlicedErrors()
    testAllErrors()
  }
}
|
||||
|
||||
if #available(SwiftStdlib 6.2, *) {
  suite.test("UTF8Span/encoding errors") {
    /// Builds a `ValidationTestCase` for `bytes` with the expected `errors`
    /// and runs it, attributing failures to the caller's `file` and `line`.
    func test(
      _ bytes: [UInt8],
      _ file: String = #file,
      line: UInt = #line,
      _ errors: ValidationError...
    ) {
      let testCase = ValidationTestCase(
        bytes, file: file, line: line, errors)
      testCase.run()
    }

    // Every case below is currently disabled, so this test only declares
    // the helper above.
    // NOTE(review): the disabled calls pass the expected errors as an array
    // in second position, which does not match `test`'s current parameter
    // list (second positional parameter is `file`) -- presumably why they
    // are commented out; confirm before re-enabling.

    // Valid string
    // test(Array("abcde\u{301}f😀🇺🇸🧟♀️🧟♀️".utf8), [])

    // Bad URL
    // test(
    //   Array("http://servername/scripts/..".utf8)
    //   + [0xC0, 0xAF]
    //   + Array("../winnt/system32/cmd.exe".utf8),
    //   [.overlongEncodingByte(at: 28), // C0
    //    .overlongEncodingByte(at: 29, errorStart: false), // AF
    //   ])

    // test(
    //   [0xC0, 0xAF, 0xE0, 0x80, 0xBF, 0xF0, 0x81, 0x82, 0x41],
    //   [.overlongEncodingByte(at: 0), // C0
    //    .overlongEncodingByte(at: 1, errorStart: false), // AF
    //    .overlongEncodingByte(at: 2), // E0
    //    .overlongEncodingByte(at: 3, errorStart: false), // 80
    //    .overlongEncodingByte(at: 4, errorStart: false), // BF
    //    .overlongEncodingByte(at: 5), // F0
    //    .overlongEncodingByte(at: 6, errorStart: false), // 81
    //    .overlongEncodingByte(at: 7, errorStart: false), // 82
    //   ])
    // test(
    //   [0x41, 0xC0, 0xAF, 0x41, 0xF4, 0x80, 0x80, 0x41],
    //   [.overlongEncodingByte(at: 1), // C0
    //    .overlongEncodingByte(at: 2, errorStart: false), // AF
    //    .truncatedScalar(4...6), // F4 80 80
    //   ])
    // test(
    //   [0xED, 0xAF, 0x41],
    //   [.surrogateCodePointByte(at: 0), // ED
    //    .surrogateCodePointByte(at: 1, errorStart: false), // AF
    //   ])
    // test(
    //   [0xED, 0xA0, 0x80, 0xED, 0xBF, 0xBF, 0xED, 0xAF, 0x41],
    //   [.surrogateCodePointByte(at: 0), // ED
    //    .surrogateCodePointByte(at: 1, errorStart: false), // A0
    //    .surrogateCodePointByte(at: 2, errorStart: false), // 80
    //    .surrogateCodePointByte(at: 3), // ED
    //    .surrogateCodePointByte(at: 4, errorStart: false), // BF
    //    .surrogateCodePointByte(at: 5, errorStart: false), // BF
    //    .surrogateCodePointByte(at: 6), // ED
    //    .surrogateCodePointByte(at: 7, errorStart: false), // AF
    //   ])
    // test(
    //   [0xF4, 0x91, 0x92, 0x93, 0xFF, 0x41, 0x80, 0xBF, 0x42],
    //   [.invalidNonSurrogateCodePointByte(at: 0), // F4
    //    .invalidNonSurrogateCodePointByte(at: 1, errorStart: false), // 91
    //    .invalidNonSurrogateCodePointByte(at: 2, errorStart: false), // 92
    //    .invalidNonSurrogateCodePointByte(at: 3, errorStart: false), // 93
    //    .invalidNonSurrogateCodePointByte(at: 4), // FF
    //    .unexpectedContinuationByte(at: 6), // 80
    //    .unexpectedContinuationByte(at: 7), // BF
    //   ])
    // test(
    //   [0xE1, 0x80, 0xE2, 0xF0, 0x91, 0x92, 0xF1, 0xBF, 0x41],
    //   [.truncatedScalar(0...1), // E1 80
    //    .truncatedScalar(2...2), // E2
    //    .truncatedScalar(3...5), // F0 91 92
    //    .truncatedScalar(6...7), // F1 BF
    //   ])
    // test(
    //   [0xE0, 0x81, 0x80],
    //   [.overlongEncodingByte(at: 0), // E0
    //    .overlongEncodingByte(at: 1, errorStart: false), // 81
    //    .overlongEncodingByte(at: 2, errorStart: false), // 80
    //   ])
  }
}
|
||||
279
test/stdlib/UTF8SpanIteratorTests.swift
Normal file
279
test/stdlib/UTF8SpanIteratorTests.swift
Normal file
@@ -0,0 +1,279 @@
|
||||
// RUN: %target-run-stdlib-swift(-enable-experimental-feature LifetimeDependence) %S/Inputs/
|
||||
// REQUIRES: swift_feature_LifetimeDependence
|
||||
// REQUIRES: executable_test
|
||||
|
||||
import Swift
|
||||
import StdlibUnittest
|
||||
|
||||
// Suite that every test in this file registers with.
var suite = TestSuite("UTF8SpanIterator")

// Deferred so the registered tests run once the top-level code has finished.
defer {
  runAllTests()
}
|
||||
|
||||
@available(SwiftStdlib 6.2, *)
extension Array {
  /// Calls `f` with a `Span` over the array's elements and returns its
  /// result.
  ///
  /// The span is built from the buffer handed out by
  /// `withUnsafeBufferPointer`, so it is only meant to be used inside `f`.
  ///
  /// - Parameter f: Closure receiving the span.
  /// - Returns: Whatever `f` returns.
  /// - Throws: Rethrows any error thrown by `f`.
  func withSpan<R>(_ f: (Span<Element>) throws -> R) rethrows -> R {
    try withUnsafeBufferPointer { buffer in
      try f(Span(_unsafeElements: buffer))
    }
  }
}
|
||||
|
||||
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
  /// Calls `f` with a `Span<UInt8>` over this span's code units and returns
  /// its result.
  ///
  /// - Parameter f: Closure receiving the byte span.
  /// - Returns: Whatever `f` returns.
  /// - Throws: Rethrows any error thrown by `f`.
  func withSpan<R>(_ f: (Span<UInt8>) throws -> R) rethrows -> R {
    try _withUnsafeBufferPointer { buffer in
      try f(Span(_unsafeElements: buffer))
    }
  }
}
|
||||
|
||||
|
||||
//
|
||||
/// A single string whose `String` views are compared against the matching
/// `UTF8Span` iterators.
@available(SwiftStdlib 6.2, *)
struct ContentEquivalenceTestCase {
  /// The string under test.
  var str: String

  /// Source location handed to expectations as their `stackTrace:`.
  var loc: SourceLocStack
}
|
||||
|
||||
@available(SwiftStdlib 6.2, *)
extension ContentEquivalenceTestCase {
  /// Expects `scalars` to be positioned at the start of the content.
  ///
  /// Checks the offset is 0 and that stepping back yields `nil`, then steps
  /// over the first scalar of `str` and back again, so the iterator is left
  /// where it started.
  func expectStart(
    _ scalars: inout UTF8Span.UnicodeScalarIterator
  ) {
    let firstScalar = str.unicodeScalars.first
    expectEqual(0, scalars.currentCodeUnitOffset, stackTrace: loc)
    expectNil(scalars.previous(), stackTrace: loc)
    expectEqual(firstScalar, scalars.next(), stackTrace: loc)
    expectEqual(firstScalar, scalars.previous(), stackTrace: loc)
    expectNil(scalars.previous(), stackTrace: loc)
  }

  /// Expects `scalars` to be positioned at the end of the content.
  ///
  /// Checks the offset equals the code unit count and that stepping forward
  /// yields `nil`, then steps back over the last scalar of `str` and
  /// forward again, so the iterator is left where it started.
  func expectEnd(
    _ scalars: inout UTF8Span.UnicodeScalarIterator
  ) {
    let lastScalar = str.unicodeScalars.last
    // Expected value first, matching `expectStart`, so that a failure is
    // reported with the right "expected"/"actual" labels.
    expectEqual(
      scalars.codeUnits.count, scalars.currentCodeUnitOffset,
      stackTrace: loc)
    expectNil(scalars.next(), stackTrace: loc)
    expectEqual(lastScalar, scalars.previous(), stackTrace: loc)
    expectEqual(lastScalar, scalars.next(), stackTrace: loc)
    expectNil(scalars.next(), stackTrace: loc)
  }

  /// Expects `chars` to be positioned at the start of the content.
  ///
  /// Character-level counterpart of the scalar overload: the iterator is
  /// left where it started.
  func expectStart(
    _ chars: inout UTF8Span.CharacterIterator
  ) {
    let firstChar = str.first
    expectEqual(0, chars.currentCodeUnitOffset, stackTrace: loc)
    expectNil(chars.previous(), stackTrace: loc)
    expectEqual(firstChar, chars.next(), stackTrace: loc)
    expectEqual(firstChar, chars.previous(), stackTrace: loc)
    expectNil(chars.previous(), stackTrace: loc)
  }

  /// Expects `chars` to be positioned at the end of the content.
  ///
  /// Character-level counterpart of the scalar overload: the iterator is
  /// left where it started.
  func expectEnd(
    _ chars: inout UTF8Span.CharacterIterator
  ) {
    let lastChar = str.last
    // Expected value first, matching `expectStart`.
    expectEqual(
      chars.codeUnits.count, chars.currentCodeUnitOffset,
      stackTrace: loc)
    expectNil(chars.next(), stackTrace: loc)
    expectEqual(lastChar, chars.previous(), stackTrace: loc)
    expectEqual(lastChar, chars.next(), stackTrace: loc)
    expectNil(chars.next(), stackTrace: loc)
  }

  /// Calls `f` with a `UTF8Span` validated from the UTF-8 bytes of `str`.
  ///
  /// Validation uses `try!`: `str.utf8` is expected to be valid, so a
  /// validation failure traps.
  ///
  /// - Parameter f: Closure receiving the validated span.
  /// - Returns: Whatever `f` returns.
  /// - Throws: Rethrows any error thrown by `f`.
  func withUTF8Span<R>(_ f: (UTF8Span) throws -> R) rethrows -> R {
    try Array(str.utf8).withSpan { span in
      try f(try! UTF8Span(validating: span))
    }
  }
}
|
||||
|
||||
@available(SwiftStdlib 6.2, *)
extension ContentEquivalenceTestCase {
  /// Checks that the span's code units are exactly the bytes of `str.utf8`.
  func testBytes() {
    withUTF8Span { utf8Span in
      utf8Span._withUnsafeBufferPointer {
        expectEqualSequence(str.utf8, $0, stackTrace: loc)
      }
    }

    // NOTE: code units have no iterator of the same shape as the scalar and
    // character iterators, so this check is not symmetrical with the tests
    // below.
  }

  /// Walks the Unicode scalar iterator forwards and backwards, comparing it
  /// with `str.unicodeScalars`, then exercises `skipForward(by:)` and
  /// `skipBack(by:)`.
  func testScalars() {
    withUTF8Span { utf8Span in
      // Test forwards
      var utf8SpanIter = utf8Span.makeUnicodeScalarIterator()
      var stringIter = str.unicodeScalars.makeIterator()
      while let scalar = utf8SpanIter.next() {
        expectEqual(scalar, stringIter.next(), stackTrace: loc)
      }
      expectNil(stringIter.next(), stackTrace: loc)
      expectEnd(&utf8SpanIter)

      // Test backwards
      var stringRevIter = str.unicodeScalars.reversed().makeIterator()
      while let scalar = utf8SpanIter.previous() {
        expectEqual(scalar, stringRevIter.next(), stackTrace: loc)
      }
      expectNil(stringRevIter.next(), stackTrace: loc)
      expectStart(&utf8SpanIter)

      let numElements = str.unicodeScalars.count
      let lastElement = str.unicodeScalars.last
      let firstElement = str.unicodeScalars.first

      // Skipping by more than the content holds is expected to report the
      // number of elements actually skipped.
      expectEqual(
        numElements, utf8SpanIter.skipForward(by: Int.max), stackTrace: loc)
      expectEnd(&utf8SpanIter)
      expectEqual(
        numElements, utf8SpanIter.skipBack(by: Int.max), stackTrace: loc)
      expectStart(&utf8SpanIter)
      expectEqual(
        numElements, utf8SpanIter.skipForward(by: numElements),
        stackTrace: loc)
      expectEnd(&utf8SpanIter)
      expectEqual(
        numElements, utf8SpanIter.skipBack(by: numElements),
        stackTrace: loc)
      expectStart(&utf8SpanIter)

      if numElements > 0 {
        // Stop one element short of each end, then take the final step with
        // next()/previous().
        expectStart(&utf8SpanIter)
        expectEqual(
          numElements-1, utf8SpanIter.skipForward(by: numElements-1),
          stackTrace: loc)
        expectEqual(lastElement, utf8SpanIter.next(), stackTrace: loc)
        expectEnd(&utf8SpanIter)

        expectEqual(
          numElements-1, utf8SpanIter.skipBack(by: numElements-1),
          stackTrace: loc)
        expectEqual(firstElement, utf8SpanIter.previous(), stackTrace: loc)
        expectStart(&utf8SpanIter)
      }

      // TODO: test reset variants
      // TODO: test prefix/suffix
    }
  }

  /// Walks the character iterator forwards and backwards, comparing it with
  /// the characters of `str`, then exercises `skipForward(by:)` and
  /// `skipBack(by:)`.
  func testCharacters() {
    withUTF8Span { utf8Span in
      // Test forwards
      var utf8SpanIter = utf8Span.makeCharacterIterator()
      var stringIter = str.makeIterator()
      while let char = utf8SpanIter.next() {
        expectEqual(char, stringIter.next(), stackTrace: loc)
      }
      expectNil(stringIter.next(), stackTrace: loc)
      expectEnd(&utf8SpanIter)

      // Test backwards
      var stringRevIter = str.reversed().makeIterator()
      while let char = utf8SpanIter.previous() {
        expectEqual(char, stringRevIter.next(), stackTrace: loc)
      }
      expectNil(stringRevIter.next(), stackTrace: loc)
      expectStart(&utf8SpanIter)

      let numElements = str.count
      let lastElement = str.last
      let firstElement = str.first

      // Skipping by more than the content holds is expected to report the
      // number of elements actually skipped.
      expectEqual(
        numElements, utf8SpanIter.skipForward(by: Int.max), stackTrace: loc)
      expectEnd(&utf8SpanIter)
      expectEqual(
        numElements, utf8SpanIter.skipBack(by: Int.max), stackTrace: loc)
      expectStart(&utf8SpanIter)
      expectEqual(
        numElements, utf8SpanIter.skipForward(by: numElements),
        stackTrace: loc)
      expectEnd(&utf8SpanIter)
      expectEqual(
        numElements, utf8SpanIter.skipBack(by: numElements),
        stackTrace: loc)
      expectStart(&utf8SpanIter)

      if numElements > 0 {
        // Stop one element short of each end, then take the final step with
        // next()/previous().
        expectStart(&utf8SpanIter)
        expectEqual(
          numElements-1, utf8SpanIter.skipForward(by: numElements-1),
          stackTrace: loc)
        expectEqual(lastElement, utf8SpanIter.next(), stackTrace: loc)
        expectEnd(&utf8SpanIter)

        expectEqual(
          numElements-1, utf8SpanIter.skipBack(by: numElements-1),
          stackTrace: loc)
        expectEqual(firstElement, utf8SpanIter.previous(), stackTrace: loc)
        expectStart(&utf8SpanIter)
      }

      // TODO: test reset variants
      // TODO: test prefix/suffix
    }
  }

  /// Runs the byte, scalar and character checks for this test case.
  func run() {
    testBytes()
    testScalars()
    testCharacters()

    // TODO: test grapheme break iterator
  }
}
|
||||
|
||||
if #available(SwiftStdlib 6.2, *) {
  suite.test("UTF8Span/iterators") {
    /// Runs the content-equivalence checks for `s`, attributing failures to
    /// the calling line.
    func test(
      _ s: String,
      file: String = #file,
      line: UInt = #line
    ) {
      // print("testing: \(s)")
      ContentEquivalenceTestCase(
        str: s,
        loc: SourceLocStack(SourceLoc(file, line))
      ).run()
    }

    // Each input is a separate call so that `#line` identifies the failing
    // string.
    test("")
    test("a")
    test("á")
    test("a\u{301}")
    test("🧟♀️")
    test("abc")
    test("abcde\u{301}")
    test("abéÏ𓀀")
    test("012345678901234567890")
    test("abéÏ012345678901234567890𓀀")
    test("😀😃🤢🤮👩🏿🎤🧛🏻♂️🧛🏻♂️👩👩👦👦")
    test("defghijklmnopqrstuvwxyz")
    test("ab🧟♀️de\u{301}bytés")
    test("ab🧟♀️de\u{301}🧟♀️")
    test("ab🧟♀️de🧟♀️\u{301}")
  }
}
|
||||
|
||||
// @available(SwiftStdlib 6.2, *)
|
||||
// extension UTF8Span {
|
||||
// func splitOffASCIIPrefix() -> (UTF8Span, UTF8Span) {
|
||||
// if isKnownASCII {
|
||||
// return (self, .init())
|
||||
// }
|
||||
// var splitPoint = 0
|
||||
// while splitPoint < codeUnits.count && codeUnits[unchecked: split] < 0x80 {
|
||||
// splitPoint += 1
|
||||
// }
|
||||
|
||||
// }
|
||||
// }
|
||||
|
||||
if #available(SwiftStdlib 6.2, *) {
  suite.test("UTF8Span/whatever") {
    // The body is entirely disabled, so this test currently checks nothing.
    // The sketch below builds a URL containing an overlong encoding of "/"
    // and validates it.

    // var badURLBytes: [UInt8] = []
    // badURLBytes.append(contentsOf: "http://servername/scripts/..".utf8)

    // // Invalid overlong encoding of "/"
    // badURLBytes.append(contentsOf: [0xC0, 0xAF])

    // badURLBytes.append(contentsOf: "../winnt/system32/cmd.exe".utf8)

    // // try! UTF8Span(validating: badURLBytes.span)

    // badURLBytes.withSpan {
    //   try! UTF8Span(validating: $0)
    // }
  }
}
|
||||
|
||||
|
||||
278
test/stdlib/UTF8SpanQueriesComparisons.swift
Normal file
278
test/stdlib/UTF8SpanQueriesComparisons.swift
Normal file
@@ -0,0 +1,278 @@
|
||||
// RUN: %target-run-stdlib-swift %S/Inputs/
|
||||
|
||||
// REQUIRES: executable_test
|
||||
|
||||
import Swift
|
||||
import StdlibUnittest
|
||||
|
||||
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
  /// Pattern-match operator so a `UTF8Span` can be switched over with
  /// string-literal cases.
  ///
  /// - Returns: `true` when the UTF-8 bytes of `lhs` and the code units of
  ///   `rhs` are element-wise equal.
  static func ~=(_ lhs: StaticString, _ rhs: UTF8Span) -> Bool {
    lhs.withUTF8Buffer { patternBytes in
      rhs._withUnsafeBufferPointer { spanBytes in
        patternBytes.elementsEqual(spanBytes)
      }
    }
  }
}
|
||||
|
||||
// Suite that every test in this file registers with.
var suite = TestSuite("UTF8SpanQueriesComparisons")

// Deferred so the registered tests run once the top-level code has finished.
defer {
  runAllTests()
}
|
||||
|
||||
@available(SwiftStdlib 6.2, *)
extension Array where Element == UInt8 {
  /// Calls `f` with a `Span` over the array's bytes and returns its result.
  ///
  /// - Parameter f: Closure receiving the span.
  /// - Returns: Whatever `f` returns.
  /// - Throws: Rethrows any error thrown by `f`.
  func withSpan<R>(_ f: (Span<Element>) throws -> R) rethrows -> R {
    try withUnsafeBufferPointer { buffer in
      try f(Span(_unsafeElements: buffer))
    }
  }

  /// Calls `f` with a `UTF8Span` validated from the array's bytes.
  ///
  /// Validation uses `try!`, so bytes that fail validation trap rather than
  /// throw.
  ///
  /// - Parameter f: Closure receiving the validated span.
  /// - Returns: Whatever `f` returns.
  /// - Throws: Rethrows any error thrown by `f`.
  func withUTF8Span<R>(_ f: (UTF8Span) throws -> R) rethrows -> R {
    try withSpan { bytes in
      try f(try! UTF8Span(validating: bytes))
    }
  }
}
|
||||
|
||||
if #available(SwiftStdlib 6.2, *) {
|
||||
suite.test("UTF8Span/tilde equals") {
  // Near misses (substring, prefix, trailing space, trailing NUL) are listed
  // before the exact literal, so a match on any of them is a failure.
  let contents = Array("abcdefg".utf8)
  contents.withUTF8Span { span in
    switch span {
    case "def":
      expectationFailure(
        "unexpected pattern match",
        trace: "",
        stackTrace: SourceLocStack().withCurrentLoc())
    case "abcdef":
      expectationFailure(
        "unexpected pattern match",
        trace: "",
        stackTrace: SourceLocStack().withCurrentLoc())
    case "abcdefg ":
      expectationFailure(
        "unexpected pattern match",
        trace: "",
        stackTrace: SourceLocStack().withCurrentLoc())
    case "abcdefg\0":
      expectationFailure(
        "unexpected pattern match",
        trace: "",
        stackTrace: SourceLocStack().withCurrentLoc())
    case "abcdefg":
      // The only pattern that should match.
      break
    default:
      expectationFailure(
        "expected a pattern match",
        trace: "",
        stackTrace: SourceLocStack().withCurrentLoc())
    }
  }
}
|
||||
|
||||
suite.test("UTF8Span/Sequence equal") {
  // Not implemented yet: the planned table of cases is still disabled, so
  // this test currently checks nothing.

  // // A string and its canonical equivalent
  // let testCases: [(String, String?)] = [
  //   ("abdefg", nil)
  //   ("café", "cafe\u{301}")
  // ]
}
|
||||
|
||||
suite.test("UTF8Span/isKnownASCII") {
  // Each entry pairs an input with the `isKnownASCII` value expected right
  // after validation.
  let cases: [(String, Bool)] = [
    ("abc", true),
    ("abcdefghil1235@#% _/.sladfj234 ", true),
    ("abcdefghil1\u{80}sladfj234 ", false),
  ]

  for (text, expectedASCII) in cases {
    Array(text.utf8).withUTF8Span { span in
      expectEqual(expectedASCII, span.isKnownASCII)
    }
  }
}
|
||||
|
||||
suite.test("UTF8Span/isKnownNFC") {
  /// The cheapest step at which a span reports itself as NFC.
  enum Normalness {
    /// `isKnownNFC` is already true right after validation.
    case known
    /// `checkForNFC(quickCheck: true)` succeeds.
    case quickCheck
    /// Only `checkForNFC(quickCheck: false)` succeeds.
    case fullCheck
    /// None of the checks succeed.
    case notNFC
  }

  // TODO: add cases built around scalars that fail the NFC quick check
  // (U+0374 was the intended example -- confirm its NFC_QC value).

  let tests: [(String, Normalness)] = [
    ("abc", .known),
    ("abcdefghil123567890", .known),
    ("abcdefghil1\u{299}123345678 ", .quickCheck),
    ("abc日曜日xyz", .quickCheck),
    ("abcde日曜日\u{301}", .fullCheck),
    ("abcde\u{301}fghijkl", .notNFC),
  ]

  for (test, expected) in tests {
    Array(test.utf8).withUTF8Span {
      // `checkForNFC` is called on a mutable copy, hence the `var`.
      var span = $0

      // Try the checks from cheapest to most expensive; the first one that
      // succeeds determines the classification.
      if span.isKnownNFC {
        expectEqual(expected, .known)
      } else if span.checkForNFC(quickCheck: true) {
        expectEqual(expected, .quickCheck)
      } else if span.checkForNFC(quickCheck: false) {
        expectEqual(expected, .fullCheck)
      } else {
        expectEqual(expected, .notNFC)
      }
    }
  }
}
|
||||
|
||||
suite.test("UTF8Span/canonical equivalence") {

  // TODO: refactor to be test-case declaration driven, and add more tests...
  // `(normalized: String, variants: [String], lessThan: String, greaterThan: String)`

  // Two spellings of the same text: one with a single "é" scalar, one with
  // "e" followed by U+0301.
  let precomposedStr = "café"
  let decomposedStr = "cafe\u{301}"

  let precomposedBytes = Array(precomposedStr.utf8)
  let decomposedBytes = Array(decomposedStr.utf8)

  precomposedBytes.withSpan { preSpan in
    let pre = try! UTF8Span(validating: preSpan)
    decomposedBytes.withSpan { deSpan in
      let de = try! UTF8Span(validating: deSpan)

      expectTrue(pre.isCanonicallyEquivalent(to: de))

      // Byte-level equality distinguishes the two spellings.
      expectTrue(pre.bytesEqual(to: precomposedStr.utf8))
      expectFalse(pre.bytesEqual(to: decomposedStr.utf8))

      expectTrue(de.bytesEqual(to: decomposedStr.utf8))
      expectFalse(de.bytesEqual(to: precomposedStr.utf8))

      // So does scalar-level equality.
      expectTrue(pre.unicodeScalarsEqual(to: precomposedStr.unicodeScalars))
      expectFalse(pre.unicodeScalarsEqual(to: decomposedStr.unicodeScalars))

      expectTrue(de.unicodeScalarsEqual(to: decomposedStr.unicodeScalars))
      expectFalse(de.unicodeScalarsEqual(to: precomposedStr.unicodeScalars))

      // Character-level equality treats both spellings as equal.
      expectTrue(pre.charactersEqual(to: precomposedStr))
      expectTrue(pre.charactersEqual(to: decomposedStr))

      expectTrue(de.charactersEqual(to: decomposedStr))
      expectTrue(de.charactersEqual(to: precomposedStr))

      // Equivalence means no-one is less than the other
      expectFalse(de.isCanonicallyLessThan(pre))
      expectFalse(pre.isCanonicallyLessThan(de))
    }
  }
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
// TODO: Rest of this file is in-progress TODOs
|
||||
|
||||
|
||||
/*
|
||||
|
||||
|
||||
isASCII
|
||||
isKnownNFC
|
||||
checkForNFC(quickCheck:)
|
||||
isKnownSingleScalarCharacters
|
||||
checkForSingleScalarCharacters(quickCheck:)
|
||||
|
||||
public func bytesEqual(to other: UTF8Span) -> Bool
|
||||
public func bytesEqual(to other: some Sequence<UInt8>) -> Bool
|
||||
|
||||
public func scalarsEqual(
|
||||
to other: some Sequence<Unicode.Scalar>
|
||||
) -> Bool
|
||||
|
||||
public func charactersEqual(
|
||||
to other: some Sequence<Character>
|
||||
) -> Bool
|
||||
|
||||
public func isCanonicallyEquivalent(
|
||||
to other: UTF8Span
|
||||
) -> Bool
|
||||
|
||||
public func isCanonicallyLessThan(
|
||||
_ other: UTF8Span
|
||||
) -> Bool
|
||||
|
||||
*/
|
||||
|
||||
// @available(SwiftStdlib 6.2, *)
|
||||
// private struct QueryTestCase {
|
||||
// var content: String
|
||||
|
||||
// var loc: SourceLocStack
|
||||
|
||||
// var isASCII: Bool
|
||||
|
||||
// // TODO: This might become API, or otherwise calculated at init time
|
||||
// var isLatinyNFC: Bool {
|
||||
// bytes.allSatisfy { $0 < 0xCC }
|
||||
// }
|
||||
|
||||
// var isQuickNFC: Bool
|
||||
// var isNFC: Bool
|
||||
|
||||
// var isQuickSSC: Bool
|
||||
// var isSSC: Bool
|
||||
// }
|
||||
|
||||
// if #available(SwiftStdlib 6.2, *) {
|
||||
// suite.test("UTF8Span/queries") {
|
||||
// }
|
||||
// }
|
||||
|
||||
// enum ComparisonResult {
|
||||
// binaryEqual
|
||||
// canonicallyEqual
|
||||
// canonicallyLess
|
||||
// inequal
|
||||
// }
|
||||
|
||||
// private struct ComparisonTestCase {
|
||||
// var content: String
|
||||
// var comparisons: [(String, ComparisonResult)]
|
||||
|
||||
// var loc: SourceLocStack
|
||||
// }
|
||||
|
||||
// if #available(SwiftStdlib 6.2, *) {
|
||||
// suite.test("UTF8Span/comparisons") {
|
||||
// func test()
|
||||
// }
|
||||
// }
|
||||
|
||||
|
||||
/*
|
||||
|
||||
input string, to check the bits and relevant info
|
||||
comparison string and expected comparison level
|
||||
|
||||
*/
|
||||
|
||||
|
||||
// }
|
||||
Reference in New Issue
Block a user