Add support for UTF8Span

Also, refactor validation and grapheme breaking
This commit is contained in:
Michael Ilseman
2025-04-11 16:11:11 -06:00
committed by GitHub
parent 014825127b
commit e6e4bd6056
22 changed files with 3365 additions and 467 deletions

View File

@@ -208,6 +208,13 @@ add_library(swiftCore
UnsafeRawPointer.swift
UTFEncoding.swift
UTF8.swift
UTF8EncodingError.swift
UTF8Span.swift
UTF8SpanBits.swift
UTF8SpanComparisons.swift
UTF8SpanFundamentals.swift
UTF8SpanInternalHelpers.swift
UTF8SpanIterators.swift
UTF16.swift
UTF32.swift
Unicode.swift # ORDER DEPENDENCY: must follow new unicode support

View File

@@ -214,6 +214,13 @@ split_embedded_sources(
EMBEDDED UnsafeRawPointer.swift
EMBEDDED UTFEncoding.swift
EMBEDDED UTF8.swift
EMBEDDED UTF8EncodingError.swift
EMBEDDED UTF8Span.swift
EMBEDDED UTF8SpanBits.swift
EMBEDDED UTF8SpanComparisons.swift
EMBEDDED UTF8SpanFundamentals.swift
EMBEDDED UTF8SpanInternalHelpers.swift
EMBEDDED UTF8SpanIterators.swift
EMBEDDED UTF16.swift
EMBEDDED UTF32.swift
EMBEDDED Unicode.swift # ORDER DEPENDENCY: must follow new unicode support

View File

@@ -205,6 +205,15 @@
"RawSpan.swift",
"Span.swift"
],
"UTF8Span": [
"UTF8EncodingError.swift",
"UTF8Span.swift",
"UTF8SpanBits.swift",
"UTF8SpanComparisons.swift",
"UTF8SpanFundamentals.swift",
"UTF8SpanInternalHelpers.swift",
"UTF8SpanIterators.swift"
],
"Protocols": [
"CompilerProtocols.swift",
"ShadowProtocols.swift"

View File

@@ -1112,108 +1112,4 @@ extension String {
}
}
// NFC helpers for a slice of string storage: a per-scalar quick check, a
// whole-slice quick check over fast UTF-8, and a driver that feeds the slice's
// NFC code units to a closure.
extension _StringGutsSlice {
/// Performs the NFC quick check for a single scalar.
///
/// Returns `false` if the scalar's `ccc` is non-zero and smaller than
/// `prevCCC`, or if its normalization data is not flagged `isNFCQC`.
/// Otherwise stores the scalar's `ccc` into `prevCCC` and returns `true`.
/// `prevCCC` is left untouched on failure.
internal func _isScalarNFCQC(
_ scalar: Unicode.Scalar,
_ prevCCC: inout UInt8
) -> Bool {
let normData = Unicode._NormData(scalar, fastUpperbound: 0x300)
if prevCCC > normData.ccc, normData.ccc != 0 {
return false
}
if !normData.isNFCQC {
return false
}
prevCCC = normData.ccc
return true
}
/// Calls `f` once for every UTF-8 code unit of this slice's NFC form.
///
/// The slice's own bytes are passed through unchanged when the storage is
/// flagged NFC or when every scalar passes the quick check; otherwise the
/// scalars of the `_internalNFC` view are re-encoded and passed to `f`.
/// Any error thrown by `f` is rethrown.
internal func _withNFCCodeUnits(_ f: (UInt8) throws -> Void) rethrows {
let substring = String(_guts)[range]
// Fast path: If we're already NFC (or ASCII), then we don't need to do
// anything at all.
if _fastPath(_guts.isNFC) {
try substring.utf8.forEach(f)
return
}
var isNFCQC = true
var prevCCC: UInt8 = 0
if _guts.isFastUTF8 {
_fastNFCCheck(&isNFCQC, &prevCCC)
// Because we have access to the fastUTF8, we can go through that instead
// of accessing the UTF8 view on String.
if isNFCQC {
try unsafe withFastUTF8 {
for unsafe byte in unsafe $0 {
try f(byte)
}
}
return
}
} else {
// No fast UTF-8 available: quick-check scalar by scalar through the
// unicode scalar view, stopping at the first failure.
for scalar in substring.unicodeScalars {
if !_isScalarNFCQC(scalar, &prevCCC) {
isNFCQC = false
break
}
}
if isNFCQC {
for byte in substring.utf8 {
try f(byte)
}
return
}
}
// Slow path: the quick check failed, so emit the code units of the
// `_internalNFC` scalars instead of the stored bytes.
for scalar in substring.unicodeScalars._internalNFC {
try scalar.withUTF8CodeUnits {
for unsafe byte in unsafe $0 {
try f(byte)
}
}
}
}
/// Runs the NFC quick check over this slice's fast UTF-8 bytes.
///
/// Sets `isNFCQC` to `false` and stops at the first scalar that fails
/// `_isScalarNFCQC`; `isNFCQC` is never set to `true` here, so callers must
/// initialize it. `prevCCC` is updated as scalars are visited.
internal func _fastNFCCheck(_ isNFCQC: inout Bool, _ prevCCC: inout UInt8) {
unsafe withFastUTF8 { utf8 in
var position = 0
while position < utf8.count {
// If our first byte is less than 0xCC, then it means we're under the
// 0x300 scalar value and everything up to 0x300 is NFC already.
if unsafe utf8[position] < 0xCC {
// If our first byte is less than 0xC0, then it means it is ASCII
// and only takes up a single byte.
if unsafe utf8[position] < 0xC0 {
position &+= 1
} else {
// Otherwise, this is a 2 byte < 0x300 sequence.
position &+= 2
}
// Every scalar handled on this path (ASCII or a two-byte scalar below
// 0x300) resets the tracked ccc to 0.
prevCCC = 0
continue
}
let (scalar, len) = unsafe _decodeScalar(utf8, startingAt: position)
if !_isScalarNFCQC(scalar, &prevCCC) {
isNFCQC = false
return
}
position &+= len
}
}
}
}

View File

@@ -97,7 +97,7 @@ internal func _stringCompareInternal(
}
@_effects(readonly)
private func _stringCompareFastUTF8(
internal func _stringCompareFastUTF8(
_ utf8Left: UnsafeBufferPointer<UInt8>,
_ utf8Right: UnsafeBufferPointer<UInt8>,
expecting: _StringComparisonResult,

View File

@@ -117,7 +117,7 @@ extension String {
return unsafe (String._uncheckedFromUTF8(
input, asciiPreScanResult: extraInfo.isASCII
), false)
case .error(let initialRange):
case .error(_, let initialRange):
return unsafe (repairUTF8(input, firstKnownBrokenRange: initialRange), true)
}
}
@@ -139,7 +139,7 @@ extension String {
newIsASCII: info.isASCII
)
return result.asString
case .error(let initialRange):
case .error(_, let initialRange):
defer { _fixLifetime(result) }
//This could be optimized to use excess tail capacity
return unsafe repairUTF8(result.codeUnits, firstKnownBrokenRange: initialRange)

View File

@@ -13,14 +13,18 @@
import SwiftShims
/// CR and LF are common special cases in grapheme breaking logic
private var _CR: UInt8 { return 0x0d }
private var _LF: UInt8 { return 0x0a }
private var _CR: UInt8 { return 0x0D }
private var _LF: UInt8 { return 0x0A }
internal func _hasGraphemeBreakBetween(
/// Perform a quick-check to determine if there's a grapheme-break between two
/// scalars, without consulting the data tables. Returns true if there
/// definitely is a break, false if there definitely is none, and nil if a
/// break couldn't be determined
internal func _quickHasGraphemeBreakBetween(
_ lhs: Unicode.Scalar, _ rhs: Unicode.Scalar
) -> Bool {
// CR-LF is a special case: no break between these
) -> Bool? {
// GB3:
// CR-LF is a special case: no break between these
if lhs == Unicode.Scalar(_CR) && rhs == Unicode.Scalar(_LF) {
return false
}
@@ -80,7 +84,10 @@ internal func _hasGraphemeBreakBetween(
default: return false
}
}
return hasBreakWhenPaired(lhs) && hasBreakWhenPaired(rhs)
if hasBreakWhenPaired(lhs) && hasBreakWhenPaired(rhs) {
return true
}
return nil
}
extension _StringGuts {
@@ -513,6 +520,8 @@ extension Unicode {
internal var _previous: Unicode.Scalar
internal var _state: _GraphemeBreakingState
/// Refactoring TODO: should we use a quick check result?
///
/// Returns a non-nil value if it can be determined whether there is a
/// grapheme break between `scalar1` and `scalar2` without knowing anything
/// about the scalars that precede `scalar1`. This can optionally be used as
@@ -523,13 +532,7 @@ extension Unicode {
between scalar1: Unicode.Scalar,
and scalar2: Unicode.Scalar
) -> Bool? {
if scalar1.value == 0xD, scalar2.value == 0xA {
return false
}
if _hasGraphemeBreakBetween(scalar1, scalar2) {
return true
}
return nil
_quickHasGraphemeBreakBetween(scalar1, scalar2)
}
/// Initialize a new character recognizer at the _start of text_ (sot)
@@ -637,59 +640,76 @@ extension _StringGuts {
nextScalar: (Int) -> (scalar: Unicode.Scalar, end: Int)?
) -> Int {
_internalInvariant(index < endIndex._encodedOffset)
return _nextGraphemeClusterBoundary(startingAt: index, nextScalar: nextScalar)
}
}
// Note: If `index` in't already on a boundary, then starting with an empty
// state here sometimes leads to this method returning results that diverge
// from the true breaks in the string.
var state = _GraphemeBreakingState()
var (scalar, index) = nextScalar(index)!
internal func _nextGraphemeClusterBoundary(
startingAt index: Int,
nextScalar: (Int) -> (scalar: Unicode.Scalar, end: Int)?
) -> Int {
while true {
guard let (scalar2, nextIndex) = nextScalar(index) else { break }
if state.shouldBreak(between: scalar, and: scalar2) {
break
}
index = nextIndex
scalar = scalar2
// Note: If `index` isn't already on a boundary, then starting with an empty
// state here sometimes leads to this method returning results that diverge
// from the true breaks in the string.
var state = _GraphemeBreakingState()
var (scalar, index) = nextScalar(index)!
while true {
guard let (scalar2, nextIndex) = nextScalar(index) else { break }
if state.shouldBreak(between: scalar, and: scalar2) {
break
}
return index
index = nextIndex
scalar = scalar2
}
// Returns the stride of the grapheme cluster ending at offset `index`.
//
// This method uses `previousScalar` to look back in the string as far as
// necessary to find a correct grapheme cluster boundary, whether or not
// `index` happens to be on a boundary itself.
internal func previousBoundary(
return index
}
extension _StringGuts {
fileprivate func previousBoundary(
endingAt index: Int,
previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
) -> Int {
// FIXME: This requires potentially arbitrary lookback in each iteration,
// leading to quadratic behavior in some edge cases. Ideally lookback should
// only be done once per cluster (or in the case of RI sequences, once per
// flag sequence). One way to avoid most quadratic behavior is to replace
// this implementation with a scheme that first searches backwards for a
// safe point then iterates forward using the regular `shouldBreak` until we
// reach `index`, as recommended in section 6.4 of TR#29.
//
// https://www.unicode.org/reports/tr29/#Random_Access
var (scalar2, index) = previousScalar(index)!
while true {
guard let (scalar1, previousIndex) = previousScalar(index) else { break }
if shouldBreakWithLookback(
between: scalar1, and: scalar2, at: index, with: previousScalar
) {
break
}
index = previousIndex
scalar2 = scalar1
}
return index
_previousGraphemeClusterBoundary(endingAt: index, previousScalar: previousScalar)
}
}
// Returns the start offset of the grapheme cluster ending at offset `index`.
//
// `previousScalar` is called to look back in the string as far as necessary to
// find a correct grapheme cluster boundary, whether or not `index` happens to
// be on a boundary itself. It must return the scalar that ends at the given
// offset together with that scalar's start offset, or nil when there is no
// earlier scalar. There must be at least one scalar before `index`; the first
// lookup is force-unwrapped.
internal func _previousGraphemeClusterBoundary(
  endingAt index: Int,
  previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
) -> Int {
  // FIXME: This requires potentially arbitrary lookback in each iteration,
  // leading to quadratic behavior in some edge cases. Ideally lookback should
  // only be done once per cluster (or in the case of RI sequences, once per
  // flag sequence). One way to avoid most quadratic behavior is to replace
  // this implementation with a scheme that first searches backwards for a
  // safe point then iterates forward using the regular `shouldBreak` until we
  // reach `index`, as recommended in section 6.4 of TR#29.
  //
  // https://www.unicode.org/reports/tr29/#Random_Access
  var (trailing, boundary) = previousScalar(index)!

  // Keep absorbing the scalar before `boundary` until the text runs out or a
  // break is found between that scalar and the current head of the cluster.
  while let (leading, leadingStart) = previousScalar(boundary),
        !_shouldBreakWithLookback(
          between: leading, and: trailing, at: boundary, with: previousScalar)
  {
    boundary = leadingStart
    trailing = leading
  }
  return boundary
}
extension _GraphemeBreakingState {
@@ -708,13 +728,8 @@ extension _GraphemeBreakingState {
between scalar1: Unicode.Scalar,
and scalar2: Unicode.Scalar
) -> Bool {
// GB3
if scalar1.value == 0xD, scalar2.value == 0xA {
return false
}
if _hasGraphemeBreakBetween(scalar1, scalar2) {
return true
if let result = _quickHasGraphemeBreakBetween(scalar1, scalar2) {
return result
}
let x = Unicode._GraphemeBreakProperty(from: scalar1)
@@ -868,289 +883,282 @@ extension _GraphemeBreakingState {
}
}
extension _StringGuts {
// Return true if there is an extended grapheme cluster boundary between two
// scalars, with no previous knowledge about preceding scalars.
//
// This method looks back as far as it needs to determine the correct
// placement of boundaries.
//
// This is based off of the Unicode Annex #29 for [Grapheme Cluster Boundary
// Rules](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules).
internal func shouldBreakWithLookback(
between scalar1: Unicode.Scalar,
and scalar2: Unicode.Scalar,
at index: Int,
with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
) -> Bool {
// GB3
if scalar1.value == 0xD, scalar2.value == 0xA {
return false
}
if _hasGraphemeBreakBetween(scalar1, scalar2) {
return true
}
let x = Unicode._GraphemeBreakProperty(from: scalar1)
let y = Unicode._GraphemeBreakProperty(from: scalar2)
switch (x, y) {
// Fast path: If we know our scalars have no properties the decision is
// trivial and we don't need to crawl to the default statement.
case (.any, .any):
return true
// GB4
case (.control, _):
return true
// GB5
case (_, .control):
return true
// GB6
case (.l, .l),
(.l, .v),
(.l, .lv),
(.l, .lvt):
return false
// GB7
case (.lv, .v),
(.v, .v),
(.lv, .t),
(.v, .t):
return false
// GB8
case (.lvt, .t),
(.t, .t):
return false
// GB9
case (_, .extend),
(_, .zwj):
return false
// GB9a
case (_, .spacingMark):
return false
// GB9b
case (.prepend, _):
return false
// GB11
case (.zwj, .extendedPictographic):
return !checkIfInEmojiSequence(at: index, with: previousScalar)
// GB12 & GB13
case (.regionalIndicator, .regionalIndicator):
return countRIs(at: index, with: previousScalar)
// GB999
default:
// GB9c
//
// Check if our rhs is an InCB=Consonant first because we can more easily
// exit out of this branch in most cases. Otherwise, this is a consonant.
// Check that the lhs is an InCB=Extend or InCB=Linker (we have to check
// if it's an .extend or .zwj first because _isInCBExtend assumes that it
// is true).
if scalar2._isInCBConsonant,
(x == .extend || x == .zwj),
(scalar1._isInCBExtend || scalar1._isInCBLinker) {
return !checkIfInIndicSequence(at: index, with: previousScalar)
}
return true
}
// Return true if there is an extended grapheme cluster boundary between two
// scalars, with no previous knowledge about preceding scalars.
//
// This method looks back as far as it needs to determine the correct
// placement of boundaries.
//
// This is based off of the Unicode Annex #29 for [Grapheme Cluster Boundary
// Rules](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules).
fileprivate func _shouldBreakWithLookback(
between scalar1: Unicode.Scalar,
and scalar2: Unicode.Scalar,
at index: Int,
with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
) -> Bool {
if let result = _quickHasGraphemeBreakBetween(scalar1, scalar2) {
return result
}
// When walking backwards, it's impossible to know whether we were in an emoji
// sequence without walking further backwards. This walks the string backwards
// enough until we figure out whether or not to break our
// (.zwj, .extendedPictographic) question. For example:
//
// Scalar view #1:
//
// [.control, .zwj, .extendedPictographic]
// ^
// | = To determine whether or not we break here, we need
// to see the previous scalar's grapheme property.
// ^
// | = This is neither .extendedPictographic nor .extend, thus we
// were never in an emoji sequence, so break between the .zwj
// and .extendedPictographic.
//
// Scalar view #2:
//
// [.extendedPictographic, .zwj, .extendedPictographic]
// ^
// | = Same as above, move backwards one to
// view the previous scalar's property.
// ^
// | = This is an .extendedPictographic, so this indicates that
// we are in an emoji sequence, so we should NOT break
// between the .zwj and .extendedPictographic.
//
// Scalar view #3:
//
// [.extendedPictographic, .extend, .extend, .zwj, .extendedPictographic]
// ^
// | = Same as above
// ^
// | = This is an .extend which means
// there is a potential emoji
// sequence, walk further backwards
// to find an .extendedPictographic.
//
// <-- = Another extend, go backwards more.
// ^
// | = We found our starting .extendedPictographic letting us
// know that we are in an emoji sequence so our initial
// break question is answered as NO.
internal func checkIfInEmojiSequence(
at index: Int,
with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
) -> Bool {
guard var i = previousScalar(index)?.start else { return false }
while let prev = previousScalar(i) {
i = prev.start
let gbp = Unicode._GraphemeBreakProperty(from: prev.scalar)
let x = Unicode._GraphemeBreakProperty(from: scalar1)
let y = Unicode._GraphemeBreakProperty(from: scalar2)
switch gbp {
case .extend:
continue
case .extendedPictographic:
return true
default:
return false
}
}
switch (x, y) {
// Fast path: If we know our scalars have no properties the decision is
// trivial and we don't need to crawl to the default statement.
case (.any, .any):
return true
// GB4
case (.control, _):
return true
// GB5
case (_, .control):
return true
// GB6
case (.l, .l),
(.l, .v),
(.l, .lv),
(.l, .lvt):
return false
}
// When walking backwards, it's impossible to know whether we break when we
// see our first (InCB=Extend, InCB=Consonant) or (InCB=Linker, InCB=Consonant)
// without walking further backwards. This walks the string backwards enough
// until we figure out whether or not to break this indic sequence. For example:
//
// Scalar view #1:
//
// [InCB=Linker, InCB=Extend, InCB=Consonant]
// ^
// | = To be able to know whether or not to
// break these two, we need to walk
// backwards to determine if this is a
// legitimate indic sequence.
// ^
// | = The scalar sequence ends without a starting InCB=Consonant,
// so this is in fact not an indic sequence, so we can break the two.
//
// Scalar view #2:
//
// [InCB=Consonant, InCB=Linker, InCB=Extend, InCB=Consonant]
// ^
// | = Same as above
// ^
// | = This is a Linker, so we at least have seen
// 1 to be able to return true if we see a
// consonant later.
// ^
// | = Is a consonant and we've seen a linker, so this is a
// legitimate indic sequence, so do NOT break the initial question.
internal func checkIfInIndicSequence(
at index: Int,
with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
) -> Bool {
guard let p = previousScalar(index) else { return false }
var hasSeenInCBLinker = p.scalar._isInCBLinker
var i = p.start
while let (scalar, prev) = previousScalar(i) {
i = prev
if scalar._isInCBConsonant {
return hasSeenInCBLinker
}
let gbp = Unicode._GraphemeBreakProperty(from: scalar)
guard gbp == .extend || gbp == .zwj else {
return false
}
switch (scalar._isInCBExtend, scalar._isInCBLinker) {
case (false, false):
return false
case (false, true):
hasSeenInCBLinker = true
case (true, false):
continue
case (true, true):
// This case should never happen, but if it does then just be cautious
// and say this is invalid.
return false
}
}
// GB7
case (.lv, .v),
(.v, .v),
(.lv, .t),
(.v, .t):
return false
}
// When walking backwards, it's impossible to know whether we break when we
// see our first (.regionalIndicator, .regionalIndicator) without walking
// further backwards. This walks the string backwards enough until we figure
// out whether or not to break these RIs. For example:
//
// Scalar view #1:
//
// [.control, .regionalIndicator, .regionalIndicator]
// ^
// | = To be able to know whether or not to
// break these two, we need to walk
// backwards to determine if there were
// any previous .regionalIndicators in
// a row.
// ^
// | = Not a .regionalIndicator, so our total riCount is 0 and 0 is
// even thus we do not break.
//
// Scalar view #2:
//
// [.control, .regionalIndicator, .regionalIndicator, .regionalIndicator]
// ^
// | = Same as above
// ^
// | = This is a .regionalIndicator, so continue
// walking backwards for more of them. riCount is
// now equal to 1.
// ^
// | = Not a .regionalIndicator. riCount = 1 which is odd, so break
// the last two .regionalIndicators.
internal func countRIs(
at index: Int,
with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
) -> Bool {
guard let p = previousScalar(index) else { return false }
var i = p.start
var riCount = 0
while let p = previousScalar(i) {
i = p.start
// GB8
case (.lvt, .t),
(.t, .t):
return false
let gbp = Unicode._GraphemeBreakProperty(from: p.scalar)
guard gbp == .regionalIndicator else {
break
}
// GB9
case (_, .extend),
(_, .zwj):
return false
riCount += 1
// GB9a
case (_, .spacingMark):
return false
// GB9b
case (.prepend, _):
return false
// GB11
case (.zwj, .extendedPictographic):
return !_checkIfInEmojiSequence(at: index, with: previousScalar)
// GB12 & GB13
case (.regionalIndicator, .regionalIndicator):
return _countRIs(at: index, with: previousScalar)
// GB999
default:
// GB9c
//
// Check if our rhs is an InCB=Consonant first because we can more easily
// exit out of this branch in most cases. Otherwise, this is a consonant.
// Check that the lhs is an InCB=Extend or InCB=Linker (we have to check
// if it's an .extend or .zwj first because _isInCBExtend assumes that it
// is true).
if scalar2._isInCBConsonant,
(x == .extend || x == .zwj),
(scalar1._isInCBExtend || scalar1._isInCBLinker) {
return !_checkIfInIndicSequence(at: index, with: previousScalar)
}
return riCount & 1 != 0
return true
}
}
// When walking backwards, it's impossible to know whether we were in an emoji
// sequence without walking further backwards. This walks the string backwards
// enough until we figure out whether or not to break our
// (.zwj, .extendedPictographic) question. For example:
//
// Scalar view #1:
//
// [.control, .zwj, .extendedPictographic]
// ^
// | = To determine whether or not we break here, we need
// to see the previous scalar's grapheme property.
// ^
// | = This is neither .extendedPictographic nor .extend, thus we
// were never in an emoji sequence, so break between the .zwj
// and .extendedPictographic.
//
// Scalar view #2:
//
// [.extendedPictographic, .zwj, .extendedPictographic]
// ^
// | = Same as above, move backwards one to
// view the previous scalar's property.
// ^
// | = This is an .extendedPictographic, so this indicates that
// we are in an emoji sequence, so we should NOT break
// between the .zwj and .extendedPictographic.
//
// Scalar view #3:
//
// [.extendedPictographic, .extend, .extend, .zwj, .extendedPictographic]
// ^
// | = Same as above
// ^
// | = This is an .extend which means
// there is a potential emoji
// sequence, walk further backwards
// to find an .extendedPictographic.
//
// <-- = Another extend, go backwards more.
// ^
// | = We found our starting .extendedPictographic letting us
// know that we are in an emoji sequence so our initial
// break question is answered as NO.
// Skips the scalar immediately before `index`, then walks backwards over any
// run of `.extend` scalars. Returns true only if that run is preceded by an
// `.extendedPictographic` scalar; returns false for any other property or if
// the text runs out first.
fileprivate func _checkIfInEmojiSequence(
  at index: Int,
  with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
) -> Bool {
  guard let first = previousScalar(index) else { return false }
  var cursor = first.start

  while let candidate = previousScalar(cursor) {
    let property = Unicode._GraphemeBreakProperty(from: candidate.scalar)
    if property == .extendedPictographic {
      return true
    }
    guard property == .extend else {
      return false
    }
    // Another `.extend`: keep looking further back.
    cursor = candidate.start
  }
  return false
}
// When walking backwards, it's impossible to know whether we break when we
// see our first (InCB=Extend, InCB=Consonant) or (InCB=Linker, InCB=Consonant)
// without walking further backwards. This walks the string backwards enough
// until we figure out whether or not to break this indic sequence. For example:
//
// Scalar view #1:
//
// [InCB=Linker, InCB=Extend, InCB=Consonant]
// ^
// | = To be able to know whether or not to
// break these two, we need to walk
// backwards to determine if this is a
// legitimate indic sequence.
// ^
// | = The scalar sequence ends without a starting InCB=Consonant,
// so this is in fact not an indic sequence, so we can break the two.
//
// Scalar view #2:
//
// [InCB=Consonant, InCB=Linker, InCB=Extend, InCB=Consonant]
// ^
// | = Same as above
// ^
// | = This is a Linker, so we at least have seen
// 1 to be able to return true if we see a
// consonant later.
// ^
// | = Is a consonant and we've seen a linker, so this is a
// legitimate indic sequence, so do NOT break the initial question.
// Walks backwards from the scalar before `index` looking for an
// InCB=Consonant. Returns true only if a consonant is reached through scalars
// that are each exactly one of InCB=Extend / InCB=Linker and at least one
// linker was seen on the way (the scalar right before `index` counts).
fileprivate func _checkIfInIndicSequence(
  at index: Int,
  with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
) -> Bool {
  guard let first = previousScalar(index) else { return false }
  var sawLinker = first.scalar._isInCBLinker
  var cursor = first.start

  while let step = previousScalar(cursor) {
    cursor = step.start
    let current = step.scalar

    if current._isInCBConsonant {
      return sawLinker
    }

    // Only `.extend` / `.zwj` scalars may sit between the consonants; this
    // check must come before the InCB queries below.
    switch Unicode._GraphemeBreakProperty(from: current) {
    case .extend, .zwj:
      break
    default:
      return false
    }

    let isExtend = current._isInCBExtend
    let isLinker = current._isInCBLinker
    // Neither flag set is not part of a sequence; both set should never
    // happen, and is cautiously treated as invalid as well.
    guard isExtend != isLinker else { return false }
    if isLinker {
      sawLinker = true
    }
  }
  return false
}
// When walking backwards, it's impossible to know whether we break when we
// see our first (.regionalIndicator, .regionalIndicator) without walking
// further backwards. This walks the string backwards enough until we figure
// out whether or not to break these RIs. For example:
//
// Scalar view #1:
//
// [.control, .regionalIndicator, .regionalIndicator]
// ^
// | = To be able to know whether or not to
// break these two, we need to walk
// backwards to determine if there were
// any previous .regionalIndicators in
// a row.
// ^
// | = Not a .regionalIndicator, so our total riCount is 0 and 0 is
// even thus we do not break.
//
// Scalar view #2:
//
// [.control, .regionalIndicator, .regionalIndicator, .regionalIndicator]
// ^
// | = Same as above
// ^
// | = This is a .regionalIndicator, so continue
// walking backwards for more of them. riCount is
// now equal to 1.
// ^
// | = Not a .regionalIndicator. riCount = 1 which is odd, so break
// the last two .regionalIndicators.
// Skips the scalar immediately before `index`, then counts the consecutive
// `.regionalIndicator` scalars that precede it. Returns true when that count
// is odd, and false when it is even or when there is no scalar before `index`.
fileprivate func _countRIs(
  at index: Int,
  with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
) -> Bool {
  guard let first = previousScalar(index) else { return false }
  var cursor = first.start
  var precedingCountIsOdd = false

  // Only the parity of the run length matters, so flip a flag per indicator.
  while let candidate = previousScalar(cursor),
        Unicode._GraphemeBreakProperty(from: candidate.scalar)
          == .regionalIndicator
  {
    precedingCountIsOdd.toggle()
    cursor = candidate.start
  }
  return precedingCountIsOdd
}

View File

@@ -52,3 +52,119 @@ extension UnsafeBufferPointer where Element == UInt8 {
return unsafe !UTF8.isContinuation(self[offset])
}
}
/// Performs the NFC quick check for a single scalar.
///
/// Returns `false` if the scalar's `ccc` is non-zero and smaller than
/// `prevCCC`, or if its normalization data is not flagged `isNFCQC`.
/// Otherwise records the scalar's `ccc` in `prevCCC` and returns `true`.
/// `prevCCC` is left unchanged when the check fails.
internal func _isScalarNFCQC(
  _ scalar: Unicode.Scalar,
  _ prevCCC: inout UInt8
) -> Bool {
  let normData = Unicode._NormData(scalar, fastUpperbound: 0x300)
  let ccc = normData.ccc

  // A non-zero ccc must not be lower than the previous scalar's ccc.
  guard ccc == 0 || prevCCC <= ccc else { return false }
  guard normData.isNFCQC else { return false }

  prevCCC = ccc
  return true
}
// NFC helpers for a slice of string storage, built on the file-level
// `_isScalarNFCQC` and `_nfcQuickCheck` functions.
extension _StringGutsSlice {
/// Calls `f` once for every UTF-8 code unit of this slice's NFC form.
///
/// The slice's own bytes are passed through unchanged when the storage is
/// flagged NFC or when every scalar passes the quick check; otherwise the
/// scalars of the `_internalNFC` view are re-encoded and passed to `f`.
/// Any error thrown by `f` is rethrown.
internal func _withNFCCodeUnits(_ f: (UInt8) throws -> Void) rethrows {
let substring = String(_guts)[range]
// Fast path: If we're already NFC (or ASCII), then we don't need to do
// anything at all.
if _fastPath(_guts.isNFC) {
try substring.utf8.forEach(f)
return
}
var isNFCQC = true
var prevCCC: UInt8 = 0
if _guts.isFastUTF8 {
_fastNFCCheck(&isNFCQC, &prevCCC)
// Because we have access to the fastUTF8, we can go through that instead
// of accessing the UTF8 view on String.
if isNFCQC {
try unsafe withFastUTF8 {
for unsafe byte in unsafe $0 {
try f(byte)
}
}
return
}
} else {
// No fast UTF-8 available: quick-check scalar by scalar through the
// unicode scalar view, stopping at the first failure.
for scalar in substring.unicodeScalars {
if !_isScalarNFCQC(scalar, &prevCCC) {
isNFCQC = false
break
}
}
if isNFCQC {
for byte in substring.utf8 {
try f(byte)
}
return
}
}
// Slow path: the quick check failed, so emit the code units of the
// `_internalNFC` scalars instead of the stored bytes.
for scalar in substring.unicodeScalars._internalNFC {
try scalar.withUTF8CodeUnits {
for unsafe byte in unsafe $0 {
try f(byte)
}
}
}
}
/// Runs `_nfcQuickCheck` over this slice's fast UTF-8 bytes and stores the
/// result in `isNFCQC` (overwriting its previous value). `prevCCC` is passed
/// through to the check and updated by it.
internal func _fastNFCCheck(_ isNFCQC: inout Bool, _ prevCCC: inout UInt8) {
unsafe withFastUTF8 { utf8 in
isNFCQC = unsafe _nfcQuickCheck(utf8, prevCCC: &prevCCC)
}
}
}
/// Runs the Unicode NFC quick check over the scalars encoded in `utf8`.
///
/// Returns `true` if every scalar passes the check, or `false` as soon as one
/// scalar fails `_isScalarNFCQC`.
///
/// - Parameters:
///   - utf8: The code units to scan. Scalars are stepped over by lead byte
///     alone, so this presumably expects validly-encoded UTF-8 (verify
///     against callers).
///   - prevCCC: The `ccc` carried over from the preceding scalar; updated as
///     scalars are visited.
internal func _nfcQuickCheck(
  _ utf8: UnsafeBufferPointer<UInt8>,
  prevCCC: inout UInt8
) -> Bool {
  var offset = 0
  let end = utf8.count

  while offset < end {
    let lead = unsafe utf8[offset]

    // A lead byte of 0xCC or above encodes a scalar at or above 0x300, which
    // needs the full per-scalar check.
    guard lead < 0xCC else {
      let (scalar, len) = unsafe _decodeScalar(utf8, startingAt: offset)
      if !_isScalarNFCQC(scalar, &prevCCC) {
        return false
      }
      offset &+= len
      continue
    }

    // Everything below 0x300 is NFC already. A lead byte under 0xC0 is a
    // single-byte (ASCII) scalar; anything else here is a two-byte sequence.
    offset &+= lead < 0xC0 ? 1 : 2
    // Scalars on this path reset the tracked ccc to 0.
    prevCCC = 0
  }
  return true
}

View File

@@ -18,7 +18,7 @@ private func _isNotOverlong_F0(_ x: UInt8) -> Bool {
return (0x90...0xBF).contains(x)
}
private func _isNotOverlong_F4(_ x: UInt8) -> Bool {
private func _isNotInvalid_F4(_ x: UInt8) -> Bool {
return UTF8.isContinuation(x) && x <= 0x8F
}
@@ -26,7 +26,7 @@ private func _isNotOverlong_E0(_ x: UInt8) -> Bool {
return (0xA0...0xBF).contains(x)
}
private func _isNotOverlong_ED(_ x: UInt8) -> Bool {
private func _isNotInvalid_ED(_ x: UInt8) -> Bool {
return UTF8.isContinuation(x) && x <= 0x9F
}
@@ -34,15 +34,82 @@ internal struct UTF8ExtraInfo: Equatable {
public var isASCII: Bool
}
/// Classifies a non-ASCII byte that is not a valid multi-byte leading byte.
///
/// - `0x80...0xBF` is reported as an unexpected continuation byte.
/// - `0xC0` and `0xC1` are reported as overlong-encoding bytes.
/// - Anything else (expected to be above `0xF4`) is reported as an invalid
///   non-surrogate code point byte.
@inline(never) // slow-path
private func _diagnoseInvalidUTF8MultiByteLeading(
  _ x: UInt8
) -> _UTF8EncodingErrorKind {
  _internalInvariant(x >= 0x80)
  _internalInvariant(!_isUTF8MultiByteLeading(x))

  if (0x80...0xBF).contains(x) {
    return .unexpectedContinuationByte
  }
  if (0xC0..<0xC2).contains(x) {
    return .overlongEncodingByte
  }
  _internalInvariant(x > 0xF4)
  return .invalidNonSurrogateCodePointByte
}
internal enum UTF8ValidationResult {
case success(UTF8ExtraInfo)
case error(toBeReplaced: Range<Int>)
case error(
kind: _UTF8EncodingErrorKind, toBeReplaced: Range<Int>
)
}
// FIXME: refactor other parts of stdlib to avoid this dumb mirror enum
//
// Mirror of UTF8.ValidationError.Kind, available on 6.1
//
// Each kind is identified by a raw `UInt8` value; `_publicKind` converts to
// the public type by raw value.
//
// TODO: embedded?, Codable
internal struct _UTF8EncodingErrorKind:
  Error, Sendable, Hashable, RawRepresentable
{
  internal var rawValue: UInt8

  /// The public `UTF8.ValidationError.Kind` with the same raw value.
  ///
  /// Traps if the public type has no case for `rawValue`.
  @available(SwiftStdlib 6.2, *)
  internal var _publicKind: UTF8.ValidationError.Kind {
    return UTF8.ValidationError.Kind(rawValue: rawValue)!
  }

  @inlinable
  internal init(rawValue: UInt8) {
    self.rawValue = rawValue
  }

  /// A continuation byte (`10xxxxxx`) outside of a multi-byte sequence
  @_alwaysEmitIntoClient
  internal static var unexpectedContinuationByte: Self {
    return Self(rawValue: 0)
  }

  /// A byte in a surrogate code point (`U+D800..U+DFFF`) sequence
  @_alwaysEmitIntoClient
  internal static var surrogateCodePointByte: Self {
    return Self(rawValue: 1)
  }

  /// A byte in an invalid, non-surrogate code point (`>U+10FFFF`) sequence
  @_alwaysEmitIntoClient
  internal static var invalidNonSurrogateCodePointByte: Self {
    return Self(rawValue: 2)
  }

  /// A byte in an overlong encoding sequence
  @_alwaysEmitIntoClient
  internal static var overlongEncodingByte: Self {
    return Self(rawValue: 3)
  }

  /// A multi-byte sequence that is the start of a valid multi-byte scalar
  /// but is cut off before ending correctly
  @_alwaysEmitIntoClient
  internal static var truncatedScalar: Self {
    return Self(rawValue: 4)
  }
}
extension UTF8ValidationResult: Equatable {}
private struct UTF8ValidationError: Error {}
internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationResult {
if unsafe _allASCII(buf) {
return .success(UTF8ExtraInfo(isASCII: true))
@@ -51,12 +118,20 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationR
var iter = unsafe buf.makeIterator()
var lastValidIndex = buf.startIndex
@inline(__always) func guaranteeIn(_ f: (UInt8) -> Bool) throws(UTF8ValidationError) {
guard let cu = unsafe iter.next() else { throw UTF8ValidationError() }
guard f(cu) else { throw UTF8ValidationError() }
@inline(__always) func guarantee(
_ f: (UInt8) -> Bool,
_ err: _UTF8EncodingErrorKind
) throws(_UTF8EncodingErrorKind) {
guard let cu = unsafe iter.next() else {
throw .truncatedScalar
}
guard f(cu) else {
throw err
}
}
@inline(__always) func guaranteeContinuation() throws(UTF8ValidationError) {
try guaranteeIn(UTF8.isContinuation)
@inline(__always) func guaranteeContinuation(
) throws(_UTF8EncodingErrorKind) {
try guarantee(UTF8.isContinuation, .truncatedScalar)
}
func _legacyInvalidLengthCalculation(_ _buffer: (_storage: UInt32, ())) -> Int {
@@ -117,21 +192,40 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationR
return unsafe _legacyNarrowIllegalRange(buf: buf[illegalRange])
}
do {
do throws(_UTF8EncodingErrorKind) {
/*
The table of valid UTF-8 is:
Scalar value Byte 0 Byte 1 Byte 2 Byte 3
U+0000..U+007F 00..7F
U+0080..U+07FF C2..DF Contin
U+0800..U+0FFF E0 A0..BF Contin
U+1000..U+CFFF E1..EC Contin Contin
U+D000..U+D7FF ED 80..9F Contin
U+E000..U+FFFF EE..EF Contin Contin
U+10000..U+3FFFF F0 90..BF Contin Contin
U+40000..U+FFFFF F1..F3 Contin Contin Contin
U+100000..U+10FFFF F4 80..8F Contin Contin
"Contin" is any continuation byte, i.e. 80..BF or 10xxxxxx
*/
var isASCII = true
while let cu = unsafe iter.next() {
if UTF8.isASCII(cu) { lastValidIndex &+= 1; continue }
isASCII = false
if _slowPath(!_isUTF8MultiByteLeading(cu)) {
func fail() throws(UTF8ValidationError) { throw UTF8ValidationError() }
try fail()
throw _diagnoseInvalidUTF8MultiByteLeading(cu)
}
switch cu {
case 0xC2...0xDF:
try guaranteeContinuation()
lastValidIndex &+= 2
case 0xE0:
try guaranteeIn(_isNotOverlong_E0)
try guarantee(_isNotOverlong_E0, .overlongEncodingByte)
try guaranteeContinuation()
lastValidIndex &+= 3
case 0xE1...0xEC:
@@ -139,7 +233,7 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationR
try guaranteeContinuation()
lastValidIndex &+= 3
case 0xED:
try guaranteeIn(_isNotOverlong_ED)
try guarantee(_isNotInvalid_ED, .surrogateCodePointByte)
try guaranteeContinuation()
lastValidIndex &+= 3
case 0xEE...0xEF:
@@ -147,7 +241,7 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationR
try guaranteeContinuation()
lastValidIndex &+= 3
case 0xF0:
try guaranteeIn(_isNotOverlong_F0)
try guarantee(_isNotOverlong_F0, .overlongEncodingByte)
try guaranteeContinuation()
try guaranteeContinuation()
lastValidIndex &+= 4
@@ -157,7 +251,8 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationR
try guaranteeContinuation()
lastValidIndex &+= 4
case 0xF4:
try guaranteeIn(_isNotOverlong_F4)
try guarantee(
_isNotInvalid_F4, .invalidNonSurrogateCodePointByte)
try guaranteeContinuation()
try guaranteeContinuation()
lastValidIndex &+= 4
@@ -167,7 +262,9 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationR
}
return .success(UTF8ExtraInfo(isASCII: isASCII))
} catch {
return unsafe .error(toBeReplaced: findInvalidRange(buf[lastValidIndex...]))
return unsafe .error(
kind: error,
toBeReplaced: findInvalidRange(buf[lastValidIndex...]))
}
}
@@ -214,7 +311,7 @@ internal func repairUTF8(_ input: UnsafeBufferPointer<UInt8>, firstKnownBrokenRa
case .success:
unsafe result.appendInPlace(remainingInput, isASCII: false)
return String(result)
case .error(let newBrokenRange):
case .error(_, let newBrokenRange):
brokenRange = newBrokenRange
}
} while !remainingInput.isEmpty

View File

@@ -0,0 +1,261 @@
extension Unicode.UTF8 {
/**
The kind and location of a UTF-8 encoding error.
Valid UTF-8 is represented by this table:
```
Scalar value Byte 0 Byte 1 Byte 2 Byte 3
U+0000..U+007F 00..7F
U+0080..U+07FF C2..DF 80..BF
U+0800..U+0FFF E0 A0..BF 80..BF
U+1000..U+CFFF E1..EC 80..BF 80..BF
U+D000..U+D7FF ED 80..9F 80..BF
U+E000..U+FFFF EE..EF 80..BF 80..BF
U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
```
### Classifying errors
An *unexpected continuation* is when a continuation byte (`10xxxxxx`) occurs
in a position that should be the start of a new scalar value. Unexpected
continuations can often occur when the input contains arbitrary data
instead of textual content. An unexpected continuation at the start of
input might mean that the input was not correctly sliced along scalar
boundaries or that it does not contain UTF-8.
A *truncated scalar* is a multi-byte sequence that is the start of a valid
multi-byte scalar but is cut off before ending correctly. A truncated
scalar at the end of the input might mean that only part of the entire
input was received.
A *surrogate code point* (`U+D800..U+DFFF`) is invalid UTF-8. Surrogate
code points are used by UTF-16 to encode scalars in the supplementary
planes. Their presence may mean the input was encoded in a different 8-bit
encoding, such as CESU-8, WTF-8, or Java's Modified UTF-8.
An *invalid non-surrogate code point* is any code point higher than
`U+10FFFF`. This can often occur when the input is arbitrary data instead
of textual content.
An *overlong encoding* occurs when a scalar value that could have been
encoded using fewer bytes is encoded in a longer byte sequence. Overlong
encodings are invalid UTF-8 and can lead to security issues if not
correctly detected:
- https://nvd.nist.gov/vuln/detail/CVE-2008-2938
- https://nvd.nist.gov/vuln/detail/CVE-2000-0884
An overlong encoding of `NUL`, `0xC0 0x80`, is used in Java's Modified
UTF-8 but is invalid UTF-8. Overlong encoding errors often catch attempts
to bypass security measures.
### Reporting the range of the error
The range of the error reported follows the *Maximal subpart of an
ill-formed subsequence* algorithm in which each error is either one byte
long or ends before the first byte that is disallowed. See "U+FFFD
Substitution of Maximal Subparts" in the Unicode Standard. Unicode has
recommended this algorithm since version 6, and it has been adopted by the W3C.
The maximal subpart algorithm will produce a single multi-byte range for a
truncated scalar (a multi-byte sequence that is the start of a valid
multi-byte scalar but is cut off before ending correctly). For all other
errors (including overlong encodings, surrogates, and invalid code
points), it will produce an error per byte.
// FIXME: without a checkAllErrors, we don't have these classification distinctions, should we drop it, ensure we will do it, or what?
Since overlong encodings, surrogates, and invalid code points are erroneous
by the second byte (at the latest), the above definition produces the same
ranges as defining such a sequence as a truncated scalar error followed by
unexpected continuation byte errors. The more semantically-rich
classification is reported.
For example, a surrogate code point sequence `ED A0 80` will be reported
as three `.surrogateCodePointByte` errors rather than a `.truncatedScalar`
followed by two `.unexpectedContinuationByte` errors.
Other commonly reported error ranges can be constructed from this result.
For example, PEP 383's error-per-byte can be constructed by mapping over
the reported range. Similarly, constructing a single error for the longest
invalid byte range can be constructed by joining adjacent error ranges.
```
61 F1 80 80 E1 80 C2 62
Longest range U+61 err U+62
Maximal subpart U+61 err err err U+62
Error per byte U+61 err err err err err err U+62
```
*/
@available(SwiftStdlib 6.2, *)
@frozen
public struct ValidationError: Error, Sendable, Hashable {
  /// The kind of encoding error.
  public var kind: Unicode.UTF8.ValidationError.Kind

  /// The range of byte offsets, relative to the validated input, that the
  /// error covers.
  public var byteOffsets: Range<Int>

  /// Creates an error of the given kind covering `byteOffsets`.
  ///
  /// Traps unless the range starts at a non-negative offset and its length
  /// is 1 through 3 bytes for `.truncatedScalar`, or exactly 1 byte for every
  /// other kind.
  @_alwaysEmitIntoClient
  public init(
    _ kind: Unicode.UTF8.ValidationError.Kind,
    _ byteOffsets: Range<Int>
  ) {
    _precondition(byteOffsets.lowerBound >= 0)
    let length = byteOffsets.count
    switch kind {
    case .truncatedScalar:
      // A truncated scalar is a proper prefix of a multi-byte sequence.
      _precondition(length > 0)
      _precondition(length < 4)
    default:
      // Every other kind is reported one byte at a time.
      _precondition(length == 1)
    }
    self.kind = kind
    self.byteOffsets = byteOffsets
  }

  /// Creates a single-byte error of the given kind at `byteOffset`.
  @_alwaysEmitIntoClient
  public init(
    _ kind: Unicode.UTF8.ValidationError.Kind, at byteOffset: Int
  ) {
    let singleByte = byteOffset ..< (byteOffset + 1)
    self.init(kind, singleByte)
  }
}
}
@available(SwiftStdlib 6.2, *)
extension UTF8.ValidationError {
  /// The kind of encoding error encountered during validation.
  ///
  /// Modeled as a raw-representable struct over `UInt8`; only the raw values
  /// `0...4` are accepted by `init?(rawValue:)`.
  @frozen
  public struct Kind: Error, Sendable, Hashable, RawRepresentable {
    public var rawValue: UInt8

    /// Creates a kind from its raw value, or returns `nil` if `rawValue` is
    /// greater than 4.
    @inlinable
    public init?(rawValue: UInt8) {
      if rawValue > 4 { return nil }
      self.rawValue = rawValue
    }

    /// A continuation byte (`10xxxxxx`) outside of a multi-byte sequence.
    @_alwaysEmitIntoClient
    public static var unexpectedContinuationByte: Self {
      Self(rawValue: 0)!
    }

    /// A byte in a surrogate code point (`U+D800..U+DFFF`) sequence.
    @_alwaysEmitIntoClient
    public static var surrogateCodePointByte: Self {
      Self(rawValue: 1)!
    }

    /// A byte in an invalid, non-surrogate code point (`>U+10FFFF`) sequence.
    @_alwaysEmitIntoClient
    public static var invalidNonSurrogateCodePointByte: Self {
      Self(rawValue: 2)!
    }

    /// A byte in an overlong encoding sequence.
    @_alwaysEmitIntoClient
    public static var overlongEncodingByte: Self {
      Self(rawValue: 3)!
    }

    /// A multi-byte sequence that is the start of a valid multi-byte scalar
    /// but is cut off before ending correctly.
    @_alwaysEmitIntoClient
    public static var truncatedScalar: Self {
      Self(rawValue: 4)!
    }
  }
}
@_unavailableInEmbedded
@available(SwiftStdlib 6.2, *)
extension UTF8.ValidationError.Kind: CustomStringConvertible {
  /// The name of the kind, spelled as a leading-dot member reference.
  public var description: String {
    switch self {
    case .unexpectedContinuationByte:
      return ".unexpectedContinuationByte"
    case .surrogateCodePointByte:
      return ".surrogateCodePointByte"
    case .invalidNonSurrogateCodePointByte:
      return ".invalidNonSurrogateCodePointByte"
    case .overlongEncodingByte:
      return ".overlongEncodingByte"
    case .truncatedScalar:
      return ".truncatedScalar"
    default:
      // `init?(rawValue:)` only admits the five raw values matched above.
      fatalError("unreachable")
    }
  }
}
@_unavailableInEmbedded
@available(SwiftStdlib 6.2, *)
extension UTF8.ValidationError: CustomStringConvertible {
  /// A textual rendering of the form `UTF8.ValidationError(<kind>, <range>)`.
  public var description: String {
    let kindText = String(describing: kind)
    let rangeText = String(describing: byteOffsets)
    return "UTF8.ValidationError(\(kindText), \(rangeText))"
  }
}
extension UTF8 {
  /// Collects every validation error in `s`, in order of occurrence.
  ///
  /// The input is copied into an array and validated repeatedly: after each
  /// error, validation restarts just past the end of that error's range. The
  /// reported ranges are offsets into the whole input. An unexpected
  /// continuation byte that directly follows the previous error inherits that
  /// error's kind, so the more specific classification is kept for the whole
  /// ill-formed sequence.
  @available(SwiftStdlib 6.2, *)
  @usableFromInline // for testing purposes
  internal static func _checkAllErrors(
    _ s: some Sequence<UInt8>
  ) -> Array<UTF8.ValidationError> {
    // TODO: Span fast path
    // TODO: Fixed size buffer for non-contig inputs
    // TODO: Lifetime-dependent result variant
    let codeUnits = Array(s)
    return unsafe codeUnits.withUnsafeBytes { bytes in
      // The not-yet-validated tail of the input.
      var remaining = unsafe bytes
      // Offset of `remaining` within the whole input.
      var consumed = 0
      var found: Array<UTF8.ValidationError> = []
      // The most recently reported error, used to propagate its kind onto
      // adjacent unexpected-continuation errors.
      var lastReported: UTF8.ValidationError? = nil

      while true {
        do throws(UTF8.ValidationError) {
          _ = unsafe try remaining.baseAddress!._validateUTF8(limitedBy: remaining.count)
          return found
        } catch {
          let localRange = error.byteOffsets
          let globalRange =
            (localRange.lowerBound + consumed) ..< (localRange.upperBound + consumed)

          var kind = error.kind
          if kind == .unexpectedContinuationByte,
             let last = lastReported,
             last.byteOffsets.upperBound == globalRange.lowerBound
          {
            kind = last.kind
          }

          let reported = UTF8.ValidationError(kind, globalRange)
          found.append(reported)
          lastReported = reported

          // Resume validation immediately after the erroneous bytes.
          let resumeAt = localRange.upperBound
          consumed += resumeAt
          unsafe remaining = .init(rebasing: remaining[resumeAt...])
        }
      }
    }
  }
}

View File

@@ -0,0 +1,235 @@
// TODO: comment header
/// A non-escapable, bitwise-copyable view over contiguous bytes that are
/// treated as validly-encoded UTF-8.
///
/// The span stores an optional base address plus one 64-bit word that packs
/// the code-unit count together with the "known ASCII" and "known NFC" flags.
@frozen
@safe
@available(SwiftStdlib 6.2, *)
public struct UTF8Span: Copyable, ~Escapable, BitwiseCopyable {
// Address of the first code unit. Optional: the unwrapping accessor
// `_start()` below does not check for nil.
@usableFromInline
internal var _unsafeBaseAddress: UnsafeRawPointer?
/*
A bit-packed count and flags (such as isASCII)
b63 b62 b61:56 b55:0
ASCII NFC reserved count
ASCII means the contents are known to be all-ASCII (every code unit <= 0x7F).
NFC means contents are known to be in normal form C for fast comparisons.
*/
@usableFromInline
internal var _countAndFlags: UInt64
// Stores `start` and the already-packed `_countAndFlags` word as given, then
// runs `_invariantCheck()`. No validation of the bytes is performed here.
// @_alwaysEmitIntoClient
@inline(__always)
@lifetime(borrow start) // TODO: borrow or copy?
internal init(
_unsafeAssumingValidUTF8 start: borrowing UnsafeRawPointer,
_countAndFlags: UInt64
) {
unsafe self._unsafeBaseAddress = copy start
self._countAndFlags = _countAndFlags
_invariantCheck()
}
/// Creates a UTF8Span, bypassing safety and security checks. The caller
/// must guarantee that `codeUnits` contains validly-encoded UTF-8, or else
/// undefined behavior may result upon use. If `isKnownASCII: true` is
/// passed, the contents must be ASCII, or else undefined behavior may
/// result upon use.
///
/// The NFC flag is never set by this initializer.
@unsafe
@lifetime(copy codeUnits)
public init(
unchecked codeUnits: Span<UInt8>,
isKnownASCII: Bool = false
) {
self.init(
_uncheckedAssumingValidUTF8: codeUnits,
isKnownASCII: isKnownASCII,
isKnownNFC: false
)
}
// FIXME: we need to make sure ALL API are nil safe, that is they
// at least check the count first
// Returns the base address, unwrapping the optional without a nil check.
@_alwaysEmitIntoClient
internal func _start() -> UnsafeRawPointer {
unsafe _unsafeBaseAddress._unsafelyUnwrappedUnchecked
}
}
// TODO: try to convert code to be ran on Span instead of URP
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
/// Creates a UTF8Span containing `codeUnits`. Validates that the input is
/// valid UTF-8, otherwise throws an error.
///
/// The resulting UTF8Span has the same lifetime constraints as `codeUnits`.
///
/// - Parameter codeUnits: The bytes to validate and view.
/// - Throws: `UTF8.ValidationError` if validation fails.
@lifetime(copy codeUnits)
public init(
validating codeUnits: consuming Span<UInt8>
) throws(UTF8.ValidationError) {
try self.init(_validating: codeUnits)
}
// Validating initializer that backs `init(validating:)`.
// TODO: this doesn't need to be underscored, I don't think
@lifetime(copy codeUnits)
internal init(
_validating codeUnits: consuming Span<UInt8>
) throws(UTF8.ValidationError) {
// A span without a base pointer becomes an empty UTF8Span with a nil base
// address and no flags set; nothing is validated.
guard let basePtr = unsafe codeUnits._pointer else {
unsafe self._unsafeBaseAddress = nil
self._countAndFlags = 0
return
}
let count = codeUnits._count
// Throws on invalid input; the returned Bool is used as the is-ASCII result.
let isASCII = unsafe try basePtr._validateUTF8(limitedBy: count)
unsafe self._unsafeBaseAddress = .init(basePtr)
// Both stored properties must be set before the flag helpers can be called.
self._countAndFlags = UInt64(truncatingIfNeeded: count)
if isASCII {
_setIsASCII()
}
_internalInvariant(self.count == codeUnits.count)
}
// Non-validating initializer: the caller vouches for the UTF-8 validity of
// `codeUnits` and for any flag it passes as `true`.
// TODO: SPI?
@lifetime(copy codeUnits)
internal init(
_uncheckedAssumingValidUTF8 codeUnits: consuming Span<UInt8>,
isKnownASCII: Bool,
isKnownNFC: Bool
) {
// Same empty representation as the validating initializer; the flag
// arguments are ignored on this path.
guard let ptr = unsafe codeUnits._pointer else {
unsafe self._unsafeBaseAddress = nil
self._countAndFlags = 0
return
}
unsafe self._unsafeBaseAddress = ptr
self._countAndFlags = UInt64(truncatingIfNeeded: codeUnits.count)
if isKnownASCII {
_setIsASCII()
}
if isKnownNFC {
_setIsNFC()
}
_internalInvariant(self.count == codeUnits.count)
}
// HACK: working around lack of internal plumbing work
// NOTE(review): goes through `_start()`; confirm callers never reach this
// with a nil base address.
internal var _str: String { unsafe _start()._str(0..<count) }
}
// MARK: String
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
  /// Calls a closure with a pointer to the viewed contiguous storage.
  ///
  /// The buffer pointer passed as an argument to `body` is valid only
  /// during the execution of `_withUnsafeBufferPointer(_:)`.
  /// Do not store or return the pointer for later use.
  ///
  /// A span that has no base address (an empty span created from a `Span`
  /// without a pointer) passes an empty buffer whose `baseAddress` is `nil`.
  ///
  /// - Parameter body: A closure with an `UnsafeBufferPointer` parameter
  ///   that points to the viewed contiguous storage. If `body` has
  ///   a return value, that value is also used as the return value
  ///   for the `_withUnsafeBufferPointer(_:)` method. The closure's
  ///   parameter is valid only for the duration of its execution.
  /// - Returns: The return value of the `body` closure parameter.
  /// - Throws: Whatever `body` throws.
  @_alwaysEmitIntoClient
  borrowing public func _withUnsafeBufferPointer<
    E: Error, Result: ~Copyable //& ~Escapable
  >(
    _ body: (_ buffer: /*borrowing*/ UnsafeBufferPointer<UInt8>) throws(E) -> Result
  ) throws(E) -> Result {
    // The base address is optional. Unwrapping it unchecked for a span that
    // has none would be undefined behavior, so hand out an empty buffer.
    guard let base = unsafe _unsafeBaseAddress else {
      return try unsafe body(UnsafeBufferPointer(start: nil, count: 0))
    }
    return try unsafe body(base._ubp(0..<count))
  }

  // TODO: withSpan or similar?
}
// MARK: Internals
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
// Internal consistency hook. It compiles to an empty always-inlined function
// unless INTERNAL_CHECKS_ENABLED is defined; the checked variant currently has
// no checks either (see the TODO).
#if !INTERNAL_CHECKS_ENABLED
@inline(__always) internal func _invariantCheck() {}
#else
@inline(never) @_effects(releasenone)
internal func _invariantCheck() {
// TODO: validate the UTF-8 as an assertion (and isASCII)
}
#endif
}
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
/// Whether the span contains no code units.
public var isEmpty: Bool {
self.count == 0
}
/// A `Span<UInt8>` over the same base address and `count` as this span.
///
/// The result's lifetime dependence is copied from `self`.
public var span: Span<UInt8> {
@lifetime(copy self)
get {
unsafe Span(_unchecked: _unsafeBaseAddress, count: self.count)
}
}
}
// TODO(toolchain): decide if we rebase on top of Guillaume's work
extension String {
/// Creates a string from the contents of `codeUnits`.
///
/// The span's bytes and its `isKnownASCII` flag are handed to
/// `String._uncheckedFromUTF8`.
@available(SwiftStdlib 6.2, *)
public init(copying codeUnits: UTF8Span) {
let isASCII = codeUnits.isKnownASCII
self = unsafe codeUnits._withUnsafeBufferPointer { bufPtr in
unsafe String._uncheckedFromUTF8(bufPtr, isASCII: isASCII)
}
}
/// A `UTF8Span` over this string's UTF-8 view.
///
/// The span is built without validation from `self.utf8.span`, carries the
/// string's is-ASCII flag, and has its lifetime tied to a borrow of `self`.
@available(SwiftStdlib 6.2, *)
public var utf8Span: UTF8Span {
@lifetime(borrow self)
borrowing get {
let isKnownASCII = _guts.isASCII
let utf8 = self.utf8
let span = utf8.span
let result = unsafe UTF8Span(
unchecked: span,
isKnownASCII: isKnownASCII)
// Re-anchor the result's lifetime on `self` rather than on the local
// `span` value.
return unsafe _overrideLifetime(result, borrowing: self)
}
}
}
extension Substring {
/// A `UTF8Span` over this substring's UTF-8 view.
///
/// The span is built without validation from `self.utf8.span`, and its
/// lifetime is tied to a borrow of `self`. The ASCII flag is taken from the
/// base string's storage, not computed for the slice.
@available(SwiftStdlib 6.2, *)
public var utf8Span: UTF8Span {
@lifetime(borrow self)
borrowing get {
let isKnownASCII = base._guts.isASCII
let utf8 = self.utf8
let span = utf8.span
let result = unsafe UTF8Span(
unchecked: span,
isKnownASCII: isKnownASCII)
// Re-anchor the result's lifetime on `self` rather than on the local
// `span` value.
return unsafe _overrideLifetime(result, borrowing: self)
}
}
}

View File

@@ -0,0 +1,126 @@
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
  /// Returns whether contents are known to be all-ASCII. A return value of
  /// `true` means that all code units are ASCII. A return value of `false`
  /// means there _may_ be non-ASCII content.
  ///
  /// ASCII-ness is checked and remembered during UTF-8 validation, so this
  /// is often equivalent to is-ASCII, but there are some situations where
  /// we might return `false` even when the content happens to be all-ASCII.
  ///
  /// For example, a UTF-8 span generated from a `String` that at some point
  /// contained non-ASCII content would report false for `isKnownASCII`, even
  /// if that String had subsequent mutation operations that removed any
  /// non-ASCII content.
  @_alwaysEmitIntoClient
  public var isKnownASCII: Bool {
    0 != _countAndFlags & Self._asciiBit
  }

  /// Do a scan checking for whether the contents are all-ASCII.
  ///
  /// Updates the `isKnownASCII` bit if contents are all-ASCII.
  ///
  /// - Returns: `true` if every code unit is ASCII.
  @lifetime(self: copy self)
  public mutating func checkForASCII() -> Bool {
    if isKnownASCII { return true }
    // An empty span is vacuously all-ASCII. Answering here also avoids
    // touching the base address, which may be nil for an empty span.
    if isEmpty {
      _setIsASCII()
      return true
    }
    let result = unsafe _withUnsafeBufferPointer {
      unsafe _allASCII($0)
    }
    if result {
      _setIsASCII()
    }
    return result
  }

  /// Returns whether the contents are known to be NFC. This is not
  /// always checked at initialization time and is set by `checkForNFC`.
  // TODO: should this be @_unavailableInEmbedded
  @_alwaysEmitIntoClient
  public var isKnownNFC: Bool {
    0 != _countAndFlags & Self._nfcBit
  }

  // Set the isKnownASCII bit to true (also isKnownNFC)
  @_alwaysEmitIntoClient
  @lifetime(self: copy self)
  internal mutating func _setIsASCII() {
    self._countAndFlags |= Self._asciiBit | Self._nfcBit
  }

  // Set the isKnownNFC bit to true
  @_alwaysEmitIntoClient
  @lifetime(self: copy self)
  internal mutating func _setIsNFC() {
    self._countAndFlags |= Self._nfcBit
  }

  /// Do a scan checking for whether the contents are in Normal Form C.
  /// When the contents are in NFC, canonical equivalence checks are much
  /// faster.
  ///
  /// `quickCheck` will check for a subset of NFC contents using the
  /// NFCQuickCheck algorithm, which is faster than the full normalization
  /// algorithm. However, it cannot detect all NFC contents.
  ///
  /// Updates the `isKnownNFC` bit when the check succeeds.
  ///
  /// - Parameter quickCheck: Pass `true` to run only the quick check, or
  ///   `false` to compare the contents against their NFC normalization.
  /// - Returns: `true` if the contents were determined to be in NFC.
  @_unavailableInEmbedded
  @lifetime(self: copy self)
  public mutating func checkForNFC(
    quickCheck: Bool
  ) -> Bool {
    if isKnownNFC { return true }

    // An empty span is vacuously in NFC. Answering here also keeps both paths
    // below from unwrapping a base address that may be nil for an empty span.
    if isEmpty {
      _setIsNFC()
      return true
    }

    if quickCheck {
      let result = unsafe _withUnsafeBufferPointer { utf8 in
        var prevCCC: UInt8 = 0
        return unsafe _nfcQuickCheck(utf8, prevCCC: &prevCCC)
      }
      if result {
        _setIsNFC()
      }
      return result
    }

    // TODO: use faster internal algorithm
    // Full check: the contents are NFC exactly when they equal their own
    // NFC-normalized code units.
    let normalized = _str._nfcCodeUnits
    guard unsafe _start()._urbp(
      0..<count
    ).elementsEqual(normalized) else {
      return false
    }

    _setIsNFC()
    return true
  }
}
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
  /// Bit 63 of `_countAndFlags`: contents are known to be ASCII.
  @_alwaysEmitIntoClient @inline(__always)
  internal static var _asciiBit: UInt64 {
    return 1 << 63
  }

  /// Bit 62 of `_countAndFlags`: contents are known to be in NFC.
  @_alwaysEmitIntoClient @inline(__always)
  internal static var _nfcBit: UInt64 {
    return 1 << 62
  }

  /// Bits 55 through 0 of `_countAndFlags`: the code-unit count.
  @_alwaysEmitIntoClient @inline(__always)
  internal static var _countMask: UInt64 {
    return (1 << 56) - 1
  }

  /// Bits 63 through 56 of `_countAndFlags`: the flag byte.
  @_alwaysEmitIntoClient @inline(__always)
  internal static var _flagsMask: UInt64 {
    return 0xFF << 56
  }

  /// The number of UTF-8 code units in the span.
  @_alwaysEmitIntoClient
  public var count: Int {
    let countBits = _countAndFlags & Self._countMask
    return Int(truncatingIfNeeded: countBits)
  }
}

View File

@@ -0,0 +1,100 @@
// TODO: comment header
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
/// Whether this span's code units are element-wise equal to `other`,
/// including having the same length.
@_alwaysEmitIntoClient
public func bytesEqual(to other: some Sequence<UInt8>) -> Bool {
  return unsafe _withUnsafeBufferPointer { codeUnits in
    unsafe codeUnits.elementsEqual(other)
  }
}
/// Whether this span has the same `Unicode.Scalar`s as `other`.
///
/// Returns `true` only if both sides produce the same number of scalars and
/// the scalars are pairwise equal. In particular, a longer `other` that
/// merely starts with this span's scalars is not equal.
@_alwaysEmitIntoClient
public func unicodeScalarsEqual(
  to other: some Sequence<Unicode.Scalar>
) -> Bool {
  // TODO: We don't need to decode our code units, we can just match
  // against their scalars' encoded bytes

  var scalars = makeUnicodeScalarIterator()
  var otherScalars = other.makeIterator()
  while let s = scalars.next() {
    guard let otherS = otherScalars.next(), s == otherS else {
      return false
    }
  }
  // The loop ends only once our own scalars are exhausted, so the remaining
  // question is whether `other` has leftover scalars.
  return otherScalars.next() == nil
}
/// Whether this span has the same `Character`s as `other`.
///
/// Returns `true` only if both sides produce the same number of characters
/// and the characters are pairwise equal. In particular, a longer `other`
/// that merely starts with this span's characters is not equal.
@_unavailableInEmbedded
@_alwaysEmitIntoClient
public func charactersEqual(
  to other: some Sequence<Character>
) -> Bool {
  var chars = makeCharacterIterator()
  var otherChars = other.makeIterator()
  while let c = chars.next() {
    guard let otherC = otherChars.next(), c == otherC else {
      return false
    }
  }
  // The loop ends only once our own characters are exhausted, so the
  // remaining question is whether `other` has leftover characters.
  return otherChars.next() == nil
}
}
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
  /// Whether `self` is equivalent to `other` under Unicode Canonical
  /// Equivalence.
  ///
  /// When both spans are already known to be in NFC, that fact is forwarded
  /// to the comparison via its `bothNFC` argument.
  public func isCanonicallyEquivalent(
    to other: UTF8Span
  ) -> Bool {
    let bothNFC = isKnownNFC && other.isKnownNFC
    return unsafe _withUnsafeBufferPointer { lhs in
      unsafe other._withUnsafeBufferPointer { rhs in
        unsafe _stringCompareFastUTF8(
          lhs, rhs, expecting: .equal, bothNFC: bothNFC)
      }
    }
  }

  /// Whether `self` orders less than `other` under Unicode Canonical
  /// Equivalence using normalized code-unit order (in NFC).
  ///
  /// When both spans are already known to be in NFC, that fact is forwarded
  /// to the comparison via its `bothNFC` argument.
  public func isCanonicallyLessThan(
    _ other: UTF8Span
  ) -> Bool {
    let bothNFC = isKnownNFC && other.isKnownNFC
    return unsafe _withUnsafeBufferPointer { lhs in
      unsafe other._withUnsafeBufferPointer { rhs in
        unsafe _stringCompareFastUTF8(
          lhs, rhs, expecting: .less, bothNFC: bothNFC)
      }
    }
  }
}
// // FIXME: remove
// @available(SwiftStdlib 6.2, *)
// extension UTF8Span {
// public static func ~=(_ lhs: StaticString, _ rhs: UTF8Span) -> Bool {
// return lhs.withUTF8Buffer { str in
// rhs._withUnsafeBufferPointer { span in
// str.elementsEqual(span)
// }
// }
// }
// }

View File

@@ -0,0 +1,360 @@
// Core Scalar API
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
  /// Whether `i` is on a boundary between Unicode scalar values.
  ///
  /// `0` and `count` are always boundaries. For any other `i` the bounds are
  /// only verified by an internal invariant; this is an unsafe operation.
  internal func _isScalarAligned(unchecked i: Int) -> Bool {
    guard i != count, i != 0 else { return true }
    _internalInvariant(_boundsCheck(i))
    return unsafe _start()._isScalarAligned(i)
  }

  /// Returns the start of the `Unicode.Scalar` ending at `i`, i.e. the scalar
  /// before the one starting at `i` or the last scalar if `i` is the end of
  /// the span.
  ///
  /// `i` must be scalar-aligned. Traps if `i - 1` is out of bounds.
  internal func _previousScalarStart(_ i: Int) -> Int {
    let lastByteOfPrevious = i &- 1
    precondition(_boundsCheck(lastByteOfPrevious))
    return _previousScalarStart(unchecked: i)
  }

  /// Bounds-unchecked variant of `_previousScalarStart(_:)`.
  ///
  /// `i` must be scalar-aligned; that is still checked with a precondition.
  ///
  /// This function does not validate that `i` is within the span's bounds;
  /// this is an unsafe operation.
  internal func _previousScalarStart(unchecked i: Int) -> Int {
    let lastByteOfPrevious = i &- 1
    _internalInvariant(_boundsCheck(lastByteOfPrevious))
    precondition(_isScalarAligned(unchecked: i))
    return _previousScalarStart(uncheckedAssumingAligned: i)
  }

  /// Bounds- and alignment-unchecked variant of `_previousScalarStart(_:)`.
  ///
  /// This function does not validate that `i` is within the span's bounds;
  /// this is an unsafe operation.
  ///
  /// This function does not validate that `i` is scalar-aligned; this is an
  /// unsafe operation if `i` isn't.
  internal func _previousScalarStart(
    uncheckedAssumingAligned i: Int
  ) -> Int {
    let lastByteOfPrevious = i &- 1
    _internalInvariant(_boundsCheck(lastByteOfPrevious))
    _internalInvariant(_isScalarAligned(unchecked: i))
    return unsafe _start()._previousScalarStart(i)
  }

  /// Decode the `Unicode.Scalar` starting at `i`. Return it and the start of
  /// the next scalar.
  ///
  /// `i` must be scalar-aligned.
  ///
  /// This function does not validate that `i` is within the span's bounds;
  /// this is an unsafe operation.
  ///
  /// This function does not validate that `i` is scalar-aligned; this is an
  /// unsafe operation if `i` isn't.
  internal func _decodeNextScalar(
    uncheckedAssumingAligned i: Int
  ) -> (Unicode.Scalar, nextScalarStart: Int) {
    _internalInvariant(_boundsCheck(i))
    _internalInvariant(_isScalarAligned(unchecked: i))
    return unsafe _start()._decodeScalar(startingAt: i)
  }

  /// Decode the `Unicode.Scalar` ending at `i`, i.e. the previous scalar.
  /// Return it and the start of that scalar.
  ///
  /// `i` must be scalar-aligned. Traps if `i - 1` is out of bounds.
  internal func _decodePreviousScalar(
    _ i: Int
  ) -> (Unicode.Scalar, previousScalarStart: Int) {
    let lastByteOfPrevious = i &- 1
    precondition(_boundsCheck(lastByteOfPrevious))
    return _decodePreviousScalar(unchecked: i)
  }

  /// Bounds-unchecked variant of `_decodePreviousScalar(_:)`.
  ///
  /// `i` must be scalar-aligned; that is still checked with a precondition.
  ///
  /// This function does not validate that `i` is within the span's bounds;
  /// this is an unsafe operation.
  internal func _decodePreviousScalar(
    unchecked i: Int
  ) -> (Unicode.Scalar, previousScalarStart: Int) {
    let lastByteOfPrevious = i &- 1
    _internalInvariant(_boundsCheck(lastByteOfPrevious))
    precondition(_isScalarAligned(unchecked: i))
    return _decodePreviousScalar(uncheckedAssumingAligned: i)
  }

  /// Bounds- and alignment-unchecked variant of `_decodePreviousScalar(_:)`.
  ///
  /// This function does not validate that `i` is within the span's bounds;
  /// this is an unsafe operation.
  ///
  /// This function does not validate that `i` is scalar-aligned; this is an
  /// unsafe operation if `i` isn't.
  internal func _decodePreviousScalar(
    uncheckedAssumingAligned i: Int
  ) -> (Unicode.Scalar, previousScalarStart: Int) {
    let lastByteOfPrevious = i &- 1
    _internalInvariant(_boundsCheck(lastByteOfPrevious))
    _internalInvariant(_isScalarAligned(unchecked: i))
    return unsafe _start()._decodeScalar(endingAt: i)
  }
}
// Derived Scalar API
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
  /// Find the nearest scalar-aligned position `<= i`.
  ///
  /// `0` and `count` are returned unchanged. Traps if any other `i` is out of
  /// bounds.
  internal func _scalarAlignBackwards(_ i: Int) -> Int {
    if i == count || i == 0 { return i }

    precondition(_boundsCheck(i))
    return unsafe _start()._scalarAlign(i)
  }

  /// Find the nearest scalar-aligned position `>= i`.
  ///
  /// `0` and `count` are returned unchanged. Traps if any other `i` is out of
  /// bounds, mirroring `_scalarAlignBackwards(_:)`.
  internal func _scalarAlignForwards(_ i: Int) -> Int {
    // TODO: implement directly
    if i == count || i == 0 { return i }

    // Reject out-of-bounds positions before any byte is inspected. From an
    // in-bounds start the scan stops at `count` at the latest, because
    // `_isScalarAligned(unchecked:)` reports `count` as aligned.
    precondition(_boundsCheck(i))
    return _scalarAlignForwards(unchecked: i)
  }

  /// Find the nearest scalar-aligned position `>= i`.
  ///
  /// This function does not validate that `i` is within the span's bounds;
  /// this is an unsafe operation.
  internal func _scalarAlignForwards(unchecked i: Int) -> Int {
    if i == count || i == 0 { return i }

    var i = i
    while _slowPath(!_isScalarAligned(unchecked: i)) {
      i &+= 1
    }
    return i
  }
}
// Core Character API
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
/// Returns the start of the next `Character` (i.e. grapheme cluster) after
/// the one starting at `i`, or the end of the span if `i` denotes the final
/// `Character`.
///
/// `i` must be `Character`-aligned.
internal func _nextCharacterStart(_ i: Int) -> Int {
precondition(_boundsCheck(i))
return _nextCharacterStart(unchecked: i)
}
/// Returns the start of the next `Character` (i.e. grapheme cluster) after
/// the one starting at `i`, or the end of the span if `i` denotes the final
/// `Character`.
///
/// `i` must be `Character`-aligned.
///
/// This function does not validate that `i` is within the span's bounds;
/// this is an unsafe operation.
internal func _nextCharacterStart(unchecked i: Int) -> Int {
_internalInvariant(_boundsCheck(i))
precondition(_isScalarAligned(unchecked: i))
return _nextCharacterStart(uncheckedAssumingAligned: i)
}
/// Returns the start of the next `Character` (i.e. grapheme cluster) after
/// the one starting at `i`, or the end of the span if `i` denotes the final
/// `Character`.
///
/// `i` must be `Character`-aligned.
///
/// This function does not validate that `i` is within the span's bounds;
/// this is an unsafe operation.
///
/// This function does not validate that `i` is `Character`-aligned; this is
/// an unsafe operation if `i` isn't.
internal func _nextCharacterStart(
uncheckedAssumingAligned i: Int
) -> Int {
_internalInvariant(_boundsCheck(i))
_internalInvariant(_isScalarAligned(unchecked: i))
return unsafe _start()._nextCharacterStart(i, limitedBy: count)
}
/// Returns the start of the `Character` (i.e. grapheme cluster) ending at
/// `i`, i.e. the `Character` before the one starting at `i` or the last
/// `Character` if `i` is the end of the span.
///
/// `i` must be `Character`-aligned.
internal func _previousCharacterStart(_ i: Int) -> Int {
precondition(_boundsCheck(i&-1))
return _previousCharacterStart(unchecked: i)
}
/// Returns the start of the `Character` (i.e. grapheme cluster) ending at
/// `i`, i.e. the `Character` before the one starting at `i` or the last
/// `Character` if `i` is the end of the span.
///
/// `i` must be `Character`-aligned.
///
/// This function does not validate that `i` is within the span's bounds;
/// this is an unsafe operation.
internal func _previousCharacterStart(unchecked i: Int) -> Int {
_internalInvariant(_boundsCheck(i&-1))
precondition(_isScalarAligned(unchecked: i))
return _previousCharacterStart(uncheckedAssumingAligned: i)
}
/// Returns the start of the `Character` (i.e. grapheme cluster) ending at
/// `i`, i.e. the `Character` before the one starting at `i` or the last
/// `Character` if `i` is the end of the span.
///
/// `i` must be `Character`-aligned.
///
/// This function does not validate that `i` is within the span's bounds;
/// this is an unsafe operation.
///
/// This function does not validate that `i` is `Character`-aligned; this is
/// an unsafe operation if `i` isn't.
internal func _previousCharacterStart(
uncheckedAssumingAligned i: Int
) -> Int {
_internalInvariant(_boundsCheck(i&-1))
_internalInvariant(_isScalarAligned(unchecked: i))
return unsafe _start()._previousCharacterStart(i, limitedBy: count)
}
/// Decode the `Character` starting at `i` Return it and the start of the
/// next `Character`.
///
/// `i` must be `Character`-aligned.
internal func _decodeNextCharacter(
_ i: Int
) -> (Character, nextCharacterStart: Int) {
precondition(_boundsCheck(i))
return _decodeNextCharacter(unchecked: i)
}
/// Decode the `Character` starting at `i` Return it and the start of the
/// next `Character`.
///
/// `i` must be `Character`-aligned.
///
/// This function does not validate that `i` is within the span's bounds;
/// this is an unsafe operation.
internal func _decodeNextCharacter(
unchecked i: Int
) -> (Character, nextCharacterStart: Int) {
_internalInvariant(_boundsCheck(i))
precondition(_isScalarAligned(unchecked: i))
return _decodeNextCharacter(uncheckedAssumingAligned: i)
}
/// Decode the `Character` starting at `i` Return it and the start of the
/// next `Character`.
///
/// `i` must be `Character`-aligned.
///
/// This function does not validate that `i` is within the span's bounds;
/// this is an unsafe operation.
///
/// This function does not validate that `i` is `Character`-aligned; this is
/// an unsafe operation if `i` isn't.
internal func _decodeNextCharacter(
uncheckedAssumingAligned i: Int
) -> (Character, nextCharacterStart: Int) {
_internalInvariant(_boundsCheck(i))
_internalInvariant(_isScalarAligned(unchecked: i))
return unsafe _start()._decodeCharacter(
startingAt: i, limitedBy: count)
}
/// Decode the `Character` (i.e. grapheme cluster) ending at `i`, i.e. the
/// previous `Character`. Return it and the start of that `Character`.
///
/// `i` must be `Character`-aligned.
internal func _decodePreviousCharacter(_ i: Int) -> (Character, Int) {
precondition(_boundsCheck(i &- 1))
return _decodePreviousCharacter(unchecked: i)
}
/// Decode the `Character` (i.e. grapheme cluster) ending at `i`, i.e. the
/// previous `Character`. Return it and the start of that `Character`.
///
/// `i` must be `Character`-aligned.
///
/// This function does not validate that `i` is within the span's bounds;
/// this is an unsafe operation.
internal func _decodePreviousCharacter(
unchecked i: Int
) -> (Character, Int) {
_internalInvariant(_boundsCheck(i &- 1))
precondition(_isScalarAligned(unchecked: i))
return _decodePreviousCharacter(uncheckedAssumingAligned: i)
}
/// Decode the `Character` (i.e. grapheme cluster) ending at `i`, i.e. the
/// previous `Character`. Return it and the start of that `Character`.
///
/// `i` must be `Character`-aligned.
///
/// This function does not validate that `i` is within the span's bounds
/// (`i &- 1` is only bounds-checked through `_internalInvariant`); this is
/// an unsafe operation.
///
/// This function does not validate that `i` is `Character`-aligned (scalar
/// alignment is only checked through `_internalInvariant`); this is an
/// unsafe operation if `i` isn't.
internal func _decodePreviousCharacter(
uncheckedAssumingAligned i: Int
) -> (Character, Int) {
_internalInvariant(_boundsCheck(i &- 1))
_internalInvariant(_isScalarAligned(unchecked: i))
// The helper's result tuple is labeled; it converts to the unlabeled
// `(Character, Int)` returned here, where the `Int` is the start offset of
// the decoded `Character`.
return unsafe _start()._decodeCharacter(
endingAt: i, limitedBy: count)
}
}
// TODO: internal?
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
  /// Returns `true` when `i` is a valid code unit offset into this span,
  /// that is, when `0 <= i < count`. The end position (`count`) is not
  /// considered in bounds.
  @_alwaysEmitIntoClient
  internal func _boundsCheck(_ i: Int) -> Bool {
    0 <= i && i < count
  }

  /// Returns `true` when both the first offset of `bounds` and its last
  /// contained offset (`upperBound - 1`) are valid code unit offsets.
  ///
  /// Note that this rejects some empty ranges: `0..<0` always fails (its
  /// "last" offset is `-1`), and so does `count..<count`.
  @_alwaysEmitIntoClient
  internal func _boundsCheck(_ bounds: Range<Int>) -> Bool {
    let firstOffset = bounds.lowerBound
    let lastOffset = bounds.upperBound &- 1
    return _boundsCheck(firstOffset) && _boundsCheck(lastOffset)
  }
}
// Future work: UTF-16 support when we get views

View File

@@ -0,0 +1,179 @@
/*
Additional helpers built on stdlibDuplicates.swift
*/
// TODO: Should we update our unicode helpers file to call these instead?
// import Builtin
// Byte-offset based UTF-8 scalar helpers. `self` is treated as the base of a
// UTF-8 buffer and `i` as a byte offset from it. None of these helpers take
// an end bound, so they cannot check offsets against the allocation; callers
// are responsible for passing offsets that are valid to read.
extension UnsafeRawPointer {
/// Load the byte stored `i` bytes past `self`.
// @_alwaysEmitIntoClient
internal func _loadByte(_ i: Int) -> UInt8 {
_internalInvariant(i >= 0)
return unsafe (self+i).loadUnaligned(as: UInt8.self)
}
/// Whether the byte at offset `i` is a UTF-8 continuation byte, as
/// determined by `UTF8.isContinuation`.
// @_alwaysEmitIntoClient
internal func _isUTF8Continuation(_ i: Int) -> Bool {
unsafe UTF8.isContinuation(_loadByte(i))
}
/// Whether offset `i` is scalar-aligned, i.e. the byte at `i` is not a
/// continuation byte.
///
/// This always loads the byte at `i`, so `i` must be readable.
// @_alwaysEmitIntoClient
internal func _isScalarAligned(_ i: Int) -> Bool {
_internalInvariant(i >= 0)
return unsafe !_isUTF8Continuation(i)
}
/// The result of `_utf8ScalarLength` for the byte at offset `i`: the encoded
/// length of the scalar whose leading byte is at `i`.
// @_alwaysEmitIntoClient
internal func _scalarLength(startingAt i: Int) -> Int {
unsafe _utf8ScalarLength(_loadByte(i))
}
/// Decode the scalar whose leading byte is at offset `i`. Return it and the
/// offset just past its last byte.
///
/// No validation is performed here: the length is derived from the leading
/// byte alone and any length other than 1...4 is treated as unreachable.
// NOTE: Adaptation of `_decodeScalar` to work on URP
// @_alwaysEmitIntoClient
internal func _decodeScalar(
startingAt i: Int
) -> (Unicode.Scalar, nextScalarStart: Int) {
let cu0 = unsafe _loadByte(i)
let len = _utf8ScalarLength(cu0)
let next = len &+ i
switch len {
case 1: return (_decodeUTF8(cu0), next)
case 2: return unsafe (_decodeUTF8(cu0, _loadByte(i &+ 1)), next)
case 3: return unsafe (
_decodeUTF8(cu0, _loadByte(i &+ 1), _loadByte(i &+ 2)), next
)
case 4:
return (
unsafe _decodeUTF8(
cu0, _loadByte(i &+ 1), _loadByte(i &+ 2), _loadByte(i &+ 3)
),
next
)
default: Builtin.unreachable()
}
}
/// Decode the scalar that ends at offset `i` (i.e. whose last byte is at
/// `i - 1`). Return it and the offset of its leading byte.
// @_alwaysEmitIntoClient
internal func _decodeScalar(
endingAt i: Int
) -> (Unicode.Scalar, previousScalarStart: Int) {
// TODO: no need to double load the bytes...
let start = unsafe _previousScalarStart(i)
return unsafe (_decodeScalar(startingAt: start).0, start)
}
/// Return the offset of the leading byte of the scalar that ends at offset
/// `i`, found by walking backwards from `i - 1` over continuation bytes.
///
/// There is no lower bound on the walk other than the `prev >= 0` internal
/// invariants, so this relies on the content before `i` containing a
/// non-continuation byte.
// @_alwaysEmitIntoClient
internal func _previousScalarStart(_ i: Int) -> Int {
var prev = i &- 1
_internalInvariant(prev >= 0)
while unsafe _isUTF8Continuation(prev) {
prev &-= 1
_internalInvariant(prev >= 0)
}
// Sanity check: the scalar found must end exactly at `i`.
_internalInvariant(unsafe i == prev + _utf8ScalarLength(_loadByte(prev)))
return prev
}
/// Round `i` down to the nearest scalar-aligned offset `<= i`.
///
/// If `i` is already scalar-aligned it is returned unchanged; otherwise the
/// offset is decremented until a non-continuation byte is found. As with
/// `_previousScalarStart`, the only lower bound is the `i >= 0` internal
/// invariant inside `_isScalarAligned`.
// @_alwaysEmitIntoClient
internal func _scalarAlign(_ i: Int) -> Int {
var i = i
while _slowPath(unsafe !_isScalarAligned(i)) {
i &-= 1
}
return i
}
}
// Byte-offset based buffer views and `Character` (grapheme cluster) helpers.
// `self` is treated as the base of a UTF-8 buffer; `end` parameters are the
// buffer's length in bytes and limit how far grapheme breaking may scan.
extension UnsafeRawPointer {
// TODO: ASCII fast path wrappers around ufi functions
// TODO: hook up to real grapheme breaking
/// A raw buffer over the bytes at offsets `range` from `self`.
internal func _urbp(_ range: Range<Int>) -> UnsafeRawBufferPointer {
unsafe .init(start: self + range.lowerBound, count: range.count)
}
/// A typed `UInt8` buffer over the bytes at offsets `range` from `self`.
@_alwaysEmitIntoClient
internal func _ubp(_ range: Range<Int>) -> UnsafeBufferPointer<UInt8> {
unsafe UnsafeBufferPointer<UInt8>(
start: UnsafePointer((self+range.lowerBound)._rawValue),
count: range.count)
}
/// A new `String` created by decoding the bytes at offsets `range` as UTF-8.
internal func _str(_ range: Range<Int>) -> String {
unsafe String(decoding: _urbp(range) , as: UTF8.self)
}
/// Return the offset of the grapheme cluster boundary following `i`,
/// scanning scalars forward and never decoding at or beyond `end`.
///
/// `i` must be in `0..<end` and scalar-aligned (checked only through
/// `_internalInvariant`).
// @usableFromInline
internal func _nextCharacterStart(
_ i: Int, limitedBy end: Int
) -> Int {
_internalInvariant((0..<end).contains(i))
_internalInvariant(unsafe _isScalarAligned(i))
return _nextGraphemeClusterBoundary(startingAt: i) { idx in
// Uses the outer `end` (the buffer limit); returning `nil` tells the
// breaking algorithm there are no more scalars.
guard idx < end else { return nil }
// NOTE: this local `end` shadows the `limitedBy` parameter from here on;
// it is the offset just past the decoded scalar.
let (scalar, end) = unsafe _decodeScalar(startingAt: idx)
return (scalar, end)
}
}
/// Return the offset of the grapheme cluster boundary preceding `i`,
/// scanning scalars backwards and never decoding before offset 0.
///
/// `i` must be in `1...end`, and must be scalar-aligned unless it equals
/// `end` (checked only through `_internalInvariant`; the `i == end` test
/// comes first so no byte is loaded at offset `end`).
// @usableFromInline
internal func _previousCharacterStart(
_ i: Int,
limitedBy end: Int
) -> Int {
_internalInvariant(i > 0 && i <= end)
_internalInvariant(unsafe i == end || _isScalarAligned(i))
return _previousGraphemeClusterBoundary(endingAt: i) { idx in
// Returning `nil` tells the breaking algorithm the start was reached.
guard idx > 0 else { return nil }
let (scalar, prior) = unsafe _decodeScalar(endingAt: idx)
return (scalar, prior)
}
}
/// Construct the `Character` starting at `i`. Return it and the offset of
/// the start of the next `Character`.
///
/// The bytes of the cluster are decoded into a temporary `String` from which
/// the `Character` is created.
// @usableFromInline
internal func _decodeCharacter(
startingAt i: Int, limitedBy end: Int
) -> (Character, nextCharacterStart: Int) {
let nextStart = unsafe _nextCharacterStart(i, limitedBy: end)
return unsafe (Character(_str(i..<nextStart)), nextStart)
}
/// Construct the `Character` ending at `i`. Return it and the offset of the
/// start of that `Character`.
///
/// NOTE: despite the `nextCharacterStart` label on the result tuple, the
/// second element is `start`, i.e. the start of the returned (previous)
/// `Character`, not of a following one.
// @usableFromInline
internal func _decodeCharacter(
endingAt i: Int,
limitedBy end: Int
) -> (Character, nextCharacterStart: Int) {
let start = unsafe _previousCharacterStart(i, limitedBy: end)
_internalInvariant(start >= 0)
return unsafe (Character(_str(start..<i)), start)
}
}
@available(SwiftStdlib 6.2, *)
extension UnsafeRawPointer {
  /// Outcome of a UTF-8 validation pass: either success (with whether the
  /// content was all ASCII) or the byte-offset range of an error.
  ///
  // NOTE(review): `_validateUTF8` below does not switch over this type (the
  // result it matches carries an `info` value and a two-element error
  // payload); confirm whether this enum is still used elsewhere.
  internal enum _UTF8ValidationResult {
    case success(isASCII: Bool)
    case error(_: Range<Int>)
  }

  /// Validates the first `end` bytes starting at `self` as UTF-8.
  ///
  /// - Returns: Whether the validated content is all ASCII.
  /// - Throws: `UTF8.ValidationError` built from the kind and byte-offset
  ///   range reported by `validateUTF8`.
  // TODO: return more values
  internal func _validateUTF8(
    limitedBy end: Int
  ) throws(UTF8.ValidationError) -> Bool {
    let codeUnits = unsafe _ubp(0..<end)
    let outcome = unsafe validateUTF8(codeUnits)
    switch outcome {
    case let .error(kind, byteOffsets):
      throw UTF8.ValidationError(kind._publicKind, byteOffsets)
    case let .success(info):
      return info.isASCII
    }
  }
}

View File

@@ -0,0 +1,391 @@
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
/// Returns an iterator that will decode the code units into
/// `Unicode.Scalar`s.
///
/// The iterator starts at code unit offset 0.
///
/// The resulting iterator has the same lifetime constraints as `self`.
@lifetime(copy self)
public func makeUnicodeScalarIterator() -> UnicodeScalarIterator {
.init(self)
}
/// Iterates over the `Unicode.Scalar` contents of a `UTF8Span`.
///
/// The iterator is a cursor (`currentCodeUnitOffset`) into `codeUnits` that
/// can be moved forwards (`next()`, `skipForward()`), backwards
/// (`previous()`, `skipBack()`), or repositioned (`reset(...)`).
///
/// **TODO**: Examples
@frozen
public struct UnicodeScalarIterator: ~Escapable {
/// The span whose contents are being iterated.
public let codeUnits: UTF8Span
/// The byte offset of the start of the next scalar. This is
/// always scalar-aligned.
fileprivate(set)
public var currentCodeUnitOffset: Int
/// Creates an iterator over `codeUnits`, positioned at offset 0.
@lifetime(copy codeUnits)
public init(_ codeUnits: UTF8Span) {
self.codeUnits = codeUnits
self.currentCodeUnitOffset = 0
}
// Base pointer used for the raw-pointer decoding helpers, obtained from
// `codeUnits._start()`.
private var _start: UnsafeRawPointer {
unsafe codeUnits._start()
}
/// Decode and return the scalar starting at `currentCodeUnitOffset`.
/// After the function returns, `currentCodeUnitOffset` holds the
/// position at the end of the returned scalar, which is also the start
/// of the next scalar.
///
/// Returns `nil` if at the end of the `UTF8Span`.
@lifetime(self: copy self)
public mutating func next() -> Unicode.Scalar? {
guard currentCodeUnitOffset < codeUnits.count else {
return nil
}
_internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))
let (result, newPos) = unsafe _start._decodeScalar(startingAt: currentCodeUnitOffset)
self.currentCodeUnitOffset = newPos
return result
}
/// Decode and return the scalar ending at `currentCodeUnitOffset`. After
/// the function returns, `currentCodeUnitOffset` holds the position at
/// the start of the returned scalar, which is also the end of the
/// previous scalar.
///
/// Returns `nil` if at the start of the `UTF8Span`.
@lifetime(self: copy self)
public mutating func previous() -> Unicode.Scalar? {
guard currentCodeUnitOffset > 0 else {
return nil
}
_internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))
let (result, newPos) = unsafe _start._decodeScalar(endingAt: currentCodeUnitOffset)
self.currentCodeUnitOffset = newPos
return result
}
/// Advance `currentCodeUnitOffset` to the end of the current scalar,
/// without decoding it.
///
/// Returns the number of `Unicode.Scalar`s skipped over, which can be 0
/// if at the end of the UTF8Span.
@lifetime(self: copy self)
public mutating func skipForward() -> Int {
guard currentCodeUnitOffset < codeUnits.count else {
return 0
}
_internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))
// Only the scalar's length is computed; its payload bytes are not decoded.
currentCodeUnitOffset &+= unsafe _start._scalarLength(startingAt: currentCodeUnitOffset)
return 1
}
/// Advance `currentCodeUnitOffset` to the end of `n` scalars, without
/// decoding them.
///
/// Returns the number of `Unicode.Scalar`s skipped over, which can be
/// fewer than `n` if at the end of the UTF8Span. If `n <= 0`, nothing is
/// skipped and 0 is returned.
@lifetime(self: copy self)
public mutating func skipForward(by n: Int) -> Int {
var numSkipped = 0
while numSkipped < n && skipForward() != 0 {
numSkipped += 1
}
return numSkipped
}
/// Move `currentCodeUnitOffset` to the start of the previous scalar,
/// without decoding it.
///
/// Returns the number of `Unicode.Scalar`s skipped over, which can be 0
/// if at the start of the UTF8Span.
@lifetime(self: copy self)
public mutating func skipBack() -> Int {
guard currentCodeUnitOffset > 0 else {
return 0
}
_internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))
currentCodeUnitOffset = unsafe _start._previousScalarStart(currentCodeUnitOffset)
return 1
}
/// Move `currentCodeUnitOffset` to the start of the previous `n` scalars,
/// without decoding them.
///
/// Returns the number of `Unicode.Scalar`s skipped over, which can be
/// fewer than `n` if at the start of the UTF8Span. If `n <= 0`, nothing
/// is skipped and 0 is returned.
@lifetime(self: copy self)
public mutating func skipBack(by n: Int) -> Int {
var numSkipped = 0
while numSkipped < n && skipBack() != 0 {
numSkipped += 1
}
return numSkipped
}
/// Reset to the nearest scalar-aligned code unit offset `<= i`.
///
/// **TODO**: Example
@lifetime(self: copy self)
public mutating func reset(roundingBackwardsFrom i: Int) {
self.currentCodeUnitOffset = codeUnits._scalarAlignBackwards(i)
}
/// Reset to the nearest scalar-aligned code unit offset `>= i`.
///
/// **TODO**: Example
@lifetime(self: copy self)
public mutating func reset(roundingForwardsFrom i: Int) {
self.currentCodeUnitOffset = codeUnits._scalarAlignForwards(i)
}
/// Reset this iterator to `codeUnitOffset`, skipping _all_ safety
/// checks (including bounds checks).
///
/// Note: This is only for very specific, low-level use cases. If
/// `codeUnitOffset` is not properly scalar-aligned, this function can
/// result in undefined behavior when, e.g., `next()` is called.
///
/// TODO: verify that we're not UB, just garbage-data or guaranteed
/// trap!
///
/// For example, this could be used by a regex engine to backtrack to a
/// known-valid previous position.
///
@unsafe
@lifetime(self: copy self)
public mutating func reset(toUnchecked codeUnitOffset: Int) {
// Alignment is only checked through `_internalInvariant`; no bounds check
// is performed.
_internalInvariant(codeUnits._isScalarAligned(unchecked: codeUnitOffset))
self.currentCodeUnitOffset = codeUnitOffset
}
/// Returns the UTF8Span containing all the content up to the iterator's
/// current position, i.e. code unit offsets `0..<currentCodeUnitOffset`.
///
/// The `isKnownASCII` and `isKnownNFC` flags of `codeUnits` are carried
/// over to the result.
///
/// The resultant `UTF8Span` has the same lifetime constraints as `self`.
@lifetime(copy self)
public func prefix() -> UTF8Span {
let slice = codeUnits.span._extracting(0..<currentCodeUnitOffset)
return UTF8Span(
_uncheckedAssumingValidUTF8: slice,
isKnownASCII: codeUnits.isKnownASCII,
isKnownNFC: codeUnits.isKnownNFC)
}
/// Returns the UTF8Span containing all the content after the iterator's
/// current position, i.e. code unit offsets
/// `currentCodeUnitOffset..<codeUnits.count`.
///
/// The `isKnownASCII` and `isKnownNFC` flags of `codeUnits` are carried
/// over to the result.
///
/// The resultant `UTF8Span` has the same lifetime constraints as `self`.
@lifetime(copy self)
public func suffix() -> UTF8Span {
let slice = codeUnits.span._extracting(currentCodeUnitOffset..<codeUnits.count)
return UTF8Span(
_uncheckedAssumingValidUTF8: slice,
isKnownASCII: codeUnits.isKnownASCII,
isKnownNFC: codeUnits.isKnownNFC)
}
}
}
@available(SwiftStdlib 6.2, *)
@_unavailableInEmbedded
extension UTF8Span {
/// Returns an iterator that will construct `Character`s from the underlying
/// UTF-8 content.
///
/// The iterator starts at code unit offset 0.
///
/// The resulting iterator has the same lifetime constraints as `self`.
@lifetime(copy self)
public func makeCharacterIterator() -> CharacterIterator {
.init(self)
}
/// Iterates over the `Character` (grapheme cluster) contents of a
/// `UTF8Span`.
///
/// The iterator is a cursor (`currentCodeUnitOffset`) into `codeUnits` that
/// can be moved forwards (`next()`, `skipForward()`), backwards
/// (`previous()`, `skipBack()`), or repositioned (`reset(...)`).
///
/// **TODO**: Examples
public struct CharacterIterator: ~Escapable {
/// The span whose contents are being iterated.
public let codeUnits: UTF8Span
/// The byte offset of the start of the next `Character`. This is always
/// scalar-aligned. It is always `Character`-aligned relative to the last
/// call to `reset` (or the start of the span if not called).
fileprivate(set)
public var currentCodeUnitOffset: Int
/// Creates an iterator over `codeUnits`, positioned at offset 0.
@lifetime(copy codeUnits)
public init(_ codeUnits: UTF8Span) {
self.codeUnits = codeUnits
self.currentCodeUnitOffset = 0
}
// Base pointer used for the raw-pointer decoding helpers, obtained from
// `codeUnits._start()`.
private var _start: UnsafeRawPointer {
unsafe codeUnits._start()
}
/// Return the `Character` starting at `currentCodeUnitOffset`. After the
/// function returns, `currentCodeUnitOffset` holds the position at the
/// end of the `Character`, which is also the start of the next
/// `Character`.
///
/// Returns `nil` if at the end of the `UTF8Span`.
@lifetime(self: copy self)
public mutating func next() -> Character? {
guard currentCodeUnitOffset < codeUnits.count else { return nil }
_internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))
let (result, newPos) = unsafe _start._decodeCharacter(
startingAt: currentCodeUnitOffset,
limitedBy: codeUnits.count
)
self.currentCodeUnitOffset = newPos
return result
}
/// Return the `Character` ending at `currentCodeUnitOffset`. After the
/// function returns, `currentCodeUnitOffset` holds the position at the
/// start of the returned `Character`, which is also the end of the
/// previous `Character`.
///
/// Returns `nil` if at the start of the `UTF8Span`.
@lifetime(self: copy self)
public mutating func previous() -> Character? {
guard currentCodeUnitOffset > 0 else { return nil }
_internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))
// `newPos` is the start of the returned `Character`.
let (result, newPos) = unsafe _start._decodeCharacter(
endingAt: currentCodeUnitOffset,
limitedBy: codeUnits.count)
self.currentCodeUnitOffset = newPos
return result
}
/// Advance `currentCodeUnitOffset` to the end of the current `Character`,
/// without constructing it.
///
/// Returns the number of `Character`s skipped over, which can be 0
/// if at the end of the UTF8Span.
@lifetime(self: copy self)
public mutating func skipForward() -> Int {
guard currentCodeUnitOffset < codeUnits.count else {
return 0
}
_internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))
self.currentCodeUnitOffset = unsafe _start._nextCharacterStart(currentCodeUnitOffset, limitedBy: codeUnits.count)
return 1
}
/// Advance `currentCodeUnitOffset` to the end of `n` `Character`s, without
/// constructing them.
///
/// Returns the number of `Character`s skipped over, which can be
/// fewer than `n` if at the end of the UTF8Span. If `n <= 0`, nothing is
/// skipped and 0 is returned.
@lifetime(self: copy self)
public mutating func skipForward(by n: Int) -> Int {
var numSkipped = 0
while numSkipped < n && skipForward() != 0 {
numSkipped += 1
}
return numSkipped
}
/// Move `currentCodeUnitOffset` to the start of the previous `Character`,
/// without constructing it.
///
/// Returns the number of `Character`s skipped over, which can be 0
/// if at the start of the UTF8Span.
@lifetime(self: copy self)
public mutating func skipBack() -> Int {
guard currentCodeUnitOffset > 0 else {
return 0
}
_internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))
currentCodeUnitOffset = unsafe _start._previousCharacterStart(currentCodeUnitOffset, limitedBy: codeUnits.count)
return 1
}
/// Move `currentCodeUnitOffset` to the start of the previous `n`
/// `Character`s, without constructing them.
///
/// Returns the number of `Character`s skipped over, which can be
/// fewer than `n` if at the start of the UTF8Span. If `n <= 0`, nothing
/// is skipped and 0 is returned.
@lifetime(self: copy self)
public mutating func skipBack(by n: Int) -> Int {
var numSkipped = 0
while numSkipped < n && skipBack() != 0 {
numSkipped += 1
}
return numSkipped
}
/// Reset to the nearest character-aligned position `<= i`.
///
/// Note: the position is computed by scalar alignment only
/// (`_scalarAlignBackwards`); it is "character-aligned" in the sense
/// described on `currentCodeUnitOffset`, i.e. subsequent iteration treats
/// the reset position as a `Character` boundary.
@lifetime(self: copy self)
public mutating func reset(roundingBackwardsFrom i: Int) {
self.currentCodeUnitOffset = codeUnits._scalarAlignBackwards(i)
}
/// Reset to the nearest character-aligned position `>= i`.
///
/// Note: the position is computed by scalar alignment only
/// (`_scalarAlignForwards`); it is "character-aligned" in the sense
/// described on `currentCodeUnitOffset`, i.e. subsequent iteration treats
/// the reset position as a `Character` boundary.
@lifetime(self: copy self)
public mutating func reset(roundingForwardsFrom i: Int) {
self.currentCodeUnitOffset = codeUnits._scalarAlignForwards(i)
}
/// Reset this iterator to `codeUnitOffset`, skipping _all_ safety
/// checks (including bounds checks).
///
/// Note: This is only for very specific, low-level use cases. If
/// `codeUnitOffset` is not properly scalar-aligned, this function can
/// result in undefined behavior when, e.g., `next()` is called.
///
/// If `codeUnitOffset` is scalar-aligned, but not `Character`-aligned, you
/// may get different results from running `Character` iteration.
///
/// For example, this could be used by a regex engine to backtrack to a
/// known-valid previous position.
///
@unsafe
@lifetime(self: copy self)
public mutating func reset(toUnchecked codeUnitOffset: Int) {
// Alignment is only checked through `_internalInvariant`; no bounds check
// is performed.
_internalInvariant(codeUnits._isScalarAligned(unchecked: codeUnitOffset))
self.currentCodeUnitOffset = codeUnitOffset
}
/// Returns the UTF8Span containing all the content up to the iterator's
/// current position, i.e. code unit offsets `0..<currentCodeUnitOffset`.
///
/// The `isKnownASCII` and `isKnownNFC` flags of `codeUnits` are carried
/// over to the result.
///
/// The resultant `UTF8Span` has the same lifetime constraints as `self`.
@lifetime(copy self)
public func prefix() -> UTF8Span {
let slice = codeUnits.span._extracting(0..<currentCodeUnitOffset)
return UTF8Span(
_uncheckedAssumingValidUTF8: slice,
isKnownASCII: codeUnits.isKnownASCII,
isKnownNFC: codeUnits.isKnownNFC)
}
/// Returns the UTF8Span containing all the content after the iterator's
/// current position, i.e. code unit offsets
/// `currentCodeUnitOffset..<codeUnits.count`.
///
/// The `isKnownASCII` and `isKnownNFC` flags of `codeUnits` are carried
/// over to the result.
///
/// The resultant `UTF8Span` has the same lifetime constraints as `self`.
@lifetime(copy self)
public func suffix() -> UTF8Span {
let slice = codeUnits.span._extracting(currentCodeUnitOffset..<codeUnits.count)
return UTF8Span(
_uncheckedAssumingValidUTF8: slice,
isKnownASCII: codeUnits.isKnownASCII,
isKnownNFC: codeUnits.isKnownNFC)
}
}
}

View File

@@ -5,8 +5,8 @@
// RUN: c-index-test -read-diagnostics %t.dia > %t.deserialized_diagnostics.txt 2>&1
// RUN: %FileCheck --input-file=%t.deserialized_diagnostics.txt %s
var x = String.init // expected-error{{ambiguous use of 'init'}}
// CHECK: {{.*[/\\]}}serialized-diagnostics-prettyprint.swift:[[@LINE-1]]:16: error: ambiguous use of 'init'
var x = String.init(_:) // expected-error{{ambiguous use of 'init(_:)'}}
// CHECK: {{.*[/\\]}}serialized-diagnostics-prettyprint.swift:[[@LINE-1]]:16: error: ambiguous use of 'init(_:)'
// CHECK: Swift.String.init:2:19: note: found this candidate
// CHECK: CONTENTS OF FILE Swift.String.init:

View File

@@ -814,6 +814,133 @@ Added: _$ss7RawSpanVMa
Added: _$ss7RawSpanVMn
Added: _$ss7RawSpanVN
// SE-0464 UTF8Span
Added: _$sSS7copyingSSs8UTF8SpanV_tcfC
Added: _$sSS8utf8Spans04UTF8B0Vvg
Added: _$sSS8utf8Spans04UTF8B0VvpMV
Added: _$sSs8utf8Spans04UTF8B0Vvg
Added: _$sSs8utf8Spans04UTF8B0VvpMV
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11byteOffsetsSnySiGvM
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11byteOffsetsSnySiGvg
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11byteOffsetsSnySiGvpMV
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11byteOffsetsSnySiGvs
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11descriptionSSvg
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11descriptionSSvpMV
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV2eeoiySbAF_AFtFZ
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV11descriptionSSvg
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV11descriptionSSvpMV
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV15truncatedScalarAHvpZMV
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV20overlongEncodingByteAHvpZMV
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV22surrogateCodePointByteAHvpZMV
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV26unexpectedContinuationByteAHvpZMV
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV32invalidNonSurrogateCodePointByteAHvpZMV
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValueAHSgs5UInt8V_tcfC
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValues5UInt8VvM
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValues5UInt8Vvg
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValues5UInt8VvpMV
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValues5UInt8Vvs
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVMa
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVMn
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVN
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSHsMc
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSHsWP
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSQsMc
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSQsWP
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSYsMc
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSYsWP
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVs0D0sMc
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVs0D0sWP
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVs23CustomStringConvertiblesMc
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVs23CustomStringConvertiblesWP
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4hash4intoys6HasherVz_tF
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4kindAF4KindVvM
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4kindAF4KindVvg
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4kindAF4KindVvpMV
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4kindAF4KindVvs
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV9hashValueSivg
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV9hashValueSivpMV
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVMa
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVMn
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVN
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVSHsMc
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVSHsWP
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVSQsMc
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVSQsWP
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVs0D0sMc
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVs0D0sWP
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVs23CustomStringConvertiblesMc
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVs23CustomStringConvertiblesWP
Added: _$ss7UnicodeO4UTF8O15_checkAllErrorsySayAD15ValidationErrorVGxSTRzs5UInt8V7ElementRtzlFZ
Added: _$ss8UTF8SpanV9unchecked12isKnownASCIIABs0B0Vys5UInt8VG_SbtcfC
Added: _$ss8UTF8SpanV10_countMasks6UInt64VvpZMV
Added: _$ss8UTF8SpanV10_flagsMasks6UInt64VvpZMV
Added: _$ss8UTF8SpanV10isKnownNFCSbvpMV
Added: _$ss8UTF8SpanV10validatingABs0B0Vys5UInt8VG_ts7UnicodeO0A0O15ValidationErrorVYKcfC
Added: _$ss8UTF8SpanV11checkForNFC10quickCheckS2b_tF
Added: _$ss8UTF8SpanV12isKnownASCIISbvpMV
Added: _$ss8UTF8SpanV13checkForASCIISbyF
Added: _$ss8UTF8SpanV14_countAndFlagss6UInt64VvM
Added: _$ss8UTF8SpanV14_countAndFlagss6UInt64Vvg
Added: _$ss8UTF8SpanV14_countAndFlagss6UInt64VvpMV
Added: _$ss8UTF8SpanV14_countAndFlagss6UInt64Vvs
Added: _$ss8UTF8SpanV17CharacterIteratorV11skipForward2byS2i_tF
Added: _$ss8UTF8SpanV17CharacterIteratorV11skipForwardSiyF
Added: _$ss8UTF8SpanV17CharacterIteratorV21currentCodeUnitOffsetSivg
Added: _$ss8UTF8SpanV17CharacterIteratorV21currentCodeUnitOffsetSivpMV
Added: _$ss8UTF8SpanV17CharacterIteratorV4nextSJSgyF
Added: _$ss8UTF8SpanV17CharacterIteratorV5reset20roundingForwardsFromySi_tF
Added: _$ss8UTF8SpanV17CharacterIteratorV5reset21roundingBackwardsFromySi_tF
Added: _$ss8UTF8SpanV17CharacterIteratorV5reset11toUncheckedySi_tF
Added: _$ss8UTF8SpanV17CharacterIteratorV6prefixAByF
Added: _$ss8UTF8SpanV17CharacterIteratorV6suffixAByF
Added: _$ss8UTF8SpanV17CharacterIteratorV8previousSJSgyF
Added: _$ss8UTF8SpanV17CharacterIteratorV8skipBack2byS2i_tF
Added: _$ss8UTF8SpanV17CharacterIteratorV8skipBackSiyF
Added: _$ss8UTF8SpanV17CharacterIteratorV9codeUnitsABvg
Added: _$ss8UTF8SpanV17CharacterIteratorV9codeUnitsABvpMV
Added: _$ss8UTF8SpanV17CharacterIteratorVMa
Added: _$ss8UTF8SpanV17CharacterIteratorVMn
Added: _$ss8UTF8SpanV17CharacterIteratorVN
Added: _$ss8UTF8SpanV17CharacterIteratorVyAdBcfC
Added: _$ss8UTF8SpanV18_unsafeBaseAddressSVSgvM
Added: _$ss8UTF8SpanV18_unsafeBaseAddressSVSgvg
Added: _$ss8UTF8SpanV18_unsafeBaseAddressSVSgvpMV
Added: _$ss8UTF8SpanV18_unsafeBaseAddressSVSgvs
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV11skipForward2byS2i_tF
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV11skipForwardSiyF
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV21currentCodeUnitOffsetSivg
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV21currentCodeUnitOffsetSivpMV
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV4nexts0C0O0D0VSgyF
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV5reset20roundingForwardsFromySi_tF
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV5reset21roundingBackwardsFromySi_tF
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV5reset11toUncheckedySi_tF
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV6prefixAByF
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV6suffixAByF
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV8previouss0C0O0D0VSgyF
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV8skipBack2byS2i_tF
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV8skipBackSiyF
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV9codeUnitsABvg
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV9codeUnitsABvpMV
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorVMa
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorVMn
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorVN
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorVyAdBcfC
Added: _$ss8UTF8SpanV21isCanonicallyLessThanySbABF
Added: _$ss8UTF8SpanV21makeCharacterIteratorAB0dE0VyF
Added: _$ss8UTF8SpanV23isCanonicallyEquivalent2toSbAB_tF
Added: _$ss8UTF8SpanV25makeUnicodeScalarIteratorAB0deF0VyF
Added: _$ss8UTF8SpanV4spans0B0Vys5UInt8VGvg
Added: _$ss8UTF8SpanV4spans0B0Vys5UInt8VGvpMV
Added: _$ss8UTF8SpanV5countSivpMV
Added: _$ss8UTF8SpanV7_nfcBits6UInt64VvpZMV
Added: _$ss8UTF8SpanV7isEmptySbvg
Added: _$ss8UTF8SpanV7isEmptySbvpMV
Added: _$ss8UTF8SpanV9_asciiBits6UInt64VvpZMV
Added: _$ss8UTF8SpanVMa
Added: _$ss8UTF8SpanVMn
Added: _$ss8UTF8SpanVN
// SE-0467 MutableSpan and MutableRawSpan
Added: _$ss11MutableSpanVMa
Added: _$ss11MutableSpanVMn

View File

@@ -815,6 +815,133 @@ Added: _$ss7RawSpanVMa
Added: _$ss7RawSpanVMn
Added: _$ss7RawSpanVN
// SE-0464 UTF8Span
Added: _$sSS7copyingSSs8UTF8SpanV_tcfC
Added: _$sSS8utf8Spans04UTF8B0Vvg
Added: _$sSS8utf8Spans04UTF8B0VvpMV
Added: _$sSs8utf8Spans04UTF8B0Vvg
Added: _$sSs8utf8Spans04UTF8B0VvpMV
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11byteOffsetsSnySiGvM
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11byteOffsetsSnySiGvg
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11byteOffsetsSnySiGvpMV
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11byteOffsetsSnySiGvs
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11descriptionSSvg
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11descriptionSSvpMV
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV2eeoiySbAF_AFtFZ
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV11descriptionSSvg
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV11descriptionSSvpMV
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV15truncatedScalarAHvpZMV
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV20overlongEncodingByteAHvpZMV
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV22surrogateCodePointByteAHvpZMV
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV26unexpectedContinuationByteAHvpZMV
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV32invalidNonSurrogateCodePointByteAHvpZMV
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValueAHSgs5UInt8V_tcfC
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValues5UInt8VvM
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValues5UInt8Vvg
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValues5UInt8VvpMV
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValues5UInt8Vvs
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVMa
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVMn
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVN
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSHsMc
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSHsWP
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSQsMc
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSQsWP
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSYsMc
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSYsWP
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVs0D0sMc
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVs0D0sWP
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVs23CustomStringConvertiblesMc
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVs23CustomStringConvertiblesWP
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4hash4intoys6HasherVz_tF
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4kindAF4KindVvM
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4kindAF4KindVvg
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4kindAF4KindVvpMV
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4kindAF4KindVvs
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV9hashValueSivg
Added: _$ss7UnicodeO4UTF8O15ValidationErrorV9hashValueSivpMV
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVMa
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVMn
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVN
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVSHsMc
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVSHsWP
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVSQsMc
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVSQsWP
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVs0D0sMc
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVs0D0sWP
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVs23CustomStringConvertiblesMc
Added: _$ss7UnicodeO4UTF8O15ValidationErrorVs23CustomStringConvertiblesWP
Added: _$ss7UnicodeO4UTF8O15_checkAllErrorsySayAD15ValidationErrorVGxSTRzs5UInt8V7ElementRtzlFZ
Added: _$ss8UTF8SpanV9unchecked12isKnownASCIIABs0B0Vys5UInt8VG_SbtcfC
Added: _$ss8UTF8SpanV10_countMasks6UInt64VvpZMV
Added: _$ss8UTF8SpanV10_flagsMasks6UInt64VvpZMV
Added: _$ss8UTF8SpanV10isKnownNFCSbvpMV
Added: _$ss8UTF8SpanV10validatingABs0B0Vys5UInt8VG_ts7UnicodeO0A0O15ValidationErrorVYKcfC
Added: _$ss8UTF8SpanV11checkForNFC10quickCheckS2b_tF
Added: _$ss8UTF8SpanV12isKnownASCIISbvpMV
Added: _$ss8UTF8SpanV13checkForASCIISbyF
Added: _$ss8UTF8SpanV14_countAndFlagss6UInt64VvM
Added: _$ss8UTF8SpanV14_countAndFlagss6UInt64Vvg
Added: _$ss8UTF8SpanV14_countAndFlagss6UInt64VvpMV
Added: _$ss8UTF8SpanV14_countAndFlagss6UInt64Vvs
Added: _$ss8UTF8SpanV17CharacterIteratorV11skipForward2byS2i_tF
Added: _$ss8UTF8SpanV17CharacterIteratorV11skipForwardSiyF
Added: _$ss8UTF8SpanV17CharacterIteratorV21currentCodeUnitOffsetSivg
Added: _$ss8UTF8SpanV17CharacterIteratorV21currentCodeUnitOffsetSivpMV
Added: _$ss8UTF8SpanV17CharacterIteratorV4nextSJSgyF
Added: _$ss8UTF8SpanV17CharacterIteratorV5reset20roundingForwardsFromySi_tF
Added: _$ss8UTF8SpanV17CharacterIteratorV5reset21roundingBackwardsFromySi_tF
Added: _$ss8UTF8SpanV17CharacterIteratorV5reset11toUncheckedySi_tF
Added: _$ss8UTF8SpanV17CharacterIteratorV6prefixAByF
Added: _$ss8UTF8SpanV17CharacterIteratorV6suffixAByF
Added: _$ss8UTF8SpanV17CharacterIteratorV8previousSJSgyF
Added: _$ss8UTF8SpanV17CharacterIteratorV8skipBack2byS2i_tF
Added: _$ss8UTF8SpanV17CharacterIteratorV8skipBackSiyF
Added: _$ss8UTF8SpanV17CharacterIteratorV9codeUnitsABvg
Added: _$ss8UTF8SpanV17CharacterIteratorV9codeUnitsABvpMV
Added: _$ss8UTF8SpanV17CharacterIteratorVMa
Added: _$ss8UTF8SpanV17CharacterIteratorVMn
Added: _$ss8UTF8SpanV17CharacterIteratorVN
Added: _$ss8UTF8SpanV17CharacterIteratorVyAdBcfC
Added: _$ss8UTF8SpanV18_unsafeBaseAddressSVSgvM
Added: _$ss8UTF8SpanV18_unsafeBaseAddressSVSgvg
Added: _$ss8UTF8SpanV18_unsafeBaseAddressSVSgvpMV
Added: _$ss8UTF8SpanV18_unsafeBaseAddressSVSgvs
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV11skipForward2byS2i_tF
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV11skipForwardSiyF
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV21currentCodeUnitOffsetSivg
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV21currentCodeUnitOffsetSivpMV
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV4nexts0C0O0D0VSgyF
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV5reset20roundingForwardsFromySi_tF
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV5reset21roundingBackwardsFromySi_tF
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV5reset11toUncheckedySi_tF
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV6prefixAByF
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV6suffixAByF
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV8previouss0C0O0D0VSgyF
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV8skipBack2byS2i_tF
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV8skipBackSiyF
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV9codeUnitsABvg
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV9codeUnitsABvpMV
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorVMa
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorVMn
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorVN
Added: _$ss8UTF8SpanV21UnicodeScalarIteratorVyAdBcfC
Added: _$ss8UTF8SpanV21isCanonicallyLessThanySbABF
Added: _$ss8UTF8SpanV21makeCharacterIteratorAB0dE0VyF
Added: _$ss8UTF8SpanV23isCanonicallyEquivalent2toSbAB_tF
Added: _$ss8UTF8SpanV25makeUnicodeScalarIteratorAB0deF0VyF
Added: _$ss8UTF8SpanV4spans0B0Vys5UInt8VGvg
Added: _$ss8UTF8SpanV4spans0B0Vys5UInt8VGvpMV
Added: _$ss8UTF8SpanV5countSivpMV
Added: _$ss8UTF8SpanV7_nfcBits6UInt64VvpZMV
Added: _$ss8UTF8SpanV7isEmptySbvg
Added: _$ss8UTF8SpanV7isEmptySbvpMV
Added: _$ss8UTF8SpanV9_asciiBits6UInt64VvpZMV
Added: _$ss8UTF8SpanVMa
Added: _$ss8UTF8SpanVMn
Added: _$ss8UTF8SpanVN
// SE-0467 MutableSpan and MutableRawSpan
Added: _$ss11MutableSpanVMa
Added: _$ss11MutableSpanVMn

View File

@@ -0,0 +1,295 @@
// RUN: %target-run-stdlib-swift %S/Inputs/
// REQUIRES: executable_test
// FIXME: this test is currently broken
import Swift
import StdlibUnittest
// Suite that collects the UTF-8 validation-error tests registered further
// down in this file.
var suite = TestSuite("UTF8.ValidationError")
// Deferred so that the run starts only once the remaining top-level code
// (the `suite.test` registrations) has executed.
defer { runAllTests() }
@available(SwiftStdlib 6.2, *)
extension Array {
  /// Calls `f` with a `Span` over this array's elements and returns whatever
  /// `f` returns, rethrowing any error it throws.
  ///
  /// The span is created with `Span(_unsafeElements:)` from the buffer handed
  /// out by `withUnsafeBufferPointer`.
  func withSpan<R>(_ f: (Span<Element>) throws -> R) rethrows -> R {
    try withUnsafeBufferPointer { buffer in
      try f(Span(_unsafeElements: buffer))
    }
  }
}
extension Range<Int> {
  /// Returns this range translated by `start`: both bounds move by the same
  /// amount, so the length is unchanged.
  func _offset(by start: Int) -> Range<Int> {
    let shiftedLower = start + lowerBound
    let shiftedUpper = start + upperBound
    return shiftedLower ..< shiftedUpper
  }
}
/// An expected validation error, plus whether it marks the start of an
/// invalid sequence.
@available(SwiftStdlib 6.2, *)
private struct ValidationError {
  /// The error expected when all errors of the input are fetched at once.
  var error: UTF8.ValidationError

  /// Whether `error` is the first reported byte of an invalid sequence.
  ///
  /// When fetching all errors, we'll get the error kind given. When slicing
  /// in order to get the next error (e.g. `UTF8Span.init(validating:)`), an
  /// entry that is not an error start is expected to be reported as
  /// `.unexpectedContinuationByte` instead.
  var errorStart: Bool

  init(
    _ error: UTF8.ValidationError,
    errorStart: Bool
  ) {
    self.error = error
    self.errorStart = errorStart
  }

  /// Expected `.unexpectedContinuationByte` at byte offset `i`.
  public static func unexpectedContinuationByte(
    at i: Int, errorStart: Bool = true
  ) -> Self {
    Self(UTF8.ValidationError(.unexpectedContinuationByte, at: i), errorStart: errorStart)
  }

  /// Expected `.surrogateCodePointByte` at byte offset `i`.
  public static func surrogateCodePointByte(
    at i: Int, errorStart: Bool = true
  ) -> Self {
    Self(UTF8.ValidationError(.surrogateCodePointByte, at: i), errorStart: errorStart)
  }

  /// Expected `.invalidNonSurrogateCodePointByte` at byte offset `i`.
  public static func invalidNonSurrogateCodePointByte(
    at i: Int, errorStart: Bool = true
  ) -> Self {
    Self(UTF8.ValidationError(.invalidNonSurrogateCodePointByte, at: i), errorStart: errorStart)
  }

  /// Expected `.overlongEncodingByte` at byte offset `i`.
  public static func overlongEncodingByte(
    at i: Int, errorStart: Bool = true
  ) -> Self {
    Self(UTF8.ValidationError(.overlongEncodingByte, at: i), errorStart: errorStart)
  }

  /// Expected `.truncatedScalar` covering the byte offsets in `range`.
  ///
  /// Accepts any integer range expression so that test data can be written
  /// with closed ranges (`4...6`) as well as half-open ones (`4..<7`).
  public static func truncatedScalar(
    _ range: some RangeExpression<Int>, errorStart: Bool = true
  ) -> Self {
    // Normalize to the half-open form the error stores; a `Range<Int>`
    // argument comes back unchanged.
    let byteOffsets = range.relative(to: 0..<Int.max)
    return Self(UTF8.ValidationError(.truncatedScalar, byteOffsets), errorStart: errorStart)
  }
}
/// One validation scenario: input bytes, the errors they should produce, and
/// the source location of the declaration for failure reports.
@available(SwiftStdlib 6.2, *)
private struct ValidationTestCase {
  /// Raw input handed to the validators.
  var bytes: [UInt8]

  /// Expected errors, in input order.
  ///
  /// When fetching all errors, we'll get the error kind given. When slicing
  /// in order to get the next error (e.g. `UTF8Span.init(validating:)`),
  /// entries that are not an error start are expected as
  /// `.unexpectedContinuationByte`.
  var errors: [ValidationError]

  /// Location of the test-case declaration, attached to every failure.
  var loc: SourceLocStack

  init(
    _ bytes: [UInt8],
    file: String = #file,
    line: UInt = #line,
    _ errors: [ValidationError]
  ) {
    self.bytes = bytes
    self.errors = errors
    self.loc = SourceLocStack(SourceLoc(file, line))
  }

  /// Returns the `i`th expected error, re-classified as
  /// `.unexpectedContinuationByte` when the input was sliced and the entry is
  /// not an error start.
  func fetchError(
    at i: Int, wasSliced: Bool
  ) -> UTF8.ValidationError {
    let expected = errors[i]
    guard wasSliced, !expected.errorStart else {
      return expected.error
    }
    return UTF8.ValidationError(
      .unexpectedContinuationByte, expected.error.byteOffsets)
  }

  /// `expectEqual` that reports the test-case location plus the call site.
  func expect<T: Equatable>(
    _ lhs: T,
    _ rhs: T,
    file: String = #file,
    line: UInt = #line
  ) {
    let trace = loc.withCurrentLoc(file: file, line: line)
    expectEqual(lhs, rhs, stackTrace: trace)
  }

  /// Records an unconditional failure at the test-case location plus the
  /// call site.
  func fail(
    _ message: String,
    file: String = #file,
    line: UInt = #line
  ) {
    let trace = loc.with(SourceLoc(file, line))
    expectationFailure(message, trace: "", stackTrace: trace)
  }

  /// Test UTF8._checkAllErrors(), which matches directly against
  /// the provided expected-errors.
  func testAllErrors() {
    let caught = Array(UTF8._checkAllErrors(bytes))
    // Compare pairwise over the common prefix; a length mismatch is reported
    // separately below.
    for (i, actual) in zip(errors.indices, caught) {
      expect(fetchError(at: i, wasSliced: false), actual)
    }
    expect(caught.count, errors.count)
  }

  /// Test UTF8Span validation. Surface subsequent errors by slicing the
  /// input (which will convert the error-kind to .unexpectedContinuationByte)
  func testSpanSlicedErrors() {
    bytes.withSpan { span in
      guard !errors.isEmpty else {
        // No errors expected: validation of the whole input must succeed.
        do throws(UTF8.ValidationError) {
          _ = try UTF8Span(validating: span)
        } catch {
          fail("Unexpected error: \(error)")
        }
        return
      }

      // Walk the input error by error: validate the remainder, compare the
      // thrown error with the next expected one, then resume right after it.
      var sliceStart = 0
      var nextExpected = 0
      scan: while true {
        do throws(UTF8.ValidationError) {
          _ = try UTF8Span(validating: span._extracting(sliceStart...))
          if nextExpected != errors.endIndex {
            fail("Expected a thrown UTF-8 encoding error")
          }
          break scan
        } catch {
          if nextExpected >= errors.endIndex {
            fail("Found unexpected subsequent error \(error)")
            break scan
          }
          let expected = fetchError(at: nextExpected, wasSliced: true)
          // Offsets in the thrown error are relative to the slice; rebase
          // them onto the full input before comparing.
          let rebased = UTF8.ValidationError(
            error.kind,
            error.byteOffsets._offset(by: sliceStart)
          )
          expect(expected, rebased)
          sliceStart = rebased.byteOffsets.upperBound
          nextExpected += 1
        }
      }

      // Rest of input should be error-free
      guard let tailStart = errors.last?.error.byteOffsets.upperBound,
            tailStart < bytes.count
      else {
        return
      }
      do throws(UTF8.ValidationError) {
        _ = try UTF8Span(validating: span._extracting(tailStart...))
      } catch {
        fail("Found subsequent error \(error)")
      }
    }
  }

  /// Runs the sliced-validation check followed by the all-errors check.
  func run() {
    testSpanSlicedErrors()
    testAllErrors()
  }
}
if #available(SwiftStdlib 6.2, *) {
  suite.test("UTF8Span/encoding errors") {
    /// Builds a `ValidationTestCase` for `bytes` with the expected `errors`
    /// and runs it, attributing failures to the caller's file and line.
    ///
    /// The signature mirrors `ValidationTestCase.init` so that the cases
    /// below can be written as `test(bytes, [expectedErrors])`.
    func test(
      _ bytes: [UInt8],
      file: String = #file,
      line: UInt = #line,
      _ errors: [ValidationError]
    ) {
      ValidationTestCase(
        bytes, file: file, line: line, errors
      ).run()
    }

    // NOTE: every case below is currently disabled; see the FIXME at the top
    // of this file.

    // Valid string
    // test(Array("abcde\u{301}f😀🇺🇸🧟🧟".utf8), [])
    // Bad URL
    // test(
    // Array("http://servername/scripts/..".utf8)
    // + [0xC0, 0xAF]
    // + Array("../winnt/system32/cmd.exe".utf8),
    // [.overlongEncodingByte(at: 28), // C0
    // .overlongEncodingByte(at: 29, errorStart: false), // AF
    // ])
    // test(
    // [0xC0, 0xAF, 0xE0, 0x80, 0xBF, 0xF0, 0x81, 0x82, 0x41],
    // [.overlongEncodingByte(at: 0), // C0
    // .overlongEncodingByte(at: 1, errorStart: false), // AF
    // .overlongEncodingByte(at: 2), // E0
    // .overlongEncodingByte(at: 3, errorStart: false), // 80
    // .overlongEncodingByte(at: 4, errorStart: false), // BF
    // .overlongEncodingByte(at: 5), // F0
    // .overlongEncodingByte(at: 6, errorStart: false), // 81
    // .overlongEncodingByte(at: 7, errorStart: false), // 82
    // ])
    // test(
    // [0x41, 0xC0, 0xAF, 0x41, 0xF4, 0x80, 0x80, 0x41],
    // [.overlongEncodingByte(at: 1), // C0
    // .overlongEncodingByte(at: 2, errorStart: false), // AF
    // .truncatedScalar(4...6), // F4 80 80
    // ])
    // test(
    // [0xED, 0xAF, 0x41],
    // [.surrogateCodePointByte(at: 0), // ED
    // .surrogateCodePointByte(at: 1, errorStart: false), // AF
    // ])
    // test(
    // [0xED, 0xA0, 0x80, 0xED, 0xBF, 0xBF, 0xED, 0xAF, 0x41],
    // [.surrogateCodePointByte(at: 0), // ED
    // .surrogateCodePointByte(at: 1, errorStart: false), // A0
    // .surrogateCodePointByte(at: 2, errorStart: false), // 80
    // .surrogateCodePointByte(at: 3), // ED
    // .surrogateCodePointByte(at: 4, errorStart: false), // BF
    // .surrogateCodePointByte(at: 5, errorStart: false), // BF
    // .surrogateCodePointByte(at: 6), // ED
    // .surrogateCodePointByte(at: 7, errorStart: false), // AF
    // ])
    // test(
    // [0xF4, 0x91, 0x92, 0x93, 0xFF, 0x41, 0x80, 0xBF, 0x42],
    // [.invalidNonSurrogateCodePointByte(at: 0), // F4
    // .invalidNonSurrogateCodePointByte(at: 1, errorStart: false), // 91
    // .invalidNonSurrogateCodePointByte(at: 2, errorStart: false), // 92
    // .invalidNonSurrogateCodePointByte(at: 3, errorStart: false), // 93
    // .invalidNonSurrogateCodePointByte(at: 4), // FF
    // .unexpectedContinuationByte(at: 6), // 80
    // .unexpectedContinuationByte(at: 7), // BF
    // ])
    // test(
    // [0xE1, 0x80, 0xE2, 0xF0, 0x91, 0x92, 0xF1, 0xBF, 0x41],
    // [.truncatedScalar(0...1), // E1 80
    // .truncatedScalar(2...2), // E2
    // .truncatedScalar(3...5), // F0 91 92
    // .truncatedScalar(6...7), // F1 BF
    // ])
    // test(
    // [0xE0, 0x81, 0x80],
    // [.overlongEncodingByte(at: 0), // E0
    // .overlongEncodingByte(at: 1, errorStart: false), // 81
    // .overlongEncodingByte(at: 2, errorStart: false), // 80
    // ])
  }
}

View File

@@ -0,0 +1,279 @@
// RUN: %target-run-stdlib-swift(-enable-experimental-feature LifetimeDependence) %S/Inputs/
// REQUIRES: swift_feature_LifetimeDependence
// REQUIRES: executable_test
import Swift
import StdlibUnittest
// Suite that collects the UTF8Span iterator tests registered further down in
// this file.
var suite = TestSuite("UTF8SpanIterator")
// Deferred so that the run starts only once the remaining top-level code
// (the `suite.test` registrations) has executed.
defer { runAllTests() }
@available(SwiftStdlib 6.2, *)
extension Array {
  /// Calls `f` with a `Span` over this array's elements and returns whatever
  /// `f` returns, rethrowing any error it throws.
  ///
  /// The span is created with `Span(_unsafeElements:)` from the buffer handed
  /// out by `withUnsafeBufferPointer`.
  func withSpan<R>(_ f: (Span<Element>) throws -> R) rethrows -> R {
    try withUnsafeBufferPointer { buffer in
      try f(Span(_unsafeElements: buffer))
    }
  }
}
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
  /// Calls `f` with a `Span<UInt8>` over this span's code units and returns
  /// whatever `f` returns, rethrowing any error it throws.
  ///
  /// The byte span is created with `Span(_unsafeElements:)` from the buffer
  /// handed out by `_withUnsafeBufferPointer`.
  func withSpan<R>(_ f: (Span<UInt8>) throws -> R) rethrows -> R {
    try _withUnsafeBufferPointer { buffer in
      try f(Span(_unsafeElements: buffer))
    }
  }
}
//
/// One iterator test case: a string whose contents are compared against the
/// views produced by a `UTF8Span` over the same UTF-8 bytes.
@available(SwiftStdlib 6.2, *)
struct ContentEquivalenceTestCase {
/// The reference string; the span under test is built from `str.utf8`.
var str: String
/// Source location passed as `stackTrace:` to the expectations.
var loc: SourceLocStack
}
@available(SwiftStdlib 6.2, *)
extension ContentEquivalenceTestCase {
  /// Expects a scalar iterator positioned at code-unit offset 0: nothing
  /// precedes it, and stepping forward then back yields the first scalar of
  /// `str` both times.
  func expectStart(
    _ iterator: inout UTF8Span.UnicodeScalarIterator
  ) {
    expectEqual(0, iterator.currentCodeUnitOffset, stackTrace: loc)
    expectNil(iterator.previous(), stackTrace: loc)

    let expected = str.unicodeScalars.first
    expectEqual(expected, iterator.next(), stackTrace: loc)
    expectEqual(expected, iterator.previous(), stackTrace: loc)
    expectNil(iterator.previous(), stackTrace: loc)
  }

  /// Expects a scalar iterator positioned at the end of its code units:
  /// nothing follows it, and stepping back then forward yields the last
  /// scalar of `str` both times.
  func expectEnd(
    _ iterator: inout UTF8Span.UnicodeScalarIterator
  ) {
    expectEqual(
      iterator.currentCodeUnitOffset,
      iterator.codeUnits.count,
      stackTrace: loc)
    expectNil(iterator.next(), stackTrace: loc)

    let expected = str.unicodeScalars.last
    expectEqual(expected, iterator.previous(), stackTrace: loc)
    expectEqual(expected, iterator.next(), stackTrace: loc)
    expectNil(iterator.next(), stackTrace: loc)
  }

  /// Expects a character iterator positioned at code-unit offset 0: nothing
  /// precedes it, and stepping forward then back yields the first character
  /// of `str` both times.
  func expectStart(
    _ iterator: inout UTF8Span.CharacterIterator
  ) {
    expectEqual(0, iterator.currentCodeUnitOffset, stackTrace: loc)
    expectNil(iterator.previous(), stackTrace: loc)

    let expected = str.first
    expectEqual(expected, iterator.next(), stackTrace: loc)
    expectEqual(expected, iterator.previous(), stackTrace: loc)
    expectNil(iterator.previous(), stackTrace: loc)
  }

  /// Expects a character iterator positioned at the end of its code units:
  /// nothing follows it, and stepping back then forward yields the last
  /// character of `str` both times.
  func expectEnd(
    _ iterator: inout UTF8Span.CharacterIterator
  ) {
    expectEqual(
      iterator.currentCodeUnitOffset,
      iterator.codeUnits.count,
      stackTrace: loc)
    expectNil(iterator.next(), stackTrace: loc)

    let expected = str.last
    expectEqual(expected, iterator.previous(), stackTrace: loc)
    expectEqual(expected, iterator.next(), stackTrace: loc)
    expectNil(iterator.next(), stackTrace: loc)
  }

  /// Calls `f` with a validated `UTF8Span` over a copy of `str`'s UTF-8
  /// bytes. Validation uses `try!`, so invalid bytes would trap.
  func withUTF8Span<R>(_ f: (UTF8Span) throws -> R) rethrows -> R {
    let utf8Bytes = Array(str.utf8)
    return try utf8Bytes.withSpan { try f(try! UTF8Span(validating: $0)) }
  }
}
@available(SwiftStdlib 6.2, *)
extension ContentEquivalenceTestCase {
  /// Checks that the span exposes exactly the UTF-8 code units of `str`.
  func testBytes() {
    withUTF8Span { utf8Span in
      utf8Span._withUnsafeBufferPointer {
        expectEqualSequence(str.utf8, $0, stackTrace: loc)
      }
    }
    // NOTE: There's a slight jarring due to not having the same
    // iterators for code units
  }

  /// Checks the Unicode scalar iterator against `str.unicodeScalars`:
  /// forwards, backwards, and through `skipForward(by:)` / `skipBack(by:)`.
  func testScalars() {
    withUTF8Span { utf8Span in
      // Test forwards
      var utf8SpanIter = utf8Span.makeUnicodeScalarIterator()
      var stringIter = str.unicodeScalars.makeIterator()
      while let scalar = utf8SpanIter.next() {
        expectEqual(scalar, stringIter.next(), stackTrace: loc)
      }
      expectNil(stringIter.next(), stackTrace: loc)
      expectEnd(&utf8SpanIter)

      // Test backwards
      var stringRevIter = str.unicodeScalars.reversed().makeIterator()
      while let scalar = utf8SpanIter.previous() {
        expectEqual(scalar, stringRevIter.next(), stackTrace: loc)
      }
      expectNil(stringRevIter.next(), stackTrace: loc)
      expectStart(&utf8SpanIter)

      let numElements = str.unicodeScalars.count
      let lastElement = str.unicodeScalars.last
      let firstElement = str.unicodeScalars.first

      // Skipping reports how many elements were actually skipped, so an
      // oversized request is expected to stop at the boundary.
      expectEqual(
        numElements, utf8SpanIter.skipForward(by: Int.max), stackTrace: loc)
      expectEnd(&utf8SpanIter)
      expectEqual(
        numElements, utf8SpanIter.skipBack(by: Int.max), stackTrace: loc)
      expectStart(&utf8SpanIter)
      expectEqual(
        numElements, utf8SpanIter.skipForward(by: numElements), stackTrace: loc)
      expectEnd(&utf8SpanIter)
      expectEqual(
        numElements, utf8SpanIter.skipBack(by: numElements), stackTrace: loc)
      expectStart(&utf8SpanIter)

      if numElements > 0 {
        // Stop one element short of each boundary and take the final step
        // manually.
        expectStart(&utf8SpanIter)
        expectEqual(
          numElements-1,
          utf8SpanIter.skipForward(by: numElements-1),
          stackTrace: loc)
        expectEqual(lastElement, utf8SpanIter.next(), stackTrace: loc)
        expectEnd(&utf8SpanIter)
        expectEqual(
          numElements-1,
          utf8SpanIter.skipBack(by: numElements-1),
          stackTrace: loc)
        expectEqual(firstElement, utf8SpanIter.previous(), stackTrace: loc)
        expectStart(&utf8SpanIter)
      }
      // TODO: test reset variants
      // TODO: test prefix/suffix
    }
  }

  /// Checks the character iterator against `str`'s characters: forwards,
  /// backwards, and through `skipForward(by:)` / `skipBack(by:)`.
  func testCharacters() {
    withUTF8Span { utf8Span in
      // Test forwards
      var utf8SpanIter = utf8Span.makeCharacterIterator()
      var stringIter = str.makeIterator()
      while let char = utf8SpanIter.next() {
        expectEqual(char, stringIter.next(), stackTrace: loc)
      }
      expectNil(stringIter.next(), stackTrace: loc)
      expectEnd(&utf8SpanIter)

      // Test backwards
      var stringRevIter = str.reversed().makeIterator()
      while let char = utf8SpanIter.previous() {
        expectEqual(char, stringRevIter.next(), stackTrace: loc)
      }
      expectNil(stringRevIter.next(), stackTrace: loc)
      expectStart(&utf8SpanIter)

      let numElements = str.count
      let lastElement = str.last
      let firstElement = str.first

      // Skipping reports how many elements were actually skipped, so an
      // oversized request is expected to stop at the boundary.
      expectEqual(
        numElements, utf8SpanIter.skipForward(by: Int.max), stackTrace: loc)
      expectEnd(&utf8SpanIter)
      expectEqual(
        numElements, utf8SpanIter.skipBack(by: Int.max), stackTrace: loc)
      expectStart(&utf8SpanIter)
      expectEqual(
        numElements, utf8SpanIter.skipForward(by: numElements), stackTrace: loc)
      expectEnd(&utf8SpanIter)
      expectEqual(
        numElements, utf8SpanIter.skipBack(by: numElements), stackTrace: loc)
      expectStart(&utf8SpanIter)

      if numElements > 0 {
        // Stop one element short of each boundary and take the final step
        // manually.
        expectStart(&utf8SpanIter)
        expectEqual(
          numElements-1,
          utf8SpanIter.skipForward(by: numElements-1),
          stackTrace: loc)
        expectEqual(lastElement, utf8SpanIter.next(), stackTrace: loc)
        expectEnd(&utf8SpanIter)
        expectEqual(
          numElements-1,
          utf8SpanIter.skipBack(by: numElements-1),
          stackTrace: loc)
        expectEqual(firstElement, utf8SpanIter.previous(), stackTrace: loc)
        expectStart(&utf8SpanIter)
      }
      // TODO: test reset variants
      // TODO: test prefix/suffix
    }
  }

  /// Runs the byte, scalar and character checks for this test case.
  func run() {
    testBytes()
    testScalars()
    testCharacters()
    // TODO: test grapheme break iterator
  }
}
// Registers the iterator test. Each `test(...)` call below captures its own
// `#line`, so a failure is attributed to the specific input that caused it.
if #available(SwiftStdlib 6.2, *) {
suite.test("UTF8Span/iterators") {
// Wraps `s` in a ContentEquivalenceTestCase tagged with the caller's file
// and line, then runs all of its checks.
func test(
_ s: String,
file: String = #file,
line: UInt = #line
) {
// print("testing: \(s)")
let t = ContentEquivalenceTestCase(
str: s, loc: .init(SourceLoc(file, line)))
t.run()
}
// Inputs range from the empty string through ASCII, accented Latin text,
// combining marks (`\u{301}`) and emoji.
test("")
test("a")
test("á")
test("a\u{301}")
test("🧟‍♀️")
test("abc")
test("abcde\u{301}")
test("abéÏ𓀀")
test("012345678901234567890")
test("abéÏ012345678901234567890𓀀")
test("😀😃🤢🤮👩🏿‍🎤🧛🏻‍♂️🧛🏻‍♂️👩‍👩‍👦‍👦")
test("defghijklmnopqrstuvwxyz")
test("ab🧟de\u{301}bytés")
test("ab🧟de\u{301}🧟‍♀️")
test("ab🧟de🧟\u{301}")
}
}
// @available(SwiftStdlib 6.2, *)
// extension UTF8Span {
// func splitOffASCIIPrefix() -> (UTF8Span, UTF8Span) {
// if isKnownASCII {
// return (self, .init())
// }
// var splitPoint = 0
// while splitPoint < codeUnits.count && codeUnits[unchecked: split] < 0x80 {
// splitPoint += 1
// }
// }
// }
// Placeholder test: the whole body is commented out, so this registers a test
// that performs no checks. The disabled sketch validates a URL containing an
// overlong encoding of "/" with `try!`.
if #available(SwiftStdlib 6.2, *) {
suite.test("UTF8Span/whatever") {
// var badURLBytes: [UInt8] = []
// badURLBytes.append(contentsOf: "http://servername/scripts/..".utf8)
// // Invalid overlong encoding of "/"
// badURLBytes.append(contentsOf: [0xC0, 0xAF])
// badURLBytes.append(contentsOf: "../winnt/system32/cmd.exe".utf8)
// // try! UTF8Span(validating: badURLBytes.span)
// badURLBytes.withSpan {
// try! UTF8Span(validating: $0)
// }
}
}

View File

@@ -0,0 +1,278 @@
// RUN: %target-run-stdlib-swift %S/Inputs/
// REQUIRES: executable_test
import Swift
import StdlibUnittest
@available(SwiftStdlib 6.2, *)
extension UTF8Span {
  /// Pattern-matching operator that lets a `UTF8Span` be switched over with
  /// string-literal cases.
  ///
  /// Matches when the literal's UTF-8 bytes and the span's bytes are equal
  /// element by element.
  static func ~=(_ lhs: StaticString, _ rhs: UTF8Span) -> Bool {
    lhs.withUTF8Buffer { patternBytes in
      rhs._withUnsafeBufferPointer { patternBytes.elementsEqual($0) }
    }
  }
}
// Suite that collects the UTF8Span query and comparison tests registered
// further down in this file.
var suite = TestSuite("UTF8SpanQueriesComparisons")
// Deferred so that the run starts only once the remaining top-level code
// (the `suite.test` registrations) has executed.
defer { runAllTests() }
@available(SwiftStdlib 6.2, *)
extension [UInt8] {
  /// Calls `f` with a `Span` over this array's bytes and returns whatever `f`
  /// returns, rethrowing any error it throws.
  func withSpan<R>(_ f: (Span<UInt8>) throws -> R) rethrows -> R {
    try withUnsafeBufferPointer { buffer in
      try f(Span(_unsafeElements: buffer))
    }
  }

  /// Calls `f` with a validated `UTF8Span` over this array's bytes.
  /// Validation uses `try!`, so invalid UTF-8 would trap.
  func withUTF8Span<R>(_ f: (UTF8Span) throws -> R) rethrows -> R {
    try withSpan { bytes in
      try f(try! UTF8Span(validating: bytes))
    }
  }
}
if #available(SwiftStdlib 6.2, *) {
suite.test("UTF8Span/tilde equals") {
  // Only the exact byte sequence may match. A substring, a strict prefix and
  // two over-long variants are tried first, in that order, and must all be
  // rejected.
  Array("abcdefg".utf8).withUTF8Span { utf8Span in
    switch utf8Span {
    case "def", "abcdef", "abcdefg ", "abcdefg\0":
      expectationFailure(
        "unexpected pattern match",
        trace: "",
        stackTrace: SourceLocStack().withCurrentLoc())
    case "abcdefg":
      break
    default:
      expectationFailure(
        "expected a pattern match",
        trace: "",
        stackTrace: SourceLocStack().withCurrentLoc())
    }
  }
}
// Placeholder test: the body is only a commented-out sketch of the intended
// test data, so nothing is checked yet.
suite.test("UTF8Span/Sequence equal") {
// // A string and its canonical equivalent
// let testCases: [(String, String?)] = [
// ("abdefg", nil),
// ("café", "cafe\u{301}")
// ]
}
suite.test("UTF8Span/isKnownASCII") {
  // Each input is paired with the `isKnownASCII` value expected right after
  // validation.
  let cases: [(input: String, isASCII: Bool)] = [
    ("abc", true),
    ("abcdefghil1235@#% _/.sladfj234 ", true),
    ("abcdefghil1\u{80}sladfj234 ", false),
  ]
  for testCase in cases {
    Array(testCase.input.utf8).withUTF8Span { span in
      expectEqual(testCase.isASCII, span.isKnownASCII)
    }
  }
}
suite.test("UTF8Span/isKnownNFC") {
  /// The cheapest check at which a span is reported to be in NFC.
  enum Normalness {
    /// `isKnownNFC` is already true after validation.
    case known
    /// `checkForNFC(quickCheck: true)` succeeds.
    case quickCheck
    /// Only `checkForNFC(quickCheck: false)` succeeds.
    case fullCheck
    /// Neither check succeeds.
    case notNFC
  }

  let tests: [(String, Normalness)] = [
    ("abc", .known),
    ("abcdefghil123567890", .known),
    ("abcdefghil1\u{299}123345678 ", .quickCheck),
    ("abc日曜日xyz", .quickCheck),
    ("abcde日曜日\u{301}", .fullCheck),
    ("abcde\u{301}fghijkl", .notNFC),
  ]
  for (test, expected) in tests {
    Array(test.utf8).withUTF8Span {
      // A mutable copy is required: `checkForNFC` is called on a `var`.
      var span = $0
      // Try the checks from cheapest to most expensive; the first one that
      // succeeds determines the classification.
      if span.isKnownNFC {
        expectEqual(expected, .known)
      } else if span.checkForNFC(quickCheck: true) {
        expectEqual(expected, .quickCheck)
      } else if span.checkForNFC(quickCheck: false) {
        expectEqual(expected, .fullCheck)
      } else {
        expectEqual(expected, .notNFC)
      }
    }
  }
}
suite.test("UTF8Span/canonical equivalence") {
  // TODO: refactor to be test-case declaration driven, and add more tests...
  // `(normalized: String, variants: [String], lessThan: String, greaterThan: String)`
  let precomposedStr = "café"
  let decomposedStr = "cafe\u{301}"

  Array(precomposedStr.utf8).withUTF8Span { utf8Precomposed in
    Array(decomposedStr.utf8).withUTF8Span { utf8Decomposed in
      // The two spellings are canonically equivalent...
      expectTrue(utf8Precomposed.isCanonicallyEquivalent(to: utf8Decomposed))

      // ...but differ byte for byte...
      expectTrue(utf8Precomposed.bytesEqual(to: precomposedStr.utf8))
      expectFalse(utf8Precomposed.bytesEqual(to: decomposedStr.utf8))
      expectTrue(utf8Decomposed.bytesEqual(to: decomposedStr.utf8))
      expectFalse(utf8Decomposed.bytesEqual(to: precomposedStr.utf8))

      // ...and scalar for scalar...
      expectTrue(
        utf8Precomposed.unicodeScalarsEqual(to: precomposedStr.unicodeScalars))
      expectFalse(
        utf8Precomposed.unicodeScalarsEqual(to: decomposedStr.unicodeScalars))
      expectTrue(
        utf8Decomposed.unicodeScalarsEqual(to: decomposedStr.unicodeScalars))
      expectFalse(
        utf8Decomposed.unicodeScalarsEqual(to: precomposedStr.unicodeScalars))

      // ...while comparing equal character by character.
      expectTrue(utf8Precomposed.charactersEqual(to: precomposedStr))
      expectTrue(utf8Precomposed.charactersEqual(to: decomposedStr))
      expectTrue(utf8Decomposed.charactersEqual(to: decomposedStr))
      expectTrue(utf8Decomposed.charactersEqual(to: precomposedStr))

      // Equivalence means no-one is less than the other
      expectFalse(utf8Decomposed.isCanonicallyLessThan(utf8Precomposed))
      expectFalse(utf8Precomposed.isCanonicallyLessThan(utf8Decomposed))
    }
  }
}
}
// TODO: Rest of this file is in-progress TODOs
/*
isKnownASCII
isKnownNFC
checkForNFC(quickCheck:)
isKnownSingleScalarCharacters
checkForSingleScalarCharacters(quickCheck:)
public func bytesEqual(to other: UTF8Span) -> Bool
public func bytesEqual(to other: some Sequence<UInt8>) -> Bool
public func unicodeScalarsEqual(
to other: some Sequence<Unicode.Scalar>
) -> Bool
public func charactersEqual(
to other: some Sequence<Character>
) -> Bool
public func isCanonicallyEquivalent(
to other: UTF8Span
) -> Bool
public func isCanonicallyLessThan(
_ other: UTF8Span
) -> Bool
*/
// @available(SwiftStdlib 6.2, *)
// private struct QueryTestCase {
// var content: String
// var loc: SourceLocStack
// var isASCII: Bool
// // TODO: This might become API, or otherwise calculated at init time
// var isLatinyNFC: Bool {
// bytes.allSatisfy { $0 < 0xCC }
// }
// var isQuickNFC: Bool
// var isNFC: Bool
// var isQuickSSC: Bool
// var isSSC: Bool
// }
// if #available(SwiftStdlib 6.2, *) {
// suite.test("UTF8Span/queries") {
// }
// }
// enum ComparisonResult {
// binaryEqual
// canonicallyEqual
// canonicallyLess
// inequal
// }
// private struct ComparisonTestCase {
// var content: String
// var comparisons: [(String, ComparisonResult)]
// var loc: SourceLocStack
// }
// if #available(SwiftStdlib 6.2, *) {
// suite.test("UTF8Span/comparisons") {
// func test()
// }
// }
/*
input string, to check the bits and relevant info
comparison string and expected comparison level
*/
// }