UTF8Span (#78531)

Add support for UTF8Span. Also, refactor validation and grapheme breaking.
2025-12-14 20:36:38 +01:00 · 2025-04-11 16:11:11 -06:00
parent 014825127b
commit e6e4bd6056
22 changed files with 3365 additions and 467 deletions
--- a/Runtimes/Core/core/CMakeLists.txt
+++ b/Runtimes/Core/core/CMakeLists.txt
@@ -208,6 +208,13 @@ add_library(swiftCore
  UnsafeRawPointer.swift
  UTFEncoding.swift
  UTF8.swift
+  UTF8EncodingError.swift
+  UTF8Span.swift
+  UTF8SpanBits.swift
+  UTF8SpanComparisons.swift
+  UTF8SpanFundamentals.swift
+  UTF8SpanInternalHelpers.swift
+  UTF8SpanIterators.swift
  UTF16.swift
  UTF32.swift
  Unicode.swift # ORDER DEPENDENCY: must follow new unicode support
--- a/stdlib/public/core/CMakeLists.txt
+++ b/stdlib/public/core/CMakeLists.txt
@@ -214,6 +214,13 @@ split_embedded_sources(
  EMBEDDED UnsafeRawPointer.swift
  EMBEDDED UTFEncoding.swift
  EMBEDDED UTF8.swift
+  EMBEDDED UTF8EncodingError.swift
+  EMBEDDED UTF8Span.swift
+  EMBEDDED UTF8SpanBits.swift
+  EMBEDDED UTF8SpanComparisons.swift
+  EMBEDDED UTF8SpanFundamentals.swift
+  EMBEDDED UTF8SpanInternalHelpers.swift
+  EMBEDDED UTF8SpanIterators.swift
  EMBEDDED UTF16.swift
  EMBEDDED UTF32.swift
  EMBEDDED Unicode.swift # ORDER DEPENDENCY: must follow new unicode support
--- a/stdlib/public/core/GroupInfo.json
+++ b/stdlib/public/core/GroupInfo.json
@@ -205,6 +205,15 @@
    "RawSpan.swift",
    "Span.swift"
  ],
+  "UTF8Span": [ 
+    "UTF8EncodingError.swift",
+    "UTF8Span.swift",
+    "UTF8SpanBits.swift",
+    "UTF8SpanComparisons.swift",
+    "UTF8SpanFundamentals.swift",
+    "UTF8SpanInternalHelpers.swift",
+    "UTF8SpanIterators.swift"
+  ],
  "Protocols": [
    "CompilerProtocols.swift",
    "ShadowProtocols.swift"
--- a/stdlib/public/core/String.swift
+++ b/stdlib/public/core/String.swift
@@ -1112,108 +1112,4 @@ extension String {
  }
 }

-extension _StringGutsSlice {
-  internal func _isScalarNFCQC(
-    _ scalar: Unicode.Scalar,
-    _ prevCCC: inout UInt8
-  ) -> Bool {
-    let normData = Unicode._NormData(scalar, fastUpperbound: 0x300)

-    if prevCCC > normData.ccc, normData.ccc != 0 {
-      return false
-    }
-
-    if !normData.isNFCQC {
-      return false
-    }
-
-    prevCCC = normData.ccc
-    return true
-  }
-
-  internal func _withNFCCodeUnits(_ f: (UInt8) throws -> Void) rethrows {
-    let substring = String(_guts)[range]
-    // Fast path: If we're already NFC (or ASCII), then we don't need to do
-    // anything at all.
-    if _fastPath(_guts.isNFC) {
-      try substring.utf8.forEach(f)
-      return
-    }
-
-    var isNFCQC = true
-    var prevCCC: UInt8 = 0
-
-    if _guts.isFastUTF8 {
-      _fastNFCCheck(&isNFCQC, &prevCCC)
-
-      // Because we have access to the fastUTF8, we can go through that instead
-      // of accessing the UTF8 view on String.
-      if isNFCQC {
-        try unsafe withFastUTF8 {
-          for unsafe byte in unsafe $0 {
-            try f(byte)
-          }
-        }
-
-        return
-      }
-    } else {
-      for scalar in substring.unicodeScalars {
-        if !_isScalarNFCQC(scalar, &prevCCC) {
-          isNFCQC = false
-          break
-        }
-      }
-
-      if isNFCQC {
-        for byte in substring.utf8 {
-          try f(byte)
-        }
-
-        return
-      }
-    }
-
-    for scalar in substring.unicodeScalars._internalNFC {
-      try scalar.withUTF8CodeUnits {
-        for unsafe byte in unsafe $0 {
-          try f(byte)
-        }
-      }
-    }
-  }
-
-  internal func _fastNFCCheck(_ isNFCQC: inout Bool, _ prevCCC: inout UInt8) {
-    unsafe withFastUTF8 { utf8 in
-      var position = 0
-
-      while position < utf8.count {
-        // If our first byte is less than 0xCC, then it means we're under the
-        // 0x300 scalar value and everything up to 0x300 is NFC already.
-        if unsafe utf8[position] < 0xCC {
-          // If our first byte is less than 0xC0, then it means it is ASCII
-          // and only takes up a single byte.
-          if unsafe utf8[position] < 0xC0 {
-            position &+= 1
-          } else {
-            // Otherwise, this is a 2 byte < 0x300 sequence.
-            position &+= 2
-          }
-          // ASCII always has ccc of 0.
-          prevCCC = 0
-
-          continue
-        }
-
-        let (scalar, len) = unsafe _decodeScalar(utf8, startingAt: position)
-
-        if !_isScalarNFCQC(scalar, &prevCCC) {
-          isNFCQC = false
-          return
-        }
-
-        position &+= len
-      }
-    }
-  }
-}
--- a/stdlib/public/core/StringComparison.swift
+++ b/stdlib/public/core/StringComparison.swift
@@ -97,7 +97,7 @@ internal func _stringCompareInternal(
 }

@_effects(readonly)
-private func _stringCompareFastUTF8(
+internal func _stringCompareFastUTF8(
  _ utf8Left: UnsafeBufferPointer<UInt8>,
  _ utf8Right: UnsafeBufferPointer<UInt8>,
  expecting: _StringComparisonResult,
--- a/stdlib/public/core/StringCreate.swift
+++ b/stdlib/public/core/StringCreate.swift
@@ -117,7 +117,7 @@ extension String {
      return unsafe (String._uncheckedFromUTF8(
        input, asciiPreScanResult: extraInfo.isASCII
      ), false)
-    case .error(let initialRange):
+    case .error(_, let initialRange):
        return unsafe (repairUTF8(input, firstKnownBrokenRange: initialRange), true)
    }
  }
@@ -139,7 +139,7 @@ extension String {
        newIsASCII: info.isASCII
      )
      return result.asString
-    case .error(let initialRange):
+    case .error(_, let initialRange):
      defer { _fixLifetime(result) }
      //This could be optimized to use excess tail capacity
      return unsafe repairUTF8(result.codeUnits, firstKnownBrokenRange: initialRange)
--- a/stdlib/public/core/StringGraphemeBreaking.swift
+++ b/stdlib/public/core/StringGraphemeBreaking.swift
@@ -13,14 +13,18 @@
 import SwiftShims

 /// CR and LF are common special cases in grapheme breaking logic
-private var _CR: UInt8 { return 0x0d }
-private var _LF: UInt8 { return 0x0a }
+private var _CR: UInt8 { return 0x0D }
+private var _LF: UInt8 { return 0x0A }

-internal func _hasGraphemeBreakBetween(
+/// Perform a quick-check to determine if there's a grapheme-break between two
+/// scalars, without consulting the data tables. Returns true if there
+/// definitely is a break, false if there definitely is none, and nil if a
+/// break couldn't be determined.
+internal func _quickHasGraphemeBreakBetween(
  _ lhs: Unicode.Scalar, _ rhs: Unicode.Scalar
-) -> Bool {
-
-  // CR-LF is a special case: no break between these
+) -> Bool? {
+  // GB3:
+  //   CR-LF is a special case: no break between these
  if lhs == Unicode.Scalar(_CR) && rhs == Unicode.Scalar(_LF) {
    return false
  }
@@ -80,7 +84,10 @@ internal func _hasGraphemeBreakBetween(
    default: return false
    }
  }
-  return hasBreakWhenPaired(lhs) && hasBreakWhenPaired(rhs)
+  if hasBreakWhenPaired(lhs) && hasBreakWhenPaired(rhs) {
+    return true
+  }
+  return nil
 }

 extension _StringGuts {
@@ -513,6 +520,8 @@ extension Unicode {
    internal var _previous: Unicode.Scalar
    internal var _state: _GraphemeBreakingState

+    /// Refactoring TODO: should we use a quick check result?
+    ///
    /// Returns a non-nil value if it can be determined whether there is a
    /// grapheme break between `scalar1` and `scalar2` without knowing anything
    /// about the scalars that precede `scalar1`. This can optionally be used as
@@ -523,13 +532,7 @@ extension Unicode {
      between scalar1: Unicode.Scalar,
      and scalar2: Unicode.Scalar
    ) -> Bool? {
-      if scalar1.value == 0xD, scalar2.value == 0xA {
-        return false
-      }
-      if _hasGraphemeBreakBetween(scalar1, scalar2) {
-        return true
-      }
-      return nil
+      _quickHasGraphemeBreakBetween(scalar1, scalar2)
    }

    /// Initialize a new character recognizer at the _start of text_ (sot)
@@ -637,59 +640,76 @@ extension _StringGuts {
    nextScalar: (Int) -> (scalar: Unicode.Scalar, end: Int)?
  ) -> Int {
    _internalInvariant(index < endIndex._encodedOffset)
+    return _nextGraphemeClusterBoundary(startingAt: index, nextScalar: nextScalar)
+  }
+}

-    // Note: If `index` in't already on a boundary, then starting with an empty
-    // state here sometimes leads to this method returning results that diverge
-    // from the true breaks in the string.
-    var state = _GraphemeBreakingState()
-    var (scalar, index) = nextScalar(index)!
+internal func _nextGraphemeClusterBoundary(
+  startingAt index: Int,
+  nextScalar: (Int) -> (scalar: Unicode.Scalar, end: Int)?
+) -> Int {

-    while true {
-      guard let (scalar2, nextIndex) = nextScalar(index) else { break }
-      if state.shouldBreak(between: scalar, and: scalar2) {
-        break
-      }
-      index = nextIndex
-      scalar = scalar2
+  // Note: If `index` isn't already on a boundary, then starting with an empty
+  // state here sometimes leads to this method returning results that diverge
+  // from the true breaks in the string.
+  var state = _GraphemeBreakingState()
+  var (scalar, index) = nextScalar(index)!
+
+  while true {
+    guard let (scalar2, nextIndex) = nextScalar(index) else { break }
+    if state.shouldBreak(between: scalar, and: scalar2) {
+      break
    }
-
-    return index
+    index = nextIndex
+    scalar = scalar2
  }

-  // Returns the stride of the grapheme cluster ending at offset `index`.
-  //
-  // This method uses `previousScalar` to looks back in the string as far as
-  // necessary to find a correct grapheme cluster boundary, whether or not
-  // `index` happens to be on a boundary itself.
-  internal func previousBoundary(
+  return index
+}
+
+extension _StringGuts {
+  fileprivate func previousBoundary(
    endingAt index: Int,
    previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
  ) -> Int {
-    // FIXME: This requires potentially arbitrary lookback in each iteration,
-    // leading to quadratic behavior in some edge cases. Ideally lookback should
-    // only be done once per cluster (or in the case of RI sequences, once per
-    // flag sequence). One way to avoid most quadratic behavior is to replace
-    // this implementation with a scheme that first searches backwards for a
-    // safe point then iterates forward using the regular `shouldBreak` until we
-    // reach `index`, as recommended in section 6.4 of TR#29.
-    //
-    // https://www.unicode.org/reports/tr29/#Random_Access
-
-    var (scalar2, index) = previousScalar(index)!
-
-    while true {
-      guard let (scalar1, previousIndex) = previousScalar(index) else { break }
-      if shouldBreakWithLookback(
-        between: scalar1, and: scalar2, at: index, with: previousScalar
-      ) {
-        break
-      }
-      index = previousIndex
-      scalar2 = scalar1
-    }
-
-    return index
+    _previousGraphemeClusterBoundary(endingAt: index, previousScalar: previousScalar)
  }
+
+}
+
+// Returns the start offset of the grapheme cluster ending at offset `index`.
+//
+// This method uses `previousScalar` to look back in the string as far as
+// necessary to find a correct grapheme cluster boundary, whether or not
+// `index` happens to be on a boundary itself.
+internal func _previousGraphemeClusterBoundary(
+  endingAt index: Int,
+  previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
+) -> Int {
+  // FIXME: This requires potentially arbitrary lookback in each iteration,
+  // leading to quadratic behavior in some edge cases. Ideally lookback should
+  // only be done once per cluster (or in the case of RI sequences, once per
+  // flag sequence). One way to avoid most quadratic behavior is to replace
+  // this implementation with a scheme that first searches backwards for a
+  // safe point then iterates forward using the regular `shouldBreak` until we
+  // reach `index`, as recommended in section 6.4 of TR#29.
+  //
+  // https://www.unicode.org/reports/tr29/#Random_Access
+
+  var (scalar2, index) = previousScalar(index)!
+
+  while true {
+    guard let (scalar1, previousIndex) = previousScalar(index) else { break }
+    if _shouldBreakWithLookback(
+      between: scalar1, and: scalar2, at: index, with: previousScalar
+    ) {
+      break
+    }
+    index = previousIndex
+    scalar2 = scalar1
+  }
+
+  return index
 }

 extension _GraphemeBreakingState {
@@ -708,13 +728,8 @@ extension _GraphemeBreakingState {
    between scalar1: Unicode.Scalar,
    and scalar2: Unicode.Scalar
  ) -> Bool {
-    // GB3
-    if scalar1.value == 0xD, scalar2.value == 0xA {
-      return false
-    }
-
-    if _hasGraphemeBreakBetween(scalar1, scalar2) {
-      return true
+    if let result = _quickHasGraphemeBreakBetween(scalar1, scalar2) {
+      return result
    }

    let x = Unicode._GraphemeBreakProperty(from: scalar1)
@@ -868,289 +883,282 @@ extension _GraphemeBreakingState {
  }
 }

-extension _StringGuts {
-  // Return true if there is an extended grapheme cluster boundary between two
-  // scalars, with no previous knowledge about preceding scalars.
-  //
-  // This method looks back as far as it needs to determine the correct
-  // placement of boundaries.
-  //
-  // This is based off of the Unicode Annex #29 for [Grapheme Cluster Boundary
-  // Rules](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules).
-  internal func shouldBreakWithLookback(
-    between scalar1: Unicode.Scalar,
-    and scalar2: Unicode.Scalar,
-    at index: Int,
-    with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
-  ) -> Bool {
-    // GB3
-    if scalar1.value == 0xD, scalar2.value == 0xA {
-      return false
-    }
-
-    if _hasGraphemeBreakBetween(scalar1, scalar2) {
-      return true
-    }
-
-    let x = Unicode._GraphemeBreakProperty(from: scalar1)
-    let y = Unicode._GraphemeBreakProperty(from: scalar2)
-
-    switch (x, y) {
-
-    // Fast path: If we know our scalars have no properties the decision is
-    //            trivial and we don't need to crawl to the default statement.
-    case (.any, .any):
-      return true
-
-    // GB4
-    case (.control, _):
-      return true
-
-    // GB5
-    case (_, .control):
-      return true
-
-    // GB6
-    case (.l, .l),
-         (.l, .v),
-         (.l, .lv),
-         (.l, .lvt):
-      return false
-
-    // GB7
-    case (.lv, .v),
-         (.v, .v),
-         (.lv, .t),
-         (.v, .t):
-      return false
-
-    // GB8
-    case (.lvt, .t),
-         (.t, .t):
-      return false
-
-    // GB9
-    case (_, .extend),
-         (_, .zwj):
-      return false
-
-    // GB9a
-    case (_, .spacingMark):
-      return false
-
-    // GB9b
-    case (.prepend, _):
-      return false
-
-    // GB11
-    case (.zwj, .extendedPictographic):
-      return !checkIfInEmojiSequence(at: index, with: previousScalar)
-
-    // GB12 & GB13
-    case (.regionalIndicator, .regionalIndicator):
-      return countRIs(at: index, with: previousScalar)
-
-    // GB999
-    default:
-      // GB9c
-      //
-      // Check if our rhs is an InCB=Consonant first because we can more easily
-      // exit out of this branch in most cases. Otherwise, this is a consonant.
-      // Check that the lhs is an InCB=Extend or InCB=Linker (we have to check
-      // if it's an .extend or .zwj first because _isInCBExtend assumes that it
-      // is true).
-      if scalar2._isInCBConsonant,
-         (x == .extend || x == .zwj),
-         (scalar1._isInCBExtend || scalar1._isInCBLinker) {
-        return !checkIfInIndicSequence(at: index, with: previousScalar)
-      }
-
-      return true
-    }
+// Return true if there is an extended grapheme cluster boundary between two
+// scalars, with no previous knowledge about preceding scalars.
+//
+// This method looks back as far as it needs to determine the correct
+// placement of boundaries.
+//
+// This is based off of the Unicode Annex #29 for [Grapheme Cluster Boundary
+// Rules](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules).
+fileprivate func _shouldBreakWithLookback(
+  between scalar1: Unicode.Scalar,
+  and scalar2: Unicode.Scalar,
+  at index: Int,
+  with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
+) -> Bool {
+  if let result = _quickHasGraphemeBreakBetween(scalar1, scalar2) {
+    return result
  }

-  // When walking backwards, it's impossible to know whether we were in an emoji
-  // sequence without walking further backwards. This walks the string backwards
-  // enough until we figure out whether or not to break our
-  // (.zwj, .extendedPictographic) question. For example:
-  //
-  // Scalar view #1:
-  //
-  //     [.control, .zwj, .extendedPictographic]
-  //                     ^
-  //                     | = To determine whether or not we break here, we need
-  //                         to see the previous scalar's grapheme property.
-  //          ^
-  //          | = This is neither .extendedPictographic nor .extend, thus we
-  //              were never in an emoji sequence, so break between the .zwj
-  //              and .extendedPictographic.
-  //
-  // Scalar view #2:
-  //
-  //     [.extendedPictographic, .zwj, .extendedPictographic]
-  //                                  ^
-  //                                  | = Same as above, move backwards one to
-  //                                      view the previous scalar's property.
-  //                ^
-  //                | = This is an .extendedPictographic, so this indicates that
-  //                    we are in an emoji sequence, so we should NOT break
-  //                    between the .zwj and .extendedPictographic.
-  //
-  // Scalar view #3:
-  //
-  //     [.extendedPictographic, .extend, .extend, .zwj, .extendedPictographic]
-  //                                                    ^
-  //                                                    | = Same as above
-  //                                         ^
-  //                                         | = This is an .extend which means
-  //                                             there is a potential emoji
-  //                                             sequence, walk further backwards
-  //                                             to find an .extendedPictographic.
-  //
-  //                               <-- = Another extend, go backwards more.
-  //                ^
-  //                | = We found our starting .extendedPictographic letting us
-  //                    know that we are in an emoji sequence so our initial
-  //                    break question is answered as NO.
-  internal func checkIfInEmojiSequence(
-    at index: Int,
-    with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
-  ) -> Bool {
-    guard var i = previousScalar(index)?.start else { return false }
-    while let prev = previousScalar(i) {
-      i = prev.start
-      let gbp = Unicode._GraphemeBreakProperty(from: prev.scalar)
+  let x = Unicode._GraphemeBreakProperty(from: scalar1)
+  let y = Unicode._GraphemeBreakProperty(from: scalar2)

-      switch gbp {
-      case .extend:
-        continue
-      case .extendedPictographic:
-        return true
-      default:
-        return false
-      }
-    }
+  switch (x, y) {
+
+  // Fast path: If we know our scalars have no properties the decision is
+  //            trivial and we don't need to crawl to the default statement.
+  case (.any, .any):
+    return true
+
+  // GB4
+  case (.control, _):
+    return true
+
+  // GB5
+  case (_, .control):
+    return true
+
+  // GB6
+  case (.l, .l),
+       (.l, .v),
+       (.l, .lv),
+       (.l, .lvt):
    return false
-  }
-
-  // When walking backwards, it's impossible to know whether we break when we
-  // see our first (InCB=Extend, InCB=Consonant) or (InCB=Linker, InCB=Consonant)
-  // without walking further backwards. This walks the string backwards enough
-  // until we figure out whether or not to break this indic sequence. For example:
-  //
-  // Scalar view #1:
-  //
-  //     [InCB=Linker, InCB=Extend, InCB=Consonant]
-  //                               ^
-  //                               | = To be able to know whether or not to
-  //                                   break these two, we need to walk
-  //                                   backwards to determine if this is a
-  //                                   legitimate indic sequence.
-  //      ^
-  //      | = The scalar sequence ends without a starting InCB=Consonant,
-  //          so this is in fact not an indic sequence, so we can break the two.
-  //
-  // Scalar view #2:
-  //
-  //     [InCB=Consonant, InCB=Linker, InCB=Extend, InCB=Consonant]
-  //                                               ^
-  //                                               | = Same as above
-  //                            ^
-  //                            | = This is a Linker, so we at least have seen
-  //                                1 to be able to return true if we see a
-  //                                consonant later.
-  //         ^
-  //         | = Is a consonant and we've seen a linker, so this is a
-  //             legitimate indic sequence, so do NOT break the initial question.
-  internal func checkIfInIndicSequence(
-    at index: Int,
-    with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
-  ) -> Bool {
-    guard let p = previousScalar(index) else { return false }
-
-    var hasSeenInCBLinker = p.scalar._isInCBLinker
-    var i = p.start
-
-    while let (scalar, prev) = previousScalar(i) {
-      i = prev
-
-      if scalar._isInCBConsonant {
-        return hasSeenInCBLinker
-      }
-
-      let gbp = Unicode._GraphemeBreakProperty(from: scalar)
-
-      guard gbp == .extend || gbp == .zwj else {
-        return false
-      }
-
-      switch (scalar._isInCBExtend, scalar._isInCBLinker) {
-      case (false, false):
-        return false
-
-      case (false, true):
-        hasSeenInCBLinker = true
-
-      case (true, false):
-        continue
-
-      case (true, true):
-        // This case should never happen, but if it does then just be cautious
-        // and say this is invalid.
-        return false
-      }
-    }

+  // GB7
+  case (.lv, .v),
+       (.v, .v),
+       (.lv, .t),
+       (.v, .t):
    return false
-  }

-  // When walking backwards, it's impossible to know whether we break when we
-  // see our first (.regionalIndicator, .regionalIndicator) without walking
-  // further backwards. This walks the string backwards enough until we figure
-  // out whether or not to break these RIs. For example:
-  //
-  // Scalar view #1:
-  //
-  //     [.control, .regionalIndicator, .regionalIndicator]
-  //                                   ^
-  //                                   | = To be able to know whether or not to
-  //                                       break these two, we need to walk
-  //                                       backwards to determine if there were
-  //                                       any previous .regionalIndicators in
-  //                                       a row.
-  //         ^
-  //         | = Not a .regionalIndicator, so our total riCount is 0 and 0 is
-  //             even thus we do not break.
-  //
-  // Scalar view #2:
-  //
-  //     [.control, .regionalIndicator, .regionalIndicator, .regionalIndicator]
-  //                                                       ^
-  //                                                       | = Same as above
-  //                         ^
-  //                         | = This is a .regionalIndicator, so continue
-  //                             walking backwards for more of them. riCount is
-  //                             now equal to 1.
-  //         ^
-  //         | = Not a .regionalIndicator. riCount = 1 which is odd, so break
-  //             the last two .regionalIndicators.
-  internal func countRIs(
-    at index: Int,
-    with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
-  ) -> Bool {
-    guard let p = previousScalar(index) else { return false }
-    var i = p.start
-    var riCount = 0
-    while let p = previousScalar(i) {
-      i = p.start
+  // GB8
+  case (.lvt, .t),
+       (.t, .t):
+    return false

-      let gbp = Unicode._GraphemeBreakProperty(from: p.scalar)
-      guard gbp == .regionalIndicator else {
-        break
-      }
+  // GB9
+  case (_, .extend),
+       (_, .zwj):
+    return false

-      riCount += 1
+  // GB9a
+  case (_, .spacingMark):
+    return false
+
+  // GB9b
+  case (.prepend, _):
+    return false
+
+  // GB11
+  case (.zwj, .extendedPictographic):
+    return !_checkIfInEmojiSequence(at: index, with: previousScalar)
+
+  // GB12 & GB13
+  case (.regionalIndicator, .regionalIndicator):
+    return _countRIs(at: index, with: previousScalar)
+
+  // GB999
+  default:
+    // GB9c
+    //
+    // Check if our rhs is an InCB=Consonant first because we can more easily
+    // exit out of this branch in most cases. Otherwise, this is a consonant.
+    // Check that the lhs is an InCB=Extend or InCB=Linker (we have to check
+    // if it's an .extend or .zwj first because _isInCBExtend assumes that it
+    // is true).
+    if scalar2._isInCBConsonant,
+       (x == .extend || x == .zwj),
+       (scalar1._isInCBExtend || scalar1._isInCBLinker) {
+      return !_checkIfInIndicSequence(at: index, with: previousScalar)
    }
-    return riCount & 1 != 0
+
+    return true
  }
 }
+
+// When walking backwards, it's impossible to know whether we were in an emoji
+// sequence without walking further backwards. This walks the string backwards
+// enough until we figure out whether or not to break our
+// (.zwj, .extendedPictographic) question. For example:
+//
+// Scalar view #1:
+//
+//     [.control, .zwj, .extendedPictographic]
+//                     ^
+//                     | = To determine whether or not we break here, we need
+//                         to see the previous scalar's grapheme property.
+//          ^
+//          | = This is neither .extendedPictographic nor .extend, thus we
+//              were never in an emoji sequence, so break between the .zwj
+//              and .extendedPictographic.
+//
+// Scalar view #2:
+//
+//     [.extendedPictographic, .zwj, .extendedPictographic]
+//                                  ^
+//                                  | = Same as above, move backwards one to
+//                                      view the previous scalar's property.
+//                ^
+//                | = This is an .extendedPictographic, so this indicates that
+//                    we are in an emoji sequence, so we should NOT break
+//                    between the .zwj and .extendedPictographic.
+//
+// Scalar view #3:
+//
+//     [.extendedPictographic, .extend, .extend, .zwj, .extendedPictographic]
+//                                                    ^
+//                                                    | = Same as above
+//                                         ^
+//                                         | = This is an .extend which means
+//                                             there is a potential emoji
+//                                             sequence, walk further backwards
+//                                             to find an .extendedPictographic.
+//
+//                               <-- = Another extend, go backwards more.
+//                ^
+//                | = We found our starting .extendedPictographic letting us
+//                    know that we are in an emoji sequence so our initial
+//                    break question is answered as NO.
+fileprivate func _checkIfInEmojiSequence(
+  at index: Int,
+  with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
+) -> Bool {
+  guard var i = previousScalar(index)?.start else { return false }
+  while let prev = previousScalar(i) {
+    i = prev.start
+    let gbp = Unicode._GraphemeBreakProperty(from: prev.scalar)
+
+    switch gbp {
+    case .extend:
+      continue
+    case .extendedPictographic:
+      return true
+    default:
+      return false
+    }
+  }
+  return false
+}
+
+// When walking backwards, it's impossible to know whether we break when we
+// see our first (InCB=Extend, InCB=Consonant) or (InCB=Linker, InCB=Consonant)
+// without walking further backwards. This walks the string backwards enough
+// until we figure out whether or not to break this indic sequence. For example:
+//
+// Scalar view #1:
+//
+//     [InCB=Linker, InCB=Extend, InCB=Consonant]
+//                               ^
+//                               | = To be able to know whether or not to
+//                                   break these two, we need to walk
+//                                   backwards to determine if this is a
+//                                   legitimate indic sequence.
+//      ^
+//      | = The scalar sequence ends without a starting InCB=Consonant,
+//          so this is in fact not an indic sequence, so we can break the two.
+//
+// Scalar view #2:
+//
+//     [InCB=Consonant, InCB=Linker, InCB=Extend, InCB=Consonant]
+//                                               ^
+//                                               | = Same as above
+//                            ^
+//                            | = This is a Linker, so we at least have seen
+//                                1 to be able to return true if we see a
+//                                consonant later.
+//         ^
+//         | = Is a consonant and we've seen a linker, so this is a
+//             legitimate indic sequence, so do NOT break the initial question.
+fileprivate func _checkIfInIndicSequence(
+  at index: Int,
+  with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
+) -> Bool {
+  guard let p = previousScalar(index) else { return false }
+
+  var hasSeenInCBLinker = p.scalar._isInCBLinker
+  var i = p.start
+
+  while let (scalar, prev) = previousScalar(i) {
+    i = prev
+
+    if scalar._isInCBConsonant {
+      return hasSeenInCBLinker
+    }
+
+    let gbp = Unicode._GraphemeBreakProperty(from: scalar)
+
+    guard gbp == .extend || gbp == .zwj else {
+      return false
+    }
+
+    switch (scalar._isInCBExtend, scalar._isInCBLinker) {
+    case (false, false):
+      return false
+
+    case (false, true):
+      hasSeenInCBLinker = true
+
+    case (true, false):
+      continue
+
+    case (true, true):
+      // This case should never happen, but if it does then just be cautious
+      // and say this is invalid.
+      return false
+    }
+  }
+
+  return false
+}
+
+// When walking backwards, it's impossible to know whether we break when we
+// see our first (.regionalIndicator, .regionalIndicator) without walking
+// further backwards. This walks the string backwards enough until we figure
+// out whether or not to break these RIs. For example:
+//
+// Scalar view #1:
+//
+//     [.control, .regionalIndicator, .regionalIndicator]
+//                                   ^
+//                                   | = To be able to know whether or not to
+//                                       break these two, we need to walk
+//                                       backwards to determine if there were
+//                                       any previous .regionalIndicators in
+//                                       a row.
+//         ^
+//         | = Not a .regionalIndicator, so our total riCount is 0 and 0 is
+//             even thus we do not break.
+//
+// Scalar view #2:
+//
+//     [.control, .regionalIndicator, .regionalIndicator, .regionalIndicator]
+//                                                       ^
+//                                                       | = Same as above
+//                         ^
+//                         | = This is a .regionalIndicator, so continue
+//                             walking backwards for more of them. riCount is
+//                             now equal to 1.
+//         ^
+//         | = Not a .regionalIndicator. riCount = 1 which is odd, so break
+//             the last two .regionalIndicators.
+fileprivate func _countRIs(
+  at index: Int,
+  with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
+) -> Bool {
+  guard let p = previousScalar(index) else { return false }
+  var i = p.start
+  var riCount = 0
+  while let p = previousScalar(i) {
+    i = p.start
+
+    let gbp = Unicode._GraphemeBreakProperty(from: p.scalar)
+    guard gbp == .regionalIndicator else {
+      break
+    }
+
+    riCount += 1
+  }
+  return riCount & 1 != 0
+}
--- a/stdlib/public/core/StringNormalization.swift
+++ b/stdlib/public/core/StringNormalization.swift
@@ -52,3 +52,119 @@ extension UnsafeBufferPointer where Element == UInt8 {
    return unsafe !UTF8.isContinuation(self[offset])
  }
 }
+
+/// Applies the NFC quick check to one scalar, given the prior scalar's ccc.
+internal func _isScalarNFCQC(
+  _ scalar: Unicode.Scalar,
+  _ prevCCC: inout UInt8
+) -> Bool {
+  let data = Unicode._NormData(scalar, fastUpperbound: 0x300)
+  let ccc = data.ccc
+  // Fail when a non-zero ccc is lower than the preceding one, or when the
+  // normalization data does not flag the scalar as NFC quick-check.
+  let isOrdered = ccc == 0 || prevCCC <= ccc
+  guard isOrdered, data.isNFCQC else {
+    return false
+  }
+  // `prevCCC` is only advanced for a scalar that passed.
+  prevCCC = ccc
+  return true
+}
+
+extension _StringGutsSlice {
+  /// Calls `f` with each UTF-8 code unit of this slice's NFC form, in order.
+  internal func _withNFCCodeUnits(_ f: (UInt8) throws -> Void) rethrows {
+    let substring = String(_guts)[range]
+    // Fast path: If we're already NFC (or ASCII), then we don't need to
+    // normalize; forward the existing code units as they are.
+    if _fastPath(_guts.isNFC) {
+      try substring.utf8.forEach(f)
+      return
+    }
+
+    var isNFCQC = true
+    var prevCCC: UInt8 = 0
+
+    if _guts.isFastUTF8 {
+      _fastNFCCheck(&isNFCQC, &prevCCC)
+      // Because we have access to the fastUTF8, we can go through that instead
+      // of accessing the UTF8 view on String.
+      if isNFCQC {
+        try unsafe withFastUTF8 {
+          for unsafe byte in unsafe $0 {
+            try f(byte)
+          }
+        }
+
+        return
+      }
+    } else {
+      for scalar in substring.unicodeScalars {
+        if !_isScalarNFCQC(scalar, &prevCCC) {
+          isNFCQC = false
+          break
+        }
+      }
+
+      if isNFCQC {
+        for byte in substring.utf8 {
+          try f(byte)
+        }
+
+        return
+      }
+    }
+    // Slow path: the quick check failed, so normalize and forward each
+    // resulting scalar's UTF-8 code units.
+    for scalar in substring.unicodeScalars._internalNFC {
+      try scalar.withUTF8CodeUnits {
+        for unsafe byte in unsafe $0 {
+          try f(byte)
+        }
+      }
+    }
+  }
+  /// Stores the result of `_nfcQuickCheck` over the fast UTF-8 in `isNFCQC`.
+  internal func _fastNFCCheck(_ isNFCQC: inout Bool, _ prevCCC: inout UInt8) {
+    unsafe withFastUTF8 { utf8 in
+      isNFCQC = unsafe _nfcQuickCheck(utf8, prevCCC: &prevCCC)
+    }
+  }
+}
+
+/// Runs the Unicode NFC quick check over `utf8`, returning `true` only if
+/// every scalar passes. `prevCCC` is read and updated along the way.
+internal func _nfcQuickCheck(
+  _ utf8: UnsafeBufferPointer<UInt8>,
+  prevCCC: inout UInt8
+) -> Bool {
+  // Byte offset of the scalar currently being examined.
+  var offset = 0
+  let end = utf8.count
+
+  while offset < end {
+    let lead = unsafe utf8[offset]
+
+    // A lead byte of 0xCC or higher starts a scalar at or above U+0300;
+    // those are decoded and checked one at a time.
+    if lead >= 0xCC {
+      let (scalar, len) = unsafe _decodeScalar(utf8, startingAt: offset)
+      guard _isScalarNFCQC(scalar, &prevCCC) else {
+        return false
+      }
+      offset &+= len
+      continue
+    }
+
+    // Everything below U+0300 is accepted without a lookup. A lead byte
+    // under 0xC0 is stepped over as one byte (ASCII in valid UTF-8);
+    // otherwise the scalar is a two-byte sequence.
+    offset &+= lead < 0xC0 ? 1 : 2
+
+    // The running combining class restarts at 0 after a skipped scalar.
+    prevCCC = 0
+  }
+
+  // Reached the end without a failing scalar.
+  return true
+}
+
--- a/stdlib/public/core/StringUTF8Validation.swift
+++ b/stdlib/public/core/StringUTF8Validation.swift
@@ -18,7 +18,7 @@ private func _isNotOverlong_F0(_ x: UInt8) -> Bool {
  return (0x90...0xBF).contains(x)
 }

-private func _isNotOverlong_F4(_ x: UInt8) -> Bool {
+private func _isNotInvalid_F4(_ x: UInt8) -> Bool {
  return UTF8.isContinuation(x) && x <= 0x8F
 }

@@ -26,7 +26,7 @@ private func _isNotOverlong_E0(_ x: UInt8) -> Bool {
  return (0xA0...0xBF).contains(x)
 }

-private func _isNotOverlong_ED(_ x: UInt8) -> Bool {
+private func _isNotInvalid_ED(_ x: UInt8) -> Bool {
  return UTF8.isContinuation(x) && x <= 0x9F
 }

@@ -34,15 +34,82 @@ internal struct UTF8ExtraInfo: Equatable {
  public var isASCII: Bool
 }

+@inline(never) // slow-path
+private func _diagnoseInvalidUTF8MultiByteLeading(
+  _ x: UInt8
+) -> _UTF8EncodingErrorKind {
+  // Expects a non-ASCII byte that is not a valid multi-byte lead.
+  _internalInvariant(x >= 0x80)
+  _internalInvariant(!_isUTF8MultiByteLeading(x))
+
+  // 0x80...0xBF: a continuation byte where a lead byte was expected.
+  if (0x80...0xBF).contains(x) { return .unexpectedContinuationByte }
+  // 0xC0 and 0xC1 are classified as overlong-encoding bytes.
+  if x == 0xC0 || x == 0xC1 { return .overlongEncodingByte }
+  // Anything left is expected to lie above 0xF4.
+  _internalInvariant(x > 0xF4)
+  return .invalidNonSurrogateCodePointByte
+}
+
 internal enum UTF8ValidationResult {
  case success(UTF8ExtraInfo)
-  case error(toBeReplaced: Range<Int>)
+  case error(
+    kind: _UTF8EncodingErrorKind, toBeReplaced: Range<Int>
+  )
+}
+
+// FIXME: refactor other parts of stdlib to avoid this dumb mirror enum
+//
+// Mirror of UTF8.ValidationError.Kind, available on 6.1
+internal struct _UTF8EncodingErrorKind: Error, Sendable, Hashable
+// TODO: embedded?, Codable
+  , RawRepresentable {
+  internal var rawValue: UInt8
+  /// The public kind with the same raw value; traps if there is none.
+  @available(SwiftStdlib 6.2, *)
+  internal var _publicKind: UTF8.ValidationError.Kind {
+    .init(rawValue: self.rawValue)!
+  }
+  /// Stores `rawValue` as given, without any range check.
+  @inlinable
+  internal init(rawValue: UInt8) {
+    self.rawValue = rawValue
+  }
+
+  /// A continuation byte (`10xxxxxx`) outside of a multi-byte sequence
+  @_alwaysEmitIntoClient
+  internal static var unexpectedContinuationByte: Self {
+    .init(rawValue: 0)
+  }
+
+  /// A byte in a surrogate code point (`U+D800..U+DFFF`) sequence
+  @_alwaysEmitIntoClient
+  internal static var surrogateCodePointByte: Self {
+    .init(rawValue: 1)
+  }
+
+  /// A byte in an invalid, non-surrogate code point (`>U+10FFFF`) sequence
+  @_alwaysEmitIntoClient
+  internal static var invalidNonSurrogateCodePointByte: Self {
+    .init(rawValue: 2)
+  }
+
+  /// A byte in an overlong encoding sequence
+  @_alwaysEmitIntoClient
+  internal static var overlongEncodingByte: Self {
+    .init(rawValue: 3)
+  }
+
+  /// A multi-byte sequence that is the start of a valid multi-byte scalar
+  /// but is cut off before ending correctly
+  @_alwaysEmitIntoClient
+  internal static var truncatedScalar: Self {
+    .init(rawValue: 4)
+  }
 }

 extension UTF8ValidationResult: Equatable {}

-private struct UTF8ValidationError: Error {}
-
 internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationResult {
  if unsafe _allASCII(buf) {
    return .success(UTF8ExtraInfo(isASCII: true))
@@ -51,12 +118,20 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationR
  var iter = unsafe buf.makeIterator()
  var lastValidIndex = buf.startIndex

-  @inline(__always) func guaranteeIn(_ f: (UInt8) -> Bool) throws(UTF8ValidationError) {
-    guard let cu = unsafe iter.next() else { throw UTF8ValidationError() }
-    guard f(cu) else { throw UTF8ValidationError() }
+  @inline(__always) func guarantee(
+    _ f: (UInt8) -> Bool,
+    _ err: _UTF8EncodingErrorKind
+  ) throws(_UTF8EncodingErrorKind) {
+    guard let cu = unsafe iter.next() else {
+      throw .truncatedScalar
+    }
+    guard f(cu) else {
+      throw err
+    }
  }
-  @inline(__always) func guaranteeContinuation() throws(UTF8ValidationError) {
-    try guaranteeIn(UTF8.isContinuation)
+  @inline(__always) func guaranteeContinuation(
+  ) throws(_UTF8EncodingErrorKind) {
+    try guarantee(UTF8.isContinuation, .truncatedScalar)
  }

  func _legacyInvalidLengthCalculation(_ _buffer: (_storage: UInt32, ())) -> Int {
@@ -117,21 +192,40 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationR
    return unsafe _legacyNarrowIllegalRange(buf: buf[illegalRange])
  }

-  do {
+  do throws(_UTF8EncodingErrorKind) {
+
+    /*
+    The table of valid UTF-8 is:
+
+     ╔════════════════════╦════════╦════════╦════════╦════════╗
+     ║    Scalar value    ║ Byte 0 ║ Byte 1 ║ Byte 2 ║ Byte 3 ║
+     ╠════════════════════╬════════╬════════╬════════╬════════╣
+     ║ U+0000..U+007F     ║ 00..7F ║        ║        ║        ║
+     ║ U+0080..U+07FF     ║ C2..DF ║ Contin ║        ║        ║
+     ║ U+0800..U+0FFF     ║ E0     ║ A0..BF ║ Contin ║        ║
+     ║ U+1000..U+CFFF     ║ E1..EC ║ Contin ║ Contin ║        ║
+     ║ U+D000..U+D7FF     ║ ED     ║ 80..9F ║ Contin ║        ║
+     ║ U+E000..U+FFFF     ║ EE..EF ║ Contin ║ Contin ║        ║
+     ║ U+10000..U+3FFFF   ║ F0     ║ 90..BF ║ Contin ║ Contin ║
+     ║ U+40000..U+FFFFF   ║ F1..F3 ║ Contin ║ Contin ║ Contin ║
+     ║ U+100000..U+10FFFF ║ F4     ║ 80..8F ║ Contin ║ Contin ║
+     ╚════════════════════╩════════╩════════╩════════╩════════╝
+
+     "Contin" is any continuation byte, i.e. 80..BF or 10xxxxxx
+     */
    var isASCII = true
    while let cu = unsafe iter.next() {
      if UTF8.isASCII(cu) { lastValidIndex &+= 1; continue }
      isASCII = false
      if _slowPath(!_isUTF8MultiByteLeading(cu)) {
-        func fail() throws(UTF8ValidationError) { throw UTF8ValidationError() }
-        try fail()
+        throw _diagnoseInvalidUTF8MultiByteLeading(cu)
      }
      switch cu {
      case 0xC2...0xDF:
        try guaranteeContinuation()
        lastValidIndex &+= 2
      case 0xE0:
-        try guaranteeIn(_isNotOverlong_E0)
+        try guarantee(_isNotOverlong_E0, .overlongEncodingByte)
        try guaranteeContinuation()
        lastValidIndex &+= 3
      case 0xE1...0xEC:
@@ -139,7 +233,7 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationR
        try guaranteeContinuation()
        lastValidIndex &+= 3
      case 0xED:
-        try guaranteeIn(_isNotOverlong_ED)
+        try guarantee(_isNotInvalid_ED, .surrogateCodePointByte)
        try guaranteeContinuation()
        lastValidIndex &+= 3
      case 0xEE...0xEF:
@@ -147,7 +241,7 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationR
        try guaranteeContinuation()
        lastValidIndex &+= 3
      case 0xF0:
-        try guaranteeIn(_isNotOverlong_F0)
+        try guarantee(_isNotOverlong_F0, .overlongEncodingByte)
        try guaranteeContinuation()
        try guaranteeContinuation()
        lastValidIndex &+= 4
@@ -157,7 +251,8 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationR
        try guaranteeContinuation()
        lastValidIndex &+= 4
      case 0xF4:
-        try guaranteeIn(_isNotOverlong_F4)
+        try guarantee(
+          _isNotInvalid_F4, .invalidNonSurrogateCodePointByte)
        try guaranteeContinuation()
        try guaranteeContinuation()
        lastValidIndex &+= 4
@@ -167,7 +262,9 @@ internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationR
    }
    return .success(UTF8ExtraInfo(isASCII: isASCII))
  } catch {
-    return unsafe .error(toBeReplaced: findInvalidRange(buf[lastValidIndex...]))
+    return unsafe .error(
+      kind: error,
+      toBeReplaced: findInvalidRange(buf[lastValidIndex...]))
  }
 }

@@ -214,7 +311,7 @@ internal func repairUTF8(_ input: UnsafeBufferPointer<UInt8>, firstKnownBrokenRa
    case .success:
      unsafe result.appendInPlace(remainingInput, isASCII: false)
      return String(result)
-    case .error(let newBrokenRange):
+    case .error(_, let newBrokenRange):
      brokenRange = newBrokenRange
    }
  } while !remainingInput.isEmpty
--- a/stdlib/public/core/UTF8EncodingError.swift
+++ b/stdlib/public/core/UTF8EncodingError.swift
@@ -0,0 +1,261 @@
+extension Unicode.UTF8 {
+  /**
+
+   The kind and location of a UTF-8 encoding error.
+
+   Valid UTF-8 is represented by this table:
+
+   ```
+   ╔════════════════════╦════════╦════════╦════════╦════════╗
+   ║    Scalar value    ║ Byte 0 ║ Byte 1 ║ Byte 2 ║ Byte 3 ║
+   ╠════════════════════╬════════╬════════╬════════╬════════╣
+   ║ U+0000..U+007F     ║ 00..7F ║        ║        ║        ║
+   ║ U+0080..U+07FF     ║ C2..DF ║ 80..BF ║        ║        ║
+   ║ U+0800..U+0FFF     ║ E0     ║ A0..BF ║ 80..BF ║        ║
+   ║ U+1000..U+CFFF     ║ E1..EC ║ 80..BF ║ 80..BF ║        ║
+   ║ U+D000..U+D7FF     ║ ED     ║ 80..9F ║ 80..BF ║        ║
+   ║ U+E000..U+FFFF     ║ EE..EF ║ 80..BF ║ 80..BF ║        ║
+   ║ U+10000..U+3FFFF   ║ F0     ║ 90..BF ║ 80..BF ║ 80..BF ║
+   ║ U+40000..U+FFFFF   ║ F1..F3 ║ 80..BF ║ 80..BF ║ 80..BF ║
+   ║ U+100000..U+10FFFF ║ F4     ║ 80..8F ║ 80..BF ║ 80..BF ║
+   ╚════════════════════╩════════╩════════╩════════╩════════╝
+   ```
+
+   ### Classifying errors
+
+   An *unexpected continuation* is when a continuation byte (`10xxxxxx`) occurs
+   in a position that should be the start of a new scalar value. Unexpected
+   continuations can often occur when the input contains arbitrary data
+   instead of textual content. An unexpected continuation at the start of
+   input might mean that the input was not correctly sliced along scalar
+   boundaries or that it does not contain UTF-8.
+
+   A *truncated scalar* is a multi-byte sequence that is the start of a valid
+   multi-byte scalar but is cut off before ending correctly. A truncated
+   scalar at the end of the input might mean that only part of the entire
+   input was received.
+
+   A *surrogate code point* (`U+D800..U+DFFF`) is invalid UTF-8. Surrogate
+   code points are used by UTF-16 to encode scalars in the supplementary
+   planes. Their presence may mean the input was encoded in a different 8-bit
+   encoding, such as CESU-8, WTF-8, or Java's Modified UTF-8.
+
+   An *invalid non-surrogate code point* is any code point higher than
+   `U+10FFFF`. This can often occur when the input is arbitrary data instead
+   of textual content.
+
+   An *overlong encoding* occurs when a scalar value that could have been
+   encoded using fewer bytes is encoded in a longer byte sequence. Overlong
+   encodings are invalid UTF-8 and can lead to security issues if not
+   correctly detected:
+
+   - https://nvd.nist.gov/vuln/detail/CVE-2008-2938
+   - https://nvd.nist.gov/vuln/detail/CVE-2000-0884
+
+   An overlong encoding of `NUL`, `0xC0 0x80`, is used in Java's Modified
+   UTF-8 but is invalid UTF-8. Overlong encoding errors often catch attempts
+   to bypass security measures.
+
+   ### Reporting the range of the error
+
+   The range of the error reported follows the *Maximal subpart of an
+   ill-formed subsequence* algorithm in which each error is either one byte
+   long or ends before the first byte that is disallowed. See "U+FFFD
+   Substitution of Maximal Subparts" in the Unicode Standard. Unicode has
+   recommended this algorithm since version 6, and the W3C has adopted it.
+
+   The maximal subpart algorithm will produce a single multi-byte range for a
+   truncated scalar (a multi-byte sequence that is the start of a valid
+   multi-byte scalar but is cut off before ending correctly). For all other
+   errors (including overlong encodings, surrogates, and invalid code
+   points), it will produce an error per byte.
+
+   // FIXME: without a checkAllErrors, we don't have these classification distinctions, should we drop it, ensure we will do it, or what?
+
+   Since overlong encodings, surrogates, and invalid code points are erroneous
+   by the second byte (at the latest), the above definition produces the same
+   ranges as defining such a sequence as a truncated scalar error followed by
+   unexpected continuation byte errors. The more semantically-rich
+   classification is reported.
+
+   For example, a surrogate code point sequence `ED A0 80` will be reported
+   as three `.surrogateCodePointByte` errors rather than a `.truncatedScalar`
+   followed by two `.unexpectedContinuationByte` errors.
+
+   Other commonly reported error ranges can be constructed from this result.
+   For example, PEP 383's error-per-byte can be constructed by mapping over
+   the reported range. Similarly, constructing a single error for the longest
+   invalid byte range can be constructed by joining adjacent error ranges.
+
+   ```
+   ╔═════════════════╦══════╦═════╦═════╦═════╦═════╦═════╦═════╦══════╗
+   ║                 ║  61  ║ F1  ║ 80  ║ 80  ║ E1  ║ 80  ║ C2  ║  62  ║
+   ╠═════════════════╬══════╬═════╬═════╬═════╬═════╬═════╬═════╬══════╣
+   ║ Longest range   ║ U+61 ║ err ║     ║     ║     ║     ║     ║ U+62 ║
+   ║ Maximal subpart ║ U+61 ║ err ║     ║     ║ err ║     ║ err ║ U+62 ║
+   ║ Error per byte  ║ U+61 ║ err ║ err ║ err ║ err ║ err ║ err ║ U+62 ║
+   ╚═════════════════╩══════╩═════╩═════╩═════╩═════╩═════╩═════╩══════╝
+   ```
+
+   */
+  @available(SwiftStdlib 6.2, *)
+  @frozen
+  public struct ValidationError: Error, Sendable, Hashable
+  {
+    /// Which class of encoding error was found.
+    public var kind: Unicode.UTF8.ValidationError.Kind
+
+    /// Byte offsets into the validated input that the error covers.
+    public var byteOffsets: Range<Int>
+
+    /// Creates an error of `kind` covering `byteOffsets`. Traps unless the
+    /// lower bound is non-negative and the range spans 1 to 3 bytes for
+    /// `.truncatedScalar`, or exactly 1 byte for any other kind.
+    @_alwaysEmitIntoClient
+    public init(
+      _ kind: Unicode.UTF8.ValidationError.Kind,
+      _ byteOffsets: Range<Int>
+    ) {
+      _precondition(byteOffsets.lowerBound >= 0)
+      let length = byteOffsets.count
+      let isTruncation = kind == .truncatedScalar
+      _precondition(isTruncation ? (1...3).contains(length) : length == 1)
+      self.kind = kind
+      self.byteOffsets = byteOffsets
+    }
+
+    /// Creates a one-byte error of `kind` located at `byteOffset`.
+    @_alwaysEmitIntoClient
+    public init(
+      _ kind: Unicode.UTF8.ValidationError.Kind, at byteOffset: Int
+    ) {
+      self.init(kind, byteOffset..<(byteOffset + 1))
+    }
+  }
+}
+
+
+@available(SwiftStdlib 6.2, *)
+extension UTF8.ValidationError {
+  /// The kind of encoding error encountered during validation
+  @frozen
+  public struct Kind: Error, Sendable, Hashable, RawRepresentable
+   {
+    public var rawValue: UInt8
+    /// Creates a kind from `rawValue`, or returns `nil` if it is above 4.
+    @inlinable
+    public init?(rawValue: UInt8) {
+      guard rawValue <= 4 else { return nil }
+      self.rawValue = rawValue
+    }
+
+    /// A continuation byte (`10xxxxxx`) outside of a multi-byte sequence
+    @_alwaysEmitIntoClient
+    public static var unexpectedContinuationByte: Self {
+      .init(rawValue: 0)!
+    }
+
+    /// A byte in a surrogate code point (`U+D800..U+DFFF`) sequence
+    @_alwaysEmitIntoClient
+    public static var surrogateCodePointByte: Self {
+      .init(rawValue: 1)!
+    }
+
+    /// A byte in an invalid, non-surrogate code point (`>U+10FFFF`) sequence
+    @_alwaysEmitIntoClient
+    public static var invalidNonSurrogateCodePointByte: Self {
+      .init(rawValue: 2)!
+    }
+
+    /// A byte in an overlong encoding sequence
+    @_alwaysEmitIntoClient
+    public static var overlongEncodingByte: Self {
+      .init(rawValue: 3)!
+    }
+
+    /// A multi-byte sequence that is the start of a valid multi-byte scalar
+    /// but is cut off before ending correctly
+    @_alwaysEmitIntoClient
+    public static var truncatedScalar: Self {
+      .init(rawValue: 4)!
+    }
+  }
+}
+
+@_unavailableInEmbedded
+@available(SwiftStdlib 6.2, *)
+extension UTF8.ValidationError.Kind: CustomStringConvertible {
+  public var description: String { // the matching member's name, dot included
+    switch self {
+    case .invalidNonSurrogateCodePointByte:
+      ".invalidNonSurrogateCodePointByte"
+    case .overlongEncodingByte:
+      ".overlongEncodingByte"
+    case .surrogateCodePointByte:
+      ".surrogateCodePointByte"
+    case .truncatedScalar:
+      ".truncatedScalar"
+    case .unexpectedContinuationByte:
+      ".unexpectedContinuationByte"
+    default: // reached only if `rawValue` matches none of the five members
+      fatalError("unreachable")
+    }
+  }
+}
+
+@_unavailableInEmbedded
+@available(SwiftStdlib 6.2, *)
+extension UTF8.ValidationError: CustomStringConvertible {
+  public var description: String { // "UTF8.ValidationError(<kind>, <offsets>)"
+    "UTF8.ValidationError(\(kind), \(byteOffsets))"
+  }
+}
+
+extension UTF8 {
+  /// Returns every validation error in `s`, offsets relative to its start.
+  @available(SwiftStdlib 6.2, *)
+  @usableFromInline // for testing purposes
+  internal static func _checkAllErrors(
+    _ s: some Sequence<UInt8>
+  ) -> Array<UTF8.ValidationError> {
+    // TODO: Span fast path
+    // TODO: Fixed size buffer for non-contig inputs
+    // TODO: Lifetime-dependent result variant
+    let cus = Array(s)
+    return unsafe cus.withUnsafeBytes {
+      var bufPtr = unsafe $0
+      var start = 0
+      var errors: Array<UTF8.ValidationError> = []
+      // Remember the previous error, so that we can
+      // apply it to subsequent bytes instead of reporting
+      // just `.unexpectedContinuationByte`.
+      var priorError: UTF8.ValidationError? = nil
+      while true {
+        // A nil base address means the buffer is empty: nothing left to check.
+        guard let base = unsafe bufPtr.baseAddress else { return errors }
+        do throws(UTF8.ValidationError) {
+          _ = unsafe try base._validateUTF8(limitedBy: bufPtr.count)
+          return errors
+        } catch {
+          let adjustedRange =
+            error.byteOffsets.lowerBound + start ..< error.byteOffsets.upperBound + start
+          let kind: UTF8.ValidationError.Kind
+          if let prior = priorError,
+             prior.byteOffsets.upperBound == adjustedRange.lowerBound,
+             error.kind == .unexpectedContinuationByte
+          {
+            kind = prior.kind
+          } else {
+            kind = error.kind
+          }
+          let adjustedErr = UTF8.ValidationError(kind, adjustedRange)
+          priorError = adjustedErr
+          // Resume validation immediately after the bytes just reported.
+          let errEnd = error.byteOffsets.upperBound
+          start += errEnd
+          unsafe bufPtr = .init(rebasing: bufPtr[errEnd...])
+          errors.append(adjustedErr)
+        }
+      }
+    }
+  }
+}
--- a/stdlib/public/core/UTF8Span.swift
+++ b/stdlib/public/core/UTF8Span.swift
@@ -0,0 +1,235 @@
+// TODO: comment header
+
+
+/// A non-escapable span of UTF-8 code units (TODO: fuller docs).
+@frozen
+@safe
+@available(SwiftStdlib 6.2, *)
+public struct UTF8Span: Copyable, ~Escapable, BitwiseCopyable {
+  @usableFromInline
+  internal var _unsafeBaseAddress: UnsafeRawPointer?
+
+  /*
+   A bit-packed count and flags (such as isASCII)
+
+   ╔═══════╦═════╦══════════╦═══════╗
+   ║  b63  ║ b62 ║ b61:56   ║ b55:0 ║
+   ╠═══════╬═════╬══════════╬═══════╣
+   ║ ASCII ║ NFC ║ reserved ║ count ║
+   ╚═══════╩═════╩══════════╩═══════╝
+
+   ASCII means the contents are known to be all-ASCII (<=0x7F).
+   NFC means contents are known to be in normal form C for fast comparisons.
+   */
+  @usableFromInline
+  internal var _countAndFlags: UInt64
+
+  // @_alwaysEmitIntoClient
+  @inline(__always)
+  @lifetime(borrow start) // TODO: borrow or copy?
+  internal init(
+    _unsafeAssumingValidUTF8 start: borrowing UnsafeRawPointer,
+    _countAndFlags: UInt64
+  ) {
+    unsafe self._unsafeBaseAddress = copy start
+    self._countAndFlags = _countAndFlags
+
+    _invariantCheck()
+  }
+
+  /// Creates a UTF8Span, bypassing safety and security checks. The caller
+  /// must guarantee that `codeUnits` contains validly-encoded UTF-8, or else
+  /// undefined behavior may result upon use. If `isKnownASCII: true` is
+  /// passed, the contents must be ASCII, or else undefined behavior may
+  /// result upon use.
+  @unsafe
+  @lifetime(copy codeUnits)
+  public init(
+    unchecked codeUnits: Span<UInt8>,
+    isKnownASCII: Bool = false
+  ) {
+    self.init(
+      _uncheckedAssumingValidUTF8: codeUnits,
+      isKnownASCII: isKnownASCII,
+      isKnownNFC: false
+    )
+  }
+
+  // FIXME: we need to make sure ALL API are nil safe, that is they
+  // at least check the count first
+  @_alwaysEmitIntoClient
+  internal func _start() -> UnsafeRawPointer {
+    unsafe _unsafeBaseAddress._unsafelyUnwrappedUnchecked
+  }
+}
+
+// TODO: try to convert code to be ran on Span instead of URP
+
+@available(SwiftStdlib 6.2, *)
+extension UTF8Span {
+  /// Creates a UTF8Span containing `codeUnits`. Validates that the input is
+  /// valid UTF-8, otherwise throws an error.
+  ///
+  /// The resulting UTF8Span has the same lifetime constraints as `codeUnits`.
+  @lifetime(copy codeUnits)
+  public init(
+    validating codeUnits: consuming Span<UInt8>
+  ) throws(UTF8.ValidationError) {
+    try self.init(_validating: codeUnits)
+  }
+
+  // TODO: this doesn't need to be underscored, I don't think
+  @lifetime(copy codeUnits)
+  internal init(
+    _validating codeUnits: consuming Span<UInt8>
+  ) throws(UTF8.ValidationError) {
+    guard let basePtr = unsafe codeUnits._pointer else {
+      unsafe self._unsafeBaseAddress = nil
+      self._countAndFlags = 0 // count 0, no flags set
+      return
+    }
+    // Validation throws on invalid input and otherwise reports ASCII-ness.
+    let count = codeUnits._count
+    let isASCII = unsafe try basePtr._validateUTF8(limitedBy: count)
+    // Store the count first; the ASCII flag is OR-ed in afterwards.
+    unsafe self._unsafeBaseAddress = .init(basePtr)
+    self._countAndFlags = UInt64(truncatingIfNeeded: count)
+    if isASCII {
+      _setIsASCII()
+    }
+    _internalInvariant(self.count == codeUnits.count)
+  }
+
+  // TODO: SPI?
+  @lifetime(copy codeUnits)
+  internal init(
+    _uncheckedAssumingValidUTF8 codeUnits: consuming Span<UInt8>,
+    isKnownASCII: Bool,
+    isKnownNFC: Bool
+  ) {
+    guard let ptr = unsafe codeUnits._pointer else {
+      unsafe self._unsafeBaseAddress = nil
+      self._countAndFlags = 0 // count 0, no flags set
+      return
+    }
+    // Store the count first; the requested flags are OR-ed in afterwards.
+    unsafe self._unsafeBaseAddress = ptr
+    self._countAndFlags = UInt64(truncatingIfNeeded: codeUnits.count)
+    if isKnownASCII {
+      _setIsASCII()
+    }
+    if isKnownNFC {
+      _setIsNFC()
+    }
+    _internalInvariant(self.count == codeUnits.count)
+  }
+
+  // HACK: working around lack of internal plumbing work
+  internal var _str: String { unsafe _start()._str(0..<count) }
+}
+
+
+// MARK: String
+
+
+@available(SwiftStdlib 6.2, *)
+extension UTF8Span {
+  /// Calls a closure with a buffer pointer to the viewed contiguous storage.
+  ///
+  /// The pointer passed to `body` is valid only during the execution of
+  /// `_withUnsafeBufferPointer(_:)`; do not store or return it. A span with
+  /// no base address passes an empty buffer whose `baseAddress` is `nil`.
+  ///
+  /// - Parameter body: A closure taking an `UnsafeBufferPointer` over the
+  ///   span's code units. Any error it throws is rethrown.
+  /// - Returns: The return value of the `body` closure parameter.
+  @_alwaysEmitIntoClient
+  borrowing public func _withUnsafeBufferPointer<
+    E: Error, Result: ~Copyable //& ~Escapable
+  >(
+    _ body: (_ buffer: /*borrowing*/ UnsafeBufferPointer<UInt8>) throws(E) -> Result
+  ) throws(E) -> Result {
+    // Check for a missing base address before `_start()` unwraps it unchecked.
+    guard unsafe _unsafeBaseAddress != nil else {
+      return try unsafe body(UnsafeBufferPointer<UInt8>(start: nil, count: 0))
+    }
+    return try unsafe body(_start()._ubp(0..<count))
+  }
+  // TODO: withSpan or similar?
+}
+
+// MARK: Internals
+
+@available(SwiftStdlib 6.2, *)
+extension UTF8Span {
+#if !INTERNAL_CHECKS_ENABLED
+  @inline(__always) internal func _invariantCheck() {} // no-op in this config
+#else
+  @inline(never) @_effects(releasenone)
+  internal func _invariantCheck() {
+    // Currently empty. TODO: validate the UTF-8 as an assertion (and isASCII)
+  }
+#endif
+}
+
+@available(SwiftStdlib 6.2, *)
+extension UTF8Span {
+  /// Whether the span's `count` is zero.
+  public var isEmpty: Bool {
+    self.count == 0
+  }
+
+  /// A `Span<UInt8>` with the same base address and count as this span.
+  public var span: Span<UInt8> {
+    @lifetime(copy self)
+    get {
+      unsafe Span(_unchecked: _unsafeBaseAddress, count: self.count)
+    }
+  }
+}
+
+// TODO(toolchain): decide if we rebase on top of Guillaume's work
+extension String {
+  /// Creates a string from the span's code units, reusing its `isKnownASCII`.
+  @available(SwiftStdlib 6.2, *)
+  public init(copying codeUnits: UTF8Span) {
+    let isASCII = codeUnits.isKnownASCII
+    self = unsafe codeUnits._withUnsafeBufferPointer { bufPtr in
+      unsafe String._uncheckedFromUTF8(bufPtr, isASCII: isASCII)
+    }
+  }
+  /// A `UTF8Span` over this string's UTF-8 view, borrowed from the string.
+  @available(SwiftStdlib 6.2, *)
+  public var utf8Span: UTF8Span {
+    @lifetime(borrow self)
+    borrowing get {
+      let isKnownASCII = _guts.isASCII
+      let utf8 = self.utf8
+      let span = utf8.span
+      let result = unsafe UTF8Span(
+        unchecked: span,
+        isKnownASCII: isKnownASCII)
+      return unsafe _overrideLifetime(result, borrowing: self)
+    }
+  }
+}
+
+extension Substring {
+  @available(SwiftStdlib 6.2, *)
+  public var utf8Span: UTF8Span { // span over this slice's UTF-8 view
+    @lifetime(borrow self)
+    borrowing get {
+      let isKnownASCII = base._guts.isASCII // whole base string, not the slice
+      let utf8 = self.utf8
+      let span = utf8.span
+      let result = unsafe UTF8Span(
+        unchecked: span,
+        isKnownASCII: isKnownASCII)
+      return unsafe _overrideLifetime(result, borrowing: self)
+    }
+  }
+}
+
+
+
+
--- a/stdlib/public/core/UTF8SpanBits.swift
+++ b/stdlib/public/core/UTF8SpanBits.swift
@@ -0,0 +1,126 @@
+@available(SwiftStdlib 6.2, *)
+extension UTF8Span {
+  /// Returns whether contents are known to be all-ASCII. A return value of
+  /// `true` means that all code units are ASCII. A return value of `false`
+  /// means there _may_ be non-ASCII content.
+  ///
+  /// ASCII-ness is checked and remembered during UTF-8 validation, so this
+  /// is often equivalent to is-ASCII, but there are some situations where
+  /// we might return `false` even when the content happens to be all-ASCII.
+  ///
+  /// For example, a UTF-8 span generated from a `String` that at some point
+  /// contained non-ASCII content would report false for `isKnownASCII`, even
+  /// if that String had subsequent mutation operations that removed any
+  /// non-ASCII content.
+  @_alwaysEmitIntoClient
+  public var isKnownASCII: Bool {
+    0 != _countAndFlags & Self._asciiBit
+  }
+
+  /// Do a scan checking for whether the contents are all-ASCII.
+  ///
+  /// Updates the `isKnownASCII` bit if contents are all-ASCII.
+  @lifetime(self: copy self)
+  public mutating func checkForASCII() -> Bool {
+    if isKnownASCII { return true }
+    // An empty span is all-ASCII and may have no base address to scan.
+    if count == 0 { _setIsASCII(); return true }
+
+    let result = unsafe _withUnsafeBufferPointer {
+      unsafe _allASCII($0)
+    }
+    if result { _setIsASCII() }
+    return result
+  }
+
+  /// Returns whether the contents are known to be NFC. This is not
+  /// always checked at initialization time and is set by `checkForNFC`.
+  // TODO: should this be @_unavailableInEmbedded
+  @_alwaysEmitIntoClient
+  public var isKnownNFC: Bool {
+    0 != _countAndFlags & Self._nfcBit
+  }
+
+  // Set the isKnownASCII bit to true (also isNFC)
+  @_alwaysEmitIntoClient
+  @lifetime(self: copy self)
+  internal mutating func _setIsASCII() {
+    self._countAndFlags |= Self._asciiBit | Self._nfcBit
+  }
+
+  // Set the isKnownNFC bit to true
+  @_alwaysEmitIntoClient
+  @lifetime(self: copy self)
+  internal mutating func _setIsNFC() {
+    self._countAndFlags |= Self._nfcBit
+  }
+
+  /// Do a scan checking for whether the contents are in Normal Form C.
+  /// When the contents are in NFC, canonical equivalence checks are much
+  /// faster.
+  ///
+  /// `quickCheck` will check for a subset of NFC contents using the
+  /// NFCQuickCheck algorithm, which is faster than the full normalization
+  /// algorithm. However, it cannot detect all NFC contents.
+  ///
+  /// Updates the `isKnownNFC` bit.
+  @_unavailableInEmbedded
+  @lifetime(self: copy self)
+  public mutating func checkForNFC(
+    quickCheck: Bool
+  ) -> Bool {
+    if isKnownNFC { return true }
+    // An empty span is in NFC and may have no base address to scan.
+    if count == 0 { _setIsNFC(); return true }
+
+    if quickCheck {
+      let result = unsafe _withUnsafeBufferPointer { utf8 in
+        var prevCCC: UInt8 = 0
+        return unsafe _nfcQuickCheck(utf8, prevCCC: &prevCCC)
+      }
+      if result { _setIsNFC() }
+      return result
+    }
+
+    // TODO: use faster internal algorithm
+    // Equal bytes before and after normalizing mean the span is already NFC.
+    let normalized = _str._nfcCodeUnits
+    guard unsafe _start()._urbp(
+      0..<count
+    ).elementsEqual(normalized) else {
+      return false
+    }
+    _setIsNFC()
+    return true
+  }
+}
+
+@available(SwiftStdlib 6.2, *)
+extension UTF8Span {
+  @_alwaysEmitIntoClient @inline(__always)
+  internal static var _asciiBit: UInt64 {
+    0x8000_0000_0000_0000 // bit 63
+  }
+  /// Bit 62 of `_countAndFlags`.
+  @_alwaysEmitIntoClient @inline(__always)
+  internal static var _nfcBit: UInt64 {
+    0x4000_0000_0000_0000
+  }
+  /// The low 56 bits (bits 55 through 0) of `_countAndFlags`.
+  @_alwaysEmitIntoClient @inline(__always)
+  internal static var _countMask: UInt64 {
+    0x00FF_FFFF_FFFF_FFFF
+  }
+  /// The high 8 bits (bits 63 through 56) of `_countAndFlags`.
+  @_alwaysEmitIntoClient @inline(__always)
+  internal static var _flagsMask: UInt64 {
+    0xFF00_0000_0000_0000
+  }
+  /// The count stored in `_countAndFlags`, with the flag bits masked off.
+  @_alwaysEmitIntoClient
+  public var count: Int {
+    Int(truncatingIfNeeded: _countAndFlags & Self._countMask)
+  }
+}
+
+
--- a/stdlib/public/core/UTF8SpanComparisons.swift
+++ b/stdlib/public/core/UTF8SpanComparisons.swift
@@ -0,0 +1,100 @@
+// TODO: comment header
+
+
+@available(SwiftStdlib 6.2, *)
+extension UTF8Span {
+  /// Whether this span has the same bytes, in the same order, as `other`.
+  @_alwaysEmitIntoClient
+  public func bytesEqual(to other: some Sequence<UInt8>) -> Bool {
+    unsafe _withUnsafeBufferPointer { unsafe $0.elementsEqual(other) }
+  }
+
+  /// Whether this span and `other` contain exactly the same `Unicode.Scalar`s.
+  @_alwaysEmitIntoClient
+  public func unicodeScalarsEqual(
+    to other: some Sequence<Unicode.Scalar>
+  ) -> Bool {
+    // TODO: We don't need to decode our code units, we can just match
+    // against their scalars' encoded bytes
+
+    var scalars = makeUnicodeScalarIterator()
+    var otherScalars = other.makeIterator()
+    while let s = scalars.next() {
+      guard let otherS = otherScalars.next(), s == otherS else {
+        return false
+      }
+    }
+    guard otherScalars.next() == nil else {
+      return false // `other` still has scalars left, so it is longer
+    }
+    return true
+  }
+
+  /// Whether this span and `other` contain exactly the same `Character`s.
+  @_unavailableInEmbedded
+  @_alwaysEmitIntoClient
+  public func charactersEqual(
+    to other: some Sequence<Character>
+  ) -> Bool {
+    var chars = makeCharacterIterator()
+    var otherChars = other.makeIterator()
+    while let c = chars.next() {
+      guard let otherC = otherChars.next(), c == otherC else {
+        return false
+      }
+    }
+    guard otherChars.next() == nil else {
+      return false // `other` still has characters left, so it is longer
+    }
+    return true
+  }
+}
+
+@available(SwiftStdlib 6.2, *)
+extension UTF8Span {
+  /// Returns `true` when `self` and `other` are equal under Unicode
+  /// Canonical Equivalence.
+  public func isCanonicallyEquivalent(
+    to other: UTF8Span
+  ) -> Bool {
+    unsafe self._withUnsafeBufferPointer { lhs in
+      unsafe other._withUnsafeBufferPointer { rhs in
+        let bothKnownNFC = self.isKnownNFC && other.isKnownNFC
+        return unsafe _stringCompareFastUTF8(
+          lhs, rhs,
+          expecting: .equal,
+          bothNFC: bothKnownNFC)
+      }
+    }
+  }
+
+  /// Returns `true` when `self` sorts before `other` under Unicode Canonical
+  /// Equivalence, comparing normalized (NFC) code units in order.
+  public func isCanonicallyLessThan(
+    _ other: UTF8Span
+  ) -> Bool {
+    unsafe self._withUnsafeBufferPointer { lhs in
+      unsafe other._withUnsafeBufferPointer { rhs in
+        let bothKnownNFC = self.isKnownNFC && other.isKnownNFC
+        return unsafe _stringCompareFastUTF8(
+          lhs, rhs,
+          expecting: .less,
+          bothNFC: bothKnownNFC)
+      }
+    }
+  }
+}
+
+// // FIXME: remove
+// @available(SwiftStdlib 6.2, *)
+// extension UTF8Span {
+//   public static func ~=(_ lhs: StaticString, _ rhs: UTF8Span) -> Bool {
+//     return lhs.withUTF8Buffer { str in
+//       rhs._withUnsafeBufferPointer { span in
+//         str.elementsEqual(span)
+//       }
+//     }
+//   }
+// }
+
+
--- a/stdlib/public/core/UTF8SpanFundamentals.swift
+++ b/stdlib/public/core/UTF8SpanFundamentals.swift
@@ -0,0 +1,360 @@
+// Core Scalar API
+@available(SwiftStdlib 6.2, *)
+extension UTF8Span {
+  /// Whether `i` is on a boundary between Unicode scalar values. `0` and
+  /// `count` are always reported as aligned.
+  ///
+  /// Does not validate that `i` is in bounds; this is an unsafe operation.
+  internal func _isScalarAligned(unchecked i: Int) -> Bool {
+    if i == count || i == 0 { return true }
+    _internalInvariant(_boundsCheck(i))
+    return unsafe _start()._isScalarAligned(i)
+  }
+
+  /// Returns the start of the `Unicode.Scalar` ending at `i`, i.e. the scalar
+  /// before the one starting at `i` or the last scalar if `i` is the end of
+  /// the span.
+  ///
+  /// `i` must be scalar-aligned and in `1...count`; both are checked.
+  internal func _previousScalarStart(_ i: Int) -> Int {
+    precondition(_boundsCheck(i&-1))
+    return _previousScalarStart(unchecked: i)
+  }
+
+  /// Returns the start of the `Unicode.Scalar` ending at `i`, i.e. the scalar
+  /// before the one starting at `i` or the last scalar if `i` is the end of
+  /// the span.
+  ///
+  /// `i` must be scalar-aligned; this is checked with a `precondition`.
+  ///
+  /// This function does not validate that `i` is within the span's bounds;
+  /// this is an unsafe operation.
+  internal func _previousScalarStart(unchecked i: Int) -> Int {
+    _internalInvariant(_boundsCheck(i&-1))
+    precondition(_isScalarAligned(unchecked: i))
+    return _previousScalarStart(uncheckedAssumingAligned: i)
+  }
+
+  /// Returns the start of the `Unicode.Scalar` ending at `i`, i.e. the scalar
+  /// before the one starting at `i` or the last scalar if `i` is the end of
+  /// the span.
+  ///
+  /// `i` must be scalar-aligned.
+  ///
+  /// This function does not validate that `i` is within the span's bounds;
+  /// this is an unsafe operation.
+  ///
+  /// This function does not validate that `i` is scalar-aligned; this is an
+  /// unsafe operation if `i` isn't. Both conditions are only asserted with
+  /// `_internalInvariant`.
+  internal func _previousScalarStart(
+    uncheckedAssumingAligned i: Int
+  ) -> Int {
+    _internalInvariant(_boundsCheck(i&-1))
+    _internalInvariant(_isScalarAligned(unchecked: i))
+    return unsafe _start()._previousScalarStart(i)
+  }
+
+  /// Decode the `Unicode.Scalar` starting at `i`. Return it and the start of
+  /// the next scalar.
+  ///
+  /// `i` must be scalar-aligned.
+  ///
+  /// This function does not validate that `i` is within the span's bounds;
+  /// this is an unsafe operation.
+  ///
+  /// This function does not validate that `i` is scalar-aligned; this is an
+  /// unsafe operation if `i` isn't. Both conditions are only asserted with
+  /// `_internalInvariant`.
+  internal func _decodeNextScalar(
+    uncheckedAssumingAligned i: Int
+  ) -> (Unicode.Scalar, nextScalarStart: Int) {
+    _internalInvariant(_boundsCheck(i))
+    _internalInvariant(_isScalarAligned(unchecked: i))
+    return unsafe _start()._decodeScalar(startingAt: i)
+  }
+
+  /// Decode the `Unicode.Scalar` ending at `i`, i.e. the previous scalar.
+  /// Return it and the start of that scalar.
+  ///
+  /// `i` must be scalar-aligned and in `1...count`; both are checked.
+  internal func _decodePreviousScalar(
+    _ i: Int
+  ) -> (Unicode.Scalar, previousScalarStart: Int) {
+    precondition(_boundsCheck(i &- 1))
+    return _decodePreviousScalar(unchecked: i)
+  }
+
+  /// Decode the `Unicode.Scalar` ending at `i`, i.e. the previous scalar.
+  /// Return it and the start of that scalar.
+  ///
+  /// `i` must be scalar-aligned; this is checked with a `precondition`.
+  ///
+  /// This function does not validate that `i` is within the span's bounds;
+  /// this is an unsafe operation.
+  internal func _decodePreviousScalar(
+    unchecked i: Int
+  ) -> (Unicode.Scalar, previousScalarStart: Int) {
+    _internalInvariant(_boundsCheck(i &- 1))
+    precondition(_isScalarAligned(unchecked: i))
+    return _decodePreviousScalar(uncheckedAssumingAligned: i)
+  }
+
+  /// Decode the `Unicode.Scalar` ending at `i`, i.e. the previous scalar.
+  /// Return it and the start of that scalar.
+  ///
+  /// `i` must be scalar-aligned.
+  ///
+  /// This function does not validate that `i` is within the span's bounds;
+  /// this is an unsafe operation.
+  ///
+  /// This function does not validate that `i` is scalar-aligned; this is an
+  /// unsafe operation if `i` isn't. Both conditions are only asserted with
+  /// `_internalInvariant`.
+  internal func _decodePreviousScalar(
+    uncheckedAssumingAligned i: Int
+  ) -> (Unicode.Scalar, previousScalarStart: Int) {
+    _internalInvariant(_boundsCheck(i &- 1))
+    _internalInvariant(_isScalarAligned(unchecked: i))
+    return unsafe _start()._decodeScalar(endingAt: i)
+  }
+}
+
+// Derived Scalar API
+@available(SwiftStdlib 6.2, *)
+extension UTF8Span {
+  /// Find the nearest scalar-aligned position `<= i`, for `i` in `0...count`.
+  internal func _scalarAlignBackwards(_ i: Int) -> Int {
+    if i == count || i == 0 { return i }
+
+    precondition(_boundsCheck(i))
+    return unsafe _start()._scalarAlign(i)
+  }
+
+  /// Find the nearest scalar-aligned position `>= i`, for `i` in `0...count`.
+  internal func _scalarAlignForwards(_ i: Int) -> Int {
+    // `0` and `count` are always aligned. `count` is a valid position even
+    // though no code unit can be read from it, so accept it up front.
+    if i == count || i == 0 { return i }
+
+    // Trap on any other out-of-bounds offset before reading code units.
+    precondition(_boundsCheck(i))
+
+    // The scan stops no later than `count`, which
+    // `_isScalarAligned(unchecked:)` reports as aligned without a load.
+    return _scalarAlignForwards(unchecked: i)
+  }
+
+  /// Find the nearest scalar-aligned position `>= i`.
+  ///
+  /// This function does not validate that `i` is within the span's bounds;
+  /// this is an unsafe operation.
+  internal func _scalarAlignForwards(unchecked i: Int) -> Int {
+    if i == count || i == 0 { return i }
+
+    var i = i
+    while _slowPath(!_isScalarAligned(unchecked: i)) {
+      i &+= 1
+    }
+    return i
+  }
+}
+
+// Core Character API
+@available(SwiftStdlib 6.2, *)
+extension UTF8Span {
+  /// Returns the start of the next `Character` (i.e. grapheme cluster) after
+  /// the one starting at `i`, or the end of the span if `i` denotes the final
+  /// `Character`.
+  ///
+  /// `i` must be `Character`-aligned and in `0..<count`.
+  internal func _nextCharacterStart(_ i: Int) -> Int {
+    precondition(_boundsCheck(i))
+    return _nextCharacterStart(unchecked: i)
+  }
+
+  /// Returns the start of the next `Character` (i.e. grapheme cluster) after
+  /// the one starting at `i`, or the end of the span if `i` denotes the final
+  /// `Character`.
+  ///
+  /// `i` must be `Character`-aligned; only scalar alignment is checked.
+  ///
+  /// This function does not validate that `i` is within the span's bounds;
+  /// this is an unsafe operation.
+  internal func _nextCharacterStart(unchecked i: Int) -> Int {
+    _internalInvariant(_boundsCheck(i))
+    precondition(_isScalarAligned(unchecked: i))
+    return _nextCharacterStart(uncheckedAssumingAligned: i)
+  }
+
+  /// Returns the start of the next `Character` (i.e. grapheme cluster) after
+  /// the one starting at `i`, or the end of the span if `i` denotes the final
+  /// `Character`.
+  ///
+  /// `i` must be `Character`-aligned.
+  ///
+  /// This function does not validate that `i` is within the span's bounds;
+  /// this is an unsafe operation.
+  ///
+  /// This function does not validate that `i` is `Character`-aligned; this is
+  /// an unsafe operation if `i` isn't.
+  internal func _nextCharacterStart(
+    uncheckedAssumingAligned i: Int
+  ) -> Int {
+    _internalInvariant(_boundsCheck(i))
+    _internalInvariant(_isScalarAligned(unchecked: i))
+    return unsafe _start()._nextCharacterStart(i, limitedBy: count)
+  }
+
+  /// Returns the start of the `Character` (i.e. grapheme cluster) ending at
+  /// `i`, i.e. the `Character` before the one starting at `i` or the last
+  /// `Character` if `i` is the end of the span.
+  ///
+  /// `i` must be `Character`-aligned and in `1...count`.
+  internal func _previousCharacterStart(_ i: Int) -> Int {
+    precondition(_boundsCheck(i&-1))
+    return _previousCharacterStart(unchecked: i)
+  }
+
+  /// Returns the start of the `Character` (i.e. grapheme cluster) ending at
+  /// `i`, i.e. the `Character` before the one starting at `i` or the last
+  /// `Character` if `i` is the end of the span.
+  ///
+  /// `i` must be `Character`-aligned; only scalar alignment is checked.
+  ///
+  /// This function does not validate that `i` is within the span's bounds;
+  /// this is an unsafe operation.
+  internal func _previousCharacterStart(unchecked i: Int) -> Int {
+    _internalInvariant(_boundsCheck(i&-1))
+    precondition(_isScalarAligned(unchecked: i))
+    return _previousCharacterStart(uncheckedAssumingAligned: i)
+  }
+
+  /// Returns the start of the `Character` (i.e. grapheme cluster) ending at
+  /// `i`, i.e. the `Character` before the one starting at `i` or the last
+  /// `Character` if `i` is the end of the span.
+  ///
+  /// `i` must be `Character`-aligned.
+  ///
+  /// This function does not validate that `i` is within the span's bounds;
+  /// this is an unsafe operation.
+  ///
+  /// This function does not validate that `i` is `Character`-aligned; this is
+  /// an unsafe operation if `i` isn't.
+  internal func _previousCharacterStart(
+    uncheckedAssumingAligned i: Int
+  ) -> Int {
+    _internalInvariant(_boundsCheck(i&-1))
+    _internalInvariant(_isScalarAligned(unchecked: i))
+    return unsafe _start()._previousCharacterStart(i, limitedBy: count)
+  }
+
+  /// Decode the `Character` starting at `i`. Return it and the start of the
+  /// next `Character`.
+  ///
+  /// `i` must be `Character`-aligned and in `0..<count`.
+  internal func _decodeNextCharacter(
+    _ i: Int
+  ) -> (Character, nextCharacterStart: Int) {
+    precondition(_boundsCheck(i))
+    return _decodeNextCharacter(unchecked: i)
+  }
+
+  /// Decode the `Character` starting at `i`. Return it and the start of the
+  /// next `Character`.
+  ///
+  /// `i` must be `Character`-aligned; only scalar alignment is checked.
+  ///
+  /// This function does not validate that `i` is within the span's bounds;
+  /// this is an unsafe operation.
+  internal func _decodeNextCharacter(
+    unchecked i: Int
+  ) -> (Character, nextCharacterStart: Int) {
+    _internalInvariant(_boundsCheck(i))
+    precondition(_isScalarAligned(unchecked: i))
+    return _decodeNextCharacter(uncheckedAssumingAligned: i)
+  }
+
+  /// Decode the `Character` starting at `i`. Return it and the start of the
+  /// next `Character`.
+  ///
+  /// `i` must be `Character`-aligned.
+  ///
+  /// This function does not validate that `i` is within the span's bounds;
+  /// this is an unsafe operation.
+  ///
+  /// This function does not validate that `i` is `Character`-aligned; this is
+  /// an unsafe operation if `i` isn't.
+  internal func _decodeNextCharacter(
+    uncheckedAssumingAligned i: Int
+  ) -> (Character, nextCharacterStart: Int) {
+    _internalInvariant(_boundsCheck(i))
+    _internalInvariant(_isScalarAligned(unchecked: i))
+    return unsafe _start()._decodeCharacter(
+      startingAt: i, limitedBy: count)
+  }
+
+  /// Decode the `Character` (i.e. grapheme cluster) ending at `i`, i.e. the
+  /// previous `Character`. Return it and the start of that `Character`.
+  ///
+  /// `i` must be `Character`-aligned and in `1...count`.
+  internal func _decodePreviousCharacter(_ i: Int) -> (Character, Int) {
+    precondition(_boundsCheck(i &- 1))
+    return _decodePreviousCharacter(unchecked: i)
+  }
+
+  /// Decode the `Character` (i.e. grapheme cluster) ending at `i`, i.e. the
+  /// previous `Character`. Return it and the start of that `Character`.
+  ///
+  /// `i` must be `Character`-aligned; only scalar alignment is checked.
+  ///
+  /// This function does not validate that `i` is within the span's bounds;
+  /// this is an unsafe operation.
+  internal func _decodePreviousCharacter(
+    unchecked i: Int
+  ) -> (Character, Int) {
+    _internalInvariant(_boundsCheck(i &- 1))
+    precondition(_isScalarAligned(unchecked: i))
+    return _decodePreviousCharacter(uncheckedAssumingAligned: i)
+  }
+
+  /// Decode the `Character` (i.e. grapheme cluster) ending at `i`, i.e. the
+  /// previous `Character`. Return it and the start of that `Character`.
+  ///
+  /// `i` must be `Character`-aligned.
+  ///
+  /// This function does not validate that `i` is within the span's bounds;
+  /// this is an unsafe operation.
+  ///
+  /// This function does not validate that `i` is `Character`-aligned; this is
+  /// an unsafe operation if `i` isn't.
+  internal func _decodePreviousCharacter(
+    uncheckedAssumingAligned i: Int
+  ) -> (Character, Int) {
+    _internalInvariant(_boundsCheck(i &- 1))
+    _internalInvariant(_isScalarAligned(unchecked: i))
+    return unsafe _start()._decodeCharacter(
+      endingAt: i, limitedBy: count)
+  }
+
+}
+
+// TODO: internal?
+@available(SwiftStdlib 6.2, *)
+extension UTF8Span {
+  /// Whether `i` is the offset of a code unit, i.e. `0 <= i < count`.
+  @_alwaysEmitIntoClient
+  internal func _boundsCheck(_ i: Int) -> Bool {
+    i >= 0 && i < count
+  }
+  /// Whether `bounds` lies within `0...count` (empty ranges included).
+  @_alwaysEmitIntoClient
+  internal func _boundsCheck(_ bounds: Range<Int>) -> Bool {
+    // `Range` guarantees `lowerBound <= upperBound`, so two comparisons do.
+    bounds.lowerBound >= 0 && bounds.upperBound <= count
+  }
+}
+
+// Future work: UTF-16 support when we get views
+
+
--- a/stdlib/public/core/UTF8SpanInternalHelpers.swift
+++ b/stdlib/public/core/UTF8SpanInternalHelpers.swift
@@ -0,0 +1,179 @@
+/*
+
+ Additional helpers built on stdlibDuplicates.swift
+
+ */
+
+// TODO: Should we update our unicode helpers file to call these instead?
+
+// import Builtin
+
+extension UnsafeRawPointer {
+  // Loads the byte at offset `i` from `self`. [@_alwaysEmitIntoClient disabled]
+  internal func _loadByte(_ i: Int) -> UInt8 {
+    _internalInvariant(i >= 0)
+    return unsafe (self+i).loadUnaligned(as: UInt8.self)
+  }
+
+  // Whether byte `i` is a UTF-8 continuation byte. [@_alwaysEmitIntoClient disabled]
+  internal func _isUTF8Continuation(_ i: Int) -> Bool {
+    unsafe UTF8.isContinuation(_loadByte(i))
+  }
+
+  // Whether byte `i` is not a continuation byte. [@_alwaysEmitIntoClient disabled]
+  internal func _isScalarAligned(_ i: Int) -> Bool {
+    _internalInvariant(i >= 0)
+    return unsafe !_isUTF8Continuation(i)
+  }
+
+  // `_utf8ScalarLength` of the byte at `i`. [@_alwaysEmitIntoClient disabled]
+  internal func _scalarLength(startingAt i: Int) -> Int {
+    unsafe _utf8ScalarLength(_loadByte(i))
+  }
+
+  // NOTE: Adaptation of `_decodeScalar` to work on URP. Decodes the 1-4 byte
+  // scalar starting at `i`. [@_alwaysEmitIntoClient disabled]
+  internal func _decodeScalar(
+    startingAt i: Int
+  ) -> (Unicode.Scalar, nextScalarStart: Int) {
+    let cu0 = unsafe _loadByte(i)
+    let len = _utf8ScalarLength(cu0)
+    let next = len &+ i
+    switch  len {
+    case 1: return (_decodeUTF8(cu0), next)
+    case 2: return unsafe (_decodeUTF8(cu0, _loadByte(i &+ 1)), next)
+    case 3: return unsafe (
+      _decodeUTF8(cu0, _loadByte(i &+ 1), _loadByte(i &+ 2)), next
+    )
+    case 4:
+      return (
+        unsafe _decodeUTF8(
+          cu0, _loadByte(i &+ 1), _loadByte(i &+ 2), _loadByte(i &+ 3)
+        ),
+        next
+      )
+    default: Builtin.unreachable()
+    }
+  }
+
+  // Decodes the scalar that ends at `i`. [@_alwaysEmitIntoClient disabled]
+  internal func _decodeScalar(
+    endingAt i: Int
+  ) -> (Unicode.Scalar, previousScalarStart: Int) {
+    // TODO: no need to double load the bytes...
+    let start = unsafe _previousScalarStart(i)
+    return unsafe (_decodeScalar(startingAt: start).0, start)
+  }
+
+  // Walks back from `i` over continuation bytes. [@_alwaysEmitIntoClient disabled]
+  internal func _previousScalarStart(_ i: Int) -> Int {
+    var prev = i &- 1
+    _internalInvariant(prev >= 0)
+    while unsafe _isUTF8Continuation(prev) {
+      prev &-= 1
+      _internalInvariant(prev >= 0)
+    }
+    _internalInvariant(unsafe i == prev + _utf8ScalarLength(_loadByte(prev)))
+    return prev
+  }
+
+  // Rounds `i` down to a non-continuation byte. [@_alwaysEmitIntoClient disabled]
+  internal func _scalarAlign(_ i: Int) -> Int {
+    var i = i
+    while _slowPath(unsafe !_isScalarAligned(i)) {
+      i &-= 1
+    }
+    return i
+  }
+}
+
+extension UnsafeRawPointer {
+  // TODO: ASCII fast path wrappers around ufi functions
+
+  // TODO: hook up to real grapheme breaking
+  internal func _urbp(_ range: Range<Int>) -> UnsafeRawBufferPointer {
+    unsafe .init(start: self + range.lowerBound, count: range.count)
+  }
+
+  @_alwaysEmitIntoClient
+  internal func _ubp(_ range: Range<Int>) -> UnsafeBufferPointer<UInt8> {
+    unsafe UnsafeBufferPointer<UInt8>(
+      start: UnsafePointer((self+range.lowerBound)._rawValue),
+      count: range.count)
+  }
+
+  internal func _str(_ range: Range<Int>) -> String {
+    unsafe String(decoding: _urbp(range) , as: UTF8.self)
+  }
+
+  // Start of the `Character` after the one at `i`. [@usableFromInline disabled]
+  internal func _nextCharacterStart(
+    _ i: Int, limitedBy end: Int
+  ) -> Int {
+    _internalInvariant((0..<end).contains(i))
+    _internalInvariant(unsafe _isScalarAligned(i))
+    // The closure supplies scalars below `end`; `nil` presumably ends the scan.
+    return _nextGraphemeClusterBoundary(startingAt: i) { idx in
+      guard idx < end else { return nil }
+      let (scalar, end) = unsafe _decodeScalar(startingAt: idx)
+      return (scalar, end)
+    }
+  }
+
+  // Start of the `Character` that ends at `i`. [@usableFromInline disabled]
+  internal func _previousCharacterStart(
+    _ i: Int,
+    limitedBy end: Int
+  ) -> Int {
+    _internalInvariant(i > 0 && i <= end)
+    _internalInvariant(unsafe i == end || _isScalarAligned(i))
+    // The closure supplies scalars above offset 0; `nil` presumably ends the scan.
+    return _previousGraphemeClusterBoundary(endingAt: i) { idx in
+      guard idx > 0 else { return nil }
+      let (scalar, prior) = unsafe _decodeScalar(endingAt: idx)
+      return (scalar, prior)
+    }
+  }
+
+  // Decodes the `Character` starting at `i`. [@usableFromInline disabled]
+  internal func _decodeCharacter(
+    startingAt i: Int, limitedBy end: Int
+  ) -> (Character, nextCharacterStart: Int) {
+    let nextStart = unsafe _nextCharacterStart(i, limitedBy: end)
+    return unsafe (Character(_str(i..<nextStart)), nextStart)
+  }
+
+  // Decodes the `Character` ending at `i`. [@usableFromInline disabled]
+  internal func _decodeCharacter(
+    endingAt i: Int,
+    limitedBy end: Int
+  ) -> (Character, nextCharacterStart: Int) {
+    let start = unsafe _previousCharacterStart(i, limitedBy: end)
+    _internalInvariant(start >= 0)
+    // NOTE(review): `start` is returned under the `nextCharacterStart` label.
+    return unsafe (Character(_str(start..<i)), start)
+  }
+
+}
+
+@available(SwiftStdlib 6.2, *)
+extension UnsafeRawPointer {
+  internal enum _UTF8ValidationResult {
+    case success(isASCII: Bool)
+    case error(_: Range<Int>)
+  }
+
+  // Validates bytes `0..<end` as UTF-8. Returns isASCII; throws on failure.
+  // TODO: return more values
+  internal func _validateUTF8(
+    limitedBy end: Int
+  ) throws(UTF8.ValidationError) -> Bool {
+    switch unsafe validateUTF8(_ubp(0..<end)) {
+    case .success(let info):
+      return info.isASCII
+    case .error(let kind, let range):
+      throw UTF8.ValidationError(kind._publicKind, range)
+    }
+  }
+
+}
--- a/stdlib/public/core/UTF8SpanIterators.swift
+++ b/stdlib/public/core/UTF8SpanIterators.swift
@@ -0,0 +1,391 @@
+@available(SwiftStdlib 6.2, *)
+extension UTF8Span {
+  /// Returns an iterator that will decode the code units into
+  /// `Unicode.Scalar`s.
+  ///
+  /// The resulting iterator has the same lifetime constraints as `self`.
+  @lifetime(copy self)
+  public func makeUnicodeScalarIterator() -> UnicodeScalarIterator {
+    .init(self)
+  }
+
+  /// Iterates over the `Unicode.Scalar` contents of a `UTF8Span`.
+  ///
+  /// **TODO**: Examples
+  @frozen
+  public struct UnicodeScalarIterator: ~Escapable {
+    public let codeUnits: UTF8Span
+
+    /// The byte offset of the start of the next scalar. This is
+    /// always scalar-aligned.
+    fileprivate(set)
+    public var currentCodeUnitOffset: Int
+
+    @lifetime(copy codeUnits)
+    public init(_ codeUnits: UTF8Span) {
+      self.codeUnits = codeUnits
+      self.currentCodeUnitOffset = 0
+    }
+
+    private var _start: UnsafeRawPointer {
+      unsafe codeUnits._start()
+    }
+
+    /// Decode and return the scalar starting at `currentCodeUnitOffset`.
+    /// After the function returns, `currentCodeUnitOffset` holds the
+    /// position at the end of the returned scalar, which is also the start
+    /// of the next scalar.
+    ///
+    /// Returns `nil` if at the end of the `UTF8Span`.
+    @lifetime(self: copy self)
+    public mutating func next() -> Unicode.Scalar? {
+      guard currentCodeUnitOffset < codeUnits.count else {
+        return nil
+      }
+
+      _internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))
+      let (result, newPos) = unsafe _start._decodeScalar(startingAt: currentCodeUnitOffset)
+      self.currentCodeUnitOffset = newPos
+      return result
+    }
+
+    /// Decode and return the scalar ending at `currentCodeUnitOffset`. After
+    /// the function returns, `currentCodeUnitOffset` holds the position at
+    /// the start of the returned scalar, which is also the end of the
+    /// previous scalar.
+    ///
+    /// Returns `nil` if at the start of the `UTF8Span`.
+    @lifetime(self: copy self)
+    public mutating func previous() -> Unicode.Scalar? {
+      guard currentCodeUnitOffset > 0 else {
+        return nil
+      }
+
+      _internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))
+      let (result, newPos) = unsafe _start._decodeScalar(endingAt: currentCodeUnitOffset)
+      self.currentCodeUnitOffset = newPos
+      return result
+    }
+
+
+    /// Advance `currentCodeUnitOffset` to the end of the current scalar,
+    /// without decoding it.
+    ///
+    /// Returns the number of `Unicode.Scalar`s skipped over, which can be 0
+    /// if at the end of the UTF8Span.
+    @lifetime(self: copy self)
+    public mutating func skipForward() -> Int {
+      guard currentCodeUnitOffset < codeUnits.count else {
+        return 0
+      }
+
+      _internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))
+
+      currentCodeUnitOffset &+= unsafe _start._scalarLength(startingAt: currentCodeUnitOffset)
+      return 1
+    }
+
+    /// Advance `currentCodeUnitOffset` to the end of `n` scalars, without
+    /// decoding them.
+    ///
+    /// Returns the number of `Unicode.Scalar`s skipped over, which can be
+    /// fewer than `n` if at the end of the UTF8Span.
+    @lifetime(self: copy self)
+    public mutating func skipForward(by n: Int) -> Int {
+      var numSkipped = 0
+      while numSkipped < n && skipForward() != 0 {
+        numSkipped += 1
+      }
+
+      return numSkipped
+    }
+
+    /// Move `currentCodeUnitOffset` to the start of the previous scalar,
+    /// without decoding it.
+    ///
+    /// Returns the number of `Unicode.Scalar`s skipped over, which can be 0
+    /// if at the start of the UTF8Span.
+    @lifetime(self: copy self)
+    public mutating func skipBack() -> Int {
+      guard currentCodeUnitOffset > 0 else {
+        return 0
+      }
+
+      _internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))
+
+      currentCodeUnitOffset = unsafe _start._previousScalarStart(currentCodeUnitOffset)
+      return 1
+    }
+
+    /// Move `currentCodeUnitOffset` to the start of the previous `n`
+    /// scalars, without decoding them.
+    ///
+    /// Returns the number of `Unicode.Scalar`s skipped over, which can be
+    /// fewer than `n` if at the start of the UTF8Span.
+    @lifetime(self: copy self)
+    public mutating func skipBack(by n: Int) -> Int {
+      var numSkipped = 0
+      while numSkipped < n && skipBack() != 0 {
+        numSkipped += 1
+      }
+
+      return numSkipped
+    }
+
+    /// Reset to the nearest scalar-aligned code unit offset `<= i`.
+    ///
+    /// **TODO**: Example
+    @lifetime(self: copy self)
+    public mutating func reset(roundingBackwardsFrom i: Int)  {
+      self.currentCodeUnitOffset = codeUnits._scalarAlignBackwards(i)
+    }
+
+    /// Reset to the nearest scalar-aligned code unit offset `>= i`.
+    ///
+    /// **TODO**: Example
+    @lifetime(self: copy self)
+    public mutating func reset(roundingForwardsFrom i: Int)  {
+      self.currentCodeUnitOffset = codeUnits._scalarAlignForwards(i)
+    }
+
+    /// Reset this iterator to `codeUnitOffset`, skipping _all_ safety
+    /// checks (including bounds checks).
+    ///
+    /// Note: This is only for very specific, low-level use cases. If
+    /// `codeUnitOffset` is not properly scalar-aligned, this function can
+    /// result in undefined behavior when, e.g., `next()` is called.
+    ///
+    /// TODO: verify that we're not UB, just garbage data or a guaranteed
+    ///       trap!
+    ///
+    /// For example, this could be used by a regex engine to backtrack to a
+    /// known-valid previous position.
+    ///
+    @unsafe
+    @lifetime(self: copy self)
+    public mutating func reset(toUnchecked codeUnitOffset: Int) {
+      _internalInvariant(codeUnits._isScalarAligned(unchecked: codeUnitOffset))
+      self.currentCodeUnitOffset = codeUnitOffset
+    }
+
+    /// Returns the UTF8Span containing all the content up to the iterator's
+    /// current position.
+    ///
+    /// The resultant `UTF8Span` has the same lifetime constraints as `self`.
+    @lifetime(copy self)
+    public func prefix() -> UTF8Span {
+      let slice = codeUnits.span._extracting(0..<currentCodeUnitOffset)
+      return UTF8Span(
+        _uncheckedAssumingValidUTF8: slice,
+        isKnownASCII: codeUnits.isKnownASCII,
+        isKnownNFC: codeUnits.isKnownNFC)
+    }
+
+    /// Returns the UTF8Span containing all the content after the iterator's
+    /// current position.
+    ///
+    /// The resultant `UTF8Span` has the same lifetime constraints as `self`.
+    @lifetime(copy self)
+    public func suffix() -> UTF8Span {
+      let slice = codeUnits.span._extracting(currentCodeUnitOffset..<codeUnits.count)
+      return UTF8Span(
+        _uncheckedAssumingValidUTF8: slice,
+        isKnownASCII: codeUnits.isKnownASCII,
+        isKnownNFC: codeUnits.isKnownNFC)
+    }
+  }
+}
+
+@available(SwiftStdlib 6.2, *)
+@_unavailableInEmbedded
+extension UTF8Span {
+  /// Returns an iterator that will construct `Character`s from the underlying
+  /// UTF-8 content.
+  ///
+  /// The resulting iterator has the same lifetime constraints as `self`.
+  @lifetime(copy self)
+  public func makeCharacterIterator() -> CharacterIterator {
+    .init(self)
+  }
+
+  /// Iterates over the `Character` contents of a `UTF8Span`.
+  ///
+  /// **TODO**: Examples
+  public struct CharacterIterator: ~Escapable {
+    public let codeUnits: UTF8Span
+
+    /// The byte offset of the start of the next `Character`. This is always
+    /// scalar-aligned. It is always `Character`-aligned relative to the last
+    /// call to `reset` (or the start of the span if not called).
+    fileprivate(set)
+    public var currentCodeUnitOffset: Int
+
+    @lifetime(copy codeUnits)
+    public init(_ codeUnits: UTF8Span) {
+      self.codeUnits = codeUnits
+      self.currentCodeUnitOffset = 0
+    }
+
+    private var _start: UnsafeRawPointer {
+      unsafe codeUnits._start()
+    }
+
+    /// Return the `Character` starting at `currentCodeUnitOffset`. After the
+    /// function returns, `currentCodeUnitOffset` holds the position at the
+    /// end of the `Character`, which is also the start of the next
+    /// `Character`.
+    ///
+    /// Returns `nil` if at the end of the `UTF8Span`.
+    @lifetime(self: copy self)
+    public mutating func next() -> Character? {
+      guard currentCodeUnitOffset < codeUnits.count else { return nil }
+
+      _internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))
+      let (result, newPos) = unsafe _start._decodeCharacter(
+        startingAt: currentCodeUnitOffset,
+        limitedBy: codeUnits.count
+      )
+      self.currentCodeUnitOffset = newPos
+      return result
+    }
+
+    /// Return the `Character` ending at `currentCodeUnitOffset`. After the
+    /// function returns, `currentCodeUnitOffset` holds the position at the
+    /// start of the returned `Character`, which is also the end of the
+    /// previous `Character`.
+    ///
+    /// Returns `nil` if at the start of the `UTF8Span`.
+    @lifetime(self: copy self)
+    public mutating func previous() -> Character? {
+      guard currentCodeUnitOffset > 0 else { return nil }
+
+      _internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))
+      let (result, newPos) = unsafe _start._decodeCharacter(
+        endingAt: currentCodeUnitOffset,
+        limitedBy: codeUnits.count)
+      self.currentCodeUnitOffset = newPos
+      return result
+    }
+
+    /// Advance `currentCodeUnitOffset` to the end of the current
+    /// `Character`, without constructing it.
+    ///
+    /// Returns the number of `Character`s skipped over, which can be 0
+    /// if at the end of the UTF8Span.
+    @lifetime(self: copy self)
+    public mutating func skipForward() -> Int {
+      guard currentCodeUnitOffset < codeUnits.count else {
+        return 0
+      }
+
+      _internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))
+
+      self.currentCodeUnitOffset = unsafe _start._nextCharacterStart(currentCodeUnitOffset, limitedBy: codeUnits.count)
+      return 1
+    }
+
+    /// Advance `currentCodeUnitOffset` to the end of `n` `Character`s,
+    /// without constructing them.
+    ///
+    /// Returns the number of `Character`s skipped over, which can be
+    /// fewer than `n` if at the end of the UTF8Span.
+    @lifetime(self: copy self)
+    public mutating func skipForward(by n: Int) -> Int {
+      var numSkipped = 0
+      while numSkipped < n && skipForward() != 0 {
+        numSkipped += 1
+      }
+
+      return numSkipped
+    }
+
+    /// Move `currentCodeUnitOffset` to the start of the previous
+    /// `Character`, without constructing it.
+    ///
+    /// Returns the number of `Character`s skipped over, which can be 0
+    /// if at the start of the UTF8Span.
+    @lifetime(self: copy self)
+    public mutating func skipBack() -> Int {
+      guard currentCodeUnitOffset > 0 else {
+        return 0
+      }
+
+      _internalInvariant(codeUnits._isScalarAligned(unchecked: currentCodeUnitOffset))
+
+      currentCodeUnitOffset = unsafe _start._previousCharacterStart(currentCodeUnitOffset, limitedBy: codeUnits.count)
+      return 1
+
+    }
+
+    /// Move `currentCodeUnitOffset` to the start of the previous `n`
+    /// `Character`s, without constructing them.
+    ///
+    /// Returns the number of `Character`s skipped over, which can be
+    /// fewer than `n` if at the start of the UTF8Span.
+    @lifetime(self: copy self)
+    public mutating func skipBack(by n: Int) -> Int {
+      var numSkipped = 0
+      while numSkipped < n && skipBack() != 0 {
+        numSkipped += 1
+      }
+
+      return numSkipped
+    }
+
+    /// Reset to the nearest scalar-aligned (not grapheme-aligned) offset `<= i`.
+    @lifetime(self: copy self)
+    public mutating func reset(roundingBackwardsFrom i: Int) {
+      self.currentCodeUnitOffset = codeUnits._scalarAlignBackwards(i)
+    }
+
+    /// Reset to the nearest scalar-aligned (not grapheme-aligned) offset `>= i`.
+    @lifetime(self: copy self)
+    public mutating func reset(roundingForwardsFrom i: Int) {
+      self.currentCodeUnitOffset = codeUnits._scalarAlignForwards(i)
+    }
+
+    /// Reset this iterator to `codeUnitOffset`, skipping _all_ safety
+    /// checks.
+    ///
+    /// Note: This is only for very specific, low-level use cases. If
+    /// `codeUnitOffset` is not properly scalar-aligned, this function can
+    /// result in undefined behavior when, e.g., `next()` is called.
+    ///
+    /// If `codeUnitOffset` is scalar-aligned, but not `Character`-aligned, you
+    /// may get different results from running `Character` iteration.
+    ///
+    /// For example, this could be used by a regex engine to backtrack to a
+    /// known-valid previous position.
+    ///
+    @unsafe
+    @lifetime(self: copy self)
+    public mutating func reset(toUnchecked codeUnitOffset: Int) {
+      _internalInvariant(codeUnits._isScalarAligned(unchecked: codeUnitOffset))
+      self.currentCodeUnitOffset = codeUnitOffset
+    }
+
+    /// Returns the UTF8Span containing all the content up to the iterator's
+    /// current position.
+    @lifetime(copy self)
+    public func prefix() -> UTF8Span {
+      let slice = codeUnits.span._extracting(0..<currentCodeUnitOffset)
+      return UTF8Span(
+        _uncheckedAssumingValidUTF8: slice,
+        isKnownASCII: codeUnits.isKnownASCII,
+        isKnownNFC: codeUnits.isKnownNFC)
+    }
+
+    /// Returns the UTF8Span containing all the content after the iterator's
+    /// current position.
+    @lifetime(copy self)
+    public func suffix() -> UTF8Span {
+      let slice = codeUnits.span._extracting(currentCodeUnitOffset..<codeUnits.count)
+      return UTF8Span(
+        _uncheckedAssumingValidUTF8: slice,
+        isKnownASCII: codeUnits.isKnownASCII,
+        isKnownNFC: codeUnits.isKnownNFC)
+    }
+  }
+}
+
+
--- a/test/Misc/serialized-diagnostics-prettyprint.swift
+++ b/test/Misc/serialized-diagnostics-prettyprint.swift
@@ -5,8 +5,8 @@
 // RUN: c-index-test -read-diagnostics %t.dia > %t.deserialized_diagnostics.txt 2>&1
 // RUN: %FileCheck --input-file=%t.deserialized_diagnostics.txt %s

-var x = String.init // expected-error{{ambiguous use of 'init'}}
-// CHECK: {{.*[/\\]}}serialized-diagnostics-prettyprint.swift:[[@LINE-1]]:16: error: ambiguous use of 'init'
+var x = String.init(_:) // expected-error{{ambiguous use of 'init(_:)'}}
+// CHECK: {{.*[/\\]}}serialized-diagnostics-prettyprint.swift:[[@LINE-1]]:16: error: ambiguous use of 'init(_:)'

 // CHECK: Swift.String.init:2:19: note: found this candidate
 // CHECK: CONTENTS OF FILE Swift.String.init:
--- a/test/abi/macOS/arm64/stdlib.swift
+++ b/test/abi/macOS/arm64/stdlib.swift
@@ -814,6 +814,133 @@ Added: _$ss7RawSpanVMa
 Added: _$ss7RawSpanVMn
 Added: _$ss7RawSpanVN

+// SE-0464 UTF8Span
+Added: _$sSS7copyingSSs8UTF8SpanV_tcfC
+Added: _$sSS8utf8Spans04UTF8B0Vvg
+Added: _$sSS8utf8Spans04UTF8B0VvpMV
+Added: _$sSs8utf8Spans04UTF8B0Vvg
+Added: _$sSs8utf8Spans04UTF8B0VvpMV
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11byteOffsetsSnySiGvM
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11byteOffsetsSnySiGvg
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11byteOffsetsSnySiGvpMV
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11byteOffsetsSnySiGvs
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11descriptionSSvg
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11descriptionSSvpMV
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV2eeoiySbAF_AFtFZ
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV11descriptionSSvg
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV11descriptionSSvpMV
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV15truncatedScalarAHvpZMV
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV20overlongEncodingByteAHvpZMV
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV22surrogateCodePointByteAHvpZMV
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV26unexpectedContinuationByteAHvpZMV
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV32invalidNonSurrogateCodePointByteAHvpZMV
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValueAHSgs5UInt8V_tcfC
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValues5UInt8VvM
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValues5UInt8Vvg
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValues5UInt8VvpMV
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValues5UInt8Vvs
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVMa
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVMn
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVN
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSHsMc
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSHsWP
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSQsMc
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSQsWP
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSYsMc
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSYsWP
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVs0D0sMc
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVs0D0sWP
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVs23CustomStringConvertiblesMc
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVs23CustomStringConvertiblesWP
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4hash4intoys6HasherVz_tF
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4kindAF4KindVvM
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4kindAF4KindVvg
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4kindAF4KindVvpMV
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4kindAF4KindVvs
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV9hashValueSivg
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV9hashValueSivpMV
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorVMa
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorVMn
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorVN
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorVSHsMc
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorVSHsWP
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorVSQsMc
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorVSQsWP
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorVs0D0sMc
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorVs0D0sWP
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorVs23CustomStringConvertiblesMc
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorVs23CustomStringConvertiblesWP
+Added: _$ss7UnicodeO4UTF8O15_checkAllErrorsySayAD15ValidationErrorVGxSTRzs5UInt8V7ElementRtzlFZ
+Added: _$ss8UTF8SpanV9unchecked12isKnownASCIIABs0B0Vys5UInt8VG_SbtcfC
+Added: _$ss8UTF8SpanV10_countMasks6UInt64VvpZMV
+Added: _$ss8UTF8SpanV10_flagsMasks6UInt64VvpZMV
+Added: _$ss8UTF8SpanV10isKnownNFCSbvpMV
+Added: _$ss8UTF8SpanV10validatingABs0B0Vys5UInt8VG_ts7UnicodeO0A0O15ValidationErrorVYKcfC
+Added: _$ss8UTF8SpanV11checkForNFC10quickCheckS2b_tF
+Added: _$ss8UTF8SpanV12isKnownASCIISbvpMV
+Added: _$ss8UTF8SpanV13checkForASCIISbyF
+Added: _$ss8UTF8SpanV14_countAndFlagss6UInt64VvM
+Added: _$ss8UTF8SpanV14_countAndFlagss6UInt64Vvg
+Added: _$ss8UTF8SpanV14_countAndFlagss6UInt64VvpMV
+Added: _$ss8UTF8SpanV14_countAndFlagss6UInt64Vvs
+Added: _$ss8UTF8SpanV17CharacterIteratorV11skipForward2byS2i_tF
+Added: _$ss8UTF8SpanV17CharacterIteratorV11skipForwardSiyF
+Added: _$ss8UTF8SpanV17CharacterIteratorV21currentCodeUnitOffsetSivg
+Added: _$ss8UTF8SpanV17CharacterIteratorV21currentCodeUnitOffsetSivpMV
+Added: _$ss8UTF8SpanV17CharacterIteratorV4nextSJSgyF
+Added: _$ss8UTF8SpanV17CharacterIteratorV5reset20roundingForwardsFromySi_tF
+Added: _$ss8UTF8SpanV17CharacterIteratorV5reset21roundingBackwardsFromySi_tF
+Added: _$ss8UTF8SpanV17CharacterIteratorV5reset11toUncheckedySi_tF
+Added: _$ss8UTF8SpanV17CharacterIteratorV6prefixAByF
+Added: _$ss8UTF8SpanV17CharacterIteratorV6suffixAByF
+Added: _$ss8UTF8SpanV17CharacterIteratorV8previousSJSgyF
+Added: _$ss8UTF8SpanV17CharacterIteratorV8skipBack2byS2i_tF
+Added: _$ss8UTF8SpanV17CharacterIteratorV8skipBackSiyF
+Added: _$ss8UTF8SpanV17CharacterIteratorV9codeUnitsABvg
+Added: _$ss8UTF8SpanV17CharacterIteratorV9codeUnitsABvpMV
+Added: _$ss8UTF8SpanV17CharacterIteratorVMa
+Added: _$ss8UTF8SpanV17CharacterIteratorVMn
+Added: _$ss8UTF8SpanV17CharacterIteratorVN
+Added: _$ss8UTF8SpanV17CharacterIteratorVyAdBcfC
+Added: _$ss8UTF8SpanV18_unsafeBaseAddressSVSgvM
+Added: _$ss8UTF8SpanV18_unsafeBaseAddressSVSgvg
+Added: _$ss8UTF8SpanV18_unsafeBaseAddressSVSgvpMV
+Added: _$ss8UTF8SpanV18_unsafeBaseAddressSVSgvs
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV11skipForward2byS2i_tF
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV11skipForwardSiyF
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV21currentCodeUnitOffsetSivg
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV21currentCodeUnitOffsetSivpMV
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV4nexts0C0O0D0VSgyF
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV5reset20roundingForwardsFromySi_tF
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV5reset21roundingBackwardsFromySi_tF
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV5reset11toUncheckedySi_tF
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV6prefixAByF
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV6suffixAByF
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV8previouss0C0O0D0VSgyF
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV8skipBack2byS2i_tF
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV8skipBackSiyF
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV9codeUnitsABvg
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV9codeUnitsABvpMV
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorVMa
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorVMn
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorVN
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorVyAdBcfC
+Added: _$ss8UTF8SpanV21isCanonicallyLessThanySbABF
+Added: _$ss8UTF8SpanV21makeCharacterIteratorAB0dE0VyF
+Added: _$ss8UTF8SpanV23isCanonicallyEquivalent2toSbAB_tF
+Added: _$ss8UTF8SpanV25makeUnicodeScalarIteratorAB0deF0VyF
+Added: _$ss8UTF8SpanV4spans0B0Vys5UInt8VGvg
+Added: _$ss8UTF8SpanV4spans0B0Vys5UInt8VGvpMV
+Added: _$ss8UTF8SpanV5countSivpMV
+Added: _$ss8UTF8SpanV7_nfcBits6UInt64VvpZMV
+Added: _$ss8UTF8SpanV7isEmptySbvg
+Added: _$ss8UTF8SpanV7isEmptySbvpMV
+Added: _$ss8UTF8SpanV9_asciiBits6UInt64VvpZMV
+Added: _$ss8UTF8SpanVMa
+Added: _$ss8UTF8SpanVMn
+Added: _$ss8UTF8SpanVN
+
+
 // SE-0467 MutableSpan and MutableRawSpan
 Added: _$ss11MutableSpanVMa
 Added: _$ss11MutableSpanVMn
--- a/test/abi/macOS/x86_64/stdlib.swift
+++ b/test/abi/macOS/x86_64/stdlib.swift
@@ -815,6 +815,133 @@ Added: _$ss7RawSpanVMa
 Added: _$ss7RawSpanVMn
 Added: _$ss7RawSpanVN

+// SE-0464 UTF8Span
+Added: _$sSS7copyingSSs8UTF8SpanV_tcfC
+Added: _$sSS8utf8Spans04UTF8B0Vvg
+Added: _$sSS8utf8Spans04UTF8B0VvpMV
+Added: _$sSs8utf8Spans04UTF8B0Vvg
+Added: _$sSs8utf8Spans04UTF8B0VvpMV
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11byteOffsetsSnySiGvM
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11byteOffsetsSnySiGvg
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11byteOffsetsSnySiGvpMV
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11byteOffsetsSnySiGvs
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11descriptionSSvg
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV11descriptionSSvpMV
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV2eeoiySbAF_AFtFZ
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV11descriptionSSvg
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV11descriptionSSvpMV
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV15truncatedScalarAHvpZMV
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV20overlongEncodingByteAHvpZMV
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV22surrogateCodePointByteAHvpZMV
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV26unexpectedContinuationByteAHvpZMV
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV32invalidNonSurrogateCodePointByteAHvpZMV
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValueAHSgs5UInt8V_tcfC
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValues5UInt8VvM
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValues5UInt8Vvg
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValues5UInt8VvpMV
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindV8rawValues5UInt8Vvs
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVMa
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVMn
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVN
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSHsMc
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSHsWP
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSQsMc
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSQsWP
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSYsMc
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVSYsWP
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVs0D0sMc
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVs0D0sWP
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVs23CustomStringConvertiblesMc
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4KindVs23CustomStringConvertiblesWP
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4hash4intoys6HasherVz_tF
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4kindAF4KindVvM
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4kindAF4KindVvg
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4kindAF4KindVvpMV
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV4kindAF4KindVvs
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV9hashValueSivg
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorV9hashValueSivpMV
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorVMa
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorVMn
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorVN
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorVSHsMc
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorVSHsWP
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorVSQsMc
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorVSQsWP
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorVs0D0sMc
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorVs0D0sWP
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorVs23CustomStringConvertiblesMc
+Added: _$ss7UnicodeO4UTF8O15ValidationErrorVs23CustomStringConvertiblesWP
+Added: _$ss7UnicodeO4UTF8O15_checkAllErrorsySayAD15ValidationErrorVGxSTRzs5UInt8V7ElementRtzlFZ
+Added: _$ss8UTF8SpanV9unchecked12isKnownASCIIABs0B0Vys5UInt8VG_SbtcfC
+Added: _$ss8UTF8SpanV10_countMasks6UInt64VvpZMV
+Added: _$ss8UTF8SpanV10_flagsMasks6UInt64VvpZMV
+Added: _$ss8UTF8SpanV10isKnownNFCSbvpMV
+Added: _$ss8UTF8SpanV10validatingABs0B0Vys5UInt8VG_ts7UnicodeO0A0O15ValidationErrorVYKcfC
+Added: _$ss8UTF8SpanV11checkForNFC10quickCheckS2b_tF
+Added: _$ss8UTF8SpanV12isKnownASCIISbvpMV
+Added: _$ss8UTF8SpanV13checkForASCIISbyF
+Added: _$ss8UTF8SpanV14_countAndFlagss6UInt64VvM
+Added: _$ss8UTF8SpanV14_countAndFlagss6UInt64Vvg
+Added: _$ss8UTF8SpanV14_countAndFlagss6UInt64VvpMV
+Added: _$ss8UTF8SpanV14_countAndFlagss6UInt64Vvs
+Added: _$ss8UTF8SpanV17CharacterIteratorV11skipForward2byS2i_tF
+Added: _$ss8UTF8SpanV17CharacterIteratorV11skipForwardSiyF
+Added: _$ss8UTF8SpanV17CharacterIteratorV21currentCodeUnitOffsetSivg
+Added: _$ss8UTF8SpanV17CharacterIteratorV21currentCodeUnitOffsetSivpMV
+Added: _$ss8UTF8SpanV17CharacterIteratorV4nextSJSgyF
+Added: _$ss8UTF8SpanV17CharacterIteratorV5reset20roundingForwardsFromySi_tF
+Added: _$ss8UTF8SpanV17CharacterIteratorV5reset21roundingBackwardsFromySi_tF
+Added: _$ss8UTF8SpanV17CharacterIteratorV5reset11toUncheckedySi_tF
+Added: _$ss8UTF8SpanV17CharacterIteratorV6prefixAByF
+Added: _$ss8UTF8SpanV17CharacterIteratorV6suffixAByF
+Added: _$ss8UTF8SpanV17CharacterIteratorV8previousSJSgyF
+Added: _$ss8UTF8SpanV17CharacterIteratorV8skipBack2byS2i_tF
+Added: _$ss8UTF8SpanV17CharacterIteratorV8skipBackSiyF
+Added: _$ss8UTF8SpanV17CharacterIteratorV9codeUnitsABvg
+Added: _$ss8UTF8SpanV17CharacterIteratorV9codeUnitsABvpMV
+Added: _$ss8UTF8SpanV17CharacterIteratorVMa
+Added: _$ss8UTF8SpanV17CharacterIteratorVMn
+Added: _$ss8UTF8SpanV17CharacterIteratorVN
+Added: _$ss8UTF8SpanV17CharacterIteratorVyAdBcfC
+Added: _$ss8UTF8SpanV18_unsafeBaseAddressSVSgvM
+Added: _$ss8UTF8SpanV18_unsafeBaseAddressSVSgvg
+Added: _$ss8UTF8SpanV18_unsafeBaseAddressSVSgvpMV
+Added: _$ss8UTF8SpanV18_unsafeBaseAddressSVSgvs
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV11skipForward2byS2i_tF
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV11skipForwardSiyF
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV21currentCodeUnitOffsetSivg
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV21currentCodeUnitOffsetSivpMV
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV4nexts0C0O0D0VSgyF
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV5reset20roundingForwardsFromySi_tF
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV5reset21roundingBackwardsFromySi_tF
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV5reset11toUncheckedySi_tF
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV6prefixAByF
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV6suffixAByF
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV8previouss0C0O0D0VSgyF
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV8skipBack2byS2i_tF
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV8skipBackSiyF
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV9codeUnitsABvg
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorV9codeUnitsABvpMV
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorVMa
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorVMn
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorVN
+Added: _$ss8UTF8SpanV21UnicodeScalarIteratorVyAdBcfC
+Added: _$ss8UTF8SpanV21isCanonicallyLessThanySbABF
+Added: _$ss8UTF8SpanV21makeCharacterIteratorAB0dE0VyF
+Added: _$ss8UTF8SpanV23isCanonicallyEquivalent2toSbAB_tF
+Added: _$ss8UTF8SpanV25makeUnicodeScalarIteratorAB0deF0VyF
+Added: _$ss8UTF8SpanV4spans0B0Vys5UInt8VGvg
+Added: _$ss8UTF8SpanV4spans0B0Vys5UInt8VGvpMV
+Added: _$ss8UTF8SpanV5countSivpMV
+Added: _$ss8UTF8SpanV7_nfcBits6UInt64VvpZMV
+Added: _$ss8UTF8SpanV7isEmptySbvg
+Added: _$ss8UTF8SpanV7isEmptySbvpMV
+Added: _$ss8UTF8SpanV9_asciiBits6UInt64VvpZMV
+Added: _$ss8UTF8SpanVMa
+Added: _$ss8UTF8SpanVMn
+Added: _$ss8UTF8SpanVN
+
+
 // SE-0467 MutableSpan and MutableRawSpan
 Added: _$ss11MutableSpanVMa
 Added: _$ss11MutableSpanVMn
--- a/test/stdlib/UTF8EncodingErrorTests.swift
+++ b/test/stdlib/UTF8EncodingErrorTests.swift
@@ -0,0 +1,295 @@
+// RUN: %target-run-stdlib-swift %S/Inputs/
+
+// REQUIRES: executable_test
+
+// FIXME: this test is currently broken
+
+import Swift
+import StdlibUnittest
+
+var suite = TestSuite("UTF8.ValidationError")
+defer { runAllTests() }
+
+@available(SwiftStdlib 6.2, *)
+extension Array {
+  /// Calls `f` with a `Span` built from the buffer that
+  /// `withUnsafeBufferPointer` provides, returning `f`'s result and
+  /// rethrowing any error it throws.
+  func withSpan<R>(_ f: (Span<Element>) throws -> R) rethrows -> R {
+    try withUnsafeBufferPointer { buffer in
+      try f(Span(_unsafeElements: buffer))
+    }
+  }
+}
+
+
+extension Range<Int> {
+  /// Returns this range with both bounds shifted up by `start`.
+  func _offset(by start: Int) -> Range<Int> {
+    let shiftedLower = start + lowerBound
+    let shiftedUpper = start + upperBound
+    return shiftedLower..<shiftedUpper
+  }
+}
+
+@available(SwiftStdlib 6.2, *)
+private struct ValidationError {
+  // The underlying standard-library error this entry expects.
+  var error: UTF8.ValidationError
+
+  // Whether this entry marks the start of an error. When fetching all
+  // errors, we'll get the error kind given. When slicing in order to get
+  // the next error (e.g. `UTF8Span.init(validating:)`), entries with
+  // `errorStart == false` are expected as `.unexpectedContinuationByte`.
+  var errorStart: Bool
+
+
+  init(
+    _ error: UTF8.ValidationError,
+    errorStart: Bool
+  ) {
+    self.error = error
+    self.errorStart = errorStart
+  }
+
+  // Expected `.unexpectedContinuationByte` at byte offset `i`.
+  public static func unexpectedContinuationByte(
+    at i: Int, errorStart: Bool = true
+  ) -> Self {
+    let underlying = UTF8.ValidationError(.unexpectedContinuationByte, at: i)
+    return Self(underlying, errorStart: errorStart)
+  }
+
+  // Expected `.surrogateCodePointByte` at byte offset `i`.
+  public static func surrogateCodePointByte(
+    at i: Int, errorStart: Bool = true
+  ) -> Self {
+    let underlying = UTF8.ValidationError(.surrogateCodePointByte, at: i)
+    return Self(underlying, errorStart: errorStart)
+  }
+
+  // Expected `.invalidNonSurrogateCodePointByte` at byte offset `i`.
+  public static func invalidNonSurrogateCodePointByte(
+    at i: Int, errorStart: Bool = true
+  ) -> Self {
+    let underlying = UTF8.ValidationError(
+      .invalidNonSurrogateCodePointByte, at: i)
+    return Self(underlying, errorStart: errorStart)
+  }
+
+  // Expected `.overlongEncodingByte` at byte offset `i`.
+  public static func overlongEncodingByte(
+    at i: Int, errorStart: Bool = true
+  ) -> Self {
+    let underlying = UTF8.ValidationError(.overlongEncodingByte, at: i)
+    return Self(underlying, errorStart: errorStart)
+  }
+
+  // Expected `.truncatedScalar` covering the byte offsets in `range`.
+  public static func truncatedScalar(
+    _ range: Range<Int>, errorStart: Bool = true
+  ) -> Self {
+    let underlying = UTF8.ValidationError(.truncatedScalar, range)
+    return Self(underlying, errorStart: errorStart)
+  }
+}
+
+
+@available(SwiftStdlib 6.2, *)
+private struct ValidationTestCase {
+  // The raw input handed to validation.
+  var bytes: [UInt8]
+
+  // The expected errors, in input order. When fetching all errors, we'll
+  // get the error kind given. When slicing in order to get the next error
+  // (e.g. `UTF8Span.init(validating:)`), entries that are not an error
+  // start are expected as `.unexpectedContinuationByte`
+  // (see `fetchError(at:wasSliced:)`).
+  var errors: [ValidationError]
+
+  // Where the test case was declared; prepended to failure stack traces.
+  var loc: SourceLocStack
+
+  init(
+    _ bytes: [UInt8],
+    file: String = #file,
+    line: UInt = #line,
+    _ errors: [ValidationError]
+  ) {
+    self.bytes = bytes
+    self.errors = errors
+    self.loc = SourceLocStack(SourceLoc(file, line))
+  }
+
+  /// Returns the `i`th expected error. For sliced validation
+  /// (`wasSliced == true`), an entry that is not an error start is
+  /// re-classified as `.unexpectedContinuationByte` over the same offsets.
+  func fetchError(
+    at i: Int, wasSliced: Bool
+  ) -> UTF8.ValidationError {
+    let expected = errors[i]
+    guard wasSliced, !expected.errorStart else {
+      return expected.error
+    }
+    return .init(.unexpectedContinuationByte, expected.error.byteOffsets)
+  }
+
+  /// `expectEqual`, with this test case's location added to the trace.
+  func expect<T: Equatable>(
+    _ lhs: T,
+    _ rhs: T,
+    file: String = #file,
+    line: UInt = #line
+  ) {
+    let trace = loc.withCurrentLoc(file: file, line: line)
+    expectEqual(lhs, rhs, stackTrace: trace)
+  }
+
+  /// Records an unconditional failure with this test case's location
+  /// added to the trace.
+  func fail(
+    _ message: String,
+    file: String = #file,
+    line: UInt = #line
+  ) {
+    let trace = loc.with(SourceLoc(file, line))
+    expectationFailure(message, trace: "", stackTrace: trace)
+  }
+
+  /// Test UTF8._checkAllErrors(), which matches directly against
+  /// the provided expected-errors.
+  func testAllErrors() {
+    let caughtErrors = Array(UTF8._checkAllErrors(bytes))
+    // Compare pairwise over the common prefix, then compare the counts.
+    for (i, caught) in zip(errors.indices, caughtErrors) {
+      expect(fetchError(at: i, wasSliced: false), caught)
+    }
+    expect(caughtErrors.count, errors.count)
+  }
+
+  /// Test UTF8Span validation. Surface subsequent errors by slicing the
+  /// input (which will convert the error-kind to .unexpectedContinuationByte)
+  func testSpanSlicedErrors() {
+    bytes.withSpan { span in
+      guard !errors.isEmpty else {
+        // No errors expected
+        do throws(UTF8.ValidationError) {
+          _ = try UTF8Span(validating: span)
+        } catch {
+          fail("Unexpected error: \(error)")
+        }
+        return
+      }
+
+      // Check every error, by slicing (which will change error classification
+      // of continuation bytes in multi-byte errors to
+      // .unexpectedContinuationByte)
+      var sliceStart = 0
+      var nextExpected = errors.startIndex
+      while true {
+        do throws(UTF8.ValidationError) {
+          _ = try UTF8Span(validating: span._extracting(sliceStart...))
+
+          // The remaining slice validated: every expected error must
+          // already have been seen.
+          if nextExpected != errors.endIndex {
+            fail("Expected a thrown UTF-8 encoding error")
+          }
+          break
+        } catch {
+          guard nextExpected < errors.endIndex else {
+            fail("Found unexpected subsequent error \(error)")
+            break
+          }
+
+          let expectedError = fetchError(at: nextExpected, wasSliced: true)
+
+          // The thrown offsets are relative to the slice; shift them back
+          // into whole-input coordinates before comparing.
+          let shiftedOffsets = error.byteOffsets._offset(by: sliceStart)
+          let adjustedErr = UTF8.ValidationError(error.kind, shiftedOffsets)
+          expect(expectedError, adjustedErr)
+
+          // Resume validation just past the reported error.
+          sliceStart = adjustedErr.byteOffsets.upperBound
+          nextExpected += 1
+        }
+
+      }
+
+      // Rest of input should be error-free
+      if let tailStart = errors.last?.error.byteOffsets.upperBound,
+         tailStart < bytes.count {
+        do throws(UTF8.ValidationError) {
+          _ = try UTF8Span(validating: span._extracting(tailStart...))
+        } catch {
+          fail("Found subsequent error \(error)")
+        }
+      }
+    }
+  }
+
+  /// Runs the sliced-span check, then the all-errors check.
+  func run() {
+    testSpanSlicedErrors()
+    testAllErrors()
+  }
+}
+
+if #available(SwiftStdlib 6.2, *) {
+  suite.test("UTF8Span/encoding errors") {
+    // Builds a `ValidationTestCase` from `bytes` and the expected `errors`
+    // and runs it.
+    func test(
+      _ bytes: Array<UInt8>,
+      _ file: String = #file, line: UInt = #line,
+      _ errors: ValidationError...
+    ) {
+      let testCase = ValidationTestCase(
+        bytes, file: file, line: line, errors)
+      testCase.run()
+    }
+
+    // NOTE(review): every case below is commented out (see the FIXME at the
+    // top of this file). As written they pass the expected errors as an
+    // array literal, whereas `test` declares an unlabeled `file` parameter
+    // followed by a variadic `ValidationError...`, and `.truncatedScalar`
+    // is given closed ranges (e.g. `4...6`) although it takes a
+    // `Range<Int>`. Presumably these mismatches need resolving before the
+    // cases can be re-enabled.
+
+    // Valid string
+    // test(Array("abcde\u{301}f😀🇺🇸🧟‍♀️🧟‍♀️".utf8), [])
+
+    // Bad URL
+    // test(
+    //   Array("http://servername/scripts/..".utf8)
+    //   + [0xC0, 0xAF]
+    //   + Array("../winnt/system32/cmd.exe".utf8),
+    //   [.overlongEncodingByte(at: 28),                    // C0
+    //    .overlongEncodingByte(at: 29, errorStart: false), // AF
+    //   ])
+
+    // test(
+    //   [0xC0, 0xAF, 0xE0, 0x80, 0xBF, 0xF0, 0x81, 0x82, 0x41],
+    //   [.overlongEncodingByte(at: 0),                    // C0
+    //    .overlongEncodingByte(at: 1, errorStart: false), // AF
+    //    .overlongEncodingByte(at: 2),                    // E0
+    //    .overlongEncodingByte(at: 3, errorStart: false), // 80
+    //    .overlongEncodingByte(at: 4, errorStart: false), // BF
+    //    .overlongEncodingByte(at: 5),                    // F0
+    //    .overlongEncodingByte(at: 6, errorStart: false), // 81
+    //    .overlongEncodingByte(at: 7, errorStart: false), // 82
+    //   ])
+    // test(
+    //   [0x41, 0xC0, 0xAF, 0x41, 0xF4, 0x80, 0x80, 0x41],
+    //   [.overlongEncodingByte(at: 1),                    // C0
+    //    .overlongEncodingByte(at: 2, errorStart: false), // AF
+    //    .truncatedScalar(4...6),                         // F4 80 80
+    //   ])
+    // test(
+    //   [0xED, 0xAF, 0x41],
+    //   [.surrogateCodePointByte(at: 0),                    // ED
+    //    .surrogateCodePointByte(at: 1, errorStart: false), // AF
+    //   ])
+    // test(
+    //   [0xED, 0xA0, 0x80, 0xED, 0xBF, 0xBF, 0xED, 0xAF, 0x41],
+    //   [.surrogateCodePointByte(at: 0),                    // ED
+    //    .surrogateCodePointByte(at: 1, errorStart: false), // A0
+    //    .surrogateCodePointByte(at: 2, errorStart: false), // 80
+    //    .surrogateCodePointByte(at: 3),                    // ED
+    //    .surrogateCodePointByte(at: 4, errorStart: false), // BF
+    //    .surrogateCodePointByte(at: 5, errorStart: false), // BF
+    //    .surrogateCodePointByte(at: 6),                    // ED
+    //    .surrogateCodePointByte(at: 7, errorStart: false), // AF
+    //   ])
+    // test(
+    //   [0xF4, 0x91, 0x92, 0x93, 0xFF, 0x41, 0x80, 0xBF, 0x42],
+    //   [.invalidNonSurrogateCodePointByte(at: 0),                    // F4
+    //    .invalidNonSurrogateCodePointByte(at: 1, errorStart: false), // 91
+    //    .invalidNonSurrogateCodePointByte(at: 2, errorStart: false), // 92
+    //    .invalidNonSurrogateCodePointByte(at: 3, errorStart: false), // 93
+    //    .invalidNonSurrogateCodePointByte(at: 4),                    // FF
+    //    .unexpectedContinuationByte(at: 6),                          // 80
+    //    .unexpectedContinuationByte(at: 7),                          // BF
+    //   ])
+    // test(
+    //   [0xE1, 0x80, 0xE2, 0xF0, 0x91, 0x92, 0xF1, 0xBF, 0x41],
+    //   [.truncatedScalar(0...1), // E1 80
+    //    .truncatedScalar(2...2), // E2
+    //    .truncatedScalar(3...5), // F0 91 92
+    //    .truncatedScalar(6...7), // F1 BF
+    //   ])
+    // test(
+    //   [0xE0, 0x81, 0x80],
+    //   [.overlongEncodingByte(at: 0), // E0
+    //    .overlongEncodingByte(at: 1, errorStart: false), // 81
+    //    .overlongEncodingByte(at: 2, errorStart: false), // 80
+    //   ])
+  }
+}
--- a/test/stdlib/UTF8SpanIteratorTests.swift
+++ b/test/stdlib/UTF8SpanIteratorTests.swift
@@ -0,0 +1,279 @@
+// RUN: %target-run-stdlib-swift(-enable-experimental-feature LifetimeDependence) %S/Inputs/
+// REQUIRES: swift_feature_LifetimeDependence
+// REQUIRES: executable_test
+
+import Swift
+import StdlibUnittest
+
+var suite = TestSuite("UTF8SpanIterator")
+defer { runAllTests() }
+
+@available(SwiftStdlib 6.2, *)
+extension Array {
+  /// Hands `f` a `Span` built from the buffer that
+  /// `withUnsafeBufferPointer` provides; returns `f`'s result and rethrows
+  /// its errors.
+  func withSpan<R>(_ f: (Span<Element>) throws -> R) rethrows -> R {
+    try withUnsafeBufferPointer { storage in
+      try f(Span(_unsafeElements: storage))
+    }
+  }
+}
+
+@available(SwiftStdlib 6.2, *)
+extension UTF8Span {
+  /// Hands `f` a `Span<UInt8>` built from the buffer that
+  /// `_withUnsafeBufferPointer` provides; returns `f`'s result and rethrows
+  /// its errors.
+  func withSpan<R>(_ f: (Span<UInt8>) throws -> R) rethrows -> R {
+    try _withUnsafeBufferPointer { codeUnitBuffer in
+      try f(Span(_unsafeElements: codeUnitBuffer))
+    }
+  }
+}
+
+
+// A string under test, plus the source location failures are attributed to.
+@available(SwiftStdlib 6.2, *)
+struct ContentEquivalenceTestCase {
+  // Input whose `String` views are compared against `UTF8Span` iteration.
+  var str: String
+  // Passed as `stackTrace:` to the expectations made for this case.
+  var loc: SourceLocStack
+}
+
+@available(SwiftStdlib 6.2, *)
+extension ContentEquivalenceTestCase {
+  /// Expects `scalars` to be positioned at offset 0: stepping back yields
+  /// nil, and stepping forward then back round-trips the first scalar of
+  /// `str`. The net position is unchanged.
+  func expectStart(
+    _ scalars: inout UTF8Span.UnicodeScalarIterator
+  ) {
+    let expectedFirst = str.unicodeScalars.first
+    expectEqual(0, scalars.currentCodeUnitOffset, stackTrace: loc)
+    expectNil(scalars.previous(), stackTrace: loc)
+    expectEqual(expectedFirst, scalars.next(), stackTrace: loc)
+    expectEqual(expectedFirst, scalars.previous(), stackTrace: loc)
+    expectNil(scalars.previous(), stackTrace: loc)
+  }
+
+  /// Expects `scalars` to be positioned at the end of its code units:
+  /// stepping forward yields nil, and stepping back then forward
+  /// round-trips the last scalar of `str`. The net position is unchanged.
+  func expectEnd(
+    _ scalars: inout UTF8Span.UnicodeScalarIterator
+  ) {
+    let expectedLast = str.unicodeScalars.last
+    expectEqual(scalars.currentCodeUnitOffset, scalars.codeUnits.count, stackTrace: loc)
+    expectNil(scalars.next(), stackTrace: loc)
+    expectEqual(expectedLast, scalars.previous(), stackTrace: loc)
+    expectEqual(expectedLast, scalars.next(), stackTrace: loc)
+    expectNil(scalars.next(), stackTrace: loc)
+  }
+
+  /// Expects `chars` to be positioned at offset 0: stepping back yields
+  /// nil, and stepping forward then back round-trips the first `Character`
+  /// of `str`. The net position is unchanged.
+  func expectStart(
+    _ chars: inout UTF8Span.CharacterIterator
+  ) {
+    let expectedFirst = str.first
+    expectEqual(0, chars.currentCodeUnitOffset, stackTrace: loc)
+    expectNil(chars.previous(), stackTrace: loc)
+    expectEqual(expectedFirst, chars.next(), stackTrace: loc)
+    expectEqual(expectedFirst, chars.previous(), stackTrace: loc)
+    expectNil(chars.previous(), stackTrace: loc)
+  }
+
+  /// Expects `chars` to be positioned at the end of its code units:
+  /// stepping forward yields nil, and stepping back then forward
+  /// round-trips the last `Character` of `str`. The net position is
+  /// unchanged.
+  func expectEnd(
+    _ chars: inout UTF8Span.CharacterIterator
+  ) {
+    let expectedLast = str.last
+    expectEqual(chars.currentCodeUnitOffset, chars.codeUnits.count, stackTrace: loc)
+    expectNil(chars.next(), stackTrace: loc)
+    expectEqual(expectedLast, chars.previous(), stackTrace: loc)
+    expectEqual(expectedLast, chars.next(), stackTrace: loc)
+    expectNil(chars.next(), stackTrace: loc)
+  }
+
+
+  /// Calls `f` with a validated `UTF8Span` over a copy of `str`'s UTF-8
+  /// bytes. Validation uses `try!`, so a validation error traps.
+  func withUTF8Span<R>(_ f: (UTF8Span) throws -> R) rethrows -> R {
+    let utf8Bytes = Array(str.utf8)
+    return try utf8Bytes.withSpan { span in
+      try f(try! UTF8Span(validating: span))
+    }
+  }
+}
+
+@available(SwiftStdlib 6.2, *)
+extension ContentEquivalenceTestCase {
+  /// Checks that the span's code units are exactly `str.utf8`.
+  func testBytes() {
+    withUTF8Span { utf8Span in
+      utf8Span._withUnsafeBufferPointer {
+        expectEqualSequence(str.utf8, $0, stackTrace: loc)
+      }
+    }
+
+    // NOTE: There's a slight jarring due to not having the same
+    // iterators for code units
+  }
+
+  /// Checks that `UTF8Span.UnicodeScalarIterator` agrees with
+  /// `str.unicodeScalars` when stepping forwards, stepping backwards, and
+  /// skipping by counts.
+  func testScalars() {
+    withUTF8Span { utf8Span in
+      // Test forwards
+      var utf8SpanIter = utf8Span.makeUnicodeScalarIterator()
+      var stringIter = str.unicodeScalars.makeIterator()
+      while let scalar = utf8SpanIter.next() {
+        expectEqual(scalar, stringIter.next(), stackTrace: loc)
+      }
+      expectNil(stringIter.next(), stackTrace: loc)
+      expectEnd(&utf8SpanIter)
+
+      // Test backwards
+      var stringRevIter = str.unicodeScalars.reversed().makeIterator()
+      while let scalar = utf8SpanIter.previous() {
+        expectEqual(scalar, stringRevIter.next(), stackTrace: loc)
+      }
+      expectNil(stringRevIter.next(), stackTrace: loc)
+      expectStart(&utf8SpanIter)
+
+      let numElements = str.unicodeScalars.count
+      let lastElement = str.unicodeScalars.last
+      let firstElement = str.unicodeScalars.first
+
+      // The skip expectations below pass `loc.withCurrentLoc()` so that a
+      // failure reports both the test string's call site and the failing
+      // line here.
+      expectEqual(
+        numElements, utf8SpanIter.skipForward(by: Int.max),
+        stackTrace: loc.withCurrentLoc())
+      expectEnd(&utf8SpanIter)
+      expectEqual(
+        numElements, utf8SpanIter.skipBack(by: Int.max),
+        stackTrace: loc.withCurrentLoc())
+      expectStart(&utf8SpanIter)
+      expectEqual(
+        numElements, utf8SpanIter.skipForward(by: numElements),
+        stackTrace: loc.withCurrentLoc())
+      expectEnd(&utf8SpanIter)
+      expectEqual(
+        numElements, utf8SpanIter.skipBack(by: numElements),
+        stackTrace: loc.withCurrentLoc())
+      expectStart(&utf8SpanIter)
+
+      if numElements > 0 {
+        expectStart(&utf8SpanIter)
+        expectEqual(
+          numElements-1, utf8SpanIter.skipForward(by: numElements-1),
+          stackTrace: loc.withCurrentLoc())
+        expectEqual(
+          lastElement, utf8SpanIter.next(),
+          stackTrace: loc.withCurrentLoc())
+        expectEnd(&utf8SpanIter)
+
+        expectEqual(
+          numElements-1, utf8SpanIter.skipBack(by: numElements-1),
+          stackTrace: loc.withCurrentLoc())
+        expectEqual(
+          firstElement, utf8SpanIter.previous(),
+          stackTrace: loc.withCurrentLoc())
+        expectStart(&utf8SpanIter)
+      }
+
+      // TODO: test reset variants
+      // TODO: test prefix/suffix
+
+    }
+  }
+
+  /// Checks that `UTF8Span.CharacterIterator` agrees with `str`'s
+  /// `Character` view when stepping forwards, stepping backwards, and
+  /// skipping by counts.
+  func testCharacters() {
+    withUTF8Span { utf8Span in
+      // Test forwards
+      var utf8SpanIter = utf8Span.makeCharacterIterator()
+      var stringIter = str.makeIterator()
+      while let char = utf8SpanIter.next() {
+        expectEqual(char, stringIter.next(), stackTrace: loc)
+      }
+      expectNil(stringIter.next(), stackTrace: loc)
+      expectEnd(&utf8SpanIter)
+
+      // Test backwards
+      var stringRevIter = str.reversed().makeIterator()
+      while let char = utf8SpanIter.previous() {
+        expectEqual(char, stringRevIter.next(), stackTrace: loc)
+      }
+      expectNil(stringRevIter.next(), stackTrace: loc)
+      expectStart(&utf8SpanIter)
+
+      let numElements = str.count
+      let lastElement = str.last
+      let firstElement = str.first
+
+      // The skip expectations below pass `loc.withCurrentLoc()` so that a
+      // failure reports both the test string's call site and the failing
+      // line here.
+      expectEqual(
+        numElements, utf8SpanIter.skipForward(by: Int.max),
+        stackTrace: loc.withCurrentLoc())
+      expectEnd(&utf8SpanIter)
+      expectEqual(
+        numElements, utf8SpanIter.skipBack(by: Int.max),
+        stackTrace: loc.withCurrentLoc())
+      expectStart(&utf8SpanIter)
+      expectEqual(
+        numElements, utf8SpanIter.skipForward(by: numElements),
+        stackTrace: loc.withCurrentLoc())
+      expectEnd(&utf8SpanIter)
+      expectEqual(
+        numElements, utf8SpanIter.skipBack(by: numElements),
+        stackTrace: loc.withCurrentLoc())
+      expectStart(&utf8SpanIter)
+
+      if numElements > 0 {
+        expectStart(&utf8SpanIter)
+        expectEqual(
+          numElements-1, utf8SpanIter.skipForward(by: numElements-1),
+          stackTrace: loc.withCurrentLoc())
+        expectEqual(
+          lastElement, utf8SpanIter.next(),
+          stackTrace: loc.withCurrentLoc())
+        expectEnd(&utf8SpanIter)
+
+        expectEqual(
+          numElements-1, utf8SpanIter.skipBack(by: numElements-1),
+          stackTrace: loc.withCurrentLoc())
+        expectEqual(
+          firstElement, utf8SpanIter.previous(),
+          stackTrace: loc.withCurrentLoc())
+        expectStart(&utf8SpanIter)
+      }
+
+      // TODO: test reset variants
+      // TODO: test prefix/suffix
+    }
+  }
+
+  /// Runs the byte, scalar, and character checks for this test case.
+  func run() {
+    testBytes()
+    testScalars()
+    testCharacters()
+
+    // TODO: test grapheme break iterator
+  }
+
+}
+
+if #available(SwiftStdlib 6.2, *) {
+  suite.test("UTF8Span/iterators") {
+    // Runs every `ContentEquivalenceTestCase` check on `s`, attributing
+    // failures to the caller's line.
+    func test(
+      _ s: String,
+      file: String = #file,
+      line: UInt = #line
+    ) {
+      let callSite = SourceLocStack(SourceLoc(file, line))
+      ContentEquivalenceTestCase(str: s, loc: callSite).run()
+    }
+
+    test("")
+    test("a")
+    test("á")
+    test("a\u{301}")
+    test("🧟‍♀️")
+    test("abc")
+    test("abcde\u{301}")
+    test("abéÏ𓀀")
+    test("012345678901234567890")
+    test("abéÏ012345678901234567890𓀀")
+    test("😀😃🤢🤮👩🏿‍🎤🧛🏻‍♂️🧛🏻‍♂️👩‍👩‍👦‍👦")
+    test("defghijklmnopqrstuvwxyz")
+    test("ab🧟‍♀️de\u{301}bytés")
+    test("ab🧟‍♀️de\u{301}🧟‍♀️")
+    test("ab🧟‍♀️de🧟‍♀️\u{301}")
+  }
+}
+
+// TODO: unfinished sketch, kept commented out. The function below never
+// returns the split spans after computing `splitPoint`.
+// @available(SwiftStdlib 6.2, *)
+// extension UTF8Span {
+//   func splitOffASCIIPrefix() -> (UTF8Span, UTF8Span) {
+//     if isKnownASCII {
+//       return (self, .init())
+//     }
+//     var splitPoint = 0
+//     while splitPoint < codeUnits.count && codeUnits[unchecked: splitPoint] < 0x80 {
+//       splitPoint += 1
+//     }
+
+//   }
+// }
+
+if #available(SwiftStdlib 6.2, *) {
+  suite.test("UTF8Span/whatever") {
+    // NOTE(review): placeholder. The whole body below is commented out, so
+    // this registers a test that performs no checks.
+
+    // var badURLBytes: [UInt8] = []
+    // badURLBytes.append(contentsOf: "http://servername/scripts/..".utf8)
+
+    // // Invalid overlong encoding of "/"
+    // badURLBytes.append(contentsOf: [0xC0, 0xAF])
+
+    // badURLBytes.append(contentsOf: "../winnt/system32/cmd.exe".utf8)
+
+    // // try! UTF8Span(validating: badURLBytes.span)
+
+    // badURLBytes.withSpan {
+    //   try! UTF8Span(validating: $0)
+    // }
+
+
+
+  }
+}
+
+
--- a/test/stdlib/UTF8SpanQueriesComparisons.swift
+++ b/test/stdlib/UTF8SpanQueriesComparisons.swift
@@ -0,0 +1,278 @@
+// RUN: %target-run-stdlib-swift %S/Inputs/
+
+// REQUIRES: executable_test
+
+import Swift
+import StdlibUnittest
+
+@available(SwiftStdlib 6.2, *)
+extension UTF8Span {
+  /// Pattern-match operator: a `StaticString` pattern matches a `UTF8Span`
+  /// when both hold the same sequence of UTF-8 code units.
+  static func ~=(_ lhs: StaticString, _ rhs: UTF8Span) -> Bool {
+    lhs.withUTF8Buffer { patternBytes in
+      rhs._withUnsafeBufferPointer { patternBytes.elementsEqual($0) }
+    }
+  }
+}
+
+var suite = TestSuite("UTF8SpanQueriesComparisons")
+defer { runAllTests() }  // top-level defer: runs once the tests below are registered
+
+@available(SwiftStdlib 6.2, *)
+extension Array where Element == UInt8 {
+  /// Invokes `f` with a `Span` built over this array's element buffer.
+  func withSpan<R>(_ f: (Span<Element>) throws -> R) rethrows -> R {
+    try withUnsafeBufferPointer { try f(Span(_unsafeElements: $0)) }
+  }
+
+  /// Invokes `f` with a `UTF8Span` validated from this array's bytes.
+  /// Validation uses `try!`, so invalid UTF-8 traps.
+  func withUTF8Span<R>(_ f: (UTF8Span) throws -> R) rethrows -> R {
+    try withSpan { try f(try! UTF8Span(validating: $0)) }
+  }
+}
+
+if #available(SwiftStdlib 6.2, *) {
+  // The StaticString/UTF8Span `~=` must match only the byte-identical pattern.
+  suite.test("UTF8Span/tilde equals") {
+    Array("abcdefg".utf8).withUTF8Span { subject in
+      switch subject {
+      // Interior substring of the subject.
+      case "def":
+        expectationFailure(
+          "unexpected pattern match", trace: "",
+          stackTrace: SourceLocStack().withCurrentLoc())
+      // Proper prefix of the subject.
+      case "abcdef":
+        expectationFailure(
+          "unexpected pattern match", trace: "",
+          stackTrace: SourceLocStack().withCurrentLoc())
+      // Subject followed by a trailing space.
+      case "abcdefg ":
+        expectationFailure(
+          "unexpected pattern match", trace: "",
+          stackTrace: SourceLocStack().withCurrentLoc())
+      // Subject followed by a NUL byte.
+      case "abcdefg\0":
+        expectationFailure(
+          "unexpected pattern match", trace: "",
+          stackTrace: SourceLocStack().withCurrentLoc())
+      case "abcdefg":
+        break
+      default:
+        expectationFailure(
+          "expected a pattern match", trace: "",
+          stackTrace: SourceLocStack().withCurrentLoc())
+      }
+    }
+  }
+
+  suite.test("UTF8Span/Sequence equal") {
+    // TODO: not implemented yet; this test asserts nothing. Sketch of the
+    // test data: a string and its canonical equivalent (nil when none given).
+    // let testCases: [(String, String?)] = [
+    //   ("abdefg", nil), ("café", "cafe\u{301}")
+    // ]
+  }
+
+  suite.test("UTF8Span/isKnownASCII") {
+    // Inputs paired with the flag a span validated from them should report.
+    let cases: [(input: String, isASCII: Bool)] = [
+      ("abc", true),
+      ("abcdefghil1235@#% _/.sladfj234 ", true),
+      ("abcdefghil1\u{80}sladfj234 ", false),
+    ]
+    for testCase in cases {
+      Array(testCase.input.utf8).withUTF8Span { span in
+        expectEqual(testCase.isASCII, span.isKnownASCII)
+      }
+    }
+  }
+
+  // Classifies each input by the first NFC probe that succeeds on its span
+  // (existing flag, then quick check, then full check) and compares that
+  // classification against the table.
+  suite.test("UTF8Span/isKnownNFC") {
+    enum Normalness {
+      case known       // `isKnownNFC` is true before any check is requested
+      case quickCheck  // `checkForNFC(quickCheck: true)` returns true
+      case fullCheck   // only `checkForNFC(quickCheck: false)` returns true
+      case notNFC      // none of the probes succeed
+    }
+
+    let tests: [(String, Normalness)] = [
+      ("abc", .known),
+      ("abcdefghil123567890", .known),
+      ("abcdefghil1\u{299}123345678 ", .quickCheck),
+      ("abc日曜日xyz", .quickCheck),
+      ("abcde日曜日\u{301}", .fullCheck),
+      ("abcde\u{301}fghijkl", .notNFC),
+    ]
+
+    for (test, expected) in tests {
+      Array(test.utf8).withUTF8Span {
+        var span = $0
+        if span.isKnownNFC {
+          expectEqual(expected, .known)
+        } else if span.checkForNFC(quickCheck: true) {
+          expectEqual(expected, .quickCheck)
+        } else if span.checkForNFC(quickCheck: false) {
+          expectEqual(expected, .fullCheck)
+        } else {
+          expectEqual(expected, .notNFC)
+        }
+      }
+    }
+  }
+
+  // Runs a precomposed string and its decomposed spelling through each
+  // UTF8Span comparison: canonical, byte, scalar, character, and ordering.
+  suite.test("UTF8Span/canonical equivalence") {
+    // TODO: refactor to be test-case declaration driven, and add more tests...
+    //   `(normalized: String, variants: [String], lessThan: String, greaterThan: String)`
+
+    let precomposedStr = "café"
+    let decomposedStr = "cafe\u{301}"
+
+    // `try!` below: bytes taken from a `String`'s `utf8` view are valid UTF-8.
+    Array(precomposedStr.utf8).withSpan { composedBytes in
+      let composedSpan = try! UTF8Span(validating: composedBytes)
+      Array(decomposedStr.utf8).withSpan { decomposedBytes in
+        let decomposedSpan = try! UTF8Span(validating: decomposedBytes)
+
+        // Debugging aid, kept disabled: dump the scalars of each span.
+        // print("scalars for \(precomposedStr.unicodeScalars)")
+        // var composedScalars = composedSpan.makeUnicodeScalarIterator()
+        // while let s = composedScalars.next() {
+        //   print(s)
+        // }
+        //
+        // print("scalars for \(decomposedStr.unicodeScalars)")
+        // var decomposedScalars = decomposedSpan.makeUnicodeScalarIterator()
+        // while let s = decomposedScalars.next() {
+        //   print(s)
+        // }
+
+        // The two spellings are canonically equivalent ...
+        expectTrue(composedSpan.isCanonicallyEquivalent(to: decomposedSpan))
+
+        // ... yet each is byte-equal only to its own spelling ...
+        expectTrue(composedSpan.bytesEqual(to: precomposedStr.utf8))
+        expectFalse(composedSpan.bytesEqual(to: decomposedStr.utf8))
+
+        expectTrue(decomposedSpan.bytesEqual(to: decomposedStr.utf8))
+        expectFalse(decomposedSpan.bytesEqual(to: precomposedStr.utf8))
+
+        // ... and scalar-equal only to its own spelling.
+        expectTrue(composedSpan.unicodeScalarsEqual(to: precomposedStr.unicodeScalars))
+        expectFalse(composedSpan.unicodeScalarsEqual(to: decomposedStr.unicodeScalars))
+
+        expectTrue(decomposedSpan.unicodeScalarsEqual(to: decomposedStr.unicodeScalars))
+        expectFalse(decomposedSpan.unicodeScalarsEqual(to: precomposedStr.unicodeScalars))
+
+        // Character comparison accepts either spelling.
+        expectTrue(composedSpan.charactersEqual(to: precomposedStr))
+        expectTrue(composedSpan.charactersEqual(to: decomposedStr))
+
+        expectTrue(decomposedSpan.charactersEqual(to: decomposedStr))
+        expectTrue(decomposedSpan.charactersEqual(to: precomposedStr))
+
+        // Equivalence means no-one is less than the other
+        expectFalse(decomposedSpan.isCanonicallyLessThan(composedSpan))
+        expectFalse(composedSpan.isCanonicallyLessThan(decomposedSpan))
+      }
+    }
+  }
+
+
+}
+
+// TODO: Rest of this file is in-progress TODOs
+
+
+/*
+
+
+isKnownASCII
+isKnownNFC
+checkForNFC(quickCheck:)
+isKnownSingleScalarCharacters
+checkForSingleScalarCharacters(quickCheck:)
+
+public func bytesEqual(to other: UTF8Span) -> Bool
+public func bytesEqual(to other: some Sequence<UInt8>) -> Bool
+
+public func unicodeScalarsEqual(
+  to other: some Sequence<Unicode.Scalar>
+) -> Bool
+
+public func charactersEqual(
+  to other: some Sequence<Character>
+) -> Bool
+
+public func isCanonicallyEquivalent(
+  to other: UTF8Span
+) -> Bool
+
+public func isCanonicallyLessThan(
+  _ other: UTF8Span
+) -> Bool
+
+*/
+
+// @available(SwiftStdlib 6.2, *)
+// private struct QueryTestCase {
+//   var content: String
+
+//   var loc: SourceLocStack
+
+//   var isASCII: Bool
+
+//   // TODO: This might become API, or otherwise calculated at init time
+//   var isLatinyNFC: Bool {
+//     bytes.allSatisfy { $0 < 0xCC }
+//   }
+
+//   var isQuickNFC: Bool
+//   var isNFC: Bool
+
+//   var isQuickSSC: Bool
+//   var isSSC: Bool
+// }
+
+// if #available(SwiftStdlib 6.2, *) {
+//   suite.test("UTF8Span/queries") {
+//   }
+// }
+
+// enum ComparisonResult {
+//   case binaryEqual
+//   case canonicallyEqual
+//   case canonicallyLess
+//   case inequal
+// }
+
+// private struct ComparisonTestCase {
+//   var content: String
+//   var comparisons: [(String, ComparisonResult)]
+
+//   var loc: SourceLocStack
+// }
+
+// if #available(SwiftStdlib 6.2, *) {
+//   suite.test("UTF8Span/comparisons") {
+//     func test()
+//   }
+// }
+
+
+/*
+
+  input string, to check the bits and relevant info
+  comparison string and expected comparison level
+
+*/
+
+
+// }