[stdlib] Add bookkeeping to keep track of the encoding of strings and indices

Assign some previously reserved bits in String.Index and _StringObject to keep track of their associated storage encoding (either UTF-8 or UTF-16). None of these bits will be reliably set in processes that load binaries compiled with older stdlib releases, but when they do end up getting set, we can use them opportunistically to more reliably detect cases where an index is applied on a string with a mismatching encoding. As more and more code gets recompiled with 5.7+, the stdlib will gradually become able to detect such issues with complete accuracy. Code that misuses indices this way was always considered broken; however, String wasn’t able to reliably detect these runtime errors before. Therefore, I expect there is a large amount of broken code out there that keeps using indices of bridged Cocoa strings (UTF-16) after a mutation turns those strings into native UTF-8 strings. For this reason, instead of trapping, this commit silently corrects the issue, transcoding the offsets into the correct encoding. It would probably be a good idea to also emit a runtime warning in addition to recovering from the error. This would generate some noise that would gently nudge folks to fix their code. rdar://89369680
2025-12-21 12:14:44 +01:00 · 2022-03-01 19:47:16 -08:00
parent 683b9fa021
commit 6e18955f90
8 changed files with 585 additions and 106 deletions
--- a/stdlib/public/core/StringBridge.swift
+++ b/stdlib/public/core/StringBridge.swift
@@ -614,10 +614,13 @@ extension String {
      // TODO: We'd rather emit a valid ObjC object statically than create a
      // shared string class instance.
      let gutsCountAndFlags = _guts._object._countAndFlags
+      let countAndFlags = _StringObject.CountAndFlags(
+        sharedCount: _guts.count,
+        isASCII: gutsCountAndFlags.isASCII,
+        isUTF16: false)
      return __SharedStringStorage(
        immortal: _guts._object.fastUTF8.baseAddress!,
-        countAndFlags: _StringObject.CountAndFlags(
-          sharedCount: _guts.count, isASCII: gutsCountAndFlags.isASCII))
+        countAndFlags: countAndFlags)
    }

    _internalInvariant(_guts._object.hasObjCBridgeableObject,
--- a/stdlib/public/core/StringCharacterView.swift
+++ b/stdlib/public/core/StringCharacterView.swift
@@ -49,21 +49,34 @@ extension String: BidirectionalCollection {
  ///   `endIndex`.
  /// - Returns: The index value immediately after `i`.
  public func index(after i: Index) -> Index {
+    let i = _guts.ensureMatchingEncoding(i)
    _precondition(i < endIndex, "String index is out of bounds")
+    let r = _uncheckedIndex(after: _guts.scalarAlign(i))
+    return _guts.markEncoding(r)
+  }

+  /// A version of `index(after:)` that assumes that the given index:
+  ///
+  /// - has the right encoding,
+  /// - is within bounds, and
+  /// - is scalar aligned.
+  ///
+  /// It does not mark the encoding of the returned index.
+  internal func _uncheckedIndex(after i: Index) -> Index {
    // FIXME: Unlike `index(before:)`, this function may return incorrect
    // results if `i` isn't on a grapheme cluster boundary. (The grapheme
    // breaking algorithm assumes we start on a break when we go forward.)
+    _internalInvariant(_guts.hasMatchingEncoding(i))
+    _internalInvariant(i < endIndex)
+    _internalInvariant(i._isScalarAligned)

    // TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc.
-    let i = _guts.scalarAlign(i)
    let stride = _characterStride(startingAt: i)
    let nextOffset = i._encodedOffset &+ stride
-    let nextStride = _characterStride(
-      startingAt: Index(_encodedOffset: nextOffset)._scalarAligned)
-
-    return Index(
-      encodedOffset: nextOffset, characterStride: nextStride)._scalarAligned
+    let nextIndex = Index(_encodedOffset: nextOffset)._scalarAligned
+    let nextStride = _characterStride(startingAt: nextIndex)
+    let r = Index(encodedOffset: nextOffset, characterStride: nextStride)
+    return r._scalarAligned
  }

  /// Returns the position immediately before the given index.
@@ -72,7 +85,7 @@ extension String: BidirectionalCollection {
  ///   `startIndex`.
  /// - Returns: The index value immediately before `i`.
  public func index(before i: Index) -> Index {
-    // TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc.
+    let i = _guts.ensureMatchingEncoding(i)

    // Note: bounds checking in `index(before:)` is tricky as scalar aligning an
    // index may need to access storage, but it may also move it closer towards
@@ -82,11 +95,30 @@ extension String: BidirectionalCollection {
    let i = _guts.scalarAlign(i)
    _precondition(i > startIndex, "String index is out of bounds")

+    let r = _uncheckedIndex(before: _guts.scalarAlign(i))
+    return _guts.markEncoding(r)
+  }
+
+  /// A version of `index(before:)` that assumes that the given index:
+  ///
+  /// - has the right encoding,
+  /// - is within bounds, and
+  /// - is scalar aligned.
+  ///
+  /// It does not mark the encoding of the returned index.
+  internal func _uncheckedIndex(before i: Index) -> Index {
+    _internalInvariant(_guts.hasMatchingEncoding(i))
+    _internalInvariant(i > startIndex && i <= endIndex)
+    _internalInvariant(i._isScalarAligned)
+
+    // TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc.
    let stride = _characterStride(endingAt: i)
    let priorOffset = i._encodedOffset &- stride
-    return Index(
-      encodedOffset: priorOffset, characterStride: stride)._scalarAligned
+
+    let r = Index(encodedOffset: priorOffset, characterStride: stride)
+    return r._scalarAligned
  }
+
  /// Returns an index that is the specified distance from the given index.
  ///
  /// The following example obtains an index advanced four positions from a
@@ -109,10 +141,29 @@ extension String: BidirectionalCollection {
  ///   is the same value as the result of `abs(distance)` calls to
  ///   `index(before:)`.
  /// - Complexity: O(*n*), where *n* is the absolute value of `distance`.
-  @inlinable @inline(__always)
  public func index(_ i: Index, offsetBy distance: Int) -> Index {
+    // Note: in Swift 5.6 and below, this method used to be inlinable,
+    // forwarding to `_index(_:offsetBy:)`.
+
    // TODO: known-ASCII and single-scalar-grapheme fast path, etc.
-    return _index(i, offsetBy: distance)
+
+    var i = _guts.ensureMatchingEncoding(i)
+    _precondition(i >= startIndex && i <= endIndex,
+      "String index is out of bounds")
+    i = _guts.scalarAlign(i)
+
+    if distance >= 0 {
+      for _ in stride(from: 0, to: distance, by: 1) {
+        _precondition(i < endIndex, "String index is out of bounds")
+        i = _uncheckedIndex(after: i)
+      }
+    } else {
+      for _ in stride(from: 0, to: distance, by: -1) {
+        _precondition(i > startIndex, "String index is out of bounds")
+        i = _uncheckedIndex(before: i)
+      }
+    }
+    return _guts.markEncoding(i)
  }

  /// Returns an index that is the specified distance from the given index,
@@ -158,6 +209,8 @@ extension String: BidirectionalCollection {
  ) -> Index? {
    // Note: In Swift 5.6 and below, this function used to be inlinable,
    // forwarding to `BidirectionalCollection._index(_:offsetBy:limitedBy:)`.
+    // Unfortunately, that approach isn't compatible with SE-0180, as it doesn't
+    // support cases where `i` or `limit` aren't character aligned.

    // TODO: known-ASCII and single-scalar-grapheme fast path, etc.

@@ -167,23 +220,30 @@ extension String: BidirectionalCollection {

    // Note: `limit` is intentionally not scalar aligned to ensure our behavior
    // exactly matches the documentation above.
+    let limit = _guts.ensureMatchingEncoding(limit)

-    let start = _guts.scalarAlign(i)
-    var i = start
+    var i = _guts.ensureMatchingEncoding(i)
+    _precondition(i >= startIndex && i <= endIndex,
+      "String index is out of bounds")
+    i = _guts.scalarAlign(i)
+
+    let start = i
    if distance >= 0 {
      for _ in stride(from: 0, to: distance, by: 1) {
        guard limit < start || i < limit else { return nil }
-        formIndex(after: &i)
+        _precondition(i < endIndex, "String index is out of bounds")
+        i = _uncheckedIndex(after: i)
      }
      guard limit < start || i <= limit else { return nil }
    } else {
      for _ in stride(from: 0, to: distance, by: -1) {
        guard limit > start || i > limit else { return nil }
-        formIndex(before: &i)
+        _precondition(i > startIndex, "String index is out of bounds")
+        i = _uncheckedIndex(before: i)
      }
      guard limit > start || i >= limit else { return nil }
    }
-    return i
+    return _guts.markEncoding(i)
  }

  /// Returns the distance between two indices.
@@ -199,32 +259,40 @@ extension String: BidirectionalCollection {
    // Note: In Swift 5.6 and below, this function used to be inlinable,
    // forwarding to `BidirectionalCollection._distance(from:to:)`.

-    // TODO: known-ASCII and single-scalar-grapheme fast path, etc.
-    let start = _guts.scalarAlign(start)
-    let end = _guts.scalarAlign(end)
-
-    // Per SE-0180, `start` and `end` are allowed to fall in between grapheme
-    // breaks, in which case this function must still terminate without trapping
-    // and return a result that makes sense.
-
    // FIXME: Due to the `index(after:)` problem above, this function doesn't
    // always return consistent results when the given indices fall between
    // grapheme breaks -- swapping `start` and `end` may change the magnitude of
    // the result.

+    var start = _guts.ensureMatchingEncoding(start)
+    var end = _guts.ensureMatchingEncoding(end)
+
+    _precondition(
+      start >= startIndex && start <= endIndex &&
+      end >= startIndex && end <= endIndex,
+      "String index is out of bounds")
+
+    start = _guts.scalarAlign(start)
+    end = _guts.scalarAlign(end)
+
+    // TODO: known-ASCII and single-scalar-grapheme fast path, etc.
+
+    // Per SE-0180, `start` and `end` are allowed to fall in between grapheme
+    // breaks, in which case this function must still terminate without trapping
+    // and return a result that makes sense.
+
    var i = start
    var count = 0
-
    if i < end {
      while i < end { // Note `<` instead of `==`
        count += 1
-        formIndex(after: &i)
+        i = _uncheckedIndex(after: i)
      }
    }
    else if i > end {
      while i > end { // Note `<` instead of `==`
        count -= 1
-        formIndex(before: &i)
+        i = _uncheckedIndex(before: i)
      }
    }
    return count
@@ -245,11 +313,17 @@ extension String: BidirectionalCollection {
  ///
  /// - Parameter i: A valid index of the string. `i` must be less than the
  ///   string's end index.
-  @inlinable @inline(__always)
+  @inlinable @inline(__always) // FIXME(lorentey): Consider removing these. If
+                               // `index(after:)` isn't inlinable, does it
+                               // really matter if this one is? (Potential
+                               // _guts-related optimizations notwithstanding.)
+                               // `subscript` being inlinable forces a bunch of
+                               // new additions to be _aEIC, even though they
+                               // ought to be internal.
  public subscript(i: Index) -> Character {
+    var i = _guts.ensureMatchingEncoding(i)
    _boundsCheck(i)
-
-    let i = _guts.scalarAlign(i)
+    i = _guts.scalarAlign(i)
    let distance = _characterStride(startingAt: i)

    return _guts.errorCorrectedCharacter(
--- a/stdlib/public/core/StringGuts.swift
+++ b/stdlib/public/core/StringGuts.swift
@@ -288,11 +288,107 @@ extension _StringGuts {

  @inlinable @inline(__always)
  internal var startIndex: String.Index {
-   return Index(_encodedOffset: 0)._scalarAligned
+    Index(_encodedOffset: 0)._scalarAligned._encodingIndependent
  }
+
  @inlinable @inline(__always)
  internal var endIndex: String.Index {
-    return Index(_encodedOffset: self.count)._scalarAligned
+    markEncoding(Index(_encodedOffset: self.count)._scalarAligned)
+  }
+
+  @inlinable @inline(__always)
+  internal func index(atOffset offset: Int) -> String.Index {
+    markEncoding(Index(_encodedOffset: offset)._scalarAligned)
+  }
+}
+
+// Encoding
+extension _StringGuts {
+  @_alwaysEmitIntoClient // Swift 5.7
+  internal func markEncoding(_ i: String.Index) -> String.Index {
+    if _slowPath(isForeign) {
+      // FIXME: Instead of having an opaque path here, we should define the same
+      // encoding flags in StringObject and pick them up from there. The flags
+      // can be initialized at the time the foreign string is created.
+      guard
+        #available(macOS 9999, iOS 9999, watchOS 9999, tvOS 9999, *)  // SwiftStdlib 5.7
+      else {
+        // We know all foreign strings were UTF-16 in releases < 5.7
+        return i._knownUTF16
+      }
+      return _foreignMarkEncoding(i)
+    }
+    return i._knownUTF8
+  }
+
+  @_effects(readnone)
+  @available(SwiftStdlib 5.7, *)
+  @usableFromInline
+  internal func _foreignMarkEncoding(_ i: String.Index) -> String.Index {
+    // Currently foreign indices always have UTF-16 offsets.
+    i._knownUTF16
+  }
+
+  internal func hasMatchingEncoding(_ i: String.Index) -> Bool {
+    (isForeign && i._canBeUTF16) || (!isForeign && i._canBeUTF8)
+  }
+
+  /// Return an index whose encoding can be assumed to match that of `self`.
+  ///
+  /// Detecting an encoding mismatch isn't always possible -- older binaries did
+  /// not set the flags that this method relies on. However, false positives
+  /// cannot happen: if this method detects a mismatch, then it is guaranteed to
+  /// be a real one.
+  @_alwaysEmitIntoClient
+  @inline(__always)
+  internal func ensureMatchingEncoding(_ i: String.Index) -> String.Index {
+    if _fastPath(!isForeign && i._canBeUTF8) { return i }
+    return _slowEnsureMatchingEncoding(i)
+  }
+
+  @_alwaysEmitIntoClient
+  internal func _slowEnsureMatchingEncoding(_ i: String.Index) -> String.Index {
+    _internalInvariant(isForeign || !i._canBeUTF8)
+    if isForeign {
+      // Opportunistically detect attempts to use an UTF-8 index on a UTF-16
+      // string. Strings don't usually get converted to UTF-16 storage, so it
+      // seems okay to trap in this case -- the index most likely comes from an
+      // unrelated string. (Trapping here may still turn out to affect binary
+      // compatibility with broken code in existing binaries running with new
+      // stdlibs. If so, we can replace this with the same transcoding hack as
+      // in the UTF-16->8 case below.)
+      //
+      // Note that this trap is not guaranteed to trigger when the process
+      // includes client binaries compiled with a previous Swift release.
+      // (`i._canBeUTF16` can sometimes return true in that case even if the
+      // index actually came from an UTF-8 string.) However, the trap will still
+      // often trigger in this case, as long as the index was initialized by
+      // code that was compiled with 5.7+.
+      //
+      // This trap can never trigger on OSes that have stdlibs <= 5.6, because
+      // those versions never set the `isKnownUTF16` flag in `_StringObject`.
+      //
+      _precondition(!_object.isKnownUTF16 || i._canBeUTF16,
+        "Invalid string index")
+      return i
+    }
+    // If we get here, then we know for sure that this is an attempt to use an
+    // UTF-16 index on a UTF-8 string.
+    //
+    // This can happen if `self` was originally verbatim-bridged, and someone
+    // mistakenly attempts to keep using an old index after a mutation. This is
+    // technically an error, but trapping here would trigger a lot of broken
+    // code that previously happened to work "fine" on e.g. ASCII strings.
+    // Instead, attempt to convert the offset to UTF-8 code units by transcoding
+    // the string. This can be slow, but it often results in a usable index,
+    // even if non-ASCII characters are present. (UTF-16 breadcrumbs help reduce
+    // the severity of the slowdown.)
+
+    // FIXME: Consider emitting a runtime warning here.
+    // FIXME: Consider performing a linked-on-or-after check & trapping if the
+    // client executable was built on some particular future Swift release.
+    let utf16 = String(self).utf16
+    return utf16.index(utf16.startIndex, offsetBy: i._encodedOffset)
  }
 }

--- a/stdlib/public/core/StringGutsSlice.swift
+++ b/stdlib/public/core/StringGutsSlice.swift
@@ -29,6 +29,9 @@ internal struct _StringGutsSlice {

  @inline(__always)
  internal init(_ guts: _StringGuts, _ offsetRange: Range<Int>) {
+    _internalInvariant(
+      guts.isOnUnicodeScalarBoundary(offsetRange.lowerBound)
+      && guts.isOnUnicodeScalarBoundary(offsetRange.upperBound))
    self._guts = guts
    self._offsetRange = offsetRange
  }
@@ -74,8 +77,11 @@ internal struct _StringGutsSlice {
  @inlinable
  internal var range: Range<String.Index> {
    @inline(__always) get {
-      return String.Index(_encodedOffset: _offsetRange.lowerBound)
-         ..< String.Index(_encodedOffset: _offsetRange.upperBound)
+      let lower = String.Index(_encodedOffset: _offsetRange.lowerBound)
+        ._scalarAligned
+      let higher = String.Index(_encodedOffset: _offsetRange.upperBound)
+        ._scalarAligned
+      return Range(_uncheckedBounds: (lower, higher))
    }
  }

--- a/stdlib/public/core/StringIndex.swift
+++ b/stdlib/public/core/StringIndex.swift
@@ -16,20 +16,20 @@ import SwiftShims

 String's Index has the following layout:

- ┌──────────┬───────────────────╥────────────────┬──────────╥────────────────┐
- │ b63:b16  │      b15:b14      ║     b13:b8     │  b7:b1   ║       b0       │
- ├──────────┼───────────────────╫────────────────┼──────────╫────────────────┤
- │ position │ transcoded offset ║ grapheme cache │ reserved ║ scalar aligned │
- └──────────┴───────────────────╨────────────────┴──────────╨────────────────┘
-                                └──────── resilient ────────┘
+ ┌──────────┬────────────────╥────────────────┬───────╥───────┐
+ │ b63:b16  │      b15:b14   ║     b13:b8     │ b7:b3 ║ b2:b0 │
+ ├──────────┼────────────────╫────────────────┼───────╫───────┤
+ │ position │ transc. offset ║ grapheme cache │ rsvd  ║ flags │
+ └──────────┴────────────────╨────────────────┴───────╨───────┘
+                             └────── resilient ───────┘

-Position, transcoded offset, and scalar aligned are fully exposed in the ABI.
-Grapheme cache and reserved are partially resilient: the fact that there are 13
-bits with a default value of `0` is ABI, but not the layout, construction, or
+Position, transcoded offset, and flags are fully exposed in the ABI. Grapheme
+cache and reserved bits are partially resilient: the fact that there are 11 bits
+with a default value of `0` is ABI, but not the layout, construction, or
 interpretation of those bits. All use of grapheme cache should be behind
 non-inlinable function calls. Inlinable code should not set a non-zero value to
-grapheme cache bits: doing so breaks back deployment as they will be interpreted
-as a set cache.
+resilient bits: doing so breaks future evolution as the meaning of those bits
+isn't frozen.

 - position aka `encodedOffset`: A 48-bit offset into the string's code units

@@ -40,12 +40,18 @@ as a set cache.
 - grapheme cache: A 6-bit value remembering the distance to the next grapheme
 boundary.

- reserved: 7-bit for future use.
+- reserved: 5 unused bits available for future flags etc. The meaning of each
+  bit may change between stdlib versions. These must be set to zero if
+  constructing an index in inlinable code.

 <resilience barrier>

- scalar aligned, whether this index is known to be scalar-aligned (see below)
+  b2: UTF-16. If set, position is known to be in UTF-16 code units [Swift 5.7+]
+  b1: UTF-8. If set, position is known to be in UTF-8 code units [Swift 5.7+]
+  b0: Scalar alignment. If set, index is known to be scalar-aligned (see below)

+Before Swift 5.7, bits b1 and b2 used to be part of the resilient slice.
+See the note on Index Encoding below to see how this works.

 */
 extension String {
@@ -72,7 +78,7 @@ extension String.Index {
  @inlinable @inline(__always)
  internal var isZeroPosition: Bool { return orderingValue == 0 }

-  /// The UTF-16 code unit offset corresponding to this Index
+  /// The UTF-16 code unit offset corresponding to this index.
  public func utf16Offset<S: StringProtocol>(in s: S) -> Int {
    return s.utf16.distance(from: s.utf16.startIndex, to: self)
  }
@@ -272,6 +278,95 @@ extension String.Index {
  }
 }

+/*
+  Index Encoding
+
+  Swift 5.7 introduced bookkeeping to keep track of the Unicode encoding
+  associated with the position value in String indices. Indices whose position
+  is an offset into UTF-8 storage come with the corresponding flag set, and a
+  separate flag is set for UTF-16 indices. (Only foreign strings can be UTF-16
+  encoded. As of 5.7, all foreign strings are UTF-16; but this is subject to
+  change later if we ever decide to implement additional foreign forms.)
+
+  In releases before 5.7, the bits corresponding to these flags were considered
+  reserved, and they were both set to zero in inlinable code. This means that
+  (on ABI stable platforms at least) we cannot assume that either of these bits
+  will be reliably set. If they are both clear, then we must fall back to
+  assuming that the index has the right encoding for whatever string it is used
+  on. However, if any of these bits are set, then the other bit's value is also
+  reliable -- whether it's set or cleared.
+
+  The indices of ASCII strings are encoding-independent, i.e. transcoding such
+  strings from UTF-8 to UTF-16 (or vice versa) does not change the position
+  value of any of their indices. Therefore it isn't an error for an index to
+  have both of these flags set. (The start index of every string also behaves
+  this way: position zero is the same no matter how the rest of string is
+  stored.)
+
+  These two bits (along with the isKnownUTF16 flag in StringObject) allow newer
+  versions of the Standard Library to more reliably catch runtime errors where
+  client code is applying an index from a UTF-16 string to a UTF-8 one, or vice
+  versa. This typically happens when indices from a UTF-16 Cocoa string that was
+  verbatim bridged into Swift are accidentally applied to a mutated version of
+  the same string. (The mutation turns it into a UTF-8 native string, where the
+  same numerical offsets might correspond to wildly different logical
+  positions.)
+
+  Such code has always been broken, as the old indices are documented to be no
+  longer valid after the mutation; however, in previous releases this bug wasn't
+  reliably detected, and if the code was only ever tested on ASCII strings, then
+  the bug could lie dormant for a long time. (Until the code encounters a
+  non-ASCII character and someone gets surprised that the results no longer make
+  sense.)
+
+  As more code gets rebuilt with Swift 5.7+, the stdlib will gradually become
+  able to reliably catch and correct all such issues. The error cases are
+  handled in `_StringGuts.ensureMatchingEncoding(_:)`; see there for the sordid
+  details.
+
+*/
+extension String.Index {
+  /// Returns true if the position in this index is okay to interpret as an
+  /// offset into UTF-8-encoded string storage.
+  ///
+  /// (This returns true if either we know for sure that this is a UTF-8 index,
+  /// or if we don't have enough information to determine its encoding.)
+  @_alwaysEmitIntoClient // Swift 5.7
+  @inline(__always)
+  internal var _canBeUTF8: Bool {
+    // The only way an index cannot be UTF-8 is if only its UTF-16 flag is set.
+    _rawBits & 0x6 != 0x04
+  }
+
+  /// Returns true if the position in this index is okay to interpret as an
+  /// offset into UTF-16-encoded string storage.
+  ///
+  /// (This returns true if either we know for sure that this is a UTF-16
+  /// index, or if we don't have enough information to determine its
+  /// encoding.)
+  @_alwaysEmitIntoClient // Swift 5.7
+  @inline(__always)
+  internal var _canBeUTF16: Bool {
+    // The only way an index cannot be UTF-16 is if only its UTF-8 flag is set.
+    _rawBits & 0x6 != 0x02
+  }
+
+  /// Returns the same index with the UTF-8 bit set.
+  @_alwaysEmitIntoClient // Swift 5.7
+  @inline(__always)
+  internal var _knownUTF8: Self { Self(_rawBits | 0x2) }
+
+  /// Returns the same index with the UTF-16 bit set.
+  @_alwaysEmitIntoClient // Swift 5.7
+  @inline(__always)
+  internal var _knownUTF16: Self { Self(_rawBits | 0x4) }
+
+  /// Returns the same index with both UTF-8 & UTF-16 bits set.
+  @_alwaysEmitIntoClient // Swift 5.7
+  @inline(__always)
+  internal var _encodingIndependent: Self { Self(_rawBits | 0x6) }
+}
+
 extension String.Index: Equatable {
  @inlinable @inline(__always)
  public static func == (lhs: String.Index, rhs: String.Index) -> Bool {
@@ -313,6 +408,12 @@ extension String.Index: CustomStringConvertible {
    if _isScalarAligned {
      d += ", scalarAligned"
    }
+    if _rawBits & 0x2 != 0 {
+      d += ", utf8"
+    }
+    if _rawBits & 0x4 != 0 {
+      d += ", utf16"
+    }
    d += ")"
    return d
  }
--- a/stdlib/public/core/StringObject.swift
+++ b/stdlib/public/core/StringObject.swift
@@ -580,40 +580,79 @@ extension _StringObject {

 All non-small forms share the same structure for the other half of the bits
 (i.e. non-object bits) as a word containing code unit count and various
- performance flags. The top 16 bits are for performance flags, which are not
- semantically relevant but communicate that some operations can be done more
- efficiently on this particular string, and the lower 48 are the code unit
- count (aka endIndex).
+ performance flags. The top 16 bits are nonessential flags; these aren't
+ critical for correct operation, but they may provide additional guarantees that
+ allow more efficient operation or more reliable detection of runtime errors.
+ The lower 48 bits contain the code unit count (aka endIndex).

-┌─────────┬───────┬──────────────────┬─────────────────┬────────┬───────┐
-│   b63   │  b62  │       b61        │       b60       │ b59:48 │ b47:0 │
-├─────────┼───────┼──────────────────┼─────────────────┼────────┼───────┤
-│ isASCII │ isNFC │ isNativelyStored │ isTailAllocated │  TBD   │ count │
-└─────────┴───────┴──────────────────┴─────────────────┴────────┴───────┘
+┌──────┬──────┬──────┬──────┬──────┬──────────┬───────────────────────────────┐
+│ b63  │ b62  │ b61  │ b60  │ b59  │  b58:48  │             b47:0             │
+├──────┼──────┼──────┼──────┼──────┼──────────┼───────────────────────────────┤
+│ ASCII│ NFC  │native│ tail │ UTF16│ reserved │             count             │
+└──────┴──────┴──────┴──────┴──────┴──────────┴───────────────────────────────┘

- isASCII: set when all code units are known to be ASCII, enabling:
+ b63: isASCII. set when all code units are known to be ASCII, enabling:
   - Trivial Unicode scalars, they're just the code units
   - Trivial UTF-16 transcoding (just bit-extend)
   - Also, isASCII always implies isNFC
- isNFC: set when the contents are in normal form C
+
+ b62: isNFC. set when the contents are in normal form C
   - Enables trivial lexicographical comparisons: just memcmp
   - `isASCII` always implies `isNFC`, but not vice versa
- isNativelyStored: set for native stored strings
+
+ b61: isNativelyStored. set for native stored strings
   - `largeAddressBits` holds an instance of `_StringStorage`.
   - I.e. the start of the code units is at the stored address + `nativeBias`
- isTailAllocated: contiguous UTF-8 code units starts at address + `nativeBias`
+   - NOTE: isNativelyStored is *specifically* allocated to b61 to align with the
+     bit-position of isSmall on the BridgeObject. This allows us to check for
+     native storage without an extra branch guarding against smallness. See
+     `_StringObject.hasNativeStorage` for this usage.
+
+ b60: isTailAllocated. contiguous UTF-8 code units starts at address + `nativeBias`
   - `isNativelyStored` always implies `isTailAllocated`, but not vice versa
      (e.g. literals)
   - `isTailAllocated` always implies `isFastUTF8`
- TBD: Reserved for future usage
-   - Setting a TBD bit to 1 must be semantically equivalent to 0
-   - I.e. it can only be used to "cache" fast-path information in the future
- count: stores the number of code units, corresponds to `endIndex`.

- NOTE: isNativelyStored is *specifically* allocated to b61 to align with the
- bit-position of isSmall on the BridgeObject. This allows us to check for
- native storage without an extra branch guarding against smallness. See
- `_StringObject.hasNativeStorage` for this usage.
+ b59: isKnownUTF16. This bit is set if index positions in the string are known
+     to be measured in UTF-16 code units, rather than the default UTF-8.
+   - This is only ever set on UTF-16 foreign strings created in noninlinable
+     code in stdlib versions >= 5.7. On stdlibs <= 5.6, this bit is always set
+     to zero.
+   - Note that while as of 5.7 all foreign strings are UTF-16, this isn't
+     guaranteed to remain this way -- future versions of the stdlib may
+     introduce new foreign forms that use a different encoding. (Likely UTF-8.)
+   - Foreign strings are only created in non-inlinable code, so on stdlib
+     versions >=5.7, this bit always correctly reflects the correct encoding
+     for the string's offset values.
+   - This bit along with the two related bits in String.Index allow us to
+     opportunistically catch cases where an UTF-16 index is used on an UTF-8
+     string (or vice versa), and to provide better error reporting & recovery.
+     As more code gets rebuilt with Swift 5.7+, the stdlib will gradually become
+     able to reliably catch all such issues.
+   - It is okay for isASCII strings to not set this flag, even if they are
+     UTF-16 encoded -- the offsets in that case can work in either encoding.
+     (This is not currently exercised, as foreign bridged strings never set
+     the isASCII flag.)
+
+ b48-58: Reserved for future usage.
+   - Because Swift is ABI stable (on some platforms at least), these bits can
+     only be assigned semantics that don't affect interoperability with code
+     built with previous releases of the Standard Library, from 5.0 onward.
+   - Older binaries will not look at newly assigned bits, and they will not
+     set them, either (unless by side effect of calling into newly built code).
+     Such code must continue working.
+   - Code in new versions of the stdlib must continue to work correctly even if
+     some of these newly assigned bits are never set -- as may be the case when
+     the initialization of a string was emitted entirely into an older client
+     binary.
+   - This typically means that these bits can only be used as optional
+     performance shortcuts, e.g. to signal the availability of a potential fast
+     path. (However, it is also possible to store information here that allows
+     more reliable detection & handling of runtime errors, like the
+     `isKnownUTF16` bit above.)
+
+ b0-47: count. Stores the number of code units. Corresponds to the position of
+     the `endIndex`.

 */
 extension _StringObject.CountAndFlags {
@@ -639,6 +678,12 @@ extension _StringObject.CountAndFlags {
    return 0x1000_0000_0000_0000
  }

+  @_alwaysEmitIntoClient // Swift 5.7
+  @inline(__always)
+  internal static var isKnownUTF16Mask: UInt64 {
+    return 0x0800_0000_0000_0000
+  }
+
  // General purpose bottom initializer
  @inlinable @inline(__always)
  internal init(
@@ -677,10 +722,53 @@ extension _StringObject.CountAndFlags {
    _internalInvariant(isTailAllocated == self.isTailAllocated)
  }

+  @inline(__always) // Bottom initializer that additionally records isKnownUTF16
+  internal init(
+    count: Int,
+    isASCII: Bool,
+    isNFC: Bool,
+    isNativelyStored: Bool,
+    isTailAllocated: Bool,
+    isKnownUTF16: Bool
+  ) {
+    var rawBits = UInt64(truncatingIfNeeded: count) // start from the count bits
+    _internalInvariant(rawBits <= _StringObject.CountAndFlags.countMask)
+
+    if isASCII {
+      _internalInvariant(isNFC) // callers must pass isNFC whenever isASCII
+      rawBits |= _StringObject.CountAndFlags.isASCIIMask
+    }
+
+    if isNFC {
+      rawBits |= _StringObject.CountAndFlags.isNFCMask
+    }
+
+    if isNativelyStored {
+      _internalInvariant(isTailAllocated) // native storage must be tail allocated
+      rawBits |= _StringObject.CountAndFlags.isNativelyStoredMask
+    }
+
+    if isTailAllocated {
+      rawBits |= _StringObject.CountAndFlags.isTailAllocatedMask
+    }
+
+    if isKnownUTF16 {
+      rawBits |= _StringObject.CountAndFlags.isKnownUTF16Mask
+    }
+
+    self.init(raw: rawBits)
+    _internalInvariant(count == self.count) // each input must read back unchanged
+    _internalInvariant(isASCII == self.isASCII)
+    _internalInvariant(isNFC == self.isNFC)
+    _internalInvariant(isNativelyStored == self.isNativelyStored)
+    _internalInvariant(isTailAllocated == self.isTailAllocated)
+    _internalInvariant(isKnownUTF16 == self.isKnownUTF16)
+  }
+
  @inlinable @inline(__always)
  internal init(count: Int, flags: UInt16) {
-    // Currently, we only use top 4 flags
-    _internalInvariant(flags & 0xF000 == flags)
+    // Currently, we only use top 5 flags
+    _internalInvariant(flags & 0xF800 == flags)

    let rawBits = UInt64(truncatingIfNeeded: flags) &<< 48
                | UInt64(truncatingIfNeeded: count)
@@ -710,13 +798,14 @@ extension _StringObject.CountAndFlags {
      isTailAllocated: true)
  }
  @inline(__always) // Flags for shared storage: neither native nor tail allocated
-  internal init(sharedCount: Int, isASCII: Bool) {
+  internal init(sharedCount: Int, isASCII: Bool, isUTF16: Bool) {
    self.init(
      count: sharedCount,
      isASCII: isASCII,
      isNFC: isASCII, // the NFC flag simply mirrors isASCII here
      isNativelyStored: false,
-      isTailAllocated: false)
+      isTailAllocated: false,
+      isKnownUTF16: isUTF16) // forwarded unchanged into the isKnownUTF16 flag
  }

  //
@@ -750,6 +839,11 @@ extension _StringObject.CountAndFlags {
  internal var isTailAllocated: Bool {
    return 0 != _storage & _StringObject.CountAndFlags.isTailAllocatedMask
  }
+  @_alwaysEmitIntoClient // True when the isKnownUTF16 bit is set in _storage
+  @inline(__always) // Swift 5.7
+  internal var isKnownUTF16: Bool {
+    return 0 != _storage & _StringObject.CountAndFlags.isKnownUTF16Mask
+  }

  #if !INTERNAL_CHECKS_ENABLED
  @inlinable @inline(__always) internal func _invariantCheck() {}
@@ -762,6 +856,10 @@ extension _StringObject.CountAndFlags {
    if isNativelyStored {
      _internalInvariant(isTailAllocated)
    }
+    if isKnownUTF16 {
+      _internalInvariant(!isNativelyStored)
+      _internalInvariant(!isTailAllocated)
+    }
  }
  #endif // INTERNAL_CHECKS_ENABLED
 }
@@ -895,6 +993,13 @@ extension _StringObject {
    return _countAndFlags.isNFC
  }

+  @_alwaysEmitIntoClient // Swift 5.7
+  @inline(__always) // Reports the isKnownUTF16 flag of this string object
+  internal var isKnownUTF16: Bool {
+    if isSmall { return false } // small strings never report the flag
+    return _countAndFlags.isKnownUTF16
+  }
+
  // Get access to fast UTF-8 contents for large strings which provide it.
  @inlinable @inline(__always)
  internal var fastUTF8: UnsafeBufferPointer<UInt8> {
@@ -994,7 +1099,8 @@ extension _StringObject {
  internal init(
    cocoa: AnyObject, providesFastUTF8: Bool, isASCII: Bool, length: Int
  ) {
-    let countAndFlags = CountAndFlags(sharedCount: length, isASCII: isASCII)
+    let countAndFlags = CountAndFlags(
+      sharedCount: length, isASCII: isASCII, isUTF16: !providesFastUTF8)
    let discriminator = Nibbles.largeCocoa(providesFastUTF8: providesFastUTF8)
 #if arch(i386) || arch(arm) || arch(arm64_32) || arch(wasm32)
    self.init(
--- a/stdlib/public/core/Substring.swift
+++ b/stdlib/public/core/Substring.swift
@@ -97,11 +97,17 @@ public struct Substring: Sendable {
  @usableFromInline
  internal var _slice: Slice<String>

-  @inlinable
+  @usableFromInline
  internal init(_ slice: Slice<String>) {
-    let _guts = slice.base._guts
-    let start = _guts.scalarAlign(slice.startIndex)
-    let end = _guts.scalarAlign(slice.endIndex)
+    let _guts = slice._base._guts
+    _internalInvariant(
+      _guts.hasMatchingEncoding(slice.startIndex) &&
+      _guts.hasMatchingEncoding(slice.endIndex))
+    _internalInvariant(
+      slice.startIndex >= _guts.startIndex && slice.endIndex <= _guts.endIndex)
+
+    let start = slice.base._guts.scalarAlign(slice.startIndex)
+    let end = slice.base._guts.scalarAlign(slice.endIndex)

    self._slice = Slice(
      base: slice.base,
@@ -168,32 +174,44 @@ extension Substring: StringProtocol {
    // leads to Collection conformance issues when the `Substring`'s bounds do
    // not fall on grapheme boundaries in `base`.

+    let i = _slice.base._guts.ensureMatchingEncoding(i)
+    _precondition(i < endIndex && i >= startIndex,
+      "Substring index is out of bounds")
+    let r = _uncheckedIndex(after: _slice.base._guts.scalarAlign(i))
+    return _slice.base._guts.markEncoding(r)
+  }
+
+  /// A version of `index(after:)` that assumes that the given index:
+  ///
+  /// - has the right encoding,
+  /// - is within bounds, and
+  /// - is scalar aligned.
+  ///
+  /// It does not mark the encoding of the returned index.
+  internal func _uncheckedIndex(after i: Index) -> Index {
    // FIXME: Unlike `index(before:)`, this function may return incorrect
    // results if `i` isn't on a grapheme cluster boundary. (The grapheme
    // breaking algorithm assumes we start on a break when we go forward.)
-
-    let i = _slice.base._guts.scalarAlign(i)
-
-    _precondition(i < endIndex && i >= startIndex,
-      "Substring index is out of bounds")
+    _internalInvariant(_slice.base._guts.hasMatchingEncoding(i))
+    _internalInvariant(i < endIndex) // there must be a character to step over
+    _internalInvariant(i._isScalarAligned)

    let stride = _characterStride(startingAt: i)
    let nextOffset = i._encodedOffset &+ stride
-
    let nextIndex = Index(_encodedOffset: nextOffset)._scalarAligned
-    guard _knownToStartOnGraphemeBreak else {
+    guard
      // Don't cache character strides in indices of exotic substrings whose
      // startIndex isn't aligned on a grapheme cluster boundary. (Their
      // grapheme breaks may not match with those in `base`.)
-      return nextIndex
-    }
-    guard nextIndex < endIndex || _knownToEndOnGraphemeBreak else {
+      _knownToStartOnGraphemeBreak,
      // Don't cache the stride if we end on a partial grapheme cluster.
+      nextIndex < endIndex || _knownToEndOnGraphemeBreak
+    else { // either condition failed: return the index without a cached stride
      return nextIndex
    }
    let nextStride = _characterStride(startingAt: nextIndex) // stored in the result
-    return Index(
-      encodedOffset: nextOffset, characterStride: nextStride)._scalarAligned
+    let r = Index(encodedOffset: nextOffset, characterStride: nextStride)
+    return r._scalarAligned
  }

  public func index(before i: Index) -> Index {
@@ -204,8 +222,24 @@ extension Substring: StringProtocol {
    // leads to Collection conformance issues when the `Substring`'s bounds do
    // not fall on grapheme boundaries in `base`.

+    let i = _slice.base._guts.ensureMatchingEncoding(i)
    _precondition(i <= endIndex && i > startIndex,
      "Substring index is out of bounds")
+    let r = _uncheckedIndex(before: _slice.base._guts.scalarAlign(i))
+    return _slice.base._guts.markEncoding(r)
+  }
+
+  /// A version of `index(before:)` that assumes that the given index:
+  ///
+  /// - has the right encoding,
+  /// - is within bounds, and
+  /// - is scalar aligned.
+  ///
+  /// It does not mark the encoding of the returned index.
+  internal func _uncheckedIndex(before i: Index) -> Index {
+    _internalInvariant(_slice.base._guts.hasMatchingEncoding(i))
+    // Stepping backward is valid from `endIndex` itself, but there must be a
+    // position before `i`, so `i` has to lie strictly after `startIndex`.
+    _internalInvariant(i > startIndex && i <= endIndex)
+    _internalInvariant(i._isScalarAligned)

    // TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc.
    let i = _slice.base._guts.scalarAlign(i)
@@ -231,7 +265,24 @@ extension Substring: StringProtocol {
    // outside the substring to affect grapheme breaking results within the
    // substring. This leads to Collection conformance issues when the
    // `Substring`'s bounds do not fall on grapheme boundaries in `base`.
-    return _index(i, offsetBy: distance)
+
+    var i = _slice.base._guts.ensureMatchingEncoding(i)
+    _precondition(i >= startIndex && i <= endIndex,
+      "String index is out of bounds")
+    i = _slice.base._guts.scalarAlign(i)
+    // TODO: known-ASCII and single-scalar-grapheme fast path, etc.
+    if distance >= 0 {
+      for _ in stride(from: 0, to: distance, by: 1) {
+        _precondition(i < endIndex, "String index is out of bounds")
+        i = _uncheckedIndex(after: i)
+      }
+    } else {
+      for _ in stride(from: 0, to: distance, by: -1) {
+        _precondition(i > startIndex, "String index is out of bounds")
+        i = _uncheckedIndex(before: i)
+      }
+    }
+    return _slice.base._guts.markEncoding(i)
  }

  public func index(
@@ -250,23 +301,30 @@ extension Substring: StringProtocol {

    // Note: `limit` is intentionally not scalar aligned to ensure our behavior
    // exactly matches the documentation.
+    let limit = _slice.base._guts.ensureMatchingEncoding(limit)

-    let start = _slice.base._guts.scalarAlign(i)
-    var i = start
+    var i = _slice.base._guts.ensureMatchingEncoding(i)
+    _precondition(i >= startIndex && i <= endIndex,
+      "String index is out of bounds")
+    i = _slice.base._guts.scalarAlign(i)
+
+    let start = i
    if distance >= 0 {
      for _ in stride(from: 0, to: distance, by: 1) {
        guard limit < start || i < limit else { return nil }
-        formIndex(after: &i)
+        _precondition(i < endIndex, "String index is out of bounds")
+        i = _uncheckedIndex(after: i)
      }
      guard limit < start || i <= limit else { return nil }
    } else {
      for _ in stride(from: 0, to: distance, by: -1) {
        guard limit > start || i > limit else { return nil }
-        formIndex(before: &i)
+        _precondition(i > startIndex, "String index is out of bounds")
+        i = _uncheckedIndex(before: i)
      }
      guard limit > start || i >= limit else { return nil }
    }
-    return i
+    return _slice.base._guts.markEncoding(i)
  }

  public func distance(from start: Index, to end: Index) -> Int {
@@ -277,32 +335,53 @@ extension Substring: StringProtocol {
    // substring. This leads to Collection conformance issues when the
    // `Substring`'s bounds do not fall on grapheme boundaries in `base`.

+    // FIXME: Due to the `index(after:)` problem above, this function doesn't
+    // always return consistent results when the given indices fall between
+    // grapheme breaks -- swapping `start` and `end` may change the magnitude of
+    // the result.
+
+    var start = _slice.base._guts.ensureMatchingEncoding(start)
+    var end = _slice.base._guts.ensureMatchingEncoding(end)
+
+    _precondition(
+      start >= startIndex && start <= endIndex &&
+      end >= startIndex && end <= endIndex,
+      "String index is out of bounds")
+
+    start = _slice.base._guts.scalarAlign(start)
+    end = _slice.base._guts.scalarAlign(end)
+
    // TODO: known-ASCII and single-scalar-grapheme fast path, etc.

    // Per SE-0180, `start` and `end` are allowed to fall in between grapheme
    // breaks, in which case this function must still terminate without trapping
    // and return a result that makes sense.
-    var i = _slice.base._guts.scalarAlign(start)
-    let end = _slice.base._guts.scalarAlign(end)
-    var count = 0

+    var i = start
+    var count = 0
    if i < end {
      while i < end { // Note `<` instead of `==`
        count += 1
-        formIndex(after: &i)
+        i = _uncheckedIndex(after: i)
      }
    }
    else if i > end {
      while i > end { // Note `>` instead of `==`
        count -= 1
-        formIndex(before: &i)
+        i = _uncheckedIndex(before: i)
      }
    }
    return count
  }

  public subscript(i: Index) -> Character {
-    get { return _slice[i] }
+    var i = _slice.base._guts.ensureMatchingEncoding(i)
+    _precondition(i >= startIndex && i < endIndex,
+      "Substring index is out of bounds")
+    i = _slice.base._guts.scalarAlign(i) // aligned only after the bounds check
+    let distance = _characterStride(startingAt: i) // offset delta to the end
+    return _slice.base._guts.errorCorrectedCharacter(
+      startingAt: i._encodedOffset, endingAt: i._encodedOffset &+ distance)
  }

  public mutating func replaceSubrange<C>(
@@ -958,6 +1037,9 @@ extension Substring: ExpressibleByStringLiteral {
 extension String {
  @available(swift, introduced: 4)
  public subscript(r: Range<Index>) -> Substring {
+    let r = Range(_uncheckedBounds: (
+        _guts.ensureMatchingEncoding(r.lowerBound),
+        _guts.ensureMatchingEncoding(r.upperBound)))
    _boundsCheck(r) // checks the encoding-adjusted range, which shadows the argument
    return Substring(Slice(base: self, bounds: r))
  }
@@ -966,6 +1048,11 @@ extension String {
 extension Substring {
  @available(swift, introduced: 4)
  public subscript(r: Range<Index>) -> Substring {
-    return Substring(_slice[r])
+    let r = Range(_uncheckedBounds: (
+        _slice.base._guts.ensureMatchingEncoding(r.lowerBound),
+        _slice.base._guts.ensureMatchingEncoding(r.upperBound)))
+    _precondition(r.lowerBound >= startIndex && r.upperBound <= endIndex,
+      "Substring index range is out of bounds")
+    return Substring(Slice(base: _slice.base, bounds: r)) // slices the base string
  }
 }
--- a/stdlib/public/core/UnicodeHelpers.swift
+++ b/stdlib/public/core/UnicodeHelpers.swift
@@ -233,6 +233,12 @@ extension _StringGuts {
    return self.withFastUTF8 { _decodeScalar($0, startingAt: i).0 }
  }

+  @_alwaysEmitIntoClient
+  @inline(__always) // Offset-based convenience for the String.Index overload
+  internal func isOnUnicodeScalarBoundary(_ offset: Int) -> Bool {
+    isOnUnicodeScalarBoundary(String.Index(_encodedOffset: offset))
+  }
+
  @usableFromInline
  @_effects(releasenone)
  internal func isOnUnicodeScalarBoundary(_ i: String.Index) -> Bool {
@@ -244,7 +250,7 @@ extension _StringGuts {

    if _fastPath(isFastUTF8) {
      return self.withFastUTF8 {
-        return !UTF8.isContinuation($0[i._encodedOffset])
+        return !UTF8.isContinuation($0[_unchecked: i._encodedOffset])
      }
    }