swift-mirror/stdlib/public/core/StringUTF8View.swift

//===--- StringUTF8.swift - A UTF8 view of String -------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

// FIXME(ABI)#71 : The UTF-16 string view should have a custom iterator type to
// allow performance optimizations of linear traversals.

extension String {
  /// A view of a string's contents as a collection of UTF-8 code units.
  ///
  /// You can access a string's view of UTF-8 code units by using its `utf8`
  /// property. A string's UTF-8 view encodes the string's Unicode scalar
  /// values as 8-bit integers.
  ///
  ///     let flowers = "Flowers 💐"
  ///     for v in flowers.utf8 {
  ///         print(v)
  ///     }
  ///     // 70
  ///     // 108
  ///     // 111
  ///     // 119
  ///     // 101
  ///     // 114
  ///     // 115
  ///     // 32
  ///     // 240
  ///     // 159
  ///     // 146
  ///     // 144
  ///
  /// A string's Unicode scalar values can be up to 21 bits in length. To
  /// represent those scalar values using 8-bit integers, more than one UTF-8
  /// code unit is often required.
  ///
  ///     let flowermoji = "💐"
  ///     for v in flowermoji.unicodeScalars {
  ///         print(v, v.value)
  ///     }
  ///     // 💐 128144
  ///
  ///     for v in flowermoji.utf8 {
  ///         print(v)
  ///     }
  ///     // 240
  ///     // 159
  ///     // 146
  ///     // 144
  ///
  /// In the encoded representation of a Unicode scalar value, each UTF-8 code
  /// unit after the first is called a *continuation byte*.
  ///
  /// UTF8View Elements Match Encoded C Strings
  /// =========================================
  ///
  /// Swift streamlines interoperation with C string APIs by letting you pass a
  /// `String` instance to a function as an `Int8` or `UInt8` pointer. When you
  /// call a C function using a `String`, Swift automatically creates a buffer
  /// of UTF-8 code units and passes a pointer to that buffer. The code units
  /// of that buffer match the code units in the string's `utf8` view.
  ///
  /// The following example uses the C `strncmp` function to compare the
  /// beginning of two Swift strings. The `strncmp` function takes two
  /// `const char*` pointers and an integer specifying the number of characters
  /// to compare. Because the strings are identical up to the 14th character,
  /// comparing only those characters results in a return value of `0`.
  ///
  ///     let s1 = "They call me 'Bell'"
  ///     let s2 = "They call me 'Stacey'"
  ///
  ///     print(strncmp(s1, s2, 14))
  ///     // Prints "0"
  ///     print(String(s1.utf8.prefix(14)))
  ///     // Prints "They call me '"
  ///
  /// Extending the compared character count to 15 includes the differing
  /// characters, so a nonzero result is returned.
  ///
  ///     print(strncmp(s1, s2, 15))
  ///     // Prints "-17"
  ///     print(String(s1.utf8.prefix(15)))
  ///     // Prints "They call me 'B"
  @_fixed_layout
  public struct UTF8View {
    @usableFromInline
    internal var _guts: _StringGuts

    @inlinable @inline(__always)
    internal init(_ guts: _StringGuts) {
      self._guts = guts
      _invariantCheck()
    }
  }
}

extension String.UTF8View {
  #if !INTERNAL_CHECKS_ENABLED
  @inlinable @inline(__always) internal func _invariantCheck() {}
  #else
  @usableFromInline @inline(never) @_effects(releasenone)
  internal func _invariantCheck() {
    // TODO: Ensure index alignment
  }
  #endif // INTERNAL_CHECKS_ENABLED
}

extension String.UTF8View: BidirectionalCollection {
  public typealias Index = String.Index

  public typealias Element = UTF8.CodeUnit

  /// The position of the first code unit if the UTF-8 view is
  /// nonempty.
  ///
  /// If the UTF-8 view is empty, `startIndex` is equal to `endIndex`.
  @inlinable
  public var startIndex: Index {
    @inline(__always) get { return _guts.startIndex }
  }

  /// The "past the end" position---that is, the position one
  /// greater than the last valid subscript argument.
  ///
  /// In an empty UTF-8 view, `endIndex` is equal to `startIndex`.
  @inlinable
  public var endIndex: Index {
    @inline(__always) get { return _guts.endIndex }
  }

  /// Returns the next consecutive position after `i`.
  ///
  /// - Precondition: The next position is representable.
  @inlinable @inline(__always)
  public func index(after i: Index) -> Index {
    if _fastPath(_guts.isFastUTF8) {
      return i.nextEncoded
    }

    return _foreignIndex(after: i)
  }

  @inlinable @inline(__always)
  public func index(before i: Index) -> Index {
    precondition(!i.isZeroPosition)
    if _fastPath(_guts.isFastUTF8) {
      return i.priorEncoded
    }

    return _foreignIndex(before: i)
  }

  @inlinable @inline(__always)
  public func index(_ i: Index, offsetBy n: Int) -> Index {
    if _fastPath(_guts.isFastUTF8) {
      _precondition(n + i._encodedOffset <= _guts.count)
      return i.encoded(offsetBy: n)
    }

    return _foreignIndex(i, offsetBy: n)
  }

  @inlinable @inline(__always)
  public func index(
    _ i: Index, offsetBy n: Int, limitedBy limit: Index
  ) -> Index? {
    if _fastPath(_guts.isFastUTF8) {
      // Check the limit: ignore limit if it precedes `i` (in the correct
      // direction), otherwise must not be beyond limit (in the correct
      // direction).
      let iOffset = i._encodedOffset
      let result = iOffset + n
      let limitOffset = limit._encodedOffset
      if n >= 0 {
        guard limitOffset < iOffset || result <= limitOffset else { return nil }
      } else {
        guard limitOffset > iOffset || result >= limitOffset else { return nil }
      }
      return Index(_encodedOffset: result)
    }

    return _foreignIndex(i, offsetBy: n, limitedBy: limit)
  }

  @inlinable @inline(__always)
  public func distance(from i: Index, to j: Index) -> Int {
    if _fastPath(_guts.isFastUTF8) {
      return j._encodedOffset &- i._encodedOffset
    }
    return _foreignDistance(from: i, to: j)
  }

  /// Accesses the code unit at the given position.
  ///
  /// The following example uses the subscript to print the value of a
  /// string's first UTF-8 code unit.
  ///
  ///     let greeting = "Hello, friend!"
  ///     let i = greeting.utf8.startIndex
  ///     print("First character's UTF-8 code unit: \(greeting.utf8[i])")
  ///     // Prints "First character's UTF-8 code unit: 72"
  ///
  /// - Parameter position: A valid index of the view. `position`
  ///   must be less than the view's end index.
  @inlinable
  public subscript(i: Index) -> UTF8.CodeUnit {
    @inline(__always) get {
      String(_guts)._boundsCheck(i)
      if _fastPath(_guts.isFastUTF8) {
        return _guts.withFastUTF8 { utf8 in utf8[_unchecked: i._encodedOffset] }
      }

      return _foreignSubscript(position: i)
    }
  }
}

extension String.UTF8View: CustomStringConvertible {
 @inlinable
 public var description: String {
   @inline(__always) get { return String(String(_guts)) }
 }
}

extension String.UTF8View: CustomDebugStringConvertible {
 public var debugDescription: String {
   return "UTF8View(\(self.description.debugDescription))"
 }
}


extension String {
  /// A UTF-8 encoding of `self`.
  @inlinable
  public var utf8: UTF8View {
    @inline(__always) get { return UTF8View(self._guts) }
    set { self = String(newValue._guts) }
  }

  /// A contiguously stored null-terminated UTF-8 representation of the string.
  ///
  /// To access the underlying memory, invoke `withUnsafeBufferPointer` on the
  /// array.
  ///
  ///     let s = "Hello!"
  ///     let bytes = s.utf8CString
  ///     print(bytes)
  ///     // Prints "[72, 101, 108, 108, 111, 33, 0]"
  ///
  ///     bytes.withUnsafeBufferPointer { ptr in
  ///         print(strlen(ptr.baseAddress!))
  ///     }
  ///     // Prints "6"
  public var utf8CString: ContiguousArray<CChar> {
    if _fastPath(_guts.isFastUTF8) {
      var result = _guts.withFastCChar { ContiguousArray($0) }
      result.append(0)
      return result
    }

    return _slowUTF8CString()
  }

  @usableFromInline @inline(never) // slow-path
  internal func _slowUTF8CString() -> ContiguousArray<CChar> {
    var result = ContiguousArray<CChar>()
    result.reserveCapacity(self._guts.count + 1)
    for c in self.utf8 {
      result.append(CChar(bitPattern: c))
    }
    result.append(0)
    return result
  }

  /// Creates a string corresponding to the given sequence of UTF-8 code units.
  @available(swift, introduced: 4.0, message:
  "Please use failable String.init?(_:UTF8View) when in Swift 3.2 mode")
  @inlinable @inline(__always)
  public init(_ utf8: UTF8View) {
    self = String(utf8._guts)
  }
}

extension String.UTF8View {
  @inlinable
  public var count: Int {
    @inline(__always) get {
      if _fastPath(_guts.isFastUTF8) {
        return _guts.count
      }
      return _foreignCount()
    }
  }
}

// Index conversions
extension String.UTF8View.Index {
  /// Creates an index in the given UTF-8 view that corresponds exactly to the
  /// specified `UTF16View` position.
  ///
  /// The following example finds the position of a space in a string's `utf16`
  /// view and then converts that position to an index in the string's
  /// `utf8` view.
  ///
  ///     let cafe = "Café 🍵"
  ///
  ///     let utf16Index = cafe.utf16.firstIndex(of: 32)!
  ///     let utf8Index = String.UTF8View.Index(utf16Index, within: cafe.utf8)!
  ///
  ///     print(Array(cafe.utf8[..<utf8Index]))
  ///     // Prints "[67, 97, 102, 195, 169]"
  ///
  /// If the position passed in `utf16Index` doesn't have an exact
  /// corresponding position in `utf8`, the result of the initializer is
  /// `nil`. For example, because UTF-8 and UTF-16 represent high Unicode code
  /// points differently, an attempt to convert the position of the trailing
  /// surrogate of a UTF-16 surrogate pair fails.
  ///
  /// The next example attempts to convert the indices of the two UTF-16 code
  /// points that represent the teacup emoji (`"🍵"`). The index of the lead
  /// surrogate is successfully converted to a position in `utf8`, but the
  /// index of the trailing surrogate is not.
  ///
  ///     let emojiHigh = cafe.utf16.index(after: utf16Index)
  ///     print(String.UTF8View.Index(emojiHigh, within: cafe.utf8))
  ///     // Prints "Optional(String.Index(...))"
  ///
  ///     let emojiLow = cafe.utf16.index(after: emojiHigh)
  ///     print(String.UTF8View.Index(emojiLow, within: cafe.utf8))
  ///     // Prints "nil"
  ///
  /// - Parameters:
  ///   - sourcePosition: A position in a `String` or one of its views.
  ///   - target: The `UTF8View` in which to find the new position.
  @inlinable
  public init?(_ idx: String.Index, within target: String.UTF8View) {
    if _slowPath(target._guts.isForeign) {
      guard idx._foreignIsWithin(target) else { return nil }
    } else {
      // All indices, except sub-scalar UTF-16 indices pointing at trailing
      // surrogates, are valid.
      guard idx.transcodedOffset == 0 else { return nil }
    }

    self = idx
  }
}

// Reflection
extension String.UTF8View : CustomReflectable {
  /// Returns a mirror that reflects the UTF-8 view of a string.
  public var customMirror: Mirror {
    return Mirror(self, unlabeledChildren: self)
  }
}

//===--- Slicing Support --------------------------------------------------===//
/// In Swift 3.2, in the absence of type context,
///
///   someString.utf8[someString.utf8.startIndex..<someString.utf8.endIndex]
///
/// was deduced to be of type `String.UTF8View`.  Provide a more-specific
/// Swift-3-only `subscript` overload that continues to produce
/// `String.UTF8View`.
extension String.UTF8View {
  public typealias SubSequence = Substring.UTF8View

  @inlinable
  @available(swift, introduced: 4)
  public subscript(r: Range<Index>) -> String.UTF8View.SubSequence {
    return Substring.UTF8View(self, _bounds: r)
  }
}

extension String.UTF8View {
  /// Copies `self` into the supplied buffer.
  ///
  /// - Precondition: The memory in `self` is uninitialized. The buffer must
  ///   contain sufficient uninitialized memory to accommodate
  ///   `source.underestimatedCount`.
  ///
  /// - Postcondition: The `Pointee`s at `buffer[startIndex..<returned index]`
  ///   are initialized.
  @inlinable @inline(__always)
  public func _copyContents(
    initializing buffer: UnsafeMutableBufferPointer<Iterator.Element>
  ) -> (Iterator, UnsafeMutableBufferPointer<Iterator.Element>.Index) {
    guard buffer.baseAddress != nil else {
        _preconditionFailure(
          "Attempt to copy string contents into nil buffer pointer")
    }
    guard let written = _guts.copyUTF8(into: buffer) else {
      _preconditionFailure(
        "Insufficient space allocated to copy string contents")
    }

    let it = String().utf8.makeIterator()
    return (it, buffer.index(buffer.startIndex, offsetBy: written))
  }
}

// Foreign string support
extension String.UTF8View {
  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func _foreignIndex(after i: Index) -> Index {
    _internalInvariant(_guts.isForeign)

    let (scalar, scalarLen) = _guts.foreignErrorCorrectedScalar(
      startingAt: i.strippingTranscoding)
    let utf8Len = _numUTF8CodeUnits(scalar)

    if utf8Len == 1 {
      _internalInvariant(i.transcodedOffset == 0)
      return i.nextEncoded
    }

    // Check if we're still transcoding sub-scalar
    if i.transcodedOffset < utf8Len - 1 {
      return i.nextTranscoded
    }

    // Skip to the next scalar
    return i.encoded(offsetBy: scalarLen)
  }

  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func _foreignIndex(before i: Index) -> Index {
    _internalInvariant(_guts.isForeign)
    if i.transcodedOffset != 0 {
      _internalInvariant((1...3) ~= i.transcodedOffset)
      return i.priorTranscoded
    }

    let (scalar, scalarLen) = _guts.foreignErrorCorrectedScalar(
      endingAt: i)
    let utf8Len = _numUTF8CodeUnits(scalar)
    return i.encoded(offsetBy: -scalarLen).transcoded(withOffset: utf8Len &- 1)
  }

  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func _foreignSubscript(position i: Index) -> UTF8.CodeUnit {
    _internalInvariant(_guts.isForeign)

    let scalar = _guts.foreignErrorCorrectedScalar(
      startingAt: _guts.scalarAlign(i)).0
    let encoded = Unicode.UTF8.encode(scalar)._unsafelyUnwrappedUnchecked
    _internalInvariant(i.transcodedOffset < 1+encoded.count)

    return encoded[
      encoded.index(encoded.startIndex, offsetBy: i.transcodedOffset)]
  }

  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func _foreignIndex(_ i: Index, offsetBy n: Int) -> Index {
    _internalInvariant(_guts.isForeign)
    return _index(i, offsetBy: n)
  }

  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func _foreignIndex(
    _ i: Index, offsetBy n: Int, limitedBy limit: Index
  ) -> Index? {
    _internalInvariant(_guts.isForeign)
    return _index(i, offsetBy: n, limitedBy: limit)
  }

  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func _foreignDistance(from i: Index, to j: Index) -> Int {
    _internalInvariant(_guts.isForeign)
    return _distance(from: i, to: j)
  }

  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func _foreignCount() -> Int {
    _internalInvariant(_guts.isForeign)
    return _distance(from: startIndex, to: endIndex)
  }
}

extension String.Index {
  @usableFromInline @inline(never) // opaque slow-path
  @_effects(releasenone)
  internal func _foreignIsWithin(_ target: String.UTF8View) -> Bool {
    _internalInvariant(target._guts.isForeign)
    // Currently, foreign means UTF-16.

    // If we're transcoding, we're already a UTF8 view index.
    if self.transcodedOffset != 0 { return true }

    // Otherwise, we must be scalar-aligned, i.e. not pointing at a trailing
    // surrogate.
    return target._guts.isOnUnicodeScalarBoundary(self)
  }
}

extension String.UTF8View {
  @inlinable
  public func withContiguousStorageIfAvailable<R>(
    _ body: (UnsafeBufferPointer<Element>) throws -> R
  ) rethrows -> R? {
    guard _guts.isFastUTF8 else { return nil }
    return try _guts.withFastUTF8(body)
  }
}