swift-mirror/stdlib/core/StringUTF8.swift

//===--- StringUTF8.swift - A UTF8 view of _StringCore ---------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See http://swift.org/LICENSE.txt for license information
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
//
//  _StringCore currently has three representations: Native ASCII,
//  Native UTF-16, and Opaque Cocoa.  Expose each of these as UTF-8 in a
//  way that will hopefully be efficient to traverse
//
//===----------------------------------------------------------------------===//


extension _StringCore {
  /// An integral type that holds a sequence of UTF-8 code units, starting in
  /// its low byte.
  public typealias UTF8Chunk = UInt64

  /// Encode text starting at `i` as UTF-8.  Returns a pair whose first
  /// element is the index of the text following whatever got encoded,
  /// and the second element contains the encoded UTF-8 starting in its
  /// low byte.  Any unused high bytes in the result will be set to
  /// 0xFF.
  func _encodeSomeUTF8(i: Int) -> (Int, UTF8Chunk) {
    _sanityCheck(i <= count)

    if _fastPath(elementWidth == 1) {
      // How many UTF-16 code units might we use before we've filled up
      // our UTF8Chunk with UTF-8 code units?
      let utf16Count = min(sizeof(UTF8Chunk.self), count - i)

      var result: UTF8Chunk = ~0 // start with all bits set

      _memcpy(
        dest: UnsafeMutablePointer(Builtin.addressof(&result)),
        src: UnsafeMutablePointer(startASCII + i),
        size: numericCast(utf16Count))

      return (i + utf16Count, result)
    } else if _fastPath(!_baseAddress._isNull) {
      return _encodeSomeContiguousUTF16AsUTF8(i)
    } else {
#if _runtime(_ObjC)
      return _encodeSomeNonContiguousUTF16AsUTF8(i)
#else
      _sanityCheckFailure("_encodeSomeUTF8: Unexpected cocoa string")
#endif
    }
  }

  /// Helper for `_encodeSomeUTF8`, above.  Handles the case where the
  /// storage is contiguous UTF-16.
  func _encodeSomeContiguousUTF16AsUTF8(i: Int) -> (Int, UTF8Chunk) {
    _sanityCheck(elementWidth == 2)
    _sanityCheck(!_baseAddress._isNull)

    let storage = UnsafeBufferPointer(start: startUTF16, count: self.count)
    return _transcodeSomeUTF16AsUTF8(storage, i)
  }

#if _runtime(_ObjC)
  /// Helper for `_encodeSomeUTF8`, above.  Handles the case where the
  /// storage is non-contiguous UTF-16.
  func _encodeSomeNonContiguousUTF16AsUTF8(i: Int) -> (Int, UTF8Chunk) {
    _sanityCheck(elementWidth == 2)
    _sanityCheck(_baseAddress._isNull)

    let storage = _CollectionOf<Int, UInt16>(
      startIndex: 0, endIndex: self.count) {
      (i: Int) -> UInt16 in
      return _cocoaStringSubscript(target: self, position: i)
    }
    return _transcodeSomeUTF16AsUTF8(storage, i)
  }
#endif
}

extension String {
  /// A collection of UTF-8 code units that encodes a `String` value.
  public struct UTF8View : CollectionType, Reflectable {
    internal let _core: _StringCore

    init(_ _core: _StringCore) {
      self._core = _core
    }

    /// A position in a `String.UTF8View`
    public struct Index : ForwardIndexType {
      internal typealias Buffer = _StringCore.UTF8Chunk

      init(_ _core: _StringCore, _ _coreIndex: Int,
           _ _buffer: Buffer) {
        self._core = _core
        self._coreIndex = _coreIndex
        self._buffer = _buffer
        _sanityCheck(_coreIndex >= 0)
        _sanityCheck(_coreIndex <= _core.count)
      }

      /// Returns the next consecutive value after `self`.
      ///
      /// Requires: the next value is representable.
      public func successor() -> Index {
        let currentUnit = UTF8.CodeUnit(truncatingBitPattern: _buffer)
        let hiNibble = currentUnit >> 4
        // Map the high nibble of the current code unit into the
        // amount by which to increment the utf16 index.  Only when
        // the high nibble is 1111 do we have a surrogate pair.
        let u16Increments =
        // 1111 1110 1101 1100 1011 1010 1001 1000 0111 0110 0101 0100 0011 0010 0001 0000
           0b10___01___01___01___00___00___00___00___01___01___01___01___01___01___01___01
        let increment = (u16Increments >> numericCast(hiNibble << 1)) & 0x3
        let nextCoreIndex = _coreIndex &+ increment
        let nextBuffer = Index._nextBuffer(_buffer)

        // if the nextBuffer is non-empty, we have all we need
        if _fastPath(nextBuffer != Index._emptyBuffer) {
          return Index(_core, nextCoreIndex, nextBuffer)
        }
        // If the underlying UTF16 isn't exhausted, fill a new buffer
        else if _fastPath(nextCoreIndex < _core.endIndex) {
          let (_, freshBuffer) = _core._encodeSomeUTF8(nextCoreIndex)
          return Index(_core, nextCoreIndex, freshBuffer)
        }
        else {
          // Produce the endIndex
          _precondition(
            nextCoreIndex == _core.endIndex,
            "Can't increment past endIndex of String.UTF8View")
          return Index(_core, nextCoreIndex, nextBuffer)
        }
      }

      /// True iff the index is at the end of its view or if the next
      /// byte begins a new UnicodeScalar.
      internal var _isOnUnicodeScalarBoundary : Bool {
        let next = UTF8.CodeUnit(truncatingBitPattern: _buffer)
        return UTF8._numTrailingBytes(next) != 4 || _isAtEnd
      }

      /// True iff the index is at the end of its view
      internal var _isAtEnd : Bool {
        return _coreIndex == _core.endIndex
      }

      /// The value of the buffer when it is empty
      internal static var _emptyBuffer: Buffer {
        return ~0
      }

      /// A Buffer value with the high byte set
      internal static var _bufferHiByte: Buffer {
        return 0xFF << numericCast((sizeof(Buffer.self) &- 1) &* 8)
      }

      /// Consume a byte of the given buffer: shift out the low byte
      /// and put FF in the high byte
      internal static func _nextBuffer(thisBuffer: Buffer) -> Buffer {
        return (thisBuffer >> 8) | _bufferHiByte
      }

      /// The underlying buffer we're presenting as UTF8
      internal let _core: _StringCore
      /// The position of `self`, rounded up to the nearest unicode
      /// scalar boundary, in the underlying UTF16.
      internal let _coreIndex: Int
      /// If `self` is at the end of its `_core`, has the value `_endBuffer`.
      /// Otherwise, the low byte contains the value of
      internal let _buffer: Buffer
    }

    /// The position of the first code unit if the `String` is
    /// non-empty; identical to `endIndex` otherwise.
    public var startIndex: Index {
      if _fastPath(_core.count != 0) {
        let (_, buffer) = _core._encodeSomeUTF8(0)
        return Index(_core, 0, buffer)
      }
      return endIndex
    }

    /// The "past the end" position.
    ///
    /// `endIndex` is not a valid argument to `subscript`, and is always
    /// reachable from `startIndex` by zero or more applications of
    /// `successor()`.
    public var endIndex: Index {
      return Index(_core, _core.endIndex, Index._emptyBuffer)
    }

    /// Access the element at `position`.
    ///
    /// Requires: `position` is a valid position in `self` and
    /// `position != endIndex`.
    public subscript(position: Index) -> UTF8.CodeUnit {
      let result: UTF8.CodeUnit = numericCast(position._buffer & 0xFF)
      _precondition(result != 0xFF, "can not subscript using endIndex")
      return result
    }

    /// Return a *generator* over the code points that comprise this
    /// *sequence*.
    ///
    /// Complexity: O(1)
    public func generate() -> IndexingGenerator<UTF8View> {
      return IndexingGenerator(self)
    }

    /// Returns a mirror that reflects `self`.
    public func getMirror() -> MirrorType {
      return _UTF8ViewMirror(self)
    }
  }

  /// A UTF-8 encoding of `self`.
  public var utf8: UTF8View {
    return UTF8View(self._core)
  }

  public var _contiguousUTF8: UnsafeMutablePointer<UTF8.CodeUnit> {
    return _core.elementWidth == 1 ? _core.startASCII : nil
  }

  /// A contiguously-stored nul-terminated UTF-8 representation of
  /// `self`.
  ///
  /// To access the underlying memory, invoke
  /// `withUnsafeBufferPointer` on the `ContiguousArray`.
  public var nulTerminatedUTF8: ContiguousArray<UTF8.CodeUnit> {
    var result = ContiguousArray<UTF8.CodeUnit>()
    result.reserveCapacity(count(utf8) + 1)
    result += utf8
    result.append(0)
    return result
  }

  public typealias UTF8Index = UTF8View.Index
}

public
func == (lhs: String.UTF8View.Index, rhs: String.UTF8View.Index) -> Bool {
  // If the underlying UTF16 index differs, they're unequal
  if lhs._coreIndex != rhs._coreIndex {
    return false
  }

  // Match up bytes in the buffer
  var buffer = (lhs._buffer, rhs._buffer)
  var isContinuation: Bool
  do {
    let unit = (
      UTF8.CodeUnit(truncatingBitPattern: buffer.0),
      UTF8.CodeUnit(truncatingBitPattern: buffer.1))

    isContinuation = UTF8.isContinuation(unit.0)
    if !isContinuation {
      // We don't check for unit equality in this case because one of
      // the units might be an 0xFF read from the end of the buffer.
      return !UTF8.isContinuation(unit.1)
    }
    // Continuation bytes must match exactly
    else if unit.0 != unit.1 {
      return false
    }

    // Move the buffers along.
    buffer = (
      String.UTF8Index._nextBuffer(buffer.0),
      String.UTF8Index._nextBuffer(buffer.1))
  }
  while true
}

extension String.UTF8Index {
  internal init(_ core: _StringCore, _utf16Offset: Int) {
      let (_, buffer) = core._encodeSomeUTF8(_utf16Offset)
      self.init(core, _utf16Offset, buffer)
  }

  public init?(_ sourceIndex: String.UTF16Index, within utf8: String.UTF8View) {
    let sourceView = String.UTF16View(utf8._core)

    if sourceIndex != sourceView.startIndex
    && sourceIndex != sourceView.endIndex {
      _precondition(
        sourceIndex >= sourceView.startIndex
        && sourceIndex <= sourceView.endIndex,
        "Invalid String.UTF16Index for this UTF-8 view")

      // Detect positions that have no corresponding index.  Note that
      // we have to check before and after, because an unpaired
      // surrogate will be decoded as a single replacement character,
      // thus making the corresponding position valid in UTF8.
      if UTF16.isTrailSurrogate(sourceView[sourceIndex])
        && UTF16.isLeadSurrogate(sourceView[sourceIndex.predecessor()]) {
        return nil
      }
    }
    self.init(utf8._core, _utf16Offset: sourceIndex._offset)
  }

  public init(
    _ sourceIndex: String.UnicodeScalarIndex, within utf8: String.UTF8View) {
    self.init(utf8._core, _utf16Offset: sourceIndex._position)
  }

  public init(_ sourceIndex: String.Index, within utf8: String.UTF8View) {
    self.init(utf8._core, _utf16Offset: sourceIndex._base._position)
  }
}