Files
swift-mirror/stdlib/core/StringUnicodeScalarView.swift
Dmitri Hrybenko f370ca0746 stdlib: fix a bunch of various Unicode issues, primarily in UTF-8 decoding
In UTF-8 decoder:
- implement U+FFFD insertion according to the recommendation given in the
  Unicode spec.  This required changing the decoder to become stateful, which
  significantly increased complexity due to the need to maintain an internal
  buffer.
- reject invalid code unit sequences properly instead of crashing rdar://16767868
- reject overlong sequences rdar://16767911

In stdlib:
- change APIs that assume that UTF decoding can never fail to account for
  possibility of errors
- fix a bug in UnicodeScalarView that could cause a crash during backward
  iteration if U+8000 is present in the string
- allow noncharacters in UnicodeScalar.  They are explicitly allowed in the
  definition of "Unicode scalar" in the specification.  Disallowing noncharacters
  in UnicodeScalar prevents actually using these scalar values as internal
  special values during string processing, which is exactly the reason why they
  are reserved in the first place.
- fix a crash in String.fromCString() that could happen if it was passed a null
  pointer

In Lexer:
- allow noncharacters in string literals.  These Unicode scalar values are not
  allowed to be exchanged externally, but it is totally reasonable to have them
  in literals as long as they don't escape the program.  For example, using
  U+FFFF as a delimiter and then calling str.split("\uffff") is completely
  reasonable.

This is a lot of changes in a single commit; the primary reason why they are
lumped together is the need to change stdlib APIs to account for the
possibility of UTF decoding failure, and this has long-reaching effects
throughout stdlib where these APIs are used.


Swift SVN r19045
2014-06-20 13:07:40 +00:00

218 lines
6.1 KiB
Swift

//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See http://swift.org/LICENSE.txt for license information
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
/// Returns `true` when both indices sit at the same code unit offset.
///
/// Only the `_position` fields are compared; the `_base` cores the indices
/// were created from are not consulted.
func ==(
  lhs: String.UnicodeScalarView.IndexType,
  rhs: String.UnicodeScalarView.IndexType
) -> Bool {
  let lhsOffset = lhs._position
  let rhsOffset = rhs._position
  return lhsOffset == rhsOffset
}
extension String {
struct UnicodeScalarView : Sliceable, Sequence {
/// Creates a view over the code units stored in `core`.
init(_ core: _StringCore) {
  _base = core
}
/// A generator over the UTF-16 code units of a `_StringCore` that starts
/// at an arbitrary position and exposes how far it has advanced.
///
/// `idx` is read back by callers after decoding (see `IndexType.succ()`)
/// to learn where the next scalar begins.
struct ScratchGenerator : Generator {
  /// The core whose code units are produced.
  var base :_StringCore
  /// Position of the next code unit that `next()` will return.
  var idx : Int

  /// Creates a generator over `core` that starts producing at `pos`.
  init(_ core : _StringCore, _ pos : Int) {
    idx = pos
    base = core
  }

  /// Returns the code unit at `idx` and advances, or `.None` once the end
  /// of `base` has been reached.
  mutating func next() -> UTF16.CodeUnit? {
    // Report exhaustion instead of indexing past the end of the core.
    // Without this check the decoder could never observe the end of input:
    // decoding at `endIndex`, or decoding a lead surrogate that is the
    // last code unit, would read out of bounds.
    if idx >= base.endIndex {
      return .None
    }
    return self.base[idx++]
  }
}
/// The position of a Unicode scalar within the view, stored as an offset
/// into the UTF-16 code units of `_base`.
///
/// The index is bidirectional: `succ()` decodes forward from `_position`,
/// while `pred()` inspects the code units immediately before it.
struct IndexType : BidirectionalIndex {
  /// Creates an index at code unit offset `_position` within `_base`.
  init(_ _position: Int, _ _base: _StringCore) {
    self._position = _position
    self._base = _base
  }

  /// Returns the index of the scalar that follows this one.
  func succ() -> IndexType {
    var scratch = ScratchGenerator(_base, _position)
    // The decoded value is not needed here; decoding is only used to move
    // `scratch.idx` past the code units consumed for the current scalar.
    UTF16.decode(&scratch)
    return IndexType(scratch.idx, _base)
  }

  /// Returns the index of the scalar that precedes this one.
  ///
  /// Steps back one code unit, and a second one only when the two code
  /// units form a lead/trail surrogate pair.
  func pred() -> IndexType {
    var i = _position
    let codeUnit = self._base[--i]
    // FIXME: consider adding:
    // assert(!(codeUnit >= 0xD800 && codeUnit <= 0xDBFF), "unpaired surrogates are ill-formed")
    if codeUnit >= 0xDC00 && codeUnit <= 0xDFFF {
      // A trail surrogate is the second half of a scalar only when a lead
      // surrogate sits directly before it.  Checking this keeps an unpaired
      // trail surrogate from moving the index below `startIndex` or from
      // swallowing an unrelated preceding scalar.
      if i > _base.startIndex {
        let previous = self._base[i - 1]
        if previous >= 0xD800 && previous <= 0xDBFF {
          --i
        }
      }
    }
    return IndexType(i, _base)
  }

  /// Offset of this index in code units of `_base`.
  var _position: Int
  /// The core this index refers into.
  var _base: _StringCore
}
/// The index positioned at the first code unit of the underlying core.
var startIndex: IndexType {
  let firstOffset = _base.startIndex
  return IndexType(firstOffset, _base)
}
/// The index positioned at the end offset of the underlying core.
var endIndex: IndexType {
  let endOffset = _base.endIndex
  return IndexType(endOffset, _base)
}
/// Decodes and returns the Unicode scalar that starts at index `i`.
///
/// Traps when the decoder reports empty input (the message attributes this
/// to subscripting with `endIndex`) or a decoding error.
subscript(i: IndexType) -> UnicodeScalar {
  var units = ScratchGenerator(_base, i._position)
  let decoded = UTF16.decode(&units)
  switch decoded {
  case .EmptyInput:
    _fatalError("can not subscript using an endIndex")
  case .Error:
    _fatalError("unpaired surrogates are ill-formed in UTF-16")
  case .Result(let scalar):
    return scalar
  }
}
/// Returns a view over the code units between the positions of `start`
/// (inclusive) and `end` (exclusive).
func __slice__(start: IndexType, end: IndexType) -> UnicodeScalarView {
  let slicedCore = _base[start._position..<end._position]
  return UnicodeScalarView(slicedCore)
}
/// Returns a view over the code units covered by the index range `r`.
subscript(r: Range<IndexType>) -> UnicodeScalarView {
  let lowerOffset = r.startIndex._position
  let upperOffset = r.endIndex._position
  return UnicodeScalarView(_base[lowerOffset..<upperOffset])
}
/// A generator that produces Unicode scalars by decoding the code units
/// supplied by a `_StringCore.GeneratorType`.
struct GeneratorType : Generator {
  /// Wraps `codeUnits`, the generator the scalars are decoded from.
  init(_ codeUnits: _StringCore.GeneratorType) {
    _base = codeUnits
  }

  /// Returns the next decoded scalar, or `.None` when the input is
  /// exhausted.  Traps if the decoder reports an error.
  mutating func next() -> UnicodeScalar? {
    let decoded = UTF16.decode(&_base)
    switch decoded {
    case .EmptyInput:
      return .None
    case .Error:
      _fatalError("unpaired surrogates are ill-formed in UTF-16")
    case .Result(let scalar):
      return scalar
    }
  }

  /// The code unit generator being decoded.
  var _base: _StringCore.GeneratorType
}
/// Returns a generator over the Unicode scalars of this view.
func generate() -> GeneratorType {
  let codeUnits = _base.generate()
  return GeneratorType(codeUnits)
}
/// Converts the view to a `String` built from the same core.
@conversion
func __conversion() -> String {
  let converted = String(_base)
  return converted
}
/// Three-way comparison of two views.
///
/// Returns `-1` when `self` orders before `other`, `1` when it orders
/// after, and `0` when the two compare equal.  A view that is a strict
/// prefix of the other orders first.
func compare(other : UnicodeScalarView) -> Int {
  // The fast path below reads code units directly, so it requires both
  // cores to have contiguous storage with an element shift of 1.  Anything
  // else goes through the decoding comparison.
  // TODO: when we fix rdar://16740011 we'll need to optimize the ascii
  // path.
  let needsDecoding =
    (self._base.elementShift != 1) |
    (other._base.elementShift != 1) |
    (!self._base.hasContiguousStorage) |
    (!other._base.hasContiguousStorage)
  if needsDecoding {
    // The call is wrapped in a closure to disable inlining: this is a cold
    // path, and inlining _compareUnicode may prevent the inlining of
    // compare itself.
    return {$0._compareUnicode($1)}(self, other)
  }

  let lhsEnd = self._base.endIndex
  let rhsEnd = other._base.endIndex
  var lhsPos = self._base.startIndex
  var rhsPos = other._base.startIndex

  // Walk both strings in lock step without decoding.
  while (lhsPos < lhsEnd) && (rhsPos < rhsEnd) {
    let lhsUnit = self._base._nthContiguous(lhsPos)
    let rhsUnit = other._base._nthContiguous(rhsPos)

    // 0xD800 ... 0xDFFF is reserved for lead and trail surrogates.  Only
    // the lower bound is tested, which also sends the code units above the
    // surrogate range to the slow path; that is conservative but correct,
    // because the slow path handles surrogates properly.
    if _slowPath((lhsUnit >= 0xD800) | (rhsUnit >= 0xD800)) {
      // Cold path again: keep the callee out of line via a closure.
      return {$0._compareUnicode($1)}(self, other)
    }

    if lhsUnit < rhsUnit {
      return -1
    } else if rhsUnit < lhsUnit {
      return 1
    }

    // Equivalent so far; move on to the next pair.
    ++lhsPos
    ++rhsPos
  }

  // At least one side is exhausted; the side with code units left over
  // orders after the other.
  if lhsPos < lhsEnd {
    return 1
  }
  if rhsPos < rhsEnd {
    return -1
  }
  return 0
}
/// Three-way comparison performed scalar by scalar on the decoded contents
/// of both views.
///
/// Returns `-1`, `0` or `1` with the same meaning as `compare`.
func _compareUnicode(other : UnicodeScalarView) -> Int {
  var lhsScalars = self.generate()
  var rhsScalars = other.generate()
  while true {
    // Both generators are advanced on every iteration, left side first.
    let lhsNext = lhsScalars.next()
    let rhsNext = rhsScalars.next()
    if let lhsScalar = lhsNext {
      if let rhsScalar = rhsNext {
        if lhsScalar < rhsScalar {
          return -1
        } else if rhsScalar < lhsScalar {
          return 1
        }
        // Equivalent scalars: fall through to the next iteration.
      } else {
        // `other` ran out first, so `self` is longer.
        return 1
      }
    } else if rhsNext {
      // `self` ran out first, so `other` is longer.
      return -1
    } else {
      // Both ran out together with no difference found.
      return 0
    }
  }
}
// The string core whose code units this view decodes (via `UTF16.decode`)
// into Unicode scalars.
var _base: _StringCore
}
}
extension String {
  /// Three-way comparison of the Unicode scalars of `self` and `other`;
  /// forwards to `UnicodeScalarView.compare`.
  func compare(other : String) -> Int {
    let lhsScalars = unicodeScalars
    let rhsScalars = other.unicodeScalars
    return lhsScalars.compare(rhsScalars)
  }

  /// A view of this string's contents as Unicode scalars.
  var unicodeScalars : UnicodeScalarView {
    let view = UnicodeScalarView(core)
    return view
  }
}