[stdlib] String initialization with encoding and CString interop

This commit is contained in:
Dave Abrahams
2017-05-08 19:50:38 -07:00
parent cb5b5bad7c
commit fd8cfea3ac
5 changed files with 284 additions and 40 deletions

View File

@@ -42,6 +42,45 @@ public protocol StringProtocol
func lowercased() -> String func lowercased() -> String
func uppercased() -> String func uppercased() -> String
/// Constructs an instance of the conforming type having the same contents as
/// `codeUnits`.
///
/// - Parameter codeUnits: a collection of code units in
/// the given `encoding`.
/// - Parameter encoding: describes the encoding in which the code units
/// should be interpreted.
init<C: Collection, Encoding: UnicodeEncoding>(
codeUnits: C, encoding: Encoding.Type
)
where C.Iterator.Element == Encoding.CodeUnit
/// Constructs an instance of the conforming type having the same contents as
/// `nulTerminatedUTF8`.
///
/// - Parameter nulTerminatedUTF8: a pointer to a sequence of contiguous UTF-8
/// encoded bytes ending just before the first zero byte (NUL character).
init(cString nulTerminatedUTF8: UnsafePointer<CChar>)
/// Constructs an instance of the conforming type having the same contents as
/// `nulTerminatedCodeUnits`.
///
/// - Parameter nulTerminatedCodeUnits: a pointer to a sequence of contiguous
/// code units in the given `encoding`, ending just before the first zero code
/// unit.
/// - Parameter encoding: describes the encoding in which the code units
/// should be interpreted.
init<Encoding: UnicodeEncoding>(
cString nulTerminatedCodeUnits: UnsafePointer<Encoding.CodeUnit>,
encoding: Encoding.Type)
/// Invokes the given closure on the contents of the string, represented as a
/// pointer to a null-terminated sequence of UTF-8 code units.
///
/// - Parameter body: a closure that receives the pointer. Any error it throws
/// is rethrown to the caller.
/// - Returns: the value returned by `body`.
func withCString<Result>(
_ body: (UnsafePointer<CChar>) throws -> Result) rethrows -> Result
/// Invokes the given closure on the contents of the string, represented as a
/// pointer to a null-terminated sequence of code units in the given encoding.
///
/// - Parameter encoding: the encoding in which the code units handed to
/// `body` are expressed.
/// - Parameter body: a closure that receives the pointer. Any error it throws
/// is rethrown to the caller.
/// - Returns: the value returned by `body`.
func withCString<Result, Encoding: UnicodeEncoding>(
encoding: Encoding.Type,
_ body: (UnsafePointer<Encoding.CodeUnit>) throws -> Result
) rethrows -> Result
} }
extension StringProtocol { extension StringProtocol {
@@ -52,7 +91,141 @@ extension StringProtocol {
} }
} }
// FIXME: complexity documentation for most of methods on String is ought to be /// Call body with a pointer to zero-terminated sequence of
/// `TargetEncoding.CodeUnit` representing the same string as `source`, when
/// `source` is interpreted as being encoded with `SourceEncoding`.
internal func _withCString<
  Source : Collection,
  SourceEncoding : UnicodeEncoding,
  TargetEncoding : UnicodeEncoding,
  Result
>(
  encodedAs targetEncoding: TargetEncoding.Type,
  from source: Source,
  encodedAs sourceEncoding: SourceEncoding.Type,
  execute body : (UnsafePointer<TargetEncoding.CodeUnit>) throws -> Result
) rethrows -> Result
where Source.Iterator.Element == SourceEncoding.CodeUnit {
  // Forward to the length-reporting variant and drop the length, which this
  // entry point's `body` does not take.
  return try _withCStringAndLength(
    encodedAs: targetEncoding,
    from: source,
    encodedAs: sourceEncoding,
    execute: { start, _ in
      try body(start)
    })
}
/// Calls `body` with a pointer to a zero-terminated sequence of
/// `TargetEncoding.CodeUnit` representing the same string as `source` (when
/// `source` is interpreted as being encoded with `SourceEncoding`), together
/// with the number of code units that precede the terminator.
internal func _withCStringAndLength<
  Source : Collection,
  SourceEncoding : UnicodeEncoding,
  TargetEncoding : UnicodeEncoding,
  Result
>(
  encodedAs targetEncoding: TargetEncoding.Type,
  from source: Source,
  encodedAs sourceEncoding: SourceEncoding.Type,
  execute body : (UnsafePointer<TargetEncoding.CodeUnit>, Int) throws -> Result
) rethrows -> Result
where Source.Iterator.Element == SourceEncoding.CodeUnit {
  // Pass 1: measure the transcoded content. The terminator is NOT counted
  // here; room for it is added when reserving storage below.
  var transcodedCount = 0
  var measuring = source.makeIterator()
  SourceEncoding.ForwardParser.parse(&measuring) { scalar in
    let encoded = targetEncoding.transcode(scalar, from: SourceEncoding.self)
    transcodedCount += numericCast(encoded.count)
  }

  // Pass 2: transcode into storage sized for the content plus terminator.
  var storage = [TargetEncoding.CodeUnit]()
  storage.reserveCapacity(transcodedCount + 1)
  var copying = source.makeIterator()
  SourceEncoding.ForwardParser.parse(&copying) { scalar in
    storage.append(
      contentsOf: targetEncoding.transcode(scalar, from: SourceEncoding.self))
  }
  storage.append(0)

  return try body(storage, transcodedCount)
}
extension _StringCore {
  /// Invokes `body` on a null-terminated sequence of code units in the given
  /// encoding corresponding to the substring in `bounds`.
  internal func _withCSubstring<Result, TargetEncoding: UnicodeEncoding>(
    in bounds: Range<Index>,
    encoding targetEncoding: TargetEncoding.Type,
    _ body: (UnsafePointer<TargetEncoding.CodeUnit>) throws -> Result
  ) rethrows -> Result {
    // Same as the length-reporting variant, with the length discarded.
    return try _withCSubstringAndLength(
      in: bounds, encoding: targetEncoding
    ) { start, _ in
      try body(start)
    }
  }

  /// Invokes `body` on a null-terminated sequence of code units in the given
  /// encoding corresponding to the substring in `bounds`, also passing the
  /// length value reported by `_withCStringAndLength`.
  internal func _withCSubstringAndLength<
    Result, TargetEncoding: UnicodeEncoding
  >(
    in bounds: Range<Index>,
    encoding targetEncoding: TargetEncoding.Type,
    _ body: (UnsafePointer<TargetEncoding.CodeUnit>, Int) throws -> Result
  ) rethrows -> Result {
    // Without contiguous storage, go through this core's own subscript and
    // treat the elements as UTF-16.
    guard _fastPath(hasContiguousStorage) else {
      return try Swift._withCStringAndLength(
        encodedAs: targetEncoding,
        from: self[bounds],
        encodedAs: _Unicode.UTF16.self,
        execute: body
      )
    }

    // The buffers below are built from raw start pointers, so `self` is kept
    // alive until the call has returned.
    defer { _fixLifetime(self) }

    if isASCII {
      let asciiUnits = UnsafeBufferPointer(start: startASCII, count: count)
      return try Swift._withCStringAndLength(
        encodedAs: targetEncoding,
        from: asciiUnits[bounds],
        encodedAs: _Unicode.ASCII.self,
        execute: body
      )
    }

    let utf16Units = UnsafeBufferPointer(start: startUTF16, count: count)
    return try Swift._withCStringAndLength(
      encodedAs: targetEncoding,
      from: utf16Units[bounds],
      encodedAs: _Unicode.UTF16.self,
      execute: body
    )
  }
}
extension String {
  /// Constructs a `String` having the same contents as `codeUnits`.
  ///
  /// The underlying buffer is requested with `repairIllFormedSequences: true`.
  ///
  /// - Parameter codeUnits: a collection of code units in the given
  ///   `encoding`.
  /// - Parameter encoding: describes the encoding in which the code units
  ///   should be interpreted.
  public init<C: Collection, Encoding: UnicodeEncoding>(
    codeUnits: C, encoding: Encoding.Type
  ) where C.Iterator.Element == Encoding.CodeUnit {
    let built = _StringBuffer.fromCodeUnits(
      codeUnits, encoding: encoding, repairIllFormedSequences: true)
    // NOTE(review): the unwrap presumes a buffer is always produced when
    // repair is requested — confirm against `fromCodeUnits`.
    self = String(_StringCore(built.0!))
  }

  /// Constructs a `String` having the same contents as
  /// `nulTerminatedCodeUnits`.
  ///
  /// - Parameter nulTerminatedCodeUnits: a sequence of contiguous code units
  ///   in the given `encoding`, ending just before the first zero code unit.
  /// - Parameter encoding: describes the encoding in which the code units
  ///   should be interpreted.
  public init<Encoding: UnicodeEncoding>(
    cString nulTerminatedCodeUnits: UnsafePointer<Encoding.CodeUnit>,
    encoding: Encoding.Type) {
    // View the memory as an unbounded buffer that is cut off by the
    // `_IsZero` sentinel predicate.
    self.init(
      codeUnits: _SentinelCollection(
        UnsafeBufferPointer(_unboundedStartingAt: nulTerminatedCodeUnits),
        until: _IsZero()
      ),
      encoding: encoding)
  }

  /// Invokes the given closure on the contents of the string, represented as a
  /// pointer to a null-terminated sequence of code units in the given encoding.
  ///
  /// - Parameter targetEncoding: the encoding of the code units handed to
  ///   `body`.
  /// - Parameter body: a closure that receives the pointer; any error it
  ///   throws is rethrown.
  /// - Returns: the value returned by `body`.
  public func withCString<Result, TargetEncoding: UnicodeEncoding>(
    encoding targetEncoding: TargetEncoding.Type,
    _ body: (UnsafePointer<TargetEncoding.CodeUnit>) throws -> Result
  ) rethrows -> Result {
    let entireCore = _core.startIndex..<_core.endIndex
    return try _core._withCSubstring(
      in: entireCore, encoding: targetEncoding, body)
  }
}
// FIXME: complexity documentation for most of methods on String ought to be
// qualified with "amortized" at least, as Characters are variable-length. // qualified with "amortized" at least, as Characters are variable-length.
/// A Unicode string value. /// A Unicode string value.

View File

@@ -387,7 +387,7 @@ public struct _StringCore {
} }
} }
else if let content = _unmanagedUTF16 { else if let content = _unmanagedUTF16 {
var i = content.makeIterator() var i = content.makeIterator()
_Unicode.UTF16.ForwardParser.parse(&i) { _Unicode.UTF16.ForwardParser.parse(&i) {
Encoding.transcode($0, from: UTF16.self).forEach(processCodeUnit) Encoding.transcode($0, from: UTF16.self).forEach(processCodeUnit)
} }

View File

@@ -32,7 +32,9 @@ public struct Substring : StringProtocol {
_slice = RangeReplaceableBidirectionalSlice(base: base, bounds: bounds) _slice = RangeReplaceableBidirectionalSlice(base: base, bounds: bounds)
} }
internal init<R: RangeExpression>(_base base: String, _ bounds: R) where R.Bound == Index { internal init<R: RangeExpression>(
_base base: String, _ bounds: R
) where R.Bound == Index {
self.init(_base: base, bounds.relative(to: base)) self.init(_base: base, bounds.relative(to: base))
} }
@@ -99,6 +101,52 @@ public struct Substring : StringProtocol {
} }
% end % end
/// Constructs a `Substring` having the same contents as `codeUnits`.
///
/// - Parameter codeUnits: a collection of code units in the given `encoding`.
/// - Parameter encoding: describes the encoding in which the code units
///   should be interpreted.
public init<C: Collection, Encoding: UnicodeEncoding>(
  codeUnits: C, encoding: Encoding.Type
) where C.Iterator.Element == Encoding.CodeUnit {
  // Build a `String` first, then wrap it.
  let whole = String(codeUnits: codeUnits, encoding: encoding)
  self.init(whole)
}
/// Constructs a `Substring` having the same contents as `nulTerminatedUTF8`.
///
/// - Parameter nulTerminatedUTF8: a pointer handed unchanged to
///   `String(cString:)`.
public init(cString nulTerminatedUTF8: UnsafePointer<CChar>) {
  // Build a `String` first, then wrap it.
  let whole = String(cString: nulTerminatedUTF8)
  self.init(whole)
}
/// Constructs a `Substring` having the same contents as
/// `nulTerminatedCodeUnits`.
///
/// - Parameter nulTerminatedCodeUnits: a sequence of contiguous code units in
///   the given `encoding`, ending just before the first zero code unit.
/// - Parameter encoding: describes the encoding in which the code units
///   should be interpreted.
public init<Encoding: UnicodeEncoding>(
  cString nulTerminatedCodeUnits: UnsafePointer<Encoding.CodeUnit>,
  encoding: Encoding.Type) {
  // Build a `String` first, then wrap it.
  let whole = String(cString: nulTerminatedCodeUnits, encoding: encoding)
  self.init(whole)
}
/// Invokes the given closure on the contents of the substring, represented as
/// a pointer to a null-terminated sequence of UTF-8 code units.
///
/// - Parameter body: a closure that receives the pointer; any error it throws
///   is rethrown.
/// - Returns: the value returned by `body`.
public func withCString<Result>(
  _ body: (UnsafePointer<CChar>) throws -> Result) rethrows -> Result {
  return try _slice._base._core._withCSubstringAndLength(
    in: startIndex._base._position..<endIndex._base._position,
    encoding: UTF8.self) {
    p, length in
    // `length` counts only the code units before the terminator. Rebind one
    // extra element so the trailing NUL that `body` reads through the
    // `CChar` pointer is covered by the rebinding as well.
    try p.withMemoryRebound(to: CChar.self, capacity: length + 1) {
      try body($0)
    }
  }
}
/// Invokes the given closure on the contents of the substring, represented as
/// a pointer to a null-terminated sequence of code units in the given
/// encoding.
///
/// - Parameter targetEncoding: the encoding of the code units handed to
///   `body`.
/// - Parameter body: a closure that receives the pointer; any error it throws
///   is rethrown.
/// - Returns: the value returned by `body`.
public func withCString<Result, TargetEncoding: UnicodeEncoding>(
  encoding targetEncoding: TargetEncoding.Type,
  _ body: (UnsafePointer<TargetEncoding.CodeUnit>) throws -> Result
) rethrows -> Result {
  // Express this substring's bounds as positions in the base string's core.
  let lower = startIndex._base._position
  let upper = endIndex._base._position
  return try _slice._base._core._withCSubstring(
    in: lower..<upper, encoding: targetEncoding, body)
}
} }
@@ -127,8 +175,8 @@ extension Substring : CustomDebugStringConvertible {
} }
extension Substring : LosslessStringConvertible { extension Substring : LosslessStringConvertible {
public init?(_ description: String) { public init(_ content: String) {
self.init(_base: description, description.startIndex ..< description.endIndex) self.init(_base: content, content.startIndex ..< content.endIndex)
} }
} }

View File

@@ -62,13 +62,15 @@ extension _UnicodeEncoding {
} }
/// Converts from encoding-independent to encoded representation, returning /// Converts from encoding-independent to encoded representation, returning
/// `nil` if the scalar can't be represented in this encoding. /// `encodedReplacementCharacter` if the scalar can't be represented in this
/// encoding.
public static func encode(_ content: UnicodeScalar) -> EncodedScalar { public static func encode(_ content: UnicodeScalar) -> EncodedScalar {
return encodeIfRepresentable(content) ?? encodedReplacementCharacter return encodeIfRepresentable(content) ?? encodedReplacementCharacter
} }
/// Converts a scalar from another encoding's representation, returning /// Converts a scalar from another encoding's representation, returning
/// `nil` if the scalar can't be represented in this encoding. /// `encodedReplacementCharacter` if the scalar can't be represented in this
/// encoding.
public static func transcode<FromEncoding : UnicodeEncoding>( public static func transcode<FromEncoding : UnicodeEncoding>(
_ content: FromEncoding.EncodedScalar, from _: FromEncoding.Type _ content: FromEncoding.EncodedScalar, from _: FromEncoding.Type
) -> EncodedScalar { ) -> EncodedScalar {

View File

@@ -148,37 +148,6 @@ extension _Unicode.DefaultScalarView : Collection {
} }
} }
/// An iterator that steps backward through a bidirectional collection from a
/// given position; it can be much faster than the iterator of a reversed
/// slice.
// TODO: See about using this in more places
@_fixed_layout
public struct _ReverseIndexingIterator<
  Elements : BidirectionalCollection
> : IteratorProtocol, Sequence {

  // Stored properties keep their original relative order: the type is
  // `@_fixed_layout`.
  @_versioned
  internal let _elements: Elements

  @_versioned
  internal var _position: Elements.Index

  /// Creates an iterator over the given collection that yields the elements
  /// preceding `_position`, last to first.
  @_inlineable
  @inline(__always)
  public /// @testable
  init(_elements: Elements, _position: Elements.Index) {
    self._elements = _elements
    self._position = _position
  }

  /// Steps one position toward the start and returns the element found there,
  /// or `nil` once the start of the collection has been reached.
  @_inlineable
  @inline(__always)
  public mutating func next() -> Elements._Element? {
    if _fastPath(_position != _elements.startIndex) {
      _position = _elements.index(before: _position)
      return _elements[_position]
    }
    return nil
  }
}
extension _Unicode.DefaultScalarView : BidirectionalCollection { extension _Unicode.DefaultScalarView : BidirectionalCollection {
@inline(__always) @inline(__always)
public func index(before i: Index) -> Index { public func index(before i: Index) -> Index {
@@ -210,9 +179,47 @@ extension _Unicode.DefaultScalarView : BidirectionalCollection {
import StdlibUnittest import StdlibUnittest
import SwiftPrivate import SwiftPrivate
/// Returns the numeric values of the Unicode scalars of `s`, in order.
func utf32<S : StringProtocol>(_ s: S) -> [UInt32] {
  var values: [UInt32] = []
  for scalar in s.unicodeScalars {
    values.append(scalar.value)
  }
  return values
}
/// Checks `s` against the expected scalar values and, when `utfStr` holds no
/// zero code unit, exercises the C-string initializers and `withCString`
/// round trips of `S`.
///
/// - Parameter s: the string under test, expected to have been built from
///   `utfStr`.
/// - Parameter utfStr: the code units, in `Encoding`, that `s` came from.
/// - Parameter encodedAs: the encoding of `utfStr`.
/// - Parameter expected: the Unicode scalar values `s` should contain.
func checkStringProtocol<S : StringProtocol, Encoding: UnicodeEncoding>(
  _ s: S,
  _ utfStr: [Encoding.CodeUnit],
  encodedAs: Encoding.Type,
  expectingUTF32 expected: [UInt32]
) {
  expectEqualSequence(
    expected, utf32(s), "\(S.self) init(codeUnits:encoding:)")

  // A zero code unit would end a C string early, so the remaining checks
  // only run on input without one.
  guard !utfStr.contains(0) else { return }

  if Encoding.self == UTF8.self {
    var utf8CString = utfStr.map { CChar(extendingOrTruncating: $0) }
    utf8CString.append(0)
    expectEqualSequence(
      expected, utf32(S(cString: utf8CString)), "\(S.self) init(cString:)")
  }

  var terminated = utfStr
  terminated.append(0)
  expectEqualSequence(
    expected, utf32(S(cString: terminated, encoding: Encoding.self)),
    "\(S.self) init(cString:encoding:)"
  )

  s.withCString { utf8Pointer in
    expectEqual(s, S(cString: utf8Pointer), "\(S.self) withCString(_:)")
  }

  s.withCString(encoding: Encoding.self) { encodedPointer in
    expectEqual(s, S(cString: encodedPointer, encoding: Encoding.self),
      "\(S.self) withCString(encoding:_:)")
  }
}
func checkDecodeUTF<Codec : UnicodeCodec & UnicodeEncoding>( func checkDecodeUTF<Codec : UnicodeCodec & UnicodeEncoding>(
_ codec: Codec.Type, _ expectedHead: [UInt32], _ codec: Codec.Type, _ expectedHead: [UInt32],
_ expectedRepairedTail: [UInt32], _ utfStr: [Codec.CodeUnit] _ expectedRepairedTail: [UInt32], _ utfStr: [Codec.CodeUnit]
) -> AssertionResult { ) -> AssertionResult {
var decoded = [UInt32]() var decoded = [UInt32]()
var expected = expectedHead var expected = expectedHead
@@ -303,6 +310,20 @@ func checkDecodeUTF<Codec : UnicodeCodec & UnicodeEncoding>(
else { expectNotEqual(0, errorCount) } else { expectNotEqual(0, errorCount) }
} }
check(expected.reversed(), "reverse, repairing: true") check(expected.reversed(), "reverse, repairing: true")
//===--- String/Substring Construction and C-String interop -------------===//
do {
let s = String(codeUnits: utfStr, encoding: Codec.self)
checkStringProtocol(
s, utfStr, encodedAs: Codec.self, expectingUTF32: expected)
}
do {
let s0 = "\n" + String(codeUnits: utfStr, encoding: Codec.self) + "\n"
checkStringProtocol(
s0.dropFirst().dropLast(),
utfStr, encodedAs: Codec.self, expectingUTF32: expected)
}
//===--- Transcoded Scalars ---------------------------------------------===// //===--- Transcoded Scalars ---------------------------------------------===//
for x in decoded.lazy.map({ UnicodeScalar($0)! }) { for x in decoded.lazy.map({ UnicodeScalar($0)! }) {