mirror of
https://github.com/apple/swift.git
synced 2025-12-21 12:14:44 +01:00
[stdlib] String initialization with encoding and CString interop
This commit is contained in:
@@ -42,6 +42,45 @@ public protocol StringProtocol
|
|||||||
|
|
||||||
func lowercased() -> String
|
func lowercased() -> String
|
||||||
func uppercased() -> String
|
func uppercased() -> String
|
||||||
|
|
||||||
|
/// Constructs a `String` having the same contents as `codeUnits`.
|
||||||
|
///
|
||||||
|
/// - Parameter codeUnits: a collection of code units in
|
||||||
|
/// the given `encoding`.
|
||||||
|
/// - Parameter encoding: describes the encoding in which the code units
|
||||||
|
/// should be interpreted.
|
||||||
|
init<C: Collection, Encoding: UnicodeEncoding>(
|
||||||
|
codeUnits: C, encoding: Encoding.Type
|
||||||
|
)
|
||||||
|
where C.Iterator.Element == Encoding.CodeUnit
|
||||||
|
|
||||||
|
/// Constructs a `String` having the same contents as `nulTerminatedUTF8`.
|
||||||
|
///
|
||||||
|
/// - Parameter nulTerminatedUTF8: a sequence of contiguous UTF-8 encoded
|
||||||
|
/// bytes ending just before the first zero byte (NUL character).
|
||||||
|
init(cString nulTerminatedUTF8: UnsafePointer<CChar>)
|
||||||
|
|
||||||
|
/// Constructs a `String` having the same contents as `nulTerminatedCodeUnits`.
|
||||||
|
///
|
||||||
|
/// - Parameter nulTerminatedCodeUnits: a sequence of contiguous code units in
|
||||||
|
/// the given `encoding`, ending just before the first zero code unit.
|
||||||
|
/// - Parameter encoding: describes the encoding in which the code units
|
||||||
|
/// should be interpreted.
|
||||||
|
init<Encoding: UnicodeEncoding>(
|
||||||
|
cString nulTerminatedCodeUnits: UnsafePointer<Encoding.CodeUnit>,
|
||||||
|
encoding: Encoding.Type)
|
||||||
|
|
||||||
|
/// Invokes the given closure on the contents of the string, represented as a
|
||||||
|
/// pointer to a null-terminated sequence of UTF-8 code units.
|
||||||
|
func withCString<Result>(
|
||||||
|
_ body: (UnsafePointer<CChar>) throws -> Result) rethrows -> Result
|
||||||
|
|
||||||
|
/// Invokes the given closure on the contents of the string, represented as a
|
||||||
|
/// pointer to a null-terminated sequence of code units in the given encoding.
|
||||||
|
func withCString<Result, Encoding: UnicodeEncoding>(
|
||||||
|
encoding: Encoding.Type,
|
||||||
|
_ body: (UnsafePointer<Encoding.CodeUnit>) throws -> Result
|
||||||
|
) rethrows -> Result
|
||||||
}
|
}
|
||||||
|
|
||||||
extension StringProtocol {
|
extension StringProtocol {
|
||||||
@@ -52,7 +91,141 @@ extension StringProtocol {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// FIXME: complexity documentation for most of methods on String is ought to be
|
/// Calls `body` with a pointer to a zero-terminated sequence of
|
||||||
|
/// `TargetEncoding.CodeUnit` representing the same string as `source`, when
|
||||||
|
/// `source` is interpreted as being encoded with `SourceEncoding`.
|
||||||
|
///
/// - Returns: the value returned by `body`; errors thrown by `body` are
///   rethrown.
internal func _withCString<
|
||||||
|
Source : Collection,
|
||||||
|
SourceEncoding : UnicodeEncoding,
|
||||||
|
TargetEncoding : UnicodeEncoding,
|
||||||
|
Result
|
||||||
|
>(
|
||||||
|
encodedAs targetEncoding: TargetEncoding.Type,
|
||||||
|
from source: Source,
|
||||||
|
encodedAs sourceEncoding: SourceEncoding.Type,
|
||||||
|
execute body : (UnsafePointer<TargetEncoding.CodeUnit>) throws -> Result
|
||||||
|
) rethrows -> Result
|
||||||
|
where Source.Iterator.Element == SourceEncoding.CodeUnit {
|
||||||
|
// Forward to the length-reporting variant and discard the length.
return try _withCStringAndLength(
|
||||||
|
encodedAs: targetEncoding,
|
||||||
|
from: source,
|
||||||
|
encodedAs: sourceEncoding) { p, _ in try body(p) }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Transcodes `source` from `SourceEncoding` to `TargetEncoding` into a
/// temporary zero-terminated array, then calls `body` with a pointer to that
/// array and the number of transcoded code units, not counting the
/// terminator.
///
/// `source` is traversed twice: once to measure the transcoded length so the
/// array's capacity can be reserved up front, and once to fill the array.
///
/// - Returns: the value returned by `body`; errors thrown by `body` are
///   rethrown.
internal func _withCStringAndLength<
|
||||||
|
Source : Collection,
|
||||||
|
SourceEncoding : UnicodeEncoding,
|
||||||
|
TargetEncoding : UnicodeEncoding,
|
||||||
|
Result
|
||||||
|
>(
|
||||||
|
encodedAs targetEncoding: TargetEncoding.Type,
|
||||||
|
from source: Source,
|
||||||
|
encodedAs sourceEncoding: SourceEncoding.Type,
|
||||||
|
execute body : (UnsafePointer<TargetEncoding.CodeUnit>, Int) throws -> Result
|
||||||
|
) rethrows -> Result
|
||||||
|
where Source.Iterator.Element == SourceEncoding.CodeUnit {
|
||||||
|
var targetLength = 0 // transcoded code units, excluding the nul terminator
|
||||||
|
var i = source.makeIterator()
|
||||||
|
// First pass: only count how many target code units each scalar needs.
SourceEncoding.ForwardParser.parse(&i) {
|
||||||
|
targetLength += numericCast(
|
||||||
|
targetEncoding.transcode($0, from: SourceEncoding.self).count)
|
||||||
|
}
|
||||||
|
var a: [TargetEncoding.CodeUnit] = []
|
||||||
|
// One extra slot for the terminator appended below.
a.reserveCapacity(targetLength + 1)
|
||||||
|
// Second pass: restart from the beginning and emit the transcoded units.
i = source.makeIterator()
|
||||||
|
SourceEncoding.ForwardParser.parse(&i) {
|
||||||
|
a.append(
|
||||||
|
contentsOf: targetEncoding.transcode($0, from: SourceEncoding.self))
|
||||||
|
}
|
||||||
|
a.append(0)
|
||||||
|
// The array is passed where an `UnsafePointer` is expected; the reported
// length does not include the terminator appended above.
return try body(a, targetLength)
|
||||||
|
}
|
||||||
|
|
||||||
|
extension _StringCore {
|
||||||
|
/// Invokes `body` on a null-terminated sequence of code units in the given
|
||||||
|
/// encoding corresponding to the substring in `bounds`.
|
||||||
|
internal func _withCSubstring<Result, TargetEncoding: UnicodeEncoding>(
|
||||||
|
in bounds: Range<Index>,
|
||||||
|
encoding targetEncoding: TargetEncoding.Type,
|
||||||
|
_ body: (UnsafePointer<TargetEncoding.CodeUnit>) throws -> Result
|
||||||
|
) rethrows -> Result {
|
||||||
|
// Forward to the length-reporting variant and discard the length.
return try _withCSubstringAndLength(in: bounds, encoding: targetEncoding) {
|
||||||
|
p,_ in try body(p)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Invokes `body` on a null-terminated sequence of code units in the given
/// encoding corresponding to the substring in `bounds`, together with a
/// length value supplied by `_withCStringAndLength`.
///
/// The source encoding is chosen from the storage: ASCII or UTF-16 for
/// contiguous storage, UTF-16 otherwise.
internal func _withCSubstringAndLength<
|
||||||
|
Result, TargetEncoding: UnicodeEncoding
|
||||||
|
>(
|
||||||
|
in bounds: Range<Index>,
|
||||||
|
encoding targetEncoding: TargetEncoding.Type,
|
||||||
|
_ body: (UnsafePointer<TargetEncoding.CodeUnit>, Int) throws -> Result
|
||||||
|
) rethrows -> Result {
|
||||||
|
// Contiguous storage: read straight out of the underlying buffer.
if _fastPath(hasContiguousStorage) {
|
||||||
|
// NOTE(review): presumably keeps the storage alive while the raw buffer
// pointers below are in use — confirm against `_fixLifetime`.
defer { _fixLifetime(self) }
|
||||||
|
if isASCII {
|
||||||
|
return try Swift._withCStringAndLength(
|
||||||
|
encodedAs: targetEncoding,
|
||||||
|
from: UnsafeBufferPointer(start: startASCII, count: count)[bounds],
|
||||||
|
encodedAs: _Unicode.ASCII.self,
|
||||||
|
execute: body
|
||||||
|
)
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return try Swift._withCStringAndLength(
|
||||||
|
encodedAs: targetEncoding,
|
||||||
|
from: UnsafeBufferPointer(start: startUTF16, count: count)[bounds],
|
||||||
|
encodedAs: _Unicode.UTF16.self,
|
||||||
|
execute: body
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Non-contiguous storage: slice the core itself and treat its elements as
// UTF-16 code units.
return try Swift._withCStringAndLength(
|
||||||
|
encodedAs: targetEncoding,
|
||||||
|
from: self[bounds],
|
||||||
|
encodedAs: _Unicode.UTF16.self,
|
||||||
|
execute: body
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
extension String {
|
||||||
|
/// Constructs a `String` having the same contents as `codeUnits`.
///
/// - Parameter codeUnits: a collection of code units in the given
///   `encoding`.
/// - Parameter encoding: describes the encoding in which the code units
///   should be interpreted.
public init<C: Collection, Encoding: UnicodeEncoding>(
|
||||||
|
codeUnits: C, encoding: Encoding.Type
|
||||||
|
) where C.Iterator.Element == Encoding.CodeUnit {
|
||||||
|
let (b,_) = _StringBuffer.fromCodeUnits(
|
||||||
|
codeUnits, encoding: encoding, repairIllFormedSequences: true)
|
||||||
|
// NOTE(review): the force unwrap assumes `fromCodeUnits` never returns a nil
// buffer when `repairIllFormedSequences` is true — confirm.
self = String(_StringCore(b!))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Constructs a `String` having the same contents as `nulTerminatedCodeUnits`.
|
||||||
|
///
|
||||||
|
/// - Parameter nulTerminatedCodeUnits: a sequence of contiguous code units in
|
||||||
|
/// the given `encoding`, ending just before the first zero code unit.
|
||||||
|
/// - Parameter encoding: describes the encoding in which the code units
|
||||||
|
/// should be interpreted.
|
||||||
|
public init<Encoding: UnicodeEncoding>(
|
||||||
|
cString nulTerminatedCodeUnits: UnsafePointer<Encoding.CodeUnit>,
|
||||||
|
encoding: Encoding.Type) {
|
||||||
|
|
||||||
|
// View the memory starting at the pointer as a collection that stops at the
// first element matched by `_IsZero`, then reuse `init(codeUnits:encoding:)`.
let codeUnits = _SentinelCollection(
|
||||||
|
UnsafeBufferPointer(_unboundedStartingAt: nulTerminatedCodeUnits),
|
||||||
|
until: _IsZero()
|
||||||
|
)
|
||||||
|
self.init(codeUnits: codeUnits, encoding: encoding)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Invokes the given closure on the contents of the string, represented as a
|
||||||
|
/// pointer to a null-terminated sequence of code units in the given encoding.
|
||||||
|
public func withCString<Result, TargetEncoding: UnicodeEncoding>(
|
||||||
|
encoding targetEncoding: TargetEncoding.Type,
|
||||||
|
_ body: (UnsafePointer<TargetEncoding.CodeUnit>) throws -> Result
|
||||||
|
) rethrows -> Result {
|
||||||
|
// The whole string: the core's full index range.
return try _core._withCSubstring(
|
||||||
|
in: _core.startIndex..<_core.endIndex, encoding: targetEncoding, body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// FIXME: complexity documentation for most methods on String ought to be
|
||||||
// qualified with "amortized" at least, as Characters are variable-length.
|
// qualified with "amortized" at least, as Characters are variable-length.
|
||||||
|
|
||||||
/// A Unicode string value.
|
/// A Unicode string value.
|
||||||
|
|||||||
@@ -387,7 +387,7 @@ public struct _StringCore {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if let content = _unmanagedUTF16 {
|
else if let content = _unmanagedUTF16 {
|
||||||
var i = content.makeIterator()
|
var i = content.makeIterator()
|
||||||
_Unicode.UTF16.ForwardParser.parse(&i) {
|
_Unicode.UTF16.ForwardParser.parse(&i) {
|
||||||
Encoding.transcode($0, from: UTF16.self).forEach(processCodeUnit)
|
Encoding.transcode($0, from: UTF16.self).forEach(processCodeUnit)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -32,7 +32,9 @@ public struct Substring : StringProtocol {
|
|||||||
_slice = RangeReplaceableBidirectionalSlice(base: base, bounds: bounds)
|
_slice = RangeReplaceableBidirectionalSlice(base: base, bounds: bounds)
|
||||||
}
|
}
|
||||||
|
|
||||||
internal init<R: RangeExpression>(_base base: String, _ bounds: R) where R.Bound == Index {
|
internal init<R: RangeExpression>(
|
||||||
|
_base base: String, _ bounds: R
|
||||||
|
) where R.Bound == Index {
|
||||||
self.init(_base: base, bounds.relative(to: base))
|
self.init(_base: base, bounds.relative(to: base))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -99,6 +101,52 @@ public struct Substring : StringProtocol {
|
|||||||
}
|
}
|
||||||
|
|
||||||
% end
|
% end
|
||||||
|
|
||||||
|
/// Constructs a `Substring` having the same contents as `codeUnits`.
///
/// The code units are first turned into a `String`, which is then used to
/// initialize the substring.
///
/// - Parameter codeUnits: a collection of code units in the given
///   `encoding`.
/// - Parameter encoding: describes the encoding in which the code units
///   should be interpreted.
public init<C: Collection, Encoding: UnicodeEncoding>(
|
||||||
|
codeUnits: C, encoding: Encoding.Type
|
||||||
|
) where C.Iterator.Element == Encoding.CodeUnit {
|
||||||
|
self.init(String(codeUnits: codeUnits, encoding: encoding))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Constructs a `Substring` having the same contents as `nulTerminatedUTF8`.
///
/// The C string is first turned into a `String`, which is then used to
/// initialize the substring.
///
/// - Parameter nulTerminatedUTF8: a pointer to the `CChar` elements of a C
///   string, forwarded unchanged to `String(cString:)`.
public init(cString nulTerminatedUTF8: UnsafePointer<CChar>) {
|
||||||
|
self.init(String(cString: nulTerminatedUTF8))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Constructs a `Substring` having the same contents as `nulTerminatedCodeUnits`.
|
||||||
|
///
|
||||||
|
/// - Parameter nulTerminatedCodeUnits: a sequence of contiguous code units in
|
||||||
|
/// the given `encoding`, ending just before the first zero code unit.
|
||||||
|
/// - Parameter encoding: describes the encoding in which the code units
|
||||||
|
/// should be interpreted.
|
||||||
|
public init<Encoding: UnicodeEncoding>(
|
||||||
|
cString nulTerminatedCodeUnits: UnsafePointer<Encoding.CodeUnit>,
|
||||||
|
encoding: Encoding.Type) {
|
||||||
|
// Build a `String` from the C string, then initialize from that string.
self.init(String(cString: nulTerminatedCodeUnits, encoding: encoding))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Invokes the given closure on the contents of the string, represented as a
|
||||||
|
/// pointer to a null-terminated sequence of UTF-8 code units.
|
||||||
|
///
/// - Returns: the value returned by `body`; errors thrown by `body` are
///   rethrown.
public func withCString<Result>(
|
||||||
|
_ body: (UnsafePointer<CChar>) throws -> Result) rethrows -> Result {
|
||||||
|
return try _slice._base._core._withCSubstringAndLength(
|
||||||
|
in: startIndex._base._position..<endIndex._base._position,
|
||||||
|
encoding: UTF8.self) {
|
||||||
|
// `length` counts the transcoded code units but not the nul terminator.
// `body` reads through the terminator as a C string, so the rebound region
// must cover `length + 1` elements.
p, length in try p.withMemoryRebound(to: CChar.self, capacity: length + 1) {
|
||||||
|
try body($0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Invokes the given closure on the contents of the string, represented as a
|
||||||
|
/// pointer to a null-terminated sequence of code units in the given encoding.
|
||||||
|
public func withCString<Result, TargetEncoding: UnicodeEncoding>(
|
||||||
|
encoding targetEncoding: TargetEncoding.Type,
|
||||||
|
_ body: (UnsafePointer<TargetEncoding.CodeUnit>) throws -> Result
|
||||||
|
) rethrows -> Result {
|
||||||
|
// Operate on the base string's core, restricted to the positions spanned by
// this substring's start and end indices.
return try _slice._base._core._withCSubstring(
|
||||||
|
in: startIndex._base._position..<endIndex._base._position,
|
||||||
|
encoding: targetEncoding, body)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -127,8 +175,8 @@ extension Substring : CustomDebugStringConvertible {
|
|||||||
}
|
}
|
||||||
|
|
||||||
extension Substring : LosslessStringConvertible {
|
extension Substring : LosslessStringConvertible {
|
||||||
public init?(_ description: String) {
|
public init(_ content: String) {
|
||||||
self.init(_base: description, description.startIndex ..< description.endIndex)
|
self.init(_base: content, content.startIndex ..< content.endIndex)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -62,13 +62,15 @@ extension _UnicodeEncoding {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Converts from encoding-independent to encoded representation, returning
|
/// Converts from encoding-independent to encoded representation, returning
|
||||||
/// `nil` if the scalar can't be represented in this encoding.
|
/// `encodedReplacementCharacter` if the scalar can't be represented in this
|
||||||
|
/// encoding.
|
||||||
public static func encode(_ content: UnicodeScalar) -> EncodedScalar {
|
public static func encode(_ content: UnicodeScalar) -> EncodedScalar {
|
||||||
return encodeIfRepresentable(content) ?? encodedReplacementCharacter
|
return encodeIfRepresentable(content) ?? encodedReplacementCharacter
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Converts a scalar from another encoding's representation, returning
|
/// Converts a scalar from another encoding's representation, returning
|
||||||
/// `nil` if the scalar can't be represented in this encoding.
|
/// `encodedReplacementCharacter` if the scalar can't be represented in this
|
||||||
|
/// encoding.
|
||||||
public static func transcode<FromEncoding : UnicodeEncoding>(
|
public static func transcode<FromEncoding : UnicodeEncoding>(
|
||||||
_ content: FromEncoding.EncodedScalar, from _: FromEncoding.Type
|
_ content: FromEncoding.EncodedScalar, from _: FromEncoding.Type
|
||||||
) -> EncodedScalar {
|
) -> EncodedScalar {
|
||||||
|
|||||||
@@ -148,37 +148,6 @@ extension _Unicode.DefaultScalarView : Collection {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// An iterator that can be much faster than the iterator of a reversed slice.
|
|
||||||
// TODO: See about using this in more places
|
|
||||||
@_fixed_layout
|
|
||||||
public struct _ReverseIndexingIterator<
|
|
||||||
Elements : BidirectionalCollection
|
|
||||||
> : IteratorProtocol, Sequence {
|
|
||||||
|
|
||||||
@_inlineable
|
|
||||||
@inline(__always)
|
|
||||||
/// Creates an iterator over the given collection.
|
|
||||||
public /// @testable
|
|
||||||
init(_elements: Elements, _position: Elements.Index) {
|
|
||||||
self._elements = _elements
|
|
||||||
self._position = _position
|
|
||||||
}
|
|
||||||
|
|
||||||
@_inlineable
|
|
||||||
@inline(__always)
|
|
||||||
public mutating func next() -> Elements._Element? {
|
|
||||||
guard _fastPath(_position != _elements.startIndex) else { return nil }
|
|
||||||
_position = _elements.index(before: _position)
|
|
||||||
return _elements[_position]
|
|
||||||
}
|
|
||||||
|
|
||||||
@_versioned
|
|
||||||
internal let _elements: Elements
|
|
||||||
@_versioned
|
|
||||||
internal var _position: Elements.Index
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
extension _Unicode.DefaultScalarView : BidirectionalCollection {
|
extension _Unicode.DefaultScalarView : BidirectionalCollection {
|
||||||
@inline(__always)
|
@inline(__always)
|
||||||
public func index(before i: Index) -> Index {
|
public func index(before i: Index) -> Index {
|
||||||
@@ -210,9 +179,47 @@ extension _Unicode.DefaultScalarView : BidirectionalCollection {
|
|||||||
import StdlibUnittest
|
import StdlibUnittest
|
||||||
import SwiftPrivate
|
import SwiftPrivate
|
||||||
|
|
||||||
|
/// Returns the numeric values of the Unicode scalars of `s`, in order.
func utf32<S : StringProtocol>(_ s: S) -> [UInt32] {
|
||||||
|
return s.unicodeScalars.map { $0.value }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Checks that `s` consists of the scalar values in `expected` and, when
/// `utfStr` contains no zero code unit, that the C-string initializers and
/// `withCString` methods of `S` agree with it.
///
/// - Parameter s: the string under test, expected to have been built from
///   `utfStr`.
/// - Parameter utfStr: the code units, in `Encoding`, used to build C strings.
/// - Parameter encodedAs: the encoding of `utfStr`.
/// - Parameter expected: the scalar values `s` is expected to contain.
func checkStringProtocol<S : StringProtocol, Encoding: UnicodeEncoding>(
|
||||||
|
_ s: S,
|
||||||
|
_ utfStr: [Encoding.CodeUnit],
|
||||||
|
encodedAs: Encoding.Type,
|
||||||
|
expectingUTF32 expected: [UInt32]
|
||||||
|
) {
|
||||||
|
expectEqualSequence(
|
||||||
|
expected, utf32(s), "\(S.self) init(codeUnits:encoding:)")
|
||||||
|
|
||||||
|
// The checks below append a zero terminator to `utfStr`, so they are skipped
// when the input already contains a zero code unit.
if !utfStr.contains(0) {
|
||||||
|
// `init(cString:)` takes `CChar` elements, so it is only exercised when the
// input is UTF-8.
if Encoding.self == UTF8.self {
|
||||||
|
var ntbs = utfStr.map { CChar(extendingOrTruncating: $0) }
|
||||||
|
ntbs.append(0)
|
||||||
|
expectEqualSequence(
|
||||||
|
expected, utf32(S(cString: ntbs)), "\(S.self) init(cString:)")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Same check through the encoding-generic initializer.
var ntbs = Array(utfStr); ntbs.append(0)
|
||||||
|
expectEqualSequence(
|
||||||
|
expected, utf32(S(cString: ntbs, encoding: Encoding.self)),
|
||||||
|
"\(S.self) init(cString:encoding:)"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Round trip: a value rebuilt from the pointer passed to the closure must
// equal `s`.
s.withCString {
|
||||||
|
expectEqual(s, S(cString: $0), "\(S.self) withCString(_:)")
|
||||||
|
}
|
||||||
|
|
||||||
|
s.withCString(encoding: Encoding.self) {
|
||||||
|
expectEqual(s, S(cString: $0, encoding: Encoding.self),
|
||||||
|
"\(S.self) withCString(encoding:_:)")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func checkDecodeUTF<Codec : UnicodeCodec & UnicodeEncoding>(
|
func checkDecodeUTF<Codec : UnicodeCodec & UnicodeEncoding>(
|
||||||
_ codec: Codec.Type, _ expectedHead: [UInt32],
|
_ codec: Codec.Type, _ expectedHead: [UInt32],
|
||||||
_ expectedRepairedTail: [UInt32], _ utfStr: [Codec.CodeUnit]
|
_ expectedRepairedTail: [UInt32], _ utfStr: [Codec.CodeUnit]
|
||||||
) -> AssertionResult {
|
) -> AssertionResult {
|
||||||
var decoded = [UInt32]()
|
var decoded = [UInt32]()
|
||||||
var expected = expectedHead
|
var expected = expectedHead
|
||||||
@@ -303,6 +310,20 @@ func checkDecodeUTF<Codec : UnicodeCodec & UnicodeEncoding>(
|
|||||||
else { expectNotEqual(0, errorCount) }
|
else { expectNotEqual(0, errorCount) }
|
||||||
}
|
}
|
||||||
check(expected.reversed(), "reverse, repairing: true")
|
check(expected.reversed(), "reverse, repairing: true")
|
||||||
|
|
||||||
|
//===--- String/Substring Construction and C-String interop -------------===//
|
||||||
|
do {
|
||||||
|
let s = String(codeUnits: utfStr, encoding: Codec.self)
|
||||||
|
checkStringProtocol(
|
||||||
|
s, utfStr, encodedAs: Codec.self, expectingUTF32: expected)
|
||||||
|
}
|
||||||
|
|
||||||
|
do {
|
||||||
|
let s0 = "\n" + String(codeUnits: utfStr, encoding: Codec.self) + "\n"
|
||||||
|
checkStringProtocol(
|
||||||
|
s0.dropFirst().dropLast(),
|
||||||
|
utfStr, encodedAs: Codec.self, expectingUTF32: expected)
|
||||||
|
}
|
||||||
|
|
||||||
//===--- Transcoded Scalars ---------------------------------------------===//
|
//===--- Transcoded Scalars ---------------------------------------------===//
|
||||||
for x in decoded.lazy.map({ UnicodeScalar($0)! }) {
|
for x in decoded.lazy.map({ UnicodeScalar($0)! }) {
|
||||||
|
|||||||
Reference in New Issue
Block a user