// FIXME: Char should be renamed CodePoint in the compiler
typealias CodePoint = Char

/// \brief Predicate over code points that forwards to
/// CodePoint.isSpace().
struct IsSpace : Predicate {
  typealias Arguments = CodePoint
  typealias Result = Bool

  func __call__(c: CodePoint) -> Result {
    return c.isSpace()
  }
}

/// \brief Predicate that compares its argument against a stored
/// target value using ==.
// NOTE(review): the generic parameter list was missing from the text
// this was recovered from; <T: Equatable> is reconstructed from the
// uses of T and of == below -- confirm against the original.
struct IsEqualTo<T: Equatable> : Predicate {
  var target: GenericIVar<T>

  constructor(target: T) {
    this.target.value = target
  }

  typealias Arguments = T
  typealias Result = Bool

  func __call__(x: T) -> Result {
    return x == target.value
  }
}

/// \brief A position within a StringByteData, expressed as an offset
/// from data.base, that moves forward and backward in steps of one
/// UTF-8 encoded code point.
struct CodePointIndex : BidirectionalIndex, Comparable {
  var data : StringByteData
  var position : Int

  /// \brief Two indices are equal when their positions are equal.
  /// Comparing indices whose data differ trips the assertion.
  func __equal__(rhs: CodePointIndex) -> Bool {
    assert(data == rhs.data, "Can't compare Indices from different Strings")
    return position == rhs.position
  }

  /// \brief Orders indices by position.  Comparing indices whose data
  /// differ trips the assertion.
  func __less__(rhs: CodePointIndex) -> Bool {
    assert(data == rhs.data, "Can't compare Indices from different Strings")
    return position < rhs.position
  }

  /// \brief Return the index just past the code point that starts at
  /// this position.  The step (1 to 4) is chosen from the value of
  /// the byte at this position.
  func succ() -> CodePointIndex {
    assert(position < data.length)
    var pos = position
    var c0 = (data.base + pos).get()
    ++pos                      // every sequence has at least one byte
    if c0 >= 0x80 {
      ++pos                    // at least two bytes
      if c0 >= 0xE0 {
        ++pos                  // at least three bytes
        if c0 >= 0xF0 {
          ++pos                // four bytes
        }
      }
    }
    return CodePointIndex(data, pos)
  }

  /// \brief Return the index of the code point that ends just before
  /// this position.  Steps back one byte, then up to three more while
  /// the byte just stepped onto has the form 10xxxxxx.
  func pred() -> CodePointIndex {
    var pos = position
    var p = data.base
    assert(pos > 0)
    if (((p + --pos).get() & 0xC0) == 0x80) {
      if (((p + --pos).get() & 0xC0) == 0x80) {
        if (((p + --pos).get() & 0xC0) == 0x80) {
          --pos
        }
      }
    }
    return CodePointIndex(data, pos)
  }
}

/// \brief Represents the sequence of Unicode code points in a
/// UTF8-encoded StringByteData object
struct CodePoints {
  /// \brief Wrap the byte data of the given String.
  constructor(s: String) {
    this.str_value = s.str_value
  }

  /// \brief Convert back to a String built from the same byte data.
  func [conversion] __conversion() -> String {
    return String(str_value)
  }

  /// \brief True when the underlying byte data has length zero.
  func isEmpty() -> Bool {
    return str_value.length == 0
  }

  /// \brief Decode and return the code point that starts at the first
  /// byte.
  ///
  /// No emptiness or well-formedness check is made here; this assumes
  /// the data holds at least one complete UTF-8 sequence.
  func first() -> Char {
    var p = str_value.base

    // one octet (7 bits)
    var c0 : UInt8 = p.get()
    if c0 < 0x80 {
      return Char(UInt32(c0))
    }

    var c1 = (++p).get()

    // start with octet 1 (we'll mask off high bits later)
    var result = UInt32(c0)
    result = (result << 6) | UInt32(c1 & 0x3F)  // merge octet 2
    if c0 < 0xE0 {
      return CodePoint(result & 0x000007FF)     // 11 bits
    }

    c1 = (++p).get()                            // prefetch octet 3
    result = (result << 6) | UInt32(c1 & 0x3F)  // merge octet 3
    if c0 < 0xF0 {
      return CodePoint(result & 0x0000FFFF)     // 16 bits
    }

    c1 = (++p).get()                            // prefetch octet 4
    result = (result << 6) | UInt32(c1 & 0x3F)  // merge octet 4
    return CodePoint(result & 0x001FFFFF)       // 21 bits
  }

  /// \brief Decode and return the code point that ends at the last
  /// byte, reading the bytes from back to front.
  ///
  /// No emptiness or well-formedness check is made here; this assumes
  /// the data ends with a complete UTF-8 sequence.
  ///
  /// For each byte before the last, `more` is 0x40 when bit 6 of the
  /// byte is clear (a 10xxxxxx continuation byte, so the sequence
  /// extends further back) and 0 when it is set (the lead byte).  The
  /// payload mask is 0x3F for a continuation byte and shrinks to
  /// 0x1F / 0x0F / 0x07 for the lead byte of a two-, three- or
  /// four-byte sequence.
  func last() -> Char {
    var p = str_value.base + str_value.length

    var c = (--p).get()
    if c < 0x80 {
      return Char(UInt32(c))
    }
    var result = UInt32(c & 0x3F)

    // second byte from the end: continuation, or lead of 2 bytes
    c = (--p).get()
    var more = ~c & 0x40
    var mask = (more >> 1) | 0x1F               // 0x3F or 0x1F
    result |= UInt32(c & mask) << 6
    if more == 0 {
      return Char(result)
    }

    // third byte from the end: continuation, or lead of 3 bytes.
    // Both bit 5 and bit 4 must be kept for a continuation byte;
    // keeping only bit 4 (mask 0x1F) drops a payload bit of every
    // four-byte sequence whose second byte is in 0xA0...0xBF.
    c = (--p).get()
    more = ~c & 0x40
    mask = (more >> 1) | (more >> 2) | 0x0F     // 0x3F or 0x0F
    result |= UInt32(c & mask) << (6+6)
    if more == 0 {
      return Char(result)
    }

    // fourth byte from the end: lead of 4 bytes
    c = (--p).get()
    more = ~c & 0x40
    mask = (more >> 3) | 0x07
    result |= UInt32(c & mask) << (6+6+6)
    return Char(result)
  }

  /// \brief Forward to swift.startsWith on the enumerators of this
  /// sequence and of prefix.
  func startsWith(prefix: CodePoints) -> Bool {
    return swift.startsWith(this.getEnumeratorType(),
                            prefix.getEnumeratorType())
  }

  /// \brief Forward to swift.startsWith on the enumerators of the
  /// reversed sequence and the reversed suffix.
  func endsWith(suffix: CodePoints) -> Bool {
    return swift.startsWith(
      reverse(this).getEnumeratorType(),
      reverse(suffix).getEnumeratorType())
  }

  /// \brief Return a sequence of consecutive whitespace-separated
  /// substrings of the string S.  If maxsplit is given, at most
  /// maxsplit splits are done.  Any whitespace string is a separator
  /// and empty strings are removed from the result.
  func split(maxSplit: Int = Int.max()) -> CodePoints[] {
    return swift.split(this, IsSpace(), maxSplit)
  }

  /// \brief Return the sequence of consecutive (possibly-empty)
  /// substrings that do not contain separator.  If maxsplit is given,
  /// at most maxsplit splits are done.  The result may contain empty
  /// strings.
  func split(separator: Char, maxSplit: Int = Int.max()) -> CodePoints[] {
    return swift.split(this, IsEqualTo(separator), maxSplit,
                       allowEmptySlices: true)
  }

  var str_value : StringByteData
}

extension CodePoints : Equatable {
  /// \brief Compare by converting both sides' byte data to String and
  /// comparing the Strings.
  func __equal__(rhs: CodePoints) -> Bool {
    return String(str_value) == String(rhs.str_value)
  }
}

extension CodePoints : Indexable {
  typealias Element = CodePoint
  typealias IndexType = CodePointIndex

  /// \brief Index at offset 0 of the byte data.
  func begin() -> IndexType {
    return IndexType(str_value, 0)
  }

  /// \brief Index at offset str_value.length of the byte data.
  func end() -> IndexType {
    return IndexType(str_value, str_value.length)
  }

  /// \brief Decode and return the code point that starts at index i.
  ///
  /// The index must refer to this sequence's byte data (asserted).
  /// No bounds or well-formedness check is made here; this assumes i
  /// addresses the first byte of a complete UTF-8 sequence.
  func __getitem__(i: IndexType) -> Element {
    assert(str_value == i.data,
           "Attempting to index a String using an index from another String")

    var p = str_value.base + i.position
    var c0 : UInt8 = p.get()
    if c0 < 0x80 {
      return Char(UInt32(c0))                // one octet (7 bits)
    }

    // start with octet 1 (we'll mask off high bits later)
    var c1 = (++p).get()
    var result = UInt32(c0)
    result = (result << 6) | UInt32(c1 & 0x3F)
    if c0 < 0xE0 {
      return CodePoint(result & 0x000007FF)  // two octets (11 bits)
    }

    c1 = (++p).get()
    result = (result << 6) | UInt32(c1 & 0x3F)
    if c0 < 0xF0 {
      return CodePoint(result & 0x0000FFFF)  // three octets (16 bits)
    }

    c1 = (++p).get()
    result = (result << 6) | UInt32(c1 & 0x3F)
    return CodePoint(result & 0x001FFFFF)    // four octets (21 bits)
  }

  subscript(i: IndexType) -> Element {
    get:
      return __getitem__(i)
  }
}

extension CodePoints : Sliceable {
  /// \brief Return the code points between start (inclusive) and
  /// finish (exclusive).  Both indices must refer to this sequence's
  /// byte data (asserted).  When finish is not greater than start the
  /// result is an empty sequence.
  func __slice__(start: IndexType, finish: IndexType) -> CodePoints {
    assert(
      str_value == start.data,
      "Attempting to slice a string using a start index from another String")
    assert(
      str_value == finish.data,
      "Attempting to slice a string using an end index from another String")

    // TODO: Decide what should be an error.  For now I think it's
    // better to return an empty String when finish < start
    var ret = CodePoints("")
    if finish > start {
      ret.str_value = str_value[start.position..finish.position]
    }
    return ret
  }

  // NOTE(review): the generic argument of Range was missing from the
  // text this was recovered from; <IndexType> is reconstructed from
  // the __slice__ parameters the bounds are passed to.
  subscript(r: Range<IndexType>) -> CodePoints {
    get:
      return __slice__(r.begin(), r.end())
  }
}

extension CodePoints : Enumerable {
  // FIXME: This is not the most efficient way to implement
  // Enumerators for CodePoints, since:
  //   a. they will essentially contain two copies of the same
  //      StringByteData (one for the IndexType and one for the
  //      Indexable).
  //   b. enumeration will have to repeat much of the work of decoding
  //      characters when moving forward.
  // At some point we'll want to refactor for efficiency, but probably
  // not before Generator becomes the official iteration protocol.

  // NOTE(review): the generic arguments were missing from the text
  // this was recovered from (only a trailing '>' survived).  The
  // first argument follows from passing `this`; the second is assumed
  // to be the type returned by indices(this) -- confirm.
  typealias EnumeratorType = IndexableEnumerator<CodePoints, Range<IndexType> >

  func getEnumeratorType() -> EnumeratorType {
    return EnumeratorType(this, indices(this))
  }
}

extension CodePoints : FormattedPrintable {
  /// \brief Format by delegating to String.format on a String built
  /// from the same byte data.
  func format(kind : Char, layout : String) -> String {
    return String(this.str_value).format(kind, layout)
  }
}

/// \brief Print the code points by converting to String first.
func print(x: CodePoints) {
  print(String(x))
}

/// \brief Print the code points, via println, by converting to String
/// first.
func println(x: CodePoints) {
  println(String(x))
}