//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

import GenUtils

/// Parses name data into `(scalar, name)` pairs and collects the distinct
/// space-separated words that appear in those names.
///
/// Lines starting with `#` are skipped, and anything after a `#` on a data
/// line is ignored. The remaining text is split on `;` into a scalar field
/// and a name field.
///
/// - Parameters:
///   - data: The full text of the name data file.
///   - result: Receives one `(scalar, name)` entry per accepted line, in file
///     order.
///   - words: Overwritten with the distinct words of all accepted names, in
///     ascending order so the result does not depend on hash ordering.
func getName(
  from data: String,
  into result: inout [(UInt32, String)],
  words: inout [String]
) {
  var uniqueWords: Set<String> = []

  for line in data.split(separator: "\n") {
    // Skip comments
    guard !line.hasPrefix("#") else {
      continue
    }

    let info = line.split(separator: "#")
    let components = info[0].split(separator: ";")

    // Drop the first character of the name field (presumably the space that
    // follows the ';' separator).
    let name = String(components[1].dropFirst())

    let filteredScalars = components[0].filter { !$0.isWhitespace }

    // A '.' in the scalar field means the entry is not a single scalar
    // (e.g. a `XXXX..YYYY` range); such entries are not emitted.
    if filteredScalars.contains(".") {
      continue
    }

    // A scalar field that is not valid hexadecimal is malformed input; trap.
    let scalar = UInt32(filteredScalars, radix: 16)!

    // Variation selectors are handled in code.
    if (0xE0100...0xE01EF).contains(scalar) {
      continue
    }

    // Hangul is handled in code.
    if scalar >= 0xAC00, scalar <= 0xD7A3 {
      continue
    }

    result.append((scalar, name))

    for word in name.split(separator: " ") {
      uniqueWords.insert(String(word))
    }
  }

  words = uniqueWords.sorted()
}

/// Replaces `words` with every word used in `data`, ordered from the most
/// frequently used word to the least.
///
/// Words with the same frequency are ordered alphabetically. Without that
/// tie-break the order of equally popular words would follow `Dictionary`
/// iteration order, which is not stable from one process to the next.
///
/// - Parameters:
///   - words: Overwritten with the sorted words; its previous contents are
///     not consulted.
///   - data: The `(scalar, name)` pairs whose names supply the words.
func sortWords(
  _ words: inout [String],
  from data: [(UInt32, String)]
) {
  var popularity: [String: Int] = [:]

  for (_, name) in data {
    for word in name.split(separator: " ") {
      popularity[String(word), default: 0] += 1
    }
  }

  let sortedPopularity = popularity.sorted { lhs, rhs in
    if lhs.value != rhs.value {
      return lhs.value > rhs.value
    }

    return lhs.key < rhs.key
  }

  words = sortedPopularity.map { $0.key }
}

/// Emits the `_swift_stdlib_words` byte table: the UTF-8 bytes of every word,
/// back to back, with the high bit (`0x80`) set on the last byte of each word
/// to mark where the word ends.
///
/// - Parameters:
///   - words: The words to emit, in table order.
///   - result: The generated source text to append to.
/// - Returns: A mapping from each word to the offset of its first byte in the
///   emitted table.
func emitWords(
  _ words: [String],
  into result: inout String
) -> [String: UInt32] {
  var wordIndices: [String: UInt32] = [:]
  var bytes: [UInt8] = []
  var index: UInt32 = 0

  for word in words {
    wordIndices[word] = index

    let lastByteIndex = word.utf8.count - 1

    for (i, byte) in word.utf8.enumerated() {
      // Flag the final byte of the word.
      bytes.append(i == lastByteIndex ? byte | 0x80 : byte)
      index += 1
    }
  }

  emitCollection(bytes, name: "_swift_stdlib_words", into: &result)

  return wordIndices
}

/// Emits the `_swift_stdlib_word_indices` table: the byte offsets of all
/// words, in ascending offset order.
///
/// - Parameters:
///   - wordOffsets: A mapping from each word to its byte offset, as returned
///     by `emitWords(_:into:)`.
///   - result: The generated source text to append to.
/// - Returns: A mapping from each word to its position in the emitted offset
///   table.
func emitWordOffsets(
  _ wordOffsets: [String: UInt32],
  into result: inout String
) -> [String: UInt32] {
  let sortedWordOffsets = wordOffsets.sorted { $0.value < $1.value }

  var wordIndices: [String: UInt32] = [:]

  for (i, (word, _)) in sortedWordOffsets.enumerated() {
    wordIndices[word] = UInt32(i)
  }

  emitCollection(
    sortedWordOffsets.map { $0.value },
    name: "_swift_stdlib_word_indices",
    into: &result
  )

  return wordIndices
}

/// Emits the `_swift_stdlib_names` byte table, which stores every name as a
/// sequence of word indices, along with the `NAMES_LAST_SCALAR_OFFSET` define
/// holding the total size of that table.
///
/// A word index below `0xFF` is stored as a single byte. Any other index is
/// stored as the marker byte `0xFF` followed by the low byte and then the high
/// byte of the index.
///
/// - Parameters:
///   - names: The `(scalar, name)` pairs to encode; they are processed in
///     ascending scalar order.
///   - wordIndices: A mapping from each word to its index. Every word of every
///     name must be present.
///   - result: The generated source text to append to.
/// - Returns: A mapping from each scalar to the offset in the emitted table
///   where its name begins.
func emitScalarNames(
  _ names: [(UInt32, String)],
  _ wordIndices: [String: UInt32],
  into result: inout String
) -> [UInt32: UInt32] {
  var nameBytes: [UInt8] = []
  var scalarNameIndices: [UInt32: UInt32] = [:]
  var index: UInt32 = 0

  for (scalar, name) in names.sorted(by: { $0.0 < $1.0 }) {
    scalarNameIndices[scalar] = index

    for word in name.split(separator: " ") {
      // A missing word means the word table and the names are out of sync,
      // which is a generator bug; trap.
      let wordIndex = wordIndices[String(word)]!

      // If the word index is smaller than 0xFF, then we don't need to add the
      // extra byte to represent the index.
      if wordIndex < 0xFF {
        nameBytes.append(UInt8(wordIndex))
        index += 1
      } else {
        assert(wordIndex <= UInt16.max)

        nameBytes.append(0xFF)
        nameBytes.append(UInt8(wordIndex & 0xFF))
        nameBytes.append(UInt8(wordIndex >> 8))
        index += 3
      }
    }
  }

  result += """
  #define NAMES_LAST_SCALAR_OFFSET \(nameBytes.count)


  """

  emitCollection(nameBytes, name: "_swift_stdlib_names", into: &result)

  return scalarNameIndices
}

/// Emits the `_swift_stdlib_names_scalars` table and the
/// `NAMES_SCALARS_MAX_INDEX` define holding its last valid index.
///
/// The code point space `0x0 ... 0x10FFFF` is walked in sets of 128 scalars.
/// A set containing at least one named scalar contributes 128 entries to the
/// table: the scalar's name offset, or `0` for scalars without a name. Sets
/// without any named scalar contribute nothing.
///
/// - Parameters:
///   - scalarNameIndices: A mapping from each named scalar to its name offset,
///     as returned by `emitScalarNames(_:_:into:)`.
///   - result: The generated source text to append to.
/// - Returns: A mapping from each set number (`scalar >> 7`) to the index of
///   the set's first entry in the emitted table, or `UInt16.max` for sets that
///   were not emitted.
func emitScalars(
  _ scalarNameIndices: [UInt32: UInt32],
  into result: inout String
) -> [UInt32: UInt16] {
  var scalars: [UInt32] = []
  var scalarSetIndices: [UInt32: UInt16] = [:]
  var index: UInt16 = 0

  for i in 0x0 ... 0x10FFFF >> 7 {
    let scalarRange = i << 7 ..< i << 7 + 128

    let hasNamedScalar = scalarRange.contains(where: {
      scalarNameIndices[UInt32($0)] != nil
    })

    guard hasNamedScalar else {
      // Sentinel for a set with no named scalars.
      scalarSetIndices[UInt32(i)] = UInt16.max
      continue
    }

    scalarSetIndices[UInt32(i)] = index

    for scalar in scalarRange {
      index += 1
      scalars.append(scalarNameIndices[UInt32(scalar)] ?? 0)
    }
  }

  result += """
  #define NAMES_SCALARS_MAX_INDEX \(scalars.count - 1)


  """

  emitCollection(scalars, name: "_swift_stdlib_names_scalars", into: &result)

  return scalarSetIndices
}

/// Emits the `_swift_stdlib_names_scalar_sets` table, with one entry per set
/// of 128 scalars across `0x0 ... 0x10FFFF`.
///
/// An entry is either `UInt16.max`, passed through unchanged for sets that
/// were not emitted, or the set's starting index divided by 128.
///
/// - Parameters:
///   - scalarSetIndices: The mapping returned by `emitScalars(_:into:)`; it
///     must contain an entry for every set number.
///   - result: The generated source text to append to.
func emitScalarSets(
  _ scalarSetIndices: [UInt32: UInt16],
  into result: inout String
) {
  var scalarSets: [UInt16] = []

  for i in 0x0 ... 0x10FFFF >> 7 {
    let index = scalarSetIndices[UInt32(i)]!

    guard index != .max else {
      scalarSets.append(index)
      continue
    }

    scalarSets.append(index >> 7)
  }

  emitCollection(scalarSets, name: "_swift_stdlib_names_scalar_sets", into: &result)
}

/// Prints the length of the longest name, formatted as a
/// `SWIFT_STDLIB_LARGEST_NAME_COUNT` define to be copied by hand into the
/// shims header named in the message.
///
/// - Parameters:
///   - names: The `(scalar, name)` pairs to measure.
///   - result: Not modified; the define is printed to standard output rather
///     than appended to the generated source.
func emitLargestNameCount(_ names: [(UInt32, String)], into result: inout String) {
  var largestCount = 0

  for (_, name) in names {
    largestCount = Swift.max(largestCount, name.count)
  }

  print("""
  Please copy and paste the following into 'stdlib/public/SwiftShims/swift/shims/UnicodeData.h':

  #define SWIFT_STDLIB_LARGEST_NAME_COUNT \(largestCount)

  """)
}

/// Generates all scalar name tables from `Data/16/DerivedName.txt` and appends
/// them to `result`.
///
/// - Parameter result: The generated source text to append to.
func generateNameProp(into result: inout String) {
  let derivedName = readFile("Data/16/DerivedName.txt")

  var names: [(UInt32, String)] = []
  var words: [String] = []

  getName(from: derivedName, into: &names, words: &words)
  sortWords(&words, from: names)

  emitLargestNameCount(names, into: &result)

  let wordOffsets = emitWords(words, into: &result)
  let wordIndices = emitWordOffsets(wordOffsets, into: &result)

  let scalarNameIndices = emitScalarNames(names, wordIndices, into: &result)
  let scalarSetIndices = emitScalars(scalarNameIndices, into: &result)
  emitScalarSets(scalarSetIndices, into: &result)
}