Files
swift-mirror/utils/gen-unicode-data/Sources/GenScalarProps/Names.swift
2025-01-15 14:09:57 -08:00

274 lines
6.4 KiB
Swift

//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
import GenUtils
/// Parses `DerivedName.txt`-style text into `(scalar, name)` pairs.
///
/// Each data line has the form `<hex code point> ; <name>`. Everything from a
/// `#` to the end of the line is treated as a comment. The following entries
/// are skipped:
///
/// - entries whose code point field is a range (it contains a `.`),
/// - variation selectors (U+E0100...U+E01EF),
/// - Hangul syllables (U+AC00...U+D7A3).
///
/// Lines that carry no data (blank, whitespace-only, or comment-only) are
/// ignored. A line that has data but no `;`-separated name field, or whose
/// code point is not valid hexadecimal, stops the program with a message.
///
/// - Parameters:
///   - data: The text to parse.
///   - result: Receives one appended `(scalar, name)` pair per accepted line,
///     in input order.
///   - words: Replaced (not appended to) with the distinct space-separated
///     words of all accepted names, in no particular order.
func getName(
  from data: String,
  into result: inout [(UInt32, String)],
  words: inout [String]
) {
  var uniqueWords: Set<String> = []

  // Split on any newline character. Swift treats "\r\n" as one Character, so
  // a plain "\n" separator would never split CRLF-terminated input.
  for line in data.split(whereSeparator: { $0.isNewline }) {
    // Drop the trailing comment, if any. Pure comment lines become empty.
    let payload = line.prefix(while: { $0 != "#" })

    // Nothing but whitespace left: there is no entry on this line.
    guard !payload.allSatisfy({ $0.isWhitespace }) else {
      continue
    }

    let components = payload.split(separator: ";")
    precondition(components.count >= 2, "Malformed name entry: '\(line)'")

    let filteredScalars = components[0].filter { !$0.isWhitespace }

    // Ranges such as "4E00..9FFF" do not name an individual scalar.
    if filteredScalars.contains(".") {
      continue
    }

    guard let scalar = UInt32(filteredScalars, radix: 16) else {
      preconditionFailure("Invalid code point in name entry: '\(line)'")
    }

    // Variation selectors are handled in code.
    if (0xE0100...0xE01EF).contains(scalar) {
      continue
    }

    // Hangul is handled in code.
    if (0xAC00...0xD7A3).contains(scalar) {
      continue
    }

    // Trim whitespace on both sides instead of assuming exactly one leading
    // space, so "0041;NAME" and "0041 ; NAME " both yield "NAME".
    var trimmedName = components[1].drop(while: { $0.isWhitespace })
    while let last = trimmedName.last, last.isWhitespace {
      trimmedName.removeLast()
    }
    let name = String(trimmedName)

    result.append((scalar, name))

    for word in name.split(separator: " ") {
      uniqueWords.insert(String(word))
    }
  }

  words = Array(uniqueWords)
}
/// Replaces `words` with every word used in `data`, most frequent first.
///
/// A word's frequency is the number of times it occurs across all names in
/// `data`, counting repeats within a single name. Words with the same
/// frequency are ordered lexicographically; without that tie-break their
/// order would follow dictionary iteration order, which is not stable from
/// one run to the next.
///
/// - Parameters:
///   - words: Overwritten with the sorted word list. Its incoming contents
///     are not read.
///   - data: `(scalar, name)` pairs whose names are split on spaces.
func sortWords(
  _ words: inout [String],
  from data: [(UInt32, String)]
) {
  var popularity: [String: Int] = [:]

  for (_, name) in data {
    for word in name.split(separator: " ") {
      popularity[String(word), default: 0] += 1
    }
  }

  let sortedPopularity = popularity.sorted { lhs, rhs in
    if lhs.value != rhs.value {
      return lhs.value > rhs.value
    }
    // Equal frequency: fall back to the word itself for a reproducible order.
    return lhs.key < rhs.key
  }

  words = sortedPopularity.map { $0.key }
}
/// Emits all words as one contiguous byte table named `_swift_stdlib_words`
/// and reports where each word starts inside it.
///
/// The UTF-8 bytes of the words are concatenated in the given order, and the
/// final byte of every word has its high bit (0x80) set.
/// NOTE(review): the high bit presumably acts as an end-of-word marker, which
/// only works if every word is ASCII; confirm against the reader of this
/// table.
///
/// - Parameters:
///   - words: The words to encode, in table order.
///   - result: The output text that `emitCollection` appends the table to.
/// - Returns: A map from each word to the offset of its first byte in the
///   table.
func emitWords(
  _ words: [String],
  into result: inout String
) -> [String: UInt32] {
  var startOffsets: [String: UInt32] = [:]
  var table: [UInt8] = []
  // Offset of the next byte to be written; always equals `table.count`.
  var cursor: UInt32 = 0

  for word in words {
    startOffsets[word] = cursor

    var encoded = Array(word.utf8)
    if !encoded.isEmpty {
      encoded[encoded.count - 1] |= 0x80
    }

    table += encoded
    cursor += UInt32(encoded.count)
  }

  emitCollection(table, name: "_swift_stdlib_words", into: &result)

  return startOffsets
}
/// Emits the word start offsets, in ascending order, as the table
/// `_swift_stdlib_word_indices`.
///
/// - Parameters:
///   - wordOffsets: A map from each word to its start offset, as returned by
///     `emitWords`.
///   - result: The output text that `emitCollection` appends the table to.
/// - Returns: A map from each word to the position of its offset within the
///   emitted table.
func emitWordOffsets(
  _ wordOffsets: [String: UInt32],
  into result: inout String
) -> [String: UInt32] {
  let ascending = wordOffsets.sorted { lhs, rhs in lhs.value < rhs.value }

  var positions: [String: UInt32] = [:]
  var offsets: [UInt32] = []

  for (position, entry) in ascending.enumerated() {
    positions[entry.key] = UInt32(position)
    offsets.append(entry.value)
  }

  emitCollection(offsets, name: "_swift_stdlib_word_indices", into: &result)

  return positions
}
/// Encodes every scalar name as a sequence of word indices and emits the
/// result as the byte table `_swift_stdlib_names`, preceded by a
/// `NAMES_LAST_SCALAR_OFFSET` macro holding the table's total byte count.
///
/// Names are processed in ascending scalar order. Each word of a name is
/// written as:
///
/// - one byte holding the word index, when the index is below 0xFF;
/// - otherwise the marker byte 0xFF followed by the index as two bytes, low
///   byte first.
///
/// - Parameters:
///   - names: `(scalar, name)` pairs; names are split on spaces.
///   - wordIndices: The index of every word that occurs in `names`. A missing
///     word or an index above 0xFFFF stops the program.
///   - result: The output text to append to.
/// - Returns: A map from each scalar to the offset in the byte table at which
///   its encoded name begins.
func emitScalarNames(
  _ names: [(UInt32, String)],
  _ wordIndices: [String: UInt32],
  into result: inout String
) -> [UInt32: UInt32] {
  var nameBytes: [UInt8] = []
  var scalarNameIndices: [UInt32: UInt32] = [:]

  for (scalar, name) in names.sorted(by: { $0.0 < $1.0 }) {
    // The name starts wherever the table currently ends.
    scalarNameIndices[scalar] = UInt32(nameBytes.count)

    for word in name.split(separator: " ") {
      guard let wordIndex = wordIndices[String(word)] else {
        preconditionFailure("No index recorded for word '\(word)'")
      }

      // If the word index is smaller than 0xFF, then we don't need to add the
      // extra bytes to represent the index. 0xFF itself is reserved as the
      // marker for the three-byte form.
      if wordIndex < 0xFF {
        nameBytes.append(UInt8(wordIndex))
      } else {
        precondition(
          wordIndex <= 0xFFFF,
          "Word index \(wordIndex) does not fit in two bytes"
        )
        nameBytes.append(0xFF)
        nameBytes.append(UInt8(wordIndex & 0xFF))
        nameBytes.append(UInt8(wordIndex >> 8))
      }
    }
  }

  // Terminate the directive with a newline (plus a blank line) so that
  // whatever is emitted next starts on its own line instead of being fused
  // onto the macro definition.
  result += "#define NAMES_LAST_SCALAR_OFFSET \(nameBytes.count)\n\n"

  emitCollection(nameBytes, name: "_swift_stdlib_names", into: &result)

  return scalarNameIndices
}
/// Emits the table `_swift_stdlib_names_scalars`, preceded by a
/// `NAMES_SCALARS_MAX_INDEX` macro holding the table's last valid index.
///
/// The code point space 0...0x10FFFF is walked in sets of 128 consecutive
/// scalars. A set that contains at least one named scalar contributes 128
/// slots to the table: each slot holds the scalar's name offset, or 0 when
/// that scalar has no name. A set without any named scalar contributes
/// nothing.
///
/// - Parameters:
///   - scalarNameIndices: A map from scalar to the offset of its encoded
///     name.
///   - result: The output text to append to.
/// - Returns: A map from set number (`scalar >> 7`) to the index of the set's
///   first slot in the table, or `UInt16.max` for sets that were not emitted.
func emitScalars(
  _ scalarNameIndices: [UInt32: UInt32],
  into result: inout String
) -> [UInt32: UInt16] {
  var scalars: [UInt32] = []
  var scalarSetIndices: [UInt32: UInt16] = [:]
  // Index of the next slot to be written. Being a UInt16, it traps rather
  // than wraps if the table outgrows what a set index can address.
  var nextSlot: UInt16 = 0

  for set in 0x0 ... 0x10FFFF >> 7 {
    let firstScalar = set << 7
    let scalarRange = firstScalar ..< firstScalar + 128
    let setKey = UInt32(set)

    let hasNamedScalar = scalarRange.contains(where: {
      scalarNameIndices[UInt32($0)] != nil
    })

    guard hasNamedScalar else {
      scalarSetIndices[setKey] = UInt16.max
      continue
    }

    scalarSetIndices[setKey] = nextSlot

    for scalar in scalarRange {
      nextSlot += 1
      // Unnamed scalars inside an emitted set get a 0 placeholder.
      scalars.append(scalarNameIndices[UInt32(scalar)] ?? 0)
    }
  }

  // Terminate the directive with a newline (plus a blank line) so that
  // whatever is emitted next starts on its own line instead of being fused
  // onto the macro definition.
  result += "#define NAMES_SCALARS_MAX_INDEX \(scalars.count - 1)\n\n"

  emitCollection(scalars, name: "_swift_stdlib_names_scalars", into: &result)

  return scalarSetIndices
}
/// Emits the table `_swift_stdlib_names_scalar_sets`, one entry per set of
/// 128 scalars covering 0...0x10FFFF.
///
/// An entry is `UInt16.max` when the recorded value for the set is
/// `UInt16.max`; otherwise it is the recorded index shifted right by 7.
///
/// - Parameters:
///   - scalarSetIndices: A map from set number to the value recorded for that
///     set. It must contain every set number in 0...(0x10FFFF >> 7); a
///     missing entry stops the program.
///   - result: The output text that `emitCollection` appends the table to.
func emitScalarSets(
  _ scalarSetIndices: [UInt32: UInt16],
  into result: inout String
) {
  let scalarSets: [UInt16] = (0x0 ... 0x10FFFF >> 7).map { set in
    let recorded = scalarSetIndices[UInt32(set)]!
    return recorded == .max ? recorded : recorded >> 7
  }

  emitCollection(scalarSets, name: "_swift_stdlib_names_scalar_sets", into: &result)
}
/// Prints the length of the longest name, formatted as a
/// `SWIFT_STDLIB_LARGEST_NAME_COUNT` macro, to standard output along with
/// instructions on where to paste it.
///
/// The length is measured in `Character`s, and is 0 when `names` is empty.
///
/// - Parameters:
///   - names: `(scalar, name)` pairs to measure.
///   - result: Not read or modified by this function.
func emitLargestNameCount(_ names: [(UInt32, String)], into result: inout String) {
  let largestCount = names.reduce(0) { longest, entry in
    Swift.max(longest, entry.1.count)
  }

  print("""
  Please copy and paste the following into 'stdlib/public/SwiftShims/swift/shims/UnicodeData.h':
  #define SWIFT_STDLIB_LARGEST_NAME_COUNT \(largestCount)
  """)
}
/// Generates all scalar-name tables from `Data/16/DerivedName.txt` and
/// appends them to `result`.
///
/// Steps, in order: parse the names, rank the words by frequency, print the
/// largest-name-count macro, then emit the word bytes, the word offsets, the
/// encoded names, the per-scalar name offsets, and the scalar set table. Each
/// emit step consumes the lookup map returned by the previous one.
///
/// - Parameter result: The output text to append the generated tables to.
func generateNameProp(into result: inout String) {
  var scalarNames: [(UInt32, String)] = []
  var vocabulary: [String] = []

  getName(
    from: readFile("Data/16/DerivedName.txt"),
    into: &scalarNames,
    words: &vocabulary
  )
  sortWords(&vocabulary, from: scalarNames)

  emitLargestNameCount(scalarNames, into: &result)

  let byteOffsetOfWord = emitWords(vocabulary, into: &result)
  let indexOfWord = emitWordOffsets(byteOffsetOfWord, into: &result)
  let nameOffsetOfScalar = emitScalarNames(scalarNames, indexOfWord, into: &result)
  let slotOfScalarSet = emitScalars(nameOffsetOfScalar, into: &result)

  emitScalarSets(slotOfScalarSet, into: &result)
}