mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
We should not make the grapheme segmentation algorithm inlineable now (and possibly ever). The immediate reason is that the grapheme segmentation algorithm changed significantly between Unicode 8.0 and Unicode 9.0: Unicode 9.0 requires a stateful algorithm. The current indices and iterators simply don't have storage for that state, so we can't mark them as fragile.
241 lines
7.7 KiB
Swift
241 lines
7.7 KiB
Swift
//===--- UnicodeTrie.swift.gyb --------------------------------*- swift -*-===//
|
|
//
|
|
// This source file is part of the Swift.org open source project
|
|
//
|
|
// Copyright (c) 2014 - 2016 Apple Inc. and the Swift project authors
|
|
// Licensed under Apache License v2.0 with Runtime Library Exception
|
|
//
|
|
// See http://swift.org/LICENSE.txt for license information
|
|
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// A custom trie implementation to quickly retrieve Unicode property values.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
%{
|
|
|
|
# Note: keep these constants synchronized with the data that is actually
# generated.  A runtime check exists for this, but it only runs in builds
# with INTERNAL_CHECKS_ENABLED.

# Bit widths of the fields a code point is split into when indexing the trie.
BMPFirstLevelIndexBits = 8
BMPDataOffsetBits = 8
SuppFirstLevelIndexBits = 5
SuppSecondLevelIndexBits = 8
SuppDataOffsetBits = 8

# Size, in bytes, of a single entry in each table.
BMPLookupBytesPerEntry = 1
BMPDataBytesPerEntry = 1
SuppLookup1BytesPerEntry = 1
SuppLookup2BytesPerEntry = 1
SuppDataBytesPerEntry = 1

# Overall size of the trie data (presumably in bytes, like the offsets below).
TrieSize = 18961

# Byte offset of each table from the start of the trie data.
BMPLookupBytesOffset = 0
BMPDataBytesOffset = 256
SuppLookup1BytesOffset = 12032
SuppLookup2BytesOffset = 12049
SuppDataBytesOffset = 12817
|
|
|
|
}%
|
|
|
|
import SwiftShims
|
|
|
|
// The case names below must stay in sync with the
// 'GraphemeClusterBreakProperty' enum in C++ and with the names used by the
// GYBUnicodeDataUtils script.
public // @testable
enum _GraphemeClusterBreakPropertyValue : Int {
  case Other = 0, CR = 1, LF = 2, Control = 3
  case Extend = 4, Regional_Indicator = 5, Prepend = 6, SpacingMark = 7
  case L = 8, V = 9, T = 10, LV = 11, LVT = 12
}
|
|
|
|
// Turning a raw integer into the enum is expensive, so hot code paths carry
// the property around in this type-safe wrapper over the raw value and only
// pay for the conversion when they explicitly ask for it.
struct _GraphemeClusterBreakPropertyRawValue {
  /// The property value exactly as it was read from the trie data.
  var rawValue: UInt${BMPDataBytesPerEntry * 8}

  /// Wraps a raw property value without validating or converting it.
  init(_ value: UInt${BMPDataBytesPerEntry * 8}) {
    rawValue = value
  }

  // Use with care: this conversion is expensive (even with optimization
  // turned on, the compiler generates code for a switch).
  //
  // The force unwrap traps if `rawValue` does not correspond to any case of
  // `_GraphemeClusterBreakPropertyValue`.
  var cookedValue: _GraphemeClusterBreakPropertyValue {
    let enumRawValue = Int(rawValue)
    return _GraphemeClusterBreakPropertyValue(rawValue: enumRawValue)!
  }
}
|
|
|
|
public // @testable
struct _UnicodeGraphemeClusterBreakPropertyTrie {
  /// Checks that the trie layout constants substituted into this file by gyb
  /// agree with `_swift_stdlib_GraphemeClusterBreakPropertyTrieMetadata`.
  ///
  /// Every comparison goes through `_sanityCheck`, so a mismatch is only
  /// detected in builds where those checks are enabled.
  static func _checkParameters() {
    let metadata = _swift_stdlib_GraphemeClusterBreakPropertyTrieMetadata

    // Bit widths of the code point fields.
    _sanityCheck(metadata.BMPFirstLevelIndexBits == ${BMPFirstLevelIndexBits})
    _sanityCheck(metadata.BMPDataOffsetBits == ${BMPDataOffsetBits})
    _sanityCheck(metadata.SuppFirstLevelIndexBits == ${SuppFirstLevelIndexBits})
    _sanityCheck(metadata.SuppSecondLevelIndexBits == ${SuppSecondLevelIndexBits})
    _sanityCheck(metadata.SuppDataOffsetBits == ${SuppDataOffsetBits})

    // Entry sizes of the individual tables.
    _sanityCheck(metadata.BMPLookupBytesPerEntry == ${BMPLookupBytesPerEntry})
    _sanityCheck(metadata.BMPDataBytesPerEntry == ${BMPDataBytesPerEntry})
    _sanityCheck(metadata.SuppLookup1BytesPerEntry == ${SuppLookup1BytesPerEntry})
    _sanityCheck(metadata.SuppLookup2BytesPerEntry == ${SuppLookup2BytesPerEntry})
    _sanityCheck(metadata.SuppDataBytesPerEntry == ${SuppDataBytesPerEntry})

    // Overall size of the trie data.
    _sanityCheck(metadata.TrieSize == ${TrieSize})

    // Offsets of the individual tables inside the trie data.
    _sanityCheck(metadata.BMPLookupBytesOffset == ${BMPLookupBytesOffset})
    _sanityCheck(metadata.BMPDataBytesOffset == ${BMPDataBytesOffset})
    _sanityCheck(metadata.SuppLookup1BytesOffset == ${SuppLookup1BytesOffset})
    _sanityCheck(metadata.SuppLookup2BytesOffset == ${SuppLookup2BytesOffset})
    _sanityCheck(metadata.SuppDataBytesOffset == ${SuppDataBytesOffset})
  }

  /// Start of the trie data; every table below is addressed as a fixed byte
  /// offset from this pointer.
  let _trieData: UnsafePointer<UInt8>

  // Each table accessor is only generated for a one-byte entry size; any other
  // entry size leaves the accessor undefined.

% if BMPLookupBytesPerEntry == 1:
  /// First-level lookup table for code points up to U+FFFF.
  @_transparent
  var _bmpLookup: UnsafePointer<UInt8> {
    return _trieData + ${BMPLookupBytesOffset}
  }
% end

% if BMPDataBytesPerEntry == 1:
  /// Data blocks holding the property values for code points up to U+FFFF.
  @_transparent
  var _bmpData: UnsafePointer<UInt8> {
    return _trieData + ${BMPDataBytesOffset}
  }
% end

% if SuppLookup1BytesPerEntry == 1:
  /// First-level lookup table for code points above U+FFFF.
  @_transparent
  var _suppLookup1: UnsafePointer<UInt8> {
    return _trieData + ${SuppLookup1BytesOffset}
  }
% end

% if SuppLookup2BytesPerEntry == 1:
  /// Second-level lookup table for code points above U+FFFF.
  @_transparent
  var _suppLookup2: UnsafePointer<UInt8> {
    return _trieData + ${SuppLookup2BytesOffset}
  }
% end

% if SuppDataBytesPerEntry == 1:
  /// Data blocks holding the property values for code points above U+FFFF.
  @_transparent
  var _suppData: UnsafePointer<UInt8> {
    return _trieData + ${SuppDataBytesOffset}
  }
% end

  /// Creates a trie backed by `_swift_stdlib_GraphemeClusterBreakPropertyTrie`,
  /// after running the layout consistency checks.
  public // @testable
  init() {
    _UnicodeGraphemeClusterBreakPropertyTrie._checkParameters()
    _trieData = _swift_stdlib_GraphemeClusterBreakPropertyTrie
  }

  /// Returns the index into `_bmpLookup` for `cp`: the bits of `cp` above its
  /// data-offset field.
  @_transparent
  func _getBMPFirstLevelIndex(_ cp: UInt32) -> Int {
    // The field that has to be shifted out is the data offset, so the shift
    // amount is the width of *that* field.  Shifting by the width of the
    // first-level index itself only gives the right answer while the two
    // widths happen to be equal.
    return Int(cp >> ${BMPDataOffsetBits})
  }

  /// Returns the offset of `cp` inside its BMP data block: the low
  /// `BMPDataOffsetBits` bits of `cp`.
  @_transparent
  func _getBMPDataOffset(_ cp: UInt32) -> Int {
    let offsetMask: UInt32 = (1 << ${BMPDataOffsetBits}) - 1
    return Int(cp & offsetMask)
  }

  /// Returns the index into `_suppLookup1` for `cp`: the bits of `cp` above
  /// the second-level index and data-offset fields.
  @_transparent
  func _getSuppFirstLevelIndex(_ cp: UInt32) -> Int {
    return Int(cp >> (${SuppSecondLevelIndexBits} + ${SuppDataOffsetBits}))
  }

  /// Returns the index of `cp` within its second-level lookup block: the
  /// `SuppSecondLevelIndexBits` bits directly above the data-offset field.
  @_transparent
  func _getSuppSecondLevelIndex(_ cp: UInt32) -> Int {
    let indexMask: UInt32 = (1 << ${SuppSecondLevelIndexBits}) - 1
    return Int((cp >> ${SuppDataOffsetBits}) & indexMask)
  }

  /// Returns the offset of `cp` inside its supplementary data block: the low
  /// `SuppDataOffsetBits` bits of `cp`.
  @_transparent
  func _getSuppDataOffset(_ cp: UInt32) -> Int {
    let offsetMask: UInt32 = (1 << ${SuppDataOffsetBits}) - 1
    return Int(cp & offsetMask)
  }

  /// Looks up the raw `Grapheme_Cluster_Break` property value of `codePoint`.
  ///
  /// Code points up to U+FFFF take a two-step lookup (`_bmpLookup`, then
  /// `_bmpData`); larger ones take a three-step lookup (`_suppLookup1`,
  /// `_suppLookup2`, then `_suppData`).
  ///
  /// - Precondition: `codePoint <= 0x10ffff`.
  func getPropertyRawValue(
    _ codePoint: UInt32
  ) -> _GraphemeClusterBreakPropertyRawValue {
    // Note: for optimization, the code below uses '&+' instead of '+' to avoid
    // a few branches.  There is no possibility of overflow here.
    //
    // The optimizer could figure this out, but right now it keeps extra checks
    // if '+' is used.

    if _fastPath(codePoint <= 0xffff) {
      let dataBlockIndex = Int(_bmpLookup[_getBMPFirstLevelIndex(codePoint)])
      let dataIndex =
        (dataBlockIndex << ${BMPDataOffsetBits}) &+
        _getBMPDataOffset(codePoint)
      return _GraphemeClusterBreakPropertyRawValue(_bmpData[dataIndex])
    }

    _precondition(codePoint <= 0x10ffff)
    let secondLookupIndex =
      Int(_suppLookup1[_getSuppFirstLevelIndex(codePoint)])
    let lookup2Index =
      (secondLookupIndex << ${SuppSecondLevelIndexBits}) &+
      _getSuppSecondLevelIndex(codePoint)
    let dataBlockIndex = Int(_suppLookup2[lookup2Index])
    let dataIndex =
      (dataBlockIndex << ${SuppDataOffsetBits}) &+
      _getSuppDataOffset(codePoint)
    return _GraphemeClusterBreakPropertyRawValue(_suppData[dataIndex])
  }

  /// Looks up the `Grapheme_Cluster_Break` property value of `codePoint` as an
  /// enum case.
  ///
  /// This pays for the raw-value-to-enum conversion on every call; code that
  /// can work with the raw value should use `getPropertyRawValue` instead.
  public // @testable
  func getPropertyValue(
    _ codePoint: UInt32
  ) -> _GraphemeClusterBreakPropertyValue {
    return getPropertyRawValue(codePoint).cookedValue
  }
}
|
|
|
|
// FIXME(ABI): don't mark this type versioned, or any of its APIs inlineable.
// Grapheme cluster segmentation uses a completely different algorithm in
// Unicode 9.0.
internal struct _UnicodeExtendedGraphemeClusterSegmenter {
  /// Table of `UInt16` rows indexed by the raw `Grapheme_Cluster_Break` value
  /// of the first code point of a pair.  Bit `n` of a row is set when there is
  /// *no* boundary before a following code point whose raw value is `n`.
  let _noBoundaryRulesMatrix: UnsafePointer<UInt16>

  /// Creates a segmenter backed by
  /// `_swift_stdlib_ExtendedGraphemeClusterNoBoundaryRulesMatrix`.
  init() {
    _noBoundaryRulesMatrix =
      _swift_stdlib_ExtendedGraphemeClusterNoBoundaryRulesMatrix
  }

  /// Returns `true` if a code point with the given `Grapheme_Cluster_Break`
  /// property value is always followed by a grapheme cluster break, that is,
  /// if its row in the rules matrix has no bits set.
  func isBoundaryAfter(_ gcb: _GraphemeClusterBreakPropertyRawValue) -> Bool {
    return _noBoundaryRulesMatrix[Int(gcb.rawValue)] == 0
  }

  /// Returns `true` if there is a grapheme cluster break between two adjacent
  /// code points whose `Grapheme_Cluster_Break` property values are `gcb1`
  /// (first) and `gcb2` (second).
  func isBoundary(
    _ gcb1: _GraphemeClusterBreakPropertyRawValue,
    _ gcb2: _GraphemeClusterBreakPropertyRawValue
  ) -> Bool {
    let noBoundaryBits = _noBoundaryRulesMatrix[Int(gcb1.rawValue)]
    // Select the bit that belongs to the second code point's property value.
    let followingBit: UInt16 = 1 << UInt16(gcb2.rawValue)
    return (noBoundaryBits & followingBit) == 0
  }
}
|
|
|
|
// ${'Local Variables'}:
|
|
// eval: (read-only-mode 1)
|
|
// End:
|