mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
[stdlib] Parse my tweets faster! 2x forwards, 3x reverse
Adds a special-case grapheme break detection between two values within scalar ranges where we have special knowledge. Any sub-0x300 scalars, except CR-LF, are guaranteed to have grapheme breaks between them. We're reasonably confident this will not change in future versions of Unicode. We might add more ranges in the future, but should do so conservatively, anticipating future Unicode changes. In these cases we can very quickly break, even for strings that have mixed Latin and emoji characters. In an ASCII string with a single emoji in it, we traverse the string 2x faster forwards and 3x faster in reverse. (Reverse is 3x faster as it involves some forwards traversal inside of the index.) For a string that's half Latin, half non-Latin, we're about 1.5x faster forwards and backwards.
This commit is contained in:
@@ -18,6 +18,10 @@
|
||||
// FIXME(ABI)#70 : The character string view should have a custom iterator type to
|
||||
// allow performance optimizations of linear traversals.
|
||||
|
||||
/// CR and LF are common special cases in grapheme breaking logic
///
/// Carriage return (0x0D) as a single byte; compared against ASCII buffer
/// elements and widened to `UInt16` for code-unit comparisons in this file.
|
||||
internal let _CR: UInt8 = 0x0d
|
||||
/// Line feed (0x0A) as a single byte; the second half of the CR-LF pair that
/// the grapheme breaking fast paths in this file check for.
internal let _LF: UInt8 = 0x0a
|
||||
|
||||
extension String {
|
||||
/// A view of a string's contents as a collection of characters.
|
||||
///
|
||||
@@ -246,6 +250,22 @@ extension String.CharacterView : BidirectionalCollection {
|
||||
)
|
||||
}
|
||||
|
||||
/// Fast check for a (stable) grapheme break between two UInt16 code units
///
/// - Parameters:
///   - lhs: The code unit immediately before the candidate boundary.
///   - rhs: The code unit immediately after the candidate boundary.
/// - Returns: `true` only when a grapheme break between `lhs` and `rhs` is
///   guaranteed. `false` means "not known from this quick check"; the caller
///   must then fall back to full grapheme cluster segmentation.
@inline(__always)
internal static func _quickCheckGraphemeBreakBetween(
  _ lhs: UInt16, _ rhs: UInt16
) -> Bool {
  // With the exception of CR-LF, there is always a grapheme break between two
  // sub-0x300 code units
  if lhs < 0x300 && rhs < 0x300 {
    // Only the exact pair CR followed by LF stays joined. A CR followed by
    // anything else, or an LF preceded by anything else, still has a break
    // between the two code units, so those pairs can take the fast path too.
    return !(lhs == UInt16(_CR) && rhs == UInt16(_LF))
  }

  // TODO: Other large ranges, such as CJK? Note that any such addition must
  // be highly probable to never change in future Unicode versions.
  return false
}
|
||||
|
||||
// NOTE: don't make this function inlineable. Grapheme cluster
|
||||
// segmentation uses a completely different algorithm in Unicode 9.0.
|
||||
//
|
||||
@@ -266,15 +286,13 @@ extension String.CharacterView : BidirectionalCollection {
|
||||
_onFastPath() // Please aggressively inline
|
||||
let asciiBuffer = _core.asciiBuffer._unsafelyUnwrappedUnchecked
|
||||
let pos = start._position - _coreOffset
|
||||
let CR: UInt8 = 0x0d
|
||||
let LF: UInt8 = 0x0a
|
||||
|
||||
// With the exception of CR-LF, ASCII graphemes are single-scalar. Check
|
||||
// for that one exception.
|
||||
if _slowPath(
|
||||
asciiBuffer[pos] == CR &&
|
||||
asciiBuffer[pos] == _CR &&
|
||||
pos+1 < asciiBuffer.endIndex &&
|
||||
asciiBuffer[pos+1] == LF
|
||||
asciiBuffer[pos+1] == _LF
|
||||
) {
|
||||
return 2
|
||||
}
|
||||
@@ -283,6 +301,22 @@ extension String.CharacterView : BidirectionalCollection {
|
||||
}
|
||||
|
||||
let startIndexUTF16 = start._position
|
||||
|
||||
// Last scalar is its own grapheme
|
||||
if (startIndexUTF16+1 == end._position) {
|
||||
return 1
|
||||
}
|
||||
|
||||
// Perform a quick single-code-unit grapheme check
|
||||
if _core._baseAddress != nil {
|
||||
if String.CharacterView._quickCheckGraphemeBreakBetween(
|
||||
_core._nthContiguous(startIndexUTF16),
|
||||
_core._nthContiguous(startIndexUTF16+1)
|
||||
) {
|
||||
return 1
|
||||
}
|
||||
}
|
||||
|
||||
let graphemeClusterBreakProperty =
|
||||
_UnicodeGraphemeClusterBreakPropertyTrie()
|
||||
let segmenter = _UnicodeExtendedGraphemeClusterSegmenter()
|
||||
@@ -332,15 +366,13 @@ extension String.CharacterView : BidirectionalCollection {
|
||||
_sanityCheck(
|
||||
pos >= asciiBuffer.startIndex,
|
||||
"should of been caught in earlier start-of-scalars check")
|
||||
let CR: UInt8 = 0x0d
|
||||
let LF: UInt8 = 0x0a
|
||||
|
||||
// With the exception of CR-LF, ASCII graphemes are single-scalar. Check
|
||||
// for that one exception.
|
||||
if _slowPath(
|
||||
asciiBuffer[pos] == LF &&
|
||||
asciiBuffer[pos] == _LF &&
|
||||
pos-1 >= asciiBuffer.startIndex &&
|
||||
asciiBuffer[pos-1] == CR
|
||||
asciiBuffer[pos-1] == _CR
|
||||
) {
|
||||
return 2
|
||||
}
|
||||
@@ -349,6 +381,22 @@ extension String.CharacterView : BidirectionalCollection {
|
||||
}
|
||||
|
||||
let endIndexUTF16 = end._position
|
||||
|
||||
// First scalar is its own grapheme
|
||||
if (endIndexUTF16-1 == start._position) {
|
||||
return 1
|
||||
}
|
||||
|
||||
// Perform a quick single-code-unit grapheme check
|
||||
if _core._baseAddress != nil {
|
||||
if String.CharacterView._quickCheckGraphemeBreakBetween(
|
||||
_core._nthContiguous(endIndexUTF16-2),
|
||||
_core._nthContiguous(endIndexUTF16-1)
|
||||
) {
|
||||
return 1
|
||||
}
|
||||
}
|
||||
|
||||
let graphemeClusterBreakProperty =
|
||||
_UnicodeGraphemeClusterBreakPropertyTrie()
|
||||
let segmenter = _UnicodeExtendedGraphemeClusterSegmenter()
|
||||
|
||||
Reference in New Issue
Block a user