[stdlib] Parse my tweets faster! 2x forwards, 3x reverse

Adds in a special case grapheme break detection between two values
within scalar ranges where we have special knowledge. Any sub-0x300
scalars, except CR-LF, are guaranteed to have grapheme breaks between
them. We're reasonably confident this will not change in future
versions of Unicode. We might add more ranges in the future, but
should do so conservatively, anticipating future Unicode changes.

In these cases we can very quickly break, even for strings that have
mixed Latin and emoji characters. In an ASCII string with a single
emoji in it, we traverse the string 2x faster forwards and 3x faster
in reverse. (Reverse is 3x faster as it involves some forwards
traversal inside of the index). For a string that's half Latin half
non-Latin, we're about 1.5x faster forwards and backwards.
This commit is contained in:
Michael Ilseman
2017-04-24 14:00:17 -07:00
parent 517ac2a66f
commit 2d8164e552
2 changed files with 62 additions and 8 deletions

View File

@@ -18,6 +18,10 @@
// FIXME(ABI)#70 : The character string view should have a custom iterator type to
// allow performance optimizations of linear traversals.
/// Carriage return (0x0D) as an 8-bit code unit. CR and LF are common special
/// cases in grapheme breaking logic: the pair CR-LF is the one ASCII sequence
/// that is measured as a two-unit grapheme.
internal let _CR: UInt8 = 0x0d
/// Line feed (0x0A) as an 8-bit code unit; the second half of a CR-LF pair.
internal let _LF: UInt8 = 0x0a
extension String {
/// A view of a string's contents as a collection of characters.
///
@@ -246,6 +250,22 @@ extension String.CharacterView : BidirectionalCollection {
)
}
/// Fast check for a (stable) grapheme break between two UInt16 code units.
///
/// - Parameters:
///   - lhs: The code unit immediately before the candidate boundary.
///   - rhs: The code unit immediately after the candidate boundary.
/// - Returns: `true` when a grapheme break is known to lie between `lhs` and
///   `rhs`. `false` means "not decided here": callers fall through to the
///   full grapheme segmenter, so `false` is always a safe answer.
@inline(__always)
internal static func _quickCheckGraphemeBreakBetween(
  _ lhs: UInt16, _ rhs: UInt16
) -> Bool {
  // With the exception of CR-LF, there is always a grapheme break between two
  // sub-0x300 code units
  if lhs < 0x300 && rhs < 0x300 {
    // Only the exact pair (CR, LF) stays together. A CR followed by anything
    // else, or an LF preceded by anything else, still has a break, so the
    // test is the negation of "lhs is CR and rhs is LF".
    return lhs != UInt16(_CR) || rhs != UInt16(_LF)
  }
  // TODO: Other large ranges, such as CJK? Note that any such addition must
  // be highly probable to never change in future Unicode versions.
  return false
}
// NOTE: don't make this function inlineable. Grapheme cluster
// segmentation uses a completely different algorithm in Unicode 9.0.
//
@@ -266,15 +286,13 @@ extension String.CharacterView : BidirectionalCollection {
_onFastPath() // Please aggressively inline
let asciiBuffer = _core.asciiBuffer._unsafelyUnwrappedUnchecked
let pos = start._position - _coreOffset
let CR: UInt8 = 0x0d
let LF: UInt8 = 0x0a
// With the exception of CR-LF, ASCII graphemes are single-scalar. Check
// for that one exception.
if _slowPath(
asciiBuffer[pos] == CR &&
asciiBuffer[pos] == _CR &&
pos+1 < asciiBuffer.endIndex &&
asciiBuffer[pos+1] == LF
asciiBuffer[pos+1] == _LF
) {
return 2
}
@@ -283,6 +301,22 @@ extension String.CharacterView : BidirectionalCollection {
}
let startIndexUTF16 = start._position
// Last scalar is its own grapheme
if (startIndexUTF16+1 == end._position) {
return 1
}
// Perform a quick single-code-unit grapheme check
if _core._baseAddress != nil {
if String.CharacterView._quickCheckGraphemeBreakBetween(
_core._nthContiguous(startIndexUTF16),
_core._nthContiguous(startIndexUTF16+1)
) {
return 1
}
}
let graphemeClusterBreakProperty =
_UnicodeGraphemeClusterBreakPropertyTrie()
let segmenter = _UnicodeExtendedGraphemeClusterSegmenter()
@@ -332,15 +366,13 @@ extension String.CharacterView : BidirectionalCollection {
_sanityCheck(
pos >= asciiBuffer.startIndex,
"should of been caught in earlier start-of-scalars check")
let CR: UInt8 = 0x0d
let LF: UInt8 = 0x0a
// With the exception of CR-LF, ASCII graphemes are single-scalar. Check
// for that one exception.
if _slowPath(
asciiBuffer[pos] == LF &&
asciiBuffer[pos] == _LF &&
pos-1 >= asciiBuffer.startIndex &&
asciiBuffer[pos-1] == CR
asciiBuffer[pos-1] == _CR
) {
return 2
}
@@ -349,6 +381,22 @@ extension String.CharacterView : BidirectionalCollection {
}
let endIndexUTF16 = end._position
// First scalar is its own grapheme
if (endIndexUTF16-1 == start._position) {
return 1
}
// Perform a quick single-code-unit grapheme check
if _core._baseAddress != nil {
if String.CharacterView._quickCheckGraphemeBreakBetween(
_core._nthContiguous(endIndexUTF16-2),
_core._nthContiguous(endIndexUTF16-1)
) {
return 1
}
}
let graphemeClusterBreakProperty =
_UnicodeGraphemeClusterBreakPropertyTrie()
let segmenter = _UnicodeExtendedGraphemeClusterSegmenter()