mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
libBasic: implement extended grapheme cluster segmentation algorithm
This is only for the frontend, not for stdlib. The implementation is very slow; optimizing it is the next step. rdar://16755123 rdar://16013860 Swift SVN r18928
This commit is contained in:
@@ -16,13 +16,47 @@
|
||||
using namespace swift;
|
||||
|
||||
/// Returns the prefix of \p S that forms its first extended grapheme cluster,
/// using the segmentation algorithm described in Unicode Standard Annex #29.
///
/// An empty \p S yields an empty StringRef.  If no code point can be decoded
/// from the start of \p S, all of \p S is returned.
StringRef swift::unicode::extractFirstExtendedGraphemeCluster(StringRef S) {
  if (S.empty())
    return StringRef();

  const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
  const UTF8 *SourceEnd = SourceStart + S.size();

  const UTF8 *SourceNext = SourceStart;
  // C[0] is the last code point accepted into the cluster; C[1] is the
  // candidate code point that follows it.
  UTF32 C[2];
  UTF32 *TargetStart = C;

  // Decode the first code point.  The target range is [C, C + 1), so at most
  // one code point is written.
  ConvertUTF8toUTF32(&SourceNext, SourceEnd, &TargetStart, C + 1,
                     lenientConversion);
  if (TargetStart == C) {
    // The source string contains an ill-formed subsequence at the end.
    return S;
  }

  GraphemeClusterBreakProperty GCBForC0 = getGraphemeClusterBreakProperty(C[0]);
  while (true) {
    // NOTE(review): presumably true when a boundary follows C[0] no matter
    // what comes next, which lets us stop without decoding further -- confirm
    // against the helper's definition.
    if (isExtendedGraphemeClusterBoundaryAfter(GCBForC0))
      return S.slice(0, SourceNext - SourceStart);

    // Remember where the candidate code point starts, so the cluster can be
    // cut just before it.
    size_t C1Offset = SourceNext - SourceStart;
    ConvertUTF8toUTF32(&SourceNext, SourceEnd, &TargetStart, C + 2,
                       lenientConversion);

    if (TargetStart == C + 1) {
      // End of source string or the source string contains an ill-formed
      // subsequence at the end.
      return S.slice(0, C1Offset);
    }

    GraphemeClusterBreakProperty GCBForC1 =
        getGraphemeClusterBreakProperty(C[1]);
    if (isExtendedGraphemeClusterBoundary(GCBForC0, GCBForC1))
      return S.slice(0, C1Offset);

    // No boundary: the candidate joins the cluster and becomes the new C[0].
    // Reset the write cursor so the next decode fills C[1] again.
    C[0] = C[1];
    TargetStart = C + 1;
    GCBForC0 = GCBForC1;
  }
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user