stdlib/String: implement Unicode extended grapheme cluster segmentation algorithm

The implementation uses a specialized trie that has not been tuned to the table data. I tried guessing parameter values that should work well, but did not do any performance measurements. There is no efficient way to initialize arrays with static data in Swift, so the required tables are generated as C++ code in the runtime library.

rdar://16013860

Swift SVN r19340
2025-12-14 20:36:38 +01:00 · 2014-06-30 14:38:53 +00:00
parent a01b72ea2f
commit 4814e00fda
17 changed files with 1349 additions and 161 deletions
--- a/unittests/Basic/UnicodeGraphemeBreakTest.cpp.gyb
+++ b/unittests/Basic/UnicodeGraphemeBreakTest.cpp.gyb
@@ -17,55 +17,10 @@

 %{

-import re
+from GYBUnicodeDataUtils import *

-def convertLine(line):
-    # Strip comments.
-    line = re.sub('#.*', '', line).strip()
-
-    if line == "":
-        return None
-
-    test = ""
-    currBytes = 0
-    boundaries = []
-
-    # Match a list of code points.
-    for token in line.split(" "):
-        if token == "÷":
-            boundaries += [ currBytes ]
-        elif token == "×":
-            pass
-        else:
-            codePoint = int(token, 16)
-            # Tests from Unicode spec have isolated surrogates in them.  Our
-            # segmentation algorithm works on UTF-8 sequences, so encoding a
-            # surrogate would produce an invalid code unit sequence.  Instead
-            # of trying to emulate the maximal subpart algorithm for inserting
-            # U+FFFD in Python, we just replace every isolated surrogate with
-            # U+200B, which also has Grapheme_Cluster_Break equal to 'Control'
-            # and test separately that we handle ill-formed UTF-8 sequences.
-            if codePoint >= 0xd800 and codePoint <= 0xdfff:
-                codePoint = 0x200b
-            codePoint = ('\U%(cp)08x' % { 'cp': codePoint }).decode('unicode_escape')
-            asUTF8Bytes = codePoint.encode('utf8')
-            asUTF8Escaped = ''.join(['\\x%(byte)02x' % { 'byte': ord(byte) } for byte in asUTF8Bytes])
-            test += asUTF8Escaped
-            currBytes += len(asUTF8Bytes)
-
-    return (test, boundaries)
-
-# Self-test.
-assert(convertLine('÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ]))
-assert(convertLine('÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ]))
-
-graphemeBreakTests = []
-
-with open(unicodeGraphemeBreakTestFile, 'rb') as f:
-    for line in f:
-        test = convertLine(line)
-        if test:
-            graphemeBreakTests += [ test ]
+grapheme_cluster_break_tests = \
+    get_grapheme_cluster_break_tests_as_UTF8(unicodeGraphemeBreakTestFile)

 }%

@@ -90,9 +45,9 @@ static std::vector<unsigned> FindGraphemeClusterBoundaries(StringRef Str) {
 }

 TEST(ExtractExtendedGraphemeCluster, TestsFromUnicodeSpec) {
-% for subjectString,expectedBoundaries in graphemeBreakTests:
-  EXPECT_EQ((std::vector<unsigned>{ ${', '.join([ str(x) for x in expectedBoundaries ])} }),
-      FindGraphemeClusterBoundaries("${subjectString}"));
+% for subject_string,expected_boundaries in grapheme_cluster_break_tests:
+  EXPECT_EQ((std::vector<unsigned>{ ${', '.join([ str(x) for x in expected_boundaries ])} }),
+      FindGraphemeClusterBoundaries("${subject_string}"));
 % end
 }