libBasic: implement extended grapheme cluster segmentation algorithm

This is only for the frontend, not for stdlib. The implementation is very slow; optimizing it is the next step. rdar://16755123 rdar://16013860 Swift SVN r18928
2025-12-14 20:36:38 +01:00 · 2014-06-16 14:20:43 +00:00
parent e310fd7f0b
commit 7704e19b7d
11 changed files with 2071 additions and 9 deletions
--- a/unittests/Basic/UnicodeGraphemeBreakTest.cpp.gyb
+++ b/unittests/Basic/UnicodeGraphemeBreakTest.cpp.gyb
@@ -0,0 +1,117 @@
+%# -*- mode: C++ -*-
+
+%# Ignore the following admonition; it applies to the resulting .cpp file only
+//// Automatically Generated From UnicodeGraphemeBreakTest.cpp.gyb.
+//// Do Not Edit Directly!
+//===----------------------------------------------------------------------===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See http://swift.org/LICENSE.txt for license information
+// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
+//
+//===----------------------------------------------------------------------===//
+
+%{
+
+import re
+
+def convertLine(line):
+    # Strip comments.
+    line = re.sub('#.*', '', line).strip()
+
+    if line == "":
+        return None
+
+    test = ""
+    currBytes = 0
+    boundaries = []
+
+    # Match a list of code points.
+    for token in line.split(" "):
+        if token == "÷":
+            boundaries += [ currBytes ]
+        elif token == "×":
+            pass
+        else:
+            codePoint = int(token, 16)
+            # Tests from Unicode spec have isolated surrogates in them.  Our
+            # segmentation algorithm works on UTF-8 sequences, so encoding a
+            # surrogate would produce an invalid code unit sequence.  Instead
+            # of trying to emulate the maximal subpart algorithm for inserting
+            # U+FFFD in Python, we just replace every isolated surrogate with
+            # U+200B, which also has Grapheme_Cluster_Break equal to 'Control'
+            # and test separately that we handle ill-formed UTF-8 sequences.
+            if codePoint >= 0xd800 and codePoint <= 0xdfff:
+                codePoint = 0x200b
+            codePoint = ('\U%(cp)08x' % { 'cp': codePoint }).decode('unicode_escape')
+            asUTF8Bytes = codePoint.encode('utf8')
+            asUTF8Escaped = ''.join(['\\x%(byte)02x' % { 'byte': ord(byte) } for byte in asUTF8Bytes])
+            test += asUTF8Escaped
+            currBytes += len(asUTF8Bytes)
+
+    return (test, boundaries)
+
+# Self-test.
+assert(convertLine('÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ]))
+assert(convertLine('÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ]))
+
+graphemeBreakTests = []
+
+with open(unicodeGraphemeBreakTestFile, 'rb') as f:
+    for line in f:
+        test = convertLine(line)
+        if test:
+            graphemeBreakTests += [ test ]
+
+}%
+
+#include "swift/Basic/Unicode.h"
+#include "gtest/gtest.h"
+#include <vector>
+
+using namespace swift;
+using namespace swift::unicode;
+
+/// Splits \p Str by repeatedly calling extractFirstExtendedGraphemeCluster
+/// on the not-yet-consumed tail and records the byte offset reached after
+/// each extracted piece.
+///
+/// \returns the recorded offsets, always starting with 0.  When the whole
+/// string is consumed the last offset equals Str.size(); an empty string
+/// yields just { 0 }.
+static std::vector<unsigned> FindGraphemeClusterBoundaries(StringRef Str) {
+  std::vector<unsigned> Result;
+  Result.push_back(0);
+
+  unsigned Pos = 0;
+  while (Pos != Str.size()) {
+    const unsigned ClusterSize =
+        extractFirstExtendedGraphemeCluster(Str.substr(Pos)).size();
+    if (ClusterSize == 0) {
+      // An empty cluster for non-empty input means no progress is being
+      // made.  Stop here: the result then lacks the final boundary, so the
+      // caller's comparison fails instead of the test looping forever.
+      break;
+    }
+    Pos += ClusterSize;
+    Result.push_back(Pos);
+  }
+
+  return Result;
+}
+
+// One check is generated per test case parsed from the Unicode test data
+// file: the subject is emitted as a hex-escaped UTF-8 string literal and
+// the expected boundaries as a list of byte offsets.
+TEST(ExtractExtendedGraphemeCluster, TestsFromUnicodeSpec) {
+% for escapedSubject, boundaryOffsets in graphemeBreakTests:
+  EXPECT_EQ((std::vector<unsigned>{ ${', '.join(map(str, boundaryOffsets))} }),
+      FindGraphemeClusterBoundaries("${escapedSubject}"));
+% end
+}
+
+// Hand-written cases for ill-formed UTF-8, which the tests generated from
+// the Unicode data file do not cover.
+TEST(ExtractExtendedGraphemeCluster, ExtraTests) {
+  typedef std::vector<unsigned> Offsets;
+
+  // A sequence with one continuation byte missing: the lone lead byte is
+  // expected to form a piece of its own.
+  EXPECT_EQ((Offsets{ 0, 1 }), FindGraphemeClusterBoundaries("\xc2"));
+
+  // An isolated surrogate (the three-byte encoding of U+D800): a boundary
+  // is expected after each of its bytes.
+  EXPECT_EQ((Offsets{ 0, 1, 2, 3 }),
+            FindGraphemeClusterBoundaries("\xed\xa0\x80"));
+
+  // The same surrogate between two copies of the four-byte encoding of
+  // U+E0100.  The leading copy stands alone, the first two surrogate bytes
+  // are separate pieces, and the third surrogate byte is expected to be
+  // grouped with the trailing copy (offsets 6 to 11).
+  EXPECT_EQ((Offsets{ 0, 4, 5, 6, 11 }),
+            FindGraphemeClusterBoundaries(
+                "\xf3\xa0\x84\x80" "\xed\xa0\x80" "\xf3\xa0\x84\x80"));
+}
+