%# -*- mode: C++ -*-

%# Ignore the following admonition; it applies to the resulting .cpp file only
//// Automatically Generated From UnicodeExtendedGraphemeClusters.cpp.gyb.
//// Do Not Edit Directly!
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See http://swift.org/LICENSE.txt for license information
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

%{

import re

def convertLine(line):
  """Convert one line of the grapheme break test data into a test case.

  A line is a space-separated list of hexadecimal code points interleaved
  with boundary markers: U+00F7 (DIVISION SIGN) marks a grapheme cluster
  boundary, U+00D7 (MULTIPLICATION SIGN) marks the absence of one.  Anything
  after '#' is a comment.

  `line` may be text or UTF-8 encoded bytes.

  Returns None for blank or comment-only lines.  Otherwise returns a tuple
  `(test, boundaries)` where `test` is the subject string spelled as C
  `\\xNN` escapes of its UTF-8 encoding and `boundaries` is the list of byte
  offsets at which a boundary marker appeared.
  """
  # The caller may hand us raw bytes (file opened in binary mode); normalize
  # to text so the token comparisons below behave the same on Python 2 and 3.
  if isinstance(line, bytes):
    line = line.decode('utf-8')

  # Strip comments.
  line = re.sub('#.*', '', line).strip()

  if line == "":
    return None

  test = ""
  currBytes = 0
  boundaries = []

  # Match a list of code points.  Split on any whitespace run so that tabs or
  # doubled spaces do not produce empty tokens.
  for token in line.split():
    if token == u"\u00f7":
      # Boundary marker: record the current byte offset.
      boundaries += [ currBytes ]
    elif token == u"\u00d7":
      # No-boundary marker: nothing to record.
      pass
    else:
      codePoint = int(token, 16)
      # Tests from Unicode spec have isolated surrogates in them.  Our
      # segmentation algorithm works on UTF-8 sequences, so encoding a
      # surrogate would produce an invalid code unit sequence.  Instead
      # of trying to emulate the maximal subpart algorithm for inserting
      # U+FFFD in Python, we just replace every isolated surrogate with
      # U+200B, which also has Grapheme_Cluster_Break equal to 'Control'
      # and test separately that we handle ill-formed UTF-8 sequences.
      if codePoint >= 0xd800 and codePoint <= 0xdfff:
        codePoint = 0x200b
      # Build the character from a '\UXXXXXXXX' escape.  Going through
      # bytes -> decode('unicode_escape') works on both Python 2 and 3 (a
      # bare '\U...' literal is a syntax error in Python 3, and str has no
      # .decode() there).
      character = \
          ('\\U%08x' % codePoint).encode('ascii').decode('unicode_escape')
      # bytearray yields ints on every Python version, so no ord() is needed.
      asUTF8Bytes = bytearray(character.encode('utf8'))
      asUTF8Escaped = ''.join(['\\x%02x' % byte for byte in asUTF8Bytes])
      test += asUTF8Escaped
      currBytes += len(asUTF8Bytes)

  return (test, boundaries)

# Self-test.
assert(convertLine('÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ]))
assert(convertLine('÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ]))

# Every non-empty, non-comment line of the test data file becomes one
# (subjectString, expectedBoundaries) pair.
graphemeBreakTests = []

with open(unicodeGraphemeBreakTestFile, 'rb') as f:
  for line in f:
    test = convertLine(line)
    if test:
      graphemeBreakTests += [ test ]

}%

#include "swift/Basic/Unicode.h"
#include "gtest/gtest.h"
#include <vector>

using namespace swift;
using namespace swift::unicode;

/// Repeatedly peels the first extended grapheme cluster off \p Str and
/// records the byte offset reached after each one.
///
/// \returns the list of offsets, always starting with 0; each further entry
/// is the previous offset plus the size of the cluster extracted there.
///
/// NOTE(review): the loop only terminates if
/// extractFirstExtendedGraphemeCluster returns a non-empty result for every
/// non-empty input -- confirm against its contract.
static std::vector<unsigned> FindGraphemeClusterBoundaries(StringRef Str) {
  std::vector<unsigned> Result;
  Result.push_back(0);

  unsigned Pos = 0;
  while (Pos != Str.size()) {
    Pos += extractFirstExtendedGraphemeCluster(Str.substr(Pos)).size();
    Result.push_back(Pos);
  }
  return Result;
}

// One EXPECT_EQ is generated per entry of graphemeBreakTests.
TEST(ExtractExtendedGraphemeCluster, TestsFromUnicodeSpec) {
% for subjectString,expectedBoundaries in graphemeBreakTests:
  EXPECT_EQ((std::vector<unsigned>{ ${', '.join([ str(x) for x in expectedBoundaries ])} }),
      FindGraphemeClusterBoundaries("${subjectString}"));
% end
}

// Hand-written cases for ill-formed UTF-8, which the generated tests above
// do not cover.
TEST(ExtractExtendedGraphemeCluster, ExtraTests) {
  //
  // Sequences with one continuation byte missing
  //
  EXPECT_EQ((std::vector<unsigned>{ 0, 1 }),
      FindGraphemeClusterBoundaries("\xc2"));

  //
  // Isolated surrogates
  //
  EXPECT_EQ((std::vector<unsigned>{ 0, 1, 2, 3 }),
      FindGraphemeClusterBoundaries("\xed\xa0\x80"));
  EXPECT_EQ((std::vector<unsigned>{ 0, 4, 5, 6, 11 }),
      FindGraphemeClusterBoundaries(
        "\xf3\xa0\x84\x80" "\xed\xa0\x80" "\xf3\xa0\x84\x80"));
}