mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
This is only for the frontend, not for stdlib. The implementation is very slow; optimizing it is the next step. rdar://16755123 rdar://16013860 Swift SVN r18928
118 lines
3.7 KiB
C++
118 lines
3.7 KiB
C++
%# -*- mode: C++ -*-
|
||
|
||
%# Ignore the following admonition; it applies to the resulting .cpp file only
|
||
//// Automatically Generated From UnicodeExtendedGraphemeClusters.cpp.gyb.
|
||
//// Do Not Edit Directly!
|
||
//===----------------------------------------------------------------------===//
|
||
//
|
||
// This source file is part of the Swift.org open source project
|
||
//
|
||
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
|
||
// Licensed under Apache License v2.0 with Runtime Library Exception
|
||
//
|
||
// See http://swift.org/LICENSE.txt for license information
|
||
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
||
//
|
||
//===----------------------------------------------------------------------===//
|
||
|
||
%{
|
||
|
||
import re
|
||
|
||
def convertLine(line):
    """Convert one line of the Unicode grapheme break test data.

    A line is a sequence of space-separated tokens: a division sign
    (U+00F7) marks a grapheme cluster boundary, a multiplication sign
    (U+00D7) marks "no boundary", and every other token is a code point
    written in hexadecimal.  Anything after '#' is a comment.

    `line` may be text or raw UTF-8 bytes (the data file is opened in
    binary mode).

    Returns None for blank or comment-only lines.  Otherwise returns a
    tuple (test, boundaries): `test` is the UTF-8 encoding of the code
    points spelled as C hex escapes, ready to be pasted into a C++ string
    literal, and `boundaries` is the list of byte offsets into that
    encoding at which a boundary is expected.
    """
    # Lines read from a file opened with 'rb' are bytes on Python 3 (and str,
    # which is the same type as bytes, on Python 2).  Normalize to text so the
    # regex and the token comparisons below behave the same everywhere.
    if isinstance(line, bytes):
        line = line.decode('utf-8')

    # Strip comments.
    line = re.sub('#.*', '', line).strip()

    if line == "":
        return None

    test = ""
    currBytes = 0
    boundaries = []

    # Match a list of code points.  split() without an argument tolerates
    # runs of whitespace instead of producing empty tokens.
    for token in line.split():
        if token == u'\u00f7':
            # Division sign: a boundary is expected at the current offset.
            boundaries.append(currBytes)
        elif token == u'\u00d7':
            # Multiplication sign: explicitly no boundary here.
            pass
        else:
            codePoint = int(token, 16)
            # Tests from Unicode spec have isolated surrogates in them.  Our
            # segmentation algorithm works on UTF-8 sequences, so encoding a
            # surrogate would produce an invalid code unit sequence.  Instead
            # of trying to emulate the maximal subpart algorithm for inserting
            # U+FFFD in Python, we just replace every isolated surrogate with
            # U+200B, which also has Grapheme_Cluster_Break equal to 'Control'
            # and test separately that we handle ill-formed UTF-8 sequences.
            if 0xd800 <= codePoint <= 0xdfff:
                codePoint = 0x200b
            # Go through the 'unicode_escape' codec rather than chr()/unichr()
            # so that code points above U+FFFF also work on narrow Python 2
            # builds.  The backslash is doubled because a bare \U in an
            # ordinary string literal is a syntax error on Python 3.
            scalar = ('\\U%08x' % codePoint).encode('ascii').decode(
                'unicode_escape')
            # bytearray yields integers when iterated on both Python 2 and 3.
            asUTF8Bytes = bytearray(scalar.encode('utf8'))
            test += ''.join('\\x%02x' % byte for byte in asUTF8Bytes)
            currBytes += len(asUTF8Bytes)

    return (test, boundaries)
|
||
|
||
# Sanity-check the converter on two hand-computed lines before trusting it
# with the real data: an ordinary sequence and an isolated surrogate.
assert convertLine('÷ 0903 × 0308 ÷ AC01 ÷ # abc') == \
    ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [0, 5, 8])
assert convertLine('÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [0, 3])

# Every convertible line of the test data file becomes one
# (escaped string, expected boundaries) entry; lines for which convertLine
# returns None are skipped.
graphemeBreakTests = []

with open(unicodeGraphemeBreakTestFile, 'rb') as testData:
    for rawLine in testData:
        converted = convertLine(rawLine)
        if converted:
            graphemeBreakTests.append(converted)
|
||
|
||
}%
|
||
|
||
#include "swift/Basic/Unicode.h"
|
||
#include "gtest/gtest.h"
|
||
#include <vector>
|
||
|
||
using namespace swift;
|
||
using namespace swift::unicode;
|
||
|
||
/// Splits \p Str into consecutive extended grapheme clusters and returns the
/// byte offset of every cluster edge.
///
/// The result always starts with 0.  One further offset is appended per
/// cluster reported by extractFirstExtendedGraphemeCluster, so for a
/// non-empty string the last entry equals Str.size().
static std::vector<unsigned> FindGraphemeClusterBoundaries(StringRef Str) {
  std::vector<unsigned> Boundaries;
  Boundaries.push_back(0);

  // Repeatedly peel the leading cluster off the unconsumed tail of the string.
  for (unsigned Offset = 0; Offset != Str.size();) {
    const auto ClusterLength =
        extractFirstExtendedGraphemeCluster(Str.substr(Offset)).size();
    Offset += ClusterLength;
    Boundaries.push_back(Offset);
  }

  return Boundaries;
}
|
||
|
||
// One expectation is generated by gyb for each entry of graphemeBreakTests:
// the entry's escaped UTF-8 string is segmented and the resulting offsets are
// compared against the entry's expected boundary list.
TEST(ExtractExtendedGraphemeCluster, TestsFromUnicodeSpec) {
% for utf8Literal, boundaryOffsets in graphemeBreakTests:
  EXPECT_EQ((std::vector<unsigned>{ ${', '.join(str(offset) for offset in boundaryOffsets)} }),
            FindGraphemeClusterBoundaries("${utf8Literal}"));
% end
}
|
||
|
||
// Hand-written cases for ill-formed UTF-8, which the generated tests do not
// cover (the generator replaces isolated surrogates with U+200B).
TEST(ExtractExtendedGraphemeCluster, ExtraTests) {
  using Offsets = std::vector<unsigned>;

  //
  // Sequences with one continuation byte missing
  //

  EXPECT_EQ((Offsets{ 0, 1 }), FindGraphemeClusterBoundaries("\xc2"));

  //
  // Isolated surrogates
  //

  // Each byte of an encoded surrogate is expected to be its own cluster.
  EXPECT_EQ((Offsets{ 0, 1, 2, 3 }),
            FindGraphemeClusterBoundaries("\xed\xa0\x80"));

  // The same surrogate bytes placed between two copies of the four-byte
  // sequence f3 a0 84 80.  Per the expected offsets, the last surrogate byte
  // and the trailing four-byte sequence share one cluster (6 to 11).
  EXPECT_EQ((Offsets{ 0, 4, 5, 6, 11 }),
            FindGraphemeClusterBoundaries(
                "\xf3\xa0\x84\x80" "\xed\xa0\x80" "\xf3\xa0\x84\x80"));
}
|
||
|