Files
swift-mirror/unittests/Basic/UnicodeGraphemeBreakTest.cpp.gyb
Dmitri Hrybenko 7704e19b7d libBasic: implement extended grapheme cluster segmentation algorithm
This is only for the frontend, not for stdlib.  The implementation is very
slow, optimizing it is the next step.

rdar://16755123 rdar://16013860


Swift SVN r18928
2014-06-16 14:20:43 +00:00

118 lines
3.7 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
%# -*- mode: C++ -*-
%# Ignore the following admonition; it applies to the resulting .cpp file only
//// Automatically Generated From UnicodeGraphemeBreakTest.cpp.gyb.
//// Do Not Edit Directly!
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See http://swift.org/LICENSE.txt for license information
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
%{
import re
def convertLine(line):
    """Convert one line of the Unicode grapheme break test data to a test case.

    `line` may be a byte string (assumed to be UTF-8) or a text string.  It is
    expected to look like ``÷ 0903 × 0308 ÷ AC01 ÷ # comment``, where ``÷``
    marks a grapheme cluster boundary, ``×`` marks the absence of one, and
    every other token is a hexadecimal code point.

    Returns None when the line contains nothing but a comment or whitespace.
    Otherwise returns a tuple ``(test, boundaries)``:

    * ``test`` is the UTF-8 encoding of the code points, spelled as a
      sequence of ``\\xNN`` escapes suitable for pasting into a C++ string
      literal;
    * ``boundaries`` is the list of byte offsets into that encoding at which
      a ``÷`` marker appeared.

    Raises ValueError if a token is neither a marker nor a hexadecimal number.
    """
    # The data file is read in binary mode, so lines normally arrive as bytes.
    # Decoding up front lets the same code run under Python 2 and Python 3
    # (under Python 2, `bytes` is `str`, so plain string literals are decoded
    # as well).
    if isinstance(line, bytes):
        line = line.decode('utf-8')

    # Strip comments.
    line = re.sub('#.*', '', line).strip()
    if line == "":
        return None

    test = ""
    currBytes = 0
    boundaries = []

    # Match a list of code points.  split() without an argument tolerates
    # tabs and runs of spaces between tokens.
    for token in line.split():
        if token == u'\u00f7':  # '÷': boundary at the current byte offset.
            boundaries += [currBytes]
        elif token == u'\u00d7':  # '×': explicitly no boundary here.
            pass
        else:
            codePoint = int(token, 16)
            # Tests from Unicode spec have isolated surrogates in them.  Our
            # segmentation algorithm works on UTF-8 sequences, so encoding a
            # surrogate would produce an invalid code unit sequence.  Instead
            # of trying to emulate the maximal subpart algorithm for inserting
            # U+FFFD in Python, we just replace every isolated surrogate with
            # U+200B, which also has Grapheme_Cluster_Break equal to 'Control'
            # and test separately that we handle ill-formed UTF-8 sequences.
            if 0xd800 <= codePoint <= 0xdfff:
                codePoint = 0x200b
            # Going through an ASCII '\UXXXXXXXX' escape yields the character
            # on Python 2 (including narrow builds) and Python 3 alike.
            character = ('\\U%08x' % codePoint).encode('ascii').decode(
                'unicode_escape')
            # bytearray iterates as integers on both Python 2 and Python 3.
            asUTF8Bytes = bytearray(character.encode('utf-8'))
            test += ''.join(['\\x%02x' % byte for byte in asUTF8Bytes])
            currBytes += len(asUTF8Bytes)
    return (test, boundaries)
# Sanity-check convertLine on two hand-computed inputs before trusting it with
# the real data: a three-code-point line, and a line with an isolated
# surrogate (which convertLine substitutes with U+200B).
assert convertLine('÷ 0903 × 0308 ÷ AC01 ÷ # abc') == \
    ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ])
assert convertLine('÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ])

# Convert every line of the test data file, dropping the lines for which
# convertLine produced nothing (comment-only and blank lines).
graphemeBreakTests = []
with open(unicodeGraphemeBreakTestFile, 'rb') as testDataFile:
    graphemeBreakTests.extend(filter(None, map(convertLine, testDataFile)))
}%
#include "swift/Basic/Unicode.h"
#include "gtest/gtest.h"
#include <vector>
using namespace swift;
using namespace swift::unicode;
/// Splits \p Str into extended grapheme clusters by repeatedly calling
/// extractFirstExtendedGraphemeCluster on the not-yet-consumed tail.
///
/// \returns the offsets into \p Str at which a cluster starts or ends: the
/// list always begins with 0 and, once the whole string has been consumed,
/// ends with Str.size().  For an empty string the result is just { 0 }.
static std::vector<unsigned> FindGraphemeClusterBoundaries(StringRef Str) {
  std::vector<unsigned> Boundaries{0u};
  for (unsigned Offset = 0; Offset != Str.size();) {
    const StringRef Remainder = Str.substr(Offset);
    // Advance past the cluster at the front of the remaining text and record
    // where it ends.
    Offset += extractFirstExtendedGraphemeCluster(Remainder).size();
    Boundaries.push_back(Offset);
  }
  return Boundaries;
}
// Test cases derived from the Unicode grapheme break test data.  The template
// loop below emits one EXPECT_EQ per entry of graphemeBreakTests; each entry
// pairs an escaped UTF-8 subject string with the list of offsets at which a
// boundary is expected, and the check compares that list against the result of
// FindGraphemeClusterBoundaries on the subject string.
TEST(ExtractExtendedGraphemeCluster, TestsFromUnicodeSpec) {
% for subjectString,expectedBoundaries in graphemeBreakTests:
EXPECT_EQ((std::vector<unsigned>{ ${', '.join([ str(x) for x in expectedBoundaries ])} }),
FindGraphemeClusterBoundaries("${subjectString}"));
% end
}
// Hand-written cases for ill-formed UTF-8, which the test data taken from the
// Unicode spec cannot express once isolated surrogates have been substituted.
TEST(ExtractExtendedGraphemeCluster, ExtraTests) {
  using Offsets = std::vector<unsigned>;

  // A sequence with one continuation byte missing: the lone lead byte is
  // expected to be a cluster of its own.
  EXPECT_EQ((Offsets{0, 1}), FindGraphemeClusterBoundaries("\xc2"));

  // An isolated surrogate (ED A0 80): every byte is expected to end a
  // cluster.
  EXPECT_EQ((Offsets{0, 1, 2, 3}),
            FindGraphemeClusterBoundaries("\xed\xa0\x80"));

  // The same isolated surrogate placed between two copies of the four-byte
  // sequence F3 A0 84 80 (U+E0100): the expected boundaries keep the first
  // copy as its own cluster and join the second copy to the last byte of the
  // surrogate.
  EXPECT_EQ((Offsets{0, 4, 5, 6, 11}),
            FindGraphemeClusterBoundaries(
                "\xf3\xa0\x84\x80" "\xed\xa0\x80" "\xf3\xa0\x84\x80"));
}