mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
libBasic: implement extended grapheme cluster segmentation algorithm
This is only for the frontend, not for stdlib. The implementation is very slow; optimizing it is the next step. rdar://16755123 rdar://16013860 Swift SVN r18928
This commit is contained in:
117
unittests/Basic/UnicodeGraphemeBreakTest.cpp.gyb
Normal file
117
unittests/Basic/UnicodeGraphemeBreakTest.cpp.gyb
Normal file
@@ -0,0 +1,117 @@
|
||||
%# -*- mode: C++ -*-
|
||||
|
||||
%# Ignore the following admonition; it applies to the resulting .cpp file only
|
||||
//// Automatically Generated From UnicodeGraphemeBreakTest.cpp.gyb.
|
||||
//// Do Not Edit Directly!
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This source file is part of the Swift.org open source project
|
||||
//
|
||||
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
|
||||
// Licensed under Apache License v2.0 with Runtime Library Exception
|
||||
//
|
||||
// See http://swift.org/LICENSE.txt for license information
|
||||
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
%{
|
||||
|
||||
import re
|
||||
|
||||
def convertLine(line):
    """Convert one line of the Unicode grapheme break test data.

    line -- one line of the test file, either as text or as raw UTF-8
            bytes (the file is opened in binary mode by the caller).

    Returns None when nothing is left after stripping the '#' comment.
    Otherwise returns a tuple (test, boundaries):
      test       -- the code points of the line, UTF-8 encoded and spelled
                    as a sequence of C hex escapes (one per byte);
      boundaries -- list of byte offsets into that UTF-8 sequence at which
                    the line marks a break (DIVISION SIGN token).
    """
    # Lines read from a file opened with 'rb' are bytes on Python 3 (and
    # 'str', which is the same type as 'bytes', on Python 2).  Work on text
    # so that the regex and the token comparisons behave the same on both.
    if isinstance(line, bytes):
        line = line.decode('utf-8')

    # Strip comments.
    line = re.sub('#.*', '', line).strip()

    if not line:
        return None

    test = ""
    currBytes = 0
    boundaries = []

    # Match a list of code points.  Splitting on arbitrary whitespace keeps
    # tabs or repeated spaces from producing empty tokens.
    for token in line.split():
        # The markers are written as escapes so the comparison does not
        # depend on the encoding of this source file.
        if token == u"\u00f7":  # DIVISION SIGN: a boundary is expected here.
            boundaries.append(currBytes)
            continue
        if token == u"\u00d7":  # MULTIPLICATION SIGN: no boundary here.
            continue

        codePoint = int(token, 16)
        # Tests from Unicode spec have isolated surrogates in them.  Our
        # segmentation algorithm works on UTF-8 sequences, so encoding a
        # surrogate would produce an invalid code unit sequence.  Instead
        # of trying to emulate the maximal subpart algorithm for inserting
        # U+FFFD in Python, we just replace every isolated surrogate with
        # U+200B, which also has Grapheme_Cluster_Break equal to 'Control'
        # and test separately that we handle ill-formed UTF-8 sequences.
        if 0xd800 <= codePoint <= 0xdfff:
            codePoint = 0x200b

        # Go through the 'unicode_escape' codec instead of unichr()/chr() so
        # that code points above U+FFFF also work on narrow Python 2 builds.
        # The encode('ascii') step turns the escape text into bytes, which is
        # what decode() needs on Python 3 and is harmless on Python 2.
        escapeText = '\\U%08x' % codePoint
        character = escapeText.encode('ascii').decode('unicode_escape')

        # bytearray iterates as integers on both Python 2 and Python 3.
        asUTF8Bytes = bytearray(character.encode('utf8'))
        test += ''.join(['\\x%02x' % byte for byte in asUTF8Bytes])
        currBytes += len(asUTF8Bytes)

    return (test, boundaries)
|
||||
|
||||
# Self-test: check convertLine() against two hand-computed cases (a plain
# sequence and an isolated surrogate) before any real test data is converted.
assert convertLine('÷ 0903 × 0308 ÷ AC01 ÷ # abc') == \
    ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [0, 5, 8])
assert convertLine('÷ D800 ÷ # abc') == \
    ('\\xe2\\x80\\x8b', [0, 3])
|
||||
|
||||
# Every line of the test data file that convertLine() turns into a test case
# (lines for which it returns None are dropped), in file order.
graphemeBreakTests = []

with open(unicodeGraphemeBreakTestFile, 'rb') as testData:
    for rawLine in testData:
        converted = convertLine(rawLine)
        if not converted:
            continue
        graphemeBreakTests.append(converted)
|
||||
|
||||
}%
|
||||
|
||||
#include "swift/Basic/Unicode.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include <vector>
|
||||
|
||||
using namespace swift;
|
||||
using namespace swift::unicode;
|
||||
|
||||
/// Splits \p Str into consecutive pieces by repeatedly calling
/// extractFirstExtendedGraphemeCluster() on the not-yet-consumed tail.
///
/// \returns the byte offsets delimiting those pieces: always starts with 0,
/// followed by the end offset of each extracted piece.
///
/// NOTE(review): the loop only advances by the size of the extracted piece,
/// so it presumably relies on that piece being non-empty for non-empty
/// input -- confirm against extractFirstExtendedGraphemeCluster().
static std::vector<unsigned> FindGraphemeClusterBoundaries(StringRef Str) {
  std::vector<unsigned> Boundaries(1, 0u);

  for (unsigned Offset = 0; Offset != Str.size();) {
    StringRef Cluster = extractFirstExtendedGraphemeCluster(Str.substr(Offset));
    Offset += Cluster.size();
    Boundaries.push_back(Offset);
  }

  return Boundaries;
}
|
||||
|
||||
// One expectation is generated per entry of graphemeBreakTests: the subject
// string is emitted as a literal of hex-escaped UTF-8 bytes and the expected
// boundaries as a list of byte offsets.
TEST(ExtractExtendedGraphemeCluster, TestsFromUnicodeSpec) {
% for utf8Literal, byteOffsets in graphemeBreakTests:
  EXPECT_EQ((std::vector<unsigned>{ ${', '.join(map(str, byteOffsets))} }),
            FindGraphemeClusterBoundaries("${utf8Literal}"));
% end
}
|
||||
|
||||
// Hand-written cases for ill-formed UTF-8 input, which the tests generated
// from the Unicode data file do not contain.
TEST(ExtractExtendedGraphemeCluster, ExtraTests) {
  using Offsets = std::vector<unsigned>;

  // A lead byte whose single continuation byte is missing.
  EXPECT_EQ((Offsets{ 0, 1 }), FindGraphemeClusterBoundaries("\xc2"));

  // An isolated surrogate encoded as three bytes, on its own...
  EXPECT_EQ((Offsets{ 0, 1, 2, 3 }),
            FindGraphemeClusterBoundaries("\xed\xa0\x80"));

  // ...and placed between two well-formed four-byte sequences.
  EXPECT_EQ((Offsets{ 0, 4, 5, 6, 11 }),
            FindGraphemeClusterBoundaries(
                "\xf3\xa0\x84\x80" "\xed\xa0\x80" "\xf3\xa0\x84\x80"));
}
|
||||
|
||||
Reference in New Issue
Block a user