libBasic: implement extended grapheme cluster segmentation algorithm

This is only for the frontend, not for stdlib.  The implementation is very
slow; optimizing it is the next step.

rdar://16755123 rdar://16013860


Swift SVN r18928
This commit is contained in:
Dmitri Hrybenko
2014-06-16 14:20:43 +00:00
parent e310fd7f0b
commit 7704e19b7d
11 changed files with 2071 additions and 9 deletions

View File

@@ -0,0 +1,117 @@
%# -*- mode: C++ -*-
%# Ignore the following admonition; it applies to the resulting .cpp file only
//// Automatically Generated From UnicodeExtendedGraphemeClusters.cpp.gyb.
//// Do Not Edit Directly!
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See http://swift.org/LICENSE.txt for license information
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
%{
import re
def convertLine(line):
    """Convert one line of the grapheme break test data into a test case.

    The line is a sequence of whitespace-separated tokens: U+00F7 (DIVISION
    SIGN) marks a boundary, U+00D7 (MULTIPLICATION SIGN) marks "no boundary
    here", and every other token is a code point written in hexadecimal.
    Anything after a '#' is a comment.

    `line` may be either a byte string (as read from a file opened in binary
    mode) or a text string; byte strings are decoded as UTF-8.

    Returns None for blank or comment-only lines.  Otherwise returns a tuple
    (test, boundaries) where `test` is the UTF-8 encoding of the code points
    spelled as C string escapes (backslash, 'x', two hex digits per byte) and
    `boundaries` is the list of byte offsets at which a boundary was marked.
    """
    # Normalize to text so the same code runs under Python 2 and Python 3,
    # regardless of whether the caller passes raw file bytes or a literal.
    if isinstance(line, bytes):
        line = line.decode('utf-8')

    # Strip comments.
    line = re.sub(u'#.*', u'', line).strip()

    if line == u'':
        return None

    test = ""
    currBytes = 0
    boundaries = []

    # Match a list of code points.  Splitting on arbitrary whitespace keeps
    # tabs or doubled spaces from producing empty tokens.
    for token in line.split():
        if token == u'\u00f7':
            boundaries.append(currBytes)
        elif token == u'\u00d7':
            pass
        else:
            codePoint = int(token, 16)
            # Tests from Unicode spec have isolated surrogates in them.  Our
            # segmentation algorithm works on UTF-8 sequences, so encoding a
            # surrogate would produce an invalid code unit sequence.  Instead
            # of trying to emulate the maximal subpart algorithm for inserting
            # U+FFFD in Python, we just replace every isolated surrogate with
            # U+200B, which also has Grapheme_Cluster_Break equal to 'Control'
            # and test separately that we handle ill-formed UTF-8 sequences.
            if 0xd800 <= codePoint <= 0xdfff:
                codePoint = 0x200b
            # Go through a unicode_escape round trip instead of chr()/unichr()
            # so that code points above U+FFFF also work on narrow Python 2
            # builds.
            escapedCodePoint = ('\\U%08x' % codePoint).encode('ascii')
            character = escapedCodePoint.decode('unicode_escape')
            # bytearray yields integers on both Python 2 and Python 3.
            asUTF8Bytes = bytearray(character.encode('utf8'))
            test += ''.join(['\\x%02x' % byte for byte in asUTF8Bytes])
            currBytes += len(asUTF8Bytes)

    return (test, boundaries)
# Self-test: check the converter against two hand-computed cases (a regular
# sequence and an isolated surrogate) before using it on the data file.
assert convertLine('÷ 0903 × 0308 ÷ AC01 ÷ # abc') == \
    ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ])
assert convertLine('÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ])

# Convert every line of the test data file, keeping only the lines that
# produced a test case (convertLine returns None for blank/comment lines).
with open(unicodeGraphemeBreakTestFile, 'rb') as f:
    graphemeBreakTests = [
        testCase
        for testCase in (convertLine(rawLine) for rawLine in f)
        if testCase
    ]
}%
#include "swift/Basic/Unicode.h"
#include "gtest/gtest.h"
#include <vector>
using namespace swift;
using namespace swift::unicode;
/// Splits \p Str by repeatedly calling extractFirstExtendedGraphemeCluster on
/// the unconsumed remainder and records where each extracted piece ends.
///
/// \returns the byte offsets visited, always starting with 0 and, for a
/// non-empty string, ending with Str.size().
static std::vector<unsigned> FindGraphemeClusterBoundaries(StringRef Str) {
  // Offset 0 is reported unconditionally, even for an empty string.
  std::vector<unsigned> Boundaries(1, 0u);

  for (unsigned Offset = 0; Offset != Str.size();) {
    const auto Cluster = extractFirstExtendedGraphemeCluster(Str.substr(Offset));
    Offset += Cluster.size();
    Boundaries.push_back(Offset);
  }

  return Boundaries;
}
// Generated test: the gyb loop below emits one EXPECT_EQ per entry of
// graphemeBreakTests, comparing the expected byte offsets with the offsets
// reported by FindGraphemeClusterBoundaries for the escaped subject string.
TEST(ExtractExtendedGraphemeCluster, TestsFromUnicodeSpec) {
% for subjectString,expectedBoundaries in graphemeBreakTests:
EXPECT_EQ((std::vector<unsigned>{ ${', '.join([ str(x) for x in expectedBoundaries ])} }),
FindGraphemeClusterBoundaries("${subjectString}"));
% end
}
// Hand-written cases that feed ill-formed UTF-8 byte sequences to the
// segmentation helper and pin the expected byte offsets.
TEST(ExtractExtendedGraphemeCluster, ExtraTests) {
  using Offsets = std::vector<unsigned>;

  //
  // Sequences with one continuation byte missing
  //

  // A lone lead byte is expected to be reported as a single one-byte piece.
  const Offsets LoneLeadByte{ 0, 1 };
  EXPECT_EQ(LoneLeadByte, FindGraphemeClusterBoundaries("\xc2"));

  //
  // Isolated surrogates
  //

  // Every byte of the encoded surrogate is expected to end its own piece.
  const Offsets SurrogateAlone{ 0, 1, 2, 3 };
  EXPECT_EQ(SurrogateAlone, FindGraphemeClusterBoundaries("\xed\xa0\x80"));

  // Surrogate between two well-formed four-byte sequences: the last surrogate
  // byte (offset 6) is expected to share a piece with the trailing sequence.
  const Offsets SurrogateInMiddle{ 0, 4, 5, 6, 11 };
  EXPECT_EQ(SurrogateInMiddle,
            FindGraphemeClusterBoundaries(
                "\xf3\xa0\x84\x80" "\xed\xa0\x80" "\xf3\xa0\x84\x80"));
}