swift-mirror/unittests/Basic/UnicodeGraphemeBreakTest.cpp.gyb

%# -*- mode: C++ -*-

%# Ignore the following admonition; it applies to the resulting .cpp file only
//// Automatically Generated From UnicodeGraphemeBreakTest.cpp.gyb.
//// Do Not Edit Directly!
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See http://swift.org/LICENSE.txt for license information
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

%{

import re

def convertLine(line):
    """Convert one line of Unicode grapheme break test data into a test case.

    A data line is a whitespace-separated sequence of hexadecimal code points
    interleaved with U+00F7 DIVISION SIGN (a boundary is expected here) and
    U+00D7 MULTIPLICATION SIGN (no boundary here); anything after '#' is a
    comment.

    Args:
        line: the raw line, either text or UTF-8 encoded bytes.

    Returns:
        None if the line contains nothing but whitespace and/or a comment.
        Otherwise a tuple (test, boundaries) where `test` is the UTF-8
        encoding of the code points spelled as C '\\xNN' escapes and
        `boundaries` is the list of byte offsets at which a boundary is
        expected.

    Raises:
        ValueError: if a token is neither a boundary marker nor a hexadecimal
            number.
    """
    # Work on text so that the boundary markers compare equal regardless of
    # whether the caller read the file in binary or text mode, and regardless
    # of the Python major version.
    if isinstance(line, bytes):
        line = line.decode('utf-8')

    # Strip comments.
    line = re.sub('#.*', '', line).strip()

    if line == "":
        return None

    test = ""
    currBytes = 0
    boundaries = []

    # The markers are written as escapes so that this code does not depend on
    # the encoding the template itself is read with.
    boundaryMarker = u'\u00f7'
    noBoundaryMarker = u'\u00d7'

    # Match a list of code points.  split() without an argument tolerates tabs
    # and runs of spaces between tokens.
    for token in line.split():
        if token == boundaryMarker:
            boundaries.append(currBytes)
        elif token == noBoundaryMarker:
            pass
        else:
            codePoint = int(token, 16)
            # Tests from Unicode spec have isolated surrogates in them.  Our
            # segmentation algorithm works on UTF-8 sequences, so encoding a
            # surrogate would produce an invalid code unit sequence.  Instead
            # of trying to emulate the maximal subpart algorithm for inserting
            # U+FFFD in Python, we just replace every isolated surrogate with
            # U+200B, which also has Grapheme_Cluster_Break equal to 'Control'
            # and test separately that we handle ill-formed UTF-8 sequences.
            if 0xd800 <= codePoint <= 0xdfff:
                codePoint = 0x200b
            # Go through the 'unicode_escape' codec instead of unichr()/chr()
            # so that code points above U+FFFF also work on narrow builds.
            character = ('\\U%08x' % codePoint).encode('ascii').decode(
                'unicode_escape')
            # bytearray yields integers on both Python 2 and Python 3.
            asUTF8Bytes = bytearray(character.encode('utf8'))
            test += ''.join('\\x%02x' % byte for byte in asUTF8Bytes)
            currBytes += len(asUTF8Bytes)

    return (test, boundaries)

# Self-test: check the converter against two hand-computed samples (a
# multi-cluster line and an isolated surrogate) before trusting its output.
assert convertLine('÷ 0903 × 0308 ÷ AC01 ÷ # abc') == (
    '\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ])
assert convertLine('÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ])

# Every (subject string, expected boundaries) pair read from the data file.
graphemeBreakTests = []

with open(unicodeGraphemeBreakTestFile, 'rb') as testData:
    for rawLine in testData:
        converted = convertLine(rawLine)
        # Lines for which the converter yields nothing carry no test case.
        if not converted:
            continue
        graphemeBreakTests.append(converted)

}%

#include "swift/Basic/Unicode.h"
#include "gtest/gtest.h"
#include <vector>

using namespace swift;
using namespace swift::unicode;

/// Splits \p Str by repeatedly calling extractFirstExtendedGraphemeCluster()
/// on the unconsumed remainder and records where each piece ends.
///
/// \returns the byte offsets collected this way; the list always starts with
/// 0 and, for a non-empty \p Str, ends with Str.size().
static std::vector<unsigned> FindGraphemeClusterBoundaries(StringRef Str) {
  std::vector<unsigned> Boundaries = {0};

  // Offset is advanced inside the body by the size of the piece just
  // extracted, hence the empty increment clause.
  for (unsigned Offset = 0; Offset != Str.size();) {
    Offset += extractFirstExtendedGraphemeCluster(Str.substr(Offset)).size();
    Boundaries.push_back(Offset);
  }

  return Boundaries;
}

// One expectation is emitted per entry of graphemeBreakTests: the boundaries
// found in the subject string must equal the offsets listed for it.
TEST(ExtractExtendedGraphemeCluster, TestsFromUnicodeSpec) {
% for utf8Literal, boundaryOffsets in graphemeBreakTests:
  EXPECT_EQ((std::vector<unsigned>{ ${', '.join(map(str, boundaryOffsets))} }),
      FindGraphemeClusterBoundaries("${utf8Literal}"));
% end
}

// Hand-written cases for ill-formed UTF-8 input.
TEST(ExtractExtendedGraphemeCluster, ExtraTests) {
  //
  // Sequences with one continuation byte missing
  //

  // A lone lead byte "\xc2".
  const std::vector<unsigned> TruncatedSequence = { 0, 1 };
  EXPECT_EQ(TruncatedSequence, FindGraphemeClusterBoundaries("\xc2"));

  //
  // Isolated surrogates
  //

  // The three bytes of an encoded surrogate, on their own.
  const std::vector<unsigned> LoneSurrogate = { 0, 1, 2, 3 };
  EXPECT_EQ(LoneSurrogate, FindGraphemeClusterBoundaries("\xed\xa0\x80"));

  // The same three bytes with a four-byte sequence on either side.
  const std::vector<unsigned> SurroundedSurrogate = { 0, 4, 5, 6, 11 };
  EXPECT_EQ(SurroundedSurrogate,
            FindGraphemeClusterBoundaries(
                "\xf3\xa0\x84\x80" "\xed\xa0\x80" "\xf3\xa0\x84\x80"));
}