swift-mirror/validation-test/stdlib/UnicodeTrie.swift.gyb

//===--- UnicodeTrie.swift.gyb --------------------------------*- swift -*-===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See http://swift.org/LICENSE.txt for license information
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

// RUN: rm -rf %t && mkdir -p %t && %S/../../utils/gyb -DunicodeGraphemeBreakPropertyFile=%S/../../utils/UnicodeData/GraphemeBreakProperty.txt -DunicodeGraphemeBreakTestFile=%S/../../utils/UnicodeData/GraphemeBreakTest.txt %s -o %t/UnicodeTrie.swift
// RUN: %S/../../utils/line-directive %t/UnicodeTrie.swift -- %target-build-swift %t/UnicodeTrie.swift -o %t/a.out -g -Xfrontend -disable-access-control
// RUN: %S/../../utils/line-directive %t/UnicodeTrie.swift -- %target-run %t/a.out
// REQUIRES: executable_test

// FIXME: rdar://problem/19648117 Needs splitting objc parts out
// XFAIL: linux

// Template setup (gyb): load the grapheme cluster break property table from
// the data file passed as -DunicodeGraphemeBreakPropertyFile on the RUN line.
%{

from GYBUnicodeDataUtils import *

grapheme_cluster_break_property_table = \
    GraphemeClusterBreakPropertyTable(unicodeGraphemeBreakPropertyFile)

}%

import SwiftPrivate
import StdlibUnittest
import Darwin
import Foundation

// Reference data for the trie test, expanded by gyb from
// `grapheme_cluster_break_property_table.property_value_ranges`: one
// `(start, end, value)` tuple per range.  The consumer in this file iterates
// `start...end`, i.e. it treats both bounds as inclusive.
var graphemeBreakPropertyTable = [
// 'as Int' annotations are needed to help prevent the type-checker from
// blowing the stack. <rdar://problem/17539704>
% for start_code_point,end_code_point,value in grapheme_cluster_break_property_table.property_value_ranges:
  (${start_code_point} as Int, ${end_code_point} as Int, _GraphemeClusterBreakPropertyValue.${value}),
% end
]

// The test suite that every test in this file is registered with.
var UnicodeTrie = TestSuite("UnicodeTrie")

UnicodeTrie.test("_UnicodeGraphemeClusterBreakPropertyTrie") {
  // Exhaustively compare the trie's answer against the reference table for
  // every code point from U+0000 through U+10FFFF.

  var trie = _UnicodeGraphemeClusterBreakPropertyTrie()

  // Reference values: every code point starts out as `Other`, then each
  // listed range overwrites the code points it covers.
  var reference = [_GraphemeClusterBreakPropertyValue](
    count: 0x110000,
    repeatedValue: _GraphemeClusterBreakPropertyValue.Other)
  for entry in graphemeBreakPropertyTable {
    for codePoint in entry.0...entry.1 {
      reference[codePoint] = entry.2
    }
  }

  let lastCodePoint: UInt32 = 0x10ffff
  for codePoint in UInt32(0)...lastCodePoint {
    // Emit a progress marker once every 0x10000 code points.
    if codePoint % 0x10000 == 0 {
      print("\(codePoint)...")
    }
    let actual = trie.getPropertyValue(codePoint)
    expectEqual(
      reference[Int(codePoint)], actual, "code point \(codePoint)")
  }
}

// Template setup (gyb): load the segmentation test cases from the data file
// passed as -DunicodeGraphemeBreakTestFile on the RUN line.
%{

grapheme_cluster_break_tests = \
  get_grapheme_cluster_break_tests_as_unicode_scalars(
    unicodeGraphemeBreakTestFile)

}%

// The simplest possible subclass of NSString that CoreFoundation does not
// know about.  Its contents are kept as UTF-16 code units in a Swift array.
class NonContiguousNSString : NSString {
  // Backing storage: the UTF-16 code units of the string.
  var _value: [UInt16]

  // Creates an empty string.
  override init() {
    _value = []
    super.init()
  }

  // Decoding is not supported; this initializer always traps.
  required init(coder aDecoder: NSCoder) {
    fatalError("don't call this initializer")
  }

  // Wraps the given UTF-16 code units as-is; nothing is validated here.
  init(_ value: [UInt16]) {
    _value = value
    super.init()
  }

  // Transcodes the given UTF-32 values to UTF-16 and wraps the result.
  // A transcoding error is reported as a test expectation failure.
  convenience init(_ scalars: [UInt32]) {
    var utf16Units: [UInt16] = []
    let appendUnit: (UInt16) -> () = { utf16Units.append($0) }
    let sawError = transcode(
      UTF32.self, UTF16.self, scalars.generate(), appendUnit,
      stopOnError: true)
    expectFalse(sawError)
    self.init(utf16Units)
  }

  // Returning `self` guarantees that a copy of this string is still an
  // instance of a class that CoreFoundation does not know about.
  @objc override func copyWithZone(zone: NSZone) -> AnyObject {
    return self
  }

  // Number of UTF-16 code units in the string.
  @objc override var length: Int {
    return _value.count
  }

  // The UTF-16 code unit stored at `index`.
  @objc override func characterAtIndex(index: Int) -> unichar {
    return _value[index]
  }
}

/// Verify that extended grapheme cluster boundaries in `subject` occur at
/// positions specified in `expectedBoundaries`.
///
/// Boundaries are measured in Unicode scalars from the start of the string;
/// the computed list always begins with 0 and gains one entry per
/// `Character` in `subject`.
///
/// - parameter expectedBoundaries: the expected boundary offsets, including
///   the leading 0.
/// - parameter subject: the string to segment.
/// - parameter stackTrace: source locations of the callers; forwarded to the
///   boundary expectation so that a failure is attributed to the test case
///   that triggered it rather than only to this helper.
func checkGraphemeClusterSegmentation(
    expectedBoundaries: [Int], _ subject: String, _ stackTrace: SourceLocStack
) {
  var actualBoundaries: [Int] = [ 0 ]
  var unicodeScalarCount = 0
  for c in subject.characters {
    // Each `Character` is one grapheme cluster; advance the running offset
    // by the number of scalars it contains.
    unicodeScalarCount += String(c).unicodeScalars.count
    actualBoundaries.append(unicodeScalarCount)
  }
  expectEqual(
    expectedBoundaries, actualBoundaries,
    "scalars: \(asHex(subject.unicodeScalars.lazy.map { $0.value }.array))",
    stackTrace: stackTrace.withCurrentLoc()
  )

  // Also run the generic collection checks over the character view.
  let expectedCharacters: [Character] = Array(subject.characters)
  checkSliceableWithBidirectionalIndex(expectedCharacters, subject.characters)
}

/// Variant that takes the subject as UTF-32 values.  The values are wrapped
/// in a `NonContiguousNSString`, bridged to `String`, and handed to the
/// `String`-based check with the current location added to `stackTrace`.
func checkGraphemeClusterSegmentation(
    expectedBoundaries: [Int], scalars: [UInt32], _ stackTrace: SourceLocStack
) {
  let nsSubject = NonContiguousNSString(scalars)
  checkGraphemeClusterSegmentation(
    expectedBoundaries, nsSubject as String, stackTrace.withCurrentLoc())
}

/// Variant that takes the subject as raw UTF-16 code units.  The code units
/// are wrapped in a `NonContiguousNSString`, bridged to `String`, and handed
/// to the `String`-based check with the current location added to
/// `stackTrace`.
func checkGraphemeClusterSegmentation(
    expectedBoundaries: [Int], codeUnits: [UInt16], _ stackTrace: SourceLocStack
) {
  let nsSubject = NonContiguousNSString(codeUnits)
  checkGraphemeClusterSegmentation(
    expectedBoundaries, nsSubject as String, stackTrace.withCurrentLoc())
}

UnicodeTrie.test("GraphemeClusterSegmentation/UnicodeSpec") {
  // Test segmentation algorithm using test data from the Unicode
  // specification.
  //
  // The gyb loop below expands to one `if true { ... }` block per entry of
  // `grapheme_cluster_break_tests`.  Wrapping each case in its own block lets
  // every case declare `scalars` and `expectedBoundaries` without clashing
  // with the other cases.

% for code_points,expected_boundaries in grapheme_cluster_break_tests:
  if true {
    let scalars: [UInt32] =
        [ ${", ".join([ str(cp) for cp in code_points ])} ]
    let expectedBoundaries: [Int] =
        [ ${", ".join([ str(x) for x in expected_boundaries ])} ]
    checkGraphemeClusterSegmentation(expectedBoundaries, scalars: scalars,
        SourceLocStack().withCurrentLoc())
  }

% end
}

UnicodeTrie.test("GraphemeClusterSegmentation/Extra") {
  // Additional cases for input Strings that contain ill-formed code unit
  // sequences (here: unpaired high surrogates).

  // U+D800 (high-surrogate)
  let highSurrogate: UInt16 = 0xd800
  // U+0041 LATIN CAPITAL LETTER A
  let latinCapitalA: UInt16 = 0x0041
  // U+0301 COMBINING ACUTE ACCENT
  let combiningAcute: UInt16 = 0x0301

  // A single unpaired high surrogate.
  checkGraphemeClusterSegmentation(
    [ 0, 1 ],
    codeUnits: [ highSurrogate ],
    SourceLocStack().withCurrentLoc())

  // Two unpaired high surrogates in a row.
  checkGraphemeClusterSegmentation(
    [ 0, 1, 2 ],
    codeUnits: [ highSurrogate, highSurrogate ],
    SourceLocStack().withCurrentLoc())

  // "A" followed by an unpaired high surrogate.
  checkGraphemeClusterSegmentation(
    [ 0, 1, 2 ],
    codeUnits: [ latinCapitalA, highSurrogate ],
    SourceLocStack().withCurrentLoc())

  // An unpaired high surrogate followed by "A".
  checkGraphemeClusterSegmentation(
    [ 0, 1, 2 ],
    codeUnits: [ highSurrogate, latinCapitalA ],
    SourceLocStack().withCurrentLoc())

  // "A" + combining acute accent, then an unpaired high surrogate.
  checkGraphemeClusterSegmentation(
    [ 0, 2, 3 ],
    codeUnits: [ latinCapitalA, combiningAcute, highSurrogate ],
    SourceLocStack().withCurrentLoc())

  // An unpaired high surrogate, then "A" + combining acute accent.
  checkGraphemeClusterSegmentation(
    [ 0, 1, 3 ],
    codeUnits: [ highSurrogate, latinCapitalA, combiningAcute ],
    SourceLocStack().withCurrentLoc())
}

UnicodeTrie.test("GraphemeClusterSegmentation/Unicode_7_0_0") {
  // Guard against data tables older than Unicode 7.0.0: under Unicode 6.3.0
  // the sequence below was segmented into two grapheme clusters, whereas
  // here it is expected to form a single one.

  let scalars: [UInt32] = [
    0x0041,   // U+0041 LATIN CAPITAL LETTER A
    0x1122c,  // U+1122C KHOJKI VOWEL SIGN AA
  ]
  let expectedBoundaries: [Int] = [ 0, 2 ]
  checkGraphemeClusterSegmentation(
    expectedBoundaries, scalars: scalars,
    SourceLocStack().withCurrentLoc())
}

UnicodeTrie.test("GraphemeClusterSegmentation/Unicode_8_0_0") {
  // Guard against data tables older than Unicode 8.0.0: under Unicode 7.0.0
  // the sequence below was segmented into two grapheme clusters, whereas
  // here it is expected to form a single one.

  let scalars: [UInt32] = [
    0x0041,   // U+0041 LATIN CAPITAL LETTER A
    0x11720,  // U+11720 AHOM VOWEL SIGN A
  ]
  let expectedBoundaries: [Int] = [ 0, 2 ]
  checkGraphemeClusterSegmentation(
    expectedBoundaries, scalars: scalars,
    SourceLocStack().withCurrentLoc())
}


// Invoke the test runner now that the suite's tests have been registered.
runAllTests()