swift-mirror/utils/GYBUnicodeDataUtils.py

##===--------------------------------------------------*- coding: utf-8 -*-===##
##
## This source file is part of the Swift.org open source project
##
## Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
## Licensed under Apache License v2.0 with Runtime Library Exception
##
## See http://swift.org/LICENSE.txt for license information
## See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
##
##===----------------------------------------------------------------------===##

import re

class UnicodeProperty(object):
    """Abstract base class for Unicode properties.

    A property maps every code point to a symbolic value and every symbolic
    value to a number.  Subclasses are expected to override all of the
    methods below, including the constructor; the implementations here only
    raise NotImplementedError.
    """

    def __init__(self):
        # 'NotImplemented' is a comparison sentinel, not an exception;
        # raising it produces a confusing TypeError instead of the intended
        # NotImplementedError.
        raise NotImplementedError(
            'UnicodeProperty is abstract; instantiate a subclass')

    def get_default_value(self):
        """Return the symbolic value used for code points that have no
        explicit entry."""
        raise NotImplementedError('get_default_value')

    def get_value(self, cp):
        """Return the symbolic property value of the code point 'cp'."""
        raise NotImplementedError('get_value')

    def to_numeric_value(self, value):
        """Return the number that represents the symbolic value 'value'."""
        raise NotImplementedError('to_numeric_value')

    def get_numeric_value(self, cp):
        """Return the numeric property value of the code point 'cp'."""
        raise NotImplementedError('get_numeric_value')

class GraphemeClusterBreakPropertyTable(UnicodeProperty):
    """Grapheme_Cluster_Break property.

    Instances are built from a property data file and expose:

    * 'property_value_ranges' -- a list of tuples
      (start_code_point, end_code_point, value), in file order;
    * 'property_values' -- a flat list indexed by code point
      (0 .. 0x10ffff) that holds the symbolic value of every code point;
    * 'symbolic_values' -- a list that maps numeric property values back to
      symbolic values.

    All three are per-instance, so creating several tables never mixes their
    data.
    """

    # Note: Numeric values should be consistent with
    # '_GraphemeClusterBreakPropertyValue' enum on the Swift side, and with
    # 'GraphemeClusterBreakProperty' in the compiler C++ code.  If there is a
    # reason for either of those to differ, then this mapping can be
    # overridden after an instance of this class is created.
    numeric_value_table = {
        'Other': 0,
        'CR': 1,
        'LF': 2,
        'Control': 3,
        'Extend': 4,
        'Regional_Indicator': 5,
        'Prepend': 6,
        'SpacingMark': 7,
        'L': 8,
        'V': 9,
        'T': 10,
        'LV': 11,
        'LVT': 12,
    }

    def __init__(self, grapheme_break_property_file_name):
        """Load the property table from a data file.

        'grapheme_break_property_file_name' is the path of a file whose
        entries look like 'XXXX ; Value' or 'XXXX..YYYY ; Value', with
        hexadecimal code points and optional '#' comments.  Lines that do
        not match either form are ignored.
        """
        # Build 'self.symbolic_values' -- an array that maps numeric property
        # values to symbolic values.
        self.symbolic_values = \
            [None] * (max(self.numeric_value_table.values()) + 1)
        for symbolic, numeric in self.numeric_value_table.items():
            self.symbolic_values[numeric] = symbolic

        # Created per instance on purpose: a class-level list would be
        # shared and would accumulate the ranges of every table ever loaded.
        self.property_value_ranges = []

        # One pattern covers both a single code point and a range; the end
        # of the range is optional.  The value may contain underscores
        # (e.g. 'Regional_Indicator') in either form.
        entry_pattern = re.compile(
            r'([0-9A-F]+)(?:\.\.([0-9A-F]+))? +; +([a-zA-Z_]+)(?:\s|$)')

        # Load the data file.  Reading bytes and decoding each line gives
        # text on both Python 2 and Python 3.
        with open(grapheme_break_property_file_name, 'rb') as f:
            for raw_line in f:
                # Strip comments.
                line = re.sub('#.*', '', raw_line.decode('utf-8'))

                m = entry_pattern.match(line)
                if m is None:
                    continue

                start_code_point = int(m.group(1), 16)
                if m.group(2) is None:
                    end_code_point = start_code_point
                else:
                    end_code_point = int(m.group(2), 16)
                # str() keeps the value a native string on Python 2, where
                # the decoded line is a 'unicode' object.
                value = str(m.group(3))
                self.property_value_ranges.append(
                    (start_code_point, end_code_point, value))

        # Prepare a flat lookup table for fast access.
        self.property_values = [self.get_default_value()] * 0x110000

        for start_code_point, end_code_point, value in \
                self.property_value_ranges:
            for cp in range(start_code_point, end_code_point + 1):
                self.property_values[cp] = value

    def get_default_value(self):
        """Return the value of code points that the data file does not
        list."""
        return 'Other'

    def get_value(self, cp):
        """Return the symbolic property value of the code point 'cp'."""
        return self.property_values[cp]

    def to_numeric_value(self, value):
        """Return the number assigned to 'value' in 'numeric_value_table'.

        Raises KeyError when 'value' is not in the table.
        """
        return self.numeric_value_table[value]

    def get_numeric_value(self, cp):
        """Return the numeric property value of the code point 'cp'."""
        return self.to_numeric_value(self.get_value(cp))


# BMP code points are 16-bit values.  The code point value is split as
# follows:
#
#   8 bits                     8 bits
# +-------------------------+-------------------------+
# | 15 14 13 12 11 10  9  8 |  7  6  5  4  3  2  1  0 |
# +-------------------------+-------------------------+
#   first-level index          data offset
#
# Supplementary code points (U+XXXX where XXXX > 0xffff) are 21-bit values.
# The code point value is split as follows:
#
#   5 bits           8 bits                     8 bits
# +----------------+-------------------------+-------------------------+
# | 20 19 18 17 16 | 15 14 13 12 11 10  9  8 |  7  6  5  4  3  2  1  0 |
# +----------------+-------------------------+-------------------------+
#  first-level       second-level index         data offset
#  index
#
# The actual number of bits are just trie parameters.  They affect the size of
# the lookup tables (and thus, lookup time), but do not change the overall
# structure of the trie.
#
# Here and below 'supp' stands for 'supplementary characters'.
#
# Property data for BMP code points is stored as a one-stage trie.
# A trie with one lookup table consists of two memory blocks:
#
#         First-level lookup table
#  +-----+-----+-----+-----+--...--+
#  |  *  |  *  |  *  |  *  |       |
#  +--|--+--|--+--|--+--|--+--...--+
#     |     |     |      \          The references don't form
#     |      \____|       \___,        a systematic pattern
#     |           |           |
#     |           |           |     Data storage
#   +-V--------++-V--------++-V--------++---...---+
#   | data     || data     || data     ||         |
#   +----------++----------++----------++---...---+
#
# In order to fetch data for a given code point, you need to:
# * load from the first-level lookup table using first-level index; this will
#   give you the number of the data block that you should use.
# * load from the data block applying the data offset.
#
# Property data for supplementary code points is stored as a two-stage trie.
# A trie with two-stage lookup tables consists of three memory blocks.  The
# following drawing explains how it is implemented:
#
#         First-level lookup table
#       +-----+-----+-----+-----+-----+--...--+
#       |  *  |  *  |  *  |  *  |  *  |       |
#       +--|--+--|--+--|--+--|--+--|--+--...--+
#          |     |     |     |      \          The references don't form
#      ,__/      |      \____|       \___,        a systematic pattern
#     /          |           |           |
#    |           |           |           | Second-level lookup table
#  +-V--------++-V--------++-V--------++-V--------++---...---+
#  | ******** || ******** || ******** ||          ||         |
#  +-||||||||-++-||||||||-++-||||||||-++----------++---...---+
#    \\\|////    ||||||VV    |VVV|V|V
#     \\|///     ||||||     /    | |
#      \|//      ||||||    /     | |
#       |/       ||||| \__|___.   \ \       The references don't form
#       |        |||| \___|__. \   | \         a systematic pattern
#       |        ||| \____|   \ \__|  \
#       |        || \_____|__. \___|___\       ...___.
#       |        | \______|   \____|    \___,        |  Data storage
#     +-V-----++-V-----++-V-----++-V-----++-V-----++-V-----++---...---+
#     | data  || data  || data  || data  ||       ||       ||         |
#     +-------++-------++-------++-------++-------++-------++---...---+
#
# In order to fetch data for a given code point, you need to:
# * load from the first-level lookup table using first-level index; this will
#   give you the number of the second-level lookup table that you should use.
# * load from the chosen second-level lookup table using the second-level
#   index, which will give you the number of the data block that you should
#   use.
# * load from the data block applying the data offset.
#
# First- and second-level lookup tables in the general case contain 16-bit
# words; that will be sufficient to store a trie that does not compress at all.
# But in many cases, after trie compression there will be fewer than 256
# unique second-level lookup tables and/or data storage blocks, which allows
# one to use 8-bit words in lookup tables.
#
# The bitwidth of data depends on the application of the trie.
#
# The supp tables contain entries for BMP code points to simplify trie
# implementation, but those BMP entries are filled with the default value, so
# they compress well.
class UnicodeTrieGenerator(object):
    """Builds, compresses and serializes a trie that stores one value per
    Unicode code point.

    BMP code points (<= 0xffff) use a one-stage trie (BMP_lookup ->
    BMP_data); all other code points use a two-stage trie (supp_lookup1 ->
    supp_lookup2 -> supp_data).

    'create_tables()' must be called before any value is stored or read,
    because it derives the bit widths and allocates the tables.  'freeze()'
    removes duplicate blocks, and 'serialize()' flattens everything into
    'self.trie_bytes'.
    """

    # Note: if you change any of these parameters, don't forget to update the
    # ASCII art above.
    BMP_first_level_index_bits = 8

    supp_first_level_index_bits = 5
    supp_second_level_index_bits = 8

    def get_BMP_first_level_index(self, cp):
        """Return the index into 'BMP_lookup' for the BMP code point 'cp'."""
        return cp >> self.BMP_data_offset_bits

    def get_BMP_data_offset(self, cp):
        """Return the offset of 'cp' inside its BMP data block."""
        return cp & ((1 << self.BMP_data_offset_bits) - 1)

    def get_supp_first_level_index(self, cp):
        """Return the index into 'supp_lookup1' for the code point 'cp'."""
        return cp >> (self.supp_second_level_index_bits +
                      self.supp_data_offset_bits)

    def get_supp_second_level_index(self, cp):
        """Return the index of 'cp' inside its second-level lookup table."""
        return (cp >> self.supp_data_offset_bits) & \
            ((1 << self.supp_second_level_index_bits) - 1)

    def get_supp_data_offset(self, cp):
        """Return the offset of 'cp' inside its supp data block."""
        return cp & ((1 << self.supp_data_offset_bits) - 1)

    def __init__(self):
        """Create a trie generator with default parameters."""
        pass

    def create_tables(self):
        """Compute derived parameter values and create internal data
        structures.

        Every lookup table starts out as an identity mapping (entry i refers
        to block i) and every data entry starts out as -1.

        Don't change parameter values after calling this method.
        """

        self.BMP_data_offset_bits = 16 - self.BMP_first_level_index_bits

        self.supp_data_offset_bits = \
            21 - self.supp_first_level_index_bits - \
            self.supp_second_level_index_bits

        # The maximum value of the first level index for supp tables.  It is
        # not equal to ((1 << supp_first_level_index_bits) - 1), because
        # maximum Unicode code point value is not 2^21-1 (0x1fffff), it is
        # 0x10ffff.
        self.supp_first_level_index_max = \
            0x10ffff >> (self.supp_second_level_index_bits +
                         self.supp_data_offset_bits)

        bmp_block_count = 1 << self.BMP_first_level_index_bits
        bmp_block_size = 1 << self.BMP_data_offset_bits

        # A mapping from BMP first-level index to BMP data block index.
        self.BMP_lookup = list(range(bmp_block_count))

        # An array of BMP data blocks.
        self.BMP_data = [
            [-1] * bmp_block_size for _ in range(bmp_block_count)]

        supp_table_count = self.supp_first_level_index_max + 1
        supp_table_size = 1 << self.supp_second_level_index_bits
        supp_block_size = 1 << self.supp_data_offset_bits

        # A mapping from supp first-level index to an index of the second-level
        # lookup table.
        self.supp_lookup1 = list(range(supp_table_count))

        # An array of second-level lookup tables.  Each second-level lookup
        # table is a mapping from a supp second-level index to supp data block
        # index.
        self.supp_lookup2 = [
            list(range(i * supp_table_size, (i + 1) * supp_table_size))
            for i in range(supp_table_count)]

        # An array of supp data blocks.
        self.supp_data = [
            [-1] * supp_block_size
            for _ in range(supp_table_count * supp_table_size)]

    def splat(self, value):
        """Store 'value' in every entry of every data block."""
        for block in self.BMP_data:
            block[:] = [value] * len(block)

        for block in self.supp_data:
            block[:] = [value] * len(block)

    def _find_data_entry(self, cp):
        """Return (data block, offset in that block) for the code point
        'cp', following the lookup tables."""
        if cp <= 0xffff:
            block_index = self.BMP_lookup[self.get_BMP_first_level_index(cp)]
            return self.BMP_data[block_index], self.get_BMP_data_offset(cp)

        second_lookup_index = \
            self.supp_lookup1[self.get_supp_first_level_index(cp)]
        block_index = self.supp_lookup2[second_lookup_index][
            self.get_supp_second_level_index(cp)]
        return self.supp_data[block_index], self.get_supp_data_offset(cp)

    def set_value(self, cp, value):
        """Store 'value' for the code point 'cp'."""
        block, offset = self._find_data_entry(cp)
        block[offset] = value

    def get_value(self, cp):
        """Return the value stored for the code point 'cp'."""
        block, offset = self._find_data_entry(cp)
        return block[offset]

    def fill_from_unicode_property(self, unicode_property):
        """Copy the value of every code point (0 .. 0x10ffff) from
        'unicode_property' into the trie."""
        self.splat(unicode_property.get_default_value())
        for cp in range(0, 0x110000):
            self.set_value(cp, unicode_property.get_value(cp))

    def verify(self, unicode_property):
        """Check that the trie agrees with 'unicode_property' for every code
        point.

        Raises AssertionError on the first mismatch.  The error is raised
        explicitly so that the check still runs under 'python -O'.
        """
        for cp in range(0, 0x110000):
            expected_value = unicode_property.get_value(cp)
            actual_value = self.get_value(cp)
            if expected_value != actual_value:
                raise AssertionError(
                    'trie mismatch at U+%04X: expected %r, got %r' %
                    (cp, expected_value, actual_value))

    @staticmethod
    def _deduplicate(blocks):
        """Drop repeated blocks, keeping the first occurrence of each.

        Returns (unique_blocks, new_indexes), where new_indexes[i] is the
        position in 'unique_blocks' of the block equal to blocks[i].
        """
        unique_blocks = []
        new_indexes = []
        for block in blocks:
            try:
                new_index = unique_blocks.index(block)
            except ValueError:
                new_index = len(unique_blocks)
                unique_blocks.append(block)
            new_indexes.append(new_index)
        return unique_blocks, new_indexes

    def freeze(self):
        """Compress internal trie representation.

        Identical blocks are merged (the first one is kept) and the lookup
        tables are rewritten, as real lists, to refer to the surviving
        blocks.

        Don't mutate the trie after calling this method.

        """

        # If self.BMP_data contains identical data blocks, keep the first one,
        # remove duplicates and change the indexes in self.BMP_lookup to point
        # to the first one.
        self.BMP_data, new_indexes = self._deduplicate(self.BMP_data)
        self.BMP_lookup = [new_indexes[idx] for idx in self.BMP_lookup]

        # For supp tables, perform bottom-up deduplication: first, deduplicate
        # data blocks.  The algorithm is the same as above, but operates on
        # self.supp_data/supp_lookup2.
        self.supp_data, new_indexes = self._deduplicate(self.supp_data)
        self.supp_lookup2 = [
            [new_indexes[idx] for idx in table]
            for table in self.supp_lookup2]

        # Next, deduplicate second-level lookup tables.
        # Same as above, but for supp_lookup1/supp_lookup2.
        self.supp_lookup2, new_indexes = self._deduplicate(self.supp_lookup2)
        self.supp_lookup1 = [new_indexes[idx] for idx in self.supp_lookup1]

    def _int_to_LE_bytes(self, data, width):
        """Return 'data' as a list of 'width' byte values, least significant
        byte first.

        'width' must be 1 or 2 and 'data' must fit into that many bytes;
        otherwise ValueError is raised.
        """
        if width == 1:
            if data & ~0xff != 0:
                raise ValueError('%r does not fit into 1 byte' % (data,))
            return [data]
        if width == 2:
            if data & ~0xffff != 0:
                raise ValueError('%r does not fit into 2 bytes' % (data,))
            # The high byte has to be shifted down; masking alone would
            # yield a value in the range 0x100..0xff00, not a byte.
            return [data & 0xff, (data >> 8) & 0xff]
        raise ValueError('unsupported width: %r' % (width,))

    def _int_list_to_LE_bytes(self, ints, width):
        """Return the concatenated little-endian bytes of every integer in
        'ints', each encoded with 'width' bytes."""
        return [
            byte
            for elt in ints
            for byte in self._int_to_LE_bytes(elt, width)]

    def serialize(self, unicode_property):
        """Flatten the trie into 'self.trie_bytes'.

        Lookup table entries take 1 byte when the table they refer to has
        fewer than 256 elements and 2 bytes otherwise; data entries always
        take 1 byte and hold 'unicode_property.to_numeric_value(value)'.

        The parts are stored in this order: BMP lookup, BMP data, supp
        first-level lookup, supp second-level lookup, supp data.  The
        offset of each part is recorded in the matching
        'self.*_bytes_offset' attribute and the entry widths in the
        'self.*_bytes_per_entry' attributes.
        """
        self.BMP_lookup_bytes_per_entry = \
            1 if len(self.BMP_data) < 256 else 2
        self.BMP_data_bytes_per_entry = 1

        self.supp_lookup1_bytes_per_entry = \
            1 if len(self.supp_lookup2) < 256 else 2
        self.supp_lookup2_bytes_per_entry = \
            1 if len(self.supp_data) < 256 else 2
        self.supp_data_bytes_per_entry = 1

        BMP_lookup_words = list(self.BMP_lookup)
        BMP_data_words = [
            unicode_property.to_numeric_value(elt)
            for block in self.BMP_data
            for elt in block]

        supp_lookup1_words = list(self.supp_lookup1)
        supp_lookup2_words = [
            elt for block in self.supp_lookup2 for elt in block]
        supp_data_words = [
            unicode_property.to_numeric_value(elt)
            for block in self.supp_data
            for elt in block]

        BMP_lookup_bytes = self._int_list_to_LE_bytes(
            BMP_lookup_words, self.BMP_lookup_bytes_per_entry)
        BMP_data_bytes = self._int_list_to_LE_bytes(
            BMP_data_words, self.BMP_data_bytes_per_entry)

        supp_lookup1_bytes = self._int_list_to_LE_bytes(
            supp_lookup1_words, self.supp_lookup1_bytes_per_entry)
        supp_lookup2_bytes = self._int_list_to_LE_bytes(
            supp_lookup2_words, self.supp_lookup2_bytes_per_entry)
        supp_data_bytes = self._int_list_to_LE_bytes(
            supp_data_words, self.supp_data_bytes_per_entry)

        self.trie_bytes = []

        self.BMP_lookup_bytes_offset = 0
        self.trie_bytes += BMP_lookup_bytes

        self.BMP_data_bytes_offset = len(self.trie_bytes)
        self.trie_bytes += BMP_data_bytes

        self.supp_lookup1_bytes_offset = len(self.trie_bytes)
        self.trie_bytes += supp_lookup1_bytes

        self.supp_lookup2_bytes_offset = len(self.trie_bytes)
        self.trie_bytes += supp_lookup2_bytes

        self.supp_data_bytes_offset = len(self.trie_bytes)
        self.trie_bytes += supp_data_bytes

def get_extended_grapheme_cluster_rules_matrix(grapheme_cluster_break_property_table):
    """Return the extended grapheme cluster boundary rules as packed rows.

    The result has one integer per entry of
    'grapheme_cluster_break_property_table.symbolic_values', in that order.
    Bit i of the row for value A is set when there is no boundary between a
    code point with value A and a following code point whose value is
    symbolic_values[i].
    """
    all_values = grapheme_cluster_break_property_table.symbolic_values

    # Rules to determine extended grapheme cluster boundaries, as defined in
    # 'Grapheme Break Chart',
    # http://www.unicode.org/Public/6.3.0/ucd/auxiliary/GraphemeBreakTest.html,
    # Unicode 6.3.0.
    #
    # The Unicode 7.0.0 draft does not change these rules.
    #
    # As in the referenced document, the rules are specified in order of
    # decreasing priority.
    rules = [
        (['CR'], 'no_boundary', ['LF']),
        (['Control', 'CR', 'LF'], 'boundary', all_values),
        (all_values, 'boundary', ['Control', 'CR', 'LF']),
        (['L'], 'no_boundary', ['L', 'V', 'LV', 'LVT']),
        (['LV', 'V'], 'no_boundary', ['V', 'T']),
        (['LVT', 'T'], 'no_boundary', ['T']),
        (['Regional_Indicator'], 'no_boundary', ['Regional_Indicator']),
        (all_values, 'no_boundary', ['Extend']),
        (all_values, 'no_boundary', ['SpacingMark']),
        (['Prepend'], 'no_boundary', all_values),
        (all_values, 'boundary', all_values),
    ]

    # One row per first value; each row maps a second value to the action
    # that applies to the pair.
    rules_matrix = {}
    for first in all_values:
        rules_matrix[first] = dict.fromkeys(all_values)

    # Apply the lowest-priority rule first, so that higher-priority rules
    # overwrite the cells they share with it.
    for first_values, action, second_values in reversed(rules):
        for first in first_values:
            rules_matrix[first].update(
                (second, action) for second in second_values)

    # Make sure we can pack one row of the matrix into a 'uint16_t'.
    assert(len(all_values) <= 16)

    packed_rows = []
    for first in all_values:
        row = rules_matrix[first]

        # Set bit number 'bit' when the pair must not be split.
        packed = 0
        for bit, second in enumerate(all_values):
            if row[second] == 'no_boundary':
                packed |= 1 << bit

        packed_rows.append(packed)

    return packed_rows

def get_grapheme_cluster_break_tests_as_UTF8(grapheme_break_test_file_name):
    """Read grapheme cluster break tests and express them in UTF-8 terms.

    'grapheme_break_test_file_name' is the path of a UTF-8 text file.  Each
    test line is a whitespace-separated sequence of hexadecimal code points
    and marks: DIVISION SIGN (U+00F7) where a boundary is expected and
    MULTIPLICATION SIGN (U+00D7) where it is not.  Text after '#' is a
    comment.

    Returns a list with one (test, boundaries) tuple per non-empty line.
    'test' is a string that spells every UTF-8 byte of the test as a
    backslash, 'x' and two hexadecimal digits; 'boundaries' is the list of
    byte offsets at which a boundary is expected.
    """
    boundary_mark = u'\u00f7'       # DIVISION SIGN
    no_boundary_mark = u'\u00d7'    # MULTIPLICATION SIGN

    def _convert_line(line):
        # Strip comments.
        line = re.sub('#.*', '', line).strip()

        if line == "":
            return None

        escaped_code_points = []
        curr_bytes = 0
        boundaries = []

        # Match a list of code points.  split() without arguments tolerates
        # tabs and repeated spaces between tokens.
        for token in line.split():
            if token == boundary_mark:
                boundaries.append(curr_bytes)
            elif token == no_boundary_mark:
                pass
            else:
                code_point = int(token, 16)
                # Tests from Unicode spec have isolated surrogates in them.  Our
                # segmentation algorithm works on UTF-8 sequences, so encoding a
                # surrogate would produce an invalid code unit sequence.  Instead
                # of trying to emulate the maximal subpart algorithm for inserting
                # U+FFFD in Python, we just replace every isolated surrogate with
                # U+200B, which also has Grapheme_Cluster_Break equal to 'Control'
                # and test separately that we handle ill-formed UTF-8 sequences.
                if 0xd800 <= code_point <= 0xdfff:
                    code_point = 0x200b
                # Build the character from an ASCII escape sequence; this
                # works on Python 2 (including narrow builds) and Python 3.
                character = ('\\U%08x' % code_point) \
                    .encode('ascii').decode('unicode_escape')
                # bytearray yields integers on both Python 2 and Python 3.
                as_UTF8_bytes = bytearray(character.encode('utf-8'))
                escaped_code_points.append(
                    ''.join('\\x%02x' % byte for byte in as_UTF8_bytes))
                curr_bytes += len(as_UTF8_bytes)

        return (''.join(escaped_code_points), boundaries)

    # Self-test.
    assert(_convert_line(u'\u00f7 0903 \u00d7 0308 \u00f7 AC01 \u00f7 # abc') ==
           ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [0, 5, 8]))
    assert(_convert_line(u'\u00f7 D800 \u00f7 # abc') ==
           ('\\xe2\\x80\\x8b', [0, 3]))

    result = []

    # Read bytes and decode each line, so that lines are text on both
    # Python 2 and Python 3.
    with open(grapheme_break_test_file_name, 'rb') as f:
        for raw_line in f:
            test = _convert_line(raw_line.decode('utf-8'))
            if test is not None:
                result.append(test)

    return result

def get_grapheme_cluster_break_tests_as_unicode_scalars(grapheme_break_test_file_name):
    """Read grapheme cluster break tests as lists of Unicode scalar values.

    'grapheme_break_test_file_name' is the path of a UTF-8 text file.  Each
    test line is a whitespace-separated sequence of hexadecimal code points
    and marks: DIVISION SIGN (U+00F7) where a boundary is expected and
    MULTIPLICATION SIGN (U+00D7) where it is not.  Text after '#' is a
    comment.

    Returns a list with one (test, boundaries) tuple per non-empty line.
    'test' is the list of code points as integers; 'boundaries' is the list
    of positions in 'test' (counted in code points) at which a boundary is
    expected.
    """
    boundary_mark = u'\u00f7'       # DIVISION SIGN
    no_boundary_mark = u'\u00d7'    # MULTIPLICATION SIGN

    def _convert_line(line):
        # Strip comments.
        line = re.sub('#.*', '', line).strip()

        if line == "":
            return None

        test = []
        boundaries = []

        # Match a list of code points.  split() without arguments tolerates
        # tabs and repeated spaces between tokens.
        for token in line.split():
            if token == boundary_mark:
                # The number of code points seen so far is the position of
                # this boundary.
                boundaries.append(len(test))
            elif token == no_boundary_mark:
                pass
            else:
                code_point = int(token, 16)
                # Tests from Unicode spec have isolated surrogates in them.  Our
                # segmentation algorithm works on UTF-16 sequences, so encoding a
                # surrogate would produce an invalid code unit sequence.  Instead
                # of trying to emulate the maximal subpart algorithm for inserting
                # U+FFFD in Python, we just replace every isolated surrogate with
                # U+200B, which also has Grapheme_Cluster_Break equal to 'Control'
                # and test separately that we handle ill-formed UTF-16 sequences.
                if 0xd800 <= code_point <= 0xdfff:
                    code_point = 0x200b
                test.append(code_point)

        return (test, boundaries)

    # Self-test.
    assert(_convert_line(u'\u00f7 0903 \u00d7 0308 \u00f7 AC01 \u00f7 # abc') ==
           ([0x0903, 0x0308, 0xac01], [0, 2, 3]))
    assert(_convert_line(u'\u00f7 D800 \u00f7 # abc') == ([0x200b], [0, 1]))

    result = []

    # Read bytes and decode each line, so that lines are text on both
    # Python 2 and Python 3.
    with open(grapheme_break_test_file_name, 'rb') as f:
        for raw_line in f:
            test = _convert_line(raw_line.decode('utf-8'))
            if test is not None:
                result.append(test)

    return result