mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
643 lines
24 KiB
Python
643 lines
24 KiB
Python
# ===--- GYBUnicodeDataUtils.py ----------------------*- coding: utf-8 -*-===//
|
||
#
|
||
# This source file is part of the Swift.org open source project
|
||
#
|
||
# Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
|
||
# Licensed under Apache License v2.0 with Runtime Library Exception
|
||
#
|
||
# See https://swift.org/LICENSE.txt for license information
|
||
# See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
||
|
||
import codecs
|
||
import re
|
||
|
||
|
||
class UnicodeProperty(object):
    """Abstract base class for Unicode properties.

    Subclasses are expected to override every method below; each base
    implementation unconditionally raises NotImplementedError.
    """

    def __init__(self):
        raise NotImplementedError(
            "UnicodeProperty.__init__ is not implemented.")

    def get_default_value(self):
        """Return the property value for code points not explicitly listed."""
        raise NotImplementedError(
            "UnicodeProperty.get_default_value is not implemented.")

    def get_value(self, cp):
        """Return the symbolic property value for code point 'cp'."""
        raise NotImplementedError(
            "UnicodeProperty.get_value is not implemented.")

    def to_numeric_value(self, value):
        """Map a symbolic property value to its numeric encoding."""
        raise NotImplementedError(
            "UnicodeProperty.to_numeric_value is not implemented.")

    def get_numeric_value(self, cp):
        """Return the numeric property value for code point 'cp'."""
        raise NotImplementedError(
            "UnicodeProperty.get_numeric_value is not implemented.")
|
||
|
||
|
||
class GraphemeClusterBreakPropertyTable(UnicodeProperty):
    """Grapheme_Cluster_Break property, loaded from a Unicode data file.

    The data file (GraphemeBreakProperty.txt) consists of lines of the form

        00AD          ; Control # comment
        0600..0605    ; Prepend # comment

    i.e. a single code point or an inclusive range of code points, followed
    by a symbolic property value.  Unlisted code points get the default
    value 'Other'.
    """

    # Note: Numeric values (including the names) should be consistent with
    # '_GraphemeClusterBreakPropertyValue' enum on the Swift side, and with
    # 'GraphemeClusterBreakProperty' in the compiler C++ code. If there is a
    # reason for either of those to differ, then this mapping can be overridden
    # after an instance of this class is created.
    numeric_value_table = {
        'Other': 0,
        'CR': 1,
        'LF': 2,
        'Control': 3,
        'Extend': 4,
        'Regional_Indicator': 5,
        'Prepend': 6,
        'SpacingMark': 7,
        'L': 8,
        'V': 9,
        'T': 10,
        'LV': 11,
        'LVT': 12,
    }

    def __init__(self, grapheme_break_property_file_name):
        """Parse the data file and build the lookup tables.

        Args:
            grapheme_break_property_file_name: path to
                GraphemeBreakProperty.txt from the Unicode Character
                Database.
        """
        # An array of tuples (start_code_point, end_code_point, value).
        # This is per-instance state; making it a class attribute (as it
        # used to be) would share — and accumulate — parsed ranges across
        # all instances.
        self.property_value_ranges = []

        # Build 'self.symbolic_values' -- an array that maps numeric property
        # values to symbolic values.
        self.symbolic_values = \
            [None] * (max(self.numeric_value_table.values()) + 1)
        for k, v in self.numeric_value_table.items():
            self.symbolic_values[v] = k

        # Load the data file.
        with codecs.open(
                grapheme_break_property_file_name,
                encoding='utf-8',
                errors='strict') as f:
            for line in f:
                # Strip comments.
                line = re.sub('#.*', '', line)

                # Single code point?
                m = re.match(r'([0-9A-F]+) +; +([a-zA-Z_]+) ', line)
                if m:
                    code_point = int(m.group(1), 16)
                    value = m.group(2)
                    self.property_value_ranges += \
                        [(code_point, code_point, value)]
                    continue

                # Range of code points?  Note: the '..' separator must be
                # escaped — an unescaped '.' would match any character.
                m = re.match(
                    r'([0-9A-F]+)\.\.([0-9A-F]+) +; +([a-zA-Z_]+) ', line)
                if m:
                    start_code_point = int(m.group(1), 16)
                    end_code_point = int(m.group(2), 16)
                    value = m.group(3)
                    self.property_value_ranges += \
                        [(start_code_point, end_code_point, value)]

        # Prepare a flat lookup table for fast access: one property value
        # for every code point in [0, 0x10ffff].
        self.property_values = [self.get_default_value()] * 0x110000
        for start_code_pt, end_code_pt, val in self.property_value_ranges:
            for cp in range(start_code_pt, end_code_pt + 1):
                self.property_values[cp] = val

    def get_default_value(self):
        """Return the value assigned to code points not in the data file."""
        return 'Other'

    def get_value(self, cp):
        """Return the symbolic property value for code point 'cp'."""
        return self.property_values[cp]

    def to_numeric_value(self, value):
        """Map a symbolic property value to its stable numeric encoding."""
        return self.numeric_value_table[value]

    def get_numeric_value(self, cp):
        """Return the numeric property value for code point 'cp'."""
        return self.to_numeric_value(self.get_value(cp))
|
||
|
||
|
||
# BMP code points are 16-bit values. The code point value is split as
# follows:
#
# 8 bits 8 bits
# +-------------------------+-------------------------+
# | 15 14 13 12 11 10 9 8 | 7 6 5 4 3 2 1 0 |
# +-------------------------+-------------------------+
# first-level index data offset
#
# Supplementary code points (U+XXXX where XXXX > 0xffff) are 21-bit values.
# The code point value is split as follows:
#
# 5 bits 8 bits 8 bits
# +----------------+-------------------------+-------------------------+
# | 20 19 18 17 16 | 15 14 13 12 11 10 9 8 | 7 6 5 4 3 2 1 0 |
# +----------------+-------------------------+-------------------------+
# first-level second-level index data offset
# index
#
# The actual number of bits are just trie parameters. They affect the size of
# the lookup tables (and thus, lookup time), but do not change the overall
# structure of the trie.
#
# Here and below 'supp' stands for 'supplementary characters'.
#
# Property data for BMP code points is stored as a one-stage trie.
# A trie with one lookup table consists of two memory blocks:
#
# First-level lookup table
# +-----+-----+-----+-----+--...--+
# | * | * | * | * | |
# +--|--+--|--+--|--+--|--+--...--+
# | | | \ The references don't form
# | \____| \___, a systematic pattern
# | | |
# | | | Data storage
# +-V--------++-V--------++-V--------++---...---+
# | data || data || data || |
# +----------++----------++----------++---...---+
#
# In order to fetch data for a given code point, you need to:
# * load from the first-level lookup table using first-level index; this will
# give you the number of the data block that you should use.
# * load from the data block applying the data offset.
#
# Property data for supplementary code points is stored as a two-stage trie.
# A trie with two-stage lookup tables consists of three memory blocks. The
# following drawing explains how it is implemented:
#
# First-level lookup table
# +-----+-----+-----+-----+-----+--...--+
# | * | * | * | * | * | |
# +--|--+--|--+--|--+--|--+--|--+--...--+
# | | | | \ The references don't form
# ,__/ | \____| \___, a systematic pattern
# / | | |
# | | | | Second-level lookup table
# +-V--------++-V--------++-V--------++-V--------++---...---+
# | ******** || ******** || ******** || || |
# +-||||||||-++-||||||||-++-||||||||-++----------++---...---+
# \\\|//// ||||||VV |VVV|V|V
# \\|/// |||||| / | |
# \|// |||||| / | |
# |/ ||||| \__|___. \ \ The references don't form
# | |||| \___|__. \ | \ a systematic pattern
# | ||| \____| \ \__| \
# | || \_____|__. \___|___\ ...___.
# | | \______| \____| \___, | Data storage
# +-V-----++-V-----++-V-----++-V-----++-V-----++-V-----++---...---+
# | data || data || data || data || || || |
# +-------++-------++-------++-------++-------++-------++---...---+
#
# In order to fetch data for a given code point, you need to:
# * load from the first-level lookup table using first-level index; this will
# give you the number of the second-level lookup table that you should use.
# * load from the chosen second-level lookup table using the second-level
# index, which will give you the number of the data block that you should
# use.
# * load from the data block applying the data offset.
#
# First- and second-level lookup tables in the general case contain 16-bit
# words; that will be sufficient to store a trie that does not compress at all.
# But in many cases, after trie compression there will be fewer than 256
# unique second-level lookup tables and/or data storage blocks, which allows
# one to use 8-bit words in lookup tables.
#
# The bitwidth of data depends on the application of the trie.
#
# The supp tables contain entries for BMP code units to simplify trie
# implementation, but those BMP entries are filled with the default value, so
# they compress well.
|
||
class UnicodeTrieGenerator(object):
    """Generates a trie that maps Unicode code points to property values.

    BMP code points go through a one-stage trie (a first-level lookup table
    selecting a data block), supplementary code points through a two-stage
    trie; see the explanation and ASCII art above.  'freeze()' deduplicates
    identical blocks, and 'serialize()' flattens the whole trie into a
    single little-endian byte array ('self.trie_bytes').
    """

    # Note: if you change any of these parameters, don't forget to update the
    # ASCII art above.
    bmp_first_level_index_bits = 8

    supp_first_level_index_bits = 5
    supp_second_level_index_bits = 8

    def get_bmp_first_level_index(self, cp):
        """Return the index into 'bmp_lookup' for BMP code point 'cp'."""
        return cp >> self.bmp_data_offset_bits

    def get_bmp_data_offset(self, cp):
        """Return the offset of 'cp' within its BMP data block."""
        return cp & ((1 << self.bmp_data_offset_bits) - 1)

    def get_supp_first_level_index(self, cp):
        """Return the index into 'supp_lookup1' for supp code point 'cp'."""
        return cp >> \
            (self.supp_second_level_index_bits + self.supp_data_offset_bits)

    def get_supp_second_level_index(self, cp):
        """Return the index into a second-level table for code point 'cp'."""
        return (cp >> self.supp_data_offset_bits) & \
            ((1 << self.supp_second_level_index_bits) - 1)

    def get_supp_data_offset(self, cp):
        """Return the offset of 'cp' within its supp data block."""
        return cp & ((1 << self.supp_data_offset_bits) - 1)

    def __init__(self):
        """Create a trie generator with default parameters."""
        pass

    def create_tables(self):
        """Compute derived parameter values and create internal data
        structures.

        Don't change parameter values after calling this method.
        """
        self.bmp_data_offset_bits = 16 - self.bmp_first_level_index_bits

        self.supp_data_offset_bits = \
            21 - self.supp_first_level_index_bits - \
            self.supp_second_level_index_bits

        # The maximum value of the first level index for supp tables. It is
        # not equal to ((1 << supp_first_level_index_bits) - 1), because
        # maximum Unicode code point value is not 2^21-1 (0x1fffff), it is
        # 0x10ffff.
        self.supp_first_level_index_max = \
            0x10ffff >> \
            (self.supp_second_level_index_bits + self.supp_data_offset_bits)

        # A mapping from BMP first-level index to BMP data block index.
        # Initially an identity mapping; freeze() redirects entries when it
        # removes duplicate blocks.
        self.bmp_lookup = \
            [i for i in range(0, 1 << self.bmp_first_level_index_bits)]

        # An array of BMP data blocks, filled with a sentinel value until
        # splat()/set_value() stores real data.
        self.bmp_data = [
            [-1 for i in range(0, 1 << self.bmp_data_offset_bits)]
            for i in range(0, 1 << self.bmp_first_level_index_bits)
        ]

        # A mapping from supp first-level index to an index of the second-level
        # lookup table.
        self.supp_lookup1 = \
            [i for i in range(0, self.supp_first_level_index_max + 1)]

        # An array of second-level lookup tables. Each second-level lookup
        # table is a mapping from a supp second-level index to supp data block
        # index.
        self.supp_lookup2 = [
            [j for j in range(i << self.supp_second_level_index_bits,
                              (i + 1) << self.supp_second_level_index_bits)]
            for i in range(0, self.supp_first_level_index_max + 1)
        ]

        # An array of supp data blocks.
        self.supp_data = [
            [-1 for i in range(0, 1 << self.supp_data_offset_bits)]
            for i in range(0, (self.supp_first_level_index_max + 1) *
                           (1 << self.supp_second_level_index_bits))
        ]

    def splat(self, value):
        """Set every entry of every BMP and supp data block to 'value'."""
        for i in range(0, len(self.bmp_data)):
            for j in range(0, len(self.bmp_data[i])):
                self.bmp_data[i][j] = value

        for i in range(0, len(self.supp_data)):
            for j in range(0, len(self.supp_data[i])):
                self.supp_data[i][j] = value

    def set_value(self, cp, value):
        """Store 'value' for code point 'cp'.

        Must not be called after freeze(): once blocks are shared, a write
        through one lookup entry would affect other code points.
        """
        if cp <= 0xffff:
            data_block_index = self.bmp_lookup[
                self.get_bmp_first_level_index(cp)]
            self.bmp_data[data_block_index][
                self.get_bmp_data_offset(cp)] = value
        else:
            second_lookup_index = self.supp_lookup1[
                self.get_supp_first_level_index(cp)]
            data_block_index = self.supp_lookup2[second_lookup_index][
                self.get_supp_second_level_index(cp)]
            self.supp_data[data_block_index][
                self.get_supp_data_offset(cp)] = value

    def get_value(self, cp):
        """Return the value stored for code point 'cp'."""
        if cp <= 0xffff:
            data_block_index = self.bmp_lookup[
                self.get_bmp_first_level_index(cp)]
            return self.bmp_data[data_block_index][
                self.get_bmp_data_offset(cp)]
        else:
            second_lookup_index = self.supp_lookup1[
                self.get_supp_first_level_index(cp)]
            data_block_index = self.supp_lookup2[second_lookup_index][
                self.get_supp_second_level_index(cp)]
            return self.supp_data[data_block_index][
                self.get_supp_data_offset(cp)]

    def fill_from_unicode_property(self, unicode_property):
        """Store 'unicode_property's value for every Unicode code point."""
        self.splat(unicode_property.get_default_value())
        for cp in range(0, 0x110000):
            self.set_value(cp, unicode_property.get_value(cp))

    def verify(self, unicode_property):
        """Check that the trie agrees with 'unicode_property' everywhere."""
        for cp in range(0, 0x110000):
            expected_value = unicode_property.get_value(cp)
            actual_value = self.get_value(cp)
            assert expected_value == actual_value

    def freeze(self):
        """Compress internal trie representation.

        Don't mutate the trie after calling this method.
        """
        def remap_indexes(indexes, old_idx, new_idx):
            # Redirect references to the removed entry 'old_idx' to
            # 'new_idx', and close the hole left by the removal (entries
            # after 'old_idx' shift down by one).
            def map_index(idx):
                if idx == old_idx:
                    return new_idx
                elif idx > old_idx:
                    return idx - 1
                else:
                    return idx

            return list(map(map_index, indexes))

        # If self.bmp_data contains identical data blocks, keep the first one,
        # remove duplicates and change the indexes in self.bmp_lookup to point
        # to the first one.
        i = 0
        while i < len(self.bmp_data):
            j = i + 1
            while j < len(self.bmp_data):
                if self.bmp_data[i] == self.bmp_data[j]:
                    self.bmp_data.pop(j)
                    self.bmp_lookup = \
                        remap_indexes(self.bmp_lookup, old_idx=j, new_idx=i)
                else:
                    j += 1
            i += 1

        # For supp tables, perform bottom-up deduplication: first, deduplicate
        # data blocks. The algorithm is the same as above, but operates on
        # self.supp_data/supp_lookup2.
        i = 0
        while i < len(self.supp_data):
            j = i + 1
            while j < len(self.supp_data):
                if self.supp_data[i] == self.supp_data[j]:
                    self.supp_data.pop(j)
                    for k in range(0, len(self.supp_lookup2)):
                        self.supp_lookup2[k] = \
                            remap_indexes(self.supp_lookup2[k],
                                          old_idx=j, new_idx=i)
                else:
                    j += 1
            i += 1

        # Next, deduplicate second-level lookup tables.
        # Same as above, but for supp_lookup1/supp_lookup2.
        i = 0
        while i < len(self.supp_lookup2):
            j = i + 1
            while j < len(self.supp_lookup2):
                if self.supp_lookup2[i] == self.supp_lookup2[j]:
                    self.supp_lookup2.pop(j)
                    self.supp_lookup1 = \
                        remap_indexes(self.supp_lookup1, old_idx=j, new_idx=i)
                else:
                    j += 1
            i += 1

    def _int_to_le_bytes(self, data, width):
        """Serialize non-negative 'data' as 'width' little-endian bytes."""
        if width == 1:
            assert data & ~0xff == 0
            return [data]
        if width == 2:
            assert data & ~0xffff == 0
            # Least-significant byte first.  The high byte must be shifted
            # down into the 0..255 range; the previous 'data & 0xff00'
            # produced a multiple of 256 instead of a byte value.
            return [data & 0xff, (data >> 8) & 0xff]
        assert False

    def _int_list_to_le_bytes(self, ints, width):
        """Serialize every element of 'ints' as 'width' little-endian
        bytes and return the concatenated byte list."""
        return [
            byte
            for elt in ints
            for byte in self._int_to_le_bytes(elt, width)]

    def serialize(self, unicode_property):
        """Flatten the trie into a byte array, 'self.trie_bytes'.

        Data values are encoded with 'unicode_property.to_numeric_value()'.
        The byte offset and per-entry width of each of the five sections
        (bmp_lookup, bmp_data, supp_lookup1, supp_lookup2, supp_data) are
        recorded as attributes.
        """
        # Narrow lookup entries to one byte when every referenced block
        # number fits into it (see the comment above the class).
        self.bmp_lookup_bytes_per_entry = 1 if len(self.bmp_data) < 256 else 2
        self.bmp_data_bytes_per_entry = 1

        self.supp_lookup1_bytes_per_entry = 1 if len(self.supp_lookup2) < 256 \
            else 2
        self.supp_lookup2_bytes_per_entry = 1 if len(self.supp_data) < 256 \
            else 2
        self.supp_data_bytes_per_entry = 1

        bmp_lookup_words = list(self.bmp_lookup)
        bmp_data_words = [
            unicode_property.to_numeric_value(elt)
            for block in self.bmp_data
            for elt in block]

        supp_lookup1_words = list(self.supp_lookup1)
        supp_lookup2_words = [
            elt for block in self.supp_lookup2 for elt in block]
        supp_data_words = [
            unicode_property.to_numeric_value(elt)
            for block in self.supp_data
            for elt in block]

        bmp_lookup_bytes = self._int_list_to_le_bytes(
            bmp_lookup_words, self.bmp_lookup_bytes_per_entry)
        bmp_data_bytes = self._int_list_to_le_bytes(
            bmp_data_words, self.bmp_data_bytes_per_entry)

        supp_lookup1_bytes = self._int_list_to_le_bytes(
            supp_lookup1_words, self.supp_lookup1_bytes_per_entry)
        supp_lookup2_bytes = self._int_list_to_le_bytes(
            supp_lookup2_words, self.supp_lookup2_bytes_per_entry)
        supp_data_bytes = self._int_list_to_le_bytes(
            supp_data_words, self.supp_data_bytes_per_entry)

        # Concatenate the sections, remembering where each one starts.
        self.trie_bytes = []

        self.bmp_lookup_bytes_offset = 0
        self.trie_bytes += bmp_lookup_bytes

        self.bmp_data_bytes_offset = len(self.trie_bytes)
        self.trie_bytes += bmp_data_bytes

        self.supp_lookup1_bytes_offset = len(self.trie_bytes)
        self.trie_bytes += supp_lookup1_bytes

        self.supp_lookup2_bytes_offset = len(self.trie_bytes)
        self.trie_bytes += supp_lookup2_bytes

        self.supp_data_bytes_offset = len(self.trie_bytes)
        self.trie_bytes += supp_data_bytes
|
||
|
||
|
||
def get_extended_grapheme_cluster_rules_matrix(grapheme_cluster_break_table):
    """Return the extended grapheme cluster break rules as a bit matrix.

    The result contains one integer per Grapheme_Cluster_Break value, in
    numeric order.  Bit 'j' of entry 'i' is set iff there is *no* boundary
    between a code point whose property value has numeric encoding 'i' and
    a following code point whose value has numeric encoding 'j'.
    """
    any_value = grapheme_cluster_break_table.symbolic_values

    # Rules to determine extended grapheme cluster boundaries, as defined in
    # 'Grapheme Break Chart',
    # http://www.unicode.org/Public/6.3.0/ucd/auxiliary/GraphemeBreakTest.html,
    # Unicode 6.3.0.
    #
    # The Unicode 7.0.0 draft does not change these rules.
    #
    # As in the referenced document, the rules are specified in order of
    # decreasing priority.
    rules = [
        (['CR'], 'no_boundary', ['LF']),
        (['Control', 'CR', 'LF'], 'boundary', any_value),
        (any_value, 'boundary', ['Control', 'CR', 'LF']),
        (['L'], 'no_boundary', ['L', 'V', 'LV', 'LVT']),
        (['LV', 'V'], 'no_boundary', ['V', 'T']),
        (['LVT', 'T'], 'no_boundary', ['T']),
        (['Regional_Indicator'], 'no_boundary', ['Regional_Indicator']),
        (any_value, 'no_boundary', ['Extend']),
        (any_value, 'no_boundary', ['SpacingMark']),
        (['Prepend'], 'no_boundary', any_value),
        (any_value, 'boundary', any_value),
    ]

    # Expand the rules into a full (first value) x (second value) matrix.
    # Applying them from lowest to highest priority lets a higher-priority
    # rule simply overwrite whatever a lower-priority rule put in a cell;
    # the lowest-priority catch-all rule guarantees every cell is filled.
    rules_matrix = {first: {} for first in any_value}
    for first_list, action, second_list in reversed(rules):
        for first in first_list:
            row = rules_matrix[first]
            for second in second_list:
                row[second] = action

    # Make sure we can pack one row of the matrix into a 'uint16_t'.
    assert len(any_value) <= 16

    # Pack each row into an integer: bit 'j' set means 'no_boundary'.
    result = []
    for first in any_value:
        row = rules_matrix[first]
        packed = 0
        for bit_index, second in enumerate(any_value):
            if row[second] == 'no_boundary':
                packed |= 1 << bit_index
        result.append(packed)

    return result
|
||
|
||
|
||
def get_grapheme_cluster_break_tests_as_utf8(grapheme_break_test_file_name):
    """Load GraphemeBreakTest.txt and return tests as escaped UTF-8 strings.

    Returns a list of (test, boundaries) tuples: 'test' is a string of
    C-style '\\xNN' escapes spelling out the UTF-8 encoding of the test's
    code points, and 'boundaries' lists the byte offsets at which extended
    grapheme cluster boundaries occur.
    """
    def _convert_line(line):
        # Drop the trailing comment; lines without test data map to None.
        line = re.sub('#.*', '', line).strip()

        if line == "":
            return None

        escaped_utf8 = ""
        byte_count = 0
        boundaries = []

        # A test line is a space-separated sequence of code points
        # interleaved with markers: '÷' marks a boundary, '×' the absence
        # of one.
        for token in line.split(" "):
            if token == u"÷":
                boundaries.append(byte_count)
            elif token == u"×":
                pass
            else:
                code_point = int(token, 16)

                # Tests from Unicode spec have isolated surrogates in them.
                # Our segmentation algorithm works on UTF-8 sequences, so
                # encoding a surrogate would produce an invalid code unit
                # sequence. Instead of trying to emulate the maximal subpart
                # algorithm for inserting U+FFFD in Python, we just replace
                # every isolated surrogate with U+200B, which also has
                # Grapheme_Cluster_Break equal to 'Control' and test
                # separately that we handle ill-formed UTF-8 sequences.
                if code_point >= 0xd800 and code_point <= 0xdfff:
                    code_point = 0x200b

                # Turn the numeric code point into a one-character unicode
                # string via a '\UXXXXXXXX' escape, then into escaped UTF-8.
                code_point = (b'\U%(cp)08x' % {b'cp': code_point}).decode(
                    'unicode_escape', 'strict')
                utf8_bytes = bytearray(code_point.encode('utf8', 'strict'))
                escaped_utf8 += ''.join(
                    ['\\x%(byte)02x' % {'byte': byte}
                     for byte in utf8_bytes])
                byte_count += len(utf8_bytes)

        return (escaped_utf8, boundaries)

    # Self-test.
    assert (_convert_line(u'÷ 0903 × 0308 ÷ AC01 ÷ # abc') ==
            ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [0, 5, 8]))
    assert _convert_line(u'÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [0, 3])

    with codecs.open(
            grapheme_break_test_file_name,
            encoding='utf-8',
            errors='strict') as f:
        return [converted
                for converted in map(_convert_line, f)
                if converted]
|
||
|
||
|
||
def get_grapheme_cluster_break_tests_as_unicode_scalars(
        grapheme_break_test_file_name):
    """Load GraphemeBreakTest.txt and return tests as code point lists.

    Returns a list of (test, boundaries) tuples: 'test' is the list of code
    points of the test, and 'boundaries' lists the code point offsets at
    which extended grapheme cluster boundaries occur.
    """
    def _convert_line(line):
        # Strip comments; lines without test data map to None.
        line = re.sub('#.*', '', line).strip()

        if line == "":
            return None

        test = []
        curr_code_points = 0
        boundaries = []

        # Match a list of code points, interleaved with '÷' (boundary) and
        # '×' (no boundary) markers.
        for token in line.split(" "):
            if token == "÷":
                boundaries += [curr_code_points]
            elif token == "×":
                pass
            else:
                code_point = int(token, 16)
                # Tests from Unicode spec have isolated surrogates in them. Our
                # segmentation algorithm works on UTF-16 sequences, so encoding
                # a surrogate would produce an invalid code unit sequence.
                # Instead of trying to emulate the maximal subpart algorithm
                # for inserting U+FFFD in Python, we just replace every
                # isolated surrogate with U+200B, which also has
                # Grapheme_Cluster_Break equal to 'Control' and test separately
                # that we handle ill-formed UTF-8 sequences.
                if code_point >= 0xd800 and code_point <= 0xdfff:
                    code_point = 0x200b
                test += [code_point]
                curr_code_points += 1

        return (test, boundaries)

    # Self-test.
    assert (_convert_line('÷ 0903 × 0308 ÷ AC01 ÷ # abc') ==
            ([0x0903, 0x0308, 0xac01], [0, 2, 3]))
    assert _convert_line('÷ D800 ÷ # abc') == ([0x200b], [0, 1])

    result = []

    # Decode the file as UTF-8 (consistent with
    # get_grapheme_cluster_break_tests_as_utf8) so that the '÷'/'×' markers
    # compare equal to the string literals above.  The file used to be
    # opened in binary mode, which yields bytes lines that can never match
    # the str tokens on Python 3 and make re.sub raise a TypeError.
    with codecs.open(
            grapheme_break_test_file_name,
            encoding='utf-8',
            errors='strict') as f:
        for line in f:
            test = _convert_line(line)
            if test:
                result += [test]

    return result
|