diff --git a/stdlib/core/UnicodeTrie.swift.gyb b/stdlib/core/UnicodeTrie.swift.gyb index de9f94b7dac..e3b2341a4f8 100644 --- a/stdlib/core/UnicodeTrie.swift.gyb +++ b/stdlib/core/UnicodeTrie.swift.gyb @@ -32,13 +32,13 @@ SuppLookup1BytesPerEntry = 1 SuppLookup2BytesPerEntry = 1 SuppDataBytesPerEntry = 1 -TrieSize = 15904 +TrieSize = 15889 BMPLookupBytesOffset = 0 BMPDataBytesOffset = 256 SuppLookup1BytesOffset = 12032 -SuppLookup2BytesOffset = 12064 -SuppDataBytesOffset = 12832 +SuppLookup2BytesOffset = 12049 +SuppDataBytesOffset = 12817 }% diff --git a/stdlib/runtime/UnicodeExtendedGraphemeClusters.cpp.gyb b/stdlib/runtime/UnicodeExtendedGraphemeClusters.cpp.gyb index 53a4b147d98..75f6d538435 100644 --- a/stdlib/runtime/UnicodeExtendedGraphemeClusters.cpp.gyb +++ b/stdlib/runtime/UnicodeExtendedGraphemeClusters.cpp.gyb @@ -21,6 +21,7 @@ grapheme_cluster_break_property_table = \ GraphemeClusterBreakPropertyTable(unicodeGraphemeBreakPropertyFile) trie_generator = UnicodeTrieGenerator() +trie_generator.create_tables() trie_generator.fill_from_unicode_property(grapheme_cluster_break_property_table) trie_generator.verify(grapheme_cluster_break_property_table) diff --git a/test/stdlib/UnicodeTrieGenerator.gyb b/test/stdlib/UnicodeTrieGenerator.gyb new file mode 100644 index 00000000000..353f50c4332 --- /dev/null +++ b/test/stdlib/UnicodeTrieGenerator.gyb @@ -0,0 +1,139 @@ +%{ + +# RUN: rm -rf %t && mkdir -p %t && %S/../../utils/gyb %s | FileCheck %s +# +# REQUIRES: long_tests + +from GYBUnicodeDataUtils import * + +def test_trie_generation(property_table, configure_generator=None): + trie_generator = UnicodeTrieGenerator() + if configure_generator is not None: + configure_generator(trie_generator) + trie_generator.create_tables() + trie_generator.fill_from_unicode_property(property_table) + trie_generator.verify(property_table) + trie_generator.freeze() + trie_generator.verify(property_table) + trie_generator.serialize(property_table) + print ( + 
trie_generator.BMP_first_level_index_bits, + trie_generator.BMP_data_offset_bits, + trie_generator.supp_first_level_index_bits, + trie_generator.supp_second_level_index_bits, + trie_generator.supp_data_offset_bits, + + trie_generator.BMP_lookup_bytes_per_entry, + trie_generator.BMP_data_bytes_per_entry, + trie_generator.supp_lookup1_bytes_per_entry, + trie_generator.supp_lookup2_bytes_per_entry, + trie_generator.supp_data_bytes_per_entry, + + len(trie_generator.trie_bytes), + + trie_generator.BMP_data_bytes_offset - trie_generator.BMP_lookup_bytes_offset, + trie_generator.supp_lookup1_bytes_offset - trie_generator.BMP_data_bytes_offset, + trie_generator.supp_lookup2_bytes_offset - trie_generator.supp_lookup1_bytes_offset, + trie_generator.supp_data_bytes_offset - trie_generator.supp_lookup2_bytes_offset, + len(trie_generator.trie_bytes) - trie_generator.supp_data_bytes_offset) + +class PerfectlyCompressableProperty(UnicodeProperty): + def __init__(self): + pass + + def get_default_value(self): + return 'Default' + + def get_value(self, cp): + return 'Default' + + def to_numeric_value(self, value): + if value == 'Default': + return 42 + assert(False) + + def get_numeric_value(self, cp): + return self.to_numeric_value(self.get_value(cp)) + +print 'PerfectlyCompressableProperty' +test_trie_generation(PerfectlyCompressableProperty()) +# CHECK-LABEL: PerfectlyCompressableProperty +# CHECK: (8, 8, 5, 8, 8, 1, 1, 1, 1, 1, 1041, 256, 256, 17, 256, 256) +# +# Explanation for table sizes above: +# +# BMP_lookup: 1-byte words x 256 = 256 +# BMP_data: 1 x 1*256 = 256 +# supp_lookup1: 1 x 17 = 17 +# supp_lookup2: 1 x 1*256 = 256 +# supp_data: 1 x 1*256 = 256 + + +class UncompressableProperty(UnicodeProperty): + def __init__(self): + pass + + def get_default_value(self): + return 42 + + def get_value(self, cp): + # Split Unicode codespace into 128-entry "pages". Start each page with + # a unique sequence of property values (page number) so that the result + # can not be compressed. 
+ page_number = cp >> 7 + if cp % 0x80 == 1: + return page_number & 0xff + if cp % 0x80 == 2: + return (page_number >> 8) & 0xff + if cp % 0x80 == 3: + return (page_number >> 16) & 0xff + return 42 + + def to_numeric_value(self, value): + return value + + def get_numeric_value(self, cp): + return self.to_numeric_value(self.get_value(cp)) + +print 'UncompressableProperty, default trie parameters' +test_trie_generation(UncompressableProperty()) +# CHECK-LABEL: UncompressableProperty, default trie parameters +# CHECK: (8, 8, 5, 8, 8, 2, 1, 1, 2, 1, 1123601, 512, 65536, 17, 8704, 1048832) +# +# Explanation for table sizes above: +# +# BMP_lookup: 2-byte words x 256 = 512 +# BMP_data: 1 x 256*256 = 65536 +# supp_lookup1: 1 x 17 = 17 +# supp_lookup2: 2 x 17*256 = 8704 +# supp_data: 1 x (16*256+1)*256 = 1048832 + +def configure_generator_for_16_bit_indexes(trie_generator): + trie_generator.BMP_first_level_index_bits = 9 + + trie_generator.supp_first_level_index_bits = 10 + trie_generator.supp_second_level_index_bits = 2 + +print 'UncompressableProperty, 16-bit indexes' +test_trie_generation(UncompressableProperty(), + configure_generator_for_16_bit_indexes) +# CHECK-LABEL: UncompressableProperty, 16-bit indexes +# CHECK: (9, 7, 10, 2, 9, 2, 1, 2, 2, 1, 1120840, 1024, 65536, 1088, 4104, 1049088) +# +# Explanation for table sizes above: +# +# BMP_lookup: 2-byte words x 512 = 1024 +# BMP_data: 1 x 512*128 = 65536 +# supp_lookup1: 2 x 544 = 1088 +# supp_lookup2: 2 x 513*4 = 4104 +# supp_data: 1 x (2048+1)*512 = 1049088 + + +# gyb will print line markers after our output, so make sure that those +# don't accidentally match any other CHECK lines. 
+ +print 'THE END' +# CHECK-LABEL: THE END + +}% + diff --git a/utils/GYBUnicodeDataUtils.py b/utils/GYBUnicodeDataUtils.py index fd5ace0f0af..640e7d64d28 100644 --- a/utils/GYBUnicodeDataUtils.py +++ b/utils/GYBUnicodeDataUtils.py @@ -207,14 +207,12 @@ class UnicodeTrieGenerator(object): # Note: if you change any of these parameters, don't forget to update the # ASCII art above. BMP_first_level_index_bits = 8 - BMP_data_offset_bits = 16 - BMP_first_level_index_bits supp_first_level_index_bits = 5 supp_second_level_index_bits = 8 - supp_data_offset_bits = 21 - supp_first_level_index_bits - supp_second_level_index_bits def get_BMP_first_level_index(self, cp): - return cp >> self.BMP_first_level_index_bits + return cp >> self.BMP_data_offset_bits def get_BMP_data_offset(self, cp): return cp & ((1 << self.BMP_data_offset_bits) - 1) @@ -229,6 +227,30 @@ class UnicodeTrieGenerator(object): return cp & ((1 << self.supp_data_offset_bits) - 1) def __init__(self): + """Create a trie generator with default parameters.""" + pass + + def create_tables(self): + """Compute derived parameter values and create internal data + structures. + + Don't change parameter values after calling this method. + """ + + self.BMP_data_offset_bits = 16 - self.BMP_first_level_index_bits + + self.supp_data_offset_bits = \ + 21 - self.supp_first_level_index_bits - \ + self.supp_second_level_index_bits + + # The maximum value of the first level index for supp tables. It is + # not equal to ((1 << supp_first_level_index_bits) - 1), because + # maximum Unicode code point value is not 2^21-1 (0x1fffff), it is + # 0x10ffff. + self.supp_first_level_index_max = \ + 0x10ffff >> (self.supp_second_level_index_bits + \ + self.supp_data_offset_bits) + # A mapping from BMP first-level index to BMP data block index. 
self.BMP_lookup = [ i for i in range(0, 1 << self.BMP_first_level_index_bits) ] @@ -239,19 +261,19 @@ class UnicodeTrieGenerator(object): # A mapping from supp first-level index to an index of the second-level # lookup table. - self.supp_lookup1 = [ i for i in range(0, 1 << self.supp_first_level_index_bits) ] + self.supp_lookup1 = [ i for i in range(0, self.supp_first_level_index_max + 1) ] # An array of second-level lookup tables. Each second-level lookup # table is a mapping from a supp second-level index to supp data block # index. self.supp_lookup2 = [ [ j for j in range(i << self.supp_second_level_index_bits, (i + 1) << self.supp_second_level_index_bits) ] - for i in range(0, (1 << self.supp_first_level_index_bits)) ] + for i in range(0, self.supp_first_level_index_max + 1) ] # An arry of supp data blocks. self.supp_data = [ [ -1 for i in range(0, 1 << self.supp_data_offset_bits) ] - for i in range(0, 1 << (self.supp_first_level_index_bits + self.supp_second_level_index_bits)) ] + for i in range(0, (self.supp_first_level_index_max + 1) * (1 << self.supp_second_level_index_bits)) ] def splat(self, value): for i in range(0, len(self.BMP_data)): @@ -292,6 +314,23 @@ class UnicodeTrieGenerator(object): assert(expectedValue == actualValue) def freeze(self): + """Compress internal trie representation. + + Don't mutate the trie after calling this method. + + """ + + def remap_indexes(indexes, old_idx, new_idx): + def map_index(idx): + if idx == old_idx: + return new_idx + elif idx > old_idx: + return idx - 1 + else: + return idx + + return map(map_index, indexes) + # If self.BMP_data contains identical data blocks, keep the first one, # remove duplicates and change the indexes in self.BMP_lookup to point to # the first one. 
@@ -301,11 +340,8 @@ class UnicodeTrieGenerator(object): while j < len(self.BMP_data): if self.BMP_data[i] == self.BMP_data[j]: self.BMP_data.pop(j) - for k in range(0, len(self.BMP_lookup)): - if self.BMP_lookup[k] == j: - self.BMP_lookup[k] = i - elif self.BMP_lookup[k] > j: - self.BMP_lookup[k] -= 1 + self.BMP_lookup = \ + remap_indexes(self.BMP_lookup, old_idx=j, new_idx=i) else: j += 1 i += 1 @@ -320,11 +356,9 @@ class UnicodeTrieGenerator(object): if self.supp_data[i] == self.supp_data[j]: self.supp_data.pop(j) for k in range(0, len(self.supp_lookup2)): - for l in range(0, len(self.supp_lookup2[k])): - if self.supp_lookup2[k][l] == j: - self.supp_lookup2[k][l] = i - elif self.supp_lookup2[k][l] > j: - self.supp_lookup2[k][l] -= 1 + self.supp_lookup2[k] = \ + remap_indexes(self.supp_lookup2[k], old_idx=j, + new_idx=i) else: j += 1 i += 1 @@ -337,11 +371,8 @@ class UnicodeTrieGenerator(object): while j < len(self.supp_lookup2): if self.supp_lookup2[i] == self.supp_lookup2[j]: self.supp_lookup2.pop(j) - for k in range(0, len(self.supp_lookup1)): - if self.supp_lookup1[k] == j: - self.supp_lookup1[k] = i - elif self.supp_lookup1[k] > j: - self.supp_lookup1[k] -= 1 + self.supp_lookup1 = \ + remap_indexes(self.supp_lookup1, old_idx=j, new_idx=i) else: j += 1 i += 1 @@ -351,7 +382,7 @@ class UnicodeTrieGenerator(object): assert(data & ~0xff == 0) return [ data ] if width == 2: - assert(data & 0xffff == 0) + assert(data & ~0xffff == 0) return [ data & 0xff, data & 0xff00 ] assert(False)