[gyb] Force Unicode strings in Python 2

All strings are sequences of Unicode characters in Python 3. This is entirely different than that of Python 2. Python 2's strings were of bytes. However, Python 2 does have the concept of Unicode strings. This patch changes the behavior of the file reader to use the same the codecs module on Python 2 to properly read a string into a unicode string. From there the strings are meant to be equivalent on 2 and 3. The rest of the patch just updates the code to natively work with unicode strings. To test the class `GraphemeClusterBreakPropertyTable`: $ python2 utils/gyb --test \ -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \ -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \ -DCMAKE_SIZEOF_VOID_P=8 \ -o /tmp/UnicodeExtendedGraphemeClusters.cpp.2.7.tmp \ ./stdlib/public/stubs/UnicodeExtendedGraphemeClusters.cpp.gyb $ python3 utils/gyb --test \ -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \ -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \ -DCMAKE_SIZEOF_VOID_P=8 \ -o /tmp/UnicodeExtendedGraphemeClusters.cpp.3.5.tmp \ ./stdlib/public/stubs/UnicodeExtendedGraphemeClusters.cpp.gyb $ diff -u /tmp/UnicodeExtendedGraphemeClusters.cpp.2.7.tmp \ /tmp/UnicodeExtendedGraphemeClusters.cpp.3.5.tmp To test the method `get_grapheme_cluster_break_tests_as_UTF8`: $ python2 utils/gyb --test \ -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \ -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \ -DCMAKE_SIZEOF_VOID_P=8 \ -o /tmp/UnicodeGraphemeBreakTest.cpp.2.7.tmp \ ./unittests/Basic/UnicodeGraphemeBreakTest.cpp.gyb $ python3 utils/gyb --test \ -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \ -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \ -DCMAKE_SIZEOF_VOID_P=8 \ -o /tmp/UnicodeGraphemeBreakTest.cpp.3.5.tmp \ ./unittests/Basic/UnicodeGraphemeBreakTest.cpp.gyb $ diff -u /tmp/UnicodeGraphemeBreakTest.cpp.2.7.tmp \ /tmp/UnicodeGraphemeBreakTest.cpp.3.5.tmp
2025-12-14 20:36:38 +01:00 · 2015-12-28 16:01:15 -05:00
parent c677844ba4
commit 7dbb4127f5
2 changed files with 14 additions and 10 deletions
--- a/utils/GYBUnicodeDataUtils.py
+++ b/utils/GYBUnicodeDataUtils.py
@@ -11,6 +11,8 @@
 ##===----------------------------------------------------------------------===##

 import re
+import sys
+import codecs

 class UnicodeProperty(object):
    """Abstract base class for Unicode properties."""
@@ -68,7 +70,7 @@ class GraphemeClusterBreakPropertyTable(UnicodeProperty):
            self.symbolic_values[v] = k

        # Load the data file.
-        with open(grapheme_break_property_file_name, 'rb') as f:
+        with codecs.open(grapheme_break_property_file_name, encoding=sys.getfilesystemencoding(), errors='strict') as f:
            for line in f:
                # Strip comments.
                line = re.sub('#.*', '', line)
@@ -514,9 +516,9 @@ def get_grapheme_cluster_break_tests_as_UTF8(grapheme_break_test_file_name):

        # Match a list of code points.
        for token in line.split(" "):
-            if token == "÷":
+            if token == u"÷":
                boundaries += [ curr_bytes ]
-            elif token == "×":
+            elif token == u"×":
                pass
            else:
                code_point = int(token, 16)
@@ -529,21 +531,21 @@ def get_grapheme_cluster_break_tests_as_UTF8(grapheme_break_test_file_name):
                # and test separately that we handle ill-formed UTF-8 sequences.
                if code_point >= 0xd800 and code_point <= 0xdfff:
                    code_point = 0x200b
-                code_point = ('\U%(cp)08x' % { 'cp': code_point }).decode('unicode_escape')
-                as_UTF8_bytes = code_point.encode('utf8')
-                as_UTF8_escaped = ''.join(['\\x%(byte)02x' % { 'byte': ord(byte) } for byte in as_UTF8_bytes])
+                code_point = (b'\U%(cp)08x' % { b'cp': code_point }).decode('unicode_escape', 'strict')
+                as_UTF8_bytes = bytearray(code_point.encode('utf8', 'strict'))
+                as_UTF8_escaped = ''.join(['\\x%(byte)02x' % { 'byte': byte } for byte in as_UTF8_bytes])
                test += as_UTF8_escaped
                curr_bytes += len(as_UTF8_bytes)

        return (test, boundaries)

    # Self-test.
-    assert(_convert_line('÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ]))
-    assert(_convert_line('÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ]))
+    assert(_convert_line(u'÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ]))
+    assert(_convert_line(u'÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ]))

    result = []

-    with open(grapheme_break_test_file_name, 'rb') as f:
+    with codecs.open(grapheme_break_test_file_name, encoding=sys.getfilesystemencoding(), errors='strict') as f:
        for line in f:
            test = _convert_line(line)
            if test: