[gyb] Force Unicode strings in Python 2

All strings are sequences of Unicode characters in Python 3. This is
entirely different from Python 2, whose strings were sequences of
bytes. However, Python 2 does have the concept of Unicode strings. This
patch changes the behavior of the file reader to use the same codecs
module on Python 2 to properly read a string into a unicode string. From
there the strings are meant to be equivalent on 2 and 3. The rest of the
patch just updates the code to natively work with unicode strings.

To test the class `GraphemeClusterBreakPropertyTable`:

    $ python2 utils/gyb --test \
    -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
    -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
    -DCMAKE_SIZEOF_VOID_P=8 \
    -o /tmp/UnicodeExtendedGraphemeClusters.cpp.2.7.tmp \
    ./stdlib/public/stubs/UnicodeExtendedGraphemeClusters.cpp.gyb

    $ python3 utils/gyb --test \
    -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
    -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
    -DCMAKE_SIZEOF_VOID_P=8 \
    -o /tmp/UnicodeExtendedGraphemeClusters.cpp.3.5.tmp \
    ./stdlib/public/stubs/UnicodeExtendedGraphemeClusters.cpp.gyb

    $ diff -u /tmp/UnicodeExtendedGraphemeClusters.cpp.2.7.tmp \
    /tmp/UnicodeExtendedGraphemeClusters.cpp.3.5.tmp

To test the method `get_grapheme_cluster_break_tests_as_UTF8`:

    $ python2 utils/gyb --test \
    -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
    -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
    -DCMAKE_SIZEOF_VOID_P=8 \
    -o /tmp/UnicodeGraphemeBreakTest.cpp.2.7.tmp \
    ./unittests/Basic/UnicodeGraphemeBreakTest.cpp.gyb

    $ python3 utils/gyb --test \
    -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
    -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
    -DCMAKE_SIZEOF_VOID_P=8 \
    -o /tmp/UnicodeGraphemeBreakTest.cpp.3.5.tmp \
    ./unittests/Basic/UnicodeGraphemeBreakTest.cpp.gyb

    $ diff -u /tmp/UnicodeGraphemeBreakTest.cpp.2.7.tmp \
    /tmp/UnicodeGraphemeBreakTest.cpp.3.5.tmp
This commit is contained in:
Ryan Lovelett
2015-12-28 16:01:15 -05:00
parent c677844ba4
commit 7dbb4127f5
2 changed files with 14 additions and 10 deletions

View File

@@ -11,6 +11,8 @@
##===----------------------------------------------------------------------===##
import re
import sys
import codecs
class UnicodeProperty(object):
"""Abstract base class for Unicode properties."""
@@ -68,7 +70,7 @@ class GraphemeClusterBreakPropertyTable(UnicodeProperty):
self.symbolic_values[v] = k
# Load the data file.
with open(grapheme_break_property_file_name, 'rb') as f:
with codecs.open(grapheme_break_property_file_name, encoding=sys.getfilesystemencoding(), errors='strict') as f:
for line in f:
# Strip comments.
line = re.sub('#.*', '', line)
@@ -514,9 +516,9 @@ def get_grapheme_cluster_break_tests_as_UTF8(grapheme_break_test_file_name):
# Match a list of code points.
for token in line.split(" "):
if token == "÷":
if token == u"÷":
boundaries += [ curr_bytes ]
elif token == "×":
elif token == u"×":
pass
else:
code_point = int(token, 16)
@@ -529,21 +531,21 @@ def get_grapheme_cluster_break_tests_as_UTF8(grapheme_break_test_file_name):
# and test separately that we handle ill-formed UTF-8 sequences.
if code_point >= 0xd800 and code_point <= 0xdfff:
code_point = 0x200b
code_point = ('\U%(cp)08x' % { 'cp': code_point }).decode('unicode_escape')
as_UTF8_bytes = code_point.encode('utf8')
as_UTF8_escaped = ''.join(['\\x%(byte)02x' % { 'byte': ord(byte) } for byte in as_UTF8_bytes])
code_point = (b'\U%(cp)08x' % { b'cp': code_point }).decode('unicode_escape', 'strict')
as_UTF8_bytes = bytearray(code_point.encode('utf8', 'strict'))
as_UTF8_escaped = ''.join(['\\x%(byte)02x' % { 'byte': byte } for byte in as_UTF8_bytes])
test += as_UTF8_escaped
curr_bytes += len(as_UTF8_bytes)
return (test, boundaries)
# Self-test.
assert(_convert_line('÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ]))
assert(_convert_line('÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ]))
assert(_convert_line(u'÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ]))
assert(_convert_line(u'÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ]))
result = []
with open(grapheme_break_test_file_name, 'rb') as f:
with codecs.open(grapheme_break_test_file_name, encoding=sys.getfilesystemencoding(), errors='strict') as f:
for line in f:
test = _convert_line(line)
if test: