mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
[gyb] Force Unicode strings in Python 2
All strings are sequences of Unicode characters in Python 3. This is
entirely different from Python 2, whose strings were sequences of
bytes. However, Python 2 does have the concept of Unicode strings. This
patch changes the behavior of the file reader to use the same codecs
module on Python 2 to properly read a string into a unicode string. From
there the strings are meant to be equivalent on 2 and 3. The rest of the
patch just updates the code to natively work with unicode strings.
To test the class `GraphemeClusterBreakPropertyTable`:
$ python2 utils/gyb --test \
-DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
-DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
-DCMAKE_SIZEOF_VOID_P=8 \
-o /tmp/UnicodeExtendedGraphemeClusters.cpp.2.7.tmp \
./stdlib/public/stubs/UnicodeExtendedGraphemeClusters.cpp.gyb
$ python3 utils/gyb --test \
-DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
-DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
-DCMAKE_SIZEOF_VOID_P=8 \
-o /tmp/UnicodeExtendedGraphemeClusters.cpp.3.5.tmp \
./stdlib/public/stubs/UnicodeExtendedGraphemeClusters.cpp.gyb
$ diff -u /tmp/UnicodeExtendedGraphemeClusters.cpp.2.7.tmp \
/tmp/UnicodeExtendedGraphemeClusters.cpp.3.5.tmp
To test the method `get_grapheme_cluster_break_tests_as_UTF8`:
$ python2 utils/gyb --test \
-DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
-DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
-DCMAKE_SIZEOF_VOID_P=8 \
-o /tmp/UnicodeGraphemeBreakTest.cpp.2.7.tmp \
./unittests/Basic/UnicodeGraphemeBreakTest.cpp.gyb
$ python3 utils/gyb --test \
-DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
-DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
-DCMAKE_SIZEOF_VOID_P=8 \
-o /tmp/UnicodeGraphemeBreakTest.cpp.3.5.tmp \
./unittests/Basic/UnicodeGraphemeBreakTest.cpp.gyb
$ diff -u /tmp/UnicodeGraphemeBreakTest.cpp.2.7.tmp \
/tmp/UnicodeGraphemeBreakTest.cpp.3.5.tmp
This commit is contained in:
@@ -11,6 +11,8 @@
|
||||
##===----------------------------------------------------------------------===##
|
||||
|
||||
import re
|
||||
import sys
|
||||
import codecs
|
||||
|
||||
class UnicodeProperty(object):
|
||||
"""Abstract base class for Unicode properties."""
|
||||
@@ -68,7 +70,7 @@ class GraphemeClusterBreakPropertyTable(UnicodeProperty):
|
||||
self.symbolic_values[v] = k
|
||||
|
||||
# Load the data file.
|
||||
with open(grapheme_break_property_file_name, 'rb') as f:
|
||||
with codecs.open(grapheme_break_property_file_name, encoding=sys.getfilesystemencoding(), errors='strict') as f:
|
||||
for line in f:
|
||||
# Strip comments.
|
||||
line = re.sub('#.*', '', line)
|
||||
@@ -514,9 +516,9 @@ def get_grapheme_cluster_break_tests_as_UTF8(grapheme_break_test_file_name):
|
||||
|
||||
# Match a list of code points.
|
||||
for token in line.split(" "):
|
||||
if token == "÷":
|
||||
if token == u"÷":
|
||||
boundaries += [ curr_bytes ]
|
||||
elif token == "×":
|
||||
elif token == u"×":
|
||||
pass
|
||||
else:
|
||||
code_point = int(token, 16)
|
||||
@@ -529,21 +531,21 @@ def get_grapheme_cluster_break_tests_as_UTF8(grapheme_break_test_file_name):
|
||||
# and test separately that we handle ill-formed UTF-8 sequences.
|
||||
if code_point >= 0xd800 and code_point <= 0xdfff:
|
||||
code_point = 0x200b
|
||||
code_point = ('\U%(cp)08x' % { 'cp': code_point }).decode('unicode_escape')
|
||||
as_UTF8_bytes = code_point.encode('utf8')
|
||||
as_UTF8_escaped = ''.join(['\\x%(byte)02x' % { 'byte': ord(byte) } for byte in as_UTF8_bytes])
|
||||
code_point = (b'\U%(cp)08x' % { b'cp': code_point }).decode('unicode_escape', 'strict')
|
||||
as_UTF8_bytes = bytearray(code_point.encode('utf8', 'strict'))
|
||||
as_UTF8_escaped = ''.join(['\\x%(byte)02x' % { 'byte': byte } for byte in as_UTF8_bytes])
|
||||
test += as_UTF8_escaped
|
||||
curr_bytes += len(as_UTF8_bytes)
|
||||
|
||||
return (test, boundaries)
|
||||
|
||||
# Self-test.
|
||||
assert(_convert_line('÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ]))
|
||||
assert(_convert_line('÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ]))
|
||||
assert(_convert_line(u'÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ]))
|
||||
assert(_convert_line(u'÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ]))
|
||||
|
||||
result = []
|
||||
|
||||
with open(grapheme_break_test_file_name, 'rb') as f:
|
||||
with codecs.open(grapheme_break_test_file_name, encoding=sys.getfilesystemencoding(), errors='strict') as f:
|
||||
for line in f:
|
||||
test = _convert_line(line)
|
||||
if test:
|
||||
|
||||
Reference in New Issue
Block a user