libBasic: implement extended grapheme cluster segmentation algorithm

This is only for the frontend, not for stdlib.  The implementation is very
slow; optimizing it is the next step.

rdar://16755123 rdar://16013860


Swift SVN r18928
This commit is contained in:
Dmitri Hrybenko
2014-06-16 14:20:43 +00:00
parent e310fd7f0b
commit 7704e19b7d
11 changed files with 2071 additions and 9 deletions

View File

@@ -0,0 +1,121 @@
%# -*- mode: C++ -*-
%# Ignore the following admonition; it applies to the resulting .cpp file only
//// Automatically Generated From UnicodeExtendedGraphemeClusters.cpp.gyb.
//// Do Not Edit Directly!
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See http://swift.org/LICENSE.txt for license information
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
%{
import re

# Grapheme_Cluster_Break property.  An array of tuples (startCodePoint,
# endCodePoint, value), in the order the entries appear in the data file.
graphemeBreakProperty = []

# A data line, once its comment is stripped, is either
#   XXXX       ; Value        (single code point)
#   XXXX..YYYY ; Value        (inclusive range of code points)
# The '..' separator is matched literally, and '_' is accepted in the value
# for both forms so that a name such as 'Regional_Indicator' parses no matter
# which form it appears in.
graphemeBreakPropertyLine = re.compile(
    r'([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([a-zA-Z_]+)\s*$')

with open(unicodeGraphemeBreakPropertyFile, 'rb') as f:
    for line in f:
        # Strip comments.
        line = re.sub('#.*', '', line)

        # Blank and comment-only lines carry no data.
        if not line.strip():
            continue

        m = graphemeBreakPropertyLine.match(line)
        if not m:
            # Dropping an entry silently would produce an incomplete table
            # without any diagnostic, so fail loudly instead.
            raise ValueError(
                'unrecognized line in Grapheme_Cluster_Break data: %r' % line)

        startCodePoint = int(m.group(1), 16)
        # A single code point is stored as a one-element range.
        if m.group(2):
            endCodePoint = int(m.group(2), 16)
        else:
            endCodePoint = startCodePoint
        value = m.group(3)
        graphemeBreakProperty += [(startCodePoint, endCodePoint, value)]
}%
#include "swift/Basic/Unicode.h"
swift::unicode::GraphemeClusterBreakProperty
swift::unicode::getGraphemeClusterBreakProperty(uint32_t C) {
// FIXME: replace linear search with a trie lookup.
% for startCodePoint,endCodePoint,value in graphemeBreakProperty:
% if startCodePoint == 0:
if (C <= ${endCodePoint})
% else:
if (C >= ${startCodePoint} && C <= ${endCodePoint})
% end
return GraphemeClusterBreakProperty::${value};
% end
return GraphemeClusterBreakProperty::Other;
}
%{
# All Grapheme_Cluster_Break property values.  The order should be consistent
# with the 'GraphemeClusterBreakProperty' enum.
anyGraphemePropertyValue = [
    'Other',
    'CR',
    'LF',
    'Control',
    'Extend',
    'Regional_Indicator',
    'Prepend',
    'SpacingMark',
    'L',
    'V',
    'T',
    'LV',
    'LVT',
]

# Rules to determine extended grapheme cluster boundaries, as defined in
# 'Grapheme Break Chart', ucd/auxiliary/GraphemeBreakTest.html, Unicode 6.3.0.
#
# Each rule is (first values, action, second values).  Rules are listed in
# priority order: for a given (first, second) pair the earliest rule that
# covers it decides the action.
extendedGraphemeClusterRules = [
    (['CR'], 'no_boundary', ['LF']),
    (['Control', 'CR', 'LF'], 'boundary', anyGraphemePropertyValue),
    (anyGraphemePropertyValue, 'boundary', ['Control', 'CR', 'LF']),
    (['L'], 'no_boundary', ['L', 'V', 'LV', 'LVT']),
    (['LV', 'V'], 'no_boundary', ['V', 'T']),
    (['LVT', 'T'], 'no_boundary', ['T']),
    (['Regional_Indicator'], 'no_boundary', ['Regional_Indicator']),
    (anyGraphemePropertyValue, 'no_boundary', ['Extend']),
    (anyGraphemePropertyValue, 'no_boundary', ['SpacingMark']),
    (['Prepend'], 'no_boundary', anyGraphemePropertyValue),
    (anyGraphemePropertyValue, 'boundary', anyGraphemePropertyValue),
]

# Expand the rules into a matrix indexed as matrix[first][second].  Every cell
# starts out undecided (None).
extendedGraphemeClusterRulesMatrix = dict(
    (lhs, dict.fromkeys(anyGraphemePropertyValue))
    for lhs in anyGraphemePropertyValue)

# Walk the rules from highest to lowest priority and fill in only the cells
# that no earlier rule has decided yet.
for lhsValues, verdict, rhsValues in extendedGraphemeClusterRules:
    for lhs in lhsValues:
        matrixRow = extendedGraphemeClusterRulesMatrix[lhs]
        for rhs in rhsValues:
            if matrixRow[rhs] is None:
                matrixRow[rhs] = verdict

# Make sure we can pack one row of the matrix into a 'uint16_t'.
assert len(anyGraphemePropertyValue) <= 16
}%
uint16_t swift::unicode::ExtendedGraphemeClusterNoBoundaryRulesMatrix[] = {
% for firstValue in anyGraphemePropertyValue:
%   # One entry per first property value, in 'anyGraphemePropertyValue' order.
%   # Bit i of the entry is set when the rules give 'no_boundary' between this
%   # first value and the i-th second value.
%   row = extendedGraphemeClusterRulesMatrix[firstValue]
%   packed = sum([ 1 << i for i, second in enumerate(anyGraphemePropertyValue) if row[second] == 'no_boundary' ])
  ${packed},
% end
};