Mangling: support for special encoding of ASCII characters which may not appear in symbol names.

Such characters (like ‘.’) can be punycode-encoded just like non-ASCII unicode characters.
This commit is contained in:
Erik Eckstein
2016-12-02 12:49:19 -08:00
parent 97f1fac11e
commit 76820edda9
3 changed files with 33 additions and 6 deletions

View File

@@ -18,6 +18,9 @@
// - Encoding digits are represented using [a-zA-J] instead of [a-z0-9], because
// symbol names are case-sensitive, and Swift mangled identifiers cannot begin
// with a digit.
// - Optionally, non-symbol ASCII characters (characters except [$_a-zA-Z0-9])
// are mapped to the code range 0xD800 - 0xD880 and are also encoded like
// non-ASCII unicode characters.
//
//===----------------------------------------------------------------------===//
@@ -45,7 +48,13 @@ bool encodePunycode(const std::vector<uint32_t> &InputCodePoints,
bool decodePunycode(StringRef InputPunycode,
std::vector<uint32_t> &OutCodePoints);
bool encodePunycodeUTF8(StringRef InputUTF8, std::string &OutPunycode);
/// Encodes a UTF-8 string into Punycode.
///
/// If \p mapNonSymbolChars is true, non-symbol ASCII characters (characters
/// except [$_a-zA-Z0-9]) are also encoded like non-ASCII unicode characters.
/// Returns false if \p InputUTF8 contains surrogate code points.
bool encodePunycodeUTF8(StringRef InputUTF8, std::string &OutPunycode,
bool mapNonSymbolChars = false);
bool decodePunycodeUTF8(StringRef InputPunycode, std::string &OutUTF8);

View File

@@ -47,7 +47,9 @@ static int digit_index(char value) {
}
// Reports whether S lies in the ranges this file treats as a valid Unicode
// scalar value.
// NOTE(review): this view shows the diff hunk without +/- markers. Going by the
// hunk header (-47,7 +47,9), the first return appears to be the removed
// pre-change line and the commented return below it the replacement. Read
// literally, only the first return executes and the second is unreachable.
// NOTE(review): the upper bound 0x1FFFFF is above the Unicode maximum of
// 0x10FFFF -- confirm this is intended.
static bool isValidUnicodeScalar(uint32_t S) {
return (S < 0xD800) || (S >= 0xE000 && S <= 0x1FFFFF);
// Also accept the range of 0xD800 - 0xD880, which is used for non-symbol
// ASCII characters.
return (S < 0xD880) || (S >= 0xE000 && S <= 0x1FFFFF);
}
// Section 6.1: Bias adaptation function
@@ -200,6 +202,8 @@ static bool encodeToUTF8(const std::vector<uint32_t> &Scalars,
OutUTF8.clear();
return false;
}
if (S >= 0xD800 && S < 0xD880)
S -= 0xD800;
unsigned Bytes = 0;
if (S < 0x80)

View File

@@ -19,19 +19,32 @@ static bool isContinuationByte(uint8_t unit) {
return (unit & 0xC0) == 0x80;
}
/// Returns true if \p ch is one of the characters [$_a-zA-Z0-9], i.e. an ASCII
/// letter, an ASCII digit, an underscore or a dollar sign.
static bool isValidSymbolChar(char ch) {
  // The two punctuation characters that are accepted.
  if (ch == '_' || ch == '$')
    return true;

  // Decimal digits.
  if (ch >= '0' && ch <= '9')
    return true;

  // ASCII letters of either case.
  const bool isLower = ch >= 'a' && ch <= 'z';
  const bool isUpper = ch >= 'A' && ch <= 'Z';
  return isLower || isUpper;
}
/// Reencode well-formed UTF-8 as UTF-32.
///
/// This entry point is only called from compiler-internal entry points, so does
/// only minimal validation. In particular, it does *not* check for overlong
/// encodings.
/// If \p mapNonSymbolChars is true, non-symbol ASCII characters (characters
/// except [$_a-zA-Z0-9]) are also encoded like non-ASCII unicode characters.
/// Returns false if \p InputUTF8 contains surrogate code points.
static bool convertUTF8toUTF32(StringRef InputUTF8,
std::vector<uint32_t> &OutUTF32) {
std::vector<uint32_t> &OutUTF32,
bool mapNonSymbolChars) {
auto ptr = InputUTF8.begin();
auto end = InputUTF8.end();
while (ptr < end) {
uint8_t first = *ptr++;
if (first < 0x80) {
OutUTF32.push_back(first);
if (isValidSymbolChar(first) || !mapNonSymbolChars) {
OutUTF32.push_back(first);
} else {
OutUTF32.push_back((uint32_t)first + 0xD800);
}
} else if (first < 0xC0) {
// Invalid continuation byte.
return false;
@@ -75,11 +88,12 @@ static bool convertUTF8toUTF32(StringRef InputUTF8,
}
bool Punycode::encodePunycodeUTF8(StringRef InputUTF8,
std::string &OutPunycode) {
std::string &OutPunycode,
bool mapNonSymbolChars) {
std::vector<uint32_t> InputCodePoints;
InputCodePoints.reserve(InputUTF8.size());
if (!convertUTF8toUTF32(InputUTF8, InputCodePoints))
if (!convertUTF8toUTF32(InputUTF8, InputCodePoints, mapNonSymbolChars))
return false;
return encodePunycode(InputCodePoints, OutPunycode);