mirror of
https://github.com/apple/swift.git
synced 2025-12-21 12:14:44 +01:00
Mangling: support for special encoding of ASCII characters which may not appear in symbol names.
Such characters (like ‘.’) can be punycode-encoded just like non-ASCII unicode characters.
This commit is contained in:
@@ -18,6 +18,9 @@
|
||||
// - Encoding digits are represented using [a-zA-J] instead of [a-z0-9], because
|
||||
// symbol names are case-sensitive, and Swift mangled identifiers cannot begin
|
||||
// with a digit.
|
||||
// - Optionally, non-symbol ASCII characters (characters except [$_a-zA-Z0-9])
|
||||
// are mapped to the code range 0xD800 - 0xD880 and are also encoded like
|
||||
// non-ASCII unicode characters.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
@@ -45,7 +48,13 @@ bool encodePunycode(const std::vector<uint32_t> &InputCodePoints,
|
||||
bool decodePunycode(StringRef InputPunycode,
|
||||
std::vector<uint32_t> &OutCodePoints);
|
||||
|
||||
bool encodePunycodeUTF8(StringRef InputUTF8, std::string &OutPunycode);
|
||||
/// Encodes a UTF-8 string into Punycode.
|
||||
///
|
||||
/// If \p mapNonSymbolChars is true, non-symbol ASCII characters (characters
|
||||
/// except [$_a-zA-Z0-9]) are also encoded like non-ASCII unicode characters.
|
||||
/// Returns false if \p InputUTF8 contains surrogate code points.
|
||||
bool encodePunycodeUTF8(StringRef InputUTF8, std::string &OutPunycode,
|
||||
bool mapNonSymbolChars = false);
|
||||
|
||||
bool decodePunycodeUTF8(StringRef InputPunycode, std::string &OutUTF8);
|
||||
|
||||
|
||||
@@ -47,7 +47,9 @@ static int digit_index(char value) {
|
||||
}
|
||||
|
||||
static bool isValidUnicodeScalar(uint32_t S) {
|
||||
return (S < 0xD800) || (S >= 0xE000 && S <= 0x1FFFFF);
|
||||
// Also accept the range of 0xD800 - 0xD880, which is used for non-symbol
|
||||
// ASCII characters.
|
||||
return (S < 0xD880) || (S >= 0xE000 && S <= 0x1FFFFF);
|
||||
}
|
||||
|
||||
// Section 6.1: Bias adaptation function
|
||||
@@ -200,6 +202,8 @@ static bool encodeToUTF8(const std::vector<uint32_t> &Scalars,
|
||||
OutUTF8.clear();
|
||||
return false;
|
||||
}
|
||||
if (S >= 0xD800 && S < 0xD880)
|
||||
S -= 0xD800;
|
||||
|
||||
unsigned Bytes = 0;
|
||||
if (S < 0x80)
|
||||
|
||||
@@ -19,19 +19,32 @@ static bool isContinuationByte(uint8_t unit) {
|
||||
return (unit & 0xC0) == 0x80;
|
||||
}
|
||||
|
||||
/// Returns true if \p ch is a character that may appear in a symbol name,
/// i.e. one of [a-zA-Z0-9], '_' or '$'; returns false for any other value.
static bool isValidSymbolChar(char ch) {
|
||||
return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
|
||||
(ch >= '0' && ch <= '9') || ch == '_' || ch == '$';
|
||||
}
|
||||
|
||||
/// Reencode well-formed UTF-8 as UTF-32.
|
||||
///
|
||||
/// This entry point is only called from compiler-internal entry points, so does
|
||||
/// only minimal validation. In particular, it does *not* check for overlong
|
||||
/// encodings.
|
||||
/// If \p mapNonSymbolChars is true, non-symbol ASCII characters (characters
|
||||
/// except [$_a-zA-Z0-9]) are also encoded like non-ASCII unicode characters.
|
||||
/// Returns false if \p InputUTF8 contains surrogate code points.
|
||||
static bool convertUTF8toUTF32(StringRef InputUTF8,
|
||||
std::vector<uint32_t> &OutUTF32) {
|
||||
std::vector<uint32_t> &OutUTF32,
|
||||
bool mapNonSymbolChars) {
|
||||
auto ptr = InputUTF8.begin();
|
||||
auto end = InputUTF8.end();
|
||||
while (ptr < end) {
|
||||
uint8_t first = *ptr++;
|
||||
if (first < 0x80) {
|
||||
OutUTF32.push_back(first);
|
||||
if (isValidSymbolChar(first) || !mapNonSymbolChars) {
|
||||
OutUTF32.push_back(first);
|
||||
} else {
|
||||
OutUTF32.push_back((uint32_t)first + 0xD800);
|
||||
}
|
||||
} else if (first < 0xC0) {
|
||||
// Invalid continuation byte.
|
||||
return false;
|
||||
@@ -75,11 +88,12 @@ static bool convertUTF8toUTF32(StringRef InputUTF8,
|
||||
}
|
||||
|
||||
bool Punycode::encodePunycodeUTF8(StringRef InputUTF8,
|
||||
std::string &OutPunycode) {
|
||||
std::string &OutPunycode,
|
||||
bool mapNonSymbolChars) {
|
||||
std::vector<uint32_t> InputCodePoints;
|
||||
InputCodePoints.reserve(InputUTF8.size());
|
||||
|
||||
if (!convertUTF8toUTF32(InputUTF8, InputCodePoints))
|
||||
if (!convertUTF8toUTF32(InputUTF8, InputCodePoints, mapNonSymbolChars))
|
||||
return false;
|
||||
|
||||
return encodePunycode(InputCodePoints, OutPunycode);
|
||||
|
||||
Reference in New Issue
Block a user