mirror of
https://github.com/apple/swift.git
synced 2025-12-21 12:14:44 +01:00
Mangling: support for special encoding of ASCII characters which may not appear in symbol names.
Such characters (like ‘.’) can be punycode-encoded just like non-ASCII unicode characters.
This commit is contained in:
@@ -18,6 +18,9 @@
|
|||||||
// - Encoding digits are represented using [a-zA-J] instead of [a-z0-9], because
|
// - Encoding digits are represented using [a-zA-J] instead of [a-z0-9], because
|
||||||
// symbol names are case-sensitive, and Swift mangled identifiers cannot begin
|
// symbol names are case-sensitive, and Swift mangled identifiers cannot begin
|
||||||
// with a digit.
|
// with a digit.
|
||||||
|
// - Optionally, non-symbol ASCII characters (characters except [$_a-zA-Z0-9])
|
||||||
|
// are mapped to the code range 0xD800 - 0xD880 and are also encoded like
|
||||||
|
// non-ASCII unicode characters.
|
||||||
//
|
//
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
@@ -45,7 +48,13 @@ bool encodePunycode(const std::vector<uint32_t> &InputCodePoints,
|
|||||||
bool decodePunycode(StringRef InputPunycode,
|
bool decodePunycode(StringRef InputPunycode,
|
||||||
std::vector<uint32_t> &OutCodePoints);
|
std::vector<uint32_t> &OutCodePoints);
|
||||||
|
|
||||||
bool encodePunycodeUTF8(StringRef InputUTF8, std::string &OutPunycode);
|
/// Encodes a UTF-8 string into Punycode.
|
||||||
|
///
|
||||||
|
/// If \p mapNonSymbolChars is true, non-symbol ASCII characters (characters
|
||||||
|
/// except [$_a-zA-Z0-9]) are also encoded like non-ASCII unicode characters.
|
||||||
|
/// Returns false if \p InputUTF8 contains surrogate code points.
|
||||||
|
bool encodePunycodeUTF8(StringRef InputUTF8, std::string &OutPunycode,
|
||||||
|
bool mapNonSymbolChars = false);
|
||||||
|
|
||||||
bool decodePunycodeUTF8(StringRef InputPunycode, std::string &OutUTF8);
|
bool decodePunycodeUTF8(StringRef InputPunycode, std::string &OutUTF8);
|
||||||
|
|
||||||
|
|||||||
@@ -47,7 +47,9 @@ static int digit_index(char value) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static bool isValidUnicodeScalar(uint32_t S) {
|
static bool isValidUnicodeScalar(uint32_t S) {
|
||||||
return (S < 0xD800) || (S >= 0xE000 && S <= 0x1FFFFF);
|
// Also accept the range of 0xD800 - 0xD880, which is used for non-symbol
|
||||||
|
// ASCII characters.
|
||||||
|
return (S < 0xD880) || (S >= 0xE000 && S <= 0x1FFFFF);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Section 6.1: Bias adaptation function
|
// Section 6.1: Bias adaptation function
|
||||||
@@ -200,6 +202,8 @@ static bool encodeToUTF8(const std::vector<uint32_t> &Scalars,
|
|||||||
OutUTF8.clear();
|
OutUTF8.clear();
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
if (S >= 0xD800 && S < 0xD880)
|
||||||
|
S -= 0xD800;
|
||||||
|
|
||||||
unsigned Bytes = 0;
|
unsigned Bytes = 0;
|
||||||
if (S < 0x80)
|
if (S < 0x80)
|
||||||
|
|||||||
@@ -19,19 +19,32 @@ static bool isContinuationByte(uint8_t unit) {
|
|||||||
return (unit & 0xC0) == 0x80;
|
return (unit & 0xC0) == 0x80;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool isValidSymbolChar(char ch) {
|
||||||
|
return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
|
||||||
|
(ch >= '0' && ch <= '9') || ch == '_' || ch == '$';
|
||||||
|
}
|
||||||
|
|
||||||
/// Reencode well-formed UTF-8 as UTF-32.
|
/// Reencode well-formed UTF-8 as UTF-32.
|
||||||
///
|
///
|
||||||
/// This entry point is only called from compiler-internal entry points, so does
|
/// This entry point is only called from compiler-internal entry points, so does
|
||||||
/// only minimal validation. In particular, it does *not* check for overlong
|
/// only minimal validation. In particular, it does *not* check for overlong
|
||||||
/// encodings.
|
/// encodings.
|
||||||
|
/// If \p mapNonSymbolChars is true, non-symbol ASCII characters (characters
|
||||||
|
/// except [$_a-zA-Z0-9]) are also encoded like non-ASCII unicode characters.
|
||||||
|
/// Returns false if \p InputUTF8 contains surrogate code points.
|
||||||
static bool convertUTF8toUTF32(StringRef InputUTF8,
|
static bool convertUTF8toUTF32(StringRef InputUTF8,
|
||||||
std::vector<uint32_t> &OutUTF32) {
|
std::vector<uint32_t> &OutUTF32,
|
||||||
|
bool mapNonSymbolChars) {
|
||||||
auto ptr = InputUTF8.begin();
|
auto ptr = InputUTF8.begin();
|
||||||
auto end = InputUTF8.end();
|
auto end = InputUTF8.end();
|
||||||
while (ptr < end) {
|
while (ptr < end) {
|
||||||
uint8_t first = *ptr++;
|
uint8_t first = *ptr++;
|
||||||
if (first < 0x80) {
|
if (first < 0x80) {
|
||||||
OutUTF32.push_back(first);
|
if (isValidSymbolChar(first) || !mapNonSymbolChars) {
|
||||||
|
OutUTF32.push_back(first);
|
||||||
|
} else {
|
||||||
|
OutUTF32.push_back((uint32_t)first + 0xD800);
|
||||||
|
}
|
||||||
} else if (first < 0xC0) {
|
} else if (first < 0xC0) {
|
||||||
// Invalid continuation byte.
|
// Invalid continuation byte.
|
||||||
return false;
|
return false;
|
||||||
@@ -75,11 +88,12 @@ static bool convertUTF8toUTF32(StringRef InputUTF8,
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool Punycode::encodePunycodeUTF8(StringRef InputUTF8,
|
bool Punycode::encodePunycodeUTF8(StringRef InputUTF8,
|
||||||
std::string &OutPunycode) {
|
std::string &OutPunycode,
|
||||||
|
bool mapNonSymbolChars) {
|
||||||
std::vector<uint32_t> InputCodePoints;
|
std::vector<uint32_t> InputCodePoints;
|
||||||
InputCodePoints.reserve(InputUTF8.size());
|
InputCodePoints.reserve(InputUTF8.size());
|
||||||
|
|
||||||
if (!convertUTF8toUTF32(InputUTF8, InputCodePoints))
|
if (!convertUTF8toUTF32(InputUTF8, InputCodePoints, mapNonSymbolChars))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
return encodePunycode(InputCodePoints, OutPunycode);
|
return encodePunycode(InputCodePoints, OutPunycode);
|
||||||
|
|||||||
Reference in New Issue
Block a user