Mangling: support for special encoding of ASCII characters which may not appear in symbol names.

Such characters (like ‘.’) can be punycode-encoded just like non-ASCII unicode characters.
2025-12-21 12:14:44 +01:00 · 2016-12-02 12:49:19 -08:00
parent 97f1fac11e
commit 76820edda9
3 changed files with 33 additions and 6 deletions
--- a/include/swift/Basic/Punycode.h
+++ b/include/swift/Basic/Punycode.h
@@ -18,6 +18,9 @@
 // - Encoding digits are represented using [a-zA-J] instead of [a-z0-9], because
 //   symbol names are case-sensitive, and Swift mangled identifiers cannot begin
 //   with a digit.
+// - Optionally, non-symbol ASCII characters (characters except [$_a-zA-Z0-9])
+//   are mapped to the code range 0xD800 - 0xD880 and are also encoded like
+//   non-ASCII unicode characters.
 //
 //===----------------------------------------------------------------------===//

@@ -45,7 +48,13 @@ bool encodePunycode(const std::vector<uint32_t> &InputCodePoints,
 bool decodePunycode(StringRef InputPunycode,
                    std::vector<uint32_t> &OutCodePoints);

-bool encodePunycodeUTF8(StringRef InputUTF8, std::string &OutPunycode);
+/// Encodes a UTF8 string into Punycode.
+///
+/// If \p mapNonSymbolChars is true, non-symbol ASCII characters (characters
+/// except [$_a-zA-Z0-9]) are also encoded like non-ASCII unicode characters.
+/// Returns false if \p InputUTF8 contains surrogate code points.
+bool encodePunycodeUTF8(StringRef InputUTF8, std::string &OutPunycode,
+                        bool mapNonSymbolChars = false);

 bool decodePunycodeUTF8(StringRef InputPunycode, std::string &OutUTF8);

--- a/lib/Basic/Punycode.cpp
+++ b/lib/Basic/Punycode.cpp
@@ -47,7 +47,9 @@ static int digit_index(char value) {
 }

 static bool isValidUnicodeScalar(uint32_t S) {
-  return (S < 0xD800) || (S >= 0xE000 && S <= 0x1FFFFF);
+  // Also accept the range of 0xD800 - 0xD880, which is used for non-symbol
+  // ASCII characters.
+  return (S < 0xD880) || (S >= 0xE000 && S <= 0x1FFFFF);
 }

 // Section 6.1: Bias adaptation function
@@ -200,6 +202,8 @@ static bool encodeToUTF8(const std::vector<uint32_t> &Scalars,
      OutUTF8.clear();
      return false;
    }
+    if (S >= 0xD800 && S < 0xD880)
+      S -= 0xD800;

    unsigned Bytes = 0;
    if (S < 0x80)
--- a/lib/Basic/PunycodeUTF8.cpp
+++ b/lib/Basic/PunycodeUTF8.cpp
@@ -19,19 +19,32 @@ static bool isContinuationByte(uint8_t unit) {
  return (unit & 0xC0) == 0x80;
 }

+static bool isValidSymbolChar(char ch) {
+  return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
+         (ch >= '0' && ch <= '9') || ch == '_' || ch == '$';
+}
+
 /// Reencode well-formed UTF-8 as UTF-32.
 ///
 /// This entry point is only called from compiler-internal entry points, so does
 /// only minimal validation. In particular, it does *not* check for overlong
 /// encodings.
+/// If \p mapNonSymbolChars is true, non-symbol ASCII characters (characters
+/// except [$_a-zA-Z0-9]) are also encoded like non-ASCII unicode characters.
+/// Returns false if \p InputUTF8 contains surrogate code points.
 static bool convertUTF8toUTF32(StringRef InputUTF8,
-                               std::vector<uint32_t> &OutUTF32) {
+                               std::vector<uint32_t> &OutUTF32,
+                               bool mapNonSymbolChars) {
  auto ptr = InputUTF8.begin();
  auto end = InputUTF8.end();
  while (ptr < end) {
    uint8_t first = *ptr++;
    if (first < 0x80) {
-      OutUTF32.push_back(first);
+      if (isValidSymbolChar(first) || !mapNonSymbolChars) {
+        OutUTF32.push_back(first);
+      } else {
+        OutUTF32.push_back((uint32_t)first + 0xD800);
+      }
    } else if (first < 0xC0) {
      // Invalid continuation byte.
      return false;
@@ -75,11 +88,12 @@ static bool convertUTF8toUTF32(StringRef InputUTF8,
 }

 bool Punycode::encodePunycodeUTF8(StringRef InputUTF8,
-                                  std::string &OutPunycode) {
+                                  std::string &OutPunycode,
+                                  bool mapNonSymbolChars) {
  std::vector<uint32_t> InputCodePoints;
  InputCodePoints.reserve(InputUTF8.size());
  
-  if (!convertUTF8toUTF32(InputUTF8, InputCodePoints))
+  if (!convertUTF8toUTF32(InputUTF8, InputCodePoints, mapNonSymbolChars))
    return false;
  
  return encodePunycode(InputCodePoints, OutPunycode);