Punycode encoder/decoder: separate core and parts that depend on UTF8 encoder/decoder

Swift SVN r20307
2025-12-21 12:14:44 +01:00 · 2014-07-22 14:37:37 +00:00
parent 87ed6c8a5b
commit 56342b0cfa
7 changed files with 117 additions and 81 deletions
--- a/include/swift/Basic/Punycode.h
+++ b/include/swift/Basic/Punycode.h
@@ -27,17 +27,27 @@
 #include "swift/Basic/LLVM.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/SmallVector.h"
+#include <vector>
+#include <cstdint>

 namespace swift {
 namespace Punycode {

-/// Encodes a UTF-8-encoded Unicode string into Punycode.
-void encodePunycode(StringRef InputUTF8, std::string &OutPunycode);
-
-/// Decodes a Punycode string into a UTF-8-encoded Unicode string.
+/// Encodes a sequence of code points into Punycode.
 ///
-/// Returns true if the encoding failed, false if it succeeded.
-bool decodePunycode(StringRef InputPunycode, std::string &OutUTF8);
+/// Returns false if input contains surrogate code points.
+bool encodePunycode(const std::vector<uint32_t> &InputCodePoints,
+                    std::string &OutPunycode);
+
+/// Decodes a Punycode string into a sequence of Unicode scalars.
+///
+/// Returns false if decoding failed.
+bool decodePunycode(StringRef InputPunycode,
+                    std::vector<uint32_t> &OutCodePoints);
+
+/// Encodes a UTF-8 string into Punycode.
+///
+/// Returns false if the input cannot be strictly converted from UTF-8 to
+/// UTF-32, or if encodePunycode() fails on the converted code points.
+bool encodePunycodeUTF8(StringRef InputUTF8, std::string &OutPunycode);
+
+/// Decodes a Punycode string into a UTF-8 string.
+///
+/// Returns false if decodePunycode() fails, or if the decoded code points
+/// cannot be strictly converted to UTF-8.
+bool decodePunycodeUTF8(StringRef InputPunycode, std::string &OutUTF8);

 } // end namespace Punycode
 } // end namespace swift
--- a/lib/AST/Mangle.cpp
+++ b/lib/AST/Mangle.cpp
@@ -92,7 +92,7 @@ void Mangler::mangleIdentifier(Identifier ident, OperatorFixity fixity) {

  if (isNonAscii(str)) {
    Buffer << 'X';
-    Punycode::encodePunycode(str, punycodeBuf);
+    Punycode::encodePunycodeUTF8(str, punycodeBuf);
    str = punycodeBuf;
  }
  
--- a/lib/Basic/CMakeLists.txt
+++ b/lib/Basic/CMakeLists.txt
@@ -12,6 +12,7 @@ add_swift_library(swiftBasic
  PrimitiveParsing.cpp
  Program.cpp
  Punycode.cpp
+  PunycodeUTF8.cpp
  QuotedString.cpp
  SourceLoc.cpp
  StringExtras.cpp
--- a/lib/Basic/Demangle.cpp
+++ b/lib/Basic/Demangle.cpp
@@ -718,7 +718,7 @@ private:
    auto decode = [&](StringRef s) -> StringRef {
      if (!isPunycoded)
        return s;
-      if (Punycode::decodePunycode(s, decodeBuffer))
+      if (!Punycode::decodePunycodeUTF8(s, decodeBuffer))
        return {};
      return decodeBuffer;
    };
--- a/lib/Basic/Punycode.cpp
+++ b/lib/Basic/Punycode.cpp
@@ -1,4 +1,4 @@
-//===--- Punycode.cpp - UTF-8 to Punycode transcoding -----------*- C++ -*-===//
+//===--- Punycode.cpp - Unicode to Punycode transcoding ---------*- C++ -*-===//
 //
 // This source file is part of the Swift.org open source project
 //
@@ -9,23 +9,11 @@
 // See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
 //
 //===----------------------------------------------------------------------===//
-//
-// These functions implement a variant of the Punycode algorithm from RFC3492,
-// originally designed for encoding international domain names, for the purpose
-// encoding Swift identifiers into mangled symbol names. This version differs
-// from RFC3492 in the following respects:
-// - '_' is used as the encoding delimiter instead of the '-'.
-// - Encoding digits are mapped to [a-zA-J] instead of to [a-z0-9], because
-//   symbol names are case-sensitive, and Swift mangled identifiers cannot begin
-//   with a digit.
-//
-//===----------------------------------------------------------------------===//

 #include "swift/Basic/LLVM.h"
 #include "swift/Basic/Punycode.h"
-#include "llvm/Support/ConvertUTF.h"
-#include "llvm/Support/ErrorHandling.h"
-#include <climits>
+#include <vector>
+#include <cstdint>

 using namespace swift;
 using namespace Punycode;
@@ -39,7 +27,7 @@ static const int tmax         = 26;
 static const int skew         = 38;
 static const int damp         = 700;
 static const int initial_bias = 72;
-static const UTF32 initial_n    = 128;
+static const uint32_t initial_n    = 128;

 static const char delimiter = '_';

@@ -77,41 +65,41 @@ static int adapt(int delta, int numpoints, bool firsttime) {

 // Section 6.2: Decoding procedure

-bool Punycode::decodePunycode(StringRef inputPunycode,
-                              std::string &outUTF8) {
-  outUTF8.clear();
+bool Punycode::decodePunycode(StringRef InputPunycode,
+                              std::vector<uint32_t> &OutCodePoints) {
+  OutCodePoints.clear();
+  OutCodePoints.reserve(InputPunycode.size());

  // -- Build the decoded string as UTF32 first because we need random access.
-  SmallVector<UTF32, 32> output;
-
-  UTF32 n = initial_n;
+  uint32_t n = initial_n;
  int i = 0;
  int bias = initial_bias;
  /// let output = an empty string indexed from 0
  // consume all code points before the last delimiter (if there is one)
  //  and copy them to output,
-  size_t lastDelimiter = inputPunycode.find_last_of(delimiter);
+  size_t lastDelimiter = InputPunycode.find_last_of(delimiter);
  if (lastDelimiter != StringRef::npos) {
-    for (char c : inputPunycode.slice(0, lastDelimiter)) {
+    for (char c : InputPunycode.slice(0, lastDelimiter)) {
      // fail on any non-basic code point
-      if ((signed char)c < 0)
+      if (static_cast<unsigned char>(c) > 0x7f)
        return true;
-      output.push_back((UTF32)c);
+      OutCodePoints.push_back(c);
    }
    // if more than zero code points were consumed then consume one more
    //  (which will be the last delimiter)
-    inputPunycode = inputPunycode.slice(lastDelimiter + 1, inputPunycode.size());
+    InputPunycode =
+        InputPunycode.slice(lastDelimiter + 1, InputPunycode.size());
  }
  
-  while (!inputPunycode.empty()) {
+  while (!InputPunycode.empty()) {
    int oldi = i;
    int w = 1;
    for (int k = base; ; k += base) {
      // consume a code point, or fail if there was none to consume
-      if (inputPunycode.empty())
+      if (InputPunycode.empty())
        return true;
-      char codePoint = inputPunycode.front();
-      inputPunycode = inputPunycode.slice(1, inputPunycode.size());
+      char codePoint = InputPunycode.front();
+      InputPunycode = InputPunycode.slice(1, InputPunycode.size());
      // let digit = the code point's digit-value, fail if it has none
      int digit = digit_index(codePoint);
      if (digit < 0)
@@ -125,76 +113,59 @@ bool Punycode::decodePunycode(StringRef inputPunycode,
        break;
      w = w * (base - t);
    }
-    bias = adapt(i - oldi, output.size() + 1, oldi == 0);
-    n = n + i / (output.size() + 1);
-    i = i % (output.size() + 1);
+    bias = adapt(i - oldi, OutCodePoints.size() + 1, oldi == 0);
+    n = n + i / (OutCodePoints.size() + 1);
+    i = i % (OutCodePoints.size() + 1);
    // if n is a basic code point then fail
    if (n < 0x80)
      return true;
    // insert n into output at position i
-    output.insert(output.begin() + i, n);
+    OutCodePoints.insert(OutCodePoints.begin() + i, n);
    i++;
  }
  
-  // -- Transcode to a UTF-8 result.
-  size_t SizeUpperBound = output.size()*4;
-  std::vector<UTF8> Result(SizeUpperBound);
-  const UTF32 *utf32_begin = output.begin();
-  UTF8 *utf8_begin = Result.data();
-  auto res = ConvertUTF32toUTF8(&utf32_begin, output.end(),
-                                &utf8_begin, Result.data() + SizeUpperBound,
-                                lenientConversion);
-  assert(res == conversionOK && "wide-to-utf8 conversion failed!");
-  (void)res;
-  outUTF8 = std::string(Result.data(), utf8_begin);
-
-  return false;
+  return true;
 }

 // Section 6.3: Encoding procedure

-void Punycode::encodePunycode(StringRef inputUTF8,
-                              std::string &outPunycode) {
-  outPunycode.clear();
+bool Punycode::encodePunycode(const std::vector<uint32_t> &InputCodePoints,
+                              std::string &OutPunycode) {
+  OutPunycode.clear();

-  UTF32 n = initial_n;
+  uint32_t n = initial_n;
  int delta = 0;
  int bias = initial_bias;

  // let h = b = the number of basic code points in the input
  // copy them to the output in order...
  size_t h = 0;
-  SmallVector<UTF32, 32> inputCodePoints;
-  for (auto *i = reinterpret_cast<UTF8 const *>(inputUTF8.begin()),
-            *end = reinterpret_cast<UTF8 const *>(inputUTF8.end());
-       i < end;
-       ) {
-    UTF32 c;
-    auto conv = llvm::convertUTF8Sequence(&i, end, &c, strictConversion);
-    assert(conv == conversionOK && "invalid UTF-8 input");
-    (void)conv;
-    inputCodePoints.push_back(c);
-    if (c < 0x80) {
+  for (auto C : InputCodePoints) {
+    if (C < 0x80) {
      ++h;
-      outPunycode.push_back(c);
+      OutPunycode.push_back(C);
+    }
+    if (C >= 0xD800 && C <= 0xDFFF) {
+      OutPunycode.clear();
+      return false;
    }
  }
  size_t b = h;
  // ...followed by a delimiter if b > 0
  if (b > 0)
-    outPunycode.push_back(delimiter);
+    OutPunycode.push_back(delimiter);
  
-  while (h < inputCodePoints.size()) {
+  while (h < InputCodePoints.size()) {
    // let m = the minimum code point >= n in the input
-    UTF32 m = 0x10FFFF;
-    for (UTF32 codePoint : inputCodePoints) {
+    uint32_t m = 0x10FFFF;
+    for (auto codePoint : InputCodePoints) {
      if (codePoint >= n && codePoint < m)
        m = codePoint;
    }
    
    delta = delta + (m - n) * (h + 1);
    n = m;
-    for (UTF32 c : inputCodePoints) {
+    for (auto c : InputCodePoints) {
      if (c < n) ++delta;
      if (c == n) {
        int q = delta;
@@ -204,10 +175,10 @@ void Punycode::encodePunycode(StringRef inputUTF8,
                : k - bias;
          
          if (q < t) break;
-          outPunycode.push_back(digit_value(t + ((q - t) % (base - t))));
+          OutPunycode.push_back(digit_value(t + ((q - t) % (base - t))));
          q = (q - t) / (base - t);
        }
-        outPunycode.push_back(digit_value(q));
+        OutPunycode.push_back(digit_value(q));
        bias = adapt(delta, h + 1, h == b);
        delta = 0;
        ++h;
@@ -215,4 +186,6 @@ void Punycode::encodePunycode(StringRef inputUTF8,
    }
    ++delta; ++n;
  }
+  return true;
 }
+
--- a/lib/Basic/PunycodeUTF8.cpp
+++ b/lib/Basic/PunycodeUTF8.cpp
@@ -0,0 +1,52 @@
+//===--- PunycodeUTF8.cpp - Unicode to Punycode transcoding -----*- C++ -*-===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See http://swift.org/LICENSE.txt for license information
+// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
+//
+//===----------------------------------------------------------------------===//
+
+#include "swift/Basic/Punycode.h"
+#include "llvm/Support/ConvertUTF.h"
+#include <vector>
+
+using namespace swift;
+
+/// Encodes a UTF-8 string into Punycode.
+///
+/// \param InputUTF8 the UTF-8 text to encode.
+/// \param OutPunycode receives the Punycode form on success.
+///
+/// Returns false if \p InputUTF8 cannot be strictly converted from UTF-8 to
+/// UTF-32, or if encodePunycode() fails on the converted code points.
+bool Punycode::encodePunycodeUTF8(StringRef InputUTF8,
+                                  std::string &OutPunycode) {
+  // A UTF-8 sequence never has fewer bytes than code points, so the byte
+  // count is a safe upper bound for the UTF-32 buffer.
+  std::vector<uint32_t> InputCodePoints(InputUTF8.size());
+  const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(InputUTF8.data());
+  UTF32 *TargetStart = InputCodePoints.data();
+  auto ConvStatus = ConvertUTF8toUTF32(
+      &SourceStart, SourceStart + InputUTF8.size(), &TargetStart,
+      InputCodePoints.data() + InputCodePoints.size(), strictConversion);
+  if (ConvStatus != conversionOK)
+    return false;
+  // TargetStart now points one past the last code point written. Trim the
+  // unused, zero-filled tail; otherwise those zeros would be encoded as extra
+  // basic code points (NUL characters) whenever the input has multi-byte
+  // sequences.
+  InputCodePoints.resize(TargetStart - InputCodePoints.data());
+  return encodePunycode(InputCodePoints, OutPunycode);
+}
+
+/// Decodes a Punycode string into a UTF-8 string.
+///
+/// \param InputPunycode the Punycode text to decode.
+/// \param OutUTF8 receives the UTF-8 result on success; it is emptied on
+///        every failure path.
+///
+/// Returns false if decodePunycode() fails, or if the decoded code points
+/// cannot be strictly converted to UTF-8.
+bool Punycode::decodePunycodeUTF8(StringRef InputPunycode,
+                                  std::string &OutUTF8) {
+  std::vector<uint32_t> OutCodePoints;
+  if (!decodePunycode(InputPunycode, OutCodePoints)) {
+    // Clear here as well, so a failed call never leaves stale text behind,
+    // matching the conversion-failure path below. The input has already been
+    // consumed at this point, so this is safe even if it aliases OutUTF8.
+    OutUTF8.clear();
+    return false;
+  }
+
+  // Each code point needs at most four UTF-8 bytes.
+  const size_t SizeUpperBound = OutCodePoints.size() * 4;
+  std::vector<UTF8> Result(SizeUpperBound);
+  const UTF32 *SourceStart = OutCodePoints.data();
+  UTF8 *TargetStart = Result.data();
+  auto ConvStatus = ConvertUTF32toUTF8(
+      &SourceStart, SourceStart + OutCodePoints.size(), &TargetStart,
+      Result.data() + Result.size(), strictConversion);
+  if (ConvStatus != conversionOK) {
+    OutUTF8.clear();
+    return false;
+  }
+  // TargetStart was advanced past the last byte written.
+  OutUTF8.assign(Result.data(), TargetStart);
+  return true;
+}
+
--- a/lib/IRGen/IRGenDebugInfo.cpp
+++ b/lib/IRGen/IRGenDebugInfo.cpp
@@ -84,7 +84,7 @@ static void mangleIdent(llvm::raw_string_ostream &OS, StringRef Id) {
  std::string PunycodeBuf;
  if (isNonAscii(Id)) {
    OS << 'X';
-    Punycode::encodePunycode(Id, PunycodeBuf);
+    Punycode::encodePunycodeUTF8(Id, PunycodeBuf);
    Id = PunycodeBuf;
  }
  OS << Id.size() << Id;