Punycode encoder/decoder: separate core and parts that depend on UTF8

encoder/decoder


Swift SVN r20307
This commit is contained in:
Dmitri Hrybenko
2014-07-22 14:37:37 +00:00
parent 87ed6c8a5b
commit 56342b0cfa
7 changed files with 117 additions and 81 deletions

View File

@@ -27,17 +27,27 @@
#include "swift/Basic/LLVM.h" #include "swift/Basic/LLVM.h"
#include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringRef.h"
#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/SmallVector.h"
#include <vector>
#include <cstdint>
namespace swift { namespace swift {
namespace Punycode { namespace Punycode {
/// Encodes a UTF-8-encoded Unicode string into Punycode. /// Encodes a sequence of code points into Punycode.
void encodePunycode(StringRef InputUTF8, std::string &OutPunycode);
/// Decodes a Punycode string into a UTF-8-encoded Unicode string.
/// ///
/// Returns true if the encoding failed, false if it succeeded. /// Returns false if input contains surrogate code points.
bool decodePunycode(StringRef InputPunycode, std::string &OutUTF8); bool encodePunycode(const std::vector<uint32_t> &InputCodePoints,
std::string &OutPunycode);
/// Decodes a Punycode string into a sequence of Unicode scalars.
///
/// Returns false if decoding failed.
bool decodePunycode(StringRef InputPunycode,
std::vector<uint32_t> &OutCodePoints);
bool encodePunycodeUTF8(StringRef InputUTF8, std::string &OutPunycode);
bool decodePunycodeUTF8(StringRef InputPunycode, std::string &OutUTF8);
} // end namespace Punycode } // end namespace Punycode
} // end namespace swift } // end namespace swift

View File

@@ -92,7 +92,7 @@ void Mangler::mangleIdentifier(Identifier ident, OperatorFixity fixity) {
if (isNonAscii(str)) { if (isNonAscii(str)) {
Buffer << 'X'; Buffer << 'X';
Punycode::encodePunycode(str, punycodeBuf); Punycode::encodePunycodeUTF8(str, punycodeBuf);
str = punycodeBuf; str = punycodeBuf;
} }

View File

@@ -12,6 +12,7 @@ add_swift_library(swiftBasic
PrimitiveParsing.cpp PrimitiveParsing.cpp
Program.cpp Program.cpp
Punycode.cpp Punycode.cpp
PunycodeUTF8.cpp
QuotedString.cpp QuotedString.cpp
SourceLoc.cpp SourceLoc.cpp
StringExtras.cpp StringExtras.cpp

View File

@@ -718,7 +718,7 @@ private:
auto decode = [&](StringRef s) -> StringRef { auto decode = [&](StringRef s) -> StringRef {
if (!isPunycoded) if (!isPunycoded)
return s; return s;
if (Punycode::decodePunycode(s, decodeBuffer)) if (!Punycode::decodePunycodeUTF8(s, decodeBuffer))
return {}; return {};
return decodeBuffer; return decodeBuffer;
}; };

View File

@@ -1,4 +1,4 @@
//===--- Punycode.cpp - UTF-8 to Punycode transcoding -----------*- C++ -*-===// //===--- Punycode.cpp - Unicode to Punycode transcoding ---------*- C++ -*-===//
// //
// This source file is part of the Swift.org open source project // This source file is part of the Swift.org open source project
// //
@@ -9,23 +9,11 @@
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors // See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
// //
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
//
// These functions implement a variant of the Punycode algorithm from RFC3492,
// originally designed for encoding international domain names, for the purpose
// encoding Swift identifiers into mangled symbol names. This version differs
// from RFC3492 in the following respects:
// - '_' is used as the encoding delimiter instead of the '-'.
// - Encoding digits are mapped to [a-zA-J] instead of to [a-z0-9], because
// symbol names are case-sensitive, and Swift mangled identifiers cannot begin
// with a digit.
//
//===----------------------------------------------------------------------===//
#include "swift/Basic/LLVM.h" #include "swift/Basic/LLVM.h"
#include "swift/Basic/Punycode.h" #include "swift/Basic/Punycode.h"
#include "llvm/Support/ConvertUTF.h" #include <vector>
#include "llvm/Support/ErrorHandling.h" #include <cstdint>
#include <climits>
using namespace swift; using namespace swift;
using namespace Punycode; using namespace Punycode;
@@ -39,7 +27,7 @@ static const int tmax = 26;
static const int skew = 38; static const int skew = 38;
static const int damp = 700; static const int damp = 700;
static const int initial_bias = 72; static const int initial_bias = 72;
static const UTF32 initial_n = 128; static const uint32_t initial_n = 128;
static const char delimiter = '_'; static const char delimiter = '_';
@@ -77,41 +65,41 @@ static int adapt(int delta, int numpoints, bool firsttime) {
// Section 6.2: Decoding procedure // Section 6.2: Decoding procedure
bool Punycode::decodePunycode(StringRef inputPunycode, bool Punycode::decodePunycode(StringRef InputPunycode,
std::string &outUTF8) { std::vector<uint32_t> &OutCodePoints) {
outUTF8.clear(); OutCodePoints.clear();
OutCodePoints.reserve(InputPunycode.size());
// -- Build the decoded string as UTF32 first because we need random access. // -- Build the decoded string as UTF32 first because we need random access.
SmallVector<UTF32, 32> output; uint32_t n = initial_n;
UTF32 n = initial_n;
int i = 0; int i = 0;
int bias = initial_bias; int bias = initial_bias;
/// let output = an empty string indexed from 0 /// let output = an empty string indexed from 0
// consume all code points before the last delimiter (if there is one) // consume all code points before the last delimiter (if there is one)
// and copy them to output, // and copy them to output,
size_t lastDelimiter = inputPunycode.find_last_of(delimiter); size_t lastDelimiter = InputPunycode.find_last_of(delimiter);
if (lastDelimiter != StringRef::npos) { if (lastDelimiter != StringRef::npos) {
for (char c : inputPunycode.slice(0, lastDelimiter)) { for (char c : InputPunycode.slice(0, lastDelimiter)) {
// fail on any non-basic code point // fail on any non-basic code point
if ((signed char)c < 0) if (static_cast<unsigned char>(c) > 0x7f)
return true; return true;
output.push_back((UTF32)c); OutCodePoints.push_back(c);
} }
// if more than zero code points were consumed then consume one more // if more than zero code points were consumed then consume one more
// (which will be the last delimiter) // (which will be the last delimiter)
inputPunycode = inputPunycode.slice(lastDelimiter + 1, inputPunycode.size()); InputPunycode =
InputPunycode.slice(lastDelimiter + 1, InputPunycode.size());
} }
while (!inputPunycode.empty()) { while (!InputPunycode.empty()) {
int oldi = i; int oldi = i;
int w = 1; int w = 1;
for (int k = base; ; k += base) { for (int k = base; ; k += base) {
// consume a code point, or fail if there was none to consume // consume a code point, or fail if there was none to consume
if (inputPunycode.empty()) if (InputPunycode.empty())
return true; return true;
char codePoint = inputPunycode.front(); char codePoint = InputPunycode.front();
inputPunycode = inputPunycode.slice(1, inputPunycode.size()); InputPunycode = InputPunycode.slice(1, InputPunycode.size());
// let digit = the code point's digit-value, fail if it has none // let digit = the code point's digit-value, fail if it has none
int digit = digit_index(codePoint); int digit = digit_index(codePoint);
if (digit < 0) if (digit < 0)
@@ -125,76 +113,59 @@ bool Punycode::decodePunycode(StringRef inputPunycode,
break; break;
w = w * (base - t); w = w * (base - t);
} }
bias = adapt(i - oldi, output.size() + 1, oldi == 0); bias = adapt(i - oldi, OutCodePoints.size() + 1, oldi == 0);
n = n + i / (output.size() + 1); n = n + i / (OutCodePoints.size() + 1);
i = i % (output.size() + 1); i = i % (OutCodePoints.size() + 1);
// if n is a basic code point then fail // if n is a basic code point then fail
if (n < 0x80) if (n < 0x80)
return true; return true;
// insert n into output at position i // insert n into output at position i
output.insert(output.begin() + i, n); OutCodePoints.insert(OutCodePoints.begin() + i, n);
i++; i++;
} }
// -- Transcode to a UTF-8 result. return true;
size_t SizeUpperBound = output.size()*4;
std::vector<UTF8> Result(SizeUpperBound);
const UTF32 *utf32_begin = output.begin();
UTF8 *utf8_begin = Result.data();
auto res = ConvertUTF32toUTF8(&utf32_begin, output.end(),
&utf8_begin, Result.data() + SizeUpperBound,
lenientConversion);
assert(res == conversionOK && "wide-to-utf8 conversion failed!");
(void)res;
outUTF8 = std::string(Result.data(), utf8_begin);
return false;
} }
// Section 6.3: Encoding procedure // Section 6.3: Encoding procedure
void Punycode::encodePunycode(StringRef inputUTF8, bool Punycode::encodePunycode(const std::vector<uint32_t> &InputCodePoints,
std::string &outPunycode) { std::string &OutPunycode) {
outPunycode.clear(); OutPunycode.clear();
UTF32 n = initial_n; uint32_t n = initial_n;
int delta = 0; int delta = 0;
int bias = initial_bias; int bias = initial_bias;
// let h = b = the number of basic code points in the input // let h = b = the number of basic code points in the input
// copy them to the output in order... // copy them to the output in order...
size_t h = 0; size_t h = 0;
SmallVector<UTF32, 32> inputCodePoints; for (auto C : InputCodePoints) {
for (auto *i = reinterpret_cast<UTF8 const *>(inputUTF8.begin()), if (C < 0x80) {
*end = reinterpret_cast<UTF8 const *>(inputUTF8.end());
i < end;
) {
UTF32 c;
auto conv = llvm::convertUTF8Sequence(&i, end, &c, strictConversion);
assert(conv == conversionOK && "invalid UTF-8 input");
(void)conv;
inputCodePoints.push_back(c);
if (c < 0x80) {
++h; ++h;
outPunycode.push_back(c); OutPunycode.push_back(C);
}
if (C >= 0xD800 && C <= 0xDFFF) {
OutPunycode.clear();
return false;
} }
} }
size_t b = h; size_t b = h;
// ...followed by a delimiter if b > 0 // ...followed by a delimiter if b > 0
if (b > 0) if (b > 0)
outPunycode.push_back(delimiter); OutPunycode.push_back(delimiter);
while (h < inputCodePoints.size()) { while (h < InputCodePoints.size()) {
// let m = the minimum code point >= n in the input // let m = the minimum code point >= n in the input
UTF32 m = 0x10FFFF; uint32_t m = 0x10FFFF;
for (UTF32 codePoint : inputCodePoints) { for (auto codePoint : InputCodePoints) {
if (codePoint >= n && codePoint < m) if (codePoint >= n && codePoint < m)
m = codePoint; m = codePoint;
} }
delta = delta + (m - n) * (h + 1); delta = delta + (m - n) * (h + 1);
n = m; n = m;
for (UTF32 c : inputCodePoints) { for (auto c : InputCodePoints) {
if (c < n) ++delta; if (c < n) ++delta;
if (c == n) { if (c == n) {
int q = delta; int q = delta;
@@ -204,10 +175,10 @@ void Punycode::encodePunycode(StringRef inputUTF8,
: k - bias; : k - bias;
if (q < t) break; if (q < t) break;
outPunycode.push_back(digit_value(t + ((q - t) % (base - t)))); OutPunycode.push_back(digit_value(t + ((q - t) % (base - t))));
q = (q - t) / (base - t); q = (q - t) / (base - t);
} }
outPunycode.push_back(digit_value(q)); OutPunycode.push_back(digit_value(q));
bias = adapt(delta, h + 1, h == b); bias = adapt(delta, h + 1, h == b);
delta = 0; delta = 0;
++h; ++h;
@@ -215,4 +186,6 @@ void Punycode::encodePunycode(StringRef inputUTF8,
} }
++delta; ++n; ++delta; ++n;
} }
return true;
} }

View File

@@ -0,0 +1,52 @@
//===--- PunycodeUTF8.cpp - Unicode to Punycode transcoding -----*- C++ -*-===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See http://swift.org/LICENSE.txt for license information
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
#include "swift/Basic/Punycode.h"
#include "llvm/Support/ConvertUTF.h"
#include <vector>
using namespace swift;
/// Encodes a UTF-8 string into Punycode.
///
/// The input is transcoded to UTF-32 with strict conversion and the resulting
/// code points are handed to encodePunycode().
///
/// \param InputUTF8 the UTF-8 text to encode.
/// \param OutPunycode receives the encoded text. It is only written by
///        encodePunycode(), so it is left untouched when the UTF-8 to UTF-32
///        conversion fails.
/// \returns false if the UTF-8 to UTF-32 conversion does not report
///          conversionOK, otherwise the result of encodePunycode().
bool Punycode::encodePunycodeUTF8(StringRef InputUTF8,
                                  std::string &OutPunycode) {
  // Size the UTF-32 buffer to the byte count of the input; this is used as an
  // upper bound on the number of code points the conversion can produce.
  std::vector<uint32_t> InputCodePoints(InputUTF8.size());
  const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(InputUTF8.data());
  const UTF8 *SourceEnd = SourceStart + InputUTF8.size();
  UTF32 *TargetStart = InputCodePoints.data();
  auto ConvStatus = ConvertUTF8toUTF32(
      &SourceStart, SourceEnd, &TargetStart,
      InputCodePoints.data() + InputCodePoints.size(), strictConversion);
  if (ConvStatus != conversionOK)
    return false;

  // TargetStart now points one past the last code point written. Trim the
  // buffer to that length; otherwise the unused zero-initialized tail would be
  // passed on and encoded as if the input contained U+0000 code points.
  InputCodePoints.resize(TargetStart - InputCodePoints.data());
  return encodePunycode(InputCodePoints, OutPunycode);
}
/// Decodes a Punycode string into UTF-8.
///
/// The input is decoded to code points by decodePunycode(), then transcoded
/// to UTF-8 with strict conversion.
///
/// \param InputPunycode the Punycode text to decode.
/// \param OutUTF8 receives the UTF-8 result on success. It is cleared when
///        the UTF-32 to UTF-8 conversion fails, and left untouched when
///        decodePunycode() itself reports failure.
/// \returns true on success, false if either step fails.
bool Punycode::decodePunycodeUTF8(StringRef InputPunycode,
                                  std::string &OutUTF8) {
  std::vector<uint32_t> CodePoints;
  const bool Decoded = decodePunycode(InputPunycode, CodePoints);
  if (!Decoded)
    return false;

  // Scratch buffer sized at four bytes per decoded code point, used as the
  // upper bound on the UTF-8 output length.
  std::vector<UTF8> Buffer(CodePoints.size() * 4);

  const UTF32 *Src = CodePoints.data();
  const UTF32 *const SrcEnd = Src + CodePoints.size();
  UTF8 *Dst = Buffer.data();
  UTF8 *const DstEnd = Buffer.data() + Buffer.size();

  if (ConvertUTF32toUTF8(&Src, SrcEnd, &Dst, DstEnd, strictConversion) ==
      conversionOK) {
    // Dst was advanced past the last byte written; copy exactly that range.
    OutUTF8 = std::string(Buffer.data(), Dst);
    return true;
  }

  OutUTF8.clear();
  return false;
}

View File

@@ -84,7 +84,7 @@ static void mangleIdent(llvm::raw_string_ostream &OS, StringRef Id) {
std::string PunycodeBuf; std::string PunycodeBuf;
if (isNonAscii(Id)) { if (isNonAscii(Id)) {
OS << 'X'; OS << 'X';
Punycode::encodePunycode(Id, PunycodeBuf); Punycode::encodePunycodeUTF8(Id, PunycodeBuf);
Id = PunycodeBuf; Id = PunycodeBuf;
} }
OS << Id.size() << Id; OS << Id.size() << Id;