mirror of
https://github.com/apple/swift.git
synced 2025-12-21 12:14:44 +01:00
Punycode encoder/decoder: separate core and parts that depend on UTF8
encoder/decoder Swift SVN r20307
This commit is contained in:
@@ -27,17 +27,27 @@
|
|||||||
#include "swift/Basic/LLVM.h"
|
#include "swift/Basic/LLVM.h"
|
||||||
#include "llvm/ADT/StringRef.h"
|
#include "llvm/ADT/StringRef.h"
|
||||||
#include "llvm/ADT/SmallVector.h"
|
#include "llvm/ADT/SmallVector.h"
|
||||||
|
#include <vector>
|
||||||
|
#include <cstdint>
|
||||||
|
|
||||||
namespace swift {
|
namespace swift {
|
||||||
namespace Punycode {
|
namespace Punycode {
|
||||||
|
|
||||||
/// Encodes a UTF-8-encoded Unicode string into Punycode.
|
/// Encodes a sequence of code points into Punycode.
|
||||||
void encodePunycode(StringRef InputUTF8, std::string &OutPunycode);
|
|
||||||
|
|
||||||
/// Decodes a Punycode string into a UTF-8-encoded Unicode string.
|
|
||||||
///
|
///
|
||||||
/// Returns true if the encoding failed, false if it succeeded.
|
/// Returns false if input contains surrogate code points.
|
||||||
bool decodePunycode(StringRef InputPunycode, std::string &OutUTF8);
|
bool encodePunycode(const std::vector<uint32_t> &InputCodePoints,
|
||||||
|
std::string &OutPunycode);
|
||||||
|
|
||||||
|
/// Decodes a Punycode string into a sequence of Unicode scalars.
|
||||||
|
///
|
||||||
|
/// Returns false if decoding failed.
|
||||||
|
bool decodePunycode(StringRef InputPunycode,
|
||||||
|
std::vector<uint32_t> &OutCodePoints);
|
||||||
|
|
||||||
|
bool encodePunycodeUTF8(StringRef InputUTF8, std::string &OutPunycode);
|
||||||
|
|
||||||
|
bool decodePunycodeUTF8(StringRef InputPunycode, std::string &OutUTF8);
|
||||||
|
|
||||||
} // end namespace Punycode
|
} // end namespace Punycode
|
||||||
} // end namespace swift
|
} // end namespace swift
|
||||||
|
|||||||
@@ -92,7 +92,7 @@ void Mangler::mangleIdentifier(Identifier ident, OperatorFixity fixity) {
|
|||||||
|
|
||||||
if (isNonAscii(str)) {
|
if (isNonAscii(str)) {
|
||||||
Buffer << 'X';
|
Buffer << 'X';
|
||||||
Punycode::encodePunycode(str, punycodeBuf);
|
Punycode::encodePunycodeUTF8(str, punycodeBuf);
|
||||||
str = punycodeBuf;
|
str = punycodeBuf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ add_swift_library(swiftBasic
|
|||||||
PrimitiveParsing.cpp
|
PrimitiveParsing.cpp
|
||||||
Program.cpp
|
Program.cpp
|
||||||
Punycode.cpp
|
Punycode.cpp
|
||||||
|
PunycodeUTF8.cpp
|
||||||
QuotedString.cpp
|
QuotedString.cpp
|
||||||
SourceLoc.cpp
|
SourceLoc.cpp
|
||||||
StringExtras.cpp
|
StringExtras.cpp
|
||||||
|
|||||||
@@ -718,7 +718,7 @@ private:
|
|||||||
auto decode = [&](StringRef s) -> StringRef {
|
auto decode = [&](StringRef s) -> StringRef {
|
||||||
if (!isPunycoded)
|
if (!isPunycoded)
|
||||||
return s;
|
return s;
|
||||||
if (Punycode::decodePunycode(s, decodeBuffer))
|
if (!Punycode::decodePunycodeUTF8(s, decodeBuffer))
|
||||||
return {};
|
return {};
|
||||||
return decodeBuffer;
|
return decodeBuffer;
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
//===--- Punycode.cpp - UTF-8 to Punycode transcoding -----------*- C++ -*-===//
|
//===--- Punycode.cpp - Unicode to Punycode transcoding ---------*- C++ -*-===//
|
||||||
//
|
//
|
||||||
// This source file is part of the Swift.org open source project
|
// This source file is part of the Swift.org open source project
|
||||||
//
|
//
|
||||||
@@ -9,23 +9,11 @@
|
|||||||
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
||||||
//
|
//
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
//
|
|
||||||
// These functions implement a variant of the Punycode algorithm from RFC3492,
|
|
||||||
// originally designed for encoding international domain names, for the purpose
|
|
||||||
// encoding Swift identifiers into mangled symbol names. This version differs
|
|
||||||
// from RFC3492 in the following respects:
|
|
||||||
// - '_' is used as the encoding delimiter instead of the '-'.
|
|
||||||
// - Encoding digits are mapped to [a-zA-J] instead of to [a-z0-9], because
|
|
||||||
// symbol names are case-sensitive, and Swift mangled identifiers cannot begin
|
|
||||||
// with a digit.
|
|
||||||
//
|
|
||||||
//===----------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
#include "swift/Basic/LLVM.h"
|
#include "swift/Basic/LLVM.h"
|
||||||
#include "swift/Basic/Punycode.h"
|
#include "swift/Basic/Punycode.h"
|
||||||
#include "llvm/Support/ConvertUTF.h"
|
#include <vector>
|
||||||
#include "llvm/Support/ErrorHandling.h"
|
#include <cstdint>
|
||||||
#include <climits>
|
|
||||||
|
|
||||||
using namespace swift;
|
using namespace swift;
|
||||||
using namespace Punycode;
|
using namespace Punycode;
|
||||||
@@ -39,7 +27,7 @@ static const int tmax = 26;
|
|||||||
static const int skew = 38;
|
static const int skew = 38;
|
||||||
static const int damp = 700;
|
static const int damp = 700;
|
||||||
static const int initial_bias = 72;
|
static const int initial_bias = 72;
|
||||||
static const UTF32 initial_n = 128;
|
static const uint32_t initial_n = 128;
|
||||||
|
|
||||||
static const char delimiter = '_';
|
static const char delimiter = '_';
|
||||||
|
|
||||||
@@ -77,41 +65,41 @@ static int adapt(int delta, int numpoints, bool firsttime) {
|
|||||||
|
|
||||||
// Section 6.2: Decoding procedure
|
// Section 6.2: Decoding procedure
|
||||||
|
|
||||||
bool Punycode::decodePunycode(StringRef inputPunycode,
|
bool Punycode::decodePunycode(StringRef InputPunycode,
|
||||||
std::string &outUTF8) {
|
std::vector<uint32_t> &OutCodePoints) {
|
||||||
outUTF8.clear();
|
OutCodePoints.clear();
|
||||||
|
OutCodePoints.reserve(InputPunycode.size());
|
||||||
|
|
||||||
// -- Build the decoded string as UTF32 first because we need random access.
|
// -- Build the decoded string as UTF32 first because we need random access.
|
||||||
SmallVector<UTF32, 32> output;
|
uint32_t n = initial_n;
|
||||||
|
|
||||||
UTF32 n = initial_n;
|
|
||||||
int i = 0;
|
int i = 0;
|
||||||
int bias = initial_bias;
|
int bias = initial_bias;
|
||||||
/// let output = an empty string indexed from 0
|
/// let output = an empty string indexed from 0
|
||||||
// consume all code points before the last delimiter (if there is one)
|
// consume all code points before the last delimiter (if there is one)
|
||||||
// and copy them to output,
|
// and copy them to output,
|
||||||
size_t lastDelimiter = inputPunycode.find_last_of(delimiter);
|
size_t lastDelimiter = InputPunycode.find_last_of(delimiter);
|
||||||
if (lastDelimiter != StringRef::npos) {
|
if (lastDelimiter != StringRef::npos) {
|
||||||
for (char c : inputPunycode.slice(0, lastDelimiter)) {
|
for (char c : InputPunycode.slice(0, lastDelimiter)) {
|
||||||
// fail on any non-basic code point
|
// fail on any non-basic code point
|
||||||
if ((signed char)c < 0)
|
if (static_cast<unsigned char>(c) > 0x7f)
|
||||||
return true;
|
return true;
|
||||||
output.push_back((UTF32)c);
|
OutCodePoints.push_back(c);
|
||||||
}
|
}
|
||||||
// if more than zero code points were consumed then consume one more
|
// if more than zero code points were consumed then consume one more
|
||||||
// (which will be the last delimiter)
|
// (which will be the last delimiter)
|
||||||
inputPunycode = inputPunycode.slice(lastDelimiter + 1, inputPunycode.size());
|
InputPunycode =
|
||||||
|
InputPunycode.slice(lastDelimiter + 1, InputPunycode.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
while (!inputPunycode.empty()) {
|
while (!InputPunycode.empty()) {
|
||||||
int oldi = i;
|
int oldi = i;
|
||||||
int w = 1;
|
int w = 1;
|
||||||
for (int k = base; ; k += base) {
|
for (int k = base; ; k += base) {
|
||||||
// consume a code point, or fail if there was none to consume
|
// consume a code point, or fail if there was none to consume
|
||||||
if (inputPunycode.empty())
|
if (InputPunycode.empty())
|
||||||
return true;
|
return true;
|
||||||
char codePoint = inputPunycode.front();
|
char codePoint = InputPunycode.front();
|
||||||
inputPunycode = inputPunycode.slice(1, inputPunycode.size());
|
InputPunycode = InputPunycode.slice(1, InputPunycode.size());
|
||||||
// let digit = the code point's digit-value, fail if it has none
|
// let digit = the code point's digit-value, fail if it has none
|
||||||
int digit = digit_index(codePoint);
|
int digit = digit_index(codePoint);
|
||||||
if (digit < 0)
|
if (digit < 0)
|
||||||
@@ -125,76 +113,59 @@ bool Punycode::decodePunycode(StringRef inputPunycode,
|
|||||||
break;
|
break;
|
||||||
w = w * (base - t);
|
w = w * (base - t);
|
||||||
}
|
}
|
||||||
bias = adapt(i - oldi, output.size() + 1, oldi == 0);
|
bias = adapt(i - oldi, OutCodePoints.size() + 1, oldi == 0);
|
||||||
n = n + i / (output.size() + 1);
|
n = n + i / (OutCodePoints.size() + 1);
|
||||||
i = i % (output.size() + 1);
|
i = i % (OutCodePoints.size() + 1);
|
||||||
// if n is a basic code point then fail
|
// if n is a basic code point then fail
|
||||||
if (n < 0x80)
|
if (n < 0x80)
|
||||||
return true;
|
return true;
|
||||||
// insert n into output at position i
|
// insert n into output at position i
|
||||||
output.insert(output.begin() + i, n);
|
OutCodePoints.insert(OutCodePoints.begin() + i, n);
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
||||||
// -- Transcode to a UTF-8 result.
|
return true;
|
||||||
size_t SizeUpperBound = output.size()*4;
|
|
||||||
std::vector<UTF8> Result(SizeUpperBound);
|
|
||||||
const UTF32 *utf32_begin = output.begin();
|
|
||||||
UTF8 *utf8_begin = Result.data();
|
|
||||||
auto res = ConvertUTF32toUTF8(&utf32_begin, output.end(),
|
|
||||||
&utf8_begin, Result.data() + SizeUpperBound,
|
|
||||||
lenientConversion);
|
|
||||||
assert(res == conversionOK && "wide-to-utf8 conversion failed!");
|
|
||||||
(void)res;
|
|
||||||
outUTF8 = std::string(Result.data(), utf8_begin);
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Section 6.3: Encoding procedure
|
// Section 6.3: Encoding procedure
|
||||||
|
|
||||||
void Punycode::encodePunycode(StringRef inputUTF8,
|
bool Punycode::encodePunycode(const std::vector<uint32_t> &InputCodePoints,
|
||||||
std::string &outPunycode) {
|
std::string &OutPunycode) {
|
||||||
outPunycode.clear();
|
OutPunycode.clear();
|
||||||
|
|
||||||
UTF32 n = initial_n;
|
uint32_t n = initial_n;
|
||||||
int delta = 0;
|
int delta = 0;
|
||||||
int bias = initial_bias;
|
int bias = initial_bias;
|
||||||
|
|
||||||
// let h = b = the number of basic code points in the input
|
// let h = b = the number of basic code points in the input
|
||||||
// copy them to the output in order...
|
// copy them to the output in order...
|
||||||
size_t h = 0;
|
size_t h = 0;
|
||||||
SmallVector<UTF32, 32> inputCodePoints;
|
for (auto C : InputCodePoints) {
|
||||||
for (auto *i = reinterpret_cast<UTF8 const *>(inputUTF8.begin()),
|
if (C < 0x80) {
|
||||||
*end = reinterpret_cast<UTF8 const *>(inputUTF8.end());
|
|
||||||
i < end;
|
|
||||||
) {
|
|
||||||
UTF32 c;
|
|
||||||
auto conv = llvm::convertUTF8Sequence(&i, end, &c, strictConversion);
|
|
||||||
assert(conv == conversionOK && "invalid UTF-8 input");
|
|
||||||
(void)conv;
|
|
||||||
inputCodePoints.push_back(c);
|
|
||||||
if (c < 0x80) {
|
|
||||||
++h;
|
++h;
|
||||||
outPunycode.push_back(c);
|
OutPunycode.push_back(C);
|
||||||
|
}
|
||||||
|
if (C >= 0xD800 && C <= 0xDFFF) {
|
||||||
|
OutPunycode.clear();
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
size_t b = h;
|
size_t b = h;
|
||||||
// ...followed by a delimiter if b > 0
|
// ...followed by a delimiter if b > 0
|
||||||
if (b > 0)
|
if (b > 0)
|
||||||
outPunycode.push_back(delimiter);
|
OutPunycode.push_back(delimiter);
|
||||||
|
|
||||||
while (h < inputCodePoints.size()) {
|
while (h < InputCodePoints.size()) {
|
||||||
// let m = the minimum code point >= n in the input
|
// let m = the minimum code point >= n in the input
|
||||||
UTF32 m = 0x10FFFF;
|
uint32_t m = 0x10FFFF;
|
||||||
for (UTF32 codePoint : inputCodePoints) {
|
for (auto codePoint : InputCodePoints) {
|
||||||
if (codePoint >= n && codePoint < m)
|
if (codePoint >= n && codePoint < m)
|
||||||
m = codePoint;
|
m = codePoint;
|
||||||
}
|
}
|
||||||
|
|
||||||
delta = delta + (m - n) * (h + 1);
|
delta = delta + (m - n) * (h + 1);
|
||||||
n = m;
|
n = m;
|
||||||
for (UTF32 c : inputCodePoints) {
|
for (auto c : InputCodePoints) {
|
||||||
if (c < n) ++delta;
|
if (c < n) ++delta;
|
||||||
if (c == n) {
|
if (c == n) {
|
||||||
int q = delta;
|
int q = delta;
|
||||||
@@ -204,10 +175,10 @@ void Punycode::encodePunycode(StringRef inputUTF8,
|
|||||||
: k - bias;
|
: k - bias;
|
||||||
|
|
||||||
if (q < t) break;
|
if (q < t) break;
|
||||||
outPunycode.push_back(digit_value(t + ((q - t) % (base - t))));
|
OutPunycode.push_back(digit_value(t + ((q - t) % (base - t))));
|
||||||
q = (q - t) / (base - t);
|
q = (q - t) / (base - t);
|
||||||
}
|
}
|
||||||
outPunycode.push_back(digit_value(q));
|
OutPunycode.push_back(digit_value(q));
|
||||||
bias = adapt(delta, h + 1, h == b);
|
bias = adapt(delta, h + 1, h == b);
|
||||||
delta = 0;
|
delta = 0;
|
||||||
++h;
|
++h;
|
||||||
@@ -215,4 +186,6 @@ void Punycode::encodePunycode(StringRef inputUTF8,
|
|||||||
}
|
}
|
||||||
++delta; ++n;
|
++delta; ++n;
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
52
lib/Basic/PunycodeUTF8.cpp
Normal file
52
lib/Basic/PunycodeUTF8.cpp
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
//===--- PunycodeUTF8.cpp - Unicode to Punycode transcoding -----*- C++ -*-===//
|
||||||
|
//
|
||||||
|
// This source file is part of the Swift.org open source project
|
||||||
|
//
|
||||||
|
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
|
||||||
|
// Licensed under Apache License v2.0 with Runtime Library Exception
|
||||||
|
//
|
||||||
|
// See http://swift.org/LICENSE.txt for license information
|
||||||
|
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#include "swift/Basic/Punycode.h"
|
||||||
|
#include "llvm/Support/ConvertUTF.h"
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
using namespace swift;
|
||||||
|
|
||||||
|
/// Encodes a UTF-8 string into Punycode.
///
/// The input is first transcoded to UTF-32 code points using strict
/// conversion, and the resulting code points are handed to encodePunycode.
///
/// \param InputUTF8 the UTF-8 bytes to encode.
/// \param OutPunycode receives whatever encodePunycode produces; it is not
///        touched if the UTF-8 to UTF-32 conversion fails.
///
/// \returns false if the UTF-8 to UTF-32 conversion does not report
///          conversionOK; otherwise the result of encodePunycode.
bool Punycode::encodePunycodeUTF8(StringRef InputUTF8,
                                  std::string &OutPunycode) {
  // One slot per input byte is an upper bound on the number of code points:
  // every UTF-8 sequence occupies at least one byte.
  std::vector<uint32_t> InputCodePoints(InputUTF8.size());
  const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(InputUTF8.data());
  UTF32 *TargetStart = InputCodePoints.data();
  auto ConvStatus = ConvertUTF8toUTF32(
      &SourceStart, SourceStart + InputUTF8.size(), &TargetStart,
      InputCodePoints.data() + InputCodePoints.size(), strictConversion);
  if (ConvStatus != conversionOK)
    return false;

  // The converter leaves TargetStart just past the last code point written.
  // Shrink the vector to that length so the unused, zero-initialized tail
  // slots are not encoded as extra U+0000 code points.
  InputCodePoints.resize(TargetStart - InputCodePoints.data());
  return encodePunycode(InputCodePoints, OutPunycode);
}
|
||||||
|
|
||||||
|
/// Decodes a Punycode string into a UTF-8 string.
///
/// The input is decoded to code points with decodePunycode, and those code
/// points are then transcoded to UTF-8 using strict conversion.
///
/// \param InputPunycode the Punycode text to decode.
/// \param OutUTF8 receives the UTF-8 result on success; it is cleared on
///        every failure path.
///
/// \returns false if decodePunycode returns false or if the UTF-32 to UTF-8
///          conversion does not report conversionOK; true otherwise.
bool Punycode::decodePunycodeUTF8(StringRef InputPunycode,
                                  std::string &OutUTF8) {
  std::vector<uint32_t> OutCodePoints;
  if (!decodePunycode(InputPunycode, OutCodePoints)) {
    // Cleared only after decoding is finished with InputPunycode, so the
    // output never holds stale data from an earlier call.
    OutUTF8.clear();
    return false;
  }

  // Reserve four bytes of UTF-8 output for every decoded code point.
  const size_t SizeUpperBound = OutCodePoints.size() * 4;
  std::vector<UTF8> Result(SizeUpperBound);
  const UTF32 *SourceStart = OutCodePoints.data();
  UTF8 *TargetStart = Result.data();
  auto ConvStatus = ConvertUTF32toUTF8(
      &SourceStart, SourceStart + OutCodePoints.size(), &TargetStart,
      Result.data() + Result.size(), strictConversion);
  if (ConvStatus != conversionOK) {
    OutUTF8.clear();
    return false;
  }

  // TargetStart now marks the end of the bytes actually written.
  OutUTF8.assign(Result.data(), TargetStart);
  return true;
}
|
||||||
|
|
||||||
@@ -84,7 +84,7 @@ static void mangleIdent(llvm::raw_string_ostream &OS, StringRef Id) {
|
|||||||
std::string PunycodeBuf;
|
std::string PunycodeBuf;
|
||||||
if (isNonAscii(Id)) {
|
if (isNonAscii(Id)) {
|
||||||
OS << 'X';
|
OS << 'X';
|
||||||
Punycode::encodePunycode(Id, PunycodeBuf);
|
Punycode::encodePunycodeUTF8(Id, PunycodeBuf);
|
||||||
Id = PunycodeBuf;
|
Id = PunycodeBuf;
|
||||||
}
|
}
|
||||||
OS << Id.size() << Id;
|
OS << Id.size() << Id;
|
||||||
|
|||||||
Reference in New Issue
Block a user