mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
They are not used. Demangling uses `assert()` from <cassert>, and macros from lib/Demangling/DemanglerAssert.h. Reverts the lib/Demangling changes from #74184. No behavior change.
370 lines
10 KiB
C++
370 lines
10 KiB
C++
//===--- Punycode.cpp - Unicode to Punycode transcoding -------------------===//
|
|
//
|
|
// This source file is part of the Swift.org open source project
|
|
//
|
|
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
|
|
// Licensed under Apache License v2.0 with Runtime Library Exception
|
|
//
|
|
// See https://swift.org/LICENSE.txt for license information
|
|
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "swift/Demangling/Punycode.h"
|
|
#include "swift/Demangling/ManglingUtils.h"
|
|
#include <limits>
|
|
#include <vector>
|
|
#include <cstdint>
|
|
|
|
using namespace swift;
|
|
using namespace Punycode;
|
|
|
|
// RFC 3492
|
|
// Section 5: Parameter values for Punycode
|
|
|
|
static const int base = 36;
|
|
static const int tmin = 1;
|
|
static const int tmax = 26;
|
|
static const int skew = 38;
|
|
static const int damp = 700;
|
|
static const int initial_bias = 72;
|
|
static const uint32_t initial_n = 128;
|
|
|
|
static const char delimiter = '_';
|
|
|
|
static char digit_value(int digit) {
|
|
assert(digit < base && "invalid punycode digit");
|
|
if (digit < 26)
|
|
return 'a' + digit;
|
|
return 'A' - 26 + digit;
|
|
}
|
|
|
|
static int digit_index(char value) {
|
|
if (value >= 'a' && value <= 'z')
|
|
return value - 'a';
|
|
if (value >= 'A' && value <= 'J')
|
|
return value - 'A' + 26;
|
|
return -1;
|
|
}
|
|
|
|
static bool isValidUnicodeScalar(uint32_t S) {
|
|
// Also accept the range of 0xD800 - 0xD880, which is used for non-symbol
|
|
// ASCII characters.
|
|
return (S < 0xD880) || (S >= 0xE000 && S <= 0x10FFFF);
|
|
}
|
|
|
|
// Section 6.1: Bias adaptation function
|
|
|
|
static int adapt(int delta, int numpoints, bool firsttime) {
|
|
if (firsttime)
|
|
delta = delta / damp;
|
|
else
|
|
delta = delta / 2;
|
|
|
|
delta += delta / numpoints;
|
|
int k = 0;
|
|
while (delta > ((base - tmin) * tmax) / 2) {
|
|
delta /= base - tmin;
|
|
k += base;
|
|
}
|
|
return k + (((base - tmin + 1) * delta) / (delta + skew));
|
|
}
|
|
|
|
// Section 6.2: Decoding procedure
|
|
|
|
bool Punycode::decodePunycode(StringRef InputPunycode,
|
|
std::vector<uint32_t> &OutCodePoints) {
|
|
OutCodePoints.clear();
|
|
OutCodePoints.reserve(InputPunycode.size());
|
|
|
|
// -- Build the decoded string as UTF32 first because we need random access.
|
|
uint32_t n = initial_n;
|
|
int i = 0;
|
|
int bias = initial_bias;
|
|
/// let output = an empty string indexed from 0
|
|
// consume all code points before the last delimiter (if there is one)
|
|
// and copy them to output,
|
|
size_t lastDelimiter = InputPunycode.find_last_of(delimiter);
|
|
if (lastDelimiter != StringRef::npos) {
|
|
for (char c : InputPunycode.slice(0, lastDelimiter)) {
|
|
// fail on any non-basic code point
|
|
if (static_cast<unsigned char>(c) > 0x7f)
|
|
return false;
|
|
OutCodePoints.push_back(c);
|
|
}
|
|
// if more than zero code points were consumed then consume one more
|
|
// (which will be the last delimiter)
|
|
InputPunycode =
|
|
InputPunycode.slice(lastDelimiter + 1, InputPunycode.size());
|
|
}
|
|
|
|
while (!InputPunycode.empty()) {
|
|
int oldi = i;
|
|
int w = 1;
|
|
for (int k = base; ; k += base) {
|
|
// consume a code point, or fail if there was none to consume
|
|
if (InputPunycode.empty())
|
|
return false;
|
|
char codePoint = InputPunycode.front();
|
|
InputPunycode = InputPunycode.slice(1, InputPunycode.size());
|
|
// let digit = the code point's digit-value, fail if it has none
|
|
int digit = digit_index(codePoint);
|
|
if (digit < 0)
|
|
return false;
|
|
|
|
// Fail if i + (digit * w) would overflow
|
|
if (digit > (std::numeric_limits<int>::max() - i) / w)
|
|
return false;
|
|
i = i + digit * w;
|
|
int t = k <= bias ? tmin
|
|
: k >= bias + tmax ? tmax
|
|
: k - bias;
|
|
if (digit < t)
|
|
break;
|
|
// Fail if w * (base - t) would overflow
|
|
if (w > std::numeric_limits<int>::max() / (base - t))
|
|
return false;
|
|
w = w * (base - t);
|
|
}
|
|
bias = adapt(i - oldi, OutCodePoints.size() + 1, oldi == 0);
|
|
// Fail if n + i / (OutCodePoints.size() + 1) would overflow
|
|
if (i / (OutCodePoints.size() + 1) > std::numeric_limits<int>::max() - n)
|
|
return false;
|
|
n = n + i / (OutCodePoints.size() + 1);
|
|
i = i % (OutCodePoints.size() + 1);
|
|
// if n is a basic code point then fail
|
|
if (n < 0x80)
|
|
return false;
|
|
// insert n into output at position i
|
|
OutCodePoints.insert(OutCodePoints.begin() + i, n);
|
|
++i;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// Section 6.3: Encoding procedure
|
|
|
|
bool Punycode::encodePunycode(const std::vector<uint32_t> &InputCodePoints,
|
|
std::string &OutPunycode) {
|
|
OutPunycode.clear();
|
|
|
|
uint32_t n = initial_n;
|
|
int delta = 0;
|
|
int bias = initial_bias;
|
|
|
|
// let h = b = the number of basic code points in the input
|
|
// copy them to the output in order...
|
|
size_t h = 0;
|
|
for (auto C : InputCodePoints) {
|
|
if (C < 0x80) {
|
|
++h;
|
|
OutPunycode.push_back(C);
|
|
}
|
|
if (!isValidUnicodeScalar(C)) {
|
|
OutPunycode.clear();
|
|
return false;
|
|
}
|
|
}
|
|
size_t b = h;
|
|
// ...followed by a delimiter if b > 0
|
|
if (b > 0)
|
|
OutPunycode.push_back(delimiter);
|
|
|
|
while (h < InputCodePoints.size()) {
|
|
// let m = the minimum code point >= n in the input
|
|
uint32_t m = 0x10FFFF;
|
|
for (auto codePoint : InputCodePoints) {
|
|
if (codePoint >= n && codePoint < m)
|
|
m = codePoint;
|
|
}
|
|
|
|
if ((m - n) > (std::numeric_limits<int>::max() - delta) / (h + 1))
|
|
return false;
|
|
delta = delta + (m - n) * (h + 1);
|
|
n = m;
|
|
for (auto c : InputCodePoints) {
|
|
if (c < n) {
|
|
if (delta == std::numeric_limits<int>::max())
|
|
return false;
|
|
++delta;
|
|
}
|
|
if (c == n) {
|
|
int q = delta;
|
|
for (int k = base; ; k += base) {
|
|
int t = k <= bias ? tmin
|
|
: k >= bias + tmax ? tmax
|
|
: k - bias;
|
|
|
|
if (q < t) break;
|
|
OutPunycode.push_back(digit_value(t + ((q - t) % (base - t))));
|
|
q = (q - t) / (base - t);
|
|
}
|
|
OutPunycode.push_back(digit_value(q));
|
|
bias = adapt(delta, h + 1, h == b);
|
|
delta = 0;
|
|
++h;
|
|
}
|
|
}
|
|
++delta; ++n;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
static bool encodeToUTF8(const std::vector<uint32_t> &Scalars,
|
|
std::string &OutUTF8) {
|
|
for (auto S : Scalars) {
|
|
if (!isValidUnicodeScalar(S)) {
|
|
OutUTF8.clear();
|
|
return false;
|
|
}
|
|
if (S >= 0xD800 && S < 0xD880)
|
|
S -= 0xD800;
|
|
|
|
unsigned Bytes = 0;
|
|
if (S < 0x80)
|
|
Bytes = 1;
|
|
else if (S < 0x800)
|
|
Bytes = 2;
|
|
else if (S < 0x10000)
|
|
Bytes = 3;
|
|
else
|
|
Bytes = 4;
|
|
|
|
switch (Bytes) {
|
|
case 1:
|
|
OutUTF8.push_back(S);
|
|
break;
|
|
case 2: {
|
|
uint8_t Byte2 = (S | 0x80) & 0xBF;
|
|
S >>= 6;
|
|
uint8_t Byte1 = S | 0xC0;
|
|
OutUTF8.push_back(Byte1);
|
|
OutUTF8.push_back(Byte2);
|
|
break;
|
|
}
|
|
case 3: {
|
|
uint8_t Byte3 = (S | 0x80) & 0xBF;
|
|
S >>= 6;
|
|
uint8_t Byte2 = (S | 0x80) & 0xBF;
|
|
S >>= 6;
|
|
uint8_t Byte1 = S | 0xE0;
|
|
OutUTF8.push_back(Byte1);
|
|
OutUTF8.push_back(Byte2);
|
|
OutUTF8.push_back(Byte3);
|
|
break;
|
|
}
|
|
case 4: {
|
|
uint8_t Byte4 = (S | 0x80) & 0xBF;
|
|
S >>= 6;
|
|
uint8_t Byte3 = (S | 0x80) & 0xBF;
|
|
S >>= 6;
|
|
uint8_t Byte2 = (S | 0x80) & 0xBF;
|
|
S >>= 6;
|
|
uint8_t Byte1 = S | 0xF0;
|
|
OutUTF8.push_back(Byte1);
|
|
OutUTF8.push_back(Byte2);
|
|
OutUTF8.push_back(Byte3);
|
|
OutUTF8.push_back(Byte4);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool Punycode::decodePunycodeUTF8(StringRef InputPunycode,
|
|
std::string &OutUTF8) {
|
|
std::vector<uint32_t> OutCodePoints;
|
|
if (!decodePunycode(InputPunycode, OutCodePoints))
|
|
return false;
|
|
|
|
return encodeToUTF8(OutCodePoints, OutUTF8);
|
|
}
|
|
|
|
static bool isContinuationByte(uint8_t unit) {
|
|
return (unit & 0xC0) == 0x80;
|
|
}
|
|
|
|
/// Reencode well-formed UTF-8 as UTF-32.
|
|
///
|
|
/// This entry point is only called from compiler-internal entry points, so does
|
|
/// only minimal validation. In particular, it does *not* check for overlong
|
|
/// encodings.
|
|
/// If \p mapNonSymbolChars is true, non-symbol ASCII characters (characters
|
|
/// except [$_a-zA-Z0-9]) are also encoded like non-ASCII unicode characters.
|
|
/// Returns false if \p InputUTF8 contains surrogate code points.
|
|
static bool convertUTF8toUTF32(llvm::StringRef InputUTF8,
|
|
std::vector<uint32_t> &OutUTF32,
|
|
bool mapNonSymbolChars) {
|
|
auto ptr = InputUTF8.begin();
|
|
auto end = InputUTF8.end();
|
|
while (ptr < end) {
|
|
uint8_t first = *ptr++;
|
|
uint32_t code_point = 0;
|
|
if (first < 0x80) {
|
|
if (Mangle::isValidSymbolChar(first) || !mapNonSymbolChars) {
|
|
code_point = first;
|
|
} else {
|
|
code_point = (uint32_t)first + 0xD800;
|
|
}
|
|
} else if (first < 0xC0) {
|
|
// Invalid continuation byte.
|
|
return false;
|
|
} else if (first < 0xE0) {
|
|
// Two-byte sequence.
|
|
if (ptr == end)
|
|
return false;
|
|
uint8_t second = *ptr++;
|
|
if (!isContinuationByte(second))
|
|
return false;
|
|
code_point = ((first & 0x1F) << 6) | (second & 0x3F);
|
|
} else if (first < 0xF0) {
|
|
// Three-byte sequence.
|
|
if (end - ptr < 2)
|
|
return false;
|
|
uint8_t second = *ptr++;
|
|
uint8_t third = *ptr++;
|
|
if (!isContinuationByte(second) || !isContinuationByte(third))
|
|
return false;
|
|
code_point = ((first & 0xF) << 12)
|
|
| ((second & 0x3F) << 6)
|
|
| ( third & 0x3F );
|
|
} else if (first < 0xF8) {
|
|
// Four-byte sequence.
|
|
if (end - ptr < 3)
|
|
return false;
|
|
uint8_t second = *ptr++;
|
|
uint8_t third = *ptr++;
|
|
uint8_t fourth = *ptr++;
|
|
if (!isContinuationByte(second) || !isContinuationByte(third)
|
|
|| !isContinuationByte(fourth))
|
|
return false;
|
|
code_point = ((first & 0x7) << 18)
|
|
| ((second & 0x3F) << 12)
|
|
| ((third & 0x3F) << 6)
|
|
| ( fourth & 0x3F );
|
|
} else {
|
|
// Unused sequence length.
|
|
return false;
|
|
}
|
|
if (!isValidUnicodeScalar(code_point))
|
|
return false;
|
|
OutUTF32.push_back(code_point);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool Punycode::encodePunycodeUTF8(StringRef InputUTF8,
|
|
std::string &OutPunycode,
|
|
bool mapNonSymbolChars) {
|
|
std::vector<uint32_t> InputCodePoints;
|
|
InputCodePoints.reserve(InputUTF8.size());
|
|
|
|
if (!convertUTF8toUTF32(InputUTF8, InputCodePoints, mapNonSymbolChars))
|
|
return false;
|
|
|
|
return encodePunycode(InputCodePoints, OutPunycode);
|
|
}
|
|
|