mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
343 lines
12 KiB
C++
343 lines
12 KiB
C++
//===--- UnicodeNormalization.cpp - Unicode Normalization Helpers ---------===//
|
|
//
|
|
// This source file is part of the Swift.org open source project
|
|
//
|
|
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
|
|
// Licensed under Apache License v2.0 with Runtime Library Exception
|
|
//
|
|
// See https://swift.org/LICENSE.txt for license information
|
|
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// Functions that use ICU to do unicode normalization and collation.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "../SwiftShims/UnicodeShims.h"
|
|
|
|
#include <stdint.h>
|
|
|
|
#if defined(__APPLE__)
|
|
|
|
// Declare a few external functions to avoid a dependency on ICU headers.
|
|
extern "C" {
|
|
|
|
// Types
|
|
typedef struct UBreakIterator UBreakIterator;
|
|
typedef struct UBreakIterator UNormalizer2;
|
|
typedef enum UBreakIteratorType {} UBreakIteratorType;
|
|
typedef enum UErrorCode {} UErrorCode;
|
|
typedef uint16_t UChar;
|
|
typedef int32_t UChar32;
|
|
typedef int8_t UBool;
|
|
typedef swift::__swift_stdlib_UProperty UProperty;
|
|
|
|
#define U_MAX_VERSION_LENGTH 4
|
|
typedef uint8_t UVersionInfo[U_MAX_VERSION_LENGTH];
|
|
|
|
// Grapheme breaking APIs
|
|
void ubrk_close(UBreakIterator *);
|
|
UBreakIterator *ubrk_open(UBreakIteratorType, const char *, const UChar *,
|
|
int32_t, UErrorCode *);
|
|
int32_t ubrk_preceding(UBreakIterator *, int32_t);
|
|
int32_t ubrk_following(UBreakIterator *, int32_t);
|
|
void ubrk_setText(UBreakIterator *, const UChar *, int32_t, UErrorCode *);
|
|
|
|
// Comparison, normalization, and character property APIs
|
|
int32_t unorm2_spanQuickCheckYes(const UNormalizer2 *, const UChar *, int32_t,
|
|
UErrorCode *);
|
|
int32_t unorm2_normalize(const UNormalizer2 *, const UChar *, int32_t, UChar *,
|
|
int32_t, UErrorCode *);
|
|
const UNormalizer2 *unorm2_getNFCInstance(UErrorCode *);
|
|
UBool unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c);
|
|
UBool u_hasBinaryProperty(UChar32, UProperty);
|
|
UBool u_isdefined(UChar32);
|
|
void u_charAge(UChar32, UVersionInfo);
|
|
}
|
|
|
|
#else
|
|
|
|
#pragma clang diagnostic push
|
|
#pragma clang diagnostic ignored "-Wdocumentation"
|
|
|
|
#include <unicode/ustring.h>
|
|
#include <unicode/ucol.h>
|
|
#include <unicode/ucoleitr.h>
|
|
#include <unicode/uiter.h>
|
|
#include <unicode/ubrk.h>
|
|
#include <unicode/uchar.h>
|
|
#include <unicode/uvernum.h>
|
|
#include <unicode/uversion.h>
|
|
|
|
#pragma clang diagnostic pop
|
|
|
|
#endif
|
|
|
|
#if !defined(__APPLE__)
|
|
#include "swift/Basic/Lazy.h"
|
|
#include "swift/Runtime/Config.h"
|
|
#include "swift/Runtime/Debug.h"
|
|
|
|
#include <algorithm>
|
|
#include <mutex>
|
|
#include <assert.h>
|
|
|
|
/// Opens the collator for the "" locale and applies the fixed set of
/// attributes listed below. Crashes the process via swift::crash() if either
/// opening the collator or configuring it reports a failure.
static const UCollator *MakeRootCollator() {
  UErrorCode Status = U_ZERO_ERROR;
  UCollator *const Collator = ucol_open("", &Status);
  if (U_FAILURE(Status))
    swift::crash("ucol_open: Failure setting up default collation.");

  // The attributes are applied in this order; the shared status is inspected
  // once, after all of them have been set.
  static const struct {
    UColAttribute Attribute;
    UColAttributeValue Value;
  } Settings[] = {
      {UCOL_NORMALIZATION_MODE, UCOL_ON},
      {UCOL_STRENGTH, UCOL_TERTIARY},
      {UCOL_NUMERIC_COLLATION, UCOL_OFF},
      {UCOL_CASE_LEVEL, UCOL_OFF},
  };
  for (const auto &Setting : Settings)
    ucol_setAttribute(Collator, Setting.Attribute, Setting.Value, &Status);

  if (U_FAILURE(Status))
    swift::crash("ucol_setAttribute: Failure setting up default collation.");

  return Collator;
}
|
|
|
|
// According to this thread in the ICU mailing list, it should be safe
|
|
// to assume the UCollator object is thread safe so long as you're only
|
|
// passing it to functions that take a const pointer to it. So, we make it
|
|
// const here to make sure we don't misuse it.
|
|
// http://sourceforge.net/p/icu/mailman/message/27427062/
|
|
static const UCollator *GetRootCollator() {
|
|
return SWIFT_LAZY_CONSTANT(MakeRootCollator());
|
|
}
|
|
|
|
/// This class caches the collation element results for the ASCII subset of
|
|
/// unicode.
|
|
class ASCIICollation {
|
|
public:
|
|
friend class swift::Lazy<ASCIICollation>;
|
|
|
|
static swift::Lazy<ASCIICollation> theTable;
|
|
static const ASCIICollation *getTable() {
|
|
return &theTable.get();
|
|
}
|
|
|
|
int32_t CollationTable[128];
|
|
|
|
/// Maps an ASCII character to a collation element priority as would be
|
|
/// returned by a call to ucol_next().
|
|
int32_t map(unsigned char c) const {
|
|
return CollationTable[c];
|
|
}
|
|
|
|
private:
|
|
/// Construct the ASCII collation table.
|
|
ASCIICollation() {
|
|
const UCollator *Collator = GetRootCollator();
|
|
for (unsigned char c = 0; c < 128; ++c) {
|
|
UErrorCode ErrorCode = U_ZERO_ERROR;
|
|
intptr_t NumCollationElts = 0;
|
|
UChar Buffer[1];
|
|
Buffer[0] = c;
|
|
|
|
UCollationElements *CollationIterator =
|
|
ucol_openElements(Collator, Buffer, 1, &ErrorCode);
|
|
|
|
while (U_SUCCESS(ErrorCode)) {
|
|
intptr_t Elem = ucol_next(CollationIterator, &ErrorCode);
|
|
if (Elem != UCOL_NULLORDER) {
|
|
CollationTable[c] = Elem;
|
|
++NumCollationElts;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
ucol_closeElements(CollationIterator);
|
|
if (U_FAILURE(ErrorCode) || NumCollationElts != 1) {
|
|
swift::crash("Error setting up the ASCII collation table");
|
|
}
|
|
}
|
|
}
|
|
|
|
ASCIICollation &operator=(const ASCIICollation &) = delete;
|
|
ASCIICollation(const ASCIICollation &) = delete;
|
|
};
|
|
|
|
void *swift::_swift_stdlib_unicodeCollationIterator_create(
|
|
const __swift_uint16_t *Str, __swift_uint32_t Length) {
|
|
UErrorCode ErrorCode = U_ZERO_ERROR;
|
|
UCollationElements *CollationIterator =
|
|
ucol_openElements(GetRootCollator(), reinterpret_cast<const UChar *>(Str),
|
|
Length, &ErrorCode);
|
|
if (U_FAILURE(ErrorCode)) {
|
|
swift::crash("_swift_stdlib_unicodeCollationIterator_create: ucol_openElements() failed.");
|
|
}
|
|
return CollationIterator;
|
|
}
|
|
|
|
/// Advances the iterator and returns the value produced by ucol_next().
/// \p HitEnd is set to true when that value is UCOL_NULLORDER and to false
/// otherwise. Crashes if ICU reports a failure.
__swift_int32_t swift::_swift_stdlib_unicodeCollationIterator_next(
    void *CollationIterator, bool *HitEnd) {
  auto *Elements = static_cast<UCollationElements *>(CollationIterator);
  UErrorCode Status = U_ZERO_ERROR;
  auto Order = ucol_next(Elements, &Status);
  if (U_FAILURE(Status))
    swift::crash(
        "_swift_stdlib_unicodeCollationIterator_next: ucol_next() failed.");

  if (Order == UCOL_NULLORDER)
    *HitEnd = true;
  else
    *HitEnd = false;
  return Order;
}
|
|
|
|
void swift::_swift_stdlib_unicodeCollationIterator_delete(
|
|
void *CollationIterator) {
|
|
ucol_closeElements(static_cast<UCollationElements *>(CollationIterator));
|
|
}
|
|
|
|
/// Returns a pointer to the first entry of the cached ASCII collation table.
const __swift_int32_t *swift::_swift_stdlib_unicode_getASCIICollationTable() {
  const ASCIICollation *Cache = ASCIICollation::getTable();
  return Cache->CollationTable;
}
|
|
|
|
/// Convert the unicode string to uppercase. This function will return the
|
|
/// required buffer length as a result. If this length does not match the
|
|
/// 'DestinationCapacity' this function must be called again with a buffer of
|
|
/// the required length to get an uppercase version of the string.
|
|
int32_t
|
|
swift::_swift_stdlib_unicode_strToUpper(uint16_t *Destination,
|
|
int32_t DestinationCapacity,
|
|
const uint16_t *Source,
|
|
int32_t SourceLength) {
|
|
UErrorCode ErrorCode = U_ZERO_ERROR;
|
|
uint32_t OutputLength = u_strToUpper(reinterpret_cast<UChar *>(Destination),
|
|
DestinationCapacity,
|
|
reinterpret_cast<const UChar *>(Source),
|
|
SourceLength,
|
|
"", &ErrorCode);
|
|
if (U_FAILURE(ErrorCode) && ErrorCode != U_BUFFER_OVERFLOW_ERROR) {
|
|
swift::crash("u_strToUpper: Unexpected error uppercasing unicode string.");
|
|
}
|
|
return OutputLength;
|
|
}
|
|
|
|
/// Convert the unicode string to lowercase. This function will return the
|
|
/// required buffer length as a result. If this length does not match the
|
|
/// 'DestinationCapacity' this function must be called again with a buffer of
|
|
/// the required length to get a lowercase version of the string.
|
|
int32_t
|
|
swift::_swift_stdlib_unicode_strToLower(uint16_t *Destination,
|
|
int32_t DestinationCapacity,
|
|
const uint16_t *Source,
|
|
int32_t SourceLength) {
|
|
UErrorCode ErrorCode = U_ZERO_ERROR;
|
|
uint32_t OutputLength = u_strToLower(reinterpret_cast<UChar *>(Destination),
|
|
DestinationCapacity,
|
|
reinterpret_cast<const UChar *>(Source),
|
|
SourceLength,
|
|
"", &ErrorCode);
|
|
if (U_FAILURE(ErrorCode) && ErrorCode != U_BUFFER_OVERFLOW_ERROR) {
|
|
swift::crash("u_strToLower: Unexpected error lowercasing unicode string.");
|
|
}
|
|
return OutputLength;
|
|
}
|
|
|
|
// Out-of-line definition of the static lazy holder declared in ASCIICollation.
swift::Lazy<ASCIICollation> ASCIICollation::theTable;
|
|
#endif
|
|
|
|
namespace {
/// Reinterprets a pointer to \c From as a pointer to \c To by going through
/// \c void*, so only static_cast is needed.
template <typename To, typename From> To *ptr_cast(From *p) {
  void *untyped = p;
  return static_cast<To *>(untyped);
}

/// Const-preserving overload of the cast above.
template <typename To, typename From> const To *ptr_cast(const From *p) {
  const void *untyped = p;
  return static_cast<const To *>(untyped);
}
} // end anonymous namespace
|
|
|
|
/// Forwards to ubrk_close() after converting the shim handle to ICU's type.
void swift::__swift_stdlib_ubrk_close(
    swift::__swift_stdlib_UBreakIterator *bi) {
  UBreakIterator *iterator = ptr_cast<UBreakIterator>(bi);
  ubrk_close(iterator);
}
|
|
|
|
/// Forwards to ubrk_open(), converting each shim-typed argument to the
/// corresponding ICU type and the resulting iterator back to the shim type.
swift::__swift_stdlib_UBreakIterator *swift::__swift_stdlib_ubrk_open(
    swift::__swift_stdlib_UBreakIteratorType type, const char *locale,
    const __swift_stdlib_UChar *text, int32_t textLength,
    __swift_stdlib_UErrorCode *status) {
  const UBreakIteratorType icuType = static_cast<UBreakIteratorType>(type);
  const UChar *icuText = reinterpret_cast<const UChar *>(text);
  UErrorCode *icuStatus = ptr_cast<UErrorCode>(status);

  UBreakIterator *iterator =
      ubrk_open(icuType, locale, icuText, textLength, icuStatus);
  return ptr_cast<swift::__swift_stdlib_UBreakIterator>(iterator);
}
|
|
|
|
int32_t
|
|
swift::__swift_stdlib_ubrk_preceding(swift::__swift_stdlib_UBreakIterator *bi,
|
|
int32_t offset) {
|
|
return ubrk_preceding(ptr_cast<UBreakIterator>(bi), offset);
|
|
}
|
|
|
|
int32_t
|
|
swift::__swift_stdlib_ubrk_following(swift::__swift_stdlib_UBreakIterator *bi,
|
|
int32_t offset) {
|
|
return ubrk_following(ptr_cast<UBreakIterator>(bi), offset);
|
|
}
|
|
|
|
/// Forwards to ubrk_setText(), converting the iterator, text, and status
/// pointers from their shim types to ICU's types.
void swift::__swift_stdlib_ubrk_setText(
    swift::__swift_stdlib_UBreakIterator *bi, const __swift_stdlib_UChar *text,
    __swift_int32_t textLength, __swift_stdlib_UErrorCode *status) {
  UBreakIterator *iterator = ptr_cast<UBreakIterator>(bi);
  const UChar *icuText = ptr_cast<UChar>(text);
  UErrorCode *icuStatus = ptr_cast<UErrorCode>(status);
  ubrk_setText(iterator, icuText, textLength, icuStatus);
}
|
|
|
|
/// Forwards to unorm2_hasBoundaryBefore() and returns its result unchanged.
swift::__swift_stdlib_UBool swift::__swift_stdlib_unorm2_hasBoundaryBefore(
    const __swift_stdlib_UNormalizer2 *ptr, __swift_stdlib_UChar32 char32) {
  const UNormalizer2 *normalizer = ptr_cast<UNormalizer2>(ptr);
  return unorm2_hasBoundaryBefore(normalizer, char32);
}
|
|
/// Forwards to unorm2_getNFCInstance() and returns the resulting normalizer
/// converted to the shim's handle type.
const swift::__swift_stdlib_UNormalizer2 *
swift::__swift_stdlib_unorm2_getNFCInstance(__swift_stdlib_UErrorCode *err) {
  UErrorCode *icuStatus = ptr_cast<UErrorCode>(err);
  const UNormalizer2 *normalizer = unorm2_getNFCInstance(icuStatus);
  return ptr_cast<__swift_stdlib_UNormalizer2>(normalizer);
}
|
|
|
|
int32_t swift::__swift_stdlib_unorm2_normalize(
|
|
const __swift_stdlib_UNormalizer2 *norm, const __swift_stdlib_UChar *src,
|
|
__swift_int32_t len, __swift_stdlib_UChar *dst, __swift_int32_t capacity,
|
|
__swift_stdlib_UErrorCode *err) {
|
|
// TODO remove this compatibility when we require ICU >= 60 on Linux
|
|
#if defined(__APPLE__) || U_ICU_VERSION_MAJOR_NUM >= 60
|
|
return unorm2_normalize(ptr_cast<UNormalizer2>(norm), src, len, dst, capacity,
|
|
ptr_cast<UErrorCode>(err));
|
|
#else
|
|
return unorm2_normalize(ptr_cast<UNormalizer2>(norm),
|
|
reinterpret_cast<const UChar *>(src), len,
|
|
reinterpret_cast<UChar *>(dst), capacity,
|
|
ptr_cast<UErrorCode>(err));
|
|
#endif
|
|
}
|
|
|
|
/// Forwards to unorm2_spanQuickCheckYes() and returns its result unchanged.
__swift_int32_t swift::__swift_stdlib_unorm2_spanQuickCheckYes(
    const __swift_stdlib_UNormalizer2 *norm, const __swift_stdlib_UChar *ptr,
    __swift_int32_t len, __swift_stdlib_UErrorCode *err) {
  const UNormalizer2 *normalizer = ptr_cast<UNormalizer2>(norm);
  const UChar *text = ptr_cast<UChar>(ptr);
  UErrorCode *icuStatus = ptr_cast<UErrorCode>(err);
  return unorm2_spanQuickCheckYes(normalizer, text, len, icuStatus);
}
|
|
|
|
/// Forwards to u_hasBinaryProperty(), converting the shim property value to
/// ICU's UProperty, and returns the result unchanged.
swift::__swift_stdlib_UBool
swift::__swift_stdlib_u_hasBinaryProperty(__swift_stdlib_UChar32 c,
                                          __swift_stdlib_UProperty p) {
  const UProperty property = static_cast<UProperty>(p);
  return u_hasBinaryProperty(c, property);
}
|
|
|
|
/// Forwards to u_isdefined() and returns its result.
swift::__swift_stdlib_UBool
swift::__swift_stdlib_u_isdefined(UChar32 c) {
  const auto defined = u_isdefined(c);
  return defined;
}
|
|
|
|
void
|
|
swift::__swift_stdlib_u_charAge(__swift_stdlib_UChar32 c,
|
|
__swift_stdlib_UVersionInfo versionInfo) {
|
|
return u_charAge(c, versionInfo);
|
|
}
|
|
|
|
|
|
// Force an autolink with ICU
|
|
#if defined(__MACH__)
|
|
asm(".linker_option \"-licucore\"\n");
|
|
#endif // defined(__MACH__)
|
|
|