Files
swift-mirror/include/swift/Basic/Unicode.h
Becca Royal-Gordon 4bd532ab9a Don't import string macros with invalid UTF-8
Swift string literals are only permitted to contain well-formed UTF-8, but C does not share this restriction, and ClangImporter wasn't checking for that before it created `StringLiteralExpr`s for imported macros; this could cause crashes when importing a header. This commit makes us drop these macros instead.

Although invalid UTF-8 always *did* cause a segfault in my testing, I'm not convinced that there isn't a way to cause a miscompile with a bug like this. If we somehow did generate code that fed ill-formed UTF-8 to the builtin literal init for Swift.String, the resulting string could cause undefined behavior at runtime. So I have additionally added a defensive assertion to StringLiteralInst that any UTF-8 string represented in SIL is well-formed. Hopefully that will catch any non-crashing compiler bugs like this one.

Fixes rdar://67840900.
2022-01-26 20:57:13 -08:00

81 lines
2.5 KiB
C++

//===--- Unicode.h - Unicode utilities --------------------------*- C++ -*-===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
#ifndef SWIFT_BASIC_UNICODE_H
#define SWIFT_BASIC_UNICODE_H
#include "swift/Basic/LLVM.h"
#include "llvm/ADT/StringRef.h"
namespace swift {
namespace unicode {
/// Extracts the first extended grapheme cluster of \p S.
///
/// The definition lives outside this header. The only contract visible here
/// is the one relied upon by isSingleExtendedGraphemeCluster(): the result may
/// be empty, and it is compared against \p S for equality.
/// NOTE(review): presumably the result is a prefix of \p S and is empty only
/// when no cluster could be extracted -- confirm against the implementation.
StringRef extractFirstExtendedGraphemeCluster(StringRef S);
/// Returns true if \p S is made up of exactly one extended grapheme cluster.
///
/// That is the case when the first cluster extracted from \p S is non-empty
/// and compares equal to the whole of \p S. An empty extraction result always
/// yields false.
static inline bool isSingleExtendedGraphemeCluster(StringRef S) {
  const StringRef LeadingCluster = extractFirstExtendedGraphemeCluster(S);
  return !LeadingCluster.empty() && LeadingCluster == S;
}
/// Values of the Grapheme_Cluster_Break property, as returned by
/// getGraphemeClusterBreakProperty() for a code point.
///
/// The enumerators are stored in a uint8_t. Their numeric values are
/// significant: isExtendedGraphemeClusterBoundary() uses the value of the
/// first property as a row index into
/// ExtendedGraphemeClusterNoBoundaryRulesMatrix and the value of the second
/// property as a bit position within that row, so reordering or inserting
/// enumerators requires the matrix to be updated to match.
enum class GraphemeClusterBreakProperty : uint8_t {
Other,
CR,
LF,
Control,
Extend,
Regional_Indicator,
Prepend,
SpacingMark,
// NOTE(review): L, V, T, LV and LVT presumably are the Hangul syllable
// classes of the Unicode segmentation rules -- confirm against the data
// tables that populate this enum.
L,
V,
T,
LV,
LVT,
};
/// Extended grapheme cluster boundary rules, represented as a bit matrix.
///
/// Each element is one row, selected by the GraphemeClusterBreakProperty value
/// of the first code point. Within a row, bit N (counting from the least
/// significant bit) corresponds to the GraphemeClusterBreakProperty value N of
/// the second code point. A set bit means that a boundary is prohibited
/// between the two code points.
///
/// Only declared here; the definition (and therefore the number of rows) lives
/// in another translation unit.
extern const uint16_t ExtendedGraphemeClusterNoBoundaryRulesMatrix[];
/// Returns the value of the Grapheme_Cluster_Break property for a given code
/// point.
///
/// \param C the code point to classify.
/// \returns the GraphemeClusterBreakProperty enumerator for \p C.
/// NOTE(review): behavior for values of \p C outside the Unicode code point
/// range is not visible from this header -- confirm against the definition.
GraphemeClusterBreakProperty getGraphemeClusterBreakProperty(uint32_t C);
/// Reports whether an extended grapheme cluster boundary lies between two
/// adjacent code points, given their Grapheme_Cluster_Break property values.
///
/// \param GCB1 property value of the first (preceding) code point; selects the
///        row of ExtendedGraphemeClusterNoBoundaryRulesMatrix.
/// \param GCB2 property value of the second (following) code point; selects
///        the bit within that row, counting from the least significant bit.
/// \returns true when the selected bit is clear (a boundary is allowed), false
///          when it is set (a boundary is prohibited).
static inline bool
isExtendedGraphemeClusterBoundary(GraphemeClusterBreakProperty GCB1,
                                  GraphemeClusterBreakProperty GCB2) {
  const unsigned RowIndex = static_cast<unsigned>(GCB1);
  const unsigned BitIndex = static_cast<unsigned>(GCB2);
  const uint16_t NoBoundaryBits =
      ExtendedGraphemeClusterNoBoundaryRulesMatrix[RowIndex];
  // A set bit in the row forbids a break between the two code points.
  const bool BoundaryProhibited = (NoBoundaryBits & (1 << BitIndex)) != 0;
  return !BoundaryProhibited;
}
/// NOTE(review): presumably returns true if \p S encodes exactly one Unicode
/// scalar. The definition is not in this header, so the handling of empty or
/// ill-formed input cannot be stated here -- confirm against the
/// implementation.
bool isSingleUnicodeScalar(StringRef S);
/// NOTE(review): presumably decodes and returns the first Unicode scalar
/// encoded at the start of \p S. Preconditions (e.g. whether \p S may be empty
/// or ill-formed) are not visible from this header -- confirm against the
/// implementation before relying on them.
unsigned extractFirstUnicodeScalar(StringRef S);
/// Returns true if \p S does not contain any ill-formed subsequences. This does
/// not check whether all of the characters in it are actually allocated or
/// used correctly; it just checks that every byte (UTF-8 code unit) can be
/// grouped into a well-formed encoding of a Unicode scalar.
bool isWellFormedUTF8(StringRef S);
} // end namespace unicode
} // end namespace swift
#endif // SWIFT_BASIC_UNICODE_H