Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock
Original file line number Diff line number Diff line change
Expand Up @@ -6085,6 +6085,7 @@ checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493"
name = "unicode-table-generator"
version = "0.1.0"
dependencies = [
"rustc-hash 2.1.1",
"ucd-parse",
]

Expand Down
100 changes: 100 additions & 0 deletions library/alloc/src/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,106 @@ impl str {
s
}

/// Returns the case-folded equivalent of this string slice, as a new [`String`].
///
/// Case folding is a transformation, mostly matching lowercase, that is meant to be used
/// for case-insensitive string comparisons. Case-folded strings should not usually
/// be exposed directly to users.
///
/// For the precise specification of case folding, see
/// [Chapter 3 (Conformance)](https://www.unicode.org/versions/latest/core-spec/chapter-3/#G63737)
/// of the Unicode standard.
///
/// Since some characters can expand into multiple characters when case folding,
/// this function returns a [`String`] instead of modifying the parameter in-place.
///
/// This function does not perform any [normalization] (e.g. NFC),
/// so semantically and visually identical strings may compare unequal.
///
/// Like [`char::to_casefold()`] this method does not handle language-specific
/// casing, like Turkish and Azeri I/ı/İ/i. See that method's documentation
/// for more information.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// #![feature(casefold)]
/// let s0 = "HELLO";
/// let s1 = "Hello";
///
/// assert_eq!(s0.to_casefold(), s1.to_casefold());
/// assert_eq!(s0.to_casefold(), "hello")
/// ```
///
/// Scripts without case are not changed:
///
/// ```
/// #![feature(casefold)]
/// let new_year = "农历新年";
///
/// assert_eq!(new_year, new_year.to_casefold());
/// ```
///
/// One character can become multiple:
///
/// ```
/// #![feature(casefold)]
/// let s0 = "TSCHÜẞ";
Comment thread
jhpratt marked this conversation as resolved.
/// let s1 = "TSCHÜSS";
/// let s2 = "tschüß";
///
/// assert_eq!(s0.to_casefold(), s1.to_casefold());
/// assert_eq!(s0.to_casefold(), s2.to_casefold());
/// assert_eq!(s0.to_casefold(), "tschüss");
/// ```
///
/// No NFC [normalization] is performed:
///
/// ```rust
/// #![feature(casefold)]
/// // These two strings are visually and semantically identical...
/// let comp = "Á";
/// let decomp = "Á";
///
/// // ... but not codepoint-for-codepoint equal.
/// assert_eq!(comp, "\u{C1}");
/// assert_eq!(decomp, "A\u{0301}");
///
/// // Their case-foldings are likewise unequal:
/// assert_eq!(comp.to_casefold(), "\u{E1}");
/// assert_eq!(decomp.to_casefold(), "a\u{0301}");
/// ```
///
/// [normalization]: https://www.unicode.org/faq/normalization
#[cfg(not(no_global_oom_handling))]
#[rustc_allow_incoherent_impl]
#[must_use = "this returns the case-folded string as a new String, \
without modifying the original"]
#[unstable(feature = "casefold", issue = "none")]
pub fn to_casefold(&self) -> String {
// SAFETY: `to_ascii_lowercase` preserves ASCII bytes, so the converted
// prefix remains valid UTF-8.
let (mut s, rest) = unsafe { convert_while_ascii(self, u8::to_ascii_lowercase) };

for c in rest.chars() {
match conversions::to_casefold(c) {
[a, '\0', _] => s.push(a),
[a, b, '\0'] => {
s.push(a);
s.push(b);
}
[a, b, c] => {
s.push(a);
s.push(b);
s.push(c);
}
}
}
s
}

/// Converts a [`Box<str>`] into a [`String`] without copying or allocating.
///
/// # Examples
Expand Down
1 change: 1 addition & 0 deletions library/alloctests/tests/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#![feature(const_heap)]
#![feature(deque_extend_front)]
#![feature(iter_array_chunks)]
#![feature(casefold)]
#![feature(cow_is_borrowed)]
#![feature(core_intrinsics)]
#![feature(downcast_unchecked)]
Expand Down
8 changes: 7 additions & 1 deletion library/alloctests/tests/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1867,7 +1867,13 @@ fn to_lowercase() {
#[test]
fn to_uppercase() {
    // (input, expected uppercase) pairs; several inputs expand to more than one `char`.
    let cases = [
        ("", ""),
        ("aéDžßfiᾀ", "AÉDŽSSFIἈΙ"),
        ("aéDžßẞfiᾀ", "AÉDŽSSẞFIἈΙ"),
    ];

    for (input, expected) in cases {
        assert_eq!(input.to_uppercase(), expected);
    }
}

#[test]
fn to_casefold() {
    // Folding nothing yields nothing.
    let empty = "";
    assert_eq!(empty.to_casefold(), "");

    // Covers, in order: a lowercase Cherokee letter (folds to its uppercase form), the "fi"
    // ligature, a precomposed Greek letter with iota subscript, the combining iota subscript
    // U+0345, both sharp-S forms, and both capital and final sigma.
    let mixed = "ꮿfiῲὼ\u{0345}ßẞΣς";
    let folded = "Ꮿfiὼιὼιssssσσ";
    assert_eq!(mixed.to_casefold(), folded);
}

#[test]
Expand Down
130 changes: 112 additions & 18 deletions library/core/src/char/methods.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1075,16 +1075,17 @@ impl char {
}

/// Returns `true` if this `char` has the `Case_Ignorable` property. This narrow-use property
/// is used to implement context-dependent casing for the Greek letter sigma (uppercase Σ),
/// is used to implement context-dependent casing for the Greek letter sigma (uppercase 'Σ'),
/// which has two lowercase forms.
///
/// `Case_Ignorable` is [described][D136] in Chapter 3 (Conformance) of the Unicode Core Specification,
/// and specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`];
/// see those resources for more information.
/// and specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
/// See those resources, as well as [`to_lowercase()`]'s documentation, for more information.
///
/// [D136]: https://www.unicode.org/versions/latest/core-spec/chapter-3/#G63116
/// [ucd]: https://www.unicode.org/reports/tr44/
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
/// [`to_lowercase()`]: Self::to_lowercase()
#[must_use]
#[inline]
#[unstable(feature = "case_ignorable", issue = "154848")]
Expand Down Expand Up @@ -1154,8 +1155,6 @@ impl char {
/// If this `char` expands to multiple `char`s, the iterator yields the `char`s given by
/// [`SpecialCasing.txt`]. The maximum number of `char`s in a case mapping is 3.
///
/// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt
///
/// This operation performs an unconditional mapping without tailoring. That is, the conversion
/// is independent of context and language. See [below](#notes-on-context-and-locale)
/// for more information.
Expand Down Expand Up @@ -1210,28 +1209,39 @@ impl char {
///
/// ## Greek sigma
///
/// In Greek, the letter sigma (uppercase Σ) has two lowercase forms:
/// ς which is used only at the end of a word, and σ which is used everywhere else.
/// `to_lowercase()` always uses the second form:
/// In Greek, the letter sigma (uppercase 'Σ') has two lowercase forms:
/// 'σ' which is used in most situations, and 'ς' which appears only
/// at the end of a word. [`char::to_lowercase()`] always uses the first form:
///
/// ```
/// assert_eq!('Σ'.to_lowercase().to_string(), "σ");
/// ```
///
/// `str::to_lowercase()` (only available with the `alloc` crate)
/// *does* properly handle this contextual mapping,
/// so prefer using that method if you can. Alternatively, you can use
/// [`is_cased()`] and [`is_case_ignorable()`] to implement it yourself.
/// See `Final_Sigma` in [Table 3.17] of the Unicode Standard,
/// along with [`SpecialCasing.txt`], for more details.
///
/// [`is_cased()`]: Self::is_cased()
/// [`is_case_ignorable()`]: Self::is_case_ignorable()
/// [Table 3.17]: https://www.unicode.org/versions/latest/core-spec/chapter-3/#G54277
///
/// ## Turkish and Azeri I/ı/İ/i
///
/// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two:
///
/// * 'Dotless': I / ı, sometimes written ï
/// * 'Dotted': İ / i
///
/// Note that the uppercase undotted 'I' is the same as the Latin. Therefore:
/// Note that the uppercase undotted 'I' is the same codepoint as the Latin. Therefore:
///
/// ```
/// let lower_i = 'I'.to_lowercase().to_string();
/// ```
///
/// The value of `lower_i` here relies on the language of the text: if we're
/// `'I'`'s correct lowercase relies on the language of the text: if we're
/// in `en-US`, it should be `"i"`, but if we're in `tr-TR` or `az-AZ`, it should
/// be `"ı"`. `to_lowercase()` does not take this into account, and so:
///
Expand All @@ -1242,6 +1252,8 @@ impl char {
/// ```
///
/// holds across languages.
///
/// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt
#[must_use = "this returns the lowercased character as a new iterator, \
without modifying the original"]
#[stable(feature = "rust1", since = "1.0.0")]
Expand Down Expand Up @@ -1392,22 +1404,22 @@ impl char {
/// As stated above, this method is locale-insensitive.
/// If you need locale support, consider using an external crate,
/// like [`icu_casemap`](https://crates.io/crates/icu_casemap)
/// which is developed by Unicode. A description of a common
/// locale-dependent casing issue follows:
/// which is developed by Unicode. A description of one common
/// locale-dependent casing issue follows (there are others):
///
/// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two:
///
/// * 'Dotless': I / ı, sometimes written ï
/// * 'Dotted': İ / i
///
/// Note that the lowercase dotted 'i' is the same as the Latin. Therefore:
/// Note that the lowercase dotted 'i' is the same codepoint as the Latin. Therefore:
///
/// ```
/// #![feature(titlecase)]
/// let upper_i = 'i'.to_titlecase().to_string();
/// ```
///
/// The value of `upper_i` here relies on the language of the text: if we're
/// `'i'`'s correct titlecase relies on the language of the text: if we're
/// in `en-US`, it should be `"I"`, but if we're in `tr-TR` or `az-AZ`, it should
/// be `"İ"`. `to_titlecase()` does not take this into account, and so:
///
Expand Down Expand Up @@ -1504,21 +1516,21 @@ impl char {
/// As stated above, this method is locale-insensitive.
/// If you need locale support, consider using an external crate,
/// like [`icu_casemap`](https://crates.io/crates/icu_casemap)
/// which is developed by Unicode. A description of a common
/// locale-dependent casing issue follows:
/// which is developed by Unicode. A description of one common
/// locale-dependent casing issue follows (there are others):
///
/// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two:
///
/// * 'Dotless': I / ı, sometimes written ï
/// * 'Dotted': İ / i
///
/// Note that the lowercase dotted 'i' is the same as the Latin. Therefore:
/// Note that the lowercase dotted 'i' is the same codepoint as the Latin. Therefore:
///
/// ```
/// let upper_i = 'i'.to_uppercase().to_string();
/// ```
///
/// The value of `upper_i` here relies on the language of the text: if we're
/// `'i'`'s correct uppercase relies on the language of the text: if we're
/// in `en-US`, it should be `"I"`, but if we're in `tr-TR` or `az-AZ`, it should
/// be `"İ"`. `to_uppercase()` does not take this into account, and so:
///
Expand All @@ -1539,6 +1551,88 @@ impl char {
ToUppercase(CaseMappingIter::new(conversions::to_upper(self)))
}

/// Returns an iterator that yields the case folding of this `char` as one or more
/// `char`s.
///
/// Case folding exists to support case-insensitive string comparisons; case-folded
/// text should not usually be exposed directly to users. For most, but not all,
/// characters, the casefold mapping is identical to the lowercase one.
///
/// The iterator yields the `char`(s) in the common or full case folding for this `char`,
/// as given by the [Unicode Character Database][ucd] [`CaseFolding.txt`].
/// The maximum number of `char`s in a case folding is 3.
///
/// This operation performs an unconditional mapping without tailoring. That is, the conversion
/// is independent of context and language.
///
/// It also does not perform any [normalization] (e.g. NFC).
///
/// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case folding in
/// general and Chapter 3 (Conformance) discusses the default algorithm for case folding.
///
/// # Examples
///
/// The German sharp S `'ß'` (U+00DF) is a single Unicode code point
/// that casefolds to `"ss"`. Its uppercase variant `'ẞ'` (U+1E9E)
/// has the same case-folding.
///
/// As an iterator:
///
/// ```
/// #![feature(casefold)]
/// assert!('ß'.to_casefold().eq(['s', 's']));
/// assert!('ẞ'.to_casefold().eq(['s', 's']));
/// ```
///
/// Using [`to_string`](../std/string/trait.ToString.html#tymethod.to_string):
///
/// ```
/// #![feature(casefold)]
/// assert_eq!('ß'.to_casefold().to_string(), "ss");
/// assert_eq!('ẞ'.to_casefold().to_string(), "ss");
/// ```
///
/// # Note on locale
///
/// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two:
///
/// * 'Dotless': I / ı, sometimes written ï
/// * 'Dotted': İ / i
///
/// Note that the uppercase undotted 'I' is the same codepoint as the Latin. Therefore:
///
/// ```
/// #![feature(casefold)]
/// let casefold_i = 'I'.to_casefold().to_string();
/// ```
///
/// `'I'`'s correct case folding relies on the language of the text: if we're
/// in `en-US`, it should be `"i"`, but if we're in `tr-TR` or `az-AZ`, it should
/// be `"ı"`. `to_casefold()` does not take this into account, and so:
///
/// ```
/// #![feature(casefold)]
/// let casefold_i = 'I'.to_casefold().to_string();
///
/// assert_eq!(casefold_i, "i");
/// ```
///
/// holds across languages.
///
/// [ucd]: https://www.unicode.org/reports/tr44/
/// [`CaseFolding.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt
/// [normalization]: https://www.unicode.org/faq/normalization
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
#[must_use = "this returns the case-folded character as a new iterator, \
without modifying the original"]
#[unstable(feature = "casefold", issue = "none")]
#[inline]
pub fn to_casefold(self) -> ToCasefold {
    // Look up the folding first, then hand it to the shared case-mapping iterator.
    let folded = conversions::to_casefold(self);
    ToCasefold(CaseMappingIter::new(folded))
}

/// Checks if the value is within the ASCII range.
///
/// # Examples
Expand Down
15 changes: 15 additions & 0 deletions library/core/src/char/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,21 @@ casemappingiter_impls! {
ToLowercase
}

// NOTE(review): the five identical `#[unstable]` attributes below look like positional
// arguments to `casemappingiter_impls!` (presumably one per item or impl the macro
// generates) rather than accidental duplicates — confirm against the macro definition
// before collapsing them.
casemappingiter_impls! {
#[unstable(feature = "casefold", issue = "none")]
#[unstable(feature = "casefold", issue = "none")]
#[unstable(feature = "casefold", issue = "none")]
#[unstable(feature = "casefold", issue = "none")]
#[unstable(feature = "casefold", issue = "none")]
/// Returns an iterator that yields the case-folded equivalent of a `char`.
///
/// This `struct` is created by the [`to_casefold`] method on [`char`]. See
/// its documentation for more.
///
/// [`to_casefold`]: char::to_casefold
ToCasefold
}

/// Private backing iterator wrapped by the public case-mapping iterators
/// (for example `ToUppercase` and `ToCasefold`).
///
/// Holds a by-value array iterator over at most three `char`s, matching the
/// documented maximum length of a case mapping.
// NOTE(review): how unused trailing slots are trimmed is decided in `CaseMappingIter::new`,
// which is not shown here — confirm there.
#[derive(Debug, Clone)]
struct CaseMappingIter(core::array::IntoIter<char, 3>);

Expand Down
Loading
Loading