From c6bf0acbc502661f9630d9ff0fd0a86fcd4ba37b Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Thu, 2 Apr 2026 00:09:54 -0400 Subject: [PATCH 1/5] Add `char::to_casefold()` --- Cargo.lock | 1 + library/core/src/char/methods.rs | 79 ++++++++++++++++ library/core/src/char/mod.rs | 15 +++ library/core/src/unicode/unicode_data.rs | 82 +++++++++++++++- library/coretests/tests/char.rs | 35 +++++++ library/coretests/tests/lib.rs | 1 + library/coretests/tests/unicode.rs | 14 +++ library/coretests/tests/unicode/test_data.rs | 91 ++++++++++++++++++ src/tools/unicode-table-generator/Cargo.toml | 1 + .../src/cascading_map.rs | 5 +- .../src/case_mapping.rs | 85 +++++++++++++++-- src/tools/unicode-table-generator/src/main.rs | 94 +++++++++++++++++-- .../src/raw_emitter.rs | 17 ++-- .../src/unicode_download.rs | 9 +- 14 files changed, 498 insertions(+), 31 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bac0aeb37c600..bfc5598bea709 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6085,6 +6085,7 @@ checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" name = "unicode-table-generator" version = "0.1.0" dependencies = [ + "rustc-hash 2.1.1", "ucd-parse", ] diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index 00b735e91a377..2fb70214a9c34 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -1539,6 +1539,85 @@ impl char { ToUppercase(CaseMappingIter::new(conversions::to_upper(self))) } + /// Returns an iterator that yields the case folding of this `char` as one or more + /// `char`s. + /// + /// Case folding is meant to be used when performing case-insensitive string comparisons, + /// but case-folded strings should not generally be exposed directly to users. For most, + /// but not all, characters, the casefold mapping is identical to the lowercase one. 
+ /// + /// This iterator yields the `char`(s) in the common or full case folding for this `char`, + /// as given by the [Unicode Character Database][ucd] [`CaseFolding.txt`]. + /// + /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [`CaseFolding.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt + /// + /// This operation performs an unconditional mapping without tailoring. That is, the conversion + /// is independent of context and language. + /// + /// It also does not perform any normalization (e.g. NFC). + /// + /// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case folding in + /// general and Chapter 3 (Conformance) discusses the default algorithm for case folding. + /// + /// [Unicode Standard]: https://www.unicode.org/versions/latest/ + /// + /// # Examples + /// + /// The German sharp S `'ß'` (U+00DF) is a single Unicode code point + /// that casefolds to `"ss"`. Its uppercase variant `'ẞ'` (U+1E9E) + /// has the same case folding. + /// + /// As an iterator: + /// + /// ``` + /// #![feature(casefold)] + /// assert!('ß'.to_casefold().eq(['s', 's'])); + /// assert!('ẞ'.to_casefold().eq(['s', 's'])); + /// ``` + /// + /// Using [`to_string`](../std/string/trait.ToString.html#tymethod.to_string): + /// + /// ``` + /// #![feature(casefold)] + /// assert_eq!('ß'.to_casefold().to_string(), "ss"); + /// assert_eq!('ẞ'.to_casefold().to_string(), "ss"); + /// ``` + /// + /// # Note on locale + /// + /// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two: + /// + /// * 'Dotless': I / ı, sometimes written ï + /// * 'Dotted': İ / i + /// + /// Note that the uppercase undotted 'I' is the same as the Latin. 
Therefore: + /// + /// ``` + /// #![feature(casefold)] + /// let casefold_i = 'I'.to_casefold().to_string(); + /// ``` + /// + /// The value of `casefold_i` here relies on the language of the text: if we're + /// in `en-US`, it should be `"i"`, but if we're in `tr-TR` or `az-AZ`, it should + /// be `"ı"`. `to_casefold()` does not take this into account, and so: + /// + /// ``` + /// #![feature(casefold)] + /// let casefold_i = 'I'.to_casefold().to_string(); + /// + /// assert_eq!(casefold_i, "i"); + /// ``` + /// + /// holds across languages. + #[must_use = "this returns the case-folded character as a new iterator, \ + without modifying the original"] + #[unstable(feature = "casefold", issue = "none")] + #[inline] + pub fn to_casefold(self) -> ToCasefold { + ToCasefold(CaseMappingIter::new(conversions::to_casefold(self))) + } + /// Checks if the value is within the ASCII range. /// /// # Examples diff --git a/library/core/src/char/mod.rs b/library/core/src/char/mod.rs index 3231c4193064c..7e10c46265d1b 100644 --- a/library/core/src/char/mod.rs +++ b/library/core/src/char/mod.rs @@ -516,6 +516,21 @@ casemappingiter_impls! { ToLowercase } +casemappingiter_impls! { + #[unstable(feature = "casefold", issue = "none")] + #[unstable(feature = "casefold", issue = "none")] + #[unstable(feature = "casefold", issue = "none")] + #[unstable(feature = "casefold", issue = "none")] + #[unstable(feature = "casefold", issue = "none")] + /// Returns an iterator that yields the case-folded equivalent of a `char`. + /// + /// This `struct` is created by the [`to_casefold`] method on [`char`]. See + /// its documentation for more. 
+ /// + /// [`to_casefold`]: char::to_casefold + ToCasefold +} + #[derive(Debug, Clone)] struct CaseMappingIter(core::array::IntoIter); diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs index 83d3808051840..729234506f60c 100644 --- a/library/core/src/unicode/unicode_data.rs +++ b/library/core/src/unicode/unicode_data.rs @@ -10,7 +10,8 @@ // to_lower : 1112 bytes, 1462 codepoints in 185 ranges (U+0000C0 - U+01E921) using 2-level LUT // to_upper : 1998 bytes, 1554 codepoints in 299 ranges (U+0000B5 - U+01E943) using 2-level LUT // to_title : 340 bytes, 135 codepoints in 49 ranges (U+0000DF - U+00FB17) using 2-level LUT -// Total : 9629 bytes +// to_casefold : 32 bytes, 174 codepoints in 5 ranges (U+000131 - U+00ABBF) using 2-level LUT +// Total : 9661 bytes #[inline(always)] const fn bitset_search< @@ -846,7 +847,7 @@ pub mod conversions { } pub fn to_lower(c: char) -> [char; 3] { - // https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AChanges_When_Lowercased%253A%5D-%5B%253AASCII%253A%5D&abb=on + // https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=[:Changes_When_Lowercased:]-[:ASCII:]&abb=on if c < '\u{C0}' { return [c.to_ascii_lowercase(), '\0', '\0']; } @@ -855,7 +856,7 @@ pub mod conversions { } pub fn to_upper(c: char) -> [char; 3] { - // https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AChanges_When_Uppercased%253A%5D-%5B%253AASCII%253A%5D&abb=on + // https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=[:Changes_When_Uppercased:]-[:ASCII:]&abb=on if c < '\u{B5}' { return [c.to_ascii_uppercase(), '\0', '\0']; } @@ -864,7 +865,7 @@ pub mod conversions { } pub fn to_title(c: char) -> [char; 3] { - // https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AChanges_When_Titlecased%253A%5D-%5B%253AASCII%253A%5D&abb=on + // https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=[:Changes_When_Titlecased:]-[:ASCII:]&abb=on if c < '\u{B5}' { return 
[c.to_ascii_uppercase(), '\0', '\0']; } @@ -872,6 +873,59 @@ pub mod conversions { lookup(c, &TITLECASE_LUT).or_else(|| lookup(c, &UPPERCASE_LUT)).unwrap_or([c, '\0', '\0']) } + pub fn to_casefold(c: char) -> [char; 3] { + // https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=[:Changes_When_Casefolded:]-[:ASCII:]&abb=on + if c < '\u{B5}' { + return [c.to_ascii_lowercase(), '\0', '\0']; + } + + + lookup(c, &CASEFOLD_LUT).unwrap_or_else(|| { + // fall back to lowercase of uppercase + + let uppercase = lookup(c, &UPPERCASE_LUT).unwrap_or([c, '\0', '\0']); + let mut final_result = to_lower(uppercase[0]); + if uppercase[1] != '\0' { + let lowercase_1 = to_lower(uppercase[1]); + debug_assert_eq!(lowercase_1[2], '\0'); + + // If, after updating the Unicode data + // to a new Unicode version, the below + // assertion starts to fail in tests, + // delete it, and uncomment the + // `if` condition and corresponding + // `else` block below it. + debug_assert_eq!(final_result[1], '\0'); + //if final_result[1] == '\0' { + + final_result[1] = lowercase_1[0]; + + if uppercase[2] != '\0' { + debug_assert_eq!(lowercase_1[1], '\0'); + let lowercase_2 = to_lower(uppercase[2]); + debug_assert_eq!(lowercase_2[1], '\0'); + debug_assert_eq!(lowercase_2[2], '\0'); + final_result[2] = lowercase_2[0]; + } else { + // If, after updating the Unicode data + // to a new Unicode version, the below + // assertion starts to fail in tests, + // delete it and uncomment the line + // below it. 
+ debug_assert_eq!(lowercase_1[1], '\0'); + //final_result[2] = lowercase_1[1]; + } + + /*} else { + final_result[2] = lowercase_1[0]; + debug_assert_eq!(lowercase_1[1], '\0'); + debug_assert_eq!(uppercase[2], '\0') + }*/ + } + final_result + }) + } + static LOWERCASE_LUT: L1Lut = L1Lut { l2_luts: [ L2Lut { @@ -1188,4 +1242,24 @@ pub mod conversions { }, ], }; + + static CASEFOLD_LUT: L1Lut = L1Lut { + l2_luts: [ + L2Lut { + singles: &[ // 4 entries, 24 bytes + (Range::singleton(0x0131), 0), (Range::step_by_1(0x13a0..=0x13f5), 0), + (Range::step_by_1(0x13f8..=0x13fd), -8), (Range::step_by_1(0xab70..=0xabbf), 26672), + ], + multis: &[ // 1 entries, 8 bytes + (0x1e9e, [0x0073, 0x0073, 0x0000]), + ], + }, + L2Lut { + singles: &[ // 0 entries, 0 bytes + ], + multis: &[ // 0 entries, 0 bytes + ], + }, + ], + }; } diff --git a/library/coretests/tests/char.rs b/library/coretests/tests/char.rs index 877017f682c97..3c43e4db7b330 100644 --- a/library/coretests/tests/char.rs +++ b/library/coretests/tests/char.rs @@ -212,6 +212,41 @@ fn test_to_uppercase() { assert_eq!(upper('ᾀ'), "ἈΙ"); } +#[test] +fn test_to_casefold() { + fn fold(c: char) -> String { + let to_casefold = c.to_casefold(); + assert_eq!(to_casefold.len(), to_casefold.count()); + let iter: String = c.to_casefold().collect(); + let disp: String = c.to_casefold().to_string(); + assert_eq!(iter, disp); + let iter_rev: String = c.to_casefold().rev().collect(); + let disp_rev: String = disp.chars().rev().collect(); + assert_eq!(iter_rev, disp_rev); + iter + } + assert_eq!(fold('A'), "a"); + assert_eq!(fold('Ö'), "ö"); + assert_eq!(fold('ß'), "ss"); + assert_eq!(fold('ẞ'), "ss"); + assert_eq!(fold('Ü'), "ü"); + assert_eq!(fold('💩'), "💩"); + assert_eq!(fold('Σ'), "σ"); + assert_eq!(fold('ς'), "σ"); + assert_eq!(fold('Τ'), "τ"); + assert_eq!(fold('Ι'), "ι"); + assert_eq!(fold('Γ'), "γ"); + assert_eq!(fold('Μ'), "μ"); + assert_eq!(fold('Α'), "α"); + assert_eq!(fold('Dž'), "dž"); + assert_eq!(fold('fi'), "fi"); + 
assert_eq!(fold('İ'), "i\u{307}"); + assert_eq!(fold('ꮿ'), "Ꮿ"); + assert_eq!(fold('Ꮿ'), "Ꮿ"); + assert_eq!(fold('ῲ'), "ὼι"); + assert_eq!(fold('\u{0345}'), "ι"); +} + #[test] fn test_is_control() { assert!('\u{0}'.is_control()); diff --git a/library/coretests/tests/lib.rs b/library/coretests/tests/lib.rs index 2c561b5b0529e..cd5f4d80039f4 100644 --- a/library/coretests/tests/lib.rs +++ b/library/coretests/tests/lib.rs @@ -10,6 +10,7 @@ #![feature(bool_to_result)] #![feature(borrowed_buf_init)] #![feature(bstr)] +#![feature(casefold)] #![feature(cfg_target_has_reliable_f16_f128)] #![feature(char_internals)] #![feature(char_max_len)] diff --git a/library/coretests/tests/unicode.rs b/library/coretests/tests/unicode.rs index 12eed25a1feae..6ca45661f7d83 100644 --- a/library/coretests/tests/unicode.rs +++ b/library/coretests/tests/unicode.rs @@ -124,3 +124,17 @@ fn to_titlecase() { unicode_data::conversions::to_upper, ); } + +#[test] +#[cfg_attr(miri, ignore)] // Miri is too slow +fn to_casefold() { + test_case_mapping(test_data::TO_CASEFOLD, unicode_data::conversions::to_casefold, |c| { + let upper = unicode_data::conversions::to_upper(c); + let lower = upper.map(unicode_data::conversions::to_lower); + let mut result = ['\0'; 3]; + for (i, c) in lower.into_iter().flatten().filter(|&c| c != '\0').enumerate() { + result[i] = c; + } + result + }); +} diff --git a/library/coretests/tests/unicode/test_data.rs b/library/coretests/tests/unicode/test_data.rs index 962770a0ff830..77b976c489c9b 100644 --- a/library/coretests/tests/unicode/test_data.rs +++ b/library/coretests/tests/unicode/test_data.rs @@ -2931,3 +2931,94 @@ pub(super) static TO_TITLE: &[(char, [char; 3]); 135] = &[ ('\u{fb16}', ['\u{54e}', '\u{576}', '\u{0}']), ('\u{fb17}', ['\u{544}', '\u{56d}', '\u{0}']), ]; + +#[rustfmt::skip] +pub(super) static TO_CASEFOLD: &[(char, [char; 3]); 174] = &[ + ('\u{131}', ['\u{131}', '\u{0}', '\u{0}']), ('\u{13a0}', ['\u{13a0}', '\u{0}', '\u{0}']), + ('\u{13a1}', ['\u{13a1}', 
'\u{0}', '\u{0}']), ('\u{13a2}', ['\u{13a2}', '\u{0}', '\u{0}']), + ('\u{13a3}', ['\u{13a3}', '\u{0}', '\u{0}']), ('\u{13a4}', ['\u{13a4}', '\u{0}', '\u{0}']), + ('\u{13a5}', ['\u{13a5}', '\u{0}', '\u{0}']), ('\u{13a6}', ['\u{13a6}', '\u{0}', '\u{0}']), + ('\u{13a7}', ['\u{13a7}', '\u{0}', '\u{0}']), ('\u{13a8}', ['\u{13a8}', '\u{0}', '\u{0}']), + ('\u{13a9}', ['\u{13a9}', '\u{0}', '\u{0}']), ('\u{13aa}', ['\u{13aa}', '\u{0}', '\u{0}']), + ('\u{13ab}', ['\u{13ab}', '\u{0}', '\u{0}']), ('\u{13ac}', ['\u{13ac}', '\u{0}', '\u{0}']), + ('\u{13ad}', ['\u{13ad}', '\u{0}', '\u{0}']), ('\u{13ae}', ['\u{13ae}', '\u{0}', '\u{0}']), + ('\u{13af}', ['\u{13af}', '\u{0}', '\u{0}']), ('\u{13b0}', ['\u{13b0}', '\u{0}', '\u{0}']), + ('\u{13b1}', ['\u{13b1}', '\u{0}', '\u{0}']), ('\u{13b2}', ['\u{13b2}', '\u{0}', '\u{0}']), + ('\u{13b3}', ['\u{13b3}', '\u{0}', '\u{0}']), ('\u{13b4}', ['\u{13b4}', '\u{0}', '\u{0}']), + ('\u{13b5}', ['\u{13b5}', '\u{0}', '\u{0}']), ('\u{13b6}', ['\u{13b6}', '\u{0}', '\u{0}']), + ('\u{13b7}', ['\u{13b7}', '\u{0}', '\u{0}']), ('\u{13b8}', ['\u{13b8}', '\u{0}', '\u{0}']), + ('\u{13b9}', ['\u{13b9}', '\u{0}', '\u{0}']), ('\u{13ba}', ['\u{13ba}', '\u{0}', '\u{0}']), + ('\u{13bb}', ['\u{13bb}', '\u{0}', '\u{0}']), ('\u{13bc}', ['\u{13bc}', '\u{0}', '\u{0}']), + ('\u{13bd}', ['\u{13bd}', '\u{0}', '\u{0}']), ('\u{13be}', ['\u{13be}', '\u{0}', '\u{0}']), + ('\u{13bf}', ['\u{13bf}', '\u{0}', '\u{0}']), ('\u{13c0}', ['\u{13c0}', '\u{0}', '\u{0}']), + ('\u{13c1}', ['\u{13c1}', '\u{0}', '\u{0}']), ('\u{13c2}', ['\u{13c2}', '\u{0}', '\u{0}']), + ('\u{13c3}', ['\u{13c3}', '\u{0}', '\u{0}']), ('\u{13c4}', ['\u{13c4}', '\u{0}', '\u{0}']), + ('\u{13c5}', ['\u{13c5}', '\u{0}', '\u{0}']), ('\u{13c6}', ['\u{13c6}', '\u{0}', '\u{0}']), + ('\u{13c7}', ['\u{13c7}', '\u{0}', '\u{0}']), ('\u{13c8}', ['\u{13c8}', '\u{0}', '\u{0}']), + ('\u{13c9}', ['\u{13c9}', '\u{0}', '\u{0}']), ('\u{13ca}', ['\u{13ca}', '\u{0}', '\u{0}']), + ('\u{13cb}', ['\u{13cb}', '\u{0}', '\u{0}']), 
('\u{13cc}', ['\u{13cc}', '\u{0}', '\u{0}']), + ('\u{13cd}', ['\u{13cd}', '\u{0}', '\u{0}']), ('\u{13ce}', ['\u{13ce}', '\u{0}', '\u{0}']), + ('\u{13cf}', ['\u{13cf}', '\u{0}', '\u{0}']), ('\u{13d0}', ['\u{13d0}', '\u{0}', '\u{0}']), + ('\u{13d1}', ['\u{13d1}', '\u{0}', '\u{0}']), ('\u{13d2}', ['\u{13d2}', '\u{0}', '\u{0}']), + ('\u{13d3}', ['\u{13d3}', '\u{0}', '\u{0}']), ('\u{13d4}', ['\u{13d4}', '\u{0}', '\u{0}']), + ('\u{13d5}', ['\u{13d5}', '\u{0}', '\u{0}']), ('\u{13d6}', ['\u{13d6}', '\u{0}', '\u{0}']), + ('\u{13d7}', ['\u{13d7}', '\u{0}', '\u{0}']), ('\u{13d8}', ['\u{13d8}', '\u{0}', '\u{0}']), + ('\u{13d9}', ['\u{13d9}', '\u{0}', '\u{0}']), ('\u{13da}', ['\u{13da}', '\u{0}', '\u{0}']), + ('\u{13db}', ['\u{13db}', '\u{0}', '\u{0}']), ('\u{13dc}', ['\u{13dc}', '\u{0}', '\u{0}']), + ('\u{13dd}', ['\u{13dd}', '\u{0}', '\u{0}']), ('\u{13de}', ['\u{13de}', '\u{0}', '\u{0}']), + ('\u{13df}', ['\u{13df}', '\u{0}', '\u{0}']), ('\u{13e0}', ['\u{13e0}', '\u{0}', '\u{0}']), + ('\u{13e1}', ['\u{13e1}', '\u{0}', '\u{0}']), ('\u{13e2}', ['\u{13e2}', '\u{0}', '\u{0}']), + ('\u{13e3}', ['\u{13e3}', '\u{0}', '\u{0}']), ('\u{13e4}', ['\u{13e4}', '\u{0}', '\u{0}']), + ('\u{13e5}', ['\u{13e5}', '\u{0}', '\u{0}']), ('\u{13e6}', ['\u{13e6}', '\u{0}', '\u{0}']), + ('\u{13e7}', ['\u{13e7}', '\u{0}', '\u{0}']), ('\u{13e8}', ['\u{13e8}', '\u{0}', '\u{0}']), + ('\u{13e9}', ['\u{13e9}', '\u{0}', '\u{0}']), ('\u{13ea}', ['\u{13ea}', '\u{0}', '\u{0}']), + ('\u{13eb}', ['\u{13eb}', '\u{0}', '\u{0}']), ('\u{13ec}', ['\u{13ec}', '\u{0}', '\u{0}']), + ('\u{13ed}', ['\u{13ed}', '\u{0}', '\u{0}']), ('\u{13ee}', ['\u{13ee}', '\u{0}', '\u{0}']), + ('\u{13ef}', ['\u{13ef}', '\u{0}', '\u{0}']), ('\u{13f0}', ['\u{13f0}', '\u{0}', '\u{0}']), + ('\u{13f1}', ['\u{13f1}', '\u{0}', '\u{0}']), ('\u{13f2}', ['\u{13f2}', '\u{0}', '\u{0}']), + ('\u{13f3}', ['\u{13f3}', '\u{0}', '\u{0}']), ('\u{13f4}', ['\u{13f4}', '\u{0}', '\u{0}']), + ('\u{13f5}', ['\u{13f5}', '\u{0}', '\u{0}']), ('\u{13f8}', ['\u{13f0}', 
'\u{0}', '\u{0}']), + ('\u{13f9}', ['\u{13f1}', '\u{0}', '\u{0}']), ('\u{13fa}', ['\u{13f2}', '\u{0}', '\u{0}']), + ('\u{13fb}', ['\u{13f3}', '\u{0}', '\u{0}']), ('\u{13fc}', ['\u{13f4}', '\u{0}', '\u{0}']), + ('\u{13fd}', ['\u{13f5}', '\u{0}', '\u{0}']), ('\u{1e9e}', ['s', 's', '\u{0}']), + ('\u{ab70}', ['\u{13a0}', '\u{0}', '\u{0}']), ('\u{ab71}', ['\u{13a1}', '\u{0}', '\u{0}']), + ('\u{ab72}', ['\u{13a2}', '\u{0}', '\u{0}']), ('\u{ab73}', ['\u{13a3}', '\u{0}', '\u{0}']), + ('\u{ab74}', ['\u{13a4}', '\u{0}', '\u{0}']), ('\u{ab75}', ['\u{13a5}', '\u{0}', '\u{0}']), + ('\u{ab76}', ['\u{13a6}', '\u{0}', '\u{0}']), ('\u{ab77}', ['\u{13a7}', '\u{0}', '\u{0}']), + ('\u{ab78}', ['\u{13a8}', '\u{0}', '\u{0}']), ('\u{ab79}', ['\u{13a9}', '\u{0}', '\u{0}']), + ('\u{ab7a}', ['\u{13aa}', '\u{0}', '\u{0}']), ('\u{ab7b}', ['\u{13ab}', '\u{0}', '\u{0}']), + ('\u{ab7c}', ['\u{13ac}', '\u{0}', '\u{0}']), ('\u{ab7d}', ['\u{13ad}', '\u{0}', '\u{0}']), + ('\u{ab7e}', ['\u{13ae}', '\u{0}', '\u{0}']), ('\u{ab7f}', ['\u{13af}', '\u{0}', '\u{0}']), + ('\u{ab80}', ['\u{13b0}', '\u{0}', '\u{0}']), ('\u{ab81}', ['\u{13b1}', '\u{0}', '\u{0}']), + ('\u{ab82}', ['\u{13b2}', '\u{0}', '\u{0}']), ('\u{ab83}', ['\u{13b3}', '\u{0}', '\u{0}']), + ('\u{ab84}', ['\u{13b4}', '\u{0}', '\u{0}']), ('\u{ab85}', ['\u{13b5}', '\u{0}', '\u{0}']), + ('\u{ab86}', ['\u{13b6}', '\u{0}', '\u{0}']), ('\u{ab87}', ['\u{13b7}', '\u{0}', '\u{0}']), + ('\u{ab88}', ['\u{13b8}', '\u{0}', '\u{0}']), ('\u{ab89}', ['\u{13b9}', '\u{0}', '\u{0}']), + ('\u{ab8a}', ['\u{13ba}', '\u{0}', '\u{0}']), ('\u{ab8b}', ['\u{13bb}', '\u{0}', '\u{0}']), + ('\u{ab8c}', ['\u{13bc}', '\u{0}', '\u{0}']), ('\u{ab8d}', ['\u{13bd}', '\u{0}', '\u{0}']), + ('\u{ab8e}', ['\u{13be}', '\u{0}', '\u{0}']), ('\u{ab8f}', ['\u{13bf}', '\u{0}', '\u{0}']), + ('\u{ab90}', ['\u{13c0}', '\u{0}', '\u{0}']), ('\u{ab91}', ['\u{13c1}', '\u{0}', '\u{0}']), + ('\u{ab92}', ['\u{13c2}', '\u{0}', '\u{0}']), ('\u{ab93}', ['\u{13c3}', '\u{0}', '\u{0}']), + ('\u{ab94}', 
['\u{13c4}', '\u{0}', '\u{0}']), ('\u{ab95}', ['\u{13c5}', '\u{0}', '\u{0}']), + ('\u{ab96}', ['\u{13c6}', '\u{0}', '\u{0}']), ('\u{ab97}', ['\u{13c7}', '\u{0}', '\u{0}']), + ('\u{ab98}', ['\u{13c8}', '\u{0}', '\u{0}']), ('\u{ab99}', ['\u{13c9}', '\u{0}', '\u{0}']), + ('\u{ab9a}', ['\u{13ca}', '\u{0}', '\u{0}']), ('\u{ab9b}', ['\u{13cb}', '\u{0}', '\u{0}']), + ('\u{ab9c}', ['\u{13cc}', '\u{0}', '\u{0}']), ('\u{ab9d}', ['\u{13cd}', '\u{0}', '\u{0}']), + ('\u{ab9e}', ['\u{13ce}', '\u{0}', '\u{0}']), ('\u{ab9f}', ['\u{13cf}', '\u{0}', '\u{0}']), + ('\u{aba0}', ['\u{13d0}', '\u{0}', '\u{0}']), ('\u{aba1}', ['\u{13d1}', '\u{0}', '\u{0}']), + ('\u{aba2}', ['\u{13d2}', '\u{0}', '\u{0}']), ('\u{aba3}', ['\u{13d3}', '\u{0}', '\u{0}']), + ('\u{aba4}', ['\u{13d4}', '\u{0}', '\u{0}']), ('\u{aba5}', ['\u{13d5}', '\u{0}', '\u{0}']), + ('\u{aba6}', ['\u{13d6}', '\u{0}', '\u{0}']), ('\u{aba7}', ['\u{13d7}', '\u{0}', '\u{0}']), + ('\u{aba8}', ['\u{13d8}', '\u{0}', '\u{0}']), ('\u{aba9}', ['\u{13d9}', '\u{0}', '\u{0}']), + ('\u{abaa}', ['\u{13da}', '\u{0}', '\u{0}']), ('\u{abab}', ['\u{13db}', '\u{0}', '\u{0}']), + ('\u{abac}', ['\u{13dc}', '\u{0}', '\u{0}']), ('\u{abad}', ['\u{13dd}', '\u{0}', '\u{0}']), + ('\u{abae}', ['\u{13de}', '\u{0}', '\u{0}']), ('\u{abaf}', ['\u{13df}', '\u{0}', '\u{0}']), + ('\u{abb0}', ['\u{13e0}', '\u{0}', '\u{0}']), ('\u{abb1}', ['\u{13e1}', '\u{0}', '\u{0}']), + ('\u{abb2}', ['\u{13e2}', '\u{0}', '\u{0}']), ('\u{abb3}', ['\u{13e3}', '\u{0}', '\u{0}']), + ('\u{abb4}', ['\u{13e4}', '\u{0}', '\u{0}']), ('\u{abb5}', ['\u{13e5}', '\u{0}', '\u{0}']), + ('\u{abb6}', ['\u{13e6}', '\u{0}', '\u{0}']), ('\u{abb7}', ['\u{13e7}', '\u{0}', '\u{0}']), + ('\u{abb8}', ['\u{13e8}', '\u{0}', '\u{0}']), ('\u{abb9}', ['\u{13e9}', '\u{0}', '\u{0}']), + ('\u{abba}', ['\u{13ea}', '\u{0}', '\u{0}']), ('\u{abbb}', ['\u{13eb}', '\u{0}', '\u{0}']), + ('\u{abbc}', ['\u{13ec}', '\u{0}', '\u{0}']), ('\u{abbd}', ['\u{13ed}', '\u{0}', '\u{0}']), + ('\u{abbe}', ['\u{13ee}', '\u{0}', 
'\u{0}']), ('\u{abbf}', ['\u{13ef}', '\u{0}', '\u{0}']), +]; diff --git a/src/tools/unicode-table-generator/Cargo.toml b/src/tools/unicode-table-generator/Cargo.toml index 3ca6e9e316f1d..3be916dc69bf5 100644 --- a/src/tools/unicode-table-generator/Cargo.toml +++ b/src/tools/unicode-table-generator/Cargo.toml @@ -7,3 +7,4 @@ edition = "2024" [dependencies] ucd-parse = "0.1.3" +rustc-hash = "2.0.0" diff --git a/src/tools/unicode-table-generator/src/cascading_map.rs b/src/tools/unicode-table-generator/src/cascading_map.rs index 56e6401908dcf..da06049beb575 100644 --- a/src/tools/unicode-table-generator/src/cascading_map.rs +++ b/src/tools/unicode-table-generator/src/cascading_map.rs @@ -1,7 +1,8 @@ -use std::collections::HashMap; use std::fmt::Write as _; use std::ops::Range; +use rustc_hash::FxHashMap; + use crate::fmt_list; use crate::raw_emitter::RawEmitter; @@ -27,7 +28,7 @@ impl RawEmitter { println!("there are {} points", points.len()); // how many distinct ranges need to be counted? - let mut codepoints_by_high_bytes = HashMap::>::new(); + let mut codepoints_by_high_bytes = FxHashMap::>::default(); for point in points { // assert that there is no whitespace over the 0x3000 range. 
assert!(point <= 0x3000, "the highest unicode whitespace value has changed"); diff --git a/src/tools/unicode-table-generator/src/case_mapping.rs b/src/tools/unicode-table-generator/src/case_mapping.rs index b7b385542ef53..ee4dfc2514c20 100644 --- a/src/tools/unicode-table-generator/src/case_mapping.rs +++ b/src/tools/unicode-table-generator/src/case_mapping.rs @@ -48,21 +48,33 @@ use std::ops::RangeInclusive; use crate::fmt_helpers::Hex; use crate::{UnicodeData, fmt_list}; -pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [(String, usize); 3]) { +pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [(String, usize); 4]) { let mut file = String::new(); file.push_str("\n\n"); file.push_str(HEADER.trim_start()); file.push('\n'); - let (lower_tables, lower_desc, lower_size) = generate_tables("LOWER", &data.to_lower); + let (lower_tables, lower_desc, lower_size) = generate_tables("LOWERCASE", &data.to_lower); file.push_str(&lower_tables); file.push_str("\n\n"); - let (upper_tables, upper_desc, upper_size) = generate_tables("UPPER", &data.to_upper); + let (upper_tables, upper_desc, upper_size) = generate_tables("UPPERCASE", &data.to_upper); file.push_str(&upper_tables); file.push_str("\n\n"); - let (title_tables, title_desc, title_size) = generate_tables("TITLE", &data.to_title); + let (title_tables, title_desc, title_size) = generate_tables("TITLECASE", &data.to_title); file.push_str(&title_tables); - (file, [(lower_desc, lower_size), (upper_desc, upper_size), (title_desc, title_size)]) + file.push_str("\n\n"); + let (casefold_tables, casefold_desc, casefold_size) = + generate_tables("CASEFOLD", &data.to_casefold); + file.push_str(&casefold_tables); + ( + file, + [ + (lower_desc, lower_size), + (upper_desc, upper_size), + (title_desc, title_size), + (casefold_desc, casefold_size), + ], + ) } // So far, only planes 0 and 1 (Basic Multilingual Plane and Supplementary @@ -205,7 +217,7 @@ fn generate_tables(case: &str, data: &BTreeMap) -> 
(String, Strin output_high, input_high, "Case-mapping a character should not change its plane" ); - let delta = output_low as i16 - input_low as i16; + let delta = output_low.wrapping_sub(input_low).cast_signed(); let range = Range::singleton(input_low); l2_lut.singles.push((range, delta)); } @@ -264,7 +276,7 @@ fn generate_tables(case: &str, data: &BTreeMap) -> (String, Strin let size = l1_lut.size(); let num_ranges = l1_lut.l2_luts.iter().map(|l2| l2.singles.len() + l2.multis.len()).sum::(); - let table = format!("static {case}CASE_LUT: L1Lut = {l1_lut:#?};"); + let table = format!("static {case}_LUT: L1Lut = {l1_lut:#?};"); let desc = format!( "{:6} codepoints in {:3} ranges (U+{:06X} - U+{:06X}) using 2-level LUT", data.len(), @@ -381,7 +393,7 @@ fn lookup(input: char, l1_lut: &L1Lut) -> Option<[char; 3]> { } pub fn to_lower(c: char) -> [char; 3] { - // https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AChanges_When_Lowercased%253A%5D-%5B%253AASCII%253A%5D&abb=on + // https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=[:Changes_When_Lowercased:]-[:ASCII:]&abb=on if c < '\u{C0}' { return [c.to_ascii_lowercase(), '\0', '\0']; } @@ -390,7 +402,7 @@ pub fn to_lower(c: char) -> [char; 3] { } pub fn to_upper(c: char) -> [char; 3] { - // https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AChanges_When_Uppercased%253A%5D-%5B%253AASCII%253A%5D&abb=on + // https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=[:Changes_When_Uppercased:]-[:ASCII:]&abb=on if c < '\u{B5}' { return [c.to_ascii_uppercase(), '\0', '\0']; } @@ -399,11 +411,64 @@ pub fn to_upper(c: char) -> [char; 3] { } pub fn to_title(c: char) -> [char; 3] { - // https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AChanges_When_Titlecased%253A%5D-%5B%253AASCII%253A%5D&abb=on + // https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=[:Changes_When_Titlecased:]-[:ASCII:]&abb=on if c < '\u{B5}' { return [c.to_ascii_uppercase(), '\0', '\0']; } 
lookup(c, &TITLECASE_LUT).or_else(|| lookup(c, &UPPERCASE_LUT)).unwrap_or([c, '\0', '\0']) } + +pub fn to_casefold(c: char) -> [char; 3] { + // https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=[:Changes_When_Casefolded:]-[:ASCII:]&abb=on + if c < '\u{B5}' { + return [c.to_ascii_lowercase(), '\0', '\0']; + } + + + lookup(c, &CASEFOLD_LUT).unwrap_or_else(|| { + // fall back to lowercase of uppercase + + let uppercase = lookup(c, &UPPERCASE_LUT).unwrap_or([c, '\0', '\0']); + let mut final_result = to_lower(uppercase[0]); + if uppercase[1] != '\0' { + let lowercase_1 = to_lower(uppercase[1]); + debug_assert_eq!(lowercase_1[2], '\0'); + + // If, after updating the Unicode data + // to a new Unicode version, the below + // assertion starts to fail in tests, + // delete it, and uncomment the + // `if` condition and corresponding + // `else` block below it. + debug_assert_eq!(final_result[1], '\0'); + //if final_result[1] == '\0' { + + final_result[1] = lowercase_1[0]; + + if uppercase[2] != '\0' { + debug_assert_eq!(lowercase_1[1], '\0'); + let lowercase_2 = to_lower(uppercase[2]); + debug_assert_eq!(lowercase_2[1], '\0'); + debug_assert_eq!(lowercase_2[2], '\0'); + final_result[2] = lowercase_2[0]; + } else { + // If, after updating the Unicode data + // to a new Unicode version, the below + // assertion starts to fail in tests, + // delete it and uncomment the line + // below it. + debug_assert_eq!(lowercase_1[1], '\0'); + //final_result[2] = lowercase_1[1]; + } + + /*} else { + final_result[2] = lowercase_1[0]; + debug_assert_eq!(lowercase_1[1], '\0'); + debug_assert_eq!(uppercase[2], '\0') + }*/ + } + final_result + }) +} "; diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index 398b4c7b7ec5a..a55cd2f657a6d 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -71,11 +71,12 @@ //! 
index of that offset is utilized as the answer to whether we're in the set //! or not. -use std::collections::{BTreeMap, HashMap}; +use std::collections::BTreeMap; use std::fmt::Write; use std::ops::Range; -use ucd_parse::Codepoints; +use rustc_hash::{FxHashMap, FxHashSet}; +use ucd_parse::{Codepoint, Codepoints}; mod cascading_map; mod case_mapping; @@ -106,6 +107,9 @@ struct UnicodeData { to_title: BTreeMap, /// Only stores mappings that are not to self to_lower: BTreeMap, + /// Only stores mappings that differ from + /// `to_upper` followed by `to_lower` + to_casefold: BTreeMap, } fn to_mapping( @@ -126,7 +130,7 @@ static UNICODE_DIRECTORY: &str = "unicode-downloads"; fn load_data() -> UnicodeData { unicode_download::fetch_latest(); - let mut properties = HashMap::new(); + let mut properties = FxHashMap::default(); for row in ucd_parse::parse::<_, ucd_parse::CoreProperty>(&UNICODE_DIRECTORY).unwrap() { if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) { properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints); @@ -138,7 +142,8 @@ fn load_data() -> UnicodeData { } } - let [mut to_lower, mut to_upper, mut to_title] = [const { BTreeMap::new() }; 3]; + let [mut to_lower, mut to_upper, mut to_title, mut to_casefold] = + [const { BTreeMap::new() }; 4]; for row in ucd_parse::UnicodeDataExpander::new( ucd_parse::parse::<_, ucd_parse::UnicodeData>(&UNICODE_DIRECTORY).unwrap(), ) { @@ -189,6 +194,78 @@ fn load_data() -> UnicodeData { } } + fn get_mapping_from_btreemap<'a>( + cp: Codepoint, + map: &'a BTreeMap, + ) -> Vec { + let mapping = + map.get(&cp.value()).copied().map(|cs| cs.map(|c| Codepoint::from_u32(c).unwrap())); + + mapping + .as_ref() + .map(|cs| { + let nul = Codepoint::from_u32(0).unwrap(); + if cs[1] == nul { + &cs[..1] + } else if cs[2] == nul { + &cs[..2] + } else { + &cs[..] 
+ } + }) + .map_or_else(|| vec![cp], ToOwned::to_owned) + } + + let mut nontrivial_casefold = FxHashSet::default(); + + for row in ucd_parse::parse::<_, ucd_parse::CaseFold>(&UNICODE_DIRECTORY).unwrap() { + use ucd_parse::{CaseStatus, Codepoint}; + if matches!(row.status, CaseStatus::Common | CaseStatus::Full) { + let key = row.codepoint.value(); + nontrivial_casefold.insert(key); + + // We store case-fold data only for characters whose case-folding + // differs from the lowercase of their uppercase. + + let lower_upper_mapping: Vec = + get_mapping_from_btreemap(row.codepoint, &to_upper) + .into_iter() + .flat_map(|cp| get_mapping_from_btreemap(cp, &to_lower)) + .collect(); + + if let Some(casefold) = to_mapping(&lower_upper_mapping, &row.mapping) { + to_casefold.insert(key, casefold); + } + } + } + + // Now, account for characters that remain unchanged by case-folding + // (and are therefore omitted from `CaseFolding.txt`), + // but yet differ from the lowercase of their uppercase. + + for c in '\0'..=char::MAX { + let cnum: u32 = c.into(); + if !nontrivial_casefold.contains(&cnum) { + let cp = Codepoint::from_u32(cnum).unwrap(); + + use std::collections::btree_map::Entry; + match to_casefold.entry(cnum) { + Entry::Vacant(vacant_entry) => { + let lower_upper_mapping: Vec = + get_mapping_from_btreemap(cp, &to_upper) + .into_iter() + .flat_map(|cp| get_mapping_from_btreemap(cp, &to_lower)) + .collect(); + + if let Some(casefold) = to_mapping(&lower_upper_mapping, &[cp]) { + vacant_entry.insert(casefold); + } + } + Entry::Occupied(_) => {} + } + } + } + // Filter out ASCII codepoints. 
to_lower.retain(|&c, _| c > 0x7f); to_upper.retain(|&c, _| c > 0x7f); @@ -207,7 +284,7 @@ fn load_data() -> UnicodeData { .collect(); properties.sort_by_key(|p| p.0); - UnicodeData { ranges: properties, to_lower, to_title, to_upper } + UnicodeData { ranges: properties, to_lower, to_title, to_upper, to_casefold } } fn main() { @@ -259,7 +336,9 @@ fn main() { total_bytes += emitter.bytes_used; } let (conversions, sizes) = case_mapping::generate_case_mapping(&unicode_data); - for (name, (desc, size)) in ["to_lower", "to_upper", "to_title"].iter().zip(sizes) { + for (name, (desc, size)) in + ["to_lower", "to_upper", "to_title", "to_casefold"].iter().zip(sizes) + { table_file.push_str(&format!("// {:16}: {:5} bytes, {desc}\n", name, size,)); total_bytes += size; } @@ -369,10 +448,11 @@ pub(super) static {prop_upper}: &[RangeInclusive; {is_true_len}] = &[{is_t .unwrap(); } - for (name, lut) in ["TO_LOWER", "TO_UPPER", "TO_TITLE"].iter().zip([ + for (name, lut) in ["TO_LOWER", "TO_UPPER", "TO_TITLE", "TO_CASEFOLD"].iter().zip([ &data.to_lower, &data.to_upper, &data.to_title, + &data.to_casefold, ]) { let lut = lut .iter() diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs index 297965615c1a5..de3395df3806e 100644 --- a/src/tools/unicode-table-generator/src/raw_emitter.rs +++ b/src/tools/unicode-table-generator/src/raw_emitter.rs @@ -1,7 +1,9 @@ -use std::collections::{BTreeMap, BTreeSet, HashMap}; +use std::collections::{BTreeMap, BTreeSet}; use std::fmt::{self, Write}; use std::ops::Range; +use rustc_hash::FxHashMap; + use crate::fmt_list; #[derive(Clone)] @@ -126,8 +128,11 @@ impl RawEmitter { for chunk in compressed_words.chunks(chunk_length) { chunks.insert(chunk); } - let chunk_map = - chunks.iter().enumerate().map(|(idx, &chunk)| (chunk, idx)).collect::>(); + let chunk_map = chunks + .iter() + .enumerate() + .map(|(idx, &chunk)| (chunk, idx)) + .collect::>(); let mut chunk_indices = Vec::new(); 
for chunk in compressed_words.chunks(chunk_length) { chunk_indices.push(chunk_map[chunk]); @@ -186,7 +191,7 @@ struct Canonicalized { /// Maps an input unique word to the associated index (u8) which is into /// canonical_words or canonicalized_words (in order). - unique_mapping: HashMap, + unique_mapping: FxHashMap, } impl Canonicalized { @@ -253,7 +258,7 @@ impl Canonicalized { // These are mapped words, which will be represented by an index into // the canonical_words and a Mapping; u16 when encoded. let mut canonicalized_words = Vec::new(); - let mut unique_mapping = HashMap::new(); + let mut unique_mapping = FxHashMap::default(); #[derive(Debug, PartialEq, Eq)] enum UniqueMapping { @@ -361,7 +366,7 @@ impl Canonicalized { }, ) }) - .collect::>(); + .collect::>(); let mut distinct_indices = BTreeSet::new(); for &w in unique_words { diff --git a/src/tools/unicode-table-generator/src/unicode_download.rs b/src/tools/unicode-table-generator/src/unicode_download.rs index c9826170905c2..b2fcf6444033d 100644 --- a/src/tools/unicode-table-generator/src/unicode_download.rs +++ b/src/tools/unicode-table-generator/src/unicode_download.rs @@ -7,8 +7,13 @@ static URL_PREFIX: &str = "https://www.unicode.org/Public/UCD/latest/ucd/"; static README: &str = "ReadMe.txt"; -static RESOURCES: &[&str] = - &["DerivedCoreProperties.txt", "PropList.txt", "UnicodeData.txt", "SpecialCasing.txt"]; +static RESOURCES: &[&str] = &[ + "CaseFolding.txt", + "DerivedCoreProperties.txt", + "PropList.txt", + "SpecialCasing.txt", + "UnicodeData.txt", +]; #[track_caller] fn fetch(url: &str) -> Output { From a5d82ca7fbe78ffc2ddff687ded2c383fe269029 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Thu, 2 Apr 2026 20:26:14 -0400 Subject: [PATCH 2/5] Add `str::to_casefold()` --- library/alloc/src/str.rs | 97 +++++++++++++++++++++++++++++++++ library/alloctests/tests/lib.rs | 1 + library/alloctests/tests/str.rs | 8 ++- 3 files changed, 105 insertions(+), 1 deletion(-) diff --git 
a/library/alloc/src/str.rs b/library/alloc/src/str.rs index 2966f3ccc1791..db9f96cdd7e70 100644 --- a/library/alloc/src/str.rs +++ b/library/alloc/src/str.rs @@ -497,6 +497,103 @@ impl str { s } + /// Returns the case-folded equivalent of this string slice, as a new [`String`]. + /// + /// Case folding is a transformation, mostly matching lowercase, that is meant to be used + /// for case-insensitive string comparisons. Case-folded strings should not usually + /// be exposed directly to users. + /// + /// For the precise specification of case folding, see + /// [Chapter 3 (Conformance)](https://www.unicode.org/versions/latest/core-spec/chapter-3/#G63737) + /// of the Unicode standard. + /// + /// Since some characters can expand into multiple characters when case folding, + /// this function returns a [`String`] instead of modifying the parameter in-place. + /// + /// This function does not perform any normalization (e.g. NFC). + /// + /// Like [`char::to_casefold()`] this method does not handle language-specific + /// casing, like Turkish and Azeri I/ı/İ/i. See that method's documentation + /// for more information. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// #![feature(casefold)] + /// let s0 = "HELLO"; + /// let s1 = "Hello"; + /// + /// assert_eq!(s0.to_casefold(), s1.to_casefold()); + /// assert_eq!(s0.to_casefold(), "hello") + /// ``` + /// + /// Scripts without case are not changed: + /// + /// ``` + /// #![feature(casefold)] + /// let new_year = "农历新年"; + /// + /// assert_eq!(new_year, new_year.to_casefold()); + /// ``` + /// + /// One character can become multiple: + /// + /// ``` + /// #![feature(casefold)] + /// let s0 = "TSCHÜẞ"; + /// let s1 = "TSCHÜSS"; + /// let s2 = "tschüß"; + /// + /// assert_eq!(s0.to_casefold(), s1.to_casefold()); + /// assert_eq!(s0.to_casefold(), s2.to_casefold()); + /// assert_eq!(s0.to_casefold(), "tschüss"); + /// ``` + /// + /// No NFC normalization is performed: + /// + /// ```rust + /// #![feature(casefold)] + /// // These two strings are visually and semantically identical... + /// let comp = "Á"; + /// let decomp = "Á"; + /// + /// // ... but not codepoint-for-codepoint equal. + /// + /// assert_eq!(comp, "\u{C1}"); + /// assert_eq!(decomp, "A\u{0301}"); + /// + /// // Their case-foldings are likewise unequal: + /// + /// assert_eq!(comp.to_casefold(), "\u{E1}"); + /// assert_eq!(decomp.to_casefold(), "a\u{0301}"); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[rustc_allow_incoherent_impl] + #[must_use = "this returns the case-folded string as a new String, \ + without modifying the original"] + #[unstable(feature = "casefold", issue = "none")] + pub fn to_casefold(&self) -> String { + let (mut s, rest) = convert_while_ascii(self, u8::to_ascii_lowercase); + + for c in rest.chars() { + match conversions::to_casefold(c) { + [a, '\0', _] => s.push(a), + [a, b, '\0'] => { + s.push(a); + s.push(b); + } + [a, b, c] => { + s.push(a); + s.push(b); + s.push(c); + } + } + } + s + } + /// Converts a [`Box`] into a [`String`] without copying or allocating. 
/// /// # Examples diff --git a/library/alloctests/tests/lib.rs b/library/alloctests/tests/lib.rs index 699a5010282b0..5067fc45eb29b 100644 --- a/library/alloctests/tests/lib.rs +++ b/library/alloctests/tests/lib.rs @@ -3,6 +3,7 @@ #![feature(const_heap)] #![feature(deque_extend_front)] #![feature(iter_array_chunks)] +#![feature(casefold)] #![feature(cow_is_borrowed)] #![feature(core_intrinsics)] #![feature(downcast_unchecked)] diff --git a/library/alloctests/tests/str.rs b/library/alloctests/tests/str.rs index c0bcdb8500af6..42fbdd0ea9a9a 100644 --- a/library/alloctests/tests/str.rs +++ b/library/alloctests/tests/str.rs @@ -1867,7 +1867,13 @@ fn to_lowercase() { #[test] fn to_uppercase() { assert_eq!("".to_uppercase(), ""); - assert_eq!("aéDžßfiᾀ".to_uppercase(), "AÉDŽSSFIἈΙ"); + assert_eq!("aéDžßẞfiᾀ".to_uppercase(), "AÉDŽSSẞFIἈΙ"); +} + +#[test] +fn to_casefold() { + assert_eq!("".to_casefold(), ""); + assert_eq!("ꮿfiῲὼ\u{0345}ßẞΣς".to_casefold(), "Ꮿfiὼιὼιssssσσ"); } #[test] From 66b91bdf5070bd30c19566bd29007f10d866751f Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Thu, 2 Apr 2026 20:38:23 -0400 Subject: [PATCH 3/5] Add `str::eq_ignore_case()` With an unoptimized, non-`const` implementation for now. --- library/alloc/src/str.rs | 4 +++- library/core/src/str/mod.rs | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/library/alloc/src/str.rs b/library/alloc/src/str.rs index db9f96cdd7e70..7ac7495e7f701 100644 --- a/library/alloc/src/str.rs +++ b/library/alloc/src/str.rs @@ -575,7 +575,9 @@ impl str { without modifying the original"] #[unstable(feature = "casefold", issue = "none")] pub fn to_casefold(&self) -> String { - let (mut s, rest) = convert_while_ascii(self, u8::to_ascii_lowercase); + // SAFETY: `to_ascii_lowercase` preserves ASCII bytes, so the converted + // prefix remains valid UTF-8. 
+ let (mut s, rest) = unsafe { convert_while_ascii(self, u8::to_ascii_lowercase) }; for c in rest.chars() { match conversions::to_casefold(c) { diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index 5af399ab1b34c..a8fc30a632642 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -2826,6 +2826,9 @@ impl str { /// Same as `to_ascii_lowercase(a) == to_ascii_lowercase(b)`, /// but without allocating and copying temporaries. /// + /// For Unicode-aware case-insensitive matching, consider + /// [`str::eq_ignore_case`]. + /// /// # Examples /// /// ``` @@ -2841,6 +2844,38 @@ impl str { self.as_bytes().eq_ignore_ascii_case(other.as_bytes()) } + /// Checks that two strings are a caseless match, according to + /// [Definition 144] in Chapter 3 of the Unicode Standard. + /// + /// [Definition 144]: https://www.unicode.org/versions/latest/core-spec/chapter-3/#G53513 + /// + /// Same as `a.to_casefold() == b.to_casefold()`, + /// but without allocating. See that method's documentation, + /// and [`char::to_casefold()`], + /// for more information about case folding. + /// + /// No normalization (e.g. NFC) is performed, + /// so visually and semantically identical strings + /// might still compare unequal. In addition, + /// this method is independent of language/locale, + /// so the special behavior of I/ı/İ/i + /// in Turkish and Azeri is not handled. + /// + /// # Examples + /// + /// ``` + /// #![feature(casefold)] + /// assert!("Ferris".eq_ignore_case("FERRIS")); + /// assert!("Ferrös".eq_ignore_case("FERRÖS")); + /// assert!("ẞ".eq_ignore_case("ss")); + /// ``` + #[unstable(feature = "casefold", issue = "none")] + #[must_use] + #[inline] + pub fn eq_ignore_case(&self, other: &str) -> bool { + self.chars().flat_map(char::to_casefold).eq(other.chars().flat_map(char::to_casefold)) + } + /// Converts this string to its ASCII upper case equivalent in-place. 
/// /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z', From b14b43baad04900daa2e6815fe65c2a78100aeb1 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Sat, 18 Apr 2026 00:29:21 -0400 Subject: [PATCH 4/5] Address review comments --- library/alloc/src/str.rs | 9 ++--- library/core/src/char/methods.rs | 61 ++++++++++++++++++++------------ library/core/src/str/mod.rs | 24 +++++++++++-- 3 files changed, 64 insertions(+), 30 deletions(-) diff --git a/library/alloc/src/str.rs b/library/alloc/src/str.rs index 7ac7495e7f701..4946c7f034678 100644 --- a/library/alloc/src/str.rs +++ b/library/alloc/src/str.rs @@ -510,7 +510,8 @@ impl str { /// Since some characters can expand into multiple characters when case folding, /// this function returns a [`String`] instead of modifying the parameter in-place. /// - /// This function does not perform any normalization (e.g. NFC). + /// This function does not perform any [normalization] (e.g. NFC), + /// so semantically and visually identical strings may compare unequal. /// /// Like [`char::to_casefold()`] this method does not handle language-specific /// casing, like Turkish and Azeri I/ı/İ/i. See that method's documentation @@ -551,7 +552,7 @@ impl str { /// assert_eq!(s0.to_casefold(), "tschüss"); /// ``` /// - /// No NFC normalization is performed: + /// No NFC [normalization] is performed: /// /// ```rust /// #![feature(casefold)] @@ -560,15 +561,15 @@ impl str { /// let decomp = "Á"; /// /// // ... but not codepoint-for-codepoint equal. 
- /// /// assert_eq!(comp, "\u{C1}"); /// assert_eq!(decomp, "A\u{0301}"); /// /// // Their case-foldings are likewise unequal: - /// /// assert_eq!(comp.to_casefold(), "\u{E1}"); /// assert_eq!(decomp.to_casefold(), "a\u{0301}"); /// ``` + /// + /// [normalization]: https://www.unicode.org/faq/normalization #[cfg(not(no_global_oom_handling))] #[rustc_allow_incoherent_impl] #[must_use = "this returns the case-folded string as a new String, \ diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index 2fb70214a9c34..a87d11d7fa1a6 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -1075,16 +1075,17 @@ impl char { } /// Returns `true` if this `char` has the `Case_Ignorable` property. This narrow-use property - /// is used to implement context-dependent casing for the Greek letter sigma (uppercase Σ), + /// is used to implement context-dependent casing for the Greek letter sigma (uppercase 'Σ'), /// which has two lowercase forms. /// /// `Case_Ignorable` is [described][D136] in Chapter 3 (Conformance) of the Unicode Core Specification, - /// and specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]; - /// see those resources for more information. + /// and specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. + /// See those resources, as well as [`to_lowercase()`]'s documentation, for more information. /// /// [D136]: https://www.unicode.org/versions/latest/core-spec/chapter-3/#G63116 /// [ucd]: https://www.unicode.org/reports/tr44/ /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt + /// [`to_lowercase()`]: Self::to_lowercase() #[must_use] #[inline] #[unstable(feature = "case_ignorable", issue = "154848")] @@ -1154,8 +1155,6 @@ impl char { /// If this `char` expands to multiple `char`s, the iterator yields the `char`s given by /// [`SpecialCasing.txt`]. 
The maximum number of `char`s in a case mapping is 3. /// - /// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt - /// /// This operation performs an unconditional mapping without tailoring. That is, the conversion /// is independent of context and language. See [below](#notes-on-context-and-locale) /// for more information. @@ -1210,14 +1209,25 @@ impl char { /// /// ## Greek sigma /// - /// In Greek, the letter simga (uppercase Σ) has two lowercase forms: - /// ς which is used only at the end of a word, and σ which is used everywhere else. - /// `to_lowercase()` always uses the second form: + /// In Greek, the letter sigma (uppercase 'Σ') has two lowercase forms: + /// 'σ' which is used in most situations, and 'ς' which appears only + /// at the end of a word. [`char::to_lowercase()`] always uses the first form: /// /// ``` /// assert_eq!('Σ'.to_lowercase().to_string(), "σ"); /// ``` /// + /// `str::to_lowercase()` (only available with the `alloc` crate) + /// *does* properly handle this contextual mapping, + /// so prefer using that method if you can. Alternatively, you can use + /// [`is_cased()`] and [`is_case_ignorable()`] to implement it yourself. + /// See `Final_Sigma` in [Table 3.17] of the Unicode Standard, + /// along with [`SpecialCasing.txt`], for more details. + /// + /// [`is_cased()`]: Self::is_cased() + /// [`is_case_ignorable()`]: Self::is_case_ignorable() + /// [Table 3.17]: https://www.unicode.org/versions/latest/core-spec/chapter-3/#G54277 + /// /// ## Turkish and Azeri I/ı/İ/i /// /// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two: @@ -1225,13 +1235,13 @@ impl char { /// * 'Dotless': I / ı, sometimes written ï /// * 'Dotted': İ / i /// - /// Note that the uppercase undotted 'I' is the same as the Latin. Therefore: + /// Note that the uppercase undotted 'I' is the same codepoint as the Latin. 
Therefore: /// /// ``` /// let lower_i = 'I'.to_lowercase().to_string(); /// ``` /// - /// The value of `lower_i` here relies on the language of the text: if we're + /// `'I'`'s correct lowercase relies on the language of the text: if we're /// in `en-US`, it should be `"i"`, but if we're in `tr-TR` or `az-AZ`, it should /// be `"ı"`. `to_lowercase()` does not take this into account, and so: /// @@ -1242,6 +1252,8 @@ impl char { /// ``` /// /// holds across languages. + /// + /// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt #[must_use = "this returns the lowercased character as a new iterator, \ without modifying the original"] #[stable(feature = "rust1", since = "1.0.0")] @@ -1392,22 +1404,22 @@ impl char { /// As stated above, this method is locale-insensitive. /// If you need locale support, consider using an external crate, /// like [`icu_casemap`](https://crates.io/crates/icu_casemap) - /// which is developed by Unicode. A description of a common - /// locale-dependent casing issue follows: + /// which is developed by Unicode. A description of one common + /// locale-dependent casing issue follows (there are others): /// /// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two: /// /// * 'Dotless': I / ı, sometimes written ï /// * 'Dotted': İ / i /// - /// Note that the lowercase dotted 'i' is the same as the Latin. Therefore: + /// Note that the lowercase dotted 'i' is the same codepoint as the Latin. Therefore: /// /// ``` /// #![feature(titlecase)] /// let upper_i = 'i'.to_titlecase().to_string(); /// ``` /// - /// The value of `upper_i` here relies on the language of the text: if we're + /// `'i'`'s correct titlecase relies on the language of the text: if we're /// in `en-US`, it should be `"I"`, but if we're in `tr-TR` or `az-AZ`, it should /// be `"İ"`. 
`to_titlecase()` does not take this into account, and so: /// @@ -1504,21 +1516,21 @@ impl char { /// As stated above, this method is locale-insensitive. /// If you need locale support, consider using an external crate, /// like [`icu_casemap`](https://crates.io/crates/icu_casemap) - /// which is developed by Unicode. A description of a common - /// locale-dependent casing issue follows: + /// which is developed by Unicode. A description of one common + /// locale-dependent casing issue follows (there are others): /// /// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two: /// /// * 'Dotless': I / ı, sometimes written ï /// * 'Dotted': İ / i /// - /// Note that the lowercase dotted 'i' is the same as the Latin. Therefore: + /// Note that the lowercase dotted 'i' is the same codepoint as the Latin. Therefore: /// /// ``` /// let upper_i = 'i'.to_uppercase().to_string(); /// ``` /// - /// The value of `upper_i` here relies on the language of the text: if we're + /// `'i'`'s correct uppercase relies on the language of the text: if we're /// in `en-US`, it should be `"I"`, but if we're in `tr-TR` or `az-AZ`, it should /// be `"İ"`. `to_uppercase()` does not take this into account, and so: /// @@ -1542,12 +1554,13 @@ impl char { /// Returns an iterator that yields the case folding of this `char` as one or more /// `char`s. /// - /// Case folding is meant to be used when performing case-insensitive string comparisons, - /// but case-folded strings should not generally be exposed directly to users. For most, + /// Case folding is meant to be used when performing case-insensitive string comparisons. + /// Case-folded strings should not usually be exposed directly to users. For most, /// but not all, characters, the casefold mapping is identical to the lowercase one. /// /// This iterator yields the `char`(s) in the common or full case folding for this `char`, /// as given by the [Unicode Character Database][ucd] [`CaseFolding.txt`]. 
+ /// The maximum number of `char`s in a case folding is 3. /// /// [ucd]: https://www.unicode.org/reports/tr44/ /// [`CaseFolding.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt @@ -1555,7 +1568,9 @@ impl char { /// This operation performs an unconditional mapping without tailoring. That is, the conversion /// is independent of context and language. /// - /// It also does not perform any normalization (e.g. NFC). + /// It also does not perform any [normalization] (e.g. NFC). + /// + /// [normalization]: https://www.unicode.org/faq/normalization /// /// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case folding in /// general and Chapter 3 (Conformance) discusses the default algorithm for case folding. @@ -1591,14 +1606,14 @@ impl char { /// * 'Dotless': I / ı, sometimes written ï /// * 'Dotted': İ / i /// - /// Note that the uppercase undotted 'I' is the same as the Latin. Therefore: + /// Note that the uppercase undotted 'I' is the same codepoint as the Latin. Therefore: /// /// ``` /// #![feature(casefold)] /// let casefold_i = 'I'.to_casefold().to_string(); /// ``` /// - /// The value of `casefold_i` here relies on the language of the text: if we're + /// `'I'`'s correct case folding relies on the language of the text: if we're /// in `en-US`, it should be `"i"`, but if we're in `tr-TR` or `az-AZ`, it should /// be `"ı"`. `to_casefold()` does not take this into account, and so: /// diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index a8fc30a632642..9ff0fd2bd3f31 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -2851,14 +2851,14 @@ impl str { /// /// Same as `a.to_casefold() == b.to_casefold()`, /// but without allocating. See that method's documentation, - /// and [`char::to_casefold()`], + /// as well as [`char::to_casefold()`], /// for more information about case folding. /// - /// No normalization (e.g. NFC) is performed, + /// No [normalization] (e.g. 
NFC) is performed, /// so visually and semantically identical strings /// might still compare unequal. In addition, /// this method is independent of language/locale, - /// so the special behavior of I/ı/İ/i + /// so the special behavior of I/ı/İ/i /// in Turkish and Azeri is not handled. /// /// # Examples @@ -2869,6 +2869,24 @@ impl str { /// assert!("Ferrös".eq_ignore_case("FERRÖS")); /// assert!("ẞ".eq_ignore_case("ss")); /// ``` + /// + /// No NFC [normalization] is performed: + /// + /// ```rust + /// #![feature(casefold)] + /// // These two strings are visually and semantically identical... + /// let comp = "Á"; + /// let decomp = "Á"; + /// + /// // ... but not codepoint-for-codepoint equal. + /// assert_eq!(comp, "\u{C1}"); + /// assert_eq!(decomp, "A\u{0301}"); + /// + /// // Their case-foldings are likewise unequal: + /// assert_eq!(!comp.eq_ignore_case(decomp)); + /// ``` + /// + /// [normalization]: https://www.unicode.org/faq/normalization #[unstable(feature = "casefold", issue = "none")] #[must_use] #[inline] From d82bc3946f6875e3398a568411ad22ab679ca73a Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Sat, 18 Apr 2026 07:07:56 -0400 Subject: [PATCH 5/5] Fix doctest --- library/core/src/str/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index 9ff0fd2bd3f31..8b0bdd230a540 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -2883,7 +2883,7 @@ impl str { /// assert_eq!(decomp, "A\u{0301}"); /// /// // Their case-foldings are likewise unequal: - /// assert_eq!(!comp.eq_ignore_case(decomp)); + /// assert!(!comp.eq_ignore_case(decomp)); /// ``` /// /// [normalization]: https://www.unicode.org/faq/normalization