icu_casemap/
casemapper.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::internals::{CaseMapLocale, FoldOptions, FullCaseWriteable, StringAndWriteable};
6use crate::provider::data::MappingKind;
7use crate::provider::CaseMap;
8use crate::provider::CaseMapV1;
9use crate::set::ClosureSink;
10use crate::titlecase::{LeadingAdjustment, TitlecaseOptions, TrailingCase};
11use alloc::borrow::Cow;
12use icu_locale_core::LanguageIdentifier;
13use icu_provider::prelude::*;
14use writeable::Writeable;
15
16/// A struct with the ability to convert characters and strings to uppercase or lowercase,
17/// or fold them to a normalized form for case-insensitive comparison.
18///
19/// Most methods for this type live on [`CaseMapperBorrowed`], which you can obtain via
20/// [`CaseMapper::new()`] or [`CaseMapper::as_borrowed()`].
21///
22/// # Examples
23///
24/// ```rust
25/// use icu::casemap::CaseMapper;
26/// use icu::locale::langid;
27///
28/// let cm = CaseMapper::new();
29///
30/// assert_eq!(
31///     cm.uppercase_to_string("hello world", &langid!("und")),
32///     "HELLO WORLD"
33/// );
34/// assert_eq!(
35///     cm.lowercase_to_string("Γειά σου Κόσμε", &langid!("und")),
36///     "γειά σου κόσμε"
37/// );
38/// ```
#[derive(Clone, Debug)]
pub struct CaseMapper {
    // Case-mapping data payload. `as_borrowed()` hands out a reference to its contents, and
    // `CaseMapperBorrowed::static_to_owned()` builds it from a `'static` reference.
    pub(crate) data: DataPayload<CaseMapV1>,
}
43
44impl AsRef<CaseMapper> for CaseMapper {
45    fn as_ref(&self) -> &CaseMapper {
46        self
47    }
48}
49
50/// A struct with the ability to convert characters and strings to uppercase or lowercase,
51/// or fold them to a normalized form for case-insensitive comparison, borrowed version.
52///
53/// See methods or [`CaseMapper`] for examples.
#[derive(Clone, Debug, Copy)]
pub struct CaseMapperBorrowed<'a> {
    // Borrowed case-mapping data; every method on this type delegates to it.
    pub(crate) data: &'a CaseMap<'a>,
}
58
59impl CaseMapperBorrowed<'static> {
60    /// Cheaply converts a [`CaseMapperBorrowed<'static>`] into a [`CaseMapper`].
61    ///
62    /// Note: Due to branching and indirection, using [`CaseMapper`] might inhibit some
63    /// compile-time optimizations that are possible with [`CaseMapperBorrowed`].
    pub const fn static_to_owned(self) -> CaseMapper {
        CaseMapper {
            // This impl is only for `CaseMapperBorrowed<'static>`, so `self.data` is a
            // `&'static` reference and can be wrapped into a payload directly.
            data: DataPayload::from_static_ref(self.data),
        }
    }
69    /// Creates a [`CaseMapperBorrowed`] using compiled data.
70    ///
71    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
72    ///
73    /// [📚 Help choosing a constructor](icu_provider::constructors)
74    ///
75    /// # Examples
76    ///
77    /// ```rust
78    /// use icu::casemap::CaseMapper;
79    /// use icu::locale::langid;
80    ///
81    /// let cm = CaseMapper::new();
82    ///
83    /// assert_eq!(
84    ///     cm.uppercase_to_string("hello world", &langid!("und")),
85    ///     "HELLO WORLD"
86    /// );
87    /// ```
    #[cfg(feature = "compiled_data")]
    pub const fn new() -> Self {
        Self {
            // Case-map singleton exposed by the crate's baked (compiled) data provider.
            data: crate::provider::Baked::SINGLETON_CASE_MAP_V1,
        }
    }
94}
95
#[cfg(feature = "compiled_data")]
impl Default for CaseMapperBorrowed<'static> {
    /// Equivalent to [`CaseMapperBorrowed::new()`]: a mapper backed by compiled data.
    fn default() -> Self {
        CaseMapperBorrowed::new()
    }
}
102
103impl<'a> CaseMapperBorrowed<'a> {
104    /// Returns the full lowercase mapping of the given string as a [`Writeable`].
105    /// This function is context and language sensitive. Callers should pass the text's language
106    /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or
107    /// `Default::default()` for the root locale.
108    ///
109    /// See [`Self::lowercase_to_string()`] for the equivalent convenience function that returns a string,
110    /// as well as for an example.
111    pub fn lowercase(self, src: &'a str, langid: &LanguageIdentifier) -> impl Writeable + 'a {
112        self.data.full_helper_writeable::<false>(
113            src,
114            CaseMapLocale::from_langid(langid),
115            MappingKind::Lower,
116            TrailingCase::default(),
117        )
118    }
119
120    /// Returns the full uppercase mapping of the given string as a [`Writeable`].
121    /// This function is context and language sensitive. Callers should pass the text's language
122    /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or
123    /// `Default::default()` for the root locale.
124    ///
125    /// See [`Self::uppercase_to_string()`] for the equivalent convenience function that returns a string,
126    /// as well as for an example.
127    pub fn uppercase(self, src: &'a str, langid: &LanguageIdentifier) -> impl Writeable + 'a {
128        self.data.full_helper_writeable::<false>(
129            src,
130            CaseMapLocale::from_langid(langid),
131            MappingKind::Upper,
132            TrailingCase::default(),
133        )
134    }
135
136    /// Returns the full titlecase mapping of the given string as a [`Writeable`], treating
137    /// the string as a single segment (and thus only titlecasing the beginning of it). Performs
138    /// the specified leading adjustment behavior from the options without loading additional data.
139    ///
140    /// This should typically be used as a lower-level helper to construct the titlecasing operation desired
141    /// by the application, for example one can titlecase on a per-word basis by mixing this with
142    /// a `WordSegmenter`.
143    ///
144    /// This function is context and language sensitive. Callers should pass the text's language
145    /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or
146    /// `Default::default()` for the root locale.
147    ///
148    /// This function performs "adjust to cased" leading adjustment behavior when [`LeadingAdjustment::Auto`] or [`LeadingAdjustment::ToCased`]
149    /// is set. Auto mode is not able to pick the "adjust to letter/number/symbol" behavior as this type does not load
150    /// the data to do so, use [`TitlecaseMapper`] if such behavior is desired. See
151    /// the docs of [`TitlecaseMapper`] for more information on what this means. There is no difference between
152    /// the behavior of this function and the equivalent ones on [`TitlecaseMapper`] when the head adjustment mode
153    /// is [`LeadingAdjustment::None`].
154    ///
155    /// See [`Self::titlecase_segment_with_only_case_data_to_string()`] for the equivalent convenience function that returns a string,
156    /// as well as for an example.
157    ///
158    /// [`TitlecaseMapper`]: crate::TitlecaseMapper
    pub fn titlecase_segment_with_only_case_data(
        self,
        src: &'a str,
        langid: &LanguageIdentifier,
        options: TitlecaseOptions,
    ) -> impl Writeable + 'a {
        // The lead predicate is "is this character cased?", answered from the case data alone;
        // this is what implements the "adjust to cased" behavior without loading extra data.
        self.titlecase_segment_with_adjustment(src, langid, options, |data, ch| data.is_cased(ch))
    }
167
168    /// Helper to support different leading adjustment behaviors,
169    /// `char_is_lead` is a function that returns true for a character that is allowed to be the
170    /// first relevant character in a titlecasing string, when `leading_adjustment != None`
171    ///
172    /// We return a concrete type instead of `impl Trait` so the return value can be mixed with that of other calls
173    /// to this function with different closures
174    pub(crate) fn titlecase_segment_with_adjustment(
175        self,
176        src: &'a str,
177        langid: &LanguageIdentifier,
178        options: TitlecaseOptions,
179        char_is_lead: impl Fn(&CaseMap, char) -> bool,
180    ) -> StringAndWriteable<'a, FullCaseWriteable<'a, 'a, true>> {
181        let (head, rest) = match options.leading_adjustment.unwrap_or_default() {
182            LeadingAdjustment::Auto | LeadingAdjustment::ToCased => {
183                let first_cased = src
184                    .char_indices()
185                    .find(|(_i, ch)| char_is_lead(self.data, *ch));
186                if let Some((first_cased, _ch)) = first_cased {
187                    (
188                        src.get(..first_cased).unwrap_or(""),
189                        src.get(first_cased..).unwrap_or(""),
190                    )
191                } else {
192                    (src, "")
193                }
194            }
195            LeadingAdjustment::None => ("", src),
196        };
197        let writeable = self.data.full_helper_writeable::<true>(
198            rest,
199            CaseMapLocale::from_langid(langid),
200            MappingKind::Title,
201            options.trailing_case.unwrap_or_default(),
202        );
203        StringAndWriteable {
204            string: head,
205            writeable,
206        }
207    }
208    /// Case-folds the characters in the given string as a [`Writeable`].
209    /// This function is locale-independent and context-insensitive.
210    ///
211    /// Can be used to test if two strings are case-insensitively equivalent.
212    ///
213    /// See [`Self::fold_string()`] for the equivalent convenience function that returns a string,
214    /// as well as for an example.
215    pub fn fold(self, src: &'a str) -> impl Writeable + 'a {
216        self.data.full_helper_writeable::<false>(
217            src,
218            CaseMapLocale::Root,
219            MappingKind::Fold,
220            TrailingCase::default(),
221        )
222    }
223
224    /// Case-folds the characters in the given string as a [`Writeable`],
225    /// using Turkic (T) mappings for dotted/dotless I.
226    /// This function is locale-independent and context-insensitive.
227    ///
228    /// Can be used to test if two strings are case-insensitively equivalent.
229    ///
230    /// See [`Self::fold_turkic_string()`] for the equivalent convenience function that returns a string,
231    /// as well as for an example.
232    pub fn fold_turkic(self, src: &'a str) -> impl Writeable + 'a {
233        self.data.full_helper_writeable::<false>(
234            src,
235            CaseMapLocale::Turkish,
236            MappingKind::Fold,
237            TrailingCase::default(),
238        )
239    }
240
241    /// Returns the full lowercase mapping of the given string as a string.
242    ///
243    /// This function is context and language sensitive. Callers should pass the text's language
244    /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or
245    /// `Default::default()` for the root locale.
246    ///
247    /// See [`Self::lowercase()`] for the equivalent lower-level function that returns a [`Writeable`]
248    ///
249    /// # Examples
250    ///
251    /// ```rust
252    /// use icu::casemap::CaseMapper;
253    /// use icu::locale::langid;
254    ///
255    /// let cm = CaseMapper::new();
256    /// let root = langid!("und");
257    ///
258    /// assert_eq!(cm.lowercase_to_string("hEllO WorLd", &root), "hello world");
259    /// assert_eq!(cm.lowercase_to_string("Γειά σου Κόσμε", &root), "γειά σου κόσμε");
260    /// assert_eq!(cm.lowercase_to_string("नमस्ते दुनिया", &root), "नमस्ते दुनिया");
261    /// assert_eq!(cm.lowercase_to_string("Привет мир", &root), "привет мир");
262    ///
263    /// // Some behavior is language-sensitive
264    /// assert_eq!(cm.lowercase_to_string("CONSTANTINOPLE", &root), "constantinople");
265    /// assert_eq!(cm.lowercase_to_string("CONSTANTINOPLE", &langid!("tr")), "constantınople");
266    /// ```
267    pub fn lowercase_to_string<'s>(
268        self,
269        src: &'s str,
270        langid: &LanguageIdentifier,
271    ) -> Cow<'s, str> {
272        writeable::to_string_or_borrow(&self.lowercase(src, langid), src.as_bytes())
273    }
274
275    /// Returns the full uppercase mapping of the given string as a string.
276    ///
277    /// This function is context and language sensitive. Callers should pass the text's language
278    /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or
279    /// `Default::default()` for the root locale.
280    ///
281    /// See [`Self::uppercase()`] for the equivalent lower-level function that returns a [`Writeable`]
282    ///
283    /// # Examples
284    ///
285    /// ```rust
286    /// use icu::casemap::CaseMapper;
287    /// use icu::locale::langid;
288    ///
289    /// let cm = CaseMapper::new();
290    /// let root = langid!("und");
291    ///
292    /// assert_eq!(cm.uppercase_to_string("hEllO WorLd", &root), "HELLO WORLD");
293    /// assert_eq!(cm.uppercase_to_string("Γειά σου Κόσμε", &root), "ΓΕΙΆ ΣΟΥ ΚΌΣΜΕ");
294    /// assert_eq!(cm.uppercase_to_string("नमस्ते दुनिया", &root), "नमस्ते दुनिया");
295    /// assert_eq!(cm.uppercase_to_string("Привет мир", &root), "ПРИВЕТ МИР");
296    ///
297    /// // Some behavior is language-sensitive
298    /// assert_eq!(cm.uppercase_to_string("istanbul", &root), "ISTANBUL");
299    /// assert_eq!(cm.uppercase_to_string("istanbul", &langid!("tr")), "İSTANBUL"); // Turkish dotted i
300    ///
301    /// assert_eq!(cm.uppercase_to_string("և Երևանի", &root), "ԵՒ ԵՐԵՒԱՆԻ");
302    /// assert_eq!(cm.uppercase_to_string("և Երևանի", &langid!("hy")), "ԵՎ ԵՐԵՎԱՆԻ"); // Eastern Armenian ech-yiwn ligature
303    /// ```
304    pub fn uppercase_to_string<'s>(
305        self,
306        src: &'s str,
307        langid: &LanguageIdentifier,
308    ) -> Cow<'s, str> {
309        writeable::to_string_or_borrow(&self.uppercase(src, langid), src.as_bytes())
310    }
311
312    /// Returns the full titlecase mapping of the given string as a [`Writeable`], treating
313    /// the string as a single segment (and thus only titlecasing the beginning of it). Performs
314    /// the specified leading adjustment behavior from the options without loading additional data.
315    ///
316    /// Note that [`TitlecaseMapper`] has better behavior, most users should consider using
317    /// it instead. This method primarily exists for people who care about the amount of data being loaded.
318    ///
319    /// This should typically be used as a lower-level helper to construct the titlecasing operation desired
320    /// by the application, for example one can titlecase on a per-word basis by mixing this with
321    /// a `WordSegmenter`.
322    ///
323    /// This function is context and language sensitive. Callers should pass the text's language
324    /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or
325    /// `Default::default()` for the root locale.
326    ///
327    /// This function performs "adjust to cased" leading adjustment behavior when [`LeadingAdjustment::Auto`] or [`LeadingAdjustment::ToCased`]
328    /// is set. Auto mode is not able to pick the "adjust to letter/number/symbol" behavior as this type does not load
329    /// the data to do so, use [`TitlecaseMapper`] if such behavior is desired. See
330    /// the docs of [`TitlecaseMapper`] for more information on what this means. There is no difference between
331    /// the behavior of this function and the equivalent ones on [`TitlecaseMapper`] when the head adjustment mode
332    /// is [`LeadingAdjustment::None`].
333    ///
334    /// See [`Self::titlecase_segment_with_only_case_data()`] for the equivalent lower-level function that returns a [`Writeable`]
335    ///
336    /// # Examples
337    ///
338    /// ```rust
339    /// use icu::casemap::CaseMapper;
340    /// use icu::locale::langid;
341    ///
342    /// let cm = CaseMapper::new();
343    /// let root = langid!("und");
344    ///
345    /// let default_options = Default::default();
346    ///
347    /// // note that the subsequent words are not titlecased, this function assumes
348    /// // that the entire string is a single segment and only titlecases at the beginning.
349    /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("hEllO WorLd", &root, default_options), "Hello world");
350    /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("Γειά σου Κόσμε", &root, default_options), "Γειά σου κόσμε");
351    /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("नमस्ते दुनिया", &root, default_options), "नमस्ते दुनिया");
352    /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("Привет мир", &root, default_options), "Привет мир");
353    ///
354    /// // Some behavior is language-sensitive
355    /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("istanbul", &root, default_options), "Istanbul");
356    /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("istanbul", &langid!("tr"), default_options), "İstanbul"); // Turkish dotted i
357    ///
358    /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("և Երևանի", &root, default_options), "Եւ երևանի");
359    /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("և Երևանի", &langid!("hy"), default_options), "Եվ երևանի"); // Eastern Armenian ech-yiwn ligature
360    ///
361    /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("ijkdijk", &root, default_options), "Ijkdijk");
362    /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("ijkdijk", &langid!("nl"), default_options), "IJkdijk"); // Dutch IJ digraph
363    /// ```
364    ///
365    /// [`TitlecaseMapper`]: crate::TitlecaseMapper
366    pub fn titlecase_segment_with_only_case_data_to_string<'s>(
367        self,
368        src: &'s str,
369        langid: &LanguageIdentifier,
370        options: TitlecaseOptions,
371    ) -> Cow<'s, str> {
372        writeable::to_string_or_borrow(
373            &self.titlecase_segment_with_only_case_data(src, langid, options),
374            src.as_bytes(),
375        )
376    }
377
378    /// Case-folds the characters in the given string as a String.
379    /// This function is locale-independent and context-insensitive.
380    ///
381    /// Can be used to test if two strings are case-insensitively equivalent.
382    ///
383    /// See [`Self::fold()`] for the equivalent lower-level function that returns a [`Writeable`]
    ///
385    /// # Examples
386    ///
387    /// ```rust
388    /// use icu::casemap::CaseMapper;
389    ///
390    /// let cm = CaseMapper::new();
391    ///
392    /// // Check if two strings are equivalent case insensitively
393    /// assert_eq!(cm.fold_string("hEllO WorLd"), cm.fold_string("HELLO worlD"));
394    ///
395    /// assert_eq!(cm.fold_string("hEllO WorLd"), "hello world");
396    /// assert_eq!(cm.fold_string("Γειά σου Κόσμε"), "γειά σου κόσμε");
397    /// assert_eq!(cm.fold_string("नमस्ते दुनिया"), "नमस्ते दुनिया");
398    /// assert_eq!(cm.fold_string("Привет мир"), "привет мир");
399    /// ```
400    pub fn fold_string(self, src: &str) -> Cow<'_, str> {
401        writeable::to_string_or_borrow(&self.fold(src), src.as_bytes())
402    }
403
404    /// Case-folds the characters in the given string as a String,
405    /// using Turkic (T) mappings for dotted/dotless I.
406    /// This function is locale-independent and context-insensitive.
407    ///
408    /// Can be used to test if two strings are case-insensitively equivalent.
409    ///
410    /// See [`Self::fold_turkic()`] for the equivalent lower-level function that returns a [`Writeable`]
411    ///
412    /// # Examples
413    ///
414    /// ```rust
415    /// use icu::casemap::CaseMapper;
416    ///
417    /// let cm = CaseMapper::new();
418    ///
419    /// // Check if two strings are equivalent case insensitively
420    /// assert_eq!(cm.fold_turkic_string("İstanbul"), cm.fold_turkic_string("iSTANBUL"));
421    ///
422    /// assert_eq!(cm.fold_turkic_string("İstanbul not Constantinople"), "istanbul not constantinople");
423    /// assert_eq!(cm.fold_turkic_string("Istanbul not Constantınople"), "ıstanbul not constantınople");
424    ///
425    /// assert_eq!(cm.fold_turkic_string("hEllO WorLd"), "hello world");
426    /// assert_eq!(cm.fold_turkic_string("Γειά σου Κόσμε"), "γειά σου κόσμε");
427    /// assert_eq!(cm.fold_turkic_string("नमस्ते दुनिया"), "नमस्ते दुनिया");
428    /// assert_eq!(cm.fold_turkic_string("Привет мир"), "привет мир");
429    /// ```
430    pub fn fold_turkic_string(self, src: &str) -> Cow<'_, str> {
431        writeable::to_string_or_borrow(&self.fold_turkic(src), src.as_bytes())
432    }
433
434    /// Adds all simple case mappings and the full case folding for `c` to `set`.
435    /// Also adds special case closure mappings.
436    ///
437    /// Identical to [`CaseMapCloserBorrowed::add_case_closure_to()`], see docs there for more information.
438    /// This method is duplicated so that one does not need to load extra unfold data
439    /// if they only need this and not also [`CaseMapCloserBorrowed::add_string_case_closure_to()`].
440    ///
441    ///
442    /// # Examples
443    ///
444    /// ```rust
445    /// use icu::casemap::CaseMapper;
446    /// use icu::collections::codepointinvlist::CodePointInversionListBuilder;
447    ///
448    /// let cm = CaseMapper::new();
449    /// let mut builder = CodePointInversionListBuilder::new();
450    /// cm.add_case_closure_to('s', &mut builder);
451    ///
452    /// let set = builder.build();
453    ///
454    /// assert!(set.contains('S'));
455    /// assert!(set.contains('ſ'));
456    /// assert!(!set.contains('s')); // does not contain itself
457    /// ```
458    ///
459    /// [`CaseMapCloserBorrowed::add_case_closure_to()`]: crate::CaseMapCloserBorrowed::add_case_closure_to
460    /// [`CaseMapCloserBorrowed::add_string_case_closure_to()`]: crate::CaseMapCloserBorrowed::add_string_case_closure_to
    pub fn add_case_closure_to<S: ClosureSink>(self, c: char, set: &mut S) {
        // Pure delegation: the data-level method of the same name does the work and reports
        // its results through `set`.
        self.data.add_case_closure_to(c, set);
    }
464
465    /// Returns the lowercase mapping of the given `char`.
466    /// This function only implements simple and common mappings. Full mappings,
467    /// which can map one `char` to a string, are not included.
468    /// For full mappings, use [`CaseMapperBorrowed::lowercase`].
469    ///
470    /// # Examples
471    ///
472    /// ```rust
473    /// use icu::casemap::CaseMapper;
474    ///
475    /// let cm = CaseMapper::new();
476    ///
477    /// assert_eq!(cm.simple_lowercase('C'), 'c');
478    /// assert_eq!(cm.simple_lowercase('c'), 'c');
479    /// assert_eq!(cm.simple_lowercase('Ć'), 'ć');
480    /// assert_eq!(cm.simple_lowercase('Γ'), 'γ');
481    /// ```
    pub fn simple_lowercase(self, c: char) -> char {
        // Thin wrapper over the data-level simple (char-to-char) lowercase lookup.
        self.data.simple_lower(c)
    }
485
486    /// Returns the uppercase mapping of the given `char`.
487    /// This function only implements simple and common mappings. Full mappings,
488    /// which can map one `char` to a string, are not included.
489    /// For full mappings, use [`CaseMapperBorrowed::uppercase`].
490    ///
491    /// # Examples
492    ///
493    /// ```rust
494    /// use icu::casemap::CaseMapper;
495    ///
496    /// let cm = CaseMapper::new();
497    ///
498    /// assert_eq!(cm.simple_uppercase('c'), 'C');
499    /// assert_eq!(cm.simple_uppercase('C'), 'C');
500    /// assert_eq!(cm.simple_uppercase('ć'), 'Ć');
501    /// assert_eq!(cm.simple_uppercase('γ'), 'Γ');
502    ///
503    /// assert_eq!(cm.simple_uppercase('ǳ'), 'Ǳ');
504    /// ```
    pub fn simple_uppercase(self, c: char) -> char {
        // Thin wrapper over the data-level simple (char-to-char) uppercase lookup.
        self.data.simple_upper(c)
    }
508
509    /// Returns the titlecase mapping of the given `char`.
510    /// This function only implements simple and common mappings. Full mappings,
511    /// which can map one `char` to a string, are not included.
512    ///
513    /// # Examples
514    ///
515    /// ```rust
516    /// use icu::casemap::CaseMapper;
517    ///
518    /// let cm = CaseMapper::new();
519    ///
520    /// assert_eq!(cm.simple_titlecase('ǳ'), 'ǲ');
521    ///
522    /// assert_eq!(cm.simple_titlecase('c'), 'C');
523    /// assert_eq!(cm.simple_titlecase('C'), 'C');
524    /// assert_eq!(cm.simple_titlecase('ć'), 'Ć');
525    /// assert_eq!(cm.simple_titlecase('γ'), 'Γ');
526    /// ```
    pub fn simple_titlecase(self, c: char) -> char {
        // Thin wrapper over the data-level simple (char-to-char) titlecase lookup.
        self.data.simple_title(c)
    }
530
531    /// Returns the simple case folding of the given char.
532    /// For full mappings, use [`CaseMapperBorrowed::fold`].
533    ///
534    /// This function can be used to perform caseless matches on
535    /// individual characters.
536    /// > *Note:* With Unicode 15.0 data, there are three
537    /// > pairs of characters for which equivalence under this
538    /// > function is inconsistent with equivalence of the
539    /// > one-character strings under [`CaseMapperBorrowed::fold`].
540    /// > This is resolved in Unicode 15.1 and later.
541    ///
542    /// For compatibility applications where simple case folding
543    /// of strings is required, this function can be applied to
544    /// each character of a string.  Note that the resulting
545    /// equivalence relation is different from that obtained
546    /// by [`CaseMapperBorrowed::fold`]:
547    /// The strings "Straße" and "STRASSE" are distinct
548    /// under simple case folding, but are equivalent under
549    /// default (full) case folding.
550    ///
551    /// # Examples
552    ///
553    /// ```rust
554    /// use icu::casemap::CaseMapper;
555    ///
556    /// let cm = CaseMapper::new();
557    ///
558    /// // perform case insensitive checks
559    /// assert_eq!(cm.simple_fold('σ'), cm.simple_fold('ς'));
560    /// assert_eq!(cm.simple_fold('Σ'), cm.simple_fold('ς'));
561    ///
562    /// assert_eq!(cm.simple_fold('c'), 'c');
563    /// assert_eq!(cm.simple_fold('Ć'), 'ć');
564    /// assert_eq!(cm.simple_fold('Γ'), 'γ');
565    /// assert_eq!(cm.simple_fold('ς'), 'σ');
566    ///
567    /// assert_eq!(cm.simple_fold('ß'), 'ß');
568    /// assert_eq!(cm.simple_fold('I'), 'i');
569    /// assert_eq!(cm.simple_fold('İ'), 'İ');
570    /// assert_eq!(cm.simple_fold('ı'), 'ı');
571    /// ```
572    pub fn simple_fold(self, c: char) -> char {
573        self.data.simple_fold(c, FoldOptions::default())
574    }
575
576    /// Returns the simple case folding of the given char, using Turkic (T) mappings for
577    /// dotted/dotless i. This function does not fold `i` and `I` to the same character. Instead,
578    /// `I` will fold to `ı`, and `İ` will fold to `i`. Otherwise, this is the same as
    /// [`CaseMapperBorrowed::simple_fold()`].
580    ///
581    /// You can use the case folding to perform Turkic caseless matches on characters
582    /// provided they don't full-casefold to strings. To avoid that situation,
583    /// convert to a string and use [`CaseMapperBorrowed::fold_turkic`].
584    ///
585    ///
586    /// # Examples
587    ///
588    /// ```rust
589    /// use icu::casemap::CaseMapper;
590    ///
591    /// let cm = CaseMapper::new();
592    ///
593    /// assert_eq!(cm.simple_fold_turkic('I'), 'ı');
594    /// assert_eq!(cm.simple_fold_turkic('İ'), 'i');
595    /// ```
596    pub fn simple_fold_turkic(self, c: char) -> char {
597        self.data
598            .simple_fold(c, FoldOptions::with_turkic_mappings())
599    }
600}
601
602impl CaseMapper {
603    /// Creates a [`CaseMapperBorrowed`] using compiled data.
604    ///
605    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
606    ///
607    /// [📚 Help choosing a constructor](icu_provider::constructors)
608    ///
609    /// # Examples
610    ///
611    /// ```rust
612    /// use icu::casemap::CaseMapper;
613    /// use icu::locale::langid;
614    ///
615    /// let cm = CaseMapper::new();
616    ///
617    /// assert_eq!(
618    ///     cm.uppercase_to_string("hello world", &langid!("und")),
619    ///     "HELLO WORLD"
620    /// );
621    /// ```
    #[cfg(feature = "compiled_data")]
    #[expect(clippy::new_ret_no_self)] // Intentional
    pub const fn new() -> CaseMapperBorrowed<'static> {
        // Deliberately returns the borrowed type rather than `Self`; call
        // `CaseMapperBorrowed::static_to_owned()` on the result when an owned `CaseMapper`
        // is needed.
        CaseMapperBorrowed::new()
    }
627
628    /// Constructs a borrowed version of this type for more efficient querying.
629    pub fn as_borrowed(&self) -> CaseMapperBorrowed<'_> {
630        CaseMapperBorrowed {
631            data: self.data.get(),
632        }
633    }
634
    // NOTE(review): presumably expands to the `try_new_with_buffer_provider` constructor, built
    // on top of `try_new_unstable` below. `new: skip` presumably suppresses a generated
    // compiled-data `new`, which is written by hand in this impl. Confirm both against the
    // `icu_provider` macro documentation.
    icu_provider::gen_buffer_data_constructors!(() -> error: DataError,
    functions: [
        new: skip,
        try_new_with_buffer_provider,
        try_new_unstable,
        Self,
    ]);
642
643    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
644    pub fn try_new_unstable<P>(provider: &P) -> Result<CaseMapper, DataError>
645    where
646        P: DataProvider<CaseMapV1> + ?Sized,
647    {
648        let data = provider.load(Default::default())?.payload;
649        Ok(Self { data })
650    }
651}
652
653#[cfg(test)]
654mod tests {
655    use super::*;
656    use icu_locale_core::langid;
657
    #[test]
    /// Tests for SpecialCasing.txt. Some of the special cases are data-driven, some are code-driven
    fn test_special_cases() {
        let cm = CaseMapper::new();
        let root = langid!("und");
        // Default titlecase options, shared by every titlecase assertion below.
        let default_options = Default::default();

        // Ligatures

        // U+FB00 LATIN SMALL LIGATURE FF
        assert_eq!(cm.uppercase_to_string("ﬀ", &root), "FF");
        // U+FB05 LATIN SMALL LIGATURE LONG S T
        assert_eq!(cm.uppercase_to_string("ﬅ", &root), "ST");

        // No corresponding uppercased character

        // U+0149 LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
        assert_eq!(cm.uppercase_to_string("ŉ", &root), "ʼN");

        // U+1F50 GREEK SMALL LETTER UPSILON WITH PSILI
        assert_eq!(cm.uppercase_to_string("ὐ", &root), "Υ̓");
        // U+1FF6 GREEK SMALL LETTER OMEGA WITH PERISPOMENI
        assert_eq!(cm.uppercase_to_string("ῶ", &root), "Ω͂");

        // YPOGEGRAMMENI / PROSGEGRAMMENI special cases

        // E.g. <alpha><psili><iota_subscript> (alpha, U+0313, U+0345) is uppercased to
        // <ALPHA><psili><IOTA>: the iota subscript becomes a full capital iota at the end.
        assert_eq!(
            cm.uppercase_to_string("α\u{0313}\u{0345}", &root),
            "Α\u{0313}Ι"
        );
        // but the YPOGEGRAMMENI should not titlecase
        assert_eq!(
            cm.titlecase_segment_with_only_case_data_to_string(
                "α\u{0313}\u{0345}",
                &root,
                default_options
            ),
            "Α\u{0313}\u{0345}"
        );

        // U+1F80 GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI
        assert_eq!(
            cm.titlecase_segment_with_only_case_data_to_string("ᾀ", &root, default_options),
            "ᾈ"
        );
        assert_eq!(cm.uppercase_to_string("ᾀ", &root), "ἈΙ");

        // U+1FFC GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
        assert_eq!(cm.lowercase_to_string("ῼ", &root), "ῳ");
        assert_eq!(
            cm.titlecase_segment_with_only_case_data_to_string("ῼ", &root, default_options),
            "ῼ"
        );
        assert_eq!(cm.uppercase_to_string("ῼ", &root), "ΩΙ");

        // U+1F98 GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI
        assert_eq!(cm.lowercase_to_string("ᾘ", &root), "ᾐ");
        assert_eq!(
            cm.titlecase_segment_with_only_case_data_to_string("ᾘ", &root, default_options),
            "ᾘ"
        );
        assert_eq!(cm.uppercase_to_string("ᾘ", &root), "ἨΙ");

        // U+1FB2 GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI
        assert_eq!(cm.lowercase_to_string("ᾲ", &root), "ᾲ");
        assert_eq!(
            cm.titlecase_segment_with_only_case_data_to_string("ᾲ", &root, default_options),
            "Ὰ\u{345}"
        );
        assert_eq!(cm.uppercase_to_string("ᾲ", &root), "ᾺΙ");

        // Final sigma test
        // U+03A3 GREEK CAPITAL LETTER SIGMA in Final_Sigma context
        assert_eq!(cm.lowercase_to_string("ΙΙΙΣ", &root), "ιιις");

        // Turkish / Azeri
        // Both languages are expected to behave identically in every assertion below.
        let tr = langid!("tr");
        let az = langid!("az");
        // U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE
        assert_eq!(cm.lowercase_to_string("İ", &tr), "i");
        assert_eq!(cm.lowercase_to_string("İ", &az), "i");
        assert_eq!(
            cm.titlecase_segment_with_only_case_data_to_string("İ", &tr, default_options),
            "İ"
        );
        assert_eq!(
            cm.titlecase_segment_with_only_case_data_to_string("İ", &az, default_options),
            "İ"
        );
        assert_eq!(cm.uppercase_to_string("İ", &tr), "İ");
        assert_eq!(cm.uppercase_to_string("İ", &az), "İ");

        // U+0049 LATIN CAPITAL LETTER I and U+0307 COMBINING DOT ABOVE
        assert_eq!(cm.lowercase_to_string("I\u{0307}", &tr), "i");
        assert_eq!(cm.lowercase_to_string("I\u{0307}", &az), "i");
        assert_eq!(
            cm.titlecase_segment_with_only_case_data_to_string("I\u{0307}", &tr, default_options),
            "I\u{0307}"
        );
        assert_eq!(
            cm.titlecase_segment_with_only_case_data_to_string("I\u{0307}", &az, default_options),
            "I\u{0307}"
        );
        assert_eq!(cm.uppercase_to_string("I\u{0307}", &tr), "I\u{0307}");
        assert_eq!(cm.uppercase_to_string("I\u{0307}", &az), "I\u{0307}");

        // U+0049 LATIN CAPITAL LETTER I
        assert_eq!(cm.lowercase_to_string("I", &tr), "ı");
        assert_eq!(cm.lowercase_to_string("I", &az), "ı");
        assert_eq!(
            cm.titlecase_segment_with_only_case_data_to_string("I", &tr, default_options),
            "I"
        );
        assert_eq!(
            cm.titlecase_segment_with_only_case_data_to_string("I", &az, default_options),
            "I"
        );
        assert_eq!(cm.uppercase_to_string("I", &tr), "I");
        assert_eq!(cm.uppercase_to_string("I", &az), "I");

        // U+0069 LATIN SMALL LETTER I
        assert_eq!(cm.lowercase_to_string("i", &tr), "i");
        assert_eq!(cm.lowercase_to_string("i", &az), "i");
        assert_eq!(
            cm.titlecase_segment_with_only_case_data_to_string("i", &tr, default_options),
            "İ"
        );
        assert_eq!(
            cm.titlecase_segment_with_only_case_data_to_string("i", &az, default_options),
            "İ"
        );
        assert_eq!(cm.uppercase_to_string("i", &tr), "İ");
        assert_eq!(cm.uppercase_to_string("i", &az), "İ");
    }
793
794    #[test]
795    fn test_cherokee_case_folding() {
796        let case_mapping = CaseMapper::new();
797        assert_eq!(case_mapping.simple_fold('Ꭰ'), 'Ꭰ');
798        assert_eq!(case_mapping.simple_fold('ꭰ'), 'Ꭰ');
799        assert_eq!(case_mapping.simple_fold_turkic('Ꭰ'), 'Ꭰ');
800        assert_eq!(case_mapping.simple_fold_turkic('ꭰ'), 'Ꭰ');
801        assert_eq!(case_mapping.fold_string("Ꭰ"), "Ꭰ");
802        assert_eq!(case_mapping.fold_string("ꭰ"), "Ꭰ");
803        assert_eq!(case_mapping.fold_turkic_string("Ꭰ"), "Ꭰ");
804        assert_eq!(case_mapping.fold_turkic_string("ꭰ"), "Ꭰ");
805    }
806}
icu_casemap/casemapper.rs

icu_casemap/
casemapper.rs