icu_casemap/
titlecase.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
//! Titlecasing-specific options and the [`TitlecaseMapper`] type.
6use crate::provider::CaseMapV1;
7use crate::{CaseMapper, CaseMapperBorrowed};
8use alloc::borrow::Cow;
9use icu_locale_core::LanguageIdentifier;
10use icu_properties::props::{GeneralCategory, GeneralCategoryGroup};
11use icu_properties::provider::PropertyEnumGeneralCategoryV1;
12use icu_properties::{CodePointMapData, CodePointMapDataBorrowed};
13use icu_provider::prelude::*;
14use writeable::Writeable;
15
/// Controls what happens to the remainder of a segment after its
/// beginning has been titlecased.
///
/// # Examples
///
/// ```rust
/// use icu::casemap::options::{TitlecaseOptions, TrailingCase};
/// use icu::casemap::TitlecaseMapper;
/// use icu::locale::langid;
///
/// let cm = TitlecaseMapper::new();
/// let root = langid!("und");
///
/// let default_options = Default::default();
/// let mut preserve_case: TitlecaseOptions = Default::default();
/// preserve_case.trailing_case = Some(TrailingCase::Unchanged);
///
/// // By default the tail of the segment is lowercased:
/// assert_eq!(
///     cm.titlecase_segment_to_string("spOngeBoB", &root, default_options),
///     "Spongebob"
/// );
/// // With `Unchanged`, only the head of the segment is touched:
/// assert_eq!(
///     cm.titlecase_segment_to_string("spOngeBoB", &root, preserve_case),
///     "SpOngeBoB"
/// );
/// ```
#[non_exhaustive]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum TrailingCase {
    /// Leave the casing of the remainder as it is ("spoNgEBoB" -> "SpoNgEBoB")
    Unchanged,
    /// Lowercase the remainder ("spoNgEBoB" -> "Spongebob")
    ///
    /// This is the default
    #[default]
    Lower,
}
52
/// Selects the position at which titlecasing of a segment begins.
///
/// By default, [`TitlecaseMapper`] performs "leading adjustment": it looks for the first "relevant" character
/// of the string and only starts titlecasing there. Leading punctuation is skipped this way, so that
/// strings like `'twas` or `«hello»` are titlecased appropriately.
///
/// What counts as a "relevant" character depends on the mode. In "adjust to cased" mode it is the first cased character,
/// whereas in "auto" mode it is the first letter, number, symbol, or private use character. As a consequence,
/// the strings `49ers` and `«丰(abc)»` become `49Ers` and `«丰(Abc)»` in "adjust to cased" mode, but are left unchanged in "auto" mode.
/// The distinction mostly matters for segments that mix numbers with letters, or that mix writing systems.
///
/// # Examples
///
/// ```rust
/// use icu::casemap::options::{LeadingAdjustment, TitlecaseOptions};
/// use icu::casemap::TitlecaseMapper;
/// use icu::locale::langid;
///
/// let cm = TitlecaseMapper::new();
/// let root = langid!("und");
///
/// let default_options = Default::default(); // leading adjustment defaults to Auto
/// let mut no_adjust: TitlecaseOptions = Default::default();
/// let mut adjust_to_cased: TitlecaseOptions = Default::default();
/// no_adjust.leading_adjustment = Some(LeadingAdjustment::None);
/// adjust_to_cased.leading_adjustment = Some(LeadingAdjustment::ToCased);
///
/// // Exhibits leading adjustment when set:
/// assert_eq!(
///     cm.titlecase_segment_to_string("«hello»", &root, default_options),
///     "«Hello»"
/// );
/// assert_eq!(
///     cm.titlecase_segment_to_string("«hello»", &root, adjust_to_cased),
///     "«Hello»"
/// );
/// assert_eq!(
///     cm.titlecase_segment_to_string("«hello»", &root, no_adjust),
///     "«hello»"
/// );
///
/// // Only changed in adjust-to-cased mode:
/// assert_eq!(
///     cm.titlecase_segment_to_string("丰(abc)", &root, default_options),
///     "丰(abc)"
/// );
/// assert_eq!(
///     cm.titlecase_segment_to_string("丰(abc)", &root, adjust_to_cased),
///     "丰(Abc)"
/// );
/// assert_eq!(
///     cm.titlecase_segment_to_string("丰(abc)", &root, no_adjust),
///     "丰(abc)"
/// );
///
/// // Only changed in adjust-to-cased mode:
/// assert_eq!(
///     cm.titlecase_segment_to_string("49ers", &root, default_options),
///     "49ers"
/// );
/// assert_eq!(
///     cm.titlecase_segment_to_string("49ers", &root, adjust_to_cased),
///     "49Ers"
/// );
/// assert_eq!(
///     cm.titlecase_segment_to_string("49ers", &root, no_adjust),
///     "49ers"
/// );
/// ```
#[non_exhaustive]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum LeadingAdjustment {
    /// Begin titlecasing at the very first character, whether or not it is relevant for casing
    /// ("'twixt" -> "'twixt", "twixt" -> "Twixt")
    None,
    /// Skip ahead to the first relevant character before applying casing
    /// ("'twixt" -> "'Twixt"). The "relevant" character is chosen by the best available algorithm:
    /// by default this is the first letter, number, symbol, or private use character,
    /// but if no data is available (e.g. this API is being called via [`CaseMapperBorrowed::titlecase_segment_with_only_case_data()`]),
    /// then it may be equivalent to "adjust to cased".
    ///
    /// This is the default
    #[default]
    Auto,
    /// Skip ahead to the first cased character before applying casing
    /// ("'twixt" -> "'Twixt")
    ToCased,
}
141
142/// Various options for controlling titlecasing
143///
144/// See docs of [`TitlecaseMapper`] for examples.
145#[non_exhaustive]
146#[derive(Copy, Clone, Default, PartialEq, Eq, Hash, Debug)]
147pub struct TitlecaseOptions {
148    /// How to handle the rest of the string once the head of the
149    /// string has been titlecased
150    ///
151    /// Default is [`TrailingCase::Lower`]
152    pub trailing_case: Option<TrailingCase>,
153    /// Whether to start casing at the beginning of the string or at the first
154    /// relevant character.
155    ///
156    /// Default is [`LeadingAdjustment::Auto`]
157    pub leading_adjustment: Option<LeadingAdjustment>,
158}
159
160/// A wrapper around [`CaseMapper`] that can compute titlecasing stuff, and is able to load additional data
161/// to support the non-legacy "head adjustment" behavior.
162///
163///
164/// Most methods for this type live on [`TitlecaseMapperBorrowed`], which you can obtain via
165/// [`TitlecaseMapper::new()`] or [`TitlecaseMapper::as_borrowed()`].
166///
167/// By default, [`TitlecaseMapperBorrowed::titlecase_segment()`] and [`TitlecaseMapperBorrowed::titlecase_segment_to_string()`] perform "leading adjustment",
168/// where they wait till the first relevant character to begin titlecasing. For example, in the string `'twixt`, the apostrophe
169/// is ignored because the word starts at the first "t", which will get titlecased (producing `'Twixt`). Other punctuation will
170/// also be ignored, like in the string `«hello»`, which will get titlecased to `«Hello»`.
171///
172/// This is a separate type from [`CaseMapper`] because it loads the additional data
173/// required by [`LeadingAdjustment::Auto`] to perform the best possible leading adjustment.
174///
175/// If you are planning on only using [`LeadingAdjustment::None`] or [`LeadingAdjustment::ToCased`], consider using [`CaseMapper`] directly; this
176/// type will have no additional behavior.
177///
178/// # Examples
179///
180/// Basic casemapping behavior:
181///
182/// ```rust
183/// use icu::casemap::TitlecaseMapper;
184/// use icu::locale::langid;
185///
186/// let cm = TitlecaseMapper::new();
187/// let root = langid!("und");
188///
189/// let default_options = Default::default();
190///
191/// // note that the subsequent words are not titlecased, this function assumes
192/// // that the entire string is a single segment and only titlecases at the beginning.
193/// assert_eq!(cm.titlecase_segment_to_string("hEllO WorLd", &root, default_options), "Hello world");
194/// assert_eq!(cm.titlecase_segment_to_string("Γειά σου Κόσμε", &root, default_options), "Γειά σου κόσμε");
195/// assert_eq!(cm.titlecase_segment_to_string("नमस्ते दुनिया", &root, default_options), "नमस्ते दुनिया");
196/// assert_eq!(cm.titlecase_segment_to_string("Привет мир", &root, default_options), "Привет мир");
197///
198/// // Some behavior is language-sensitive
199/// assert_eq!(cm.titlecase_segment_to_string("istanbul", &root, default_options), "Istanbul");
200/// assert_eq!(cm.titlecase_segment_to_string("istanbul", &langid!("tr"), default_options), "İstanbul"); // Turkish dotted i
201///
202/// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &root, default_options), "Եւ երևանի");
203/// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &langid!("hy"), default_options), "Եվ երևանի"); // Eastern Armenian ech-yiwn ligature
204///
205/// assert_eq!(cm.titlecase_segment_to_string("ijkdijk", &root, default_options), "Ijkdijk");
206/// assert_eq!(cm.titlecase_segment_to_string("ijkdijk", &langid!("nl"), default_options), "IJkdijk"); // Dutch IJ digraph
207/// ```
208#[derive(Clone, Debug)]
209pub struct TitlecaseMapper<CM> {
210    cm: CM,
211    gc: CodePointMapData<GeneralCategory>,
212}
213
214impl TitlecaseMapper<CaseMapper> {
215    icu_provider::gen_buffer_data_constructors!(() -> error: DataError,
216    functions: [
217        new: skip,
218        try_new_with_buffer_provider,
219        try_new_unstable,
220        Self,
221    ]);
222
223    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
224    pub fn try_new_unstable<P>(provider: &P) -> Result<Self, DataError>
225    where
226        P: DataProvider<CaseMapV1> + DataProvider<PropertyEnumGeneralCategoryV1> + ?Sized,
227    {
228        let cm = CaseMapper::try_new_unstable(provider)?;
229        let gc = icu_properties::CodePointMapData::<icu_properties::props::GeneralCategory>::try_new_unstable(provider)?;
230        Ok(Self { cm, gc })
231    }
232}
233
impl TitlecaseMapper<CaseMapper> {
    /// A constructor which creates a [`TitlecaseMapperBorrowed`] using compiled data
    ///
    /// Note that this returns the borrowed type rather than `Self`; it simply forwards to
    /// [`TitlecaseMapperBorrowed::new()`].
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    #[allow(clippy::new_ret_no_self)] // Intentional: this constructor hands back the borrowed type
    pub const fn new() -> TitlecaseMapperBorrowed<'static> {
        TitlecaseMapperBorrowed::new()
    }
}
// The bound is `AsRef<CaseMapper>` so that `CM` can be either an owned `CaseMapper` or a reference to one.
247impl<CM: AsRef<CaseMapper>> TitlecaseMapper<CM> {
248    icu_provider::gen_buffer_data_constructors!((casemapper: CM) -> error: DataError,
249    functions: [
250        new_with_mapper: skip,
251        try_new_with_mapper_with_buffer_provider,
252        try_new_with_mapper_unstable,
253        Self,
254    ]);
255
256    /// A constructor which creates a [`TitlecaseMapper`] from an existing [`CaseMapper`]
257    /// (either owned or as a reference) and compiled data
258    ///
259    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
260    ///
261    /// [📚 Help choosing a constructor](icu_provider::constructors)
262    #[cfg(feature = "compiled_data")]
263    pub const fn new_with_mapper(casemapper: CM) -> Self {
264        Self {
265            cm: casemapper,
266            gc: icu_properties::CodePointMapData::<icu_properties::props::GeneralCategory>::new()
267                .static_to_owned(),
268        }
269    }
270
271    /// Construct this object to wrap an existing CaseMapper (or a reference to one), loading additional data as needed.
272    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_with_mapper)]
273    pub fn try_new_with_mapper_unstable<P>(provider: &P, casemapper: CM) -> Result<Self, DataError>
274    where
275        P: DataProvider<CaseMapV1> + DataProvider<PropertyEnumGeneralCategoryV1> + ?Sized,
276    {
277        let gc = icu_properties::CodePointMapData::<icu_properties::props::GeneralCategory>::try_new_unstable(provider)?;
278        Ok(Self { cm: casemapper, gc })
279    }
280
281    /// Constructs a borrowed version of this type for more efficient querying.
282    pub fn as_borrowed(&self) -> TitlecaseMapperBorrowed<'_> {
283        TitlecaseMapperBorrowed {
284            cm: self.cm.as_ref().as_borrowed(),
285            gc: self.gc.as_borrowed(),
286        }
287    }
288}
289
290/// A borrowed [`TitlecaseMapper`].
291///
292/// See methods or [`TitlecaseMapper`] for examples.
293#[derive(Clone, Debug, Copy)]
294pub struct TitlecaseMapperBorrowed<'a> {
295    cm: CaseMapperBorrowed<'a>,
296    gc: CodePointMapDataBorrowed<'a, GeneralCategory>,
297}
298
299impl TitlecaseMapperBorrowed<'static> {
300    /// A constructor which creates a [`TitlecaseMapperBorrowed`] using compiled data
301    ///
302    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
303    ///
304    /// [📚 Help choosing a constructor](icu_provider::constructors)
305    #[cfg(feature = "compiled_data")]
306    pub const fn new() -> Self {
307        Self {
308            cm: CaseMapper::new(),
309            gc: icu_properties::CodePointMapData::<icu_properties::props::GeneralCategory>::new(),
310        }
311    }
312    /// Cheaply converts a [`TitlecaseMapperBorrowed<'static>`] into a [`TitlecaseMapper`].
313    ///
314    /// Note: Due to branching and indirection, using [`TitlecaseMapper`] might inhibit some
315    /// compile-time optimizations that are possible with [`TitlecaseMapper`].
316    pub const fn static_to_owned(self) -> TitlecaseMapper<CaseMapper> {
317        TitlecaseMapper {
318            cm: self.cm.static_to_owned(),
319            gc: self.gc.static_to_owned(),
320        }
321    }
322}
323
// Only available with compiled data, since the default value is built from it.
#[cfg(feature = "compiled_data")]
impl Default for TitlecaseMapperBorrowed<'static> {
    /// Equivalent to [`TitlecaseMapperBorrowed::new()`].
    fn default() -> Self {
        Self::new()
    }
}
330
331impl<'a> TitlecaseMapperBorrowed<'a> {
332    /// Returns the full titlecase mapping of the given string as a [`Writeable`], treating
333    /// the string as a single segment (and thus only titlecasing the beginning of it).
334    ///
335    /// This should typically be used as a lower-level helper to construct the titlecasing operation desired
336    /// by the application, for example one can titlecase on a per-word basis by mixing this with
337    /// a `WordSegmenter`.
338    ///
339    /// This function is context and language sensitive. Callers should pass the text's language
340    /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or
341    /// `Default::default()` for the root locale.
342    ///
343    /// See [`TitlecaseMapperBorrowed::titlecase_segment_to_string()`] for the equivalent convenience function that returns a String,
344    /// as well as for an example.
345    pub fn titlecase_segment(
346        self,
347        src: &'a str,
348        langid: &LanguageIdentifier,
349        options: TitlecaseOptions,
350    ) -> impl Writeable + 'a {
351        if options.leading_adjustment.unwrap_or_default() == LeadingAdjustment::Auto {
352            // letter, number, symbol, or private use code point
353            const HEAD_GROUPS: GeneralCategoryGroup = GeneralCategoryGroup::Letter
354                .union(GeneralCategoryGroup::Number)
355                .union(GeneralCategoryGroup::Symbol)
356                .union(GeneralCategoryGroup::PrivateUse);
357            self.cm
358                .titlecase_segment_with_adjustment(src, langid, options, |_data, ch| {
359                    HEAD_GROUPS.contains(self.gc.get(ch))
360                })
361        } else {
362            self.cm
363                .titlecase_segment_with_adjustment(src, langid, options, |data, ch| {
364                    data.is_cased(ch)
365                })
366        }
367    }
368
369    /// Returns the full titlecase mapping of the given string as a String, treating
370    /// the string as a single segment (and thus only titlecasing the beginning of it).
371    ///
372    /// This should typically be used as a lower-level helper to construct the titlecasing operation desired
373    /// by the application, for example one can titlecase on a per-word basis by mixing this with
374    /// a `WordSegmenter`.
375    ///
376    /// This function is context and language sensitive. Callers should pass the text's language
377    /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or
378    /// `Default::default()` for the root locale.
379    ///
380    /// See [`TitlecaseMapperBorrowed::titlecase_segment()`] for the equivalent lower-level function that returns a [`Writeable`]
381    ///
382    /// # Examples
383    ///
384    /// ```rust
385    /// use icu::casemap::TitlecaseMapper;
386    /// use icu::locale::langid;
387    ///
388    /// let cm = TitlecaseMapper::new();
389    /// let root = langid!("und");
390    ///
391    /// let default_options = Default::default();
392    ///
393    /// // note that the subsequent words are not titlecased, this function assumes
394    /// // that the entire string is a single segment and only titlecases at the beginning.
395    /// assert_eq!(cm.titlecase_segment_to_string("hEllO WorLd", &root, default_options), "Hello world");
396    /// assert_eq!(cm.titlecase_segment_to_string("Γειά σου Κόσμε", &root, default_options), "Γειά σου κόσμε");
397    /// assert_eq!(cm.titlecase_segment_to_string("नमस्ते दुनिया", &root, default_options), "नमस्ते दुनिया");
398    /// assert_eq!(cm.titlecase_segment_to_string("Привет мир", &root, default_options), "Привет мир");
399    ///
400    /// // Some behavior is language-sensitive
401    /// assert_eq!(cm.titlecase_segment_to_string("istanbul", &root, default_options), "Istanbul");
402    /// assert_eq!(cm.titlecase_segment_to_string("istanbul", &langid!("tr"), default_options), "İstanbul"); // Turkish dotted i
403    ///
404    /// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &root, default_options), "Եւ երևանի");
405    /// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &langid!("hy"), default_options), "Եվ երևանի"); // Eastern Armenian ech-yiwn ligature
406    ///
407    /// assert_eq!(cm.titlecase_segment_to_string("ijkdijk", &root, default_options), "Ijkdijk");
408    /// assert_eq!(cm.titlecase_segment_to_string("ijkdijk", &langid!("nl"), default_options), "IJkdijk"); // Dutch IJ digraph
409    /// ```
410    ///
411    /// Leading adjustment behaviors:
412    ///
413    /// ```rust
414    /// use icu::casemap::options::{LeadingAdjustment, TitlecaseOptions};
415    /// use icu::casemap::TitlecaseMapper;
416    /// use icu::locale::langid;
417    ///
418    /// let cm = TitlecaseMapper::new();
419    /// let root = langid!("und");
420    ///
421    /// let default_options = Default::default();
422    /// let mut no_adjust: TitlecaseOptions = Default::default();
423    /// no_adjust.leading_adjustment = Some(LeadingAdjustment::None);
424    ///
425    /// // Exhibits leading adjustment when set:
426    /// assert_eq!(
427    ///     cm.titlecase_segment_to_string("«hello»", &root, default_options),
428    ///     "«Hello»"
429    /// );
430    /// assert_eq!(
431    ///     cm.titlecase_segment_to_string("«hello»", &root, no_adjust),
432    ///     "«hello»"
433    /// );
434    ///
435    /// assert_eq!(
436    ///     cm.titlecase_segment_to_string("'Twas", &root, default_options),
437    ///     "'Twas"
438    /// );
439    /// assert_eq!(
440    ///     cm.titlecase_segment_to_string("'Twas", &root, no_adjust),
441    ///     "'twas"
442    /// );
443    ///
444    /// assert_eq!(
445    ///     cm.titlecase_segment_to_string("", &root, default_options),
446    ///     ""
447    /// );
448    /// assert_eq!(cm.titlecase_segment_to_string("", &root, no_adjust), "");
449    /// ```
450    ///
451    /// Tail casing behaviors:
452    ///
453    /// ```rust
454    /// use icu::casemap::options::{TitlecaseOptions, TrailingCase};
455    /// use icu::casemap::TitlecaseMapper;
456    /// use icu::locale::langid;
457    ///
458    /// let cm = TitlecaseMapper::new();
459    /// let root = langid!("und");
460    ///
461    /// let default_options = Default::default();
462    /// let mut preserve_case: TitlecaseOptions = Default::default();
463    /// preserve_case.trailing_case = Some(TrailingCase::Unchanged);
464    ///
465    /// // Exhibits trailing case when set:
466    /// assert_eq!(
467    ///     cm.titlecase_segment_to_string("spOngeBoB", &root, default_options),
468    ///     "Spongebob"
469    /// );
470    /// assert_eq!(
471    ///     cm.titlecase_segment_to_string("spOngeBoB", &root, preserve_case),
472    ///     "SpOngeBoB"
473    /// );
474    /// ```
475    pub fn titlecase_segment_to_string<'s>(
476        self,
477        src: &'s str,
478        langid: &LanguageIdentifier,
479        options: TitlecaseOptions,
480    ) -> Cow<'s, str> {
481        writeable::to_string_or_borrow(
482            &self.titlecase_segment(src, langid, options),
483            src.as_bytes(),
484        )
485    }
486}