icu_casemap/titlecase.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
//! Titlecasing-specific types: the options that control titlecasing and the [`TitlecaseMapper`] type.
6use crate::provider::CaseMapV1;
7use crate::{CaseMapper, CaseMapperBorrowed};
8use alloc::borrow::Cow;
9use icu_locale_core::LanguageIdentifier;
10use icu_properties::props::{GeneralCategory, GeneralCategoryGroup};
11use icu_properties::provider::PropertyEnumGeneralCategoryV1;
12use icu_properties::{CodePointMapData, CodePointMapDataBorrowed};
13use icu_provider::prelude::*;
14use writeable::Writeable;
15
/// Controls what happens to the remainder of a segment after its
/// beginning has been titlecased.
///
/// The default is [`TrailingCase::Lower`].
///
/// # Examples
///
/// ```rust
/// use icu::casemap::options::{TitlecaseOptions, TrailingCase};
/// use icu::casemap::TitlecaseMapper;
/// use icu::locale::langid;
///
/// let cm = TitlecaseMapper::new();
/// let root = langid!("und");
///
/// let default_options = Default::default();
/// let mut preserve_case: TitlecaseOptions = Default::default();
/// preserve_case.trailing_case = Some(TrailingCase::Unchanged);
///
/// // Exhibits trailing case when set:
/// assert_eq!(
///     cm.titlecase_segment_to_string("spOngeBoB", &root, default_options),
///     "Spongebob"
/// );
/// assert_eq!(
///     cm.titlecase_segment_to_string("spOngeBoB", &root, preserve_case),
///     "SpOngeBoB"
/// );
/// ```
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum TrailingCase {
    /// Leave the rest of the string exactly as written ("spoNgEBoB" -> "SpoNgEBoB")
    Unchanged,
    /// Convert the rest of the string to lowercase ("spoNgEBoB" -> "Spongebob")
    ///
    /// This is the default
    #[default]
    Lower,
}
52
/// Selects the position in the string at which titlecasing begins.
///
/// Unless told otherwise, [`TitlecaseMapper`] applies "leading adjustment": it looks for the first
/// "relevant" character of the string and only starts the actual titlecasing there. Leading
/// punctuation is skipped as a result, so strings such as `'twas` or `«hello»` are titlecased
/// the way one would expect.
///
/// What counts as a "relevant" character depends on the mode. In "adjust to cased" mode it is the
/// first cased character; in "auto" mode it is the first character that is a letter, number,
/// symbol, or private use character. Consequently the strings `49ers` and `«丰(abc)»` become
/// `49Ers` and `«丰(Abc)»` in "adjust to cased" mode, while "auto" mode leaves them unchanged.
/// The distinction mostly shows up in segments that mix numbers with letters, or that mix
/// writing systems.
///
/// # Examples
///
/// ```rust
/// use icu::casemap::options::{LeadingAdjustment, TitlecaseOptions};
/// use icu::casemap::TitlecaseMapper;
/// use icu::locale::langid;
///
/// let cm = TitlecaseMapper::new();
/// let root = langid!("und");
///
/// let default_options = Default::default(); // leading adjustment set to Auto
/// let mut no_adjust: TitlecaseOptions = Default::default();
/// let mut adjust_to_cased: TitlecaseOptions = Default::default();
/// no_adjust.leading_adjustment = Some(LeadingAdjustment::None);
/// adjust_to_cased.leading_adjustment = Some(LeadingAdjustment::ToCased);
///
/// // Exhibits leading adjustment when set:
/// assert_eq!(
///     cm.titlecase_segment_to_string("«hello»", &root, default_options),
///     "«Hello»"
/// );
/// assert_eq!(
///     cm.titlecase_segment_to_string("«hello»", &root, adjust_to_cased),
///     "«Hello»"
/// );
/// assert_eq!(
///     cm.titlecase_segment_to_string("«hello»", &root, no_adjust),
///     "«hello»"
/// );
///
/// // Only changed in adjust-to-cased mode:
/// assert_eq!(
///     cm.titlecase_segment_to_string("丰(abc)", &root, default_options),
///     "丰(abc)"
/// );
/// assert_eq!(
///     cm.titlecase_segment_to_string("丰(abc)", &root, adjust_to_cased),
///     "丰(Abc)"
/// );
/// assert_eq!(
///     cm.titlecase_segment_to_string("丰(abc)", &root, no_adjust),
///     "丰(abc)"
/// );
///
/// // Only changed in adjust-to-cased mode:
/// assert_eq!(
///     cm.titlecase_segment_to_string("49ers", &root, default_options),
///     "49ers"
/// );
/// assert_eq!(
///     cm.titlecase_segment_to_string("49ers", &root, adjust_to_cased),
///     "49Ers"
/// );
/// assert_eq!(
///     cm.titlecase_segment_to_string("49ers", &root, no_adjust),
///     "49ers"
/// );
/// ```
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum LeadingAdjustment {
    /// Begin titlecasing at the very first character, whether or not that character is
    /// relevant for casing ("'twixt" -> "'twixt", "twixt" -> "Twixt")
    None,
    /// Skip ahead to the first relevant character, then apply casing
    /// ("'twixt" -> "'Twixt"). The "relevant" character is chosen by the best algorithm
    /// available: by default the first letter, number, symbol, or private use character.
    /// When no data is available (e.g. this API is being called via
    /// [`CaseMapperBorrowed::titlecase_segment_with_only_case_data()`]),
    /// the behavior may be equivalent to "adjust to cased".
    ///
    /// This is the default
    #[default]
    Auto,
    /// Skip ahead to the first cased character, then apply casing
    /// ("'twixt" -> "'Twixt")
    ToCased,
}
141
142/// Various options for controlling titlecasing
143///
144/// See docs of [`TitlecaseMapper`] for examples.
145#[non_exhaustive]
146#[derive(Copy, Clone, Default, PartialEq, Eq, Hash, Debug)]
147pub struct TitlecaseOptions {
148 /// How to handle the rest of the string once the head of the
149 /// string has been titlecased
150 ///
151 /// Default is [`TrailingCase::Lower`]
152 pub trailing_case: Option<TrailingCase>,
153 /// Whether to start casing at the beginning of the string or at the first
154 /// relevant character.
155 ///
156 /// Default is [`LeadingAdjustment::Auto`]
157 pub leading_adjustment: Option<LeadingAdjustment>,
158}
159
160/// A wrapper around [`CaseMapper`] that can compute titlecasing stuff, and is able to load additional data
161/// to support the non-legacy "head adjustment" behavior.
162///
163///
164/// Most methods for this type live on [`TitlecaseMapperBorrowed`], which you can obtain via
165/// [`TitlecaseMapper::new()`] or [`TitlecaseMapper::as_borrowed()`].
166///
167/// By default, [`TitlecaseMapperBorrowed::titlecase_segment()`] and [`TitlecaseMapperBorrowed::titlecase_segment_to_string()`] perform "leading adjustment",
168/// where they wait till the first relevant character to begin titlecasing. For example, in the string `'twixt`, the apostrophe
169/// is ignored because the word starts at the first "t", which will get titlecased (producing `'Twixt`). Other punctuation will
170/// also be ignored, like in the string `«hello»`, which will get titlecased to `«Hello»`.
171///
172/// This is a separate type from [`CaseMapper`] because it loads the additional data
173/// required by [`LeadingAdjustment::Auto`] to perform the best possible leading adjustment.
174///
175/// If you are planning on only using [`LeadingAdjustment::None`] or [`LeadingAdjustment::ToCased`], consider using [`CaseMapper`] directly; this
176/// type will have no additional behavior.
177///
178/// # Examples
179///
180/// Basic casemapping behavior:
181///
182/// ```rust
183/// use icu::casemap::TitlecaseMapper;
184/// use icu::locale::langid;
185///
186/// let cm = TitlecaseMapper::new();
187/// let root = langid!("und");
188///
189/// let default_options = Default::default();
190///
191/// // note that the subsequent words are not titlecased, this function assumes
192/// // that the entire string is a single segment and only titlecases at the beginning.
193/// assert_eq!(cm.titlecase_segment_to_string("hEllO WorLd", &root, default_options), "Hello world");
194/// assert_eq!(cm.titlecase_segment_to_string("Γειά σου Κόσμε", &root, default_options), "Γειά σου κόσμε");
195/// assert_eq!(cm.titlecase_segment_to_string("नमस्ते दुनिया", &root, default_options), "नमस्ते दुनिया");
196/// assert_eq!(cm.titlecase_segment_to_string("Привет мир", &root, default_options), "Привет мир");
197///
198/// // Some behavior is language-sensitive
199/// assert_eq!(cm.titlecase_segment_to_string("istanbul", &root, default_options), "Istanbul");
200/// assert_eq!(cm.titlecase_segment_to_string("istanbul", &langid!("tr"), default_options), "İstanbul"); // Turkish dotted i
201///
202/// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &root, default_options), "Եւ երևանի");
203/// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &langid!("hy"), default_options), "Եվ երևանի"); // Eastern Armenian ech-yiwn ligature
204///
205/// assert_eq!(cm.titlecase_segment_to_string("ijkdijk", &root, default_options), "Ijkdijk");
206/// assert_eq!(cm.titlecase_segment_to_string("ijkdijk", &langid!("nl"), default_options), "IJkdijk"); // Dutch IJ digraph
207/// ```
208#[derive(Clone, Debug)]
209pub struct TitlecaseMapper<CM> {
210 cm: CM,
211 gc: CodePointMapData<GeneralCategory>,
212}
213
214impl TitlecaseMapper<CaseMapper> {
215 icu_provider::gen_buffer_data_constructors!(() -> error: DataError,
216 functions: [
217 new: skip,
218 try_new_with_buffer_provider,
219 try_new_unstable,
220 Self,
221 ]);
222
223 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
224 pub fn try_new_unstable<P>(provider: &P) -> Result<Self, DataError>
225 where
226 P: DataProvider<CaseMapV1> + DataProvider<PropertyEnumGeneralCategoryV1> + ?Sized,
227 {
228 let cm = CaseMapper::try_new_unstable(provider)?;
229 let gc = icu_properties::CodePointMapData::<icu_properties::props::GeneralCategory>::try_new_unstable(provider)?;
230 Ok(Self { cm, gc })
231 }
232}
233
234impl TitlecaseMapper<CaseMapper> {
235 /// A constructor which creates a [`TitlecaseMapperBorrowed`] using compiled data
236 ///
237 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
238 ///
239 /// [📚 Help choosing a constructor](icu_provider::constructors)
240 #[cfg(feature = "compiled_data")]
241 #[allow(clippy::new_ret_no_self)] // Intentional
242 pub const fn new() -> TitlecaseMapperBorrowed<'static> {
243 TitlecaseMapperBorrowed::new()
244 }
245}
246// We use Borrow, not AsRef, since we want the blanket impl on T
247impl<CM: AsRef<CaseMapper>> TitlecaseMapper<CM> {
248 icu_provider::gen_buffer_data_constructors!((casemapper: CM) -> error: DataError,
249 functions: [
250 new_with_mapper: skip,
251 try_new_with_mapper_with_buffer_provider,
252 try_new_with_mapper_unstable,
253 Self,
254 ]);
255
256 /// A constructor which creates a [`TitlecaseMapper`] from an existing [`CaseMapper`]
257 /// (either owned or as a reference) and compiled data
258 ///
259 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
260 ///
261 /// [📚 Help choosing a constructor](icu_provider::constructors)
262 #[cfg(feature = "compiled_data")]
263 pub const fn new_with_mapper(casemapper: CM) -> Self {
264 Self {
265 cm: casemapper,
266 gc: icu_properties::CodePointMapData::<icu_properties::props::GeneralCategory>::new()
267 .static_to_owned(),
268 }
269 }
270
271 /// Construct this object to wrap an existing CaseMapper (or a reference to one), loading additional data as needed.
272 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_with_mapper)]
273 pub fn try_new_with_mapper_unstable<P>(provider: &P, casemapper: CM) -> Result<Self, DataError>
274 where
275 P: DataProvider<CaseMapV1> + DataProvider<PropertyEnumGeneralCategoryV1> + ?Sized,
276 {
277 let gc = icu_properties::CodePointMapData::<icu_properties::props::GeneralCategory>::try_new_unstable(provider)?;
278 Ok(Self { cm: casemapper, gc })
279 }
280
281 /// Constructs a borrowed version of this type for more efficient querying.
282 pub fn as_borrowed(&self) -> TitlecaseMapperBorrowed<'_> {
283 TitlecaseMapperBorrowed {
284 cm: self.cm.as_ref().as_borrowed(),
285 gc: self.gc.as_borrowed(),
286 }
287 }
288}
289
290/// A borrowed [`TitlecaseMapper`].
291///
292/// See methods or [`TitlecaseMapper`] for examples.
293#[derive(Clone, Debug, Copy)]
294pub struct TitlecaseMapperBorrowed<'a> {
295 cm: CaseMapperBorrowed<'a>,
296 gc: CodePointMapDataBorrowed<'a, GeneralCategory>,
297}
298
299impl TitlecaseMapperBorrowed<'static> {
300 /// A constructor which creates a [`TitlecaseMapperBorrowed`] using compiled data
301 ///
302 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
303 ///
304 /// [📚 Help choosing a constructor](icu_provider::constructors)
305 #[cfg(feature = "compiled_data")]
306 pub const fn new() -> Self {
307 Self {
308 cm: CaseMapper::new(),
309 gc: icu_properties::CodePointMapData::<icu_properties::props::GeneralCategory>::new(),
310 }
311 }
312 /// Cheaply converts a [`TitlecaseMapperBorrowed<'static>`] into a [`TitlecaseMapper`].
313 ///
314 /// Note: Due to branching and indirection, using [`TitlecaseMapper`] might inhibit some
315 /// compile-time optimizations that are possible with [`TitlecaseMapper`].
316 pub const fn static_to_owned(self) -> TitlecaseMapper<CaseMapper> {
317 TitlecaseMapper {
318 cm: self.cm.static_to_owned(),
319 gc: self.gc.static_to_owned(),
320 }
321 }
322}
323
/// The default value is the compiled-data mapper, i.e. the result of
/// [`TitlecaseMapperBorrowed::new()`].
#[cfg(feature = "compiled_data")]
impl Default for TitlecaseMapperBorrowed<'static> {
    fn default() -> Self {
        TitlecaseMapperBorrowed::new()
    }
}
330
331impl<'a> TitlecaseMapperBorrowed<'a> {
332 /// Returns the full titlecase mapping of the given string as a [`Writeable`], treating
333 /// the string as a single segment (and thus only titlecasing the beginning of it).
334 ///
335 /// This should typically be used as a lower-level helper to construct the titlecasing operation desired
336 /// by the application, for example one can titlecase on a per-word basis by mixing this with
337 /// a `WordSegmenter`.
338 ///
339 /// This function is context and language sensitive. Callers should pass the text's language
340 /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or
341 /// `Default::default()` for the root locale.
342 ///
343 /// See [`TitlecaseMapperBorrowed::titlecase_segment_to_string()`] for the equivalent convenience function that returns a String,
344 /// as well as for an example.
345 pub fn titlecase_segment(
346 self,
347 src: &'a str,
348 langid: &LanguageIdentifier,
349 options: TitlecaseOptions,
350 ) -> impl Writeable + 'a {
351 if options.leading_adjustment.unwrap_or_default() == LeadingAdjustment::Auto {
352 // letter, number, symbol, or private use code point
353 const HEAD_GROUPS: GeneralCategoryGroup = GeneralCategoryGroup::Letter
354 .union(GeneralCategoryGroup::Number)
355 .union(GeneralCategoryGroup::Symbol)
356 .union(GeneralCategoryGroup::PrivateUse);
357 self.cm
358 .titlecase_segment_with_adjustment(src, langid, options, |_data, ch| {
359 HEAD_GROUPS.contains(self.gc.get(ch))
360 })
361 } else {
362 self.cm
363 .titlecase_segment_with_adjustment(src, langid, options, |data, ch| {
364 data.is_cased(ch)
365 })
366 }
367 }
368
369 /// Returns the full titlecase mapping of the given string as a String, treating
370 /// the string as a single segment (and thus only titlecasing the beginning of it).
371 ///
372 /// This should typically be used as a lower-level helper to construct the titlecasing operation desired
373 /// by the application, for example one can titlecase on a per-word basis by mixing this with
374 /// a `WordSegmenter`.
375 ///
376 /// This function is context and language sensitive. Callers should pass the text's language
377 /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or
378 /// `Default::default()` for the root locale.
379 ///
380 /// See [`TitlecaseMapperBorrowed::titlecase_segment()`] for the equivalent lower-level function that returns a [`Writeable`]
381 ///
382 /// # Examples
383 ///
384 /// ```rust
385 /// use icu::casemap::TitlecaseMapper;
386 /// use icu::locale::langid;
387 ///
388 /// let cm = TitlecaseMapper::new();
389 /// let root = langid!("und");
390 ///
391 /// let default_options = Default::default();
392 ///
393 /// // note that the subsequent words are not titlecased, this function assumes
394 /// // that the entire string is a single segment and only titlecases at the beginning.
395 /// assert_eq!(cm.titlecase_segment_to_string("hEllO WorLd", &root, default_options), "Hello world");
396 /// assert_eq!(cm.titlecase_segment_to_string("Γειά σου Κόσμε", &root, default_options), "Γειά σου κόσμε");
397 /// assert_eq!(cm.titlecase_segment_to_string("नमस्ते दुनिया", &root, default_options), "नमस्ते दुनिया");
398 /// assert_eq!(cm.titlecase_segment_to_string("Привет мир", &root, default_options), "Привет мир");
399 ///
400 /// // Some behavior is language-sensitive
401 /// assert_eq!(cm.titlecase_segment_to_string("istanbul", &root, default_options), "Istanbul");
402 /// assert_eq!(cm.titlecase_segment_to_string("istanbul", &langid!("tr"), default_options), "İstanbul"); // Turkish dotted i
403 ///
404 /// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &root, default_options), "Եւ երևանի");
405 /// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &langid!("hy"), default_options), "Եվ երևանի"); // Eastern Armenian ech-yiwn ligature
406 ///
407 /// assert_eq!(cm.titlecase_segment_to_string("ijkdijk", &root, default_options), "Ijkdijk");
408 /// assert_eq!(cm.titlecase_segment_to_string("ijkdijk", &langid!("nl"), default_options), "IJkdijk"); // Dutch IJ digraph
409 /// ```
410 ///
411 /// Leading adjustment behaviors:
412 ///
413 /// ```rust
414 /// use icu::casemap::options::{LeadingAdjustment, TitlecaseOptions};
415 /// use icu::casemap::TitlecaseMapper;
416 /// use icu::locale::langid;
417 ///
418 /// let cm = TitlecaseMapper::new();
419 /// let root = langid!("und");
420 ///
421 /// let default_options = Default::default();
422 /// let mut no_adjust: TitlecaseOptions = Default::default();
423 /// no_adjust.leading_adjustment = Some(LeadingAdjustment::None);
424 ///
425 /// // Exhibits leading adjustment when set:
426 /// assert_eq!(
427 /// cm.titlecase_segment_to_string("«hello»", &root, default_options),
428 /// "«Hello»"
429 /// );
430 /// assert_eq!(
431 /// cm.titlecase_segment_to_string("«hello»", &root, no_adjust),
432 /// "«hello»"
433 /// );
434 ///
435 /// assert_eq!(
436 /// cm.titlecase_segment_to_string("'Twas", &root, default_options),
437 /// "'Twas"
438 /// );
439 /// assert_eq!(
440 /// cm.titlecase_segment_to_string("'Twas", &root, no_adjust),
441 /// "'twas"
442 /// );
443 ///
444 /// assert_eq!(
445 /// cm.titlecase_segment_to_string("", &root, default_options),
446 /// ""
447 /// );
448 /// assert_eq!(cm.titlecase_segment_to_string("", &root, no_adjust), "");
449 /// ```
450 ///
451 /// Tail casing behaviors:
452 ///
453 /// ```rust
454 /// use icu::casemap::options::{TitlecaseOptions, TrailingCase};
455 /// use icu::casemap::TitlecaseMapper;
456 /// use icu::locale::langid;
457 ///
458 /// let cm = TitlecaseMapper::new();
459 /// let root = langid!("und");
460 ///
461 /// let default_options = Default::default();
462 /// let mut preserve_case: TitlecaseOptions = Default::default();
463 /// preserve_case.trailing_case = Some(TrailingCase::Unchanged);
464 ///
465 /// // Exhibits trailing case when set:
466 /// assert_eq!(
467 /// cm.titlecase_segment_to_string("spOngeBoB", &root, default_options),
468 /// "Spongebob"
469 /// );
470 /// assert_eq!(
471 /// cm.titlecase_segment_to_string("spOngeBoB", &root, preserve_case),
472 /// "SpOngeBoB"
473 /// );
474 /// ```
475 pub fn titlecase_segment_to_string<'s>(
476 self,
477 src: &'s str,
478 langid: &LanguageIdentifier,
479 options: TitlecaseOptions,
480 ) -> Cow<'s, str> {
481 writeable::to_string_or_borrow(
482 &self.titlecase_segment(src, langid, options),
483 src.as_bytes(),
484 )
485 }
486}