icu_segmenter/
lib.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! Segment strings by lines, graphemes, words, and sentences.
6//!
7//! This module is published as its own crate ([`icu_segmenter`](https://docs.rs/icu_segmenter/latest/icu_segmenter/))
8//! and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
9//!
//! This module contains segmenter implementations for the following rules:
11//!
12//! - Line segmenter that is compatible with [Unicode Standard Annex #14][UAX14], _Unicode Line
13//!   Breaking Algorithm_, with options to tailor line-breaking behavior for CSS [`line-break`] and
14//!   [`word-break`] properties.
15//! - Grapheme cluster segmenter, word segmenter, and sentence segmenter that are compatible with
16//!   [Unicode Standard Annex #29][UAX29], _Unicode Text Segmentation_.
17//!
18//! [UAX14]: https://www.unicode.org/reports/tr14/
19//! [UAX29]: https://www.unicode.org/reports/tr29/
20//! [`line-break`]: https://drafts.csswg.org/css-text-3/#line-break-property
21//! [`word-break`]: https://drafts.csswg.org/css-text-3/#word-break-property
22//!
23//! # Examples
24//!
25//! ## Line Break
26//!
27//! Find line break opportunities:
28//!
29//!```rust
30//! use icu::segmenter::LineSegmenter;
31//!
32//! let segmenter = LineSegmenter::new_auto(Default::default());
33//!
34//! let breakpoints: Vec<usize> = segmenter
35//!     .segment_str("Hello World. Xin chào thế giới!")
36//!     .collect();
37//! assert_eq!(&breakpoints, &[0, 6, 13, 17, 23, 29, 36]);
38//! ```
39//!
40//! See [`LineSegmenter`] for more examples.
41//!
42//! ## Grapheme Cluster Break
43//!
44//! Find all grapheme cluster boundaries:
45//!
46//!```rust
47//! use icu::segmenter::GraphemeClusterSegmenter;
48//!
49//! let segmenter = GraphemeClusterSegmenter::new();
50//!
51//! let breakpoints: Vec<usize> = segmenter
52//!     .segment_str("Hello World. Xin chào thế giới!")
53//!     .collect();
54//! assert_eq!(
55//!     &breakpoints,
56//!     &[
57//!         0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
58//!         19, 21, 22, 23, 24, 25, 28, 29, 30, 31, 34, 35, 36
59//!     ]
60//! );
61//! ```
62//!
63//! See [`GraphemeClusterSegmenter`] for more examples.
64//!
65//! ## Word Break
66//!
67//! Find all word boundaries:
68//!
69//!```rust
70//! use icu::segmenter::{options::WordBreakInvariantOptions, WordSegmenter};
71//!
72//! let segmenter =
73//!     WordSegmenter::new_auto(WordBreakInvariantOptions::default());
74//!
75//! let breakpoints: Vec<usize> = segmenter
76//!     .segment_str("Hello World. Xin chào thế giới!")
77//!     .collect();
78//! assert_eq!(
79//!     &breakpoints,
80//!     &[0, 5, 6, 11, 12, 13, 16, 17, 22, 23, 28, 29, 35, 36]
81//! );
82//! ```
83//!
84//! See [`WordSegmenter`] for more examples.
85//!
86//! ## Sentence Break
87//!
88//! Segment the string into sentences:
89//!
90//!```rust
91//! use icu::segmenter::{
92//!     options::SentenceBreakInvariantOptions, SentenceSegmenter,
93//! };
94//!
95//! let segmenter =
96//!     SentenceSegmenter::new(SentenceBreakInvariantOptions::default());
97//!
98//! let breakpoints: Vec<usize> = segmenter
99//!     .segment_str("Hello World. Xin chào thế giới!")
100//!     .collect();
101//! assert_eq!(&breakpoints, &[0, 13, 36]);
102//! ```
103//!
104//! See [`SentenceSegmenter`] for more examples.
105
// https://github.com/unicode-org/icu4x/blob/main/documents/process/boilerplate.md#library-annotations
// Build as `no_std` except when compiling tests or documentation.
#![cfg_attr(not(any(test, doc)), no_std)]
// Outside of test builds, promote the lints listed below to hard errors.
#![cfg_attr(
    not(test),
    deny(
        clippy::indexing_slicing,
        clippy::unwrap_used,
        clippy::expect_used,
        clippy::panic,
        clippy::exhaustive_structs,
        clippy::exhaustive_enums,
        clippy::trivially_copy_pass_by_ref,
        missing_debug_implementations,
    )
)]
// Missing documentation only produces a warning, in every build configuration.
#![warn(missing_docs)]

// Bring the `alloc` crate into scope; unlike `core`, it must be declared explicitly.
extern crate alloc;
124
// Private implementation modules. Within this file, only `rule_segmenter` has
// items re-exported publicly (through the `scaffold` module).
mod complex;
mod indices;
mod iterator_helpers;
mod rule_segmenter;

// One private module per segmenter kind. Their public types are re-exported
// further down in this file, at the crate root and through the `options`,
// `scaffold`, and `iterators` modules.
/// [`GraphemeClusterSegmenter`] and its related iterators, borrowed types, and options.
mod grapheme;
/// [`LineSegmenter`] and its related iterators, borrowed types, and options.
mod line;
/// [`SentenceSegmenter`] and its related iterators, borrowed types, and options.
mod sentence;
/// [`WordSegmenter`] and its related iterators, borrowed types, and options.
mod word;

// Exposed publicly as a module in its own right rather than through re-exports.
pub mod provider;
140
141// Main Segmenter and BreakIterator public types
142pub use crate::grapheme::GraphemeClusterSegmenter;
143pub use crate::grapheme::GraphemeClusterSegmenterBorrowed;
144pub use crate::line::LineSegmenter;
145pub use crate::line::LineSegmenterBorrowed;
146pub use crate::sentence::SentenceSegmenter;
147pub use crate::sentence::SentenceSegmenterBorrowed;
148pub use crate::word::WordSegmenter;
149pub use crate::word::WordSegmenterBorrowed;
150
151/// Options structs and enums
152pub mod options {
153    pub use crate::line::LineBreakOptions;
154    pub use crate::line::LineBreakStrictness;
155    pub use crate::line::LineBreakWordOption;
156    pub use crate::sentence::SentenceBreakInvariantOptions;
157    pub use crate::sentence::SentenceBreakOptions;
158    pub use crate::word::WordBreakInvariantOptions;
159    pub use crate::word::WordBreakOptions;
160    pub use crate::word::WordType;
161}
162
163/// Largely-internal scaffolding types (You should very rarely need to reference these directly)
164pub mod scaffold {
165    pub use crate::line::LineBreakType;
166    pub use crate::rule_segmenter::{Latin1, PotentiallyIllFormedUtf8, RuleBreakType, Utf16, Utf8};
167    pub use crate::word::WordBreakType;
168}
169
170/// Types supporting iteration over segments. Obtained from the segmenter types.
171pub mod iterators {
172    pub use crate::grapheme::GraphemeClusterBreakIterator;
173    pub use crate::line::LineBreakIterator;
174    pub use crate::sentence::SentenceBreakIterator;
175    pub use crate::word::{WordBreakIterator, WordBreakIteratorWithWordType};
176}
177
// Crate-visible home for the sealing marker trait.
pub(crate) mod private {
    /// Marker trait attached to other traits that are considered unstable and that
    /// should generally not be implemented outside of the segmenter crate.
    pub trait Sealed {}
}