textwrap/word_splitters.rs
1//! Word splitting functionality.
2//!
3//! To wrap text into lines, long words sometimes need to be split
4//! across lines. The [`WordSplitter`] enum defines this
5//! functionality.
6
7use crate::core::{display_width, Word};
8
/// Describes the positions at which a word may be broken in two.
///
/// When the textwrap crate is built with the `hyphenation` Cargo
/// feature, an additional [`WordSplitter::Hyphenation`] variant is
/// available. Use that variant for language-aware hyphenation:
///
/// ```
/// #[cfg(feature = "hyphenation")] {
/// use hyphenation::{Language, Load, Standard};
/// use textwrap::{wrap, Options, WordSplitter};
///
/// let text = "Oxidation is the loss of electrons.";
/// let dictionary = Standard::from_embedded(Language::EnglishUS).unwrap();
/// let options = Options::new(8).word_splitter(WordSplitter::Hyphenation(dictionary));
/// assert_eq!(wrap(text, &options), vec!["Oxida-",
///                                       "tion is",
///                                       "the loss",
///                                       "of elec-",
///                                       "trons."]);
/// }
/// ```
///
/// The documentation of the [hyphenation] crate has more details.
///
/// [hyphenation]: https://docs.rs/hyphenation/
#[derive(Clone, Debug)]
pub enum WordSplitter {
    /// Never split a word. Pass this as the
    /// [`Options.word_splitter`] to turn off every kind of
    /// hyphenation:
    ///
    /// ```
    /// use textwrap::{wrap, Options, WordSplitter};
    ///
    /// let options = Options::new(8).word_splitter(WordSplitter::NoHyphenation);
    /// assert_eq!(wrap("foo bar-baz", &options),
    ///            vec!["foo", "bar-baz"]);
    /// ```
    ///
    /// [`Options.word_splitter`]: super::Options::word_splitter
    NoHyphenation,

    /// Split words at the hyphens they already contain. This is the
    /// default `WordSplitter` used by
    /// [`Options::new`](super::Options::new).
    ///
    /// A hyphen is only used when it has an alphanumeric character on
    /// both sides, so a word like `"--foo-bar"` is not split into
    /// `"--"` and `"foo-bar"`.
    ///
    /// # Examples
    ///
    /// ```
    /// use textwrap::WordSplitter;
    ///
    /// assert_eq!(WordSplitter::HyphenSplitter.split_points("--foo-bar"),
    ///            vec![6]);
    /// ```
    HyphenSplitter,

    /// Delegate the decision to a function of your own.
    ///
    /// The function receives the word and returns the byte indices at
    /// which the word may be split.
    ///
    /// # Examples
    ///
    /// ```
    /// use textwrap::WordSplitter;
    ///
    /// fn split_at_underscore(word: &str) -> Vec<usize> {
    ///     word.match_indices('_').map(|(idx, _)| idx + 1).collect()
    /// }
    ///
    /// let word_splitter = WordSplitter::Custom(split_at_underscore);
    /// assert_eq!(word_splitter.split_points("a_long_identifier"),
    ///            vec![2, 7]);
    /// ```
    Custom(fn(word: &str) -> Vec<usize>),

    /// Language-specific hyphenation driven by a dictionary with
    /// patterns from the [hyphenation] crate.
    ///
    /// **Note:** Only available when the `hyphenation` Cargo feature is
    /// enabled.
    ///
    /// [hyphenation]: https://docs.rs/hyphenation/
    #[cfg(feature = "hyphenation")]
    Hyphenation(hyphenation::Standard),
}
100
101impl PartialEq<WordSplitter> for WordSplitter {
102 fn eq(&self, other: &WordSplitter) -> bool {
103 match (self, other) {
104 (WordSplitter::NoHyphenation, WordSplitter::NoHyphenation) => true,
105 (WordSplitter::HyphenSplitter, WordSplitter::HyphenSplitter) => true,
106 #[cfg(feature = "hyphenation")]
107 (WordSplitter::Hyphenation(this_dict), WordSplitter::Hyphenation(other_dict)) => {
108 this_dict.language() == other_dict.language()
109 }
110 (_, _) => false,
111 }
112 }
113}
114
115impl WordSplitter {
116 /// Return all possible indices where `word` can be split.
117 ///
118 /// The indices are in the range `0..word.len()`. They point to
119 /// the index _after_ the split point, i.e., after `-` if
120 /// splitting on hyphens. This way, `word.split_at(idx)` will
121 /// break the word into two well-formed pieces.
122 ///
123 /// # Examples
124 ///
125 /// ```
126 /// use textwrap::WordSplitter;
127 /// assert_eq!(WordSplitter::NoHyphenation.split_points("cannot-be-split"), vec![]);
128 /// assert_eq!(WordSplitter::HyphenSplitter.split_points("can-be-split"), vec![4, 7]);
129 /// assert_eq!(WordSplitter::Custom(|word| vec![word.len()/2]).split_points("middle"), vec![3]);
130 /// ```
131 pub fn split_points(&self, word: &str) -> Vec<usize> {
132 match self {
133 WordSplitter::NoHyphenation => Vec::new(),
134 WordSplitter::HyphenSplitter => {
135 let mut splits = Vec::new();
136
137 for (idx, _) in word.match_indices('-') {
138 // We only use hyphens that are surrounded by alphanumeric
139 // characters. This is to avoid splitting on repeated hyphens,
140 // such as those found in --foo-bar.
141 let prev = word[..idx].chars().next_back();
142 let next = word[idx + 1..].chars().next();
143
144 if prev.filter(|ch| ch.is_alphanumeric()).is_some()
145 && next.filter(|ch| ch.is_alphanumeric()).is_some()
146 {
147 splits.push(idx + 1); // +1 due to width of '-'.
148 }
149 }
150
151 splits
152 }
153 WordSplitter::Custom(splitter_func) => splitter_func(word),
154 #[cfg(feature = "hyphenation")]
155 WordSplitter::Hyphenation(dictionary) => {
156 use hyphenation::Hyphenator;
157 dictionary.hyphenate(word).breaks
158 }
159 }
160 }
161}
162
163/// Split words into smaller words according to the split points given
164/// by `word_splitter`.
165///
166/// Note that we split all words, regardless of their length. This is
167/// to more cleanly separate the business of splitting (including
168/// automatic hyphenation) from the business of word wrapping.
169pub fn split_words<'a, I>(
170 words: I,
171 word_splitter: &'a WordSplitter,
172) -> impl Iterator<Item = Word<'a>>
173where
174 I: IntoIterator<Item = Word<'a>>,
175{
176 words.into_iter().flat_map(move |word| {
177 let mut prev = 0;
178 let mut split_points = word_splitter.split_points(&word).into_iter();
179 std::iter::from_fn(move || {
180 if let Some(idx) = split_points.next() {
181 let need_hyphen = !word[..idx].ends_with('-');
182 let w = Word {
183 word: &word.word[prev..idx],
184 width: display_width(&word[prev..idx]),
185 whitespace: "",
186 penalty: if need_hyphen { "-" } else { "" },
187 };
188 prev = idx;
189 return Some(w);
190 }
191
192 if prev < word.word.len() || prev == 0 {
193 let w = Word {
194 word: &word.word[prev..],
195 width: display_width(&word[prev..]),
196 whitespace: word.whitespace,
197 penalty: word.penalty,
198 };
199 prev = word.word.len() + 1;
200 return Some(w);
201 }
202
203 None
204 })
205 })
206}
207
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `split_words` and gathers the fragments into a vector so
    /// the result can be compared with `assert_eq!`.
    fn split_all<'a, I>(words: I, word_splitter: &'a WordSplitter) -> Vec<Word<'a>>
    where
        I: IntoIterator<Item = Word<'a>>,
    {
        split_words(words, word_splitter).collect()
    }

    /// Builds the three-column fragment expected by the penalty test.
    fn fragment<'a>(word: &'a str, penalty: &'a str) -> Word<'a> {
        Word {
            word,
            width: 3,
            whitespace: "",
            penalty,
        }
    }

    #[test]
    fn split_words_no_words() {
        let fragments = split_all(vec![], &WordSplitter::HyphenSplitter);
        assert_eq!(fragments, vec![]);
    }

    #[test]
    fn split_words_empty_word() {
        let fragments = split_all(vec![Word::from(" ")], &WordSplitter::HyphenSplitter);
        assert_eq!(fragments, vec![Word::from(" ")]);
    }

    #[test]
    fn split_words_single_word() {
        let fragments = split_all(vec![Word::from("foobar")], &WordSplitter::HyphenSplitter);
        assert_eq!(fragments, vec![Word::from("foobar")]);
    }

    #[test]
    fn split_words_hyphen_splitter() {
        let fragments = split_all(vec![Word::from("foo-bar")], &WordSplitter::HyphenSplitter);
        assert_eq!(fragments, vec![Word::from("foo-"), Word::from("bar")]);
    }

    #[test]
    fn split_words_no_hyphenation() {
        let fragments = split_all(vec![Word::from("foo-bar")], &WordSplitter::NoHyphenation);
        assert_eq!(fragments, vec![Word::from("foo-bar")]);
    }

    #[test]
    fn split_words_adds_penalty() {
        fn fixed_split_point(_: &str) -> Vec<usize> {
            vec![3]
        }
        let splitter = WordSplitter::Custom(fixed_split_point);

        // A split in the middle of a plain word asks for a hyphen.
        let fragments = split_all(vec![Word::from("foobar")].into_iter(), &splitter);
        assert_eq!(fragments, vec![fragment("foo", "-"), fragment("bar", "")]);

        // A split right after an existing hyphen does not.
        let fragments = split_all(vec![Word::from("fo-bar")].into_iter(), &splitter);
        assert_eq!(fragments, vec![fragment("fo-", ""), fragment("bar", "")]);
    }
}