textwrap/
word_splitters.rs

1//! Word splitting functionality.
2//!
3//! To wrap text into lines, long words sometimes need to be split
4//! across lines. The [`WordSplitter`] enum defines this
5//! functionality.
6
7use crate::core::{display_width, Word};
8
/// Strategy that decides at which positions a word may be broken.
///
/// When the textwrap crate is built with the `hyphenation` Cargo
/// feature, an additional [`WordSplitter::Hyphenation`] variant is
/// available. Pick that variant to get language-aware hyphenation:
///
/// ```
/// #[cfg(feature = "hyphenation")] {
///     use hyphenation::{Language, Load, Standard};
///     use textwrap::{wrap, Options, WordSplitter};
///
///     let text = "Oxidation is the loss of electrons.";
///     let dictionary = Standard::from_embedded(Language::EnglishUS).unwrap();
///     let options = Options::new(8).word_splitter(WordSplitter::Hyphenation(dictionary));
///     assert_eq!(wrap(text, &options), vec!["Oxida-",
///                                           "tion is",
///                                           "the loss",
///                                           "of elec-",
///                                           "trons."]);
/// }
/// ```
///
/// The documentation of the [hyphenation] crate has further
/// details.
///
/// [hyphenation]: https://docs.rs/hyphenation/
#[derive(Debug, Clone)]
pub enum WordSplitter {
    /// Never split words. Pass this as the [`Options.word_splitter`]
    /// to turn off every kind of hyphenation:
    ///
    /// ```
    /// use textwrap::{wrap, Options, WordSplitter};
    ///
    /// let options = Options::new(8).word_splitter(WordSplitter::NoHyphenation);
    /// assert_eq!(wrap("foo bar-baz", &options),
    ///            vec!["foo", "bar-baz"]);
    /// ```
    ///
    /// [`Options.word_splitter`]: super::Options::word_splitter
    NoHyphenation,

    /// Split words at hyphens that are already present in the word.
    /// This is the default `WordSplitter` chosen by
    /// [`Options::new`](super::Options::new).
    ///
    /// A hyphen only counts as a split point when it has an
    /// alphanumeric character on both sides. As a consequence, a
    /// word such as `"--foo-bar"` is not broken into `"--"` and
    /// `"foo-bar"`.
    ///
    /// # Examples
    ///
    /// ```
    /// use textwrap::WordSplitter;
    ///
    /// assert_eq!(WordSplitter::HyphenSplitter.split_points("--foo-bar"),
    ///            vec![6]);
    /// ```
    HyphenSplitter,

    /// Delegate the decision to a user-supplied function.
    ///
    /// The function receives the word and returns the indices at
    /// which the word may be split.
    ///
    /// # Examples
    ///
    /// ```
    /// use textwrap::WordSplitter;
    ///
    /// fn split_at_underscore(word: &str) -> Vec<usize> {
    ///     word.match_indices('_').map(|(idx, _)| idx + 1).collect()
    /// }
    ///
    /// let word_splitter = WordSplitter::Custom(split_at_underscore);
    /// assert_eq!(word_splitter.split_points("a_long_identifier"),
    ///            vec![2, 7]);
    /// ```
    Custom(fn(word: &str) -> Vec<usize>),

    /// Language-specific hyphenation driven by a dictionary with
    /// patterns from the [hyphenation] crate.
    ///
    /// **Note:** This variant only exists when the `hyphenation`
    /// Cargo feature is enabled.
    ///
    /// [hyphenation]: https://docs.rs/hyphenation/
    #[cfg(feature = "hyphenation")]
    Hyphenation(hyphenation::Standard),
}
100
101impl PartialEq<WordSplitter> for WordSplitter {
102    fn eq(&self, other: &WordSplitter) -> bool {
103        match (self, other) {
104            (WordSplitter::NoHyphenation, WordSplitter::NoHyphenation) => true,
105            (WordSplitter::HyphenSplitter, WordSplitter::HyphenSplitter) => true,
106            #[cfg(feature = "hyphenation")]
107            (WordSplitter::Hyphenation(this_dict), WordSplitter::Hyphenation(other_dict)) => {
108                this_dict.language() == other_dict.language()
109            }
110            (_, _) => false,
111        }
112    }
113}
114
115impl WordSplitter {
116    /// Return all possible indices where `word` can be split.
117    ///
118    /// The indices are in the range `0..word.len()`. They point to
119    /// the index _after_ the split point, i.e., after `-` if
120    /// splitting on hyphens. This way, `word.split_at(idx)` will
121    /// break the word into two well-formed pieces.
122    ///
123    /// # Examples
124    ///
125    /// ```
126    /// use textwrap::WordSplitter;
127    /// assert_eq!(WordSplitter::NoHyphenation.split_points("cannot-be-split"), vec![]);
128    /// assert_eq!(WordSplitter::HyphenSplitter.split_points("can-be-split"), vec![4, 7]);
129    /// assert_eq!(WordSplitter::Custom(|word| vec![word.len()/2]).split_points("middle"), vec![3]);
130    /// ```
131    pub fn split_points(&self, word: &str) -> Vec<usize> {
132        match self {
133            WordSplitter::NoHyphenation => Vec::new(),
134            WordSplitter::HyphenSplitter => {
135                let mut splits = Vec::new();
136
137                for (idx, _) in word.match_indices('-') {
138                    // We only use hyphens that are surrounded by alphanumeric
139                    // characters. This is to avoid splitting on repeated hyphens,
140                    // such as those found in --foo-bar.
141                    let prev = word[..idx].chars().next_back();
142                    let next = word[idx + 1..].chars().next();
143
144                    if prev.filter(|ch| ch.is_alphanumeric()).is_some()
145                        && next.filter(|ch| ch.is_alphanumeric()).is_some()
146                    {
147                        splits.push(idx + 1); // +1 due to width of '-'.
148                    }
149                }
150
151                splits
152            }
153            WordSplitter::Custom(splitter_func) => splitter_func(word),
154            #[cfg(feature = "hyphenation")]
155            WordSplitter::Hyphenation(dictionary) => {
156                use hyphenation::Hyphenator;
157                dictionary.hyphenate(word).breaks
158            }
159        }
160    }
161}
162
163/// Split words into smaller words according to the split points given
164/// by `word_splitter`.
165///
166/// Note that we split all words, regardless of their length. This is
167/// to more cleanly separate the business of splitting (including
168/// automatic hyphenation) from the business of word wrapping.
169pub fn split_words<'a, I>(
170    words: I,
171    word_splitter: &'a WordSplitter,
172) -> impl Iterator<Item = Word<'a>>
173where
174    I: IntoIterator<Item = Word<'a>>,
175{
176    words.into_iter().flat_map(move |word| {
177        let mut prev = 0;
178        let mut split_points = word_splitter.split_points(&word).into_iter();
179        std::iter::from_fn(move || {
180            if let Some(idx) = split_points.next() {
181                let need_hyphen = !word[..idx].ends_with('-');
182                let w = Word {
183                    word: &word.word[prev..idx],
184                    width: display_width(&word[prev..idx]),
185                    whitespace: "",
186                    penalty: if need_hyphen { "-" } else { "" },
187                };
188                prev = idx;
189                return Some(w);
190            }
191
192            if prev < word.word.len() || prev == 0 {
193                let w = Word {
194                    word: &word.word[prev..],
195                    width: display_width(&word[prev..]),
196                    whitespace: word.whitespace,
197                    penalty: word.penalty,
198                };
199                prev = word.word.len() + 1;
200                return Some(w);
201            }
202
203            None
204        })
205    })
206}
207
#[cfg(test)]
mod tests {
    use super::*;

    /// Custom splitter callback that always reports one split point
    /// at index 3.
    fn fixed_split_point(_: &str) -> Vec<usize> {
        vec![3]
    }

    /// Run `split_words` and gather the produced words into a vector
    /// so the result can be compared with `assert_eq!`.
    fn split_all<'a>(words: Vec<Word<'a>>, word_splitter: &'a WordSplitter) -> Vec<Word<'a>> {
        split_words(words, word_splitter).collect()
    }

    #[test]
    fn split_words_no_words() {
        assert_eq!(split_all(vec![], &WordSplitter::HyphenSplitter), vec![]);
    }

    #[test]
    fn split_words_empty_word() {
        assert_eq!(
            split_all(vec![Word::from("   ")], &WordSplitter::HyphenSplitter),
            vec![Word::from("   ")]
        );
    }

    #[test]
    fn split_words_single_word() {
        assert_eq!(
            split_all(vec![Word::from("foobar")], &WordSplitter::HyphenSplitter),
            vec![Word::from("foobar")]
        );
    }

    #[test]
    fn split_words_hyphen_splitter() {
        assert_eq!(
            split_all(vec![Word::from("foo-bar")], &WordSplitter::HyphenSplitter),
            vec![Word::from("foo-"), Word::from("bar")]
        );
    }

    #[test]
    fn split_words_no_hyphenation() {
        assert_eq!(
            split_all(vec![Word::from("foo-bar")], &WordSplitter::NoHyphenation),
            vec![Word::from("foo-bar")]
        );
    }

    #[test]
    fn split_words_adds_penalty() {
        let splitter = WordSplitter::Custom(fixed_split_point);

        // No hyphen before the split point: a "-" penalty is added.
        assert_eq!(
            split_all(vec![Word::from("foobar")], &splitter),
            vec![
                Word {
                    word: "foo",
                    width: 3,
                    whitespace: "",
                    penalty: "-"
                },
                Word {
                    word: "bar",
                    width: 3,
                    whitespace: "",
                    penalty: ""
                }
            ]
        );

        // The first piece already ends with a hyphen: no penalty.
        assert_eq!(
            split_all(vec![Word::from("fo-bar")], &splitter),
            vec![
                Word {
                    word: "fo-",
                    width: 3,
                    whitespace: "",
                    penalty: ""
                },
                Word {
                    word: "bar",
                    width: 3,
                    whitespace: "",
                    penalty: ""
                }
            ]
        );
    }
}