3030//! # Rules for determining width
3131//!
3232//! This crate currently uses the following rules to determine the width of a
33- //! character or string, in order of decreasing precedence. These may be tweaked in the future;
34- //! however see [guarantees](#guarantees) below.
33+ //! character or string, in order of decreasing precedence. These may be tweaked in the future.
3534//!
3635//! 1. [Emoji presentation sequences] have width 2.
3736//! 2. Outside of an East Asian context, [text presentation sequences] have width 1
3837//! if their base character:
3938//! - Has the [`Emoji_Presentation`] property, and
4039//! - Is not in the [Enclosed Ideographic Supplement] block.
4140//! 3. The sequence `"\r\n"` has width 1.
42- //! 4. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
43- //! 5. The following have width 0:
41+ //! 4. [Lisu tone letter] combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
42+ //! followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1.
43+ //! 5. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
44+ //! 6. The following have width 0:
4445//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
4546//! with the [`Default_Ignorable_Code_Point`] property.
4647//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
5657//! - [`'\u{1B43}'` BALINESE VOWEL SIGN PEPET TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B43).
5758//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D)
5859//! with a [`Hangul_Syllable_Type`] of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`).
59- //! 6 . [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
60+ //! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
6061//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
61- //! 7 . [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
62+ //! 8. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
6263//! with an [`East_Asian_Width`] of [`Ambiguous`] have width 2 in an East Asian context, and width 1 otherwise.
63- //! 8 . All other characters have width 1.
64+ //! 9. All other characters have width 1.
6465//!
6566//! [`Default_Ignorable_Code_Point`]: https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095
6667//! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
7778//!
7879//! [Enclosed Ideographic Supplement]: https://unicode.org/charts/PDF/U1F200.pdf
7980//!
80- //! ## Guarantees
81+ //! [Lisu tone letter]: https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078
8182//!
82- //! - Any two canonically equivalent strings have the same non-CJK width.
83- //! This will not change in any future semver-compatible version.
84- //! (This guarantee does not currently hold for the CJK width variants.)
85- //! - The width of any string equals the sum of the widths of its [extended grapheme clusters].
86- //! This is unlikely to change in any future semver-compatible version.
87- //! (This guarantee holds for both CJK and non-CJK width.)
83+ //! ## Canonical equivalence
8884//!
89- //! [extended grapheme clusters]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
85+ //! The non-CJK width methods guarantee that canonically equivalent strings are assigned the same width.
86+ //! However, this guarantee does not currently hold for the CJK width variants.
9087
9188#![ forbid( unsafe_code) ]
9289#![ deny( missing_docs) ]
@@ -102,14 +99,6 @@ pub use tables::UNICODE_VERSION;
10299mod tables;
103100
104101/// Methods for determining displayed width of Unicode characters.
105- ///
106- /// **NB:** the width of a string may differ from the sum of the widths of its characters;
107- /// see the [crate-level documentation](crate#rules-for-determining-width) for more.
108- /// Instead of working with individual characters, consider using [extended grapheme clusters],
109- /// perhaps with the [`unicode-segmentation`] crate.
110- ///
111- /// [extended grapheme clusters]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
112- /// [`unicode-segmentation`]: https://docs.rs/unicode-segmentation/latest/unicode_segmentation/trait.UnicodeSegmentation.html#tymethod.graphemes
113102pub trait UnicodeWidthChar {
114103 /// Returns the character's displayed width in columns, or `None` if the
115104 /// character is a control character.
@@ -200,8 +189,14 @@ impl UnicodeWidthStr for str {
200189enum NextCharInfo {
201190 #[ default]
202191 Default ,
192+ /// `'\n'`
203193 LineFeed = 0x0A ,
194+ /// `'\u{A4FC}'..='\u{A4FD}'`
195+ /// <https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078>
196+ TrailingLisuToneLetter ,
197+ /// `'\u{FE0E}'`
204198 Vs15 = 0x0E ,
199+ /// `'\u{FE0F}'`
205200 Vs16 = 0x0F ,
206201}
207202
@@ -219,25 +214,28 @@ fn str_width(s: &str, is_cjk: bool) -> usize {
219214/// they're treated as single width.
220215#[ inline]
221216fn width_in_str ( c : char , is_cjk : bool , next_info : NextCharInfo ) -> ( usize , NextCharInfo ) {
222- match next_info {
223- NextCharInfo :: Vs15 if !is_cjk && cw:: starts_non_ideographic_text_presentation_seq ( c) => {
224- ( 1 , NextCharInfo :: Default )
217+ if next_info == NextCharInfo :: Vs16 && cw:: starts_emoji_presentation_seq ( c) {
218+ ( 2 , NextCharInfo :: Default )
219+ } else if c <= '\u{A0}' {
220+ match c {
221+ '\n' => ( 1 , NextCharInfo :: LineFeed ) ,
222+ '\r' if next_info == NextCharInfo :: LineFeed => ( 0 , NextCharInfo :: Default ) ,
223+ _ => ( 1 , NextCharInfo :: Default ) ,
225224 }
226- NextCharInfo :: Vs16 if cw:: starts_emoji_presentation_seq ( c) => ( 2 , NextCharInfo :: Default ) ,
227- _ => {
228- if c <= '\u{A0}' {
229- match c {
230- '\n' => ( 1 , NextCharInfo :: LineFeed ) ,
231- '\r' if next_info == NextCharInfo :: LineFeed => ( 0 , NextCharInfo :: Default ) ,
232- _ => ( 1 , NextCharInfo :: Default ) ,
233- }
234- } else {
235- match c {
236- '\u{FE0E}' => ( 0 , NextCharInfo :: Vs15 ) ,
237- '\u{FE0F}' => ( 0 , NextCharInfo :: Vs16 ) ,
238- _ => ( cw:: lookup_width ( c, is_cjk) , NextCharInfo :: Default ) ,
239- }
225+ } else {
226+ match ( c, next_info) {
227+ ( '\u{A4F8}' ..='\u{A4FB}' , NextCharInfo :: TrailingLisuToneLetter ) => {
228+ ( 0 , NextCharInfo :: Default )
229+ }
230+ ( '\u{A4FC}' ..='\u{A4FD}' , _) => ( 1 , NextCharInfo :: TrailingLisuToneLetter ) ,
231+ ( '\u{FE0E}' , _) => ( 0 , NextCharInfo :: Vs15 ) ,
232+ ( '\u{FE0F}' , _) => ( 0 , NextCharInfo :: Vs16 ) ,
233+ ( _, NextCharInfo :: Vs15 )
234+ if !is_cjk && cw:: starts_non_ideographic_text_presentation_seq ( c) =>
235+ {
236+ ( 1 , NextCharInfo :: Default )
240237 }
238+ _ => ( cw:: lookup_width ( c, is_cjk) , NextCharInfo :: Default ) ,
241239 }
242240 }
243241}
0 commit comments