3434//!
3535//! 1. [Emoji presentation sequences] have width 2.
3636//! (The width of a string may therefore differ from the sum of the widths of its characters.)
//! 2. Outside of an East Asian context, [text presentation sequences] have width 1
//!    if and only if their base character fulfills all of the following requirements:
3939//! - Has the [`Emoji_Presentation`] property, and
4040//! - Not in the [Enclosed Ideographic Supplement] block.
41- //! 3. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
42- //! 4. The following have width 0:
41+ //! 3. The sequence `"\r\n"` has width 1.
42+ //! 4. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
43+ //! 5. The following have width 0:
4344//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
4445//! with the [`Default_Ignorable_Code_Point`](https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095) property.
4546//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
5556//! - [`'\u{1B43}'` BALINESE VOWEL SIGN PEPET TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B43).
5657//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D)
5758//! with a [`Hangul_Syllable_Type`] of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`).
58- //! - [`'\0'` NUL](https://util.unicode.org/UnicodeJsps/character.jsp?a=0000).
59- //! 5. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BCc%7D)
60- //! have no defined width, and are ignored when determining the width of a string.
6159//! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
6260//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
6361//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
@@ -99,7 +97,7 @@ mod tables;
9997/// Methods for determining displayed width of Unicode characters.
10098pub trait UnicodeWidthChar {
10199 /// Returns the character's displayed width in columns, or `None` if the
102- /// character is a control character other than `'\x00'` .
100+ /// character is a control character.
103101 ///
104102 /// This function treats characters in the Ambiguous category according
105103 /// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
@@ -108,7 +106,7 @@ pub trait UnicodeWidthChar {
108106 fn width ( self ) -> Option < usize > ;
109107
110108 /// Returns the character's displayed width in columns, or `None` if the
111- /// character is a control character other than `'\x00'` .
109+ /// character is a control character.
112110 ///
113111 /// This function treats characters in the Ambiguous category according
114112 /// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
@@ -120,23 +118,42 @@ pub trait UnicodeWidthChar {
120118impl UnicodeWidthChar for char {
121119 #[ inline]
122120 fn width ( self ) -> Option < usize > {
123- cw :: width ( self , false )
121+ single_char_width ( self , false )
124122 }
125123
126124 #[ inline]
127125 fn width_cjk ( self ) -> Option < usize > {
128- cw:: width ( self , true )
126+ single_char_width ( self , true )
127+ }
128+ }
129+
130+ /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
131+ /// `None` if `c` is a control character.
132+ /// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
133+ /// they're treated as single width.
134+ #[ inline]
135+ fn single_char_width ( c : char , is_cjk : bool ) -> Option < usize > {
136+ if c < '\u{7F}' {
137+ if c >= '\u{20}' {
138+ // U+0020 to U+007F (exclusive) are single-width ASCII codepoints
139+ Some ( 1 )
140+ } else {
141+ // U+0001 to U+0020 (exclusive) are control codes
142+ None
143+ }
144+ } else if c >= '\u{A0}' {
145+ // No characters >= U+00A0 are control codes, so we can consult the lookup tables
146+ Some ( cw:: lookup_width ( c, is_cjk) )
147+ } else {
148+ // U+007F to U+00A0 (exclusive) are control codes
149+ None
129150 }
130151}
131152
132153/// Methods for determining displayed width of Unicode strings.
133154pub trait UnicodeWidthStr {
134155 /// Returns the string's displayed width in columns.
135156 ///
136- /// Control characters are treated as having zero width,
137- /// and [emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
138- /// are assigned width 2.
139- ///
140157 /// This function treats characters in the Ambiguous category according
141158 /// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
142159 /// as 1 column wide. This is consistent with the recommendations for
@@ -145,10 +162,6 @@ pub trait UnicodeWidthStr {
145162
146163 /// Returns the string's displayed width in columns.
147164 ///
148- /// Control characters are treated as having zero width,
149- /// and [emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
150- /// are assigned width 2.
151- ///
152165 /// This function treats characters in the Ambiguous category according
153166 /// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
154167 /// as 2 column wide. This is consistent with the recommendations for
@@ -168,30 +181,48 @@ impl UnicodeWidthStr for str {
168181 }
169182}
170183
/// Context about the character that *follows* the one currently being measured.
///
/// `str_width` walks a string from its end towards its start and threads one of these
/// values through each step, so that a character's width can depend on what comes
/// right after it.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
enum NextCharInfo {
    /// Nothing special follows (also the state at the end of the string).
    #[default]
    Default = 0x00,
    /// The following character is `'\n'`.
    LineFeed = 0x0A,
    /// The following character is U+FE0E (text presentation selector).
    Vs15 = 0x0E,
    /// The following character is U+FE0F (emoji presentation selector).
    Vs16 = 0x0F,
}
176192
177193fn str_width ( s : & str , is_cjk : bool ) -> usize {
178194 s. chars ( )
179- . rfold ( ( 0 , None ) , |( sum, vsel) , c| match c {
180- '\u{FE0E}' => ( sum, Some ( VariationSelector :: Vs15 ) ) ,
181- '\u{FE0F}' => ( sum, Some ( VariationSelector :: Vs16 ) ) ,
182- _ => {
183- let add = match vsel {
184- Some ( VariationSelector :: Vs15 )
185- if !is_cjk && cw:: starts_non_ideographic_text_presentation_seq ( c) =>
186- {
187- 1
188- }
189-
190- Some ( VariationSelector :: Vs16 ) if cw:: starts_emoji_presentation_seq ( c) => 2 ,
191- _ => cw:: width ( c, is_cjk) . unwrap_or ( 0 ) ,
192- } ;
193- ( sum + add, None )
194- }
195+ . rfold ( ( 0 , NextCharInfo :: Default ) , |( sum, next_info) , c| {
196+ let ( add, info) = width_in_str ( c, is_cjk, next_info) ;
197+ ( sum + add, info)
195198 } )
196199 . 0
197200}
201+
202+ /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`.
203+ /// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
204+ /// they're treated as single width.
205+ #[ inline]
206+ fn width_in_str ( c : char , is_cjk : bool , next_info : NextCharInfo ) -> ( usize , NextCharInfo ) {
207+ match next_info {
208+ NextCharInfo :: Vs15 if !is_cjk && cw:: starts_non_ideographic_text_presentation_seq ( c) => {
209+ ( 1 , NextCharInfo :: Default )
210+ }
211+ NextCharInfo :: Vs16 if cw:: starts_emoji_presentation_seq ( c) => ( 2 , NextCharInfo :: Default ) ,
212+ _ => {
213+ if c <= '\u{A0}' {
214+ match c {
215+ '\n' => ( 1 , NextCharInfo :: LineFeed ) ,
216+ '\r' if next_info == NextCharInfo :: LineFeed => ( 0 , NextCharInfo :: Default ) ,
217+ _ => ( 1 , NextCharInfo :: Default ) ,
218+ }
219+ } else {
220+ match c {
221+ '\u{FE0E}' => ( 0 , NextCharInfo :: Vs15 ) ,
222+ '\u{FE0F}' => ( 0 , NextCharInfo :: Vs16 ) ,
223+ _ => ( cw:: lookup_width ( c, is_cjk) , NextCharInfo :: Default ) ,
224+ }
225+ }
226+ }
227+ }
228+ }
0 commit comments