88// option. This file may not be copied, modified, or distributed
99// except according to those terms.
1010
11- //! Unicode characters manipulation (`char` type)
11+ //! Character manipulation (`char` type, Unicode Scalar Value)
12+ //!
13+ //! This module provides the `Char` trait, as well as its implementation
14+ //! for the primitive `char` type, in order to allow basic character manipulation.
15+ //!
16+ //! A `char` actually represents a
17+ //! *[Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value)*,
18+ //! as it can contain any Unicode code point except high-surrogate and
19+ //! low-surrogate code points.
20+ //!
21+ //! As such, only values in the ranges \[0x0,0xD7FF\] and \[0xE000,0x10FFFF\]
22+ //! (inclusive) are allowed. A `char` can always be safely cast to a `u32`;
23+ //! however the converse is not always true due to the above range limits
24+ //! and, as such, should be performed via the `from_u32` function.
25+
1226
1327use cast:: transmute;
1428use option:: { None , Option , Some } ;
@@ -66,7 +80,7 @@ static TAG_FOUR_B: uint = 240u;
6680/// The highest valid code point
6781pub static MAX : char = ' \U 0010 ffff' ;
6882
69- /// Convert from `u32` to a character .
83+ /// Convert from `u32` to a `char`.
7084#[ inline]
7185pub fn from_u32 ( i : u32 ) -> Option < char > {
7286 // catch out-of-bounds and surrogates
@@ -77,30 +91,30 @@ pub fn from_u32(i: u32) -> Option<char> {
7791 }
7892}
7993
80- /// Returns whether the specified character is considered a unicode alphabetic
81- /// character
94+ /// Returns whether the specified `char` is considered a Unicode alphabetic
95+ /// scalar value
8296pub fn is_alphabetic ( c : char ) -> bool { derived_property:: Alphabetic ( c) }
8397#[ allow( missing_doc) ]
8498pub fn is_XID_start ( c : char ) -> bool { derived_property:: XID_Start ( c) }
8599#[ allow( missing_doc) ]
86100pub fn is_XID_continue ( c : char ) -> bool { derived_property:: XID_Continue ( c) }
87101
88102///
89- /// Indicates whether a character is in lower case, defined
103+ /// Indicates whether a `char` is in lower case, defined
90104/// in terms of the Unicode Derived Core Property 'Lowercase'.
91105///
92106#[ inline]
93107pub fn is_lowercase ( c : char ) -> bool { derived_property:: Lowercase ( c) }
94108
95109///
96- /// Indicates whether a character is in upper case, defined
110+ /// Indicates whether a `char` is in upper case, defined
97111/// in terms of the Unicode Derived Core Property 'Uppercase'.
98112///
99113#[ inline]
100114pub fn is_uppercase ( c : char ) -> bool { derived_property:: Uppercase ( c) }
101115
102116///
103- /// Indicates whether a character is whitespace. Whitespace is defined in
117+ /// Indicates whether a `char` is whitespace. Whitespace is defined in
104118/// terms of the Unicode Property 'White_Space'.
105119///
106120#[ inline]
@@ -112,7 +126,7 @@ pub fn is_whitespace(c: char) -> bool {
112126}
113127
114128///
115- /// Indicates whether a character is alphanumeric. Alphanumericness is
129+ /// Indicates whether a `char` is alphanumeric. Alphanumericness is
116130/// defined in terms of the Unicode General Categories 'Nd', 'Nl', 'No'
117131/// and the Derived Core Property 'Alphabetic'.
118132///
@@ -125,14 +139,14 @@ pub fn is_alphanumeric(c: char) -> bool {
125139}
126140
127141///
128- /// Indicates whether a character is a control character . Control
129- /// characters are defined in terms of the Unicode General Category
142+ /// Indicates whether a `char` is a control code point. Control
143+ /// code points are defined in terms of the Unicode General Category
130144/// 'Cc'.
131145///
132146#[ inline]
133147pub fn is_control ( c : char ) -> bool { general_category:: Cc ( c) }
134148
135- /// Indicates whether the character is numeric (Nd, Nl, or No)
149+ /// Indicates whether the `char` is numeric (Nd, Nl, or No)
136150#[ inline]
137151pub fn is_digit ( c : char ) -> bool {
138152 general_category:: Nd ( c)
@@ -141,7 +155,7 @@ pub fn is_digit(c: char) -> bool {
141155}
142156
143157///
144- /// Checks if a character parses as a numeric digit in the given radix.
158+ /// Checks if a `char` parses as a numeric digit in the given radix.
145159/// Compared to `is_digit()`, this function only recognizes the
146160/// characters `0-9`, `a-z` and `A-Z`.
147161///
@@ -167,13 +181,13 @@ pub fn is_digit_radix(c: char, radix: uint) -> bool {
167181}
168182
169183///
170- /// Convert a char to the corresponding digit.
184+ /// Convert a `char` to the corresponding digit.
171185///
172186/// # Return value
173187///
174188/// If `c` is between '0' and '9', the corresponding value
175189/// between 0 and 9. If `c` is 'a' or 'A', 10. If `c` is
176- /// 'b' or 'B', 11, etc. Returns none if the char does not
190+ /// 'b' or 'B', 11, etc. Returns none if the `char` does not
177191/// refer to a digit in the given radix.
178192///
179193/// # Failure
@@ -273,7 +287,7 @@ pub fn decompose_compatible(c: char, f: |char|) {
273287}
274288
275289///
276- /// Return the hexadecimal unicode escape of a char.
290+ /// Return the hexadecimal Unicode escape of a `char`.
277291///
278292/// The rules are as follows:
279293///
@@ -301,7 +315,7 @@ pub fn escape_unicode(c: char, f: |char|) {
301315}
302316
303317///
304- /// Return a 'default' ASCII and C++11-like char- literal escape of a char.
318+ /// Return a 'default' ASCII and C++11-like literal escape of a `char`.
305319///
306320/// The default is chosen with a bias toward producing literals that are
307321/// legal in a variety of languages, including C++11 and similar C-family
@@ -325,7 +339,7 @@ pub fn escape_default(c: char, f: |char|) {
325339 }
326340}
327341
328- /// Returns the amount of bytes this character would need if encoded in utf8
342+ /// Returns the amount of bytes this `char` would need if encoded in UTF-8
329343pub fn len_utf8_bytes ( c : char ) -> uint {
330344 static MAX_ONE_B : uint = 128 u;
331345 static MAX_TWO_B : uint = 2048 u;
@@ -360,7 +374,7 @@ pub trait Char {
360374 fn escape_default ( & self , f: |char|) ;
361375 fn len_utf8_bytes ( & self ) -> uint ;
362376
363- /// Encodes this character as utf-8 into the provided byte-buffer. The
377+ /// Encodes this `char` as UTF-8 into the provided byte-buffer. The
364378 /// buffer must be at least 4 bytes long or a runtime failure will occur.
365379 ///
366380 /// This will then return the number of bytes written to the slice.
0 commit comments