88// option. This file may not be copied, modified, or distributed
99// except according to those terms.
1010
11- //! Unicode characters manipulation (`char` type)
11+ //! Character manipulation (`char` type, Unicode Scalar Value)
12+ //!
13+ //! This module provides the `Char` trait, as well as its implementation
14+ //! for the primitive `char` type, in order to allow basic character manipulation.
15+ //!
16+ //! A `char` actually represents a
17+ //! *[Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value)*,
18+ //! as it can contain any Unicode code point except high-surrogate and
19+ //! low-surrogate code points.
20+ //!
21+ //! As such, only values in the ranges \[0x0,0xD7FF\] and \[0xE000,0x10FFFF\]
22+ //! (inclusive) are allowed. A `char` can always be safely cast to a `u32`;
23+ //! however the converse is not always true due to the above range limits
24+ //! and, as such, should be performed via the `from_u32` function.
25+
1226
1327use cast:: transmute;
1428use option:: { None , Option , Some } ;
@@ -66,7 +80,7 @@ static TAG_FOUR_B: uint = 240u;
6680/// The highest valid code point
6781pub static MAX : char = ' \U 0010 ffff' ;
6882
69- /// Convert from `u32` to a character.
83+ /// Converts from `u32` to a `char`
7084#[ inline]
7185pub fn from_u32 ( i : u32 ) -> Option < char > {
7286 // catch out-of-bounds and surrogates
@@ -77,31 +91,44 @@ pub fn from_u32(i: u32) -> Option<char> {
7791 }
7892}
7993
80- /// Returns whether the specified character is considered a unicode alphabetic
81- /// character
94+ /// Returns whether the specified `char` is considered a Unicode alphabetic
95+ /// code point
8296pub fn is_alphabetic ( c : char ) -> bool { derived_property:: Alphabetic ( c) }
83- #[ allow( missing_doc) ]
97+
98+ /// Returns whether the specified `char` satisfies the 'XID_Start' Unicode property
99+ ///
100+ /// 'XID_Start' is a Unicode Derived Property specified in
101+ /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
102+ /// mostly similar to 'ID_Start' but modified for closure under NFKx.
84103pub fn is_XID_start ( c : char ) -> bool { derived_property:: XID_Start ( c) }
85- #[ allow( missing_doc) ]
104+
105+ /// Returns whether the specified `char` satisfies the 'XID_Continue' Unicode property
106+ ///
107+ /// 'XID_Continue' is a Unicode Derived Property specified in
108+ /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
109+ /// mostly similar to 'ID_Continue' but modified for closure under NFKx.
86110pub fn is_XID_continue ( c : char ) -> bool { derived_property:: XID_Continue ( c) }
87111
88112///
89- /// Indicates whether a character is in lower case, defined
90- /// in terms of the Unicode Derived Core Property 'Lowercase'.
113+ /// Indicates whether a `char` is in lower case
114+ ///
115+ /// This is defined according to the terms of the Unicode Derived Core Property 'Lowercase'.
91116///
92117#[ inline]
93118pub fn is_lowercase ( c : char ) -> bool { derived_property:: Lowercase ( c) }
94119
95120///
96- /// Indicates whether a character is in upper case, defined
97- /// in terms of the Unicode Derived Core Property 'Uppercase'.
121+ /// Indicates whether a `char` is in upper case
122+ ///
123+ /// This is defined according to the terms of the Unicode Derived Core Property 'Uppercase'.
98124///
99125#[ inline]
100126pub fn is_uppercase ( c : char ) -> bool { derived_property:: Uppercase ( c) }
101127
102128///
103- /// Indicates whether a character is whitespace. Whitespace is defined in
104- /// terms of the Unicode Property 'White_Space'.
129+ /// Indicates whether a `char` is whitespace
130+ ///
131+ /// Whitespace is defined in terms of the Unicode Property 'White_Space'.
105132///
106133#[ inline]
107134pub fn is_whitespace ( c : char ) -> bool {
@@ -112,9 +139,10 @@ pub fn is_whitespace(c: char) -> bool {
112139}
113140
114141///
115- /// Indicates whether a character is alphanumeric. Alphanumericness is
116- /// defined in terms of the Unicode General Categories 'Nd', 'Nl', 'No'
117- /// and the Derived Core Property 'Alphabetic'.
142+ /// Indicates whether a `char` is alphanumeric
143+ ///
144+ /// Alphanumericness is defined in terms of the Unicode General Categories
145+ /// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
118146///
119147#[ inline]
120148pub fn is_alphanumeric ( c : char ) -> bool {
@@ -125,14 +153,15 @@ pub fn is_alphanumeric(c: char) -> bool {
125153}
126154
127155///
128- /// Indicates whether a character is a control character. Control
129- /// characters are defined in terms of the Unicode General Category
156+ /// Indicates whether a `char` is a control code point
157+ ///
158+ /// Control code points are defined in terms of the Unicode General Category
130159/// 'Cc'.
131160///
132161#[ inline]
133162pub fn is_control ( c : char ) -> bool { general_category:: Cc ( c) }
134163
135- /// Indicates whether the character is numeric (Nd, Nl, or No)
164+ /// Indicates whether the `char` is numeric (Nd, Nl, or No)
136165#[ inline]
137166pub fn is_digit ( c : char ) -> bool {
138167 general_category:: Nd ( c)
@@ -141,7 +170,8 @@ pub fn is_digit(c: char) -> bool {
141170}
142171
143172///
144- /// Checks if a character parses as a numeric digit in the given radix.
173+ /// Checks if a `char` parses as a numeric digit in the given radix
174+ ///
145175/// Compared to `is_digit()`, this function only recognizes the
146176/// characters `0-9`, `a-z` and `A-Z`.
147177///
@@ -167,13 +197,13 @@ pub fn is_digit_radix(c: char, radix: uint) -> bool {
167197}
168198
169199///
170- /// Convert a char to the corresponding digit.
200+ /// Converts a `char` to the corresponding digit
171201///
172202/// # Return value
173203///
174204/// If `c` is between '0' and '9', the corresponding value
175205/// between 0 and 9. If `c` is 'a' or 'A', 10. If `c` is
176- /// 'b' or 'B', 11, etc. Returns none if the char does not
206+ /// 'b' or 'B', 11, etc. Returns `None` if the `char` does not
177207/// refer to a digit in the given radix.
178208///
179209/// # Failure
@@ -196,7 +226,7 @@ pub fn to_digit(c: char, radix: uint) -> Option<uint> {
196226}
197227
198228///
199- /// Converts a number to the character representing it.
229+ /// Converts a number to the character representing it
200230///
201231/// # Return value
202232///
@@ -254,7 +284,7 @@ fn decompose_hangul(s: char, f: |char|) {
254284 }
255285}
256286
257- /// Returns the canonical decomposition of a character.
287+ /// Returns the canonical decomposition of a character
258288pub fn decompose_canonical ( c : char , f: |char|) {
259289 if ( c as uint ) < S_BASE || ( c as uint ) >= ( S_BASE + S_COUNT ) {
260290 decompose:: canonical ( c, f) ;
@@ -263,7 +293,7 @@ pub fn decompose_canonical(c: char, f: |char|) {
263293 }
264294}
265295
266- /// Returns the compatibility decomposition of a character.
296+ /// Returns the compatibility decomposition of a character
267297pub fn decompose_compatible ( c : char , f: |char|) {
268298 if ( c as uint ) < S_BASE || ( c as uint ) >= ( S_BASE + S_COUNT ) {
269299 decompose:: compatibility ( c, f) ;
@@ -273,7 +303,7 @@ pub fn decompose_compatible(c: char, f: |char|) {
273303}
274304
275305///
276- /// Return the hexadecimal unicode escape of a char.
306+ /// Returns the hexadecimal Unicode escape of a `char`
277307///
278308/// The rules are as follows:
279309///
@@ -301,7 +331,7 @@ pub fn escape_unicode(c: char, f: |char|) {
301331}
302332
303333///
304- /// Return a 'default' ASCII and C++11-like char- literal escape of a char.
334+ /// Returns a 'default' ASCII and C++11-like literal escape of a `char`
305335///
306336/// The default is chosen with a bias toward producing literals that are
307337/// legal in a variety of languages, including C++11 and similar C-family
@@ -325,7 +355,7 @@ pub fn escape_default(c: char, f: |char|) {
325355 }
326356}
327357
328- /// Returns the amount of bytes this character would need if encoded in utf8
358+ /// Returns the number of bytes this `char` would need if encoded in UTF-8
329359pub fn len_utf8_bytes ( c : char ) -> uint {
330360 static MAX_ONE_B : uint = 128 u;
331361 static MAX_TWO_B : uint = 2048 u;
@@ -360,8 +390,9 @@ pub trait Char {
360390 fn escape_default ( & self , f: |char|) ;
361391 fn len_utf8_bytes ( & self ) -> uint ;
362392
363- /// Encodes this character as utf-8 into the provided byte-buffer. The
364- /// buffer must be at least 4 bytes long or a runtime failure will occur.
393+ /// Encodes this `char` as UTF-8 into the provided byte-buffer
394+ ///
395+ /// The buffer must be at least 4 bytes long or a runtime failure will occur.
365396 ///
366397 /// This will then return the number of bytes written to the slice.
367398 fn encode_utf8 ( & self , dst : & mut [ u8 ] ) -> uint ;
0 commit comments