@@ -500,7 +500,7 @@ impl str {
500500 ///
501501 /// # Unsafety
502502 ///
503- /// Caller must check both UTF-8 character boundaries and the boundaries
503+ /// Caller must check both UTF-8 sequence boundaries and the boundaries
504504 /// of the entire slice as
505505 /// well.
506506 ///
@@ -526,15 +526,16 @@ impl str {
526526 core_str:: StrExt :: slice_mut_unchecked ( self , begin, end)
527527 }
528528
529- /// Returns a slice of the string from the character range [`begin`..`end`).
529+ /// Returns a slice of the string from the range [`begin`..`end`) where indices
530+ /// are counted in code points.
530531 ///
531532 /// That is, start at the `begin`-th code point of the string and continue
532533 /// to the `end`-th code point. This does not detect or handle edge cases
533- /// such as leaving a combining character as the first code point of the
534+ /// such as leaving a combining character as the first `char` of the
534535 /// string.
535536 ///
536537 /// Due to the design of UTF-8, this operation is `O(end)`. Use slicing
537- /// syntax if you want to use byte indices rather than codepoint indices .
538+ /// syntax if you want to use `O(1)` byte indices instead .
538539 ///
539540 /// # Panics
540541 ///
@@ -556,26 +557,26 @@ impl str {
556557 core_str:: StrExt :: slice_chars ( self , begin, end)
557558 }
558559
559- /// Given a byte position, return the next char and its index.
560+ /// Given a byte position, return the next code point and its index.
560561 ///
561- /// This can be used to iterate over the Unicode characters of a string.
562+ /// This can be used to iterate over the Unicode code points of a string.
562563 ///
563564 /// # Panics
564565 ///
565566 /// If `i` is greater than or equal to the length of the string.
566- /// If `i` is not the index of the beginning of a valid UTF-8 character .
567+ /// If `i` is not the index of the beginning of a valid UTF-8 sequence .
567568 ///
568569 /// # Examples
569570 ///
570- /// This example manually iterates through the characters of a string;
571+ /// This example manually iterates through the code points of a string;
571572 /// this should normally be
572573 /// done by `.chars()` or `.char_indices()`.
573574 ///
574575 /// ```
575576 /// # #![feature(str_char, core)]
576577 /// use std::str::CharRange;
577578 ///
578- /// let s = "中华Việt Nam";
579+ /// let s = "中华Việt Nam";
579580 /// let mut i = 0;
580581 /// while i < s.len() {
581582 /// let CharRange {ch, next} = s.char_range_at(i);
@@ -591,12 +592,14 @@ impl str {
591592 /// 3: 华
592593 /// 6: V
593594 /// 7: i
594- /// 8: ệ
595- /// 11: t
596- /// 12:
597- /// 13: N
598- /// 14: a
599- /// 15: m
595+ /// 8: e
596+ /// 9: ̣
597+ /// 11: ̂
598+ /// 13: t
599+ /// 14:
600+ /// 15: N
601+ /// 16: a
602+ /// 17: m
600603 /// ```
601604 #[ unstable( feature = "str_char" ,
602605 reason = "often replaced by char_indices, this method may \
@@ -608,26 +611,29 @@ impl str {
608611
609612 /// Given a byte position, return the previous `char` and its position.
610613 ///
611- /// This function can be used to iterate over a Unicode string in reverse.
614+ /// This function can be used to iterate over Unicode code points in reverse.
615+ ///
616+ /// Note that Unicode has many features, such as combining marks, ligatures,
617+ /// and direction marks, that need to be taken into account to correctly reverse a string.
612618 ///
613619 /// Returns 0 for next index if called on start index 0.
614620 ///
615621 /// # Panics
616622 ///
617623 /// If `i` is greater than the length of the string.
618- /// If `i` is not an index following a valid UTF-8 character .
624+ /// If `i` is not an index following a valid UTF-8 sequence .
619625 ///
620626 /// # Examples
621627 ///
622- /// This example manually iterates through the characters of a string;
628+ /// This example manually iterates through the code points of a string;
623629 /// this should normally be
624631 /// done by `.chars().rev()` or `.char_indices().rev()`.
625631 ///
626632 /// ```
627633 /// # #![feature(str_char, core)]
628634 /// use std::str::CharRange;
629635 ///
630- /// let s = "中华Việt Nam";
636+ /// let s = "中华Việt Nam";
631637 /// let mut i = s.len();
632638 /// while i > 0 {
633639 /// let CharRange {ch, next} = s.char_range_at_reverse(i);
@@ -639,12 +645,14 @@ impl str {
639645 /// This outputs:
640646 ///
641647 /// ```text
642- /// 16: m
643- /// 15: a
644- /// 14: N
645- /// 13:
646- /// 12: t
647- /// 11: ệ
648+ /// 18: m
649+ /// 17: a
650+ /// 16: N
651+ /// 15:
652+ /// 14: t
653+ /// 13: ̂
654+ /// 11: ̣
655+ /// 9: e
648656 /// 8: i
649657 /// 7: V
650658 /// 6: 华
@@ -663,7 +671,7 @@ impl str {
663671 /// # Panics
664672 ///
665673 /// If `i` is greater than or equal to the length of the string.
666- /// If `i` is not the index of the beginning of a valid UTF-8 character .
674+ /// If `i` is not the index of the beginning of a valid UTF-8 sequence .
667675 ///
668676 /// # Examples
669677 ///
@@ -672,6 +680,7 @@ impl str {
672680 /// let s = "abπc";
673681 /// assert_eq!(s.char_at(1), 'b');
674682 /// assert_eq!(s.char_at(2), 'π');
683+ /// assert_eq!(s.char_at(4), 'c');
675684 /// ```
676685 #[ unstable( feature = "str_char" ,
677686 reason = "frequently replaced by the chars() iterator, this \
@@ -689,7 +698,7 @@ impl str {
689698 /// # Panics
690699 ///
691700 /// If `i` is greater than the length of the string.
692- /// If `i` is not an index following a valid UTF-8 character .
701+ /// If `i` is not an index following a valid UTF-8 sequence .
693702 ///
694703 /// # Examples
695704 ///
@@ -698,6 +707,7 @@ impl str {
698707 /// let s = "abπc";
699708 /// assert_eq!(s.char_at_reverse(1), 'a');
700709 /// assert_eq!(s.char_at_reverse(2), 'b');
710+ /// assert_eq!(s.char_at_reverse(3), 'π');
701711 /// ```
702712 #[ unstable( feature = "str_char" ,
703713 reason = "see char_at for more details, but reverse semantics \
@@ -707,28 +717,30 @@ impl str {
707717 core_str:: StrExt :: char_at_reverse ( self , i)
708718 }
709719
710- /// Retrieves the first character from a `&str` and returns it.
720+ /// Retrieves the first code point from a `&str` and returns it.
721+ ///
722+ /// Note that a single Unicode character (grapheme cluster)
723+ /// can be composed of multiple `char`s.
711724 ///
712725 /// This does not allocate a new string; instead, it returns a slice that
713- /// points one character
714- /// beyond the character that was shifted.
726+ /// points one code point beyond the code point that was shifted.
715727 ///
716- /// If the slice does not contain any characters, None is returned instead .
728+ /// `None` is returned if the slice is empty .
717729 ///
718730 /// # Examples
719731 ///
720732 /// ```
721733 /// # #![feature(str_char)]
722- /// let s = "Löwe 老虎 Léopard";
734+ /// let s = "Łódź"; // \u{141}o\u{301}dz\u{301}
723735 /// let (c, s1) = s.slice_shift_char().unwrap();
724736 ///
725- /// assert_eq!(c, 'L ');
726- /// assert_eq!(s1, "öwe 老虎 Léopard ");
737+ /// assert_eq!(c, 'Ł ');
738+ /// assert_eq!(s1, "ódź ");
727739 ///
728740 /// let (c, s2) = s1.slice_shift_char().unwrap();
729741 ///
730- /// assert_eq!(c, 'ö ');
731- /// assert_eq!(s2, "we 老虎 Léopard ");
742+ /// assert_eq!(c, 'o ');
743+ /// assert_eq!(s2, "\u{301}dz\u{301} ");
732744 /// ```
733745 #[ unstable( feature = "str_char" ,
734746 reason = "awaiting conventions about shifting and slices and \
@@ -741,14 +753,14 @@ impl str {
741753 /// Divide one string slice into two at an index.
742754 ///
743755 /// The index `mid` is a byte offset from the start of the string
744- /// that must be on a character boundary.
756+ /// that must be on a `char` boundary.
745757 ///
746758 /// Return slices `&self[..mid]` and `&self[mid..]`.
747759 ///
748760 /// # Panics
749761 ///
750- /// Panics if `mid` is beyond the last character of the string,
751- /// or if it is not on a character boundary.
762+ /// Panics if `mid` is beyond the last code point of the string,
763+ /// or if it is not on a `char` boundary.
752764 ///
753765 /// # Examples
754766 /// ```
@@ -773,27 +785,39 @@ impl str {
773785 core_str:: StrExt :: split_at_mut ( self , mid)
774786 }
775787
776- /// An iterator over the codepoints of `self`.
788+ /// An iterator over the code points of `self`.
789+ ///
790+ /// In Unicode, the relationship between code points and characters is complex.
791+ /// A single character may be composed of multiple code points
792+ /// (e.g. diacritical marks added to a letter), and a single code point
793+ /// (e.g. a Hangul syllable) may contain multiple characters.
794+ ///
795+ /// For iteration over human-readable characters a grapheme cluster iterator
796+ /// may be more appropriate. See the [unicode-segmentation crate][1].
797+ ///
798+ /// [1]: https://crates.io/crates/unicode-segmentation
777799 ///
778800 /// # Examples
779801 ///
780802 /// ```
781- /// let v: Vec<char> = "abc åäö ".chars().collect();
803+ /// let v: Vec<char> = "ASCII żółć 🇨🇭 한 ".chars().collect();
782804 ///
783- /// assert_eq!(v, ['a', 'b', 'c', ' ', 'å', 'ä', 'ö']);
805+ /// assert_eq!(v, ['A', 'S', 'C', 'I', 'I', ' ',
806+ /// 'z', '\u{307}', 'o', '\u{301}', 'ł', 'c', '\u{301}', ' ',
807+ /// '\u{1f1e8}', '\u{1f1ed}', ' ', '한']);
784808 /// ```
785809 #[ stable( feature = "rust1" , since = "1.0.0" ) ]
786810 pub fn chars ( & self ) -> Chars {
787811 core_str:: StrExt :: chars ( self )
788812 }
789813
790- /// An iterator over the characters of `self` and their byte offsets.
814+ /// An iterator over the `char`s of `self` and their byte offsets.
791815 ///
792816 /// # Examples
793817 ///
794818 /// ```
795- /// let v: Vec<(usize, char)> = "abc ".char_indices().collect();
796- /// let b = vec![(0, 'a '), (1, 'b '), (2 , 'c ')];
819+ /// let v: Vec<(usize, char)> = "A🇨🇭 ".char_indices().collect();
820+ /// let b = vec![(0, 'A '), (1, '\u{1f1e8} '), (5 , '\u{1f1ed} ')];
797821 ///
798822 /// assert_eq!(v, b);
799823 /// ```
@@ -822,7 +846,7 @@ impl str {
822846 /// # Examples
823847 ///
824848 /// ```
825- /// let some_words = " Mary had\ta little \n\t lamb";
849+ /// let some_words = " Mary had\ta\u{2009} little \n\t lamb";
826850 /// let v: Vec<&str> = some_words.split_whitespace().collect();
827851 ///
828852 /// assert_eq!(v, ["Mary", "had", "a", "little", "lamb"]);
@@ -840,7 +864,7 @@ impl str {
840864 /// ```
841865 /// # #![feature(str_words)]
842866 /// # #![allow(deprecated)]
843- /// let some_words = " Mary had\ta little \n\t lamb";
867+ /// let some_words = " Mary had\ta\u{2009} little \n\t lamb";
844868 /// let v: Vec<&str> = some_words.words().collect();
845869 ///
846870 /// assert_eq!(v, ["Mary", "had", "a", "little", "lamb"]);
0 commit comments