11use UnicodeNormalization ;
2+ use stream_safe;
23use tables;
34
/// The QuickCheck algorithm can quickly determine whether a text is or isn't
/// normalized, in many cases without any allocations, but it has to be able to
/// return `Maybe` when a full decomposition and recomposition is necessary.
8+ #[ derive( Debug , Eq , PartialEq ) ]
79pub enum IsNormalized {
810 /// The text is definitely normalized.
911 Yes ,
@@ -15,17 +17,20 @@ pub enum IsNormalized {
1517
1618// https://unicode.org/reports/tr15/#Detecting_Normalization_Forms
1719#[ inline]
18- fn quick_check < F , I > ( s : I , is_allowed : F ) -> IsNormalized
20+ fn quick_check < F , I > ( s : I , is_allowed : F , stream_safe : bool ) -> IsNormalized
1921 where I : Iterator < Item =char > , F : Fn ( char ) -> IsNormalized
2022{
2123 let mut last_cc = 0u8 ;
24+ let mut nonstarter_count = 0 ;
2225 let mut result = IsNormalized :: Yes ;
2326 for ch in s {
2427 // For ASCII we know it's always allowed and a starter
2528 if ch <= '\x7f' {
2629 last_cc = 0 ;
30+ nonstarter_count = 0 ;
2731 continue ;
2832 }
33+
2934 // Otherwise, lookup the combining class and QC property
3035 let cc = tables:: canonical_combining_class ( ch) ;
3136 if last_cc > cc && cc != 0 {
@@ -38,6 +43,20 @@ fn quick_check<F, I>(s: I, is_allowed: F) -> IsNormalized
3843 result = IsNormalized :: Maybe ;
3944 } ,
4045 }
46+ if stream_safe {
47+ let decomp = stream_safe:: classify_nonstarters ( ch) ;
48+
49+ // If we're above `MAX_NONSTARTERS`, we're definitely *not*
50+ // stream-safe normalized.
51+ if nonstarter_count + decomp. leading_nonstarters > stream_safe:: MAX_NONSTARTERS {
52+ return IsNormalized :: No ;
53+ }
54+ if decomp. leading_nonstarters == decomp. decomposition_len {
55+ nonstarter_count += decomp. decomposition_len ;
56+ } else {
57+ nonstarter_count = decomp. trailing_nonstarters ;
58+ }
59+ }
4160 last_cc = cc;
4261 }
4362 result
@@ -48,16 +67,29 @@ fn quick_check<F, I>(s: I, is_allowed: F) -> IsNormalized
4867/// like `s.chars().nfc().eq(s.chars())` should suffice.
4968#[ inline]
5069pub fn is_nfc_quick < I : Iterator < Item =char > > ( s : I ) -> IsNormalized {
51- quick_check ( s, tables:: qc_nfc)
70+ quick_check ( s, tables:: qc_nfc, false )
5271}
5372
5473/// Quickly check if a string is in NFD.
5574#[ inline]
5675pub fn is_nfd_quick < I : Iterator < Item =char > > ( s : I ) -> IsNormalized {
57- quick_check ( s, tables:: qc_nfd)
76+ quick_check ( s, tables:: qc_nfd, false )
77+ }
78+
79+ /// Quickly check if a string is Stream-Safe NFC.
80+ #[ inline]
81+ pub fn is_nfc_stream_safe_quick < I : Iterator < Item =char > > ( s : I ) -> IsNormalized {
82+ quick_check ( s, tables:: qc_nfc, true )
83+ }
84+
85+ /// Quickly check if a string is Stream-Safe NFD.
86+ #[ inline]
87+ pub fn is_nfd_stream_safe_quick < I : Iterator < Item =char > > ( s : I ) -> IsNormalized {
88+ quick_check ( s, tables:: qc_nfd, true )
5889}
5990
6091/// Authoritatively check if a string is in NFC.
92+ #[ inline]
6193pub fn is_nfc ( s : & str ) -> bool {
6294 match is_nfc_quick ( s. chars ( ) ) {
6395 IsNormalized :: Yes => true ,
@@ -67,10 +99,58 @@ pub fn is_nfc(s: &str) -> bool {
6799}
68100
69101/// Authoritatively check if a string is in NFD.
102+ #[ inline]
70103pub fn is_nfd ( s : & str ) -> bool {
71104 match is_nfd_quick ( s. chars ( ) ) {
72105 IsNormalized :: Yes => true ,
73106 IsNormalized :: No => false ,
74107 IsNormalized :: Maybe => s. chars ( ) . eq ( s. chars ( ) . nfd ( ) ) ,
75108 }
76109}
110+
111+ /// Authoritatively check if a string is Stream-Safe NFC.
112+ #[ inline]
113+ pub fn is_nfc_stream_safe ( s : & str ) -> bool {
114+ match is_nfc_stream_safe_quick ( s. chars ( ) ) {
115+ IsNormalized :: Yes => true ,
116+ IsNormalized :: No => false ,
117+ IsNormalized :: Maybe => s. chars ( ) . eq ( s. chars ( ) . stream_safe ( ) . nfc ( ) ) ,
118+ }
119+ }
120+
121+ /// Authoritatively check if a string is Stream-Safe NFD.
122+ #[ inline]
123+ pub fn is_nfd_stream_safe ( s : & str ) -> bool {
124+ match is_nfd_stream_safe_quick ( s. chars ( ) ) {
125+ IsNormalized :: Yes => true ,
126+ IsNormalized :: No => false ,
127+ IsNormalized :: Maybe => s. chars ( ) . eq ( s. chars ( ) . stream_safe ( ) . nfd ( ) ) ,
128+ }
129+ }
130+
#[cfg(test)]
mod tests {
    use super::{
        IsNormalized,
        is_nfc_stream_safe_quick,
        is_nfd_stream_safe_quick,
    };

    // NOTE: the combining marks in the literals below must be contiguous.
    // `quick_check` resets its nonstarter counter on every ASCII character,
    // so any ASCII separator (such as a space) between the escapes would keep
    // the run length at one and the `too_much` cases could never yield `No`.

    /// A run of 30 consecutive combining marks after "Da" is accepted, while
    /// the same run with one extra mark (U+031E) is rejected outright.
    #[test]
    fn test_stream_safe_nfd() {
        let okay = "Da\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}ngerzone";
        assert_eq!(is_nfd_stream_safe_quick(okay.chars()), IsNormalized::Yes);

        let too_much = "Da\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{031e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}ngerzone";
        assert_eq!(is_nfd_stream_safe_quick(too_much.chars()), IsNormalized::No);
    }

    /// Same idea for NFC, starting from the precomposed U+00E0: the shorter
    /// run is inconclusive (`Maybe`), and the run with one extra mark
    /// (U+031E) is rejected outright.
    #[test]
    fn test_stream_safe_nfc() {
        let okay = "ok\u{e0}\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}y";
        assert_eq!(is_nfc_stream_safe_quick(okay.chars()), IsNormalized::Maybe);

        let too_much = "not ok\u{e0}\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{031e}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}y";
        assert_eq!(is_nfc_stream_safe_quick(too_much.chars()), IsNormalized::No);
    }
}
0 commit comments