7171}
7272
7373#[ inline]
74+ #[ allow( unsafe_code) ]
7475fn decompose < D , F > ( c : char , decompose_char : D , mut emit_char : F )
7576where
7677 D : Fn ( char ) -> Option < & ' static [ char ] > ,
8485
8586 // Perform decomposition for Hangul
8687 if is_hangul_syllable ( c) {
87- decompose_hangul ( c, emit_char) ;
88+ // Safety: Hangul Syllables invariant checked by is_hangul_syllable above
89+ unsafe {
90+ decompose_hangul ( c, emit_char) ;
91+ }
8892 return ;
8993 }
9094
@@ -127,27 +131,37 @@ const T_LAST: u32 = T_BASE + T_COUNT - 1;
127131// i.e. `T_BASE + 1 ..= T_LAST`.
128132const T_FIRST : u32 = T_BASE + 1 ;
129133
134+ // Safety-usable invariant: This ensures that c is a valid Hangul Syllable character (U+AC00..U+D7AF)
130135pub ( crate ) fn is_hangul_syllable ( c : char ) -> bool {
136+ // Safety: This checks the range 0xAC00 (S_BASE) to 0xD7A4 (S_BASE + S_COUNT), upholding the safety-usable invariant
131137 ( c as u32 ) >= S_BASE && ( c as u32 ) < ( S_BASE + S_COUNT )
132138}
133139
134140// Decompose a precomposed Hangul syllable
135- #[ allow( unsafe_code) ]
141+ // Safety: `s` MUST be a valid Hangul Syllable character, between U+AC00..U+D7AF
142+ #[ allow( unsafe_code, unused_unsafe) ]
136143#[ inline( always) ]
137- fn decompose_hangul < F > ( s : char , mut emit_char : F )
144+ unsafe fn decompose_hangul < F > ( s : char , mut emit_char : F )
138145where
139146 F : FnMut ( char ) ,
140147{
148+ // This will be at most 0x2baf, the size of the Hangul Syllables block
141149 let s_index = s as u32 - S_BASE ;
150+ // This will be at most 0x2baf / (21 * 28), 19
142151 let l_index = s_index / N_COUNT ;
143152 unsafe {
153+ // Safety: L_BASE (0x1100) plus at most 19 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800)
144154 emit_char ( char:: from_u32_unchecked ( L_BASE + l_index) ) ;
145155
156+ // Safety: This will be at most (N_COUNT - 1) / T_COUNT = (V*T - 1) / T, which is V_COUNT - 1 = 20
146157 let v_index = ( s_index % N_COUNT ) / T_COUNT ;
158+ // Safety: V_BASE (0x1161) plus at most 21 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800)
147159 emit_char ( char:: from_u32_unchecked ( V_BASE + v_index) ) ;
148160
161+ // Safety: This will be at most T_COUNT - 1 (27)
149162 let t_index = s_index % T_COUNT ;
150163 if t_index > 0 {
164+ // Safety: T_BASE (0x11A7) plus at most 27 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800)
151165 emit_char ( char:: from_u32_unchecked ( T_BASE + t_index) ) ;
152166 }
153167 }
@@ -173,14 +187,23 @@ fn compose_hangul(a: char, b: char) -> Option<char> {
173187 match ( a, b) {
174188 // Compose a leading consonant and a vowel together into an LV_Syllable
175189 ( L_BASE ..=L_LAST , V_BASE ..=V_LAST ) => {
190+ // Safety: based on the above bounds, l_index will be at most L_COUNT - 1 (18)
191+ // and v_index will be at most V_COUNT - 1 (20), given L_LAST = L_BASE + L_COUNT - 1 and V_LAST = V_BASE + V_COUNT - 1
176192 let l_index = a - L_BASE ;
177193 let v_index = b - V_BASE ;
194+ // Safety: With l_index <= 18 and v_index <= 20 this is at most 18 * (21 * 28) + 20 * 28, which is 11144 (0x2B88).
178195 let lv_index = l_index * N_COUNT + v_index * T_COUNT ;
196+ // Safety: This is between 0xAC00 and 0xD788 (0xAC00 + 11144), which is in range for Hangul Syllables (U+AC00..U+D7AF) and also in range
197+ // for BMP unicode
179198 let s = S_BASE + lv_index;
199+ // Safety: We've verified this is in-range
180200 Some ( unsafe { char:: from_u32_unchecked ( s) } )
181201 }
182202 // Compose an LV_Syllable and a trailing consonant into an LVT_Syllable
183203 ( S_BASE ..=S_LAST , T_FIRST ..=T_LAST ) if ( a - S_BASE ) % T_COUNT == 0 => {
204+ // Safety: a is between 0xAC00 and (0xAC00 + 19 * 21 * 28). b - T_BASE is between 1 and 27 (T_FIRST = T_BASE + 1, T_LAST = T_BASE + T_COUNT - 1).
205+ // Adding a number 1 to 27 to a number that is at largest 0xD7A4 gives at most 0xD7BF, which will not go out of bounds to 0xD800 (where the
206+ // surrogates start), so this is safe.
184207 Some ( unsafe { char:: from_u32_unchecked ( a + ( b - T_BASE ) ) } )
185208 }
186209 _ => None ,
0 commit comments