1919# programs". It is not meant to be a complete implementation of unicode.
2020# For that we recommend you use a proper binding to libicu.
2121
22- import fileinput , re , os , sys
22+ import fileinput , re , os , sys , operator
2323
2424
2525def fetch (f ):
@@ -35,6 +35,8 @@ def fetch(f):
3535def load_unicode_data (f ):
3636 fetch (f )
3737 gencats = {}
38+ upperlower = {}
39+ lowerupper = {}
3840 combines = []
3941 canon_decomp = {}
4042 compat_decomp = {}
@@ -44,6 +46,7 @@ def load_unicode_data(f):
4446 c_hi = 0
4547 com_lo = 0
4648 com_hi = 0
49+
4750 for line in fileinput .input (f ):
4851 fields = line .split (";" )
4952 if len (fields ) != 15 :
@@ -52,7 +55,17 @@ def load_unicode_data(f):
5255 decomp , deci , digit , num , mirror ,
5356 old , iso , upcase , lowcase , titlecase ] = fields
5457
55- code = int (code , 16 )
58+ code_org = code
59+ code = int (code , 16 )
60+
61+ # generate char to char direct common and simple conversions
62+ # uppercase to lowercase
63+ if gencat == "Lu" and lowcase != "" and code_org != lowcase :
64+ upperlower [code ] = int (lowcase , 16 )
65+
66+ # lowercase to uppercase
67+ if gencat == "Ll" and upcase != "" and code_org != upcase :
68+ lowerupper [code ] = int (upcase , 16 )
5669
5770 if decomp != "" :
5871 if decomp .startswith ('<' ):
@@ -96,7 +109,7 @@ def load_unicode_data(f):
96109 com_lo = code
97110 com_hi = code
98111
99- return (canon_decomp , compat_decomp , gencats , combines )
112+ return (canon_decomp , compat_decomp , gencats , combines , lowerupper , upperlower )
100113
101114def load_properties (f , interestingprops ):
102115 fetch (f )
@@ -164,11 +177,12 @@ def emit_property_module(f, mod, tbl):
164177 keys = tbl .keys ()
165178 keys .sort ()
166179 emit_bsearch_range_table (f );
180+
167181 for cat in keys :
168182 if cat not in ["Nd" , "Nl" , "No" , "Cc" ,
169- "XID_Start" , "XID_Continue" , "Alphabetic" ,
170- "Lowercase" , "Uppercase" , "White_Space" ]:
171- continue
183+ "XID_Start" , "XID_Continue" , "Alphabetic" ,
184+ "Lowercase" , "Uppercase" , "White_Space" ]:
185+ continue
172186 f .write (" static %s_table : &'static [(char,char)] = &[\n " % cat )
173187 ix = 0
174188 for pair in tbl [cat ]:
@@ -183,30 +197,58 @@ def emit_property_module(f, mod, tbl):
183197 f .write ("}\n " )
184198
185199
186- def emit_property_module_old (f , mod , tbl ):
187- f .write ("mod %s {\n " % mod )
188- keys = tbl .keys ()
189- keys .sort ()
190- for cat in keys :
191- f .write (" fn %s(c: char) -> bool {\n " % cat )
192- f .write (" ret alt c {\n " )
193- prefix = ' '
194- for pair in tbl [cat ]:
195- if pair [0 ] == pair [1 ]:
196- f .write (" %c %s\n " %
197- (prefix , escape_char (pair [0 ])))
198- else :
199- f .write (" %c %s to %s\n " %
200- (prefix ,
201- escape_char (pair [0 ]),
202- escape_char (pair [1 ])))
203- prefix = '|'
204- f .write (" { true }\n " )
205- f .write (" _ { false }\n " )
206- f .write (" };\n " )
207- f .write (" }\n \n " )
def emit_conversions_module(f, lowerupper, upperlower):
    """Write the Rust `conversions` module to the output stream `f`.

    The module consists of three parts, written in this order:
    the module header, a fixed Rust source template (imports, `to_lower`,
    `to_upper` and the `bsearch_case_table` helper they share), and the two
    case-mapping tables produced by `emit_caseconversions`.

    f          -- writable file-like object receiving the generated Rust code
    lowerupper -- mapping handed to `emit_caseconversions` (lowercase -> uppercase)
    upperlower -- mapping handed to `emit_caseconversions` (uppercase -> lowercase)
    """
    # Fixed Rust code: both public functions look the character up in a
    # (char, char) table and fall back to returning the input unchanged.
    rust_template = """
    use cmp::{Equal, Less, Greater};
    use vec::ImmutableVector;
    use tuple::Tuple2;
    use option::{ Option, Some, None };

    pub fn to_lower(c: char) -> char {
        match bsearch_case_table(c, LuLl_table) {
            None => c,
            Some(index) => LuLl_table[index].val1()
        }
    }

    pub fn to_upper(c: char) -> char {
        match bsearch_case_table(c, LlLu_table) {
            None => c,
            Some(index) => LlLu_table[index].val1()
        }
    }

    fn bsearch_case_table(c: char, table: &'static [(char, char)]) -> Option<uint> {
        table.bsearch(|&(key, _)| {
            if c == key { Equal }
            else if key < c { Less }
            else { Greater }
        })
    }
"""

    f.write("pub mod conversions {\n ")
    f.write(rust_template)
    # The tables referenced by the template (LuLl_table / LlLu_table) are
    # emitted inside the same module, after the functions that use them.
    emit_caseconversions(f, lowerupper, upperlower)
    f.write("}\n ")
209232
def _emit_case_table(f, name, mapping):
    """Write one Rust `static <name> : &'static [(char, char)]` table.

    Entries are written in ascending order of the mapping's keys, which is
    the order the generated binary search over the table relies on.
    """
    f.write(" static %s : &'static [(char, char)] = &[\n " % name)
    # Keys are unique, so sorting the (key, value) pairs orders them by key.
    # `items()` (rather than `iteritems()`) keeps this runnable on both
    # Python 2 and Python 3.
    for ix, (key, value) in enumerate(sorted(mapping.items())):
        # ch_prefix is defined elsewhere in this file; presumably it yields
        # the separator/line-break text preceding element `ix` -- confirm.
        f.write(ch_prefix(ix))
        f.write("(%s, %s)" % (escape_char(key), escape_char(value)))
    f.write("\n ];\n \n ")


def emit_caseconversions(f, lowerupper, upperlower):
    """Write the two simple case-conversion tables to `f`.

    f          -- writable file-like object receiving the generated Rust code
    lowerupper -- dict mapping a lowercase code point to its uppercase one;
                  emitted as `LlLu_table`
    upperlower -- dict mapping an uppercase code point to its lowercase one;
                  emitted as `LuLl_table`

    `LuLl_table` is written first, then `LlLu_table`.
    """
    _emit_case_table(f, "LuLl_table", upperlower)
    _emit_case_table(f, "LlLu_table", lowerupper)
251+
210252def format_table_content (f , content , indent ):
211253 line = " " * indent
212254 first = True
@@ -362,7 +404,8 @@ def emit_decomp_module(f, canon, compat, combine):
362404 os .remove (i );
363405rf = open (r , "w" )
364406
365- (canon_decomp , compat_decomp , gencats , combines ) = load_unicode_data ("UnicodeData.txt" )
407+ (canon_decomp , compat_decomp , gencats ,
408+ combines , lowerupper , upperlower ) = load_unicode_data ("UnicodeData.txt" )
366409
367410# Preamble
368411rf .write ('''// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
@@ -388,7 +431,9 @@ def emit_decomp_module(f, canon, compat, combine):
388431
389432derived = load_properties ("DerivedCoreProperties.txt" ,
390433 ["XID_Start" , "XID_Continue" , "Alphabetic" , "Lowercase" , "Uppercase" ])
434+
391435emit_property_module (rf , "derived_property" , derived )
392436
393437props = load_properties ("PropList.txt" , ["White_Space" ])
394438emit_property_module (rf , "property" , props )
439+ emit_conversions_module (rf , lowerupper , upperlower )
0 commit comments