@@ -47,37 +47,39 @@ def fetch(f):
4747 sys .stderr .write ("cannot load %s\n " % f )
4848 exit (1 )
4949
# Implementation from unicode-segmentation
def load_properties(f, interestingprops=None):
    """Parse a Unicode property data file into per-property code point ranges.

    ``fetch(f)`` is called first (defined elsewhere in this file; presumably it
    makes the data file available locally -- confirm against its definition).
    The file is then read from the current directory using the base name of
    ``f``.

    Recognised line shapes are ``XXXX ; Prop`` (single code point) and
    ``XXXX..YYYY ; Prop`` (inclusive range); every other line (comments, blank
    lines) is ignored. Only the first word after the ``;`` is taken as the
    property name.

    Args:
        f: Name (or path) of the data file, e.g. ``"IdentifierStatus.txt"``.
        interestingprops: Optional container of property names to keep. When
            falsy, every property found in the file is kept.

    Returns:
        A dict mapping each property name to a list of ``(lo, hi)`` integer
        tuples, in the order the ranges appear in the file.
    """
    fetch(f)
    props = {}
    # Leading indentation and the padding around ';' are optional: data lines
    # start at column 0, so the patterns must not demand a space there.
    single_re = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
    range_re = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")

    # A plain context-managed open() closes the handle deterministically and
    # avoids relying on fileinput's module-global state.
    with open(os.path.basename(f)) as src:
        for line in src:
            m = single_re.match(line)
            if m:
                # A single code point is stored as a one-element range.
                lo = hi = m.group(1)
                prop = m.group(2)
            else:
                m = range_re.match(line)
                if not m:
                    continue
                lo, hi, prop = m.groups()

            if interestingprops and prop not in interestingprops:
                continue

            props.setdefault(prop, []).append((int(lo, 16), int(hi, 16)))

    return props
8183
8284def format_table_content (f , content , indent ):
8385 line = " " * indent
@@ -115,41 +117,95 @@ def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
115117 format_table_content (f , data , 8 )
116118 f .write ("\n ];\n \n " )
117119
def emit_identifier_module(f):
    """Write the Rust ``identifier`` module to the file-like object ``f``.

    The module text consists of the ``IdentifierType`` enum, two lookup
    functions that delegate to ``super::util``, and two range tables:
    ``IDENTIFIER_STATUS`` (the ``Allowed`` ranges loaded from
    ``IdentifierStatus.txt``) and ``IDENTIFIER_TYPE`` (all ranges loaded from
    ``IdentifierType.txt``, tagged with their type and ordered by range start).
    """

    def status_entry(rng):
        # Renders one (lo, hi) pair as a Rust tuple literal.
        return "(%s,%s)" % (escape_char(rng[0]), escape_char(rng[1]))

    def type_entry(rng):
        # Renders one (lo, hi, type) triple as a Rust tuple literal.
        return "(%s,%s, IdentifierType::%s)" % (escape_char(rng[0]), escape_char(rng[1]), rng[2])

    f.write("pub mod identifier {")
    f.write("""

    #[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
    #[allow(non_camel_case_types)]
    /// https://www.unicode.org/reports/tr39/#Identifier_Status_and_Type
    pub enum IdentifierType {
        // Restricted
        Not_Character,
        Deprecated,
        Default_Ignorable,
        Not_NFKC,
        Not_XID,
        Exclusion,
        Obsolete,
        Technical,
        Uncommon_Use,
        Limited_Use,

        // Allowed
        Inclusion,
        Recommended
    }
    #[inline]
    pub fn identifier_status_allowed(c: char) -> bool {
        // FIXME: do we want to special case ASCII here?
        match c as usize {
            _ => super::util::bsearch_range_table(c, IDENTIFIER_STATUS)
        }
    }

    #[inline]
    pub fn identifier_type(c: char) -> Option<IdentifierType> {
        // FIXME: do we want to special case ASCII here?
        match c as usize {
            _ => super::util::bsearch_range_value_table(c, IDENTIFIER_TYPE)
        }
    }
""")

    f.write("    // Identifier status table:\n")
    status_by_name = load_properties("IdentifierStatus.txt")
    emit_table(f, "IDENTIFIER_STATUS", status_by_name['Allowed'], "&'static [(char, char)]",
               is_pub=False, pfun=status_entry)

    ranges_by_type = load_properties("IdentifierType.txt")
    # Flatten {type: [(lo, hi), ...]} into (lo, hi, type) triples. sorted() is
    # stable, so entries sharing a start keep their flattening order.
    type_table = sorted(
        ((lo, hi, ty) for ty in ranges_by_type for (lo, hi) in ranges_by_type[ty]),
        key=lambda entry: entry[0],
    )

    emit_table(f, "IDENTIFIER_TYPE", type_table, "&'static [(char, char, IdentifierType)]",
               is_pub=False, pfun=type_entry)
    f.write("}\n\n")
152175
def emit_util_mod(f):
    """Write the Rust ``util`` module to the file-like object ``f``.

    The emitted module holds two binary-search helpers over sorted range
    tables: ``bsearch_range_table`` (membership test over ``(char, char)``
    ranges) and ``bsearch_range_value_table`` (value lookup over
    ``(char, char, T)`` ranges).
    """
    rust_source = """
pub mod util {
    use core::result::Result::{Ok, Err};
    #[inline]
    pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
        use core::cmp::Ordering::{Equal, Less, Greater};
        r.binary_search_by(|&(lo,hi)| {
            if lo <= c && c <= hi { Equal }
            else if hi < c { Less }
            else { Greater }
        }).is_ok()
    }

    pub fn bsearch_range_value_table<T: Copy>(c: char, r: &'static [(char, char, T)]) -> Option<T> {
        use core::cmp::Ordering::{Equal, Less, Greater};
        match r.binary_search_by(|&(lo, hi, _)| {
            if lo <= c && c <= hi { Equal }
            else if hi < c { Less }
            else { Greater }
        }) {
            Ok(idx) => {
                let (_, _, cat) = r[idx];
                Some(cat)
            }
            Err(_) => None
        }
    }

}

"""
    f.write(rust_source)
208+
153209if __name__ == "__main__" :
154210 r = "tables.rs"
155211 if os .path .exists (r ):
@@ -164,6 +220,7 @@ def emit_identifier_status_module(f, statuses_table):
164220pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
165221
166222""" % UNICODE_VERSION )
167- ### identifier status module
168- identifier_status_table = load_identifier_status ()
169- emit_identifier_status_module (rf , identifier_status_table )
223+
224+ emit_util_mod (rf )
225+ ### identifier module
226+ emit_identifier_module (rf )
0 commit comments