 from StringIO import StringIO

 try:
-    # completely optional type hinting
+    # Completely optional type hinting
     # (Python 2 compatible using comments,
     # see: https://mypy.readthedocs.io/en/latest/python2.html)
     # This is very helpful in typing-aware IDE like PyCharm.
     from typing import Dict, Iterator, List, Optional, Set, Tuple
 except ImportError:
     pass


-# we don't use enum.Enum because of Python 2.7 compatibility
+# We don't use enum.Enum because of Python 2.7 compatibility.
 class UnicodeFiles(object):
-    # ReadMe does not contain any unicode data, we
+    # ReadMe does not contain any Unicode data, we
     # only use it to extract versions.
     README = "ReadMe.txt"

@@ -57,11 +57,15 @@ class UnicodeFiles(object):
     UNICODE_DATA = "UnicodeData.txt"


-UnicodeFiles.ALL_FILES = tuple(
-    getattr(UnicodeFiles, name) for name in dir(UnicodeFiles)
+# The order doesn't really matter (Python < 3.6 won't preserve it),
+# we only want to aggregate all the file names.
+ALL_UNICODE_FILES = tuple(
+    value for name, value in UnicodeFiles.__dict__.items()
     if not name.startswith("_")
 )

+assert len(ALL_UNICODE_FILES) == 7, "Unexpected number of unicode files"
+
 # The directory this file is located in.
 THIS_DIR = os.path.dirname(os.path.realpath(__file__))

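
For context: the switch from `dir()` to `UnicodeFiles.__dict__` matters because
`dir()` also lists attributes inherited from base classes, while `__dict__`
holds only the names defined on the class itself. A minimal sketch of the
aggregation pattern (the `_Files` class here is hypothetical, for illustration
only):

    class _Files(object):
        README = "ReadMe.txt"
        UNICODE_DATA = "UnicodeData.txt"

    # Collect every non-underscore class attribute, as the change above does.
    ALL = tuple(
        value for name, value in _Files.__dict__.items()
        if not name.startswith("_")
    )
    assert sorted(ALL) == ["ReadMe.txt", "UnicodeData.txt"]
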
@@ -97,18 +101,17 @@ class UnicodeFiles(object):

 # This is the (inclusive) range of surrogate codepoints.
 # These are not valid Rust characters.
-# - they are not valid Rust characters
 SURROGATE_CODEPOINTS_RANGE = (0xd800, 0xdfff)

 UnicodeData = namedtuple(
     "UnicodeData", (
-        # conversions:
+        # Conversions:
         "to_upper", "to_lower", "to_title",

-        # decompositions: canonical decompositions, compatibility decomp
+        # Decompositions: canonical decompositions, compatibility decomp
         "canon_decomp", "compat_decomp",

-        # grouped: general categories and combining characters
+        # Grouped: general categories and combining characters
         "general_categories", "combines",
     )
 )
@@ -136,10 +139,10 @@ def fetch_files(version=None):
         return have_version

     if version:
-        # check if the desired version exists on the server
+        # Check if the desired version exists on the server.
         get_fetch_url = lambda name: FETCH_URL_VERSION.format(version=version, filename=name)
     else:
-        # extract the latest version
+        # Extract the latest version.
         get_fetch_url = lambda name: FETCH_URL_LATEST.format(filename=name)

     readme_url = get_fetch_url(UnicodeFiles.README)
@@ -153,14 +156,14 @@ def fetch_files(version=None):

     download_dir = get_unicode_dir(unicode_version)
     if not os.path.exists(download_dir):
-        # for 2.7 compat, we don't use exist_ok=True
+        # For 2.7 compat, we don't use `exist_ok=True`.
         os.makedirs(download_dir)

-    for filename in UnicodeFiles.ALL_FILES:
+    for filename in ALL_UNICODE_FILES:
         file_path = get_unicode_file_path(unicode_version, filename)

         if os.path.exists(file_path):
-            # assume file on the server didn't change if it's been saved before
+            # Assume file on the server didn't change if it's been saved before.
             continue

         if filename == UnicodeFiles.README:
@@ -178,15 +181,16 @@ def check_stored_version(version):
     # type: (Optional[str]) -> Optional[UnicodeVersion]
     """
     Given desired Unicode version, return the version
-    if stored files are all present, and None otherwise.
+    if stored files are all present, and `None` otherwise.
     """
     if not version:
-        # should always check latest version
+        # If no desired version is specified, we should check what the latest
+        # version is, skipping stored-version checks.
         return None

     fetch_dir = os.path.join(FETCH_DIR, version)

-    for filename in UnicodeFiles.ALL_FILES:
+    for filename in ALL_UNICODE_FILES:
         file_path = os.path.join(fetch_dir, filename)

         if not os.path.exists(file_path):
@@ -199,11 +203,11 @@ def check_stored_version(version):
 def parse_readme_unicode_version(readme_content):
     # type: (str) -> UnicodeVersion
     """
-    Parse the Unicode version contained in their ReadMe.txt file.
+    Parse the Unicode version contained in their `ReadMe.txt` file.
     """
-    # "raw string" is necessary for \d not being treated as escape char
-    # (for the sake of compat with future Python versions)
-    # see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
+    # "Raw string" is necessary for \d not to be treated as an escape char
+    # (for the sake of compat with future Python versions).
+    # See: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
     pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
     groups = re.search(pattern, readme_content).groups()

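
For context, this is the deprecation the comment refers to; the sample input
string here is made up for illustration:

    import re

    # With the r prefix, \d is a regex digit class, as intended.
    match = re.search(r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode",
                      "... for Version 12.0.0 of the Unicode Standard.")
    assert match.groups() == ("12", "0", "0")

    # Without the r prefix, "\d" is an invalid string escape: it happens to
    # work today, but it has emitted a DeprecationWarning since Python 3.6
    # and is slated to become an error in a future Python version.
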
@@ -213,7 +217,7 @@ def parse_readme_unicode_version(readme_content):
 def get_unicode_dir(unicode_version):
     # type: (UnicodeVersion) -> str
     """
-    Indicate where the unicode data files should be stored.
+    Indicate in which parent dir the Unicode data files should be stored.

     This returns a full, absolute path.
     """
@@ -223,7 +227,7 @@ def get_unicode_dir(unicode_version):
 def get_unicode_file_path(unicode_version, filename):
     # type: (UnicodeVersion, str) -> str
     """
-    Indicate where the unicode data file should be stored.
+    Indicate where the Unicode data file should be stored.
     """
     return os.path.join(get_unicode_dir(unicode_version), filename)

@@ -239,22 +243,22 @@ def is_surrogate(n):
 def load_unicode_data(file_path):
     # type: (str) -> UnicodeData
     """
-    Load main unicode data.
+    Load main Unicode data.
     """
-    # conversions
+    # Conversions
     to_lower = {}  # type: Dict[int, Tuple[int, int, int]]
     to_upper = {}  # type: Dict[int, Tuple[int, int, int]]
     to_title = {}  # type: Dict[int, Tuple[int, int, int]]

-    # decompositions
+    # Decompositions
     compat_decomp = {}  # type: Dict[int, List[int]]
     canon_decomp = {}  # type: Dict[int, List[int]]

-    # combining characters
+    # Combining characters
     # FIXME: combines are not used
     combines = defaultdict(set)  # type: Dict[str, Set[int]]

-    # categories
+    # Categories
     general_categories = defaultdict(set)  # type: Dict[str, Set[int]]
     category_assigned_codepoints = set()  # type: Set[int]

@@ -283,41 +287,42 @@ def load_unicode_data(file_path):
         decomp, deci, digit, num, mirror,
         old, iso, upcase, lowcase, titlecase) = data

-        # generate char to char direct common and simple conversions
-        # uppercase to lowercase
+        # Generate char to char direct common and simple conversions:
+
+        # Uppercase to lowercase
         if lowcase != "" and code_org != lowcase:
             to_lower[code] = (int(lowcase, 16), 0, 0)

-        # lowercase to uppercase
+        # Lowercase to uppercase
         if upcase != "" and code_org != upcase:
             to_upper[code] = (int(upcase, 16), 0, 0)

-        # title case
+        # Title case
         if titlecase.strip() != "" and code_org != titlecase:
             to_title[code] = (int(titlecase, 16), 0, 0)

-        # store decomposition, if given
+        # Store decomposition, if given
         if decomp:
             decompositions = decomp.split()[1:]
             decomp_code_points = [int(i, 16) for i in decompositions]

             if decomp.startswith("<"):
-                # compatibility decomposition
+                # Compatibility decomposition
                 compat_decomp[code] = decomp_code_points
             else:
-                # canonical decomposition
+                # Canonical decomposition
                 canon_decomp[code] = decomp_code_points

-        # place letter in categories as appropriate
+        # Place letter in categories as appropriate.
         for cat in itertools.chain((gencat, ), EXPANDED_CATEGORIES.get(gencat, [])):
             general_categories[cat].add(code)
             category_assigned_codepoints.add(code)

-        # record combining class, if any
+        # Record combining class, if any.
         if combine != "0":
             combines[combine].add(code)

-    # generate Not_Assigned from Assigned
+    # Generate Not_Assigned from Assigned.
     general_categories["Cn"] = get_unassigned_codepoints(category_assigned_codepoints)

     # Other contains Not_Assigned
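
To make the unpacked field names concrete: every `UnicodeData.txt` row has 15
semicolon-separated fields. The row below is the UCD entry for U+0041; the
first five names of the unpacking are inferred from their uses in this hunk
(its opening line is not shown in the diff):

    row = "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;"
    data = row.split(";")
    assert len(data) == 15

    (code_org, name, gencat, combine, bidi,
     decomp, deci, digit, num, mirror,
     old, iso, upcase, lowcase, titlecase) = data

    # U+0041 has only a lowercase mapping, so to_lower gains 0x41 -> 0x61
    # while to_upper and to_title are left untouched.
    assert gencat == "Lu" and lowcase == "0061" and upcase == ""
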
@@ -336,7 +341,7 @@ def load_unicode_data(file_path):
 def load_special_casing(file_path, unicode_data):
     # type: (str, UnicodeData) -> None
     """
-    Load special casing data and enrich given unicode data.
+    Load special casing data and enrich given Unicode data.
     """
     for line in fileinput.input(file_path):
         data = line.split("#")[0].split(";")
@@ -474,9 +479,9 @@ def load_properties(file_path, interesting_props):
     Load properties data and return in grouped form.
     """
     props = defaultdict(list)  # type: Dict[str, List[Tuple[int, int]]]
-    # "raw string" is necessary for \. and \w not to be treated as escape chars
-    # (for the sake of compat with future Python versions)
-    # see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
+    # "Raw string" is necessary for `\.` and `\w` not to be treated as escape chars
+    # (for the sake of compat with future Python versions).
+    # See: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
     re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
     re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")

@@ -486,7 +491,7 @@ def load_properties(file_path, interesting_props):
         groups = match.groups()

         if len(groups) == 2:
-            # re1 matched
+            # `re1` matched (2 groups).
             d_lo, prop = groups
             d_hi = d_lo
         else:
@@ -502,7 +507,7 @@ def load_properties(file_path, interesting_props):

         props[prop].append((lo_value, hi_value))

-    # optimize if possible
+    # Optimize if possible.
     for prop in props:
         props[prop] = group_codepoints(ungroup_codepoints(props[prop]))

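
For context, the "optimize" pass collapses adjacent or overlapping (lo, hi)
ranges. `group_codepoints` and `ungroup_codepoints` are defined elsewhere in
this script; the sketch below only illustrates the intended effect, assuming
the same inclusive-range convention:

    def normalize_ranges(ranges):
        # Flatten to individual codepoints, then re-group consecutive runs.
        codepoints = sorted({cp for lo, hi in ranges for cp in range(lo, hi + 1)})
        merged = []
        for cp in codepoints:
            if merged and merged[-1][1] == cp - 1:
                merged[-1] = (merged[-1][0], cp)
            else:
                merged.append((cp, cp))
        return merged

    # (0x41..0x5a) and (0x5b..0x60) are adjacent, so they merge into one range.
    assert normalize_ranges([(0x41, 0x5a), (0x5b, 0x60), (0x70, 0x70)]) == \
        [(0x41, 0x60), (0x70, 0x70)]
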
@@ -587,10 +592,10 @@ def compute_trie(raw_data, chunk_size):
     for i in range(len(raw_data) // chunk_size):
         data = raw_data[i * chunk_size : (i + 1) * chunk_size]

-        # postfix compression of child nodes (data chunks)
-        # (identical child nodes are shared)
+        # Postfix compression of child nodes (data chunks)
+        # (identical child nodes are shared).

-        # make a tuple out of the list so it's hashable
+        # Make a tuple out of the list so it's hashable.
         child = tuple(data)
         if child not in childmap:
             childmap[child] = len(childmap)
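
The "postfix compression" above is plain chunk deduplication: identical chunks
share a single entry in the child table. A stripped-down sketch with made-up
data:

    def dedup_chunks(raw_data, chunk_size):
        childmap = {}    # chunk contents -> index in the deduplicated table
        child_data = []  # per-chunk indices into that table
        for i in range(len(raw_data) // chunk_size):
            child = tuple(raw_data[i * chunk_size: (i + 1) * chunk_size])
            if child not in childmap:
                childmap[child] = len(childmap)
            child_data.append(childmap[child])
        return childmap, child_data

    table, index = dedup_chunks([0, 1, 0, 1, 2, 3, 0, 1], chunk_size=2)
    assert index == [0, 0, 1, 0]  # chunks 0, 1 and 3 share one table entry
    assert len(table) == 2        # only (0, 1) and (2, 3) are stored
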
@@ -609,15 +614,15 @@ def generate_bool_trie(name, codepoint_ranges, is_pub=True):
     This yields string fragments that should be joined to produce
     the final string.

-    See: bool_trie.rs
+    See: `bool_trie.rs`.
     """
     chunk_size = 64
     rawdata = [False] * 0x110000
     for (lo, hi) in codepoint_ranges:
         for cp in range(lo, hi + 1):
             rawdata[cp] = True

-    # convert to bitmap chunks of chunk_size bits each
+    # Convert to bitmap chunks of `chunk_size` bits each.
     chunks = []
     for i in range(0x110000 // chunk_size):
         chunk = 0
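
For context, this loop packs `chunk_size` booleans into one integer bitmap per
chunk. A sketch of the idea; the bit order shown (low bit = lowest codepoint)
is an assumption for illustration, not quoted from the script:

    def to_bitmap_chunks(rawdata, chunk_size):
        chunks = []
        for i in range(len(rawdata) // chunk_size):
            chunk = 0
            for j in range(chunk_size):
                if rawdata[i * chunk_size + j]:
                    chunk |= 1 << j  # bit j <=> codepoint i * chunk_size + j
            chunks.append(chunk)
        return chunks

    assert to_bitmap_chunks([True, False, False, True], chunk_size=4) == [0b1001]
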
@@ -679,9 +684,9 @@ def generate_bool_trie(name, codepoint_ranges, is_pub=True):
 def generate_small_bool_trie(name, codepoint_ranges, is_pub=True):
     # type: (str, List[Tuple[int, int]], bool) -> Iterator[str]
     """
-    Generate Rust code for SmallBoolTrie struct.
+    Generate Rust code for `SmallBoolTrie` struct.

-    See: bool_trie.rs
+    See: `bool_trie.rs`.
     """
     last_chunk = max(hi // 64 for (lo, hi) in codepoint_ranges)
     n_chunks = last_chunk + 1
@@ -813,8 +818,8 @@ def main():
     unicode_version = fetch_files(args.version)
     print("Using Unicode version: {}".format(unicode_version.as_str))

-    # all the writing happens entirely in memory, we only write to file
-    # once we have generated the file content (it's not very large, <1 MB)
+    # All the writing happens entirely in memory; we only write to file
+    # once we have generated the file content (it's not very large, <1 MB).
     buf = StringIO()
     buf.write(PREAMBLE)

@@ -844,7 +849,7 @@ def main():
         {"White_Space", "Join_Control", "Noncharacter_Code_Point",
          "Pattern_White_Space"})

-    # category tables
+    # Category tables
     for (name, categories, category_subset) in (
         ("general_category", unicode_data.general_categories, ["N", "Cc"]),
         ("derived_property", derived, want_derived),
@@ -858,7 +863,8 @@ def main():

     tables_rs_path = os.path.join(THIS_DIR, "tables.rs")

-    # will overwrite the file if it exists
+    # Actually write out the file content.
+    # Will overwrite the file if it exists.
     with open(tables_rs_path, "w") as fd:
         fd.write(buf.getvalue())
