@@ -417,19 +417,22 @@ def fetch_table_vs16_data() -> UnicodeTableRenderCtx:
417417 """
418418 table : dict [UnicodeVersion , TableDef ] = {}
419419 unicode_latest = fetch_unicode_versions ()[- 1 ]
420+ hex_str_vs = 'FE0F'
420421
421422 wide_tables = fetch_table_wide_data ().table
422423 unicode_version = UnicodeVersion .parse ('9.0.0' )
423424
424425 # parse table formatted by the latest emoji release (developed with
425426 # 15.1.0) and parse a single file for all individual releases
426- table [unicode_version ] = parse_vs16_data (fname = UnicodeDataFile .EmojiVariationSequences (unicode_latest ),
427- ubound_unicode_version = unicode_version )
427+ table [unicode_version ] = parse_vs_data (fname = UnicodeDataFile .EmojiVariationSequences (unicode_latest ),
428+ ubound_unicode_version = unicode_version ,
429+ hex_str_vs = hex_str_vs )
428430
429431 # parse and join the final emoji release 12.0 of the earlier "type"
430432 table [unicode_version ].values .update (
431- parse_vs16_data (fname = UnicodeDataFile .LegacyEmojiVariationSequences (),
432- ubound_unicode_version = unicode_version ).values )
433+ parse_vs_data (fname = UnicodeDataFile .LegacyEmojiVariationSequences (),
434+ ubound_unicode_version = unicode_version ,
435+ hex_str_vs = hex_str_vs ).values )
433436
434437 # perform culling on any values that are already understood as 'wide'
435438 # without the variation-16 selector
@@ -442,16 +445,61 @@ def fetch_table_vs16_data() -> UnicodeTableRenderCtx:
442445 return UnicodeTableRenderCtx ('VS16_NARROW_TO_WIDE' , table )
443446
444447
def parse_vs_data(fname: str, ubound_unicode_version: UnicodeVersion, hex_str_vs: str):
    """
    Build a table definition from a single variation-sequences data file.

    The file named by ``fname`` is read as UTF-8 and handed to ``parse_vs_table`` together with
    ``hex_str_vs``, the hexadecimal string of the variation selector of interest.

    The first entry produced by the parser is consumed for the "date string" held in its comment
    (the text following the first ``:``). For every remaining entry, the first value of its code
    range is collected. No filtering by unicode version is done here; ``ubound_unicode_version``
    is only passed through to the returned ``TableDef``.
    """
    with open(fname, encoding='utf-8') as fin:
        entries = parse_vs_table(fin, hex_str_vs)

        # the leading entry carries the file's date in its comment, after the first colon
        header = next(entries)
        date = header.comment.split(':', 1)[1].strip()

        # each remaining entry contributes the start of its code range
        values = set()
        for entry in entries:
            values.add(entry.code_range[0])

        return TableDef(ubound_unicode_version, date, values)
453456
454457
def fetch_table_vs15_data() -> UnicodeTableRenderCtx:
    """
    Fetch and create the "wide to narrow variation-15" lookup table.

    Every character in this table is wide on its own, but becomes narrow when it is combined with
    variation selector-15 (U+FE0E), for the given versions of unicode.

    UNICODE_VERSION=9.0.0 or greater is required to enable detection of the effect of *any*
    'variation selector-15' wide emoji becoming narrow.

    Terminals disagree about these sequences: some (iTerm2) display U+231A, U+FE0E with a narrow
    font while still consuming a wide cell, most others display it as a wide cell only. It is fair
    to call these ambiguous, see the related 'ucs-detect' project.
    """
    latest_version = fetch_unicode_versions()[-1]
    hex_str_vs = 'FE0E'

    wide_by_version = fetch_table_wide_data().table
    unicode_version = UnicodeVersion.parse('9.0.0')

    # the table formatted by the latest emoji release (developed with 15.1.0),
    # a single file is parsed for all individual releases
    vs15_def = parse_vs_data(
        fname=UnicodeDataFile.EmojiVariationSequences(latest_version),
        ubound_unicode_version=unicode_version,
        hex_str_vs=hex_str_vs)

    # join in the final emoji release 12.0 of the earlier "type"
    legacy_def = parse_vs_data(
        fname=UnicodeDataFile.LegacyEmojiVariationSequences(),
        ubound_unicode_version=unicode_version,
        hex_str_vs=hex_str_vs)
    vs15_def.values.update(legacy_def.values)

    # cull any value that is already understood as 'narrow' without the
    # variation-15 selector: only members of the wide table are kept
    wide_ranges = wide_by_version[unicode_version].as_value_ranges()
    vs15_def.values = {
        ucs for ucs in vs15_def.values
        if _bisearch(ucs, wide_ranges)
    }

    table: dict[UnicodeVersion, TableDef] = {unicode_version: vs15_def}
    return UnicodeTableRenderCtx('VS15_WIDE_TO_NARROW', table)
501+
502+
455503def cite_source_description (filename : str ) -> tuple [str , str ]:
456504 """Return unicode.org source data file's own description as citation."""
457505 with open (filename , encoding = 'utf-8' ) as f :
@@ -496,9 +544,8 @@ def parse_unicode_table(file: Iterable[str]) -> Iterator[TableEntry]:
496544 yield TableEntry (code_range , tuple (properties ), comment )
497545
498546
499- def parse_vs16_table (fp : Iterable [str ]) -> Iterator [TableEntry ]:
500- """Parse emoji-variation-sequences.txt for codepoints that preceed 0xFE0F."""
501- hex_str_vs16 = 'FE0F'
547+ def parse_vs_table (fp : Iterable [str ], hex_str_vs : str = 'FE0F' ) -> Iterator [TableEntry ]:
548+ """Parse emoji-variation-sequences.txt for codepoints that precede `hex_str_vs`"""
502549 for line in fp :
503550 data , _ , comment = line .partition ('#' )
504551 data_fields : Iterator [str ] = (field .strip () for field in data .split (';' ))
@@ -510,7 +557,7 @@ def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:
510557 yield TableEntry (None , tuple (properties ), comment )
511558 continue
512559 code_points = code_points_str .split ()
513- if len (code_points ) == 2 and code_points [1 ] == hex_str_vs16 :
560+ if len (code_points ) == 2 and code_points [1 ] == hex_str_vs :
514561 # yield a single "code range" entry for a single value that precedes the variation selector
515562 yield TableEntry ((int (code_points [0 ], 16 ), int (code_points [0 ], 16 )), tuple (properties ), comment )
516563
@@ -663,6 +710,7 @@ def get_codegen_definitions() -> Iterator[RenderDefinition]:
663710 UnicodeVersionPyRenderCtx (fetch_unicode_versions ())
664711 )
665712 yield UnicodeTableRenderDef .new ('table_vs16.py' , fetch_table_vs16_data ())
713+ yield UnicodeTableRenderDef .new ('table_vs15.py' , fetch_table_vs15_data ())
666714 yield UnicodeTableRenderDef .new ('table_wide.py' , fetch_table_wide_data ())
667715 yield UnicodeTableRenderDef .new ('table_zero.py' , fetch_table_zero_data ())
668716 yield UnicodeVersionRstRenderDef .new (fetch_source_headers ())
0 commit comments