@@ -433,19 +433,22 @@ def fetch_table_vs16_data() -> UnicodeTableRenderCtx:
433433 """
434434 table : dict [UnicodeVersion , TableDef ] = {}
435435 unicode_latest = fetch_unicode_versions ()[- 1 ]
436+ hex_str_vs = 'FE0F'
436437
437438 wide_tables = fetch_table_wide_data ().table
438439 unicode_version = UnicodeVersion .parse ('9.0.0' )
439440
440441 # parse table formatted by the latest emoji release (developed with
441442 # 15.1.0) and parse a single file for all individual releases
442- table [unicode_version ] = parse_vs16_data (fname = UnicodeDataFile .EmojiVariationSequences (unicode_latest ),
443- ubound_unicode_version = unicode_version )
443+ table [unicode_version ] = parse_vs_data (fname = UnicodeDataFile .EmojiVariationSequences (unicode_latest ),
444+ ubound_unicode_version = unicode_version ,
445+ hex_str_vs = hex_str_vs )
444446
445447 # parse and join the final emoji release 12.0 of the earlier "type"
446448 table [unicode_version ].values .update (
447- parse_vs16_data (fname = UnicodeDataFile .LegacyEmojiVariationSequences (),
448- ubound_unicode_version = unicode_version ).values )
449+ parse_vs_data (fname = UnicodeDataFile .LegacyEmojiVariationSequences (),
450+ ubound_unicode_version = unicode_version ,
451+ hex_str_vs = hex_str_vs ).values )
449452
450453 # perform culling on any values that are already understood as 'wide'
451454 # without the variation-16 selector
@@ -458,16 +461,61 @@ def fetch_table_vs16_data() -> UnicodeTableRenderCtx:
458461 return UnicodeTableRenderCtx ('VS16_NARROW_TO_WIDE' , table )
459462
460463
def parse_vs_data(fname: str, ubound_unicode_version: UnicodeVersion, hex_str_vs: str):
    """
    Build a TableDef from one emoji variation sequences data file.

    :param fname: path of the data file, opened as UTF-8 text.
    :param ubound_unicode_version: version passed through as the first
        argument of the returned TableDef.
    :param hex_str_vs: hexadecimal string of the variation selector handed to
        ``parse_vs_table`` (callers in this file pass 'FE0F' or 'FE0E').
    :returns: ``TableDef(ubound_unicode_version, date, values)`` where *values*
        is the set of first code points of every remaining parsed entry.
    """
    with open(fname, encoding='utf-8') as fin:
        entries = parse_vs_table(fin, hex_str_vs)

        # The first yielded entry supplies the "date string": the text that
        # follows the first ':' of its comment.
        # NOTE(review): presumably this is the file's "Date:" header line --
        # confirm against parse_vs_table.
        header = next(entries)
        date = header.comment.split(':', 1)[1].strip()

        # Every following entry contributes the low end of its code range.
        values = set()
        for entry in entries:
            values.add(entry.code_range[0])

    return TableDef(ubound_unicode_version, date, values)
469472
470473
def fetch_table_vs15_data() -> UnicodeTableRenderCtx:
    """
    Fetch and create a "wide to narrow variation-15" lookup table.

    Characters in this table are wide, but when combined with a variation
    selector-15 (U+FE0E), they become narrow, for the given versions of
    unicode.

    UNICODE_VERSION=9.0.0 or greater is required to enable detection of the
    effect of *any* 'variation selector-15' wide emoji becoming narrow.

    Some terminals display U+231A, U+FE0E as a narrow font, but consuming a
    wide cell (iTerm2), while most others display it as a wide cell, only.

    It is fair to call these ambiguous, see related 'ucs-detect' project.

    :returns: a render context named 'VS15_WIDE_TO_NARROW' holding a single
        table keyed by unicode version 9.0.0.
    """
    # The code point is written as "U+FE0E" in the docstring above on purpose:
    # a backslash-u escape in a non-raw string literal would be replaced by
    # the invisible selector character itself.
    table: dict[UnicodeVersion, TableDef] = {}
    unicode_latest = fetch_unicode_versions()[-1]
    hex_str_vs = 'FE0E'

    wide_tables = fetch_table_wide_data().table
    unicode_version = UnicodeVersion.parse('9.0.0')

    # parse table formatted by the latest emoji release (developed with
    # 15.1.0) and parse a single file for all individual releases
    table[unicode_version] = parse_vs_data(
        fname=UnicodeDataFile.EmojiVariationSequences(unicode_latest),
        ubound_unicode_version=unicode_version,
        hex_str_vs=hex_str_vs)

    # parse and join the final emoji release 12.0 of the earlier "type"
    table[unicode_version].values.update(
        parse_vs_data(fname=UnicodeDataFile.LegacyEmojiVariationSequences(),
                      ubound_unicode_version=unicode_version,
                      hex_str_vs=hex_str_vs).values)

    # perform culling on any values that are already understood as 'narrow'
    # without the variation-15 selector: only code points found in the wide
    # table for this version are kept
    wide_table = wide_tables[unicode_version].as_value_ranges()
    table[unicode_version].values = {
        ucs for ucs in table[unicode_version].values
        if _bisearch(ucs, wide_table)
    }

    return UnicodeTableRenderCtx('VS15_WIDE_TO_NARROW', table)
517+
518+
471519def cite_source_description (filename : str ) -> tuple [str , str ]:
472520 """Return unicode.org source data file's own description as citation."""
473521 with open (filename , encoding = 'utf-8' ) as f :
@@ -512,9 +560,8 @@ def parse_unicode_table(file: Iterable[str]) -> Iterator[TableEntry]:
512560 yield TableEntry (code_range , tuple (properties ), comment )
513561
514562
515- def parse_vs16_table (fp : Iterable [str ]) -> Iterator [TableEntry ]:
516- """Parse emoji-variation-sequences.txt for codepoints that precede 0xFE0F."""
517- hex_str_vs16 = 'FE0F'
563+ def parse_vs_table (fp : Iterable [str ], hex_str_vs : str = 'FE0F' ) -> Iterator [TableEntry ]:
564+ """Parse emoji-variation-sequences.txt for codepoints that precede `hex_str_vs`."""
518565 for line in fp :
519566 data , _ , comment = line .partition ('#' )
520567 data_fields : Iterator [str ] = (field .strip () for field in data .split (';' ))
@@ -526,8 +573,8 @@ def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:
526573 yield TableEntry (None , tuple (properties ), comment )
527574 continue
528575 code_points = code_points_str .split ()
529- if len (code_points ) == 2 and code_points [1 ] == hex_str_vs16 :
530- # yield a single "code range" entry for a single value that precedes FE0F
576+ if len (code_points ) == 2 and code_points [1 ] == hex_str_vs :
577+ # yield a single "code range" entry for a single value that precedes hex_str_vs
531578 yield TableEntry ((int (code_points [0 ], 16 ), int (code_points [0 ], 16 )), tuple (properties ), comment )
532579
533580
@@ -717,6 +764,7 @@ def get_codegen_definitions() -> Iterator[RenderDefinition]:
717764 UnicodeVersionPyRenderCtx (fetch_unicode_versions ())
718765 )
719766 yield UnicodeTableRenderDef .new ('table_vs16.py' , fetch_table_vs16_data ())
767+ yield UnicodeTableRenderDef .new ('table_vs15.py' , fetch_table_vs15_data ())
720768 yield UnicodeTableRenderDef .new ('table_wide.py' , fetch_table_wide_data ())
721769 yield UnicodeTableRenderDef .new ('table_zero.py' , fetch_table_zero_data ())
722770 yield UnicodeVersionRstRenderDef .new (fetch_source_headers ())
0 commit comments