Variation Selector 15 (VS-15, U+FE0E) support.

jquast · jquast · commit 8c7f59c19acb · 2025-10-20T19:26:02.000-04:00
I did a few spot checks of VS-15 when implementing VS-16, and
erroneously believed that all emojis in VS-15 sequences were already
listed with an East Asian Width (EAW) of 1. That is not true: several
emojis that are "wide" are changed to "narrow" by VS-15.
diff --git a/bin/update-tables.py b/bin/update-tables.py
@@ -433,19 +433,22 @@ def fetch_table_vs16_data() -> UnicodeTableRenderCtx:
     """
     table: dict[UnicodeVersion, TableDef] = {}
     unicode_latest = fetch_unicode_versions()[-1]
+    hex_str_vs = 'FE0F'
 
     wide_tables = fetch_table_wide_data().table
     unicode_version = UnicodeVersion.parse('9.0.0')
 
     # parse table formatted by the latest emoji release (developed with
     # 15.1.0) and parse a single file for all individual releases
-    table[unicode_version] = parse_vs16_data(fname=UnicodeDataFile.EmojiVariationSequences(unicode_latest),
-                                             ubound_unicode_version=unicode_version)
+    table[unicode_version] = parse_vs_data(fname=UnicodeDataFile.EmojiVariationSequences(unicode_latest),
+                                           ubound_unicode_version=unicode_version,
+                                           hex_str_vs=hex_str_vs)
 
     # parse and join the final emoji release 12.0 of the earlier "type"
     table[unicode_version].values.update(
-        parse_vs16_data(fname=UnicodeDataFile.LegacyEmojiVariationSequences(),
-                        ubound_unicode_version=unicode_version).values)
+        parse_vs_data(fname=UnicodeDataFile.LegacyEmojiVariationSequences(),
+                      ubound_unicode_version=unicode_version,
+                      hex_str_vs=hex_str_vs).values)
 
     # perform culling on any values that are already understood as 'wide'
     # without the variation-16 selector
@@ -458,16 +461,61 @@ def fetch_table_vs16_data() -> UnicodeTableRenderCtx:
     return UnicodeTableRenderCtx('VS16_NARROW_TO_WIDE', table)
 
 
-def parse_vs16_data(fname: str, ubound_unicode_version: UnicodeVersion):
+def parse_vs_data(fname: str, ubound_unicode_version: UnicodeVersion, hex_str_vs: str):
     with open(fname, encoding='utf-8') as fin:
-        table_iter = parse_vs16_table(fin)
+        table_iter = parse_vs_table(fin, hex_str_vs)
         # pull "date string"
         date = next(table_iter).comment.split(':', 1)[1].strip()
         # pull values only matching this unicode version and lower
         values = {entry.code_range[0] for entry in table_iter}
     return TableDef(ubound_unicode_version, date, values)
 
 
+def fetch_table_vs15_data() -> UnicodeTableRenderCtx:
+    """
+    Fetch and create a "wide to narrow variation-15" lookup table.
+
+    Characters in this table are wide, but when combined with a variation selector-15 (U+FE0E), they
+    become narrow, for the given versions of unicode.
+
+    UNICODE_VERSION=9.0.0 or greater is required to enable detection of the effect of *any*
+    'variation selector-15' wide emoji becoming narrow.
+
+    Some terminals (iTerm2) display U+231A, U+FE0E with a narrow glyph that still consumes a wide
+    cell, while most others display it only as a wide cell.
+
+    It is fair to call these ambiguous, see related 'ucs-detect' project.
+    """
+    table: dict[UnicodeVersion, TableDef] = {}
+    unicode_latest = fetch_unicode_versions()[-1]
+    hex_str_vs = 'FE0E'
+
+    wide_tables = fetch_table_wide_data().table
+    unicode_version = UnicodeVersion.parse('9.0.0')
+
+    # parse table formatted by the latest emoji release (developed with
+    # 15.1.0) and parse a single file for all individual releases
+    table[unicode_version] = parse_vs_data(fname=UnicodeDataFile.EmojiVariationSequences(unicode_latest),
+                                           ubound_unicode_version=unicode_version,
+                                           hex_str_vs=hex_str_vs)
+
+    # parse and join the final emoji release 12.0 of the earlier "type"
+    table[unicode_version].values.update(
+        parse_vs_data(fname=UnicodeDataFile.LegacyEmojiVariationSequences(),
+                      ubound_unicode_version=unicode_version,
+                      hex_str_vs=hex_str_vs).values)
+
+    # perform culling on any values that are already understood as 'narrow'
+    # without the variation-15 selector
+    wide_table = wide_tables[unicode_version].as_value_ranges()
+    table[unicode_version].values = {
+        ucs for ucs in table[unicode_version].values
+        if _bisearch(ucs, wide_table)
+    }
+
+    return UnicodeTableRenderCtx('VS15_WIDE_TO_NARROW', table)
+
+
 def cite_source_description(filename: str) -> tuple[str, str]:
     """Return unicode.org source data file's own description as citation."""
     with open(filename, encoding='utf-8') as f:
@@ -512,9 +560,8 @@ def parse_unicode_table(file: Iterable[str]) -> Iterator[TableEntry]:
         yield TableEntry(code_range, tuple(properties), comment)
 
 
-def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:
-    """Parse emoji-variation-sequences.txt for codepoints that precede 0xFE0F."""
-    hex_str_vs16 = 'FE0F'
+def parse_vs_table(fp: Iterable[str], hex_str_vs: str = 'FE0F') -> Iterator[TableEntry]:
+    """Parse emoji-variation-sequences.txt for codepoints that precede `hex_str_vs`."""
     for line in fp:
         data, _, comment = line.partition('#')
         data_fields: Iterator[str] = (field.strip() for field in data.split(';'))
@@ -526,8 +573,8 @@ def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:
                 yield TableEntry(None, tuple(properties), comment)
             continue
         code_points = code_points_str.split()
-        if len(code_points) == 2 and code_points[1] == hex_str_vs16:
-            # yield a single "code range" entry for a single value that precedes FE0F
+        if len(code_points) == 2 and code_points[1] == hex_str_vs:
+            # yield a single "code range" entry for a single value that precedes hex_str_vs
             yield TableEntry((int(code_points[0], 16), int(code_points[0], 16)), tuple(properties), comment)
 
 
@@ -717,6 +764,7 @@ def get_codegen_definitions() -> Iterator[RenderDefinition]:
             UnicodeVersionPyRenderCtx(fetch_unicode_versions())
         )
         yield UnicodeTableRenderDef.new('table_vs16.py', fetch_table_vs16_data())
+        yield UnicodeTableRenderDef.new('table_vs15.py', fetch_table_vs15_data())
         yield UnicodeTableRenderDef.new('table_wide.py', fetch_table_wide_data())
         yield UnicodeTableRenderDef.new('table_zero.py', fetch_table_zero_data())
         yield UnicodeVersionRstRenderDef.new(fetch_source_headers())
diff --git a/bin/verify-table-integrity.py b/bin/verify-table-integrity.py
@@ -64,9 +64,7 @@
 
 
 def bisearch_pair(ucs, table):
-    """
-    A copy of wcwidth._bisearch() but also returns the range of matched values.
-    """
+    """A copy of wcwidth._bisearch() but also returns the range of matched values."""
     lbound = 0
     ubound = len(table) - 1
 
diff --git a/docs/intro.rst b/docs/intro.rst
@@ -216,6 +216,9 @@ Other Languages
 =======
 History
 =======
+*Unreleased*
+  * **Bugfix** accounting of some kinds of emoji sequences using U+FE0E
+    Variation Selector 15 (VS-15).
 
 0.2.14 *2025-09-22*
   * **Drop Support** for Python 2.7 and 3.5. `PR #117`_.
diff --git a/docs/specs.rst b/docs/specs.rst
@@ -47,6 +47,9 @@ Width of 1
 String characters are measured width of 1 when they are not
 measured as `Width of 0`_ or `Width of 2`_.
 
+Any character in sequence with `U+FE0E`_ (Variation Selector 15) defined
+by `emoji-variation-sequences.txt`_ as ``text style``.
+
 Width of 2
 ----------
 
@@ -74,6 +77,7 @@ Any character in sequence with `U+FE0F`_ (Variation Selector 16) defined by
 .. _`U+2029`: https://codepoints.net/U+2029
 .. _`U+D7B0`: https://codepoints.net/U+D7B0
 .. _`U+D7FF`: https://codepoints.net/U+D7FF
+.. _`U+FE0E`: https://codepoints.net/U+FE0E
 .. _`U+FE0F`: https://codepoints.net/U+FE0F
 .. _`DerivedGeneralCategory.txt`: https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt
 .. _`EastAsianWidth.txt`: https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
diff --git a/tests/test_emojis.py b/tests/test_emojis.py
@@ -176,7 +176,7 @@ def test_recommended_emoji_zwj_sequences():
 
 def test_recommended_variation_16_sequences():
     """
-    Test wcswidth of all of the unicode.org-published emoji-variation-sequences.txt
+    Test wcswidth of vs-16 sequences from unicode.org's emoji-variation-sequences.txt
     """
     # given,
     lines, sequences = read_sequences_from_file('emoji-variation-sequences.txt')
@@ -202,6 +202,34 @@ def test_recommended_variation_16_sequences():
     assert num >= 742
 
 
+def test_recommended_variation_15_sequences():
+    """
+    Test wcswidth of vs-15 sequences from unicode.org's emoji-variation-sequences.txt
+    """
+    # given,
+    lines, sequences = read_sequences_from_file('emoji-variation-sequences.txt')
+
+    errors = []
+    num = 0
+    for sequence, line in zip(sequences, lines):
+        num += 1
+        if '\ufe0e' not in sequence:
+            # filter for only \uFE0E (VS-15)
+            continue
+        measured_width = wcwidth.wcswidth(sequence)
+        if measured_width != 1:
+            errors.append({
+                'expected_width': 1,
+                'line': line,
+                'measured_width': wcwidth.wcswidth(sequence),
+                'sequence': sequence,
+            })
+
+    # verify
+    assert errors == []
+    assert num >= 742
+
+
 def test_unicode_9_vs16():
     """Verify effect of VS-16 on unicode_version 9.0 and later"""
     phrase = ("\u2640"        # FEMALE SIGN
@@ -219,8 +247,25 @@ def test_unicode_9_vs16():
     assert length_phrase == expect_length_phrase
 
 
+def test_unicode_9_vs15():
+    """Verify effect of VS-15 on unicode_version 9.0 and later"""
+    phrase = ("\U0001f4da"        # BOOKS
+              "\uFE0E")           # VARIATION SELECTOR-15
+
+    expect_length_each = (2, 0)
+    expect_length_phrase = 1
+
+    # exercise,
+    length_each = tuple(wcwidth.wcwidth(w_char, unicode_version='9.0') for w_char in phrase)
+    length_phrase = wcwidth.wcswidth(phrase, unicode_version='9.0')
+
+    # verify.
+    assert length_each == expect_length_each
+    assert length_phrase == expect_length_phrase
+
+
 def test_unicode_8_vs16():
-    """Verify that VS-16 has no effect on unicode_version 8.0 and earler"""
+    """Verify that VS-16 has no effect on unicode_version 8.0 and earlier"""
     phrase = ("\u2640"        # FEMALE SIGN
               "\uFE0F")       # VARIATION SELECTOR-16
 
@@ -234,3 +279,20 @@ def test_unicode_8_vs16():
     # verify.
     assert length_each == expect_length_each
     assert length_phrase == expect_length_phrase
+
+
+def test_unicode_8_vs15():
+    """Verify that VS-15 has no effect on unicode_version 8.0 and earlier"""
+    phrase = ("\U0001f4da"        # BOOKS
+              "\uFE0E")           # VARIATION SELECTOR-15
+
+    expect_length_each = (1, 0)
+    expect_length_phrase = 1
+
+    # exercise,
+    length_each = tuple(wcwidth.wcwidth(w_char, unicode_version='8.0') for w_char in phrase)
+    length_phrase = wcwidth.wcswidth(phrase, unicode_version='8.0')
+
+    # verify.
+    assert length_each == expect_length_each
+    assert length_phrase == expect_length_phrase
diff --git a/tox.ini b/tox.ini
@@ -134,10 +134,14 @@ basepython = python3.13
 commands = {envbindir}/isort --quiet --apply --recursive wcwidth tests bin
 
 [testenv:pylint]
+# Files table_vs15.py and table_wide.py erroneously report "duplicate lines".
+# Short of adding '# pylint: disable=duplicate-code' to the template files, we
+# can only choose to disable a specific check, or to ignore specific files. We
+# ignore the files.
 basepython = python3.13
 deps = pylint
 commands = {envbindir}/pylint --rcfile={toxinidir}/.pylintrc \
-           --ignore=tests,docs,setup.py,conf.py,build,distutils,.pyenv,.git,.tox \
+           --ignore=tests,docs,setup.py,conf.py,build,distutils,.pyenv,.git,.tox,table_wide.py,table_vs15.py \
            {posargs:{toxinidir}}/wcwidth
 
 [testenv:flake8]
diff --git a/wcwidth/__init__.py b/wcwidth/__init__.py
@@ -5,12 +5,13 @@
 """
 # re-export all functions & definitions, even private ones, from top-level
 # module path, to allow for 'from wcwidth import _private_func'.  Of course,
-# user beware that any _private function may disappear or change signature at
-# any future version.
+# user beware that any _private functions or variables not exported by __all__
+# may disappear or change signature at any future version.
 
 # local
 from .wcwidth import ZERO_WIDTH  # noqa
 from .wcwidth import (WIDE_EASTASIAN,
+                      VS15_WIDE_TO_NARROW,
                       VS16_NARROW_TO_WIDE,
                       wcwidth,
                       wcswidth,
@@ -23,7 +24,8 @@
 # 'from wcwidth import *', but also to say, "This is the public API".
 __all__ = ('wcwidth', 'wcswidth', 'list_versions')
 
-# We also used pkg_resources to load unicode version tables from version.json,
-# generated by bin/update-tables.py, but some environments are unable to
-# import pkg_resources for one reason or another, yikes!
+# We previously used pkg_resources to load unicode version tables from
+# 'version.json', generated by bin/update-tables.py, but some environments are
+# unable to import pkg_resources for one reason or another, so the version
+# string is MANUALLY DUPLICATED here and in setup.py
 __version__ = '0.2.14'
diff --git a/wcwidth/table_vs15.py b/wcwidth/table_vs15.py
diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py