Variation Selector 15 (VS-15, U+FE0E) support.

jquast · jquast · commit 651f52c87692 · 2024-02-14T15:04:06.000-05:00
While implementing VS-16 I spot-checked a few VS-15 sequences and
erroneously concluded that every emoji in a VS-15 sequence was already
listed with an East Asian Width (EAW) of 1. That is not true: several
emojis that are "wide" are changed to "narrow" by VS-15.
diff --git a/bin/update-tables.py b/bin/update-tables.py
@@ -417,19 +417,22 @@ def fetch_table_vs16_data() -> UnicodeTableRenderCtx:
     """
     table: dict[UnicodeVersion, TableDef] = {}
     unicode_latest = fetch_unicode_versions()[-1]
+    hex_str_vs = 'FE0F'
 
     wide_tables = fetch_table_wide_data().table
     unicode_version = UnicodeVersion.parse('9.0.0')
 
     # parse table formatted by the latest emoji release (developed with
     # 15.1.0) and parse a single file for all individual releases
-    table[unicode_version] = parse_vs16_data(fname=UnicodeDataFile.EmojiVariationSequences(unicode_latest),
-                                             ubound_unicode_version=unicode_version)
+    table[unicode_version] = parse_vs_data(fname=UnicodeDataFile.EmojiVariationSequences(unicode_latest),
+                                           ubound_unicode_version=unicode_version,
+                                           hex_str_vs=hex_str_vs)
 
     # parse and join the final emoji release 12.0 of the earlier "type"
     table[unicode_version].values.update(
-        parse_vs16_data(fname=UnicodeDataFile.LegacyEmojiVariationSequences(),
-                        ubound_unicode_version=unicode_version).values)
+        parse_vs_data(fname=UnicodeDataFile.LegacyEmojiVariationSequences(),
+                      ubound_unicode_version=unicode_version,
+                      hex_str_vs=hex_str_vs).values)
 
     # perform culling on any values that are already understood as 'wide'
     # without the variation-16 selector
@@ -442,16 +445,61 @@ def fetch_table_vs16_data() -> UnicodeTableRenderCtx:
     return UnicodeTableRenderCtx('VS16_NARROW_TO_WIDE', table)
 
 
-def parse_vs16_data(fname: str, ubound_unicode_version: UnicodeVersion):
+def parse_vs_data(fname: str, ubound_unicode_version: UnicodeVersion, hex_str_vs: str):
     with open(fname, encoding='utf-8') as fin:
-        table_iter = parse_vs16_table(fin)
+        table_iter = parse_vs_table(fin, hex_str_vs)
         # pull "date string"
         date = next(table_iter).comment.split(':', 1)[1].strip()
         # pull values only matching this unicode version and lower
         values = {entry.code_range[0] for entry in table_iter}
     return TableDef(ubound_unicode_version, date, values)
 
 
+def fetch_table_vs15_data() -> UnicodeTableRenderCtx:
+    """
+    Fetch and create a "wide to narrow variation-15" lookup table.
+
+    Characters in this table are wide, but when combined with a variation selector-15 (\uFE0E), they
+    become narrow, for the given versions of unicode.
+
+    UNICODE_VERSION=9.0.0 or greater is required to enable detection of the effect of *any*
+    'variation selector-15' wide emoji becoming narrow.
+
+    Some terminals display U+231A, U+FE0E in a narrow font but still consume a wide cell (iTerm2),
+    while most others display it as a wide cell only.
+
+    It is fair to call these ambiguous, see related 'ucs-detect' project.
+    """
+    table: dict[UnicodeVersion, TableDef] = {}
+    unicode_latest = fetch_unicode_versions()[-1]
+    hex_str_vs = 'FE0E'
+
+    wide_tables = fetch_table_wide_data().table
+    unicode_version = UnicodeVersion.parse('9.0.0')
+
+    # parse table formatted by the latest emoji release (developed with
+    # 15.1.0) and parse a single file for all individual releases
+    table[unicode_version] = parse_vs_data(fname=UnicodeDataFile.EmojiVariationSequences(unicode_latest),
+                                           ubound_unicode_version=unicode_version,
+                                           hex_str_vs=hex_str_vs)
+
+    # parse and join the final emoji release 12.0 of the earlier "type"
+    table[unicode_version].values.update(
+        parse_vs_data(fname=UnicodeDataFile.LegacyEmojiVariationSequences(),
+                      ubound_unicode_version=unicode_version,
+                      hex_str_vs=hex_str_vs).values)
+
+    # perform culling on any values that are already understood as 'narrow'
+    # without the variation-15 selector, keeping only those listed as 'wide'
+    wide_table = wide_tables[unicode_version].as_value_ranges()
+    table[unicode_version].values = {
+        ucs for ucs in table[unicode_version].values
+        if _bisearch(ucs, wide_table)
+    }
+
+    return UnicodeTableRenderCtx('VS15_WIDE_TO_NARROW', table)
+
+
 def cite_source_description(filename: str) -> tuple[str, str]:
     """Return unicode.org source data file's own description as citation."""
     with open(filename, encoding='utf-8') as f:
@@ -496,9 +544,8 @@ def parse_unicode_table(file: Iterable[str]) -> Iterator[TableEntry]:
         yield TableEntry(code_range, tuple(properties), comment)
 
 
-def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:
-    """Parse emoji-variation-sequences.txt for codepoints that preceed 0xFE0F."""
-    hex_str_vs16 = 'FE0F'
+def parse_vs_table(fp: Iterable[str], hex_str_vs: str = 'FE0F') -> Iterator[TableEntry]:
+    """Parse emoji-variation-sequences.txt for codepoints that precede `hex_str_vs`."""
     for line in fp:
         data, _, comment = line.partition('#')
         data_fields: Iterator[str] = (field.strip() for field in data.split(';'))
@@ -510,7 +557,7 @@ def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:
                 yield TableEntry(None, tuple(properties), comment)
             continue
         code_points = code_points_str.split()
-        if len(code_points) == 2 and code_points[1] == hex_str_vs16:
+        if len(code_points) == 2 and code_points[1] == hex_str_vs:
             # yeild a single "code range" entry for a single value that preceeds FE0F
             yield TableEntry((int(code_points[0], 16), int(code_points[0], 16)), tuple(properties), comment)
 
@@ -663,6 +710,7 @@ def get_codegen_definitions() -> Iterator[RenderDefinition]:
             UnicodeVersionPyRenderCtx(fetch_unicode_versions())
         )
         yield UnicodeTableRenderDef.new('table_vs16.py', fetch_table_vs16_data())
+        yield UnicodeTableRenderDef.new('table_vs15.py', fetch_table_vs15_data())
         yield UnicodeTableRenderDef.new('table_wide.py', fetch_table_wide_data())
         yield UnicodeTableRenderDef.new('table_zero.py', fetch_table_zero_data())
         yield UnicodeVersionRstRenderDef.new(fetch_source_headers())
diff --git a/bin/verify-table-integrity.py b/bin/verify-table-integrity.py
@@ -64,9 +64,7 @@
 
 
 def bisearch_pair(ucs, table):
-    """
-    A copy of wcwidth._bisearch() but also returns the range of matched values.
-    """
+    """A copy of wcwidth._bisearch() but also returns the range of matched values."""
     lbound = 0
     ubound = len(table) - 1
 
@@ -85,6 +83,7 @@ def bisearch_pair(ucs, table):
 
 
 def main(log: logging.Logger):
+    # local
     from wcwidth import ZERO_WIDTH, WIDE_EASTASIAN, list_versions
 
     reversed_uni_versions = list(reversed(list_versions()))
diff --git a/docs/intro.rst b/docs/intro.rst
@@ -217,6 +217,10 @@ Other Languages
 History
 =======
 
+0.2.14 *2024-02-14*
+  * **Bugfix** accounting of some kinds of emoji sequences using U+FE0E
+    Variation Selector 15 (`PR #999`_).
+
 0.2.13 *2024-01-06*
   * **Bugfix** zero-width support for Hangul Jamo (Korean)
 
diff --git a/docs/specs.rst b/docs/specs.rst
@@ -47,6 +47,9 @@ Width of 1
 String characters are measured width of 1 when they are not
 measured as `Width of 0`_ or `Width of 2`_.
 
+Any character in sequence with `U+FE0E`_ (Variation Selector 15) defined
+by `emoji-variation-sequences.txt`_ as ``text style``.
+
 Width of 2
 ----------
 
@@ -73,6 +76,7 @@ Any character in sequence with `U+FE0F`_ (Variation Selector 16) defined by
 .. _`U+2029`: https://codepoints.net/U+2029
 .. _`U+D7B0`: https://codepoints.net/U+D7B0
 .. _`U+D7FF`: https://codepoints.net/U+D7FF
+.. _`U+FE0E`: https://codepoints.net/U+FE0E
 .. _`U+FE0F`: https://codepoints.net/U+FE0F
 .. _`DerivedGeneralCategory.txt`: https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt
 .. _`EastAsianWidth.txt`: https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
diff --git a/setup.py b/setup.py
@@ -44,7 +44,7 @@ def main():
     setuptools.setup(
         name='wcwidth',
         # NOTE: manually manage __version__ in wcwidth/__init__.py !
-        version='0.2.13',
+        version='0.2.14',
         description=(
             "Measures the displayed width of unicode strings in a terminal"),
         long_description=codecs.open(
diff --git a/tests/test_emojis.py b/tests/test_emojis.py
@@ -131,7 +131,7 @@ def test_longer_emoji_zwj_sequence():
               u"\u200d"       # 'Cf', 'N' -- ZERO WIDTH JOINER
               u"\U0001F9D1"   # 'So', 'W' -- ADULT
               u"\U0001F3FD"   # 'Sk', 'W' -- EMOJI MODIFIER FITZPATRICK TYPE-4
-    ) * 2
+              ) * 2
     # This test adapted from https://www.unicode.org/L2/L2023/23107-terminal-suppt.pdf
     expect_length_each = (2, 0, 0, 1, 0, 0, 2, 0, 2, 0) * 2
     expect_length_phrase = 4
@@ -148,8 +148,8 @@ def test_longer_emoji_zwj_sequence():
 def read_sequences_from_file(filename):
     fp = codecs.open(os.path.join(os.path.dirname(__file__), filename), 'r', encoding='utf-8')
     lines = [line.strip()
-                for line in fp.readlines()
-                if not line.startswith('#') and line.strip()]
+             for line in fp.readlines()
+             if not line.startswith('#') and line.strip()]
     fp.close()
     sequences = [make_sequence_from_line(line) for line in lines]
     return lines, sequences
@@ -184,7 +184,7 @@ def test_recommended_emoji_zwj_sequences():
 
 def test_recommended_variation_16_sequences():
     """
-    Test wcswidth of all of the unicode.org-published emoji-variation-sequences.txt
+    Test wcswidth of vs-16 sequences from unicode.org's emoji-variation-sequences.txt
     """
     # given,
     lines, sequences = read_sequences_from_file('emoji-variation-sequences.txt')
@@ -210,6 +210,34 @@ def test_recommended_variation_16_sequences():
     assert num >= 742
 
 
+def test_recommended_variation_15_sequences():
+    """
+    Test wcswidth of vs-15 sequences from unicode.org's emoji-variation-sequences.txt
+    """
+    # given,
+    lines, sequences = read_sequences_from_file('emoji-variation-sequences.txt')
+
+    errors = []
+    num = 0
+    for sequence, line in zip(sequences, lines):
+        num += 1
+        if '\ufe0e' not in sequence:
+            # filter for only \uFE0E (VS-15)
+            continue
+        measured_width = wcwidth.wcswidth(sequence)
+        if measured_width != 1:
+            errors.append({
+                'expected_width': 1,
+                'line': line,
+                'measured_width': wcwidth.wcswidth(sequence),
+                'sequence': sequence,
+            })
+
+    # verify
+    assert errors == []
+    assert num >= 742
+
+
 def test_unicode_9_vs16():
     """Verify effect of VS-16 on unicode_version 9.0 and later"""
     phrase = (u"\u2640"        # FEMALE SIGN
@@ -226,8 +254,26 @@ def test_unicode_9_vs16():
     assert length_each == expect_length_each
     assert length_phrase == expect_length_phrase
 
+
+def test_unicode_9_vs15():
+    """Verify effect of VS-15 on unicode_version 9.0 and later"""
+    phrase = (u"\U0001f4da"        # BOOKS
+              u"\uFE0E")           # VARIATION SELECTOR-15
+
+    expect_length_each = (2, 0)
+    expect_length_phrase = 1
+
+    # exercise,
+    length_each = tuple(wcwidth.wcwidth(w_char, unicode_version='9.0') for w_char in phrase)
+    length_phrase = wcwidth.wcswidth(phrase, unicode_version='9.0')
+
+    # verify.
+    assert length_each == expect_length_each
+    assert length_phrase == expect_length_phrase
+
+
 def test_unicode_8_vs16():
-    """Verify that VS-16 has no effect on unicode_version 8.0 and earler"""
+    """Verify that VS-16 has no effect on unicode_version 8.0 and earlier"""
     phrase = (u"\u2640"        # FEMALE SIGN
               u"\uFE0F")       # VARIATION SELECTOR-16
 
@@ -240,4 +286,21 @@ def test_unicode_8_vs16():
 
     # verify.
     assert length_each == expect_length_each
-    assert length_phrase == expect_length_phrase
+    assert length_phrase == expect_length_phrase
+
+
+def test_unicode_8_vs15():
+    """Verify that VS-15 has no effect on unicode_version 8.0 and earlier"""
+    phrase = (u"\U0001f4da"        # BOOKS
+              u"\uFE0E")           # VARIATION SELECTOR-15
+
+    expect_length_each = (1, 0)
+    expect_length_phrase = 1
+
+    # exercise,
+    length_each = tuple(wcwidth.wcwidth(w_char, unicode_version='8.0') for w_char in phrase)
+    length_phrase = wcwidth.wcswidth(phrase, unicode_version='8.0')
+
+    # verify.
+    assert length_each == expect_length_each
+    assert length_phrase == expect_length_phrase
diff --git a/tests/test_table_integrity.py b/tests/test_table_integrity.py
@@ -1,15 +1,18 @@
 """
 Executes verify-table-integrity.py as a unit test.
 """
+# std imports
 import os
 import sys
 import subprocess
 
+# 3rd party
 import pytest
 
+
 @pytest.mark.skipif(sys.version_info[:2] != (3, 12), reason='Test only with a single version of python')
 def test_verify_table_integrity():
     subprocess.check_output([sys.executable, os.path.join(os.path.dirname(__file__),
                                                           os.path.pardir,
                                                           'bin',
-                                                          'verify-table-integrity.py')])
+                                                          'verify-table-integrity.py')])
diff --git a/tox.ini b/tox.ini
@@ -154,9 +154,13 @@ basepython = python3.11
 commands = {envbindir}/isort --quiet --apply --recursive wcwidth tests bin
 
 [testenv:pylint]
+# Files table_vs15.py and table_wide.py erroneously report "duplicate lines".
+# Except for adding '# pylint: disable=duplicate-code' to the template files, we
+# can choose only to disable a specific check, or specific files. We ignore the
+# files.
 basepython = python3.11
 commands = {envbindir}/pylint --rcfile={toxinidir}/.pylintrc \
-           --ignore=tests,docs,setup.py,conf.py,build,distutils,.pyenv,.git,.tox \
+           --ignore=tests,docs,setup.py,conf.py,build,distutils,.pyenv,.git,.tox,table_wide.py,table_vs15.py \
            {posargs:{toxinidir}}/wcwidth
 
 [testenv:flake8]
diff --git a/wcwidth/__init__.py b/wcwidth/__init__.py
@@ -5,12 +5,13 @@
 """
 # re-export all functions & definitions, even private ones, from top-level
 # module path, to allow for 'from wcwidth import _private_func'.  Of course,
-# user beware that any _private function may disappear or change signature at
-# any future version.
+# user beware that any _private functions or variables not exported by __all__
+# may disappear or change signature at any future version.
 
 # local
 from .wcwidth import ZERO_WIDTH  # noqa
 from .wcwidth import (WIDE_EASTASIAN,
+                      VS15_WIDE_TO_NARROW,
                       VS16_NARROW_TO_WIDE,
                       wcwidth,
                       wcswidth,
@@ -23,7 +24,8 @@
 # 'from wcwidth import *', but also to say, "This is the public API".
 __all__ = ('wcwidth', 'wcswidth', 'list_versions')
 
-# We also used pkg_resources to load unicode version tables from version.json,
-# generated by bin/update-tables.py, but some environments are unable to
-# import pkg_resources for one reason or another, yikes!
-__version__ = '0.2.13'
+# We previously used pkg_resources to load unicode version tables from
+# 'version.json', generated by bin/update-tables.py, but some environments are
+# unable to import pkg_resources for one reason or another, so __version__ is
+# MANUALLY DUPLICATED here and in setup.py
+__version__ = '0.2.14'
diff --git a/wcwidth/table_vs15.py b/wcwidth/table_vs15.py
diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py