Skip to content

Commit 8c7f59c

Browse files
committed
Variation Selector 15 (VS-15, U+FE0E) support.
While implementing VS-16 I spot-checked a few VS-15 sequences and erroneously concluded that every emoji in a VS-15 sequence was already listed with an East Asian Width (EAW) width of 1. That is not true: several emojis are "wide" and are changed to "narrow" by VS-15.
1 parent 5ba540d commit 8c7f59c

File tree

9 files changed

+258
-22
lines changed

9 files changed

+258
-22
lines changed

bin/update-tables.py

Lines changed: 59 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -433,19 +433,22 @@ def fetch_table_vs16_data() -> UnicodeTableRenderCtx:
433433
"""
434434
table: dict[UnicodeVersion, TableDef] = {}
435435
unicode_latest = fetch_unicode_versions()[-1]
436+
hex_str_vs = 'FE0F'
436437

437438
wide_tables = fetch_table_wide_data().table
438439
unicode_version = UnicodeVersion.parse('9.0.0')
439440

440441
# parse table formatted by the latest emoji release (developed with
441442
# 15.1.0) and parse a single file for all individual releases
442-
table[unicode_version] = parse_vs16_data(fname=UnicodeDataFile.EmojiVariationSequences(unicode_latest),
443-
ubound_unicode_version=unicode_version)
443+
table[unicode_version] = parse_vs_data(fname=UnicodeDataFile.EmojiVariationSequences(unicode_latest),
444+
ubound_unicode_version=unicode_version,
445+
hex_str_vs=hex_str_vs)
444446

445447
# parse and join the final emoji release 12.0 of the earlier "type"
446448
table[unicode_version].values.update(
447-
parse_vs16_data(fname=UnicodeDataFile.LegacyEmojiVariationSequences(),
448-
ubound_unicode_version=unicode_version).values)
449+
parse_vs_data(fname=UnicodeDataFile.LegacyEmojiVariationSequences(),
450+
ubound_unicode_version=unicode_version,
451+
hex_str_vs=hex_str_vs).values)
449452

450453
# perform culling on any values that are already understood as 'wide'
451454
# without the variation-16 selector
@@ -458,16 +461,61 @@ def fetch_table_vs16_data() -> UnicodeTableRenderCtx:
458461
return UnicodeTableRenderCtx('VS16_NARROW_TO_WIDE', table)
459462

460463

461-
def parse_vs16_data(fname: str, ubound_unicode_version: UnicodeVersion):
464+
def parse_vs_data(fname: str, ubound_unicode_version: UnicodeVersion, hex_str_vs: str):
462465
with open(fname, encoding='utf-8') as fin:
463-
table_iter = parse_vs16_table(fin)
466+
table_iter = parse_vs_table(fin, hex_str_vs)
464467
# pull "date string"
465468
date = next(table_iter).comment.split(':', 1)[1].strip()
466469
# pull values only matching this unicode version and lower
467470
values = {entry.code_range[0] for entry in table_iter}
468471
return TableDef(ubound_unicode_version, date, values)
469472

470473

474+
def fetch_table_vs15_data() -> UnicodeTableRenderCtx:
475+
"""
476+
Fetch and create a "wide to narrow variation-15" lookup table.
477+
478+
Characters in this table are wide, but when combined with a variation selector-15 (U+FE0E), they
479+
become narrow, for the given versions of unicode.
480+
481+
UNICODE_VERSION=9.0.0 or greater is required to enable detection of the effect of *any*
482+
'variation selector-15' wide emoji becoming narrow.
483+
484+
Some terminals display U+231A, U+FE0E in a narrow font, but consuming a wide cell (iTerm2),
485+
while most others display it as a wide cell, only.
486+
487+
It is fair to call these ambiguous; see the related 'ucs-detect' project.
488+
"""
489+
table: dict[UnicodeVersion, TableDef] = {}
490+
unicode_latest = fetch_unicode_versions()[-1]
491+
hex_str_vs = 'FE0E'
492+
493+
wide_tables = fetch_table_wide_data().table
494+
unicode_version = UnicodeVersion.parse('9.0.0')
495+
496+
# parse table formatted by the latest emoji release (developed with
497+
# 15.1.0) and parse a single file for all individual releases
498+
table[unicode_version] = parse_vs_data(fname=UnicodeDataFile.EmojiVariationSequences(unicode_latest),
499+
ubound_unicode_version=unicode_version,
500+
hex_str_vs=hex_str_vs)
501+
502+
# parse and join the final emoji release 12.0 of the earlier "type"
503+
table[unicode_version].values.update(
504+
parse_vs_data(fname=UnicodeDataFile.LegacyEmojiVariationSequences(),
505+
ubound_unicode_version=unicode_version,
506+
hex_str_vs=hex_str_vs).values)
507+
508+
# perform culling on any values that are already understood as 'narrow'
509+
# without the variation-15 selector
510+
wide_table = wide_tables[unicode_version].as_value_ranges()
511+
table[unicode_version].values = {
512+
ucs for ucs in table[unicode_version].values
513+
if _bisearch(ucs, wide_table)
514+
}
515+
516+
return UnicodeTableRenderCtx('VS15_WIDE_TO_NARROW', table)
517+
518+
471519
def cite_source_description(filename: str) -> tuple[str, str]:
472520
"""Return unicode.org source data file's own description as citation."""
473521
with open(filename, encoding='utf-8') as f:
@@ -512,9 +560,8 @@ def parse_unicode_table(file: Iterable[str]) -> Iterator[TableEntry]:
512560
yield TableEntry(code_range, tuple(properties), comment)
513561

514562

515-
def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:
516-
"""Parse emoji-variation-sequences.txt for codepoints that precede 0xFE0F."""
517-
hex_str_vs16 = 'FE0F'
563+
def parse_vs_table(fp: Iterable[str], hex_str_vs: str = 'FE0F') -> Iterator[TableEntry]:
564+
"""Parse emoji-variation-sequences.txt for codepoints that precede `hex_str_vs`."""
518565
for line in fp:
519566
data, _, comment = line.partition('#')
520567
data_fields: Iterator[str] = (field.strip() for field in data.split(';'))
@@ -526,8 +573,8 @@ def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:
526573
yield TableEntry(None, tuple(properties), comment)
527574
continue
528575
code_points = code_points_str.split()
529-
if len(code_points) == 2 and code_points[1] == hex_str_vs16:
530-
# yield a single "code range" entry for a single value that precedes FE0F
576+
if len(code_points) == 2 and code_points[1] == hex_str_vs:
577+
# yield a single "code range" entry for a single value that precedes hex_str_vs
531578
yield TableEntry((int(code_points[0], 16), int(code_points[0], 16)), tuple(properties), comment)
532579

533580

@@ -717,6 +764,7 @@ def get_codegen_definitions() -> Iterator[RenderDefinition]:
717764
UnicodeVersionPyRenderCtx(fetch_unicode_versions())
718765
)
719766
yield UnicodeTableRenderDef.new('table_vs16.py', fetch_table_vs16_data())
767+
yield UnicodeTableRenderDef.new('table_vs15.py', fetch_table_vs15_data())
720768
yield UnicodeTableRenderDef.new('table_wide.py', fetch_table_wide_data())
721769
yield UnicodeTableRenderDef.new('table_zero.py', fetch_table_zero_data())
722770
yield UnicodeVersionRstRenderDef.new(fetch_source_headers())

bin/verify-table-integrity.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,7 @@
6464

6565

6666
def bisearch_pair(ucs, table):
67-
"""
68-
A copy of wcwidth._bisearch() but also returns the range of matched values.
69-
"""
67+
"""A copy of wcwidth._bisearch() but also returns the range of matched values."""
7068
lbound = 0
7169
ubound = len(table) - 1
7270

docs/intro.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,9 @@ Other Languages
216216
=======
217217
History
218218
=======
219+
*Unreleased*
220+
* **Bugfix** accounting of some kinds of emoji sequences using U+FE0E
221+
Variation Selector 15 (VS-15).
219222

220223
0.2.14 *2025-09-22*
221224
* **Drop Support** for Python 2.7 and 3.5. `PR #117`_.

docs/specs.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ Width of 1
4747
String characters are measured width of 1 when they are not
4848
measured as `Width of 0`_ or `Width of 2`_.
4949

50+
Any character in sequence with `U+FE0E`_ (Variation Selector 15) defined
51+
by `emoji-variation-sequences.txt`_ as ``text style``.
52+
5053
Width of 2
5154
----------
5255

@@ -74,6 +77,7 @@ Any character in sequence with `U+FE0F`_ (Variation Selector 16) defined by
7477
.. _`U+2029`: https://codepoints.net/U+2029
7578
.. _`U+D7B0`: https://codepoints.net/U+D7B0
7679
.. _`U+D7FF`: https://codepoints.net/U+D7FF
80+
.. _`U+FE0E`: https://codepoints.net/U+FE0E
7781
.. _`U+FE0F`: https://codepoints.net/U+FE0F
7882
.. _`DerivedGeneralCategory.txt`: https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt
7983
.. _`EastAsianWidth.txt`: https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt

tests/test_emojis.py

Lines changed: 64 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ def test_recommended_emoji_zwj_sequences():
176176

177177
def test_recommended_variation_16_sequences():
178178
"""
179-
Test wcswidth of all of the unicode.org-published emoji-variation-sequences.txt
179+
Test wcswidth of vs-16 sequences from unicode.org's emoji-variation-sequences.txt
180180
"""
181181
# given,
182182
lines, sequences = read_sequences_from_file('emoji-variation-sequences.txt')
@@ -202,6 +202,34 @@ def test_recommended_variation_16_sequences():
202202
assert num >= 742
203203

204204

205+
def test_recommended_variation_15_sequences():
206+
"""
207+
Test wcswidth of vs-15 sequences from unicode.org's emoji-variation-sequences.txt
208+
"""
209+
# given,
210+
lines, sequences = read_sequences_from_file('emoji-variation-sequences.txt')
211+
212+
errors = []
213+
num = 0
214+
for sequence, line in zip(sequences, lines):
215+
num += 1
216+
if '\ufe0e' not in sequence:
217+
# filter for only \uFE0E (VS-15)
218+
continue
219+
measured_width = wcwidth.wcswidth(sequence)
220+
if measured_width != 1:
221+
errors.append({
222+
'expected_width': 1,
223+
'line': line,
224+
'measured_width': wcwidth.wcswidth(sequence),
225+
'sequence': sequence,
226+
})
227+
228+
# verify
229+
assert errors == []
230+
assert num >= 742
231+
232+
205233
def test_unicode_9_vs16():
206234
"""Verify effect of VS-16 on unicode_version 9.0 and later"""
207235
phrase = ("\u2640" # FEMALE SIGN
@@ -219,8 +247,25 @@ def test_unicode_9_vs16():
219247
assert length_phrase == expect_length_phrase
220248

221249

250+
def test_unicode_9_vs15():
251+
"""Verify effect of VS-15 on unicode_version 9.0 and later"""
252+
phrase = ("\U0001f4da" # BOOKS
253+
"\uFE0E") # VARIATION SELECTOR-15
254+
255+
expect_length_each = (2, 0)
256+
expect_length_phrase = 1
257+
258+
# exercise,
259+
length_each = tuple(wcwidth.wcwidth(w_char, unicode_version='9.0') for w_char in phrase)
260+
length_phrase = wcwidth.wcswidth(phrase, unicode_version='9.0')
261+
262+
# verify.
263+
assert length_each == expect_length_each
264+
assert length_phrase == expect_length_phrase
265+
266+
222267
def test_unicode_8_vs16():
223-
"""Verify that VS-16 has no effect on unicode_version 8.0 and earler"""
268+
"""Verify that VS-16 has no effect on unicode_version 8.0 and earlier"""
224269
phrase = ("\u2640" # FEMALE SIGN
225270
"\uFE0F") # VARIATION SELECTOR-16
226271

@@ -234,3 +279,20 @@ def test_unicode_8_vs16():
234279
# verify.
235280
assert length_each == expect_length_each
236281
assert length_phrase == expect_length_phrase
282+
283+
284+
def test_unicode_8_vs15():
285+
"""Verify that VS-15 has no effect on unicode_version 8.0 and earlier"""
286+
phrase = ("\U0001f4da" # BOOKS
287+
"\uFE0E") # VARIATION SELECTOR-15
288+
289+
expect_length_each = (1, 0)
290+
expect_length_phrase = 1
291+
292+
# exercise,
293+
length_each = tuple(wcwidth.wcwidth(w_char, unicode_version='8.0') for w_char in phrase)
294+
length_phrase = wcwidth.wcswidth(phrase, unicode_version='8.0')
295+
296+
# verify.
297+
assert length_each == expect_length_each
298+
assert length_phrase == expect_length_phrase

tox.ini

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,10 +134,14 @@ basepython = python3.13
134134
commands = {envbindir}/isort --quiet --apply --recursive wcwidth tests bin
135135

136136
[testenv:pylint]
137+
# Files table_vs15.py and table_wide.py erroneously report "duplicate lines".
138+
# Except for adding '# pylint: disable=duplicate-code' to the template files, we
139+
can choose only to disable a specific check, or specific files. We ignore the
140+
# files.
137141
basepython = python3.13
138142
deps = pylint
139143
commands = {envbindir}/pylint --rcfile={toxinidir}/.pylintrc \
140-
--ignore=tests,docs,setup.py,conf.py,build,distutils,.pyenv,.git,.tox \
144+
--ignore=tests,docs,setup.py,conf.py,build,distutils,.pyenv,.git,.tox,table_wide.py,table_vs15.py \
141145
{posargs:{toxinidir}}/wcwidth
142146

143147
[testenv:flake8]

wcwidth/__init__.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,13 @@
55
"""
66
# re-export all functions & definitions, even private ones, from top-level
77
# module path, to allow for 'from wcwidth import _private_func'. Of course,
8-
# user beware that any _private function may disappear or change signature at
9-
# any future version.
8+
# user beware that any _private functions or variables not exported by __all__
9+
# may disappear or change signature at any future version.
1010

1111
# local
1212
from .wcwidth import ZERO_WIDTH # noqa
1313
from .wcwidth import (WIDE_EASTASIAN,
14+
VS15_WIDE_TO_NARROW,
1415
VS16_NARROW_TO_WIDE,
1516
wcwidth,
1617
wcswidth,
@@ -23,7 +24,8 @@
2324
# 'from wcwidth import *', but also to say, "This is the public API".
2425
__all__ = ('wcwidth', 'wcswidth', 'list_versions')
2526

26-
# We also used pkg_resources to load unicode version tables from version.json,
27-
# generated by bin/update-tables.py, but some environments are unable to
28-
# import pkg_resources for one reason or another, yikes!
27+
# We previously used pkg_resources to load unicode version tables from
28+
# 'version.json', generated by bin/update-tables.py, but some environments are
29+
# unable to import pkg_resources for one reason or another, so this is
30+
# MANUALLY DUPLICATED here and in setup.py
2931
__version__ = '0.2.14'

0 commit comments

Comments
 (0)