Skip to content

Commit 651f52c

Browse files
committed
Variation Selector 15 (VS-15, U+FE0E) support.
I did a few spot checks of VS-15 when implementing VS-16, and erroneously believed that all emojis in VS-15 sequences were already listed with an East Asian Width (EAW) of 1. That is not true: several emojis that are "wide" become "narrow" when followed by VS-15.
1 parent b0e4c88 commit 651f52c

File tree

11 files changed

+270
-28
lines changed

11 files changed

+270
-28
lines changed

bin/update-tables.py

Lines changed: 58 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -417,19 +417,22 @@ def fetch_table_vs16_data() -> UnicodeTableRenderCtx:
417417
"""
418418
table: dict[UnicodeVersion, TableDef] = {}
419419
unicode_latest = fetch_unicode_versions()[-1]
420+
hex_str_vs = 'FE0F'
420421

421422
wide_tables = fetch_table_wide_data().table
422423
unicode_version = UnicodeVersion.parse('9.0.0')
423424

424425
# parse table formatted by the latest emoji release (developed with
425426
# 15.1.0) and parse a single file for all individual releases
426-
table[unicode_version] = parse_vs16_data(fname=UnicodeDataFile.EmojiVariationSequences(unicode_latest),
427-
ubound_unicode_version=unicode_version)
427+
table[unicode_version] = parse_vs_data(fname=UnicodeDataFile.EmojiVariationSequences(unicode_latest),
428+
ubound_unicode_version=unicode_version,
429+
hex_str_vs=hex_str_vs)
428430

429431
# parse and join the final emoji release 12.0 of the earlier "type"
430432
table[unicode_version].values.update(
431-
parse_vs16_data(fname=UnicodeDataFile.LegacyEmojiVariationSequences(),
432-
ubound_unicode_version=unicode_version).values)
433+
parse_vs_data(fname=UnicodeDataFile.LegacyEmojiVariationSequences(),
434+
ubound_unicode_version=unicode_version,
435+
hex_str_vs=hex_str_vs).values)
433436

434437
# perform culling on any values that are already understood as 'wide'
435438
# without the variation-16 selector
@@ -442,16 +445,61 @@ def fetch_table_vs16_data() -> UnicodeTableRenderCtx:
442445
return UnicodeTableRenderCtx('VS16_NARROW_TO_WIDE', table)
443446

444447

445-
def parse_vs16_data(fname: str, ubound_unicode_version: UnicodeVersion):
448+
def parse_vs_data(fname: str, ubound_unicode_version: UnicodeVersion, hex_str_vs: str):
446449
with open(fname, encoding='utf-8') as fin:
447-
table_iter = parse_vs16_table(fin)
450+
table_iter = parse_vs_table(fin, hex_str_vs)
448451
# pull "date string"
449452
date = next(table_iter).comment.split(':', 1)[1].strip()
450453
# pull values only matching this unicode version and lower
451454
values = {entry.code_range[0] for entry in table_iter}
452455
return TableDef(ubound_unicode_version, date, values)
453456

454457

458+
def fetch_table_vs15_data() -> UnicodeTableRenderCtx:
459+
"""
460+
Fetch and create a "wide to narrow variation-15" lookup table.
461+
462+
Characters in this table are wide, but when combined with variation selector-15 (U+FE0E), they
463+
become narrow, for the given versions of unicode.
464+
465+
UNICODE_VERSION=9.0.0 or greater is required to enable detection of the effect of *any*
466+
'variation selector-15' wide emoji becoming narrow.
467+
468+
Some terminals display U+231A, U+FE0E as a narrow font, but consuming a wide cell (iTerm2),
469+
while most others display it as a wide cell, only.
470+
471+
It is fair to call these ambiguous, see related 'ucs-detect' project.
472+
"""
473+
table: dict[UnicodeVersion, TableDef] = {}
474+
unicode_latest = fetch_unicode_versions()[-1]
475+
hex_str_vs = 'FE0E'
476+
477+
wide_tables = fetch_table_wide_data().table
478+
unicode_version = UnicodeVersion.parse('9.0.0')
479+
480+
# parse table formatted by the latest emoji release (developed with
481+
# 15.1.0) and parse a single file for all individual releases
482+
table[unicode_version] = parse_vs_data(fname=UnicodeDataFile.EmojiVariationSequences(unicode_latest),
483+
ubound_unicode_version=unicode_version,
484+
hex_str_vs=hex_str_vs)
485+
486+
# parse and join the final emoji release 12.0 of the earlier "type"
487+
table[unicode_version].values.update(
488+
parse_vs_data(fname=UnicodeDataFile.LegacyEmojiVariationSequences(),
489+
ubound_unicode_version=unicode_version,
490+
hex_str_vs=hex_str_vs).values)
491+
492+
# perform culling on any values that are already understood as 'narrow'
493+
# without the variation-15 selector
494+
wide_table = wide_tables[unicode_version].as_value_ranges()
495+
table[unicode_version].values = {
496+
ucs for ucs in table[unicode_version].values
497+
if _bisearch(ucs, wide_table)
498+
}
499+
500+
return UnicodeTableRenderCtx('VS15_WIDE_TO_NARROW', table)
501+
502+
455503
def cite_source_description(filename: str) -> tuple[str, str]:
456504
"""Return unicode.org source data file's own description as citation."""
457505
with open(filename, encoding='utf-8') as f:
@@ -496,9 +544,8 @@ def parse_unicode_table(file: Iterable[str]) -> Iterator[TableEntry]:
496544
yield TableEntry(code_range, tuple(properties), comment)
497545

498546

499-
def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:
500-
"""Parse emoji-variation-sequences.txt for codepoints that preceed 0xFE0F."""
501-
hex_str_vs16 = 'FE0F'
547+
def parse_vs_table(fp: Iterable[str], hex_str_vs: str = 'FE0F') -> Iterator[TableEntry]:
548+
"""Parse emoji-variation-sequences.txt for codepoints that precede `hex_str_vs`"""
502549
for line in fp:
503550
data, _, comment = line.partition('#')
504551
data_fields: Iterator[str] = (field.strip() for field in data.split(';'))
@@ -510,7 +557,7 @@ def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:
510557
yield TableEntry(None, tuple(properties), comment)
511558
continue
512559
code_points = code_points_str.split()
513-
if len(code_points) == 2 and code_points[1] == hex_str_vs16:
560+
if len(code_points) == 2 and code_points[1] == hex_str_vs:
514561
# yield a single "code range" entry for a single value that precedes `hex_str_vs`
515562
yield TableEntry((int(code_points[0], 16), int(code_points[0], 16)), tuple(properties), comment)
516563

@@ -663,6 +710,7 @@ def get_codegen_definitions() -> Iterator[RenderDefinition]:
663710
UnicodeVersionPyRenderCtx(fetch_unicode_versions())
664711
)
665712
yield UnicodeTableRenderDef.new('table_vs16.py', fetch_table_vs16_data())
713+
yield UnicodeTableRenderDef.new('table_vs15.py', fetch_table_vs15_data())
666714
yield UnicodeTableRenderDef.new('table_wide.py', fetch_table_wide_data())
667715
yield UnicodeTableRenderDef.new('table_zero.py', fetch_table_zero_data())
668716
yield UnicodeVersionRstRenderDef.new(fetch_source_headers())

bin/verify-table-integrity.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,7 @@
6464

6565

6666
def bisearch_pair(ucs, table):
67-
"""
68-
A copy of wcwidth._bisearch() but also returns the range of matched values.
69-
"""
67+
"""A copy of wcwidth._bisearch() but also returns the range of matched values."""
7068
lbound = 0
7169
ubound = len(table) - 1
7270

@@ -85,6 +83,7 @@ def bisearch_pair(ucs, table):
8583

8684

8785
def main(log: logging.Logger):
86+
# local
8887
from wcwidth import ZERO_WIDTH, WIDE_EASTASIAN, list_versions
8988

9089
reversed_uni_versions = list(reversed(list_versions()))

docs/intro.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,10 @@ Other Languages
217217
History
218218
=======
219219

220+
0.2.14 *2024-02-14*
221+
* **Bugfix** accounting of some kinds of emoji sequences using U+FE0E
222+
Variation Selector 15 (`PR #999`_).
223+
220224
0.2.13 *2024-01-06*
221225
* **Bugfix** zero-width support for Hangul Jamo (Korean)
222226

docs/specs.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ Width of 1
4747
String characters are measured width of 1 when they are not
4848
measured as `Width of 0`_ or `Width of 2`_.
4949

50+
Any character in sequence with `U+FE0E`_ (Variation Selector 15) defined
51+
by `emoji-variation-sequences.txt`_ as ``text style``.
52+
5053
Width of 2
5154
----------
5255

@@ -73,6 +76,7 @@ Any character in sequence with `U+FE0F`_ (Variation Selector 16) defined by
7376
.. _`U+2029`: https://codepoints.net/U+2029
7477
.. _`U+D7B0`: https://codepoints.net/U+D7B0
7578
.. _`U+D7FF`: https://codepoints.net/U+D7FF
79+
.. _`U+FE0E`: https://codepoints.net/U+FE0E
7680
.. _`U+FE0F`: https://codepoints.net/U+FE0F
7781
.. _`DerivedGeneralCategory.txt`: https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt
7882
.. _`EastAsianWidth.txt`: https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def main():
4444
setuptools.setup(
4545
name='wcwidth',
4646
# NOTE: manually manage __version__ in wcwidth/__init__.py !
47-
version='0.2.13',
47+
version='0.2.14',
4848
description=(
4949
"Measures the displayed width of unicode strings in a terminal"),
5050
long_description=codecs.open(

tests/test_emojis.py

Lines changed: 69 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ def test_longer_emoji_zwj_sequence():
131131
u"\u200d" # 'Cf', 'N' -- ZERO WIDTH JOINER
132132
u"\U0001F9D1" # 'So', 'W' -- ADULT
133133
u"\U0001F3FD" # 'Sk', 'W' -- EMOJI MODIFIER FITZPATRICK TYPE-4
134-
) * 2
134+
) * 2
135135
# This test adapted from https://www.unicode.org/L2/L2023/23107-terminal-suppt.pdf
136136
expect_length_each = (2, 0, 0, 1, 0, 0, 2, 0, 2, 0) * 2
137137
expect_length_phrase = 4
@@ -148,8 +148,8 @@ def test_longer_emoji_zwj_sequence():
148148
def read_sequences_from_file(filename):
149149
fp = codecs.open(os.path.join(os.path.dirname(__file__), filename), 'r', encoding='utf-8')
150150
lines = [line.strip()
151-
for line in fp.readlines()
152-
if not line.startswith('#') and line.strip()]
151+
for line in fp.readlines()
152+
if not line.startswith('#') and line.strip()]
153153
fp.close()
154154
sequences = [make_sequence_from_line(line) for line in lines]
155155
return lines, sequences
@@ -184,7 +184,7 @@ def test_recommended_emoji_zwj_sequences():
184184

185185
def test_recommended_variation_16_sequences():
186186
"""
187-
Test wcswidth of all of the unicode.org-published emoji-variation-sequences.txt
187+
Test wcswidth of vs-16 sequences from unicode.org's emoji-variation-sequences.txt
188188
"""
189189
# given,
190190
lines, sequences = read_sequences_from_file('emoji-variation-sequences.txt')
@@ -210,6 +210,34 @@ def test_recommended_variation_16_sequences():
210210
assert num >= 742
211211

212212

213+
def test_recommended_variation_15_sequences():
214+
"""
215+
Test wcswidth of vs-15 sequences from unicode.org's emoji-variation-sequences.txt
216+
"""
217+
# given,
218+
lines, sequences = read_sequences_from_file('emoji-variation-sequences.txt')
219+
220+
errors = []
221+
num = 0
222+
for sequence, line in zip(sequences, lines):
223+
num += 1
224+
if '\ufe0e' not in sequence:
225+
# filter for only \uFE0E (VS-15)
226+
continue
227+
measured_width = wcwidth.wcswidth(sequence)
228+
if measured_width != 1:
229+
errors.append({
230+
'expected_width': 1,
231+
'line': line,
232+
'measured_width': wcwidth.wcswidth(sequence),
233+
'sequence': sequence,
234+
})
235+
236+
# verify
237+
assert errors == []
238+
assert num >= 742
239+
240+
213241
def test_unicode_9_vs16():
214242
"""Verify effect of VS-16 on unicode_version 9.0 and later"""
215243
phrase = (u"\u2640" # FEMALE SIGN
@@ -226,8 +254,26 @@ def test_unicode_9_vs16():
226254
assert length_each == expect_length_each
227255
assert length_phrase == expect_length_phrase
228256

257+
258+
def test_unicode_9_vs15():
259+
"""Verify effect of VS-15 on unicode_version 9.0 and later"""
260+
phrase = (u"\U0001f4da" # BOOKS
261+
u"\uFE0E") # VARIATION SELECTOR-15
262+
263+
expect_length_each = (2, 0)
264+
expect_length_phrase = 1
265+
266+
# exercise,
267+
length_each = tuple(wcwidth.wcwidth(w_char, unicode_version='9.0') for w_char in phrase)
268+
length_phrase = wcwidth.wcswidth(phrase, unicode_version='9.0')
269+
270+
# verify.
271+
assert length_each == expect_length_each
272+
assert length_phrase == expect_length_phrase
273+
274+
229275
def test_unicode_8_vs16():
230-
"""Verify that VS-16 has no effect on unicode_version 8.0 and earler"""
276+
"""Verify that VS-16 has no effect on unicode_version 8.0 and earlier"""
231277
phrase = (u"\u2640" # FEMALE SIGN
232278
u"\uFE0F") # VARIATION SELECTOR-16
233279

@@ -240,4 +286,21 @@ def test_unicode_8_vs16():
240286

241287
# verify.
242288
assert length_each == expect_length_each
243-
assert length_phrase == expect_length_phrase
289+
assert length_phrase == expect_length_phrase
290+
291+
292+
def test_unicode_8_vs15():
293+
"""Verify that VS-15 has no effect on unicode_version 8.0 and earlier"""
294+
phrase = (u"\U0001f4da" # BOOKS
295+
u"\uFE0E") # VARIATION SELECTOR-15
296+
297+
expect_length_each = (1, 0)
298+
expect_length_phrase = 1
299+
300+
# exercise,
301+
length_each = tuple(wcwidth.wcwidth(w_char, unicode_version='8.0') for w_char in phrase)
302+
length_phrase = wcwidth.wcswidth(phrase, unicode_version='8.0')
303+
304+
# verify.
305+
assert length_each == expect_length_each
306+
assert length_phrase == expect_length_phrase

tests/test_table_integrity.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
11
"""
22
Executes verify-table-integrity.py as a unit test.
33
"""
4+
# std imports
45
import os
56
import sys
67
import subprocess
78

9+
# 3rd party
810
import pytest
911

12+
1013
@pytest.mark.skipif(sys.version_info[:2] != (3, 12), reason='Test only with a single version of python')
1114
def test_verify_table_integrity():
1215
subprocess.check_output([sys.executable, os.path.join(os.path.dirname(__file__),
1316
os.path.pardir,
1417
'bin',
15-
'verify-table-integrity.py')])
18+
'verify-table-integrity.py')])

tox.ini

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,9 +154,13 @@ basepython = python3.11
154154
commands = {envbindir}/isort --quiet --apply --recursive wcwidth tests bin
155155

156156
[testenv:pylint]
157+
# Files table_vs15.py and table_wide.py erroneously report "duplicate lines".
158+
# Except for adding '# pylint: disable=duplicate-code' to the template files, we
159+
can choose only to disable a specific check, or specific files. We ignore the
160+
# files.
157161
basepython = python3.11
158162
commands = {envbindir}/pylint --rcfile={toxinidir}/.pylintrc \
159-
--ignore=tests,docs,setup.py,conf.py,build,distutils,.pyenv,.git,.tox \
163+
--ignore=tests,docs,setup.py,conf.py,build,distutils,.pyenv,.git,.tox,table_wide.py,table_vs15.py \
160164
{posargs:{toxinidir}}/wcwidth
161165

162166
[testenv:flake8]

wcwidth/__init__.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,13 @@
55
"""
66
# re-export all functions & definitions, even private ones, from top-level
77
# module path, to allow for 'from wcwidth import _private_func'. Of course,
8-
# user beware that any _private function may disappear or change signature at
9-
# any future version.
8+
# user beware that any _private functions or variables not exported by __all__
9+
# may disappear or change signature at any future version.
1010

1111
# local
1212
from .wcwidth import ZERO_WIDTH # noqa
1313
from .wcwidth import (WIDE_EASTASIAN,
14+
VS15_WIDE_TO_NARROW,
1415
VS16_NARROW_TO_WIDE,
1516
wcwidth,
1617
wcswidth,
@@ -23,7 +24,8 @@
2324
# 'from wcwidth import *', but also to say, "This is the public API".
2425
__all__ = ('wcwidth', 'wcswidth', 'list_versions')
2526

26-
# We also used pkg_resources to load unicode version tables from version.json,
27-
# generated by bin/update-tables.py, but some environments are unable to
28-
# import pkg_resources for one reason or another, yikes!
29-
__version__ = '0.2.13'
27+
# We previously used pkg_resources to load unicode version tables from
28+
# 'version.json', generated by bin/update-tables.py, but some environments are
29+
# unable to import pkg_resources for one reason or another, so this is
30+
# MANUALLY DUPLICATED here and in setup.py
31+
__version__ = '0.2.14'

0 commit comments

Comments
 (0)