Skip to content

Commit 3af992a

Browse files
authored
Non-bugfix about zero & wide definition conflicts (again!) (#110)
In this update to update-tables.py, 04d6d90, I wrote: > `verify-table-integrity.py` exercises a "bug" of duplicated tables that has no effect, because wcswidth() first checks for zero-width, and that is preferred in cases of conflict. This PR also resolves that error of duplication. In that change I used the method [set.discard()](https://docs.python.org/3/library/stdtypes.html#frozenset.discard) in error: the discard method takes a single item as an argument, while I was providing a whole set, and so it had no effect. Instead, I now use [set.difference()](https://docs.python.org/3/library/stdtypes.html#frozenset.difference) to re-assign the value. Also, - the `category_codes` argument has been removed in update-tables.py, as it is not used. - `verify-table-integrity.py` has been improved to show both range values in conflict - `verify-table-integrity.py` is now included as a unit test for a single version of python (3.12) - a new unit test about conflicting wide & zero values. This demonstrates that the update to table_wide.py has no effect, as these tests succeed before and after the change to table_wide.py.
1 parent 0ba0278 commit 3af992a

File tree

6 files changed

+160
-83
lines changed

6 files changed

+160
-83
lines changed

bin/update-tables.py

Lines changed: 12 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -112,11 +112,11 @@ class TableEntry:
112112
properties: tuple[str, ...]
113113
comment: str
114114

115-
def filter_by_category(self, category_codes: str, wide: int) -> bool:
115+
def filter_by_category_width(self, wide: int) -> bool:
116116
"""
117-
Return whether entry matches given category code and displayed width.
117+
Return whether entry matches displayed width.
118118
119-
Categories are described here, https://www.unicode.org/reports/tr44/#GC_Values_Table
119+
Parses both DerivedGeneralCategory.txt and EastAsianWidth.txt
120120
"""
121121
if self.code_range is None:
122122
return False
@@ -146,13 +146,12 @@ def filter_by_category(self, category_codes: str, wide: int) -> bool:
146146
return wide == 1
147147

148148
@staticmethod
149-
def parse_category_values(category_codes: str,
150-
table_iter: Iterator[TableEntry],
151-
wide: int) -> set[tuple[int, int]]:
149+
def parse_width_category_values(table_iter: Iterator[TableEntry],
150+
wide: int) -> set[tuple[int, int]]:
152151
"""Parse value ranges of unicode data files, by given category and width."""
153152
return {n
154153
for entry in table_iter
155-
if entry.filter_by_category(category_codes, wide)
154+
if entry.filter_by_category_width(wide)
156155
for n in list(range(entry.code_range[0], entry.code_range[1]))}
157156

158157

@@ -326,18 +325,16 @@ def fetch_table_wide_data() -> UnicodeTableRenderCtx:
326325
for version in fetch_unicode_versions():
327326
# parse typical 'wide' characters by categories 'W' and 'F',
328327
table[version] = parse_category(fname=UnicodeDataFile.EastAsianWidth(version),
329-
category_codes=('W', 'F'),
330328
wide=2)
331329

332330
# subtract(!) wide characters that were defined above as 'W' category in EastAsianWidth,
333331
# but also zero-width category 'Mn' or 'Mc' in DerivedGeneralCategory!
334-
table[version].values.discard(parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
335-
category_codes=('Mn', 'Mc'),
336-
wide=0).values)
332+
table[version].values = table[version].values.difference(parse_category(
333+
fname=UnicodeDataFile.DerivedGeneralCategory(version),
334+
wide=0).values)
337335

338336
# finally, join with atypical 'wide' characters defined by category 'Sk',
339337
table[version].values.update(parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
340-
category_codes=('Sk',),
341338
wide=2).values)
342339
return UnicodeTableRenderCtx('WIDE_EASTASIAN', table)
343340

@@ -352,7 +349,6 @@ def fetch_table_zero_data() -> UnicodeTableRenderCtx:
352349
for version in fetch_unicode_versions():
353350
# Determine values of zero-width character lookup table by the following category codes
354351
table[version] = parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
355-
category_codes=('Me', 'Mn', 'Mc', 'Cf', 'Zl', 'Zp', 'Sk'),
356352
wide=0)
357353

358354
# And, include NULL
@@ -501,9 +497,9 @@ def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:
501497

502498

503499
@functools.cache
504-
def parse_category(fname: str, category_codes: Container[str], wide: int) -> TableDef:
500+
def parse_category(fname: str, wide: int) -> TableDef:
505501
"""Parse value ranges of unicode data files, by given categories into string tables."""
506-
print(f'parsing {fname} category_codes={",".join(category_codes)}: ', end='', flush=True)
502+
print(f'parsing {fname}, wide={wide}: ', end='', flush=True)
507503

508504
with open(fname, encoding='utf-8') as f:
509505
table_iter = parse_unicode_table(f)
@@ -512,7 +508,7 @@ def parse_category(fname: str, category_codes: Container[str], wide: int) -> Tab
512508
version = next(table_iter).comment.strip()
513509
# and "date string" from second line
514510
date = next(table_iter).comment.split(':', 1)[1].strip()
515-
values = TableEntry.parse_category_values(category_codes, table_iter, wide)
511+
values = TableEntry.parse_width_category_values(table_iter, wide)
516512
print('ok')
517513
return TableDef(version, date, values)
518514

bin/verify-table-integrity.py

Lines changed: 38 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,30 @@
6363
import logging
6464

6565

66+
def bisearch_pair(ucs, table):
    """
    A copy of wcwidth._bisearch() but also returns the range of matched values.

    :param ucs: ordinal value of the character to look up.
    :param table: sequence of ``(start, end)`` pairs, both bounds inclusive;
        must be sorted in ascending order for the binary search to be valid.
    :returns: ``(1, start, end)`` of the range containing ``ucs`` when found,
        otherwise ``(0, None, None)``.
    """
    # An empty table cannot contain any value; return early rather than
    # raising IndexError on the table[0] boundary check below.
    if not table:
        return (0, None, None)

    lbound = 0
    ubound = len(table) - 1

    # Quick rejection of values outside of the table's overall span.
    if ucs < table[0][0] or ucs > table[ubound][1]:
        return (0, None, None)
    while ubound >= lbound:
        mid = (lbound + ubound) // 2
        if ucs > table[mid][1]:
            lbound = mid + 1
        elif ucs < table[mid][0]:
            ubound = mid - 1
        else:
            # ucs lies within the inclusive range at index 'mid'
            return (1, table[mid][0], table[mid][1])

    return (0, None, None)
85+
86+
6687
def main(log: logging.Logger):
67-
# local
68-
from wcwidth import ZERO_WIDTH, WIDE_EASTASIAN, _bisearch, list_versions
88+
from wcwidth import ZERO_WIDTH, WIDE_EASTASIAN, list_versions
89+
6990
reversed_uni_versions = list(reversed(list_versions()))
7091
tables = {'ZERO_WIDTH': ZERO_WIDTH,
7192
'WIDE_EASTASIAN': WIDE_EASTASIAN}
@@ -81,14 +102,21 @@ def main(log: logging.Logger):
81102
other_table = tables[other_table_name][version]
82103
for start_range, stop_range in curr_table:
83104
for unichar_n in range(start_range, stop_range):
84-
if not _bisearch(unichar_n, next_table):
85-
log.info(f'value {hex(unichar_n)} in table_name={table_name}'
86-
f' version={version} is not defined in next_version={next_version}'
87-
f' from inclusive range {hex(start_range)}-{hex(stop_range)}')
88-
if _bisearch(unichar_n, other_table):
89-
log.error(f'value {hex(unichar_n)} in table_name={table_name}'
90-
f' version={version} is duplicated in other_table_name={other_table_name}'
91-
f' from inclusive range {hex(start_range)}-{hex(stop_range)}')
105+
result, _, _ = bisearch_pair(unichar_n, next_table)
106+
if not result:
107+
log.info(
108+
f'value 0x{unichar_n:05x} in table_name={table_name}'
109+
f' version={version} is not defined in next_version={next_version}'
110+
f' from inclusive range {hex(start_range)}-{hex(stop_range)}'
111+
)
112+
result, lbound, ubound = bisearch_pair(unichar_n, other_table)
113+
if result:
114+
log.error(
115+
f'value 0x{unichar_n:05x} in table_name={table_name}'
116+
f' version={version} is duplicated in other_table_name={other_table_name}'
117+
f' from inclusive range 0x{start_range:05x}-0x{stop_range:05x} of'
118+
f' {table_name} against 0x{lbound:05x}-0x{ubound:05x} in {other_table_name}'
119+
)
92120
errors += 1
93121
if errors:
94122
log.error(f'{errors} errors, exit 1')

docs/intro.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ Other Languages
217217
History
218218
=======
219219
0.2.12 *2023-11-21*
220-
* re-release to remove .pyi file misplaced in wheel files `Issue #101`.
220+
* re-release to remove .pyi file misplaced in wheel files `Issue #101`_.
221221

222222
0.2.11 *2023-11-20*
223223
* Include tests files in the source distribution (`PR #98`_, `PR #100`_).

tests/test_core.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -355,3 +355,17 @@ def test_kannada_script_2():
355355
# verify.
356356
assert length_each == expect_length_each
357357
assert length_phrase == expect_length_phrase
358+
359+
360+
def test_zero_wide_conflict():
    """Check widths at the edges of ranges once considered both "wide" and "zero" width."""
    boundary_cases = (
        # wide range changed from (0x03000, 0x0303e), Ideographic Space
        # ..Ideographic Variation In, to (0x03000, 0x03029), Ideographic
        # Space ..Hangzhou Numeral Nine
        (0x03029, 2),
        (0x0302a, 0),
        # wide range changed from (0x03099, 0x030ff), Combining
        # Katakana-hirag..Katakana Digraph Koto, to (0x0309b, 0x030ff),
        # Katakana-hiragana Voiced..Katakana Digraph Koto
        (0x03099, 0),
        (0x0309a, 0),
        (0x0309b, 2),
    )
    for codepoint, expected_width in boundary_cases:
        assert wcwidth.wcwidth(unichr(codepoint), unicode_version='4.1.0') == expected_width

tests/test_table_integrity.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
"""
2+
Executes verify-table-integrity.py as a unit test.
3+
"""
4+
import os
5+
import sys
6+
import subprocess
7+
8+
import pytest
9+
10+
@pytest.mark.skipif(sys.version_info[:2] != (3, 12), reason='Test only with a single version of python')
def test_verify_table_integrity():
    """Run bin/verify-table-integrity.py with the current interpreter."""
    tests_dir = os.path.dirname(__file__)
    script_path = os.path.join(tests_dir, os.path.pardir, 'bin', 'verify-table-integrity.py')
    # check_output raises CalledProcessError on a non-zero exit status,
    # which is what fails this test.
    subprocess.check_output([sys.executable, script_path])

0 commit comments

Comments
 (0)