Skip to content

Commit 8ed7f2c

Browse files
authored
Implement zero-width support for Hangul Jamo (#111)
From jquast/ucs-detect#9
1 parent 3af992a commit 8ed7f2c

File tree

5 files changed

+100
-21
lines changed

5 files changed

+100
-21
lines changed

bin/update-tables.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,19 @@
5454
MAX_RETRIES = int(os.environ.get('MAX_RETRIES', '6'))
5555
BACKOFF_FACTOR = float(os.environ.get('BACKOFF_FACTOR', '0.1'))
5656

57+
# Hangul Jamo is a decomposed form of Hangul Syllables, see
58+
# https://www.unicode.org/faq/korean.html#3
59+
# https://github.com/ridiculousfish/widecharwidth/pull/17
60+
# https://github.com/jquast/ucs-detect/issues/9
61+
# https://devblogs.microsoft.com/oldnewthing/20201009-00/?p=104351
62+
# "Conjoining Jamo are divided into three classes: L, V, T (Leading
63+
# consonant, Vowel, Trailing consonant). A Hangul Syllable consists of
64+
# <LV> or <LVT> sequences."
65+
HANGUL_JAMO_ZEROWIDTH = (
66+
*range(0x1160, 0x1200), # Hangul Jungseong Filler .. Hangul Jongseong Ssangnieun
67+
*range(0xD7B0, 0xD800), # Hangul Jungseong O-Yeo .. Undefined Character of Hangul Jamo Extended-B
68+
)
69+
5770

5871
def _bisearch(ucs, table):
5972
"""A copy of wcwidth._bisearch, to prevent having issues when depending on code that imports
@@ -333,6 +346,9 @@ def fetch_table_wide_data() -> UnicodeTableRenderCtx:
333346
fname=UnicodeDataFile.DerivedGeneralCategory(version),
334347
wide=0).values)
335348

349+
# Also subtract Hangul Jamo Vowels and Hangul Trailing Consonants
350+
table[version].values = table[version].values.difference(HANGUL_JAMO_ZEROWIDTH)
351+
336352
# finally, join with atypical 'wide' characters defined by category 'Sk',
337353
table[version].values.update(parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
338354
wide=2).values)
@@ -351,8 +367,11 @@ def fetch_table_zero_data() -> UnicodeTableRenderCtx:
351367
table[version] = parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
352368
wide=0)
353369

354-
# And, include NULL
370+
# Include NULL
355371
table[version].values.add(0)
372+
373+
# Add Hangul Jamo Vowels and Hangul Trailing Consonants
374+
table[version].values.update(HANGUL_JAMO_ZEROWIDTH)
356375
return UnicodeTableRenderCtx('ZERO_WIDTH', table)
357376

358377

docs/intro.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,9 @@ Other Languages
216216
=======
217217
History
218218
=======
219+
Unreleased
220+
* **Bugfix** zero-width support for Hangul Jamo (Korean)
221+
219222
0.2.12 *2023-11-21*
220223
* re-release to remove .pyi file misplaced in wheel files `Issue #101`_.
221224

tests/test_core.py

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -222,17 +222,48 @@ def test_balinese_script():
222222
assert length_phrase == expect_length_phrase
223223

224224

225+
def test_kr_jamo():
226+
"""
227+
Test basic combining of HANGUL CHOSEONG and JUNGSEONG
228+
229+
Example from Raymond Chen's blog post,
230+
https://devblogs.microsoft.com/oldnewthing/20201009-00/?p=104351
231+
"""
232+
# This is an example where both characters are "wide" when displayed alone.
233+
#
234+
# But JUNGSEONG (vowel) is designed for combination with a CHOSEONG (consonant).
235+
#
236+
# This wcwidth library understands their width only when in combination,
237+
# and not by independent display, like other zero-width characters that may
238+
# only combine with an appropriate preceding character.
239+
phrase = (
240+
u"\u1100" # ᄀ HANGUL CHOSEONG KIYEOK (consonant)
241+
u"\u1161" # ᅡ HANGUL JUNGSEONG A (vowel)
242+
)
243+
expect_length_each = (2, 0)
244+
expect_length_phrase = 2
245+
246+
# exercise,
247+
length_each = tuple(map(wcwidth.wcwidth, phrase))
248+
length_phrase = wcwidth.wcswidth(phrase)
249+
250+
# verify.
251+
assert length_each == expect_length_each
252+
assert length_phrase == expect_length_phrase
253+
254+
225255
def test_kr_jamo_filler():
226256
u"""
227257
Jamo filler is 0 width.
228258
229-
According to https://www.unicode.org/L2/L2006/06310-hangul-decompose9.pdf this character and others
230-
like it, ``\uffa0``, ``\u1160``, ``\u115f``, ``\u1160``, are not commonly viewed with a terminal,
231-
seems it doesn't matter whether it is implemented or not, they are not typically used !
259+
Example from https://www.unicode.org/L2/L2006/06310-hangul-decompose9.pdf
232260
"""
233-
phrase = u"\u1100\u1160"
234-
expect_length_each = (2, 1)
235-
expect_length_phrase = 3
261+
phrase = (
262+
u"\u1100" # HANGUL CHOSEONG KIYEOK (consonant)
263+
u"\u1160" # HANGUL JUNGSEONG FILLER (vowel)
264+
)
265+
expect_length_each = (2, 0)
266+
expect_length_phrase = 2
236267

237268
# exercise,
238269
length_each = tuple(map(wcwidth.wcwidth, phrase))

wcwidth/table_wide.py

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""
22
Exports WIDE_EASTASIAN table keyed by supporting unicode version level.
33
4-
This code generated by wcwidth/bin/update-tables.py on 2024-01-03 17:16:09 UTC.
4+
This code generated by wcwidth/bin/update-tables.py on 2024-01-06 01:39:49 UTC.
55
"""
66
WIDE_EASTASIAN = {
77
'4.1.0': (
@@ -126,8 +126,6 @@
126126
# Date: 2009-06-09, 17:47:00 PDT [KW]
127127
#
128128
(0x01100, 0x0115f,), # Hangul Choseong Kiyeok ..Hangul Choseong Filler
129-
(0x011a3, 0x011a7,), # Hangul Jungseong A-eu ..Hangul Jungseong O-yae
130-
(0x011fa, 0x011ff,), # Hangul Jongseong Kiyeok-..Hangul Jongseong Ssangni
131129
(0x02329, 0x0232a,), # Left-pointing Angle Brac..Right-pointing Angle Bra
132130
(0x02e80, 0x02e99,), # Cjk Radical Repeat ..Cjk Radical Rap
133131
(0x02e9b, 0x02ef3,), # Cjk Radical Choke ..Cjk Radical C-simplified
@@ -149,8 +147,6 @@
149147
(0x0a490, 0x0a4c6,), # Yi Radical Qot ..Yi Radical Ke
150148
(0x0a960, 0x0a97c,), # Hangul Choseong Tikeut-m..Hangul Choseong Ssangyeo
151149
(0x0ac00, 0x0d7a3,), # Hangul Syllable Ga ..Hangul Syllable Hih
152-
(0x0d7b0, 0x0d7c6,), # Hangul Jungseong O-yeo ..Hangul Jungseong Araea-e
153-
(0x0d7cb, 0x0d7fb,), # Hangul Jongseong Nieun-r..Hangul Jongseong Phieuph
154150
(0x0f900, 0x0faff,), # Cjk Compatibility Ideogr..(nil)
155151
(0x0fe10, 0x0fe19,), # Presentation Form For Ve..Presentation Form For Ve
156152
(0x0fe30, 0x0fe52,), # Presentation Form For Ve..Small Full Stop
@@ -169,8 +165,6 @@
169165
# Date: 2010-08-17, 12:17:00 PDT [KW]
170166
#
171167
(0x01100, 0x0115f,), # Hangul Choseong Kiyeok ..Hangul Choseong Filler
172-
(0x011a3, 0x011a7,), # Hangul Jungseong A-eu ..Hangul Jungseong O-yae
173-
(0x011fa, 0x011ff,), # Hangul Jongseong Kiyeok-..Hangul Jongseong Ssangni
174168
(0x02329, 0x0232a,), # Left-pointing Angle Brac..Right-pointing Angle Bra
175169
(0x02e80, 0x02e99,), # Cjk Radical Repeat ..Cjk Radical Rap
176170
(0x02e9b, 0x02ef3,), # Cjk Radical Choke ..Cjk Radical C-simplified
@@ -192,8 +186,6 @@
192186
(0x0a490, 0x0a4c6,), # Yi Radical Qot ..Yi Radical Ke
193187
(0x0a960, 0x0a97c,), # Hangul Choseong Tikeut-m..Hangul Choseong Ssangyeo
194188
(0x0ac00, 0x0d7a3,), # Hangul Syllable Ga ..Hangul Syllable Hih
195-
(0x0d7b0, 0x0d7c6,), # Hangul Jungseong O-yeo ..Hangul Jungseong Araea-e
196-
(0x0d7cb, 0x0d7fb,), # Hangul Jongseong Nieun-r..Hangul Jongseong Phieuph
197189
(0x0f900, 0x0faff,), # Cjk Compatibility Ideogr..(nil)
198190
(0x0fe10, 0x0fe19,), # Presentation Form For Ve..Presentation Form For Ve
199191
(0x0fe30, 0x0fe52,), # Presentation Form For Ve..Small Full Stop
@@ -214,8 +206,6 @@
214206
# Date: 2011-09-19, 18:46:00 GMT [KW]
215207
#
216208
(0x01100, 0x0115f,), # Hangul Choseong Kiyeok ..Hangul Choseong Filler
217-
(0x011a3, 0x011a7,), # Hangul Jungseong A-eu ..Hangul Jungseong O-yae
218-
(0x011fa, 0x011ff,), # Hangul Jongseong Kiyeok-..Hangul Jongseong Ssangni
219209
(0x02329, 0x0232a,), # Left-pointing Angle Brac..Right-pointing Angle Bra
220210
(0x02e80, 0x02e99,), # Cjk Radical Repeat ..Cjk Radical Rap
221211
(0x02e9b, 0x02ef3,), # Cjk Radical Choke ..Cjk Radical C-simplified
@@ -237,8 +227,6 @@
237227
(0x0a490, 0x0a4c6,), # Yi Radical Qot ..Yi Radical Ke
238228
(0x0a960, 0x0a97c,), # Hangul Choseong Tikeut-m..Hangul Choseong Ssangyeo
239229
(0x0ac00, 0x0d7a3,), # Hangul Syllable Ga ..Hangul Syllable Hih
240-
(0x0d7b0, 0x0d7c6,), # Hangul Jungseong O-yeo ..Hangul Jungseong Araea-e
241-
(0x0d7cb, 0x0d7fb,), # Hangul Jongseong Nieun-r..Hangul Jongseong Phieuph
242230
(0x0f900, 0x0faff,), # Cjk Compatibility Ideogr..(nil)
243231
(0x0fe10, 0x0fe19,), # Presentation Form For Ve..Presentation Form For Ve
244232
(0x0fe30, 0x0fe52,), # Presentation Form For Ve..Small Full Stop

0 commit comments

Comments
 (0)