Skip to content

Commit 5448df5

Browse files
committed
GB18030: Help transitioning away from PUA code points.
* lib/gb18030ext.h (gb18030_2005_ext_wctomb): Remove function. (gb18030ext_wctomb): Renamed from gb18030_2022_ext_wctomb. * lib/gb18030uni.h (gb18030_2005_uni_wctomb): Map 6 Ext-B code points to 4-byte sequences. (gb18030_2022_uni_wctomb): Small refactoring. * lib/gb18030_2005.h (gb18030_2005_pua2charset): Map 6 PUA code points to 4-byte sequences instead of 2-byte sequences. (gb18030_2005_wctomb): Update accordingly. Invoke gb18030ext_wctomb instead of gb18030_2005_ext_wctomb. * lib/gb18030_2022.h (gb18030_2022_wctomb): Invoke gb18030ext_wctomb instead of gb18030_2022_ext_wctomb. * tests/GB18030-2005.IRREVERSIBLE.TXT: Update the inverse mappings of 6 Ext-B code points and 6 PUA code points. * NEWS: Mention it.
1 parent bf03f38 commit 5448df5

File tree

7 files changed

+124
-161
lines changed

7 files changed

+124
-161
lines changed

ChangeLog

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,21 @@
1+
2023-05-29 Bruno Haible <bruno@clisp.org>
2+
3+
GB18030: Help transitioning away from PUA code points.
4+
* lib/gb18030ext.h (gb18030_2005_ext_wctomb): Remove function.
5+
(gb18030ext_wctomb): Renamed from gb18030_2022_ext_wctomb.
6+
* lib/gb18030uni.h (gb18030_2005_uni_wctomb): Map 6 Ext-B code points to
7+
4-byte sequences.
8+
(gb18030_2022_uni_wctomb): Small refactoring.
9+
* lib/gb18030_2005.h (gb18030_2005_pua2charset): Map 6 PUA code points
10+
to 4-byte sequences instead of 2-byte sequences.
11+
(gb18030_2005_wctomb): Update accordingly. Invoke gb18030ext_wctomb
12+
instead of gb18030_2005_ext_wctomb.
13+
* lib/gb18030_2022.h (gb18030_2022_wctomb): Invoke gb18030ext_wctomb
14+
instead of gb18030_2022_ext_wctomb.
15+
* tests/GB18030-2005.IRREVERSIBLE.TXT: Update the inverse mappings of 6
16+
Ext-B code points and 6 PUA code points.
17+
* NEWS: Mention it.
18+
119
2023-05-24 Bruno Haible <bruno@clisp.org>
220

321
man pages: List a fifth condition when iconv(3) may stop.

NEWS

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@ New in 1.18:
33
* GB18030 is now an alias for GB18030:2005. A new converter for GB18030:2022
44
is added. Since this encoding merely cleans up a few private-use-area
55
mappings, you can continue to use the GB18030 converter, for backward
6-
compatibility.
6+
compatibility. Its Unicode to GB18030 conversion direction has been
7+
enhanced, to help with the transition away from PUA code points.
78
* When converting from/to an EBCDIC encoding, a non-standard way of
89
converting newlines can be requested
910
- at the C level, by calling iconvctl with argument ICONV_SET_FROM_SURFACE

lib/gb18030_2005.h

Lines changed: 56 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -266,39 +266,43 @@ gb18030_2005_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, size_t n)
266266
}
267267
}
268268

269-
static const unsigned short gb18030_2005_pua2charset[31*3] = {
270-
/* Unicode range GB18030 range */
271-
0xe766, 0xe76b, 0xa2ab, /*.. 0xa2b0, */
272-
0xe76d, 0xe76d, 0xa2e4,
273-
0xe76e, 0xe76f, 0xa2ef, /*.. 0xa2f0, */
274-
0xe770, 0xe771, 0xa2fd, /*.. 0xa2fe, */
275-
0xe772, 0xe77c, 0xa4f4, /*.. 0xa4fe, */
276-
0xe77d, 0xe784, 0xa5f7, /*.. 0xa5fe, */
277-
0xe785, 0xe78c, 0xa6b9, /*.. 0xa6c0, */
278-
0xe78d, 0xe793, 0xa6d9, /*.. 0xa6df, */
279-
0xe794, 0xe795, 0xa6ec, /*.. 0xa6ed, */
280-
0xe796, 0xe796, 0xa6f3,
281-
0xe797, 0xe79f, 0xa6f6, /*.. 0xa6fe, */
282-
0xe7a0, 0xe7ae, 0xa7c2, /*.. 0xa7d0, */
283-
0xe7af, 0xe7bb, 0xa7f2, /*.. 0xa7fe, */
284-
0xe7bc, 0xe7c6, 0xa896, /*.. 0xa8a0, */
285-
0xe7c9, 0xe7cc, 0xa8c1, /*.. 0xa8c4, */
286-
0xe7cd, 0xe7e1, 0xa8ea, /*.. 0xa8fe, */
287-
0xe7e2, 0xe7e2, 0xa958,
288-
0xe7e3, 0xe7e3, 0xa95b,
289-
0xe7e4, 0xe7e6, 0xa95d, /*.. 0xa95f, */
290-
0xe7f4, 0xe800, 0xa997, /*.. 0xa9a3, */
291-
0xe801, 0xe80f, 0xa9f0, /*.. 0xa9fe, */
292-
0xe810, 0xe814, 0xd7fa, /*.. 0xd7fe, */
293-
0xe816, 0xe818, 0xfe51, /*.. 0xfe53, */
294-
0xe81e, 0xe81e, 0xfe59,
295-
0xe826, 0xe826, 0xfe61,
296-
0xe82b, 0xe82c, 0xfe66, /*.. 0xfe67, */
297-
0xe831, 0xe832, 0xfe6c, /*.. 0xfe6d, */
298-
0xe83b, 0xe83b, 0xfe76,
299-
0xe843, 0xe843, 0xfe7e,
300-
0xe854, 0xe855, 0xfe90, /*.. 0xfe91, */
301-
0xe864, 0xe864, 0xfea0,
269+
static const struct { unsigned short uni[2]; unsigned int charset; } gb18030_2005_pua2charset[35] = {
270+
/* Unicode range GB18030 range */
271+
{ { 0xe766, 0xe76b }, 0xa2ab /*.. 0xa2b0, */ },
272+
{ { 0xe76d, 0xe76d }, 0xa2e4 },
273+
{ { 0xe76e, 0xe76f }, 0xa2ef /*.. 0xa2f0, */ },
274+
{ { 0xe770, 0xe771 }, 0xa2fd /*.. 0xa2fe, */ },
275+
{ { 0xe772, 0xe77c }, 0xa4f4 /*.. 0xa4fe, */ },
276+
{ { 0xe77d, 0xe784 }, 0xa5f7 /*.. 0xa5fe, */ },
277+
{ { 0xe785, 0xe78c }, 0xa6b9 /*.. 0xa6c0, */ },
278+
{ { 0xe78d, 0xe793 }, 0xa6d9 /*.. 0xa6df, */ },
279+
{ { 0xe794, 0xe795 }, 0xa6ec /*.. 0xa6ed, */ },
280+
{ { 0xe796, 0xe796 }, 0xa6f3 },
281+
{ { 0xe797, 0xe79f }, 0xa6f6 /*.. 0xa6fe, */ },
282+
{ { 0xe7a0, 0xe7ae }, 0xa7c2 /*.. 0xa7d0, */ },
283+
{ { 0xe7af, 0xe7bb }, 0xa7f2 /*.. 0xa7fe, */ },
284+
{ { 0xe7bc, 0xe7c6 }, 0xa896 /*.. 0xa8a0, */ },
285+
{ { 0xe7c9, 0xe7cc }, 0xa8c1 /*.. 0xa8c4, */ },
286+
{ { 0xe7cd, 0xe7e1 }, 0xa8ea /*.. 0xa8fe, */ },
287+
{ { 0xe7e2, 0xe7e2 }, 0xa958 },
288+
{ { 0xe7e3, 0xe7e3 }, 0xa95b },
289+
{ { 0xe7e4, 0xe7e6 }, 0xa95d /*.. 0xa95f, */ },
290+
{ { 0xe7f4, 0xe800 }, 0xa997 /*.. 0xa9a3, */ },
291+
{ { 0xe801, 0xe80f }, 0xa9f0 /*.. 0xa9fe, */ },
292+
{ { 0xe810, 0xe814 }, 0xd7fa /*.. 0xd7fe, */ },
293+
{ { 0xe816, 0xe816 }, 0x95329031 },
294+
{ { 0xe817, 0xe817 }, 0x95329033 },
295+
{ { 0xe818, 0xe818 }, 0x95329730 },
296+
{ { 0xe81e, 0xe81e }, 0xfe59 },
297+
{ { 0xe826, 0xe826 }, 0xfe61 },
298+
{ { 0xe82b, 0xe82c }, 0xfe66 /*.. 0xfe67, */ },
299+
{ { 0xe831, 0xe831 }, 0x9536b937 },
300+
{ { 0xe832, 0xe832 }, 0xfe6d },
301+
{ { 0xe83b, 0xe83b }, 0x9630ba35 },
302+
{ { 0xe843, 0xe843 }, 0xfe7e },
303+
{ { 0xe854, 0xe854 }, 0xfe90 },
304+
{ { 0xe855, 0xe855 }, 0x9635b630 },
305+
{ { 0xe864, 0xe864 }, 0xfea0 },
302306
};
303307

304308
static int
@@ -316,7 +320,7 @@ gb18030_2005_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
316320
if (ret != RET_ILUNI)
317321
return ret;
318322

319-
ret = gb18030_2005_ext_wctomb(conv,r,wc,n);
323+
ret = gb18030ext_wctomb(conv,r,wc,n);
320324
if (ret != RET_ILUNI)
321325
return ret;
322326

@@ -337,23 +341,32 @@ gb18030_2005_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
337341
return 2;
338342
}
339343
} else {
340-
/* User-defined characters, two-byte part of range U+E766..U+E864 */
344+
/* User-defined characters, two-byte part and 6 four-byte mappings in
345+
range U+E766..U+E864 */
341346
unsigned int k1 = 0;
342-
unsigned int k2 = 31;
347+
unsigned int k2 = 35;
343348
/* Invariant: We know that if wc occurs in Unicode interval in
344349
gb18030_2005_pua2charset, it does so at a k with k1 <= k < k2. */
345350
while (k1 < k2) {
346351
unsigned int k = (k1 + k2) / 2;
347-
if (wc < gb18030_2005_pua2charset[k*3+0])
352+
if (wc < gb18030_2005_pua2charset[k].uni[0])
348353
k2 = k;
349-
else if (wc > gb18030_2005_pua2charset[k*3+1])
354+
else if (wc > gb18030_2005_pua2charset[k].uni[1])
350355
k1 = k + 1;
351356
else {
352-
unsigned short c =
353-
gb18030_2005_pua2charset[k*3+2] + (wc - gb18030_2005_pua2charset[k*3+0]);
354-
r[0] = (c >> 8);
355-
r[1] = (c & 0xff);
356-
return 2;
357+
unsigned int c =
358+
gb18030_2005_pua2charset[k].charset + (wc - gb18030_2005_pua2charset[k].uni[0]);
359+
if (c < 0x10000) {
360+
r[0] = (c >> 8);
361+
r[1] = c & 0xff;
362+
return 2;
363+
} else {
364+
r[0] = (c >> 24);
365+
r[1] = (c >> 16) & 0xff;
366+
r[2] = (c >> 8) & 0xff;
367+
r[3] = c & 0xff;
368+
return 4;
369+
}
357370
}
358371
}
359372
}

lib/gb18030_2022.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ gb18030_2022_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
151151
if (ret != RET_ILUNI)
152152
return ret;
153153

154-
ret = gb18030_2022_ext_wctomb(conv,r,wc,n);
154+
ret = gb18030ext_wctomb(conv,r,wc,n);
155155
if (ret != RET_ILUNI)
156156
return ret;
157157

lib/gb18030ext.h

Lines changed: 1 addition & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -357,93 +357,7 @@ static const unsigned short gb18030ext_pagefe[16] = {
357357
};
358358

359359
static int
360-
gb18030_2005_ext_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
361-
{
362-
if (n >= 2) {
363-
unsigned short c = 0;
364-
if (wc == 0x01f9)
365-
c = 0xa8bf;
366-
else if (wc == 0x1e3f)
367-
c = 0xa8bc;
368-
else if (wc == 0x20ac)
369-
c = 0xa2e3;
370-
else if (wc >= 0x2e80 && wc < 0x2ed0)
371-
c = gb18030ext_page2e[wc-0x2e80];
372-
else if (wc >= 0x2ff0 && wc < 0x3000)
373-
c = gb18030ext_page2f[wc-0x2ff0];
374-
else if (wc == 0x303e)
375-
c = 0xa989;
376-
else if (wc >= 0x3440 && wc < 0x3478)
377-
c = gb18030ext_page34[wc-0x3440];
378-
else if (wc == 0x359e)
379-
c = 0xfe5a;
380-
else if (wc >= 0x3608 && wc < 0x3620)
381-
c = gb18030ext_page36[wc-0x3608];
382-
else if (wc == 0x3918)
383-
c = 0xfe60;
384-
else if (wc == 0x396e)
385-
c = 0xfe5f;
386-
else if (wc >= 0x39c8 && wc < 0x39e0)
387-
c = gb18030ext_page39[wc-0x39c8];
388-
else if (wc == 0x3a73)
389-
c = 0xfe64;
390-
else if (wc == 0x3b4e)
391-
c = 0xfe68;
392-
else if (wc == 0x3c6e)
393-
c = 0xfe69;
394-
else if (wc == 0x3ce0)
395-
c = 0xfe6a;
396-
else if (wc == 0x4056)
397-
c = 0xfe6f;
398-
else if (wc == 0x415f)
399-
c = 0xfe70;
400-
else if (wc == 0x4337)
401-
c = 0xfe72;
402-
else if (wc >= 0x43a8 && wc < 0x43e0)
403-
c = gb18030ext_page43[wc-0x43a8];
404-
else if (wc == 0x44d6)
405-
c = 0xfe7b;
406-
else if (wc >= 0x4648 && wc < 0x4668)
407-
c = gb18030ext_page46[wc-0x4648];
408-
else if (wc >= 0x4720 && wc < 0x4730)
409-
c = gb18030ext_page47_1[wc-0x4720];
410-
else if (wc >= 0x4778 && wc < 0x4790)
411-
c = gb18030ext_page47_2[wc-0x4778];
412-
else if (wc >= 0x4940 && wc < 0x49b8)
413-
c = gb18030ext_page49[wc-0x4940];
414-
else if (wc >= 0x4c70 && wc < 0x4ca8)
415-
c = gb18030ext_page4c[wc-0x4c70];
416-
else if (wc >= 0x4d10 && wc < 0x4d20)
417-
c = gb18030ext_page4d[wc-0x4d10];
418-
else if (wc == 0x4dae)
419-
c = 0xfe9f;
420-
else if (wc >= 0x9fb4 && wc < 0x9fbc)
421-
c = gb18030ext_page9f[wc-0x9fb0];
422-
else if (wc >= 0xfe10 && wc < 0xfe1a)
423-
c = gb18030ext_pagefe[wc-0xfe10];
424-
else if (wc == 0x20087)
425-
c = 0xfe51;
426-
else if (wc == 0x20089)
427-
c = 0xfe52;
428-
else if (wc == 0x200cc)
429-
c = 0xfe53;
430-
else if (wc == 0x215d7)
431-
c = 0xfe6c;
432-
else if (wc == 0x2298f)
433-
c = 0xfe76;
434-
else if (wc == 0x241fe)
435-
c = 0xfe91;
436-
if (c != 0) {
437-
r[0] = (c >> 8); r[1] = (c & 0xff);
438-
return 2;
439-
}
440-
return RET_ILUNI;
441-
}
442-
return RET_TOOSMALL;
443-
}
444-
445-
static int
446-
gb18030_2022_ext_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
360+
gb18030ext_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
447361
{
448362
if (n >= 2) {
449363
unsigned short c = 0;

lib/gb18030uni.h

Lines changed: 34 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -301,13 +301,14 @@ static int
301301
gb18030_2005_uni_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
302302
{
303303
if (n >= 4) {
304-
unsigned int i = wc;
305-
if (i >= 0x0080 && i <= 0xffff) {
306-
if (i == 0xe7c7) {
304+
unsigned int i;
305+
if (wc >= 0x0080 && wc <= 0xffff) {
306+
if (wc == 0xe7c7) {
307307
i = 7457;
308308
} else {
309309
unsigned int k1 = 0;
310310
unsigned int k2 = 205;
311+
i = wc;
311312
while (k1 < k2) {
312313
unsigned int k = (k1 + k2) / 2;
313314
if (i <= gb18030uni_uni2charset_ranges[2*k+1])
@@ -322,13 +323,28 @@ gb18030_2005_uni_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
322323
i -= diff;
323324
}
324325
}
325-
r[3] = (i % 10) + 0x30; i = i / 10;
326-
r[2] = (i % 126) + 0x81; i = i / 126;
327-
r[1] = (i % 10) + 0x30; i = i / 10;
328-
r[0] = i + 0x81;
329-
return 4;
330-
}
331-
return RET_ILUNI;
326+
} else if (wc >= 0x20087 && wc <= 0x241fe) {
327+
if (wc == 0x20087)
328+
i = 0x3e2cf;
329+
else if (wc == 0x20089)
330+
i = 0x3e2d1;
331+
else if (wc == 0x200cc)
332+
i = 0x3e314;
333+
else if (wc == 0x215d7)
334+
i = 0x3f81f;
335+
else if (wc == 0x2298f)
336+
i = 0x40bd7;
337+
else if (wc == 0x241fe)
338+
i = 0x42446;
339+
else
340+
return RET_ILUNI;
341+
} else
342+
return RET_ILUNI;
343+
r[3] = (i % 10) + 0x30; i = i / 10;
344+
r[2] = (i % 126) + 0x81; i = i / 126;
345+
r[1] = (i % 10) + 0x30; i = i / 10;
346+
r[0] = i + 0x81;
347+
return 4;
332348
}
333349
return RET_TOOSMALL;
334350
}
@@ -337,17 +353,18 @@ static int
337353
gb18030_2022_uni_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
338354
{
339355
if (n >= 4) {
340-
unsigned int i = wc;
341-
if (i >= 0x0080 && i <= 0xffff) {
342-
if (i == 0xe7c7) {
356+
if (wc >= 0x0080 && wc <= 0xffff) {
357+
unsigned int i;
358+
if (wc == 0xe7c7) {
343359
i = 7457;
344-
} else if (i >= 0xe78d && i <= 0xe796) {
345-
i = 39076 + gb18030_2022_uni2charset_pua2[i-0xe78d];
346-
} else if (i >= 0xe81e && i <= 0xe864 && gb18030_2022_uni2charset_pua1[i-0xe81e]) {
347-
i = 19056 + gb18030_2022_uni2charset_pua1[i-0xe81e];
360+
} else if (wc >= 0xe78d && wc <= 0xe796) {
361+
i = 39076 + gb18030_2022_uni2charset_pua2[wc-0xe78d];
362+
} else if (wc >= 0xe81e && wc <= 0xe864 && gb18030_2022_uni2charset_pua1[wc-0xe81e]) {
363+
i = 19056 + gb18030_2022_uni2charset_pua1[wc-0xe81e];
348364
} else {
349365
unsigned int k1 = 0;
350366
unsigned int k2 = 205;
367+
i = wc;
351368
while (k1 < k2) {
352369
unsigned int k = (k1 + k2) / 2;
353370
if (i <= gb18030uni_uni2charset_ranges[2*k+1])

tests/GB18030-2005.IRREVERSIBLE.TXT

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,12 @@
1616
0x84318333 0xFE17
1717
0x84318334 0xFE18
1818
0x84318335 0xFE19
19-
0x95329031 0x20087
20-
0x95329033 0x20089
21-
0x95329730 0x200CC
22-
0x9536B937 0x215D7
23-
0x9630BA35 0x2298F
24-
0x9635B630 0x241FE
19+
0x95329031 0xE816
20+
0x95329033 0xE817
21+
0x95329730 0xE818
22+
0x9536B937 0xE831
23+
0x9630BA35 0xE83B
24+
0x9635B630 0xE855
2525
0xA6D9 0xE78D
2626
0xA6DA 0xE78E
2727
0xA6DB 0xE78F
@@ -32,17 +32,17 @@
3232
0xA6EC 0xE794
3333
0xA6ED 0xE795
3434
0xA6F3 0xE796
35-
0xFE51 0xE816
36-
0xFE52 0xE817
37-
0xFE53 0xE818
35+
0xFE51 0x20087
36+
0xFE52 0x20089
37+
0xFE53 0x200CC
3838
0xFE59 0xE81E
3939
0xFE61 0xE826
4040
0xFE66 0xE82B
4141
0xFE67 0xE82C
42-
0xFE6C 0xE831
42+
0xFE6C 0x215D7
4343
0xFE6D 0xE832
44-
0xFE76 0xE83B
44+
0xFE76 0x2298F
4545
0xFE7E 0xE843
4646
0xFE90 0xE854
47-
0xFE91 0xE855
47+
0xFE91 0x241FE
4848
0xFEA0 0xE864

0 commit comments

Comments
 (0)