Skip to content

Commit 6549d20

Browse files
committed
Implement GB18030 version 2022.
* lib/encodings.def (GB18030): Add alias GB18030:2005. (GB18030:2022): New encoding. * lib/gb18030ext.h (gb18030_2005_ext_2uni_pagefe): Renamed from gb18030ext_2uni_pagefe. (gb18030_2022_ext_2uni_pagefe): New array. (gb18030_2005_ext_mbtowc): Renamed from gb18030ext_mbtowc. (gb18030_2022_ext_mbtowc): New function. (gb18030_2005_ext_wctomb): Renamed from gb18030ext_wctomb. (gb18030_2022_ext_wctomb): New function. * lib/gb18030uni.h (gb18030_2022_charset2uni_pua1, gb18030_2022_charset2uni_pua2): New arrays. (gb18030_2005_uni_mbtowc): Renamed from gb18030uni_mbtowc. (gb18030_2022_uni_mbtowc): New function. (gb18030_2022_uni2charset_pua1, gb18030_2022_uni2charset_pua2): New arrays. (gb18030_2005_uni_wctomb): Renamed from gb18030uni_wctomb. (gb18030_2022_uni_wctomb): New function. * lib/gb18030_2005.h: Renamed from lib/gb18030.h. Update comments. (gb18030_2005_mbtowc): Renamed from gb18030_mbtowc. (gb18030_2005_pua2charset): Renamed from gb18030_pua2charset. (gb18030_2005_wctomb): Renamed from gb18030_wctomb. * lib/gb18030_2022.h: New file, based on lib/gb18030_2005.h. * lib/converters.h: Don't include gb18030.h. Include gb18030_2005.h, gb18030_2022.h. * lib/Makefile.in (SOURCE_FILES): Remove gb18030.h. Add gb18030_2005.h, gb18030_2022.h. * tests/GB18030-2005-BMP.TXT: Renamed from tests/GB18030-BMP.TXT. * tests/GB18030-2005.IRREVERSIBLE.TXT: Renamed from tests/GB18030.IRREVERSIBLE.TXT. * tests/GB18030-2022-BMP.TXT: New file. * tests/Makefile.in (check): Test GB18030:2005 instead of GB18030. Also test GB18030:2022. (clean): Don't remove GB18030.TXT. Instead, remove GB18030-2005.TXT and GB18030-2022.TXT. (SOURCE_FILES): Update. Add GB18030-2022-BMP.TXT. * README: Mention the new encoding. * man/iconv_open.3: Likewise. * NEWS: Likewise.
1 parent 45425ff commit 6549d20

15 files changed

+64110
-40
lines changed

ChangeLog

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,45 @@
1+
2023-05-20 Bruno Haible <bruno@clisp.org>
2+
3+
Implement GB18030 version 2022.
4+
* lib/encodings.def (GB18030): Add alias GB18030:2005.
5+
(GB18030:2022): New encoding.
6+
* lib/gb18030ext.h (gb18030_2005_ext_2uni_pagefe): Renamed from
7+
gb18030ext_2uni_pagefe.
8+
(gb18030_2022_ext_2uni_pagefe): New array.
9+
(gb18030_2005_ext_mbtowc): Renamed from gb18030ext_mbtowc.
10+
(gb18030_2022_ext_mbtowc): New function.
11+
(gb18030_2005_ext_wctomb): Renamed from gb18030ext_wctomb.
12+
(gb18030_2022_ext_wctomb): New function.
13+
* lib/gb18030uni.h (gb18030_2022_charset2uni_pua1,
14+
gb18030_2022_charset2uni_pua2): New arrays.
15+
(gb18030_2005_uni_mbtowc): Renamed from gb18030uni_mbtowc.
16+
(gb18030_2022_uni_mbtowc): New function.
17+
(gb18030_2022_uni2charset_pua1, gb18030_2022_uni2charset_pua2): New
18+
arrays.
19+
(gb18030_2005_uni_wctomb): Renamed from gb18030uni_wctomb.
20+
(gb18030_2022_uni_wctomb): New function.
21+
* lib/gb18030_2005.h: Renamed from lib/gb18030.h. Update comments.
22+
(gb18030_2005_mbtowc): Renamed from gb18030_mbtowc.
23+
(gb18030_2005_pua2charset): Renamed from gb18030_pua2charset.
24+
(gb18030_2005_wctomb): Renamed from gb18030_wctomb.
25+
* lib/gb18030_2022.h: New file, based on lib/gb18030_2005.h.
26+
* lib/converters.h: Don't include gb18030.h. Include gb18030_2005.h,
27+
gb18030_2022.h.
28+
* lib/Makefile.in (SOURCE_FILES): Remove gb18030.h. Add gb18030_2005.h,
29+
gb18030_2022.h.
30+
* tests/GB18030-2005-BMP.TXT: Renamed from tests/GB18030-BMP.TXT.
31+
* tests/GB18030-2005.IRREVERSIBLE.TXT: Renamed from
32+
tests/GB18030.IRREVERSIBLE.TXT.
33+
* tests/GB18030-2022-BMP.TXT: New file.
34+
* tests/Makefile.in (check): Test GB18030:2005 instead of GB18030. Also
35+
test GB18030:2022.
36+
(clean): Don't remove GB18030.TXT. Instead, remove GB18030-2005.TXT and
37+
GB18030-2022.TXT.
38+
(SOURCE_FILES): Update. Add GB18030-2022-BMP.TXT.
39+
* README: Mention the new encoding.
40+
* man/iconv_open.3: Likewise.
41+
* NEWS: Likewise.
42+
143
2023-05-19 Bruno Haible <bruno@clisp.org>
244

345
Make the compiler used by Makefile.devel customizable.

NEWS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
New in 1.18:
22
* Many more transliterations.
3+
* GB18030 is now an alias for GB18030:2005. A new converter for GB18030:2022
4+
is added. Since the 2022 version merely cleans up a few private-use-area
5+
mappings, you can continue to use the GB18030 converter, for backward
6+
compatibility.
37
* When converting from/to an EBCDIC encoding, a non-standard way of
48
converting newlines can be requested
59
- at the C level, by calling iconvctl with argument ICONV_SET_FROM_SURFACE

README

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@ It provides support for the encodings:
1818
EUC-JP, SHIFT_JIS, CP932, ISO-2022-JP, ISO-2022-JP-2, ISO-2022-JP-1,
1919
ISO-2022-JP-MS
2020
Chinese
21-
EUC-CN, HZ, GBK, CP936, GB18030, EUC-TW, BIG5, CP950, BIG5-HKSCS,
22-
BIG5-HKSCS:2004, BIG5-HKSCS:2001, BIG5-HKSCS:1999, ISO-2022-CN,
23-
ISO-2022-CN-EXT
21+
EUC-CN, HZ, GBK, CP936, GB18030, GB18030:2022, EUC-TW, BIG5, CP950,
22+
BIG5-HKSCS, BIG5-HKSCS:2004, BIG5-HKSCS:2001, BIG5-HKSCS:1999,
23+
ISO-2022-CN, ISO-2022-CN-EXT
2424
Korean
2525
EUC-KR, CP949, ISO-2022-KR, JOHAB
2626
Armenian

lib/Makefile.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,7 @@ SOURCE_FILES = \
279279
euc_cn.h \
280280
ces_gbk.h \
281281
cp936.h \
282-
gb18030.h \
282+
gb18030_2005.h gb18030_2022.h \
283283
gb18030ext.h \
284284
gb18030uni.h \
285285
iso2022_cn.h \

lib/converters.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,8 @@ typedef struct {
238238
#include "euc_cn.h"
239239
#include "ces_gbk.h"
240240
#include "cp936.h"
241-
#include "gb18030.h"
241+
#include "gb18030_2005.h"
242+
#include "gb18030_2022.h"
242243
#include "iso2022_cn.h"
243244
#include "iso2022_cnext.h"
244245
#include "hz.h"

lib/encodings.def

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -912,9 +912,15 @@ DEFENCODING(( "CP936", /* IANA */
912912

913913
DEFENCODING(( "GB18030", /* IANA, glibc */
914914
/*"CP54936", Windows */
915+
"GB18030:2005",
915916
),
916-
gb18030,
917-
{ gb18030_mbtowc, NULL }, { gb18030_wctomb, NULL })
917+
gb18030_2005,
918+
{ gb18030_2005_mbtowc, NULL },{ gb18030_2005_wctomb, NULL })
919+
920+
DEFENCODING(( "GB18030:2022",
921+
),
922+
gb18030_2022,
923+
{ gb18030_2022_mbtowc, NULL },{ gb18030_2022_wctomb, NULL })
918924

919925
DEFENCODING(( "ISO-2022-CN", /* IANA, RFC 1922 */
920926
"csISO2022CN",

lib/gb18030.h renamed to lib/gb18030_2005.h

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 1999-2001, 2005, 2012, 2016 Free Software Foundation, Inc.
2+
* Copyright (C) 1999-2001, 2005, 2012, 2016, 2023 Free Software Foundation, Inc.
33
* This file is part of the GNU LIBICONV Library.
44
*
55
* The GNU LIBICONV Library is free software; you can redistribute it
@@ -18,14 +18,14 @@
1818
*/
1919

2020
/*
21-
* GB18030
21+
* GB18030:2005
2222
*/
2323

2424
/*
2525
* GB18030, as specified in the GB18030 standard, is an extension of GBK.
2626
*
2727
* In what follows, page numbers refer to the GB18030 standard (second
28-
* printing).
28+
* printing) from 2005.
2929
*
3030
*
3131
* It consists of the following parts:
@@ -185,7 +185,7 @@
185185
#include "gb18030uni.h"
186186

187187
static int
188-
gb18030_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, size_t n)
188+
gb18030_2005_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, size_t n)
189189
{
190190
int ret;
191191

@@ -198,13 +198,13 @@ gb18030_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, size_t n)
198198
if (ret != RET_ILSEQ)
199199
return ret;
200200

201-
ret = gb18030ext_mbtowc(conv,pwc,s,n);
201+
ret = gb18030_2005_ext_mbtowc(conv,pwc,s,n);
202202
if (ret != RET_ILSEQ)
203203
return ret;
204204

205205
/* Code set 2 (remainder of Unicode U+0000..U+FFFF), including
206206
User-defined characters, two-byte part of range U+E766..U+E864 */
207-
ret = gb18030uni_mbtowc(conv,pwc,s,n);
207+
ret = gb18030_2005_uni_mbtowc(conv,pwc,s,n);
208208
if (ret != RET_ILSEQ)
209209
return ret;
210210
/* User-defined characters range U+E000..U+E765 */
@@ -266,7 +266,7 @@ gb18030_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, size_t n)
266266
}
267267
}
268268

269-
static const unsigned short gb18030_pua2charset[31*3] = {
269+
static const unsigned short gb18030_2005_pua2charset[31*3] = {
270270
/* Unicode range GB18030 range */
271271
0xe766, 0xe76b, 0xa2ab, /*.. 0xa2b0, */
272272
0xe76d, 0xe76d, 0xa2e4,
@@ -302,7 +302,7 @@ static const unsigned short gb18030_pua2charset[31*3] = {
302302
};
303303

304304
static int
305-
gb18030_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
305+
gb18030_2005_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
306306
{
307307
int ret;
308308

@@ -316,7 +316,7 @@ gb18030_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
316316
if (ret != RET_ILUNI)
317317
return ret;
318318

319-
ret = gb18030ext_wctomb(conv,r,wc,n);
319+
ret = gb18030_2005_ext_wctomb(conv,r,wc,n);
320320
if (ret != RET_ILUNI)
321321
return ret;
322322

@@ -341,16 +341,16 @@ gb18030_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
341341
unsigned int k1 = 0;
342342
unsigned int k2 = 31;
343343
/* Invariant: We know that if wc occurs in a Unicode interval in
344-
gb18030_pua2charset, it does so at a k with k1 <= k < k2. */
344+
gb18030_2005_pua2charset, it does so at a k with k1 <= k < k2. */
345345
while (k1 < k2) {
346346
unsigned int k = (k1 + k2) / 2;
347-
if (wc < gb18030_pua2charset[k*3+0])
347+
if (wc < gb18030_2005_pua2charset[k*3+0])
348348
k2 = k;
349-
else if (wc > gb18030_pua2charset[k*3+1])
349+
else if (wc > gb18030_2005_pua2charset[k*3+1])
350350
k1 = k + 1;
351351
else {
352352
unsigned short c =
353-
gb18030_pua2charset[k*3+2] + (wc - gb18030_pua2charset[k*3+0]);
353+
gb18030_2005_pua2charset[k*3+2] + (wc - gb18030_2005_pua2charset[k*3+0]);
354354
r[0] = (c >> 8);
355355
r[1] = (c & 0xff);
356356
return 2;
@@ -360,7 +360,7 @@ gb18030_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
360360
} else
361361
return RET_TOOSMALL;
362362
}
363-
ret = gb18030uni_wctomb(conv,r,wc,n);
363+
ret = gb18030_2005_uni_wctomb(conv,r,wc,n);
364364
if (ret != RET_ILUNI)
365365
return ret;
366366

0 commit comments

Comments
 (0)