Skip to content

Commit 5babd6b

Browse files
committed
Update JAVA encoding for UCS-4.
1 parent 485041c commit 5babd6b

File tree

2 files changed

+66
-6
lines changed

2 files changed

+66
-6
lines changed

ChangeLog

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
2002-05-26 Bruno Haible <bruno@clisp.org>
2+
3+
* lib/java.h (java_mbtowc): Accept 12-byte sequences for non-BMP
4+
characters.
5+
(java_wctomb): Produce 12-byte sequences for non-BMP characters.
6+
17
2002-05-29 Bruno Haible <bruno@clisp.org>
28

39
Fix installation of iconv program when linked with a libintl that was

lib/java.h

Lines changed: 60 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 1999-2000 Free Software Foundation, Inc.
2+
* Copyright (C) 1999-2002 Free Software Foundation, Inc.
33
* This file is part of the GNU LIBICONV Library.
44
*
55
* The GNU LIBICONV Library is free software; you can redistribute it
@@ -20,14 +20,16 @@
2020

2121
/*
2222
* JAVA
23-
* This is ISO 8859-1 with \uXXXX escape sequences, denoting Unicode characters.
23+
* This is ISO 8859-1 with \uXXXX escape sequences, denoting Unicode BMP
24+
* characters. Consecutive pairs of \uXXXX escape sequences in the surrogate
25+
* range (a high surrogate followed by a low surrogate, as in UTF-16) denote Unicode characters outside the BMP.
2426
*/
2527

2628
static int
2729
java_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
2830
{
2931
unsigned char c;
30-
ucs4_t wc;
32+
ucs4_t wc, wc2;
3133
int i;
3234

3335
c = s[0];
@@ -54,8 +56,39 @@ java_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
5456
goto simply_backslash;
5557
wc |= (ucs4_t) c << (4 * (5-i));
5658
}
57-
*pwc = wc;
58-
return 6;
59+
if (!(wc >= 0xd800 && wc < 0xe000)) {
60+
*pwc = wc;
61+
return 6;
62+
}
63+
if (wc >= 0xdc00)
64+
goto simply_backslash;
65+
if (n < 7)
66+
return RET_TOOFEW(0);
67+
if (s[6] != '\\')
68+
goto simply_backslash;
69+
if (n < 8)
70+
return RET_TOOFEW(0);
71+
if (s[7] != 'u')
72+
goto simply_backslash;
73+
wc2 = 0;
74+
for (i = 8; i < 12; i++) {
75+
if (n <= i)
76+
return RET_TOOFEW(0);
77+
c = s[i];
78+
if (c >= '0' && c <= '9')
79+
c -= '0';
80+
else if (c >= 'A' && c <= 'Z')
81+
c -= 'A'-10;
82+
else if (c >= 'a' && c <= 'z')
83+
c -= 'a'-10;
84+
else
85+
goto simply_backslash;
86+
wc2 |= (ucs4_t) c << (4 * (11-i));
87+
}
88+
if (!(wc2 >= 0xdc00 && wc2 < 0xe000))
89+
goto simply_backslash;
90+
*pwc = 0x10000 + ((wc - 0xd800) << 10) + (wc2 - 0xdc00);
91+
return 12;
5992
simply_backslash:
6093
*pwc = '\\';
6194
return 1;
@@ -67,7 +100,7 @@ java_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)
67100
if (wc < 0x80) {
68101
*r = wc;
69102
return 1;
70-
} else {
103+
} else if (wc < 0x10000) {
71104
if (n >= 6) {
72105
unsigned int i;
73106
r[0] = '\\';
@@ -79,5 +112,26 @@ java_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)
79112
return 6;
80113
} else
81114
return RET_TOOSMALL;
115+
} else if (wc < 0x110000) {
116+
if (n >= 12) {
117+
ucs4_t wc1 = 0xd800 + ((wc - 0x10000) >> 10);
118+
ucs4_t wc2 = 0xdc00 + ((wc - 0x10000) & 0x3ff);
119+
unsigned int i;
120+
r[0] = '\\';
121+
r[1] = 'u';
122+
i = (wc1 >> 12) & 0x0f; r[2] = (i < 10 ? '0'+i : 'a'-10+i);
123+
i = (wc1 >> 8) & 0x0f; r[3] = (i < 10 ? '0'+i : 'a'-10+i);
124+
i = (wc1 >> 4) & 0x0f; r[4] = (i < 10 ? '0'+i : 'a'-10+i);
125+
i = wc1 & 0x0f; r[5] = (i < 10 ? '0'+i : 'a'-10+i);
126+
r[6] = '\\';
127+
r[7] = 'u';
128+
i = (wc2 >> 12) & 0x0f; r[8] = (i < 10 ? '0'+i : 'a'-10+i);
129+
i = (wc2 >> 8) & 0x0f; r[9] = (i < 10 ? '0'+i : 'a'-10+i);
130+
i = (wc2 >> 4) & 0x0f; r[10] = (i < 10 ? '0'+i : 'a'-10+i);
131+
i = wc2 & 0x0f; r[11] = (i < 10 ? '0'+i : 'a'-10+i);
132+
return 12;
133+
} else
134+
return RET_TOOSMALL;
82135
}
136+
return RET_ILUNI;
83137
}

0 commit comments

Comments
 (0)