|
1 | | - |
2 | | - |
3 | 1 | # cppp-reiconv |
4 | | -A character set conversion library based on GNU LIBICONV. |
5 | 2 |
|
6 | | -# Instruction |
7 | | -This library provides some charset conversation function for The C++ Plus Project. |
| 3 | +<img alt="C+++" src="https://avatars.githubusercontent.com/u/130828670" width="10%"> |
| 4 | + |
| 5 | +A character set conversion library based on GNU LIBICONV. **Supports C and C++20.** |
| 6 | + |
| 7 | +If your C++ standard is lower than C++20, some code needs to be modified. |
| 8 | + |
| 9 | +## Build |
| 10 | + |
| 11 | +This library requires `build-aux` and `cppp-platform` for building. |
| 12 | + |
| 13 | +See [docs/](docs/README.md) for full requirements list. |
| 14 | + |
| 15 | +**We are planning to use [Rubisco](https://github.com/cppp-project/rubisco) to manage submodules, but it is not ready yet, so we do not use git submodules for now. Please clone them manually.** |
| 16 | + |
| 17 | +Use the following command to fetch source code: |
| 18 | + |
| 19 | +```shell |
| 20 | +git clone https://github.com/cppp-project/cppp-reiconv |
| 21 | +cd cppp-reiconv |
| 22 | +git clone https://github.com/cppp-project/build-aux --depth 1 |
| 23 | +git clone https://github.com/cppp-project/rubisco --depth 1 |
| 24 | +``` |
| 25 | + |
| 26 | +Use the following command to build and install. |
| 27 | + |
| 28 | +```shell |
| 29 | +mkdir build |
| 30 | +cd build |
| 31 | +cmake .. -DCMAKE_BUILD_TYPE=[[BUILD_TYPE]] -DCMAKE_INSTALL_PREFIX=[[PREFIX]] |
| 32 | +cmake --build . --config=[[BUILD_TYPE]] |
| 33 | +cmake --install . --config=[[BUILD_TYPE]] |
| 34 | +``` |
| 35 | + |
| 36 | +## Simple Usage |
| 37 | + |
| 38 | +```cpp |
| 39 | +#include <cppp/reiconv.hpp> |
| 40 | + |
| 41 | +#include <cstdlib> |
| 42 | +#include <iostream> |
| 43 | + |
| 44 | +int main() |
| 45 | +{ |
| 46 | + const std::string_view src = "\xb8\xfc\xcf\xb2\xe1\xba\xc9\xbd\xc7\xa7\xc0\xef\xd1\xa9\xa3\xac\xc8\xfd\xbe\xfc\xb9\xfd\xba\xf3\xbe\xa1\xbf\xaa\xd1\xd5\xa3\xa1"; |
| 47 | + |
| 48 | + std::string result = reiconv::convert("GB18030", "UTF-8", src); |
8 | 49 |
|
9 | | -It provides support for the encodings: |
| 50 | + std::cout << result << std::endl; |
| 51 | + |
| 52 | + const std::string_view correct_result = "\u66f4\u559c\u5cb7\u5c71\u5343\u91cc\u96ea\uff0c\u4e09\u519b\u8fc7\u540e\u5c3d\u5f00\u989c\uff01"; |
| 53 | + |
| 54 | + if (result == correct_result) |
| 55 | + { |
| 56 | + std::cout << "Correct!" << std::endl; |
| 57 | + } |
| 58 | + else |
| 59 | + { |
| 60 | + // If it happens, please report a bug. |
| 61 | + std::cout << "Wrong!" << std::endl; |
| 62 | + } |
| 63 | + |
| 64 | + return EXIT_SUCCESS; |
| 65 | +} |
| 66 | +``` |
| 67 | + |
| 68 | +The full documentation can be found in [docs/](docs/README.md). |
| 69 | + |
| 70 | +## Supported Encodings |
| 71 | + |
| 72 | +Same as GNU LIBICONV, it provides support for the following encodings: |
10 | 73 |
|
11 | 74 | + European languages |
12 | | - - ASCII, ISO-8859-{1,2,3,4,5,7,9,10,13,14,15,16}, |
13 | | - - KOI8-R, KOI8-U, KOI8-RU, |
14 | | - - CP{1250,1251,1252,1253,1254,1257}, CP{850,866,1131}, |
15 | | - - Mac{Roman,CentralEurope,Iceland,Croatian,Romania}, |
16 | | - - Mac{Cyrillic,Ukraine,Greek,Turkish}, |
17 | | - - Macintosh |
| 75 | + + ASCII, ISO-8859-{1,2,3,4,5,7,9,10,13,14,15,16}, |
| 76 | + + KOI8-R, KOI8-U, KOI8-RU, |
| 77 | + + CP{1250,1251,1252,1253,1254,1257}, CP{850,866,1131}, |
| 78 | + + Mac{Roman,CentralEurope,Iceland,Croatian,Romania}, |
| 79 | + + Mac{Cyrillic,Ukraine,Greek,Turkish}, |
| 80 | + + Macintosh |
18 | 81 | + Semitic languages |
19 | | - - ISO-8859-{6,8}, CP{1255,1256}, CP862, Mac{Hebrew,Arabic} |
| 82 | + + ISO-8859-{6,8}, CP{1255,1256}, CP862, Mac{Hebrew,Arabic} |
20 | 83 | + Japanese |
21 | | - - EUC-JP, SHIFT_JIS, CP932, ISO-2022-JP, ISO-2022-JP-2, ISO-2022-JP-1, |
22 | | - - ISO-2022-JP-MS |
| 84 | + + EUC-JP, SHIFT_JIS, CP932, ISO-2022-JP, ISO-2022-JP-2, ISO-2022-JP-1, |
| 85 | + + ISO-2022-JP-MS |
23 | 86 | + Chinese |
24 | | - - EUC-CN, HZ, GBK, CP936, GB18030, GB18030:2022, EUC-TW, BIG5, CP950, |
25 | | - - BIG5-HKSCS, BIG5-HKSCS:2004, BIG5-HKSCS:2001, BIG5-HKSCS:1999, |
26 | | - - ISO-2022-CN, ISO-2022-CN-EXT |
| 87 | + + EUC-CN, HZ, GBK, CP936, GB18030, GB18030:2022, EUC-TW, BIG5, CP950, |
| 88 | + + BIG5-HKSCS, BIG5-HKSCS:2004, BIG5-HKSCS:2001, BIG5-HKSCS:1999, |
| 89 | + + ISO-2022-CN, ISO-2022-CN-EXT |
27 | 90 | + Korean |
28 | | - - EUC-KR, CP949, ISO-2022-KR, JOHAB |
| 91 | + + EUC-KR, CP949, ISO-2022-KR, JOHAB |
29 | 92 | + Armenian |
30 | | - - ARMSCII-8 |
| 93 | + + ARMSCII-8 |
31 | 94 | + Georgian |
32 | | - - Georgian-Academy, Georgian-PS |
| 95 | + + Georgian-Academy, Georgian-PS |
33 | 96 | + Tajik |
34 | | - - KOI8-T |
| 97 | + + KOI8-T |
35 | 98 | + Kazakh |
36 | | - - PT154, RK1048 |
| 99 | + + PT154, RK1048 |
37 | 100 | + Thai |
38 | | - - ISO-8859-11, TIS-620, CP874, MacThai |
| 101 | + + ISO-8859-11, TIS-620, CP874, MacThai |
39 | 102 | + Laotian |
40 | | - - MuleLao-1, CP1133 |
| 103 | + + MuleLao-1, CP1133 |
41 | 104 | + Vietnamese |
42 | | - - VISCII, TCVN, CP1258 |
| 105 | + + VISCII, TCVN, CP1258 |
43 | 106 | + Platform specifics |
44 | | - - HP-ROMAN8, NEXTSTEP |
| 107 | + + HP-ROMAN8, NEXTSTEP |
45 | 108 | + Full Unicode |
46 | | - - UTF-8 |
47 | | - - UCS-2, UCS-2BE, UCS-2LE |
48 | | - - UCS-4, UCS-4BE, UCS-4LE |
49 | | - - UTF-16, UTF-16BE, UTF-16LE |
50 | | - - UTF-32, UTF-32BE, UTF-32LE |
51 | | - - UTF-7 |
52 | | - - C99, JAVA |
| 109 | + + UTF-8 |
| 110 | + + UCS-2, UCS-2BE, UCS-2LE |
| 111 | + + UCS-4, UCS-4BE, UCS-4LE |
| 112 | + + UTF-16, UTF-16BE, UTF-16LE |
| 113 | + + UTF-32, UTF-32BE, UTF-32LE |
| 114 | + + UTF-7 |
| 115 | + + C99, JAVA |
53 | 116 | + Full Unicode, in terms of 'uint16_t' or 'uint32_t' |
54 | | - - (with machine dependent endianness and alignment) |
55 | | - - UCS-2-INTERNAL, UCS-4-INTERNAL |
| 117 | + + UCS-2-INTERNAL, UCS-4-INTERNAL (with machine dependent endianness and alignment) |
56 | 118 |
|
57 | | -When configured with the option -DENABLE_EXTRA, it also provides |
58 | | -support for a few extra encodings: |
| 119 | +It also supports some extra encodings. These encodings are GNU LIBICONV's extra encodings. |
59 | 120 |
|
60 | 121 | + European languages |
61 | | - - CP{437,737,775,852,853,855,857,858,860,861,863,865,869,1125} |
| 122 | + + CP{437,737,775,852,853,855,857,858,860,861,863,865,869,1125} |
62 | 123 | + Semitic languages |
63 | | - - CP864 |
| 124 | + + CP864 |
64 | 125 | + Japanese |
65 | | - - EUC-JISX0213, Shift_JISX0213, ISO-2022-JP-3 |
| 126 | + + EUC-JISX0213, Shift_JISX0213, ISO-2022-JP-3 |
66 | 127 | + Chinese |
67 | | - - BIG5-2003 (experimental) |
| 128 | + + BIG5-2003 (experimental) |
68 | 129 | + Turkmen |
69 | | - - TDS565 |
| 130 | + + TDS565 |
70 | 131 | + Platform specifics |
71 | | - - ATARIST, RISCOS-LATIN1 |
| 132 | + + ATARIST, RISCOS-LATIN1 |
72 | 133 | + EBCDIC compatible (not ASCII compatible, very rarely used) |
73 | | - - European languages |
74 | | - - - IBM-{037,273,277,278,280,282,284,285,297,423,500,870,871,875,880}, |
75 | | - - - IBM-{905,924,1025,1026,1047,1112,1122,1123,1140,1141,1142,1143}, |
76 | | - - - IBM-{1144,1145,1146,1147,1148,1149,1153,1154,1155,1156,1157,1158}, |
77 | | - - - IBM-{1165,1166,4971} |
78 | | - - Semitic languages |
79 | | - - - IBM-{424,425,12712,16804} |
80 | | - - Persian |
81 | | - - - IBM-1097 |
82 | | - - Thai |
83 | | - - - IBM-{838,1160} |
84 | | - - Laotian |
85 | | - - - IBM-1132 |
86 | | - - Vietnamese |
87 | | - - - IBM-{1130,1164} |
88 | | - - Indic languages |
89 | | - - - IBM-1137 |
| 134 | + + European languages |
| 135 | + + IBM-{037,273,277,278,280,282,284,285,297,423,500,870,871,875,880}, |
| 136 | + + IBM-{905,924,1025,1026,1047,1112,1122,1123,1140,1141,1142,1143}, |
| 137 | + + IBM-{1144,1145,1146,1147,1148,1149,1153,1154,1155,1156,1157,1158}, |
| 138 | + + IBM-{1165,1166,4971} |
| 139 | + + Semitic languages |
| 140 | + + IBM-{424,425,12712,16804} |
| 141 | + + Persian |
| 142 | + + IBM-1097 |
| 143 | + + Thai |
| 144 | + + IBM-{838,1160} |
| 145 | + + Laotian |
| 146 | + + IBM-1132 |
| 147 | + + Vietnamese |
| 148 | + + IBM-{1130,1164} |
| 149 | + + Indic languages |
| 150 | + + IBM-1137 |
90 | 151 |
|
91 | 152 | It can convert from any of these encodings to any other, through Unicode |
92 | 153 | conversion. |
93 | 154 |
|
94 | | -cppp-reiconv is for you if your application needs to support multiple character |
95 | | -encodings, but that support lacks from your system. |
96 | | - |
97 | | - |
98 | | -# Build and installation |
| 155 | +## Build Options |
99 | 156 |
|
100 | 157 | We use CMake to build. |
101 | 158 |
|
102 | | -+ Standard build |
103 | | -```shell |
104 | | -mkdir build |
105 | | -cd build |
106 | | -cmake .. -DCMAKE_INSTALL_PREFIX=[[PREFIX]] |
107 | | -cmake --build . --config=Release |
108 | | -cmake --install . |
109 | | -``` |
| 159 | ++ `BUILD_TESTING`: Build tests. Default is ON. |
110 | 160 |
|
111 | | -+ Full build |
112 | 161 | ```shell |
113 | | -mkdir build |
114 | | -cd build |
115 | | -cmake .. -DCMAKE_INSTALL_PREFIX=[[PREFIX]] -DENABLE_EXTRA=ON |
116 | | -cmake --build . --config=Release |
117 | | -cmake --install . |
| 162 | +cmake .. -DBUILD_TESTING=ON |
| 163 | +cmake --build . --config=RelWithDebInfo |
| 164 | +ctest -C RelWithDebInfo --output-on-failure |
118 | 165 | ``` |
119 | 166 |
|
120 | | -When use full build, extra encodings will be enabled. |
| 167 | ++ `ICONV_COMPAT`: Enable compatibility with iconv. Default is OFF. |
121 | 168 |
|
122 | | -+ Build with test suite |
123 | | -```shell |
124 | | -mkdir build |
125 | | -cd build |
126 | | -cmake .. -DCMAKE_INSTALL_PREFIX=[[PREFIX]] -DENABLE_EXTRA=ON -DENABLE_TEST=ON |
127 | | -cmake --build . --config=Release |
128 | | -cmake --install . |
129 | | -``` |
| 169 | +If `ICONV_COMPAT` is ON, it will install `iconv.h` with basic functions. |
| 170 | +But we do not support all features of POSIX:2024. See [TODO](TODO). |
| 171 | + |
| 172 | +## Install |
130 | 173 |
|
131 | | -# Install |
132 | 174 | This library installs: |
133 | | - - a shared library 'libcppp-reiconv'. |
134 | | - - a static library 'libcppp-reiconv.static'. |
135 | | - - a header file '<cppp/reiconv.hpp>'. |
136 | 175 |
|
137 | | -To use it, simply #include <cppp/reiconv.hpp> and import the lib to use the functions. |
| 176 | ++ A shared library `libcppp-reiconv`. |
| 177 | ++ A static library `libcppp-reiconv.static`. |
| 178 | ++ Header files |
| 179 | + |
| 180 | +```text |
| 181 | +include |
| 182 | +├── cppp |
| 183 | +│ ├── cppp-platform.h |
| 184 | +│ ├── encodings |
| 185 | +│ │ ├── reiconv.h |
| 186 | +│ │ └── reiconv.hpp |
| 187 | +│ ├── reiconv.h |
| 188 | +│ └── reiconv.hpp |
| 189 | +└── iconv.h # Only if ICONV_COMPAT is ON |
| 190 | +``` |
138 | 191 |
|
139 | | -# Copyright |
| 192 | +## Copyright |
140 | 193 |
|
141 | 194 | The cppp-reiconv is under LGPLv3, |
142 | 195 | see file [LICENSE](./LICENSE). |
143 | 196 |
|
144 | | -# Download |
| 197 | +## Download |
| 198 | + |
| 199 | +See <https://github.com/cppp-project/cppp-reiconv/releases> |
145 | 200 |
|
146 | | -https://github.com/cppp-project/cppp-reiconv/releases |
| 201 | +## Homepage |
147 | 202 |
|
148 | | -# Homepage |
| 203 | +<https://github.com/cppp-project/cppp-reiconv> |
149 | 204 |
|
150 | | -https://github.com/cppp-project/cppp-reiconv |
| 205 | +## Bug reports |
151 | 206 |
|
152 | | -# Bug reports |
153 | | - + Create a issue on GitHub [Report now](https://github.com/cppp-project/cppp-reiconv/issues/new/) |
| 207 | ++ Create an issue on GitHub [Report now](https://github.com/cppp-project/cppp-reiconv/issues/new/) |
0 commit comments