[UTF16LE]
[reactos.git] / reactos / tools / utf16le / utf16le.cpp
/*
 * Usage: utf16le inputfile outputfile
 *
 * This is a tool and is compiled using the host compiler,
 * i.e. on Linux gcc and not mingw-gcc (cross-compiler).
 * It's a converter from utf-8, utf-16 (LE/BE) and utf-32 (LE/BE)
 * to utf-16 LE, made especially for the automatic conversion of
 * INF files from utf-8 to utf-16LE (so that we can keep storing
 * the INF files as utf-8 in subversion).
 *
 * Author: Matthias Kupfer (mkupfer@reactos.org)
 */
13
14 #include <fstream>
15 #include <iostream>
16
17 //#define DISPLAY_DETECTED_UNICODE
18
19 using namespace std;
20
// Converts a UTF-8, UTF-16 (LE/BE) or UTF-32 (LE/BE) input file into a
// UTF-16LE output file that starts with a byte order mark.
//
// Typical use: construct with the two file names, check getError(), call
// convert2utf16le(), then check getError() again.
//
// Malformed input does not abort the conversion: every undecodable sequence
// is reported on stderr, written as U+FFFD (REPLACEMENT CHARACTER) and
// recorded as the "decode" error.
class utf_converter
{
public:
    // "detect" selects the encoding from the BOM of the input file; input
    // without a recognised BOM is treated as UTF-8.  The byte sequence
    // FF FE 00 00 is ambiguous (UTF-16LE BOM followed by U+0000, or the
    // UTF-32LE BOM) and is taken to be UTF-32LE.
    enum enc_types { detect, utf8, utf16le, utf16be, utf32le, utf32be };
    // iopen/oopen are set by the constructor; read, write and decode are set
    // by convert2utf16le().  eof is not set anywhere in this class.
    enum err_types { none, iopen, oopen, eof, read, write, decode };
protected:
    // Outcome of an attempt to decode one code point from the input.
    enum read_status
    {
        cp_ok,      // a valid code point was produced
        cp_end,     // the input ended cleanly on a character boundary
        cp_invalid  // malformed or truncated sequence (input was consumed)
    };

    err_types error;
    enc_types encoding;
    // "buffer" holds the (up to) four bytes read while probing for a BOM.
    // "fill" is the number of bytes in it that still have to be delivered
    // and "index" is the position of the next one.
    unsigned char buffer[4], fill, index;
    std::fstream inputfile, outputfile;
    // Indexed by the low six bits of a UTF-8 lead byte (11xxxxxx); see the
    // definition below the class for the entry layout.
    static const unsigned char utf8table[64];

    // True for Unicode scalar values: at most U+10FFFF and not a surrogate.
    static bool isScalarValue(unsigned long cp)
    {
        return cp <= 0x10ffffUL && (cp < 0xd800UL || cp > 0xdfffUL);
    }

    // Gives the byte most recently returned by getByte() back to the input,
    // so the next getByte() delivers it again.  Only one byte of push-back
    // is needed by the decoders, which always fits into the 4 byte buffer.
    void ungetByte(unsigned char c)
    {
        index = (index + 3) % 4; // one slot back in the ring
        buffer[index] = c;
        ++fill;
    }

    // Reads four bytes and combines them according to the current encoding
    // (little endian for utf32le, big endian otherwise).  Returns the number
    // of bytes read; "d" is only valid if that number is 4.
    int readDWord(unsigned long &d)
    {
        unsigned char c[4];
        for (int i = 0; i < 4; ++i)
            if (!getByte(c[i]))
                return i;
        d = 0;
        for (int i = 0; i < 4; ++i)
        {
            // most significant byte first: that is the last byte for LE
            const unsigned char byte = (encoding == utf32le) ? c[3 - i] : c[i];
            d = (d << 8) | byte;
        }
        return 4;
    }

    // Decodes one UTF-8 sequence.  Rejects stray continuation bytes, lead
    // bytes announcing more than three continuation bytes, missing or
    // malformed continuation bytes, overlong forms, surrogates and values
    // above U+10FFFF.
    read_status decodeUtf8(unsigned long &cp)
    {
        // smallest code point that really needs 1, 2 or 3 continuation bytes
        static const unsigned long minimum[4] = { 0, 0x80, 0x800, 0x10000 };
        unsigned char lead;
        if (!getByte(lead))
            return cp_end;
        if (!(lead & 0x80))
        {
            cp = lead; // plain ASCII
            return cp_ok;
        }
        if ((lead & 0xc0) != 0xc0)
        {
            std::cerr << "UTF-8 Error: invalid data byte" << std::endl;
            return cp_invalid;
        }
        const unsigned char entry = utf8table[lead & 0x3f];
        const int following = entry & 7;
        if (following > 3)
        {
            std::cerr << "UTF-8 Error: invalid lead byte" << std::endl;
            return cp_invalid;
        }
        cp = entry >> 3;
        for (int i = 0; i < following; ++i)
        {
            unsigned char next;
            if (!getByte(next))
            {
                std::cerr << "UTF-8 Error: truncated sequence" << std::endl;
                return cp_invalid;
            }
            if ((next & 0xc0) != 0x80)
            {
                // not part of this sequence: decode it on its own next time
                ungetByte(next);
                std::cerr << "UTF-8 Error: invalid continuation byte" << std::endl;
                return cp_invalid;
            }
            cp = (cp << 6) | (next & 0x3f);
        }
        if (cp < minimum[following] || !isScalarValue(cp))
        {
            std::cerr << "UTF-8 Error: invalid code point" << std::endl;
            return cp_invalid;
        }
        return cp_ok;
    }

    // Decodes one UTF-16 code unit or surrogate pair (byte order taken from
    // the current encoding).
    read_status decodeUtf16(unsigned long &cp)
    {
        unsigned short first;
        const int got = getWord(first);
        if (got == 0)
            return cp_end;
        if (got != 2)
        {
            std::cerr << "UTF-16 Error: truncated code unit" << std::endl;
            return cp_invalid;
        }
        if ((first & 0xfc00) == 0xdc00)
        {
            std::cerr << "UTF-16 Error: unexpected low surrogate" << std::endl;
            return cp_invalid;
        }
        if ((first & 0xfc00) != 0xd800)
        {
            cp = first;
            return cp_ok;
        }
        // high surrogate: a low surrogate has to follow
        unsigned short second;
        if (getWord(second) != 2)
        {
            std::cerr << "UTF-16 Error: truncated surrogate pair" << std::endl;
            return cp_invalid;
        }
        if ((second & 0xfc00) != 0xdc00)
        {
            std::cerr << "UTF-16 Error: invalid low surrogate" << std::endl;
            return cp_invalid;
        }
        cp = 0x10000UL + ((first & 0x3ffUL) << 10) + (second & 0x3ffUL);
        return cp_ok;
    }

    // Decodes one UTF-32 unit (byte order taken from the current encoding).
    read_status decodeUtf32(unsigned long &cp)
    {
        const int got = readDWord(cp);
        if (got == 0)
            return cp_end;
        if (got != 4)
        {
            std::cerr << "UTF-32 Error: truncated code unit" << std::endl;
            return cp_invalid;
        }
        if (!isScalarValue(cp))
        {
            std::cerr << "UTF-32 Error: invalid code point" << std::endl;
            return cp_invalid;
        }
        return cp_ok;
    }

    // Decodes the next code point of the input in the current encoding.
    // A still undetermined encoding ("detect") falls back to UTF-8.
    read_status nextCodePoint(unsigned long &cp)
    {
        switch (encoding)
        {
            case detect: // if still unknown
                encoding = utf8; // assume utf8 as default
                return decodeUtf8(cp);
            case utf8:
                return decodeUtf8(cp);
            case utf16le:
            case utf16be:
                return decodeUtf16(cp);
            case utf32le:
            case utf32be:
                return decodeUtf32(cp);
        }
        return cp_end; // unknown encoding value: nothing can be decoded
    }

    // Writes one 16 bit code unit to the output, low byte first.
    void putUnit(unsigned long unit)
    {
        const unsigned char bytes[2] = {
            static_cast<unsigned char>(unit & 0xff),
            static_cast<unsigned char>((unit >> 8) & 0xff)
        };
        outputfile.write(reinterpret_cast<const char*>(bytes), 2);
    }
public:
    // Opens both files and determines the input encoding.
    //   ifname  input file
    //   ofname  output file (created or truncated)
    //   enc     input encoding; "detect" derives it from the BOM, any other
    //           value is forced (a warning is printed if the BOM disagrees)
    // On failure getError() returns iopen or oopen.
    utf_converter(std::string ifname, std::string ofname, enc_types enc = detect)
        : error(none), encoding(enc), buffer(), fill(0), index(0)
    {
        inputfile.open(ifname.c_str(), std::ios::in | std::ios::binary);
        if (!inputfile)
        {
            error = iopen;
            return;
        }
        outputfile.open(ofname.c_str(), std::ios::out | std::ios::binary);
        if (!outputfile)
        {
            error = oopen;
            return;
        }
        const enc_types bom_enc = getBOM();
        if (enc == detect)
            encoding = bom_enc;
        else if (enc != bom_enc)
            std::cerr << "Warning: UTF-BOM doesn't match encoding setting, but given encoding forced" << std::endl;
    }

    // Returns the first/most severe error recorded so far (none if all is well).
    err_types getError() const
    {
        return error;
    }

    // Reads up to four bytes from the input into "buffer" and looks for a
    // BOM.  A recognised BOM is skipped; every other byte stays queued for
    // getByte().  Returns the encoding announced by the BOM, or utf8 when
    // there is none.
    enc_types getBOM()
    {
        index = 0;
        /* first byte can also detect with:
           if ((buffer[0] & 0x11) || !buffer[0]))
           valid values are 0xef, 0xff, 0xfe, 0x00
        */
        inputfile.read(reinterpret_cast<char*>(buffer), 4);
        fill = static_cast<unsigned char>(inputfile.gcount());
        // stupid utf8 bom
        if ((fill > 2) &&
            (buffer[0] == 0xef) &&
            (buffer[1] == 0xbb) &&
            (buffer[2] == 0xbf))
        {
            index += 3;
            fill -= 3;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-8 BOM found" << std::endl;
#endif
            return utf8;
        }
        if ((fill > 1) &&
            (buffer[0] == 0xfe) &&
            (buffer[1] == 0xff))
        {
            index += 2;
            fill -= 2;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-16BE BOM found" << std::endl;
#endif
            return utf16be;
        }
        if ((fill > 1) &&
            (buffer[0] == 0xff) &&
            (buffer[1] == 0xfe))
        {
            if ((fill == 4) &&
                (buffer[2] == 0x00) &&
                (buffer[3] == 0x00))
            {
                std::cerr << "UTF Error: ambiguous BOM UTF-16 or UTF-32; assume UTF-32" << std::endl;
                fill = 0; // all four bytes belong to the BOM
                index = 0;
                return utf32le;
            }
            fill -= 2;
            index += 2;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-16LE BOM found" << std::endl;
#endif
            return utf16le;
        }
        if ((fill == 4) &&
            (buffer[0] == 0x00) &&
            (buffer[1] == 0x00) &&
            (buffer[2] == 0xfe) &&
            (buffer[3] == 0xff))
        {
            fill = 0; // all four bytes belong to the BOM
            index = 0;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-32BE BOM found" << std::endl;
#endif
            return utf32be;
        }
        return utf8; // no valid bom so use utf8 as default
    }

    // Delivers the next input byte, first from the BOM probe buffer and then
    // from the file.  Returns 1 on success and 0 if no byte is available.
    int getByte(unsigned char &c)
    {
        if (fill)
        {
            index %= 4;
            --fill;
            c = buffer[index++];
            return 1;
        }
        inputfile.read(reinterpret_cast<char*>(&c), 1);
        return static_cast<int>(inputfile.gcount());
    }

    // Reads a 16 bit unit (little endian for utf16le, big endian otherwise).
    // Returns the number of bytes read; "w" is only set if that number is 2.
    int getWord(unsigned short &w)
    {
        unsigned char c[2];
        if (!getByte(c[0]))
            return 0;
        if (!getByte(c[1]))
            return 1;
        if (encoding == utf16le)
            w = static_cast<unsigned short>(c[0] | (c[1] << 8));
        else
            w = static_cast<unsigned short>(c[1] | (c[0] << 8));
        return 2;
    }

    // Reads a 32 bit unit (little endian for utf32le, big endian otherwise).
    // Returns the number of bytes read; "d" is only set if that number is 4.
    int getDWord(wchar_t &d)
    {
        unsigned long value;
        const int got = readDWord(value);
        if (got == 4)
            d = static_cast<wchar_t>(value);
        return got;
    }

    // Decodes the next character of the input.  Returns wchar_t(-1) both at
    // the end of the input and for malformed data.  On hosts with a 16 bit
    // wchar_t, code points above U+FFFF do not fit into the return value.
    wchar_t get_wchar_t()
    {
        unsigned long cp;
        if (nextCodePoint(cp) != cp_ok)
            return wchar_t(-1);
        return static_cast<wchar_t>(cp);
    }

    // Writes the UTF-16LE BOM followed by the whole remaining input,
    // re-encoded as UTF-16LE.  Code points above U+FFFF are written as
    // surrogate pairs.  Afterwards getError() reports decode (malformed
    // input was replaced by U+FFFD), read (input stream failure) or write
    // (output stream failure); a later condition in that list overrides an
    // earlier one.
    void convert2utf16le()
    {
        putUnit(0xfeff); // BOM, stored as FF FE

        unsigned long cp = 0;
        read_status status;
        // The end of the input is taken from the decoder and not from
        // inputfile.eof(): the stream is already at EOF after the BOM probe
        // when the file is shorter than four bytes, while bytes are still
        // queued in "buffer".
        while (outputfile && (status = nextCodePoint(cp)) != cp_end)
        {
            if (status == cp_invalid)
            {
                error = decode;
                cp = 0xfffd;
            }
            if (cp >= 0x10000UL)
            {
                cp -= 0x10000UL;
                putUnit(0xd800UL | (cp >> 10));    // high surrogate
                putUnit(0xdc00UL | (cp & 0x3ffUL)); // low surrogate
            }
            else
                putUnit(cp);
        }
        if (inputfile.bad())
            error = read;
        outputfile.flush();
        if (!outputfile)
            error = write;
    }
    // No destructor: the file streams close themselves when destroyed.
};

// Lookup table for UTF-8 lead bytes, indexed by (lead byte & 0x3f), i.e. it
// covers all bytes of the form 11xxxxxx (>= 192).  Entry layout:
//   lower 3 bits: number of following bytes announced by the lead byte
//   upper 5 bits: payload bits carried by the lead byte itself
// The last entries announce 4 to 7 following bytes (lead bytes 0xf8..0xff).
const unsigned char utf_converter::utf8table[64] = {
    1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
    129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249,
    2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
    3, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20, 28, 5, 13, 6, 7
};
260
261
262 int main(int argc, char* argv[])
263 {
264 utf_converter::err_types err;
265 if (argc < 3)
266 {
267 cout << "usage: " << argv[0] << " inputfile outputfile" << endl;
268 return -1;
269 }
270 utf_converter conv(argv[1],argv[2]);
271 if ((err = conv.getError())!=utf_converter::none)
272 {
273 switch (err)
274 {
275 case utf_converter::iopen:
276 cerr << "Couldn't open input file." << endl;
277 break;
278 case utf_converter::oopen:
279 cerr << "Couldn't open output file." << endl;
280 break;
281 default:
282 cerr << "Unknown error." << endl;
283 }
284 return -1;
285 } else
286 conv.convert2utf16le();
287 return 0;
288 }
289
290 // vim:set ts=4 sw=4: