[TOOLS] Fix/suppress all MSVC/x64 warnings (#1525)
[reactos.git] / sdk / tools / utf16le / utf16le.cpp
1 /*
2 * Usage: utf16le inputfile outputfile [nobom]
3 *
4 * This is a tool and is compiled using the host compiler,
5 * i.e. on Linux gcc and not mingw-gcc (cross-compiler).
6 * It's a converter from utf-8, utf-16 (LE/BE) and utf-32 (LE/BE)
7 * to utf-16 LE and especially made for automatic conversions of
8 * INF-files from utf-8 to utf-16LE (so that we can continue
9 * to store the INF files in utf-8 for subversion).
10 *
11 * Author: Matthias Kupfer (mkupfer@reactos.org)
12 */
13
14 #include <fstream>
15 #include <iostream>
16 #include <string.h>
17
18 //#define DISPLAY_DETECTED_UNICODE
19
20 using namespace std;
21
22 #ifdef _MSC_VER
23 #define strcasecmp _stricmp
24 #endif
25
/*
 * utf_converter reads a text file encoded as UTF-8, UTF-16 (LE/BE) or
 * UTF-32 (LE/BE) and writes its content as UTF-16 LE.
 *
 * The constructor opens both files and inspects the first bytes of the input
 * for a byte order mark (BOM); convert2utf16le() then performs the actual
 * conversion.  Failures are reported through getError().
 */
class utf_converter
{
public:
    // detect: pick the encoding from the BOM (UTF-8 when there is none).
    // The byte sequence FF FE 00 00 is ambiguous (UTF-16LE BOM followed by a
    // NUL character, or UTF-32LE BOM) and is taken to be UTF-32LE.
    enum enc_types { detect, utf8, utf16le, utf16be, utf32le, utf32be };
    enum err_types { none, iopen, oopen, eof, read, write, decode };
    enum bom_types { bom, nobom };
protected:
    err_types error;
    enc_types encoding;
    bom_types bom_type;
    unsigned char buffer[4], index; // need 4 char buffer for optional BOM handling
    std::streamsize fill;           // bytes in buffer that were read ahead but not consumed yet
    bool at_end;                    // set once a byte was requested and the input had none left
    std::fstream inputfile, outputfile;
    static const unsigned char utf8table[64];

    /*
     * Reads four bytes and combines them according to `encoding`
     * (little endian for utf32le, big endian otherwise).
     * Returns the number of bytes that could be read; `value` is only
     * written when that number is 4.
     */
    std::streamsize readUtf32(unsigned long &value)
    {
        unsigned char c[4];
        for (int i = 0; i < 4; i++)
            if (!getByte(c[i]))
                return i;
        // Widen to unsigned before shifting: shifting a promoted int by 24
        // would move a set top bit into the sign bit.
        const unsigned long b0 = c[0], b1 = c[1], b2 = c[2], b3 = c[3];
        if (encoding == utf32le)
            value = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
        else
            value = b3 | (b2 << 8) | (b1 << 16) | (b0 << 24);
        return 4;
    }

    /*
     * Decodes the next code point of the input into `cp`.
     * Returns false when the input is exhausted (at_end is set in that case)
     * or when the data is malformed (a message is printed to cerr).
     */
    bool getCodePoint(unsigned long &cp)
    {
        switch (encoding)
        {
        case detect: // if still unknown
            encoding = utf8; // assume utf8 as default
            // fall through
        case utf8:
        {
            unsigned char lead;
            if (!getByte(lead))
                return false;
            if (!(lead & 0x80)) // plain 7-bit character
            {
                cp = lead;
                return true;
            }
            if ((lead & 0xc0) != 0xc0) // 10xxxxxx cannot start a sequence
            {
                std::cerr << "UTF-8 Error: invalid data byte" << std::endl;
                return false;
            }
            // table for 64 bytes (all 11xxxxxx resp. >=192)
            // lower 3 bits: number of following bytes
            // upper 5 bits: data bits of the lead byte
            const unsigned char entry = utf8table[lead & 0x3f];
            cp = entry >> 3;
            for (unsigned char pending = static_cast<unsigned char>(entry & 7); pending; --pending)
            {
                unsigned char next;
                if (!getByte(next))
                    return false;
                cp = (cp << 6) | (next & 0x3fUL);
            }
            return true;
        }
        case utf16le:
        case utf16be:
        {
            unsigned short w, w2;
            if (getWord(w) != 2)
                return false;
            if ((w & 0xfc00) != 0xd800) // anything but a high surrogate passes through
            {
                cp = w;
                return true;
            }
            if (getWord(w2) != 2)
                return false;
            if ((w2 & 0xfc00) != 0xdc00)
            {
                std::cerr << "UTF-16 Error: invalid low surrogate" << std::endl;
                return false;
            }
            // adding 0x40 before the shift is the same as adding 0x10000 afterwards
            cp = (((w & 0x3ffUL) + 0x40UL) << 10) | (w2 & 0x3ffUL);
            return true;
        }
        case utf32le:
        case utf32be:
            return readUtf32(cp) == 4;
        }
        return false;
    }

    /* Writes one 16-bit code unit to the output file in little-endian order. */
    void putUnit(unsigned long unit)
    {
        unsigned char bytes[2];
        bytes[0] = static_cast<unsigned char>(unit & 0xff);
        bytes[1] = static_cast<unsigned char>((unit >> 8) & 0xff);
        outputfile.write(reinterpret_cast<const char*>(bytes), 2);
    }
public:
    /*
     * Opens ifname for reading and ofname for writing (both binary) and
     * determines the input encoding.
     *
     * ofbom: whether convert2utf16le() starts the output with a BOM.
     * enc:   input encoding; with `detect` the result of getBOM() is used,
     *        any other value is used as given (a warning is printed when it
     *        differs from what getBOM() found).
     *
     * On failure to open a file, getError() returns iopen resp. oopen.
     */
    utf_converter(const std::string &ifname, const std::string &ofname, bom_types ofbom = bom, enc_types enc = detect)
        : error(none), encoding(enc), bom_type(ofbom), index(0), fill(0), at_end(false)
    {
        inputfile.open(ifname.c_str(), std::ios::in | std::ios::binary);
        if (!inputfile)
        {
            error = iopen;
            return;
        }
        outputfile.open(ofname.c_str(), std::ios::out | std::ios::binary);
        if (!outputfile)
        {
            error = oopen;
            return;
        }
        const enc_types bom_enc = getBOM();
        if (enc != detect)
        {
            if (enc != bom_enc)
                std::cerr << "Warning: UTF-BOM doesn't match encoding setting, but given encoding forced" << std::endl;
        }
        else
            encoding = bom_enc;
    }

    /*
     * Returns the most recent error: iopen/oopen from the constructor, or
     * read/decode/write recorded while converting; none if all went well.
     */
    err_types getError()
    {
        return error;
    }

    /*
     * Reads up to four bytes from the input into `buffer` and looks for a
     * BOM.  Bytes belonging to a recognised BOM are skipped; everything else
     * stays in the buffer (tracked by index/fill) and is handed out by
     * getByte() before the file is read again.
     * Returns the encoding named by the BOM, or utf8 if there is none.
     */
    enc_types getBOM()
    {
        index = 0;
        inputfile.read(reinterpret_cast<char*>(buffer), sizeof(buffer));
        fill = inputfile.gcount();
        // UTF-8 BOM: EF BB BF
        if ((fill > 2) &&
            (buffer[0] == 0xef) &&
            (buffer[1] == 0xbb) &&
            (buffer[2] == 0xbf))
        {
            index += 3;
            fill -= 3;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-8 BOM found" << std::endl;
#endif
            return utf8;
        }
        // UTF-16BE BOM: FE FF
        if ((fill > 1) &&
            (buffer[0] == 0xfe) &&
            (buffer[1] == 0xff))
        {
            index += 2;
            fill -= 2;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-16BE BOM found" << std::endl;
#endif
            return utf16be;
        }
        // UTF-16LE BOM: FF FE -- or the first half of the UTF-32LE BOM
        if ((fill > 1) &&
            (buffer[0] == 0xff) &&
            (buffer[1] == 0xfe))
        {
            if ((fill == 4) &&
                (buffer[2] == 0x00) &&
                (buffer[3] == 0x00))
            {
                std::cerr << "UTF Error: ambiguous BOM UTF-16 or UTF-32; assume UTF-32" << std::endl;
                fill = 0;
                index = 0;
                return utf32le;
            }
            fill -= 2;
            index += 2;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-16LE BOM found" << std::endl;
#endif
            return utf16le;
        }
        // UTF-32BE BOM: 00 00 FE FF
        if ((fill == 4) &&
            (buffer[0] == 0x00) &&
            (buffer[1] == 0x00) &&
            (buffer[2] == 0xfe) &&
            (buffer[3] == 0xff))
        {
            fill = 0;
            index = 0;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-32BE BOM found" << std::endl;
#endif
            return utf32be;
        }
        return utf8; // no valid bom so use utf8 as default
    }

    /*
     * Fetches the next input byte into c, serving read-ahead bytes left over
     * from getBOM() first.  Returns 1 on success and 0 when no byte is
     * available; in the latter case at_end is set (and error becomes `read`
     * if the stream reports a hard I/O error).
     */
    std::streamsize getByte(unsigned char &c)
    {
        if (fill)
        {
            index %= 4;
            --fill;
            c = buffer[index++];
            return 1;
        }
        inputfile.read(reinterpret_cast<char*>(&c), 1);
        if (inputfile.gcount() == 1)
            return 1;
        // End of input is tracked here rather than via inputfile.eof():
        // the stream may already have hit EOF during getBOM() while bytes
        // were still waiting in the read-ahead buffer.
        at_end = true;
        if (inputfile.bad())
            error = read;
        return 0;
    }

    /*
     * Reads a 16-bit unit in the byte order given by `encoding`
     * (little endian for utf16le, big endian otherwise).
     * Returns the number of bytes read; w is only written when that is 2.
     */
    std::streamsize getWord(unsigned short &w)
    {
        unsigned char c[2];
        if (!getByte(c[0]))
            return 0;
        if (!getByte(c[1]))
            return 1;
        if (encoding == utf16le)
            w = static_cast<unsigned short>(c[0] | (c[1] << 8));
        else
            w = static_cast<unsigned short>(c[1] | (c[0] << 8));
        return 2;
    }

    /*
     * Reads a 32-bit unit in the byte order given by `encoding`.
     * Returns the number of bytes read; d is only written when that is 4.
     * Where wchar_t is narrower than 32 bits the value is truncated.
     */
    std::streamsize getDWord(wchar_t &d)
    {
        unsigned long value = 0;
        const std::streamsize got = readUtf32(value);
        if (got == 4)
            d = static_cast<wchar_t>(value);
        return got;
    }

    /*
     * Decodes and returns the next character of the input, or wchar_t(-1) at
     * end of input or on malformed data.  Where wchar_t is only 16 bits wide
     * a code point above 0xFFFF does not fit and is truncated;
     * convert2utf16le() does not go through this function for that reason.
     */
    wchar_t get_wchar_t()
    {
        unsigned long cp = 0;
        if (!getCodePoint(cp))
            return wchar_t(-1);
        return static_cast<wchar_t>(cp);
    }

    /*
     * Converts the whole input to UTF-16 LE and writes it to the output
     * file, preceded by a BOM if requested at construction.
     *
     * Code points U+10000..U+10FFFF are written as surrogate pairs.  Any
     * other value is written as its low 16 bits; malformed input yields
     * 0xFFFF and sets the error to `decode`.  A failed write sets the error
     * to `write`.
     */
    void convert2utf16le()
    {
        if (bom_type == bom)
            putUnit(0xfeff); // write BOM

        for (;;)
        {
            unsigned long cp = 0;
            const bool decoded = getCodePoint(cp);
            if (at_end) // nothing (complete) left to convert
                break;
            if (!decoded)
            {
                error = decode;
                cp = 0xffff;
            }
            if (cp >= 0x10000UL && cp <= 0x10ffffUL)
            {
                // does not fit into one unit: split into high/low surrogate
                cp -= 0x10000UL;
                putUnit(0xd800UL | (cp >> 10));
                putUnit(0xdc00UL | (cp & 0x3ffUL));
            }
            else
                putUnit(cp & 0xffffUL);
        }

        outputfile.flush();
        if (!outputfile)
            error = write;
    }

    // No user-written destructor: the fstream members close their files
    // when they are destroyed.
};
266
267 const unsigned char utf_converter::utf8table[64] = {
268 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
269 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249,
270 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
271 3, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20, 28, 5, 13, 6, 7
272 };
273
274
275 int main(int argc, char* argv[])
276 {
277 utf_converter::err_types err;
278
279 if (argc < 3)
280 {
281 cout << "usage: " << argv[0] << " inputfile outputfile" << endl;
282 return -1;
283 }
284
285 utf_converter::bom_types bom_type = utf_converter::bom;
286
287 if (argc == 4 && strcasecmp(argv[3], "nobom") == 0)
288 {
289 bom_type = utf_converter::nobom;
290 }
291
292 utf_converter conv(argv[1], argv[2], bom_type);
293
294 if ((err = conv.getError())!=utf_converter::none)
295 {
296 switch (err)
297 {
298 case utf_converter::iopen:
299 cerr << "Couldn't open input file." << endl;
300 break;
301 case utf_converter::oopen:
302 cerr << "Couldn't open output file." << endl;
303 break;
304 default:
305 cerr << "Unknown error." << endl;
306 }
307 return -1;
308 }
309 else
310 {
311 conv.convert2utf16le();
312 }
313
314 return 0;
315 }