Merge branch 'ntfs_rebase'
[reactos.git] / sdk / tools / utf16le / utf16le.cpp
1 /*
2 * Usage: utf16le inputfile outputfile
3 *
4 * This is a tool and is compiled using the host compiler,
5 * i.e. on Linux gcc and not mingw-gcc (cross-compiler).
6 * It's a converter from utf-8, utf-16 (LE/BE) and utf-32 (LE/BE)
7 * to utf-16 LE and especially made for automatic conversions of
8 * INF-files from utf-8 to utf-16LE (so that we can continue to
9 * store the INF files as utf-8 in Subversion).
10 *
11 * Author: Matthias Kupfer (mkupfer@reactos.org)
12 */
13
14 #include <fstream>
15 #include <iostream>
16 #include <string.h>
17
18 //#define DISPLAY_DETECTED_UNICODE
19
20 using namespace std;
21
22 #ifdef _MSC_VER
23 #define strcasecmp _stricmp
24 #endif
25
/**
 * Converts a UTF-8, UTF-16 (LE/BE) or UTF-32 (LE/BE) input file into a
 * UTF-16LE output file.
 *
 * The constructor opens both files and inspects the first bytes of the input
 * for a byte order mark. convert2utf16le() then performs the conversion.
 * Callers should check getError() after construction; convert2utf16le() also
 * records `decode` / `write` there when it hits malformed input or an output
 * failure.
 */
class utf_converter
{
public:
    // With "detect" the encoding is taken from the BOM. The byte sequence
    // FF FE 00 00 is ambiguous (UTF-16LE BOM followed by U+0000, or a
    // UTF-32LE BOM) and is assumed to be UTF-32LE. Without a BOM, UTF-8 is
    // assumed.
    enum enc_types { detect, utf8, utf16le, utf16be, utf32le, utf32be };
    enum err_types { none, iopen, oopen, eof, read, write, decode };
    enum bom_types { bom, nobom };

protected:
    // Outcome of an attempt to decode one code point from the input.
    enum read_status
    {
        rs_ok,      // a code point was decoded
        rs_end,     // input exhausted (also used for a sequence cut off by the end of input)
        rs_invalid  // malformed sequence; a diagnostic has been printed
    };

    err_types error;
    enc_types encoding;
    bom_types bom_type;
    // The first (up to) four input bytes are read ahead for BOM detection.
    // `fill` is the number of read-ahead bytes not yet handed out by
    // getByte(), `index` the position of the next one.
    unsigned char buffer[4], fill, index;
    std::fstream inputfile, outputfile;
    static const unsigned char utf8table[64];

    /**
     * Reads four bytes and combines them according to `encoding`
     * (utf32le => little endian, anything else => big endian).
     * The arithmetic is done on unsigned long so that a high byte >= 0x80
     * never shifts into a sign bit.
     * @return number of bytes obtained; `value` is only written when it is 4.
     */
    int readDWord(unsigned long &value)
    {
        unsigned char raw[4];
        for (int i = 0; i < 4; i++)
        {
            if (!getByte(raw[i]))
                return i;
        }

        const unsigned long b0 = raw[0], b1 = raw[1], b2 = raw[2], b3 = raw[3];
        if (encoding == utf32le)
            value = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
        else
            value = b3 | (b2 << 8) | (b1 << 16) | (b0 << 24);
        return 4;
    }

    /** Decodes one UTF-8 sequence into `cp`. */
    read_status readUtf8(unsigned long &cp)
    {
        unsigned char lead;
        if (!getByte(lead))
            return rs_end;

        if (!(lead & 0x80))
        {
            cp = lead; // plain ASCII
            return rs_ok;
        }

        // A byte of the form 10xxxxxx cannot start a sequence.
        if ((lead & 0xc0) != 0xc0)
        {
            std::cerr << "UTF-8 Error: invalid data byte" << std::endl;
            return rs_invalid;
        }

        // utf8table is indexed by the low six bits of a lead byte
        // (all 11xxxxxx bytes, i.e. >= 192). Each entry holds:
        //   lower 3 bits: number of continuation bytes that follow
        //   upper 5 bits: payload bits carried by the lead byte itself
        const unsigned char entry = utf8table[lead & 0x3f];
        unsigned char following = entry & 7;
        cp = entry >> 3;
        while (following--)
        {
            unsigned char next;
            if (!getByte(next))
                return rs_end;
            // Continuation bytes must look like 10xxxxxx.
            if ((next & 0xc0) != 0x80)
            {
                std::cerr << "UTF-8 Error: invalid continuation byte" << std::endl;
                return rs_invalid;
            }
            cp = (cp << 6) | (next & 0x3f);
        }
        return rs_ok;
    }

    /** Decodes one UTF-16 code unit or surrogate pair into `cp`. */
    read_status readUtf16(unsigned long &cp)
    {
        unsigned short unit;
        if (getWord(unit) != 2)
            return rs_end;

        if ((unit & 0xfc00) != 0xd800)
        {
            cp = unit; // not a high surrogate: passed through unchanged
            return rs_ok;
        }

        unsigned short low;
        if (getWord(low) != 2)
            return rs_end;
        if ((low & 0xfc00) != 0xdc00)
        {
            std::cerr << "UTF-16 Error: invalid low surrogate" << std::endl;
            return rs_invalid;
        }

        // Adding 0x40 before the shift is the same as adding 0x10000 afterwards.
        cp = ((static_cast<unsigned long>(unit & 0x3ff) + 0x40) << 10) | (low & 0x3ffu);
        return rs_ok;
    }

    /** Decodes the next code point using the current `encoding`. */
    read_status readCodePoint(unsigned long &cp)
    {
        switch (encoding)
        {
        case detect:          // still unknown:
            encoding = utf8;  // assume UTF-8 as default
            // fall through
        case utf8:
            return readUtf8(cp);
        case utf16le:
        case utf16be:
            return readUtf16(cp);
        case utf32le:
        case utf32be:
            return (readDWord(cp) == 4) ? rs_ok : rs_end;
        }
        return rs_end;
    }

    /** Writes one 16-bit code unit to the output, low byte first. */
    void putUnit(unsigned long unit)
    {
        unsigned char bytes[2];
        bytes[0] = static_cast<unsigned char>(unit & 0xff);
        bytes[1] = static_cast<unsigned char>((unit >> 8) & 0xff);
        outputfile.write(reinterpret_cast<const char*>(bytes), 2);
    }

public:
    /**
     * Opens the input and output files and determines the input encoding.
     *
     * @param ifname  path of the input file
     * @param ofname  path of the output file
     * @param ofbom   whether convert2utf16le() writes a BOM first
     * @param enc     input encoding; `detect` takes it from the input's BOM.
     *                Any other value is used as given, with a warning when
     *                it differs from what getBOM() reported.
     *
     * On an open failure getError() returns `iopen` or `oopen`.
     */
    utf_converter(std::string ifname, std::string ofname, bom_types ofbom = bom, enc_types enc = detect)
        : error(none), encoding(enc), bom_type(ofbom), fill(0), index(0)
    {
        inputfile.open(ifname.c_str(), std::ios::in | std::ios::binary);
        if (!inputfile)
        {
            error = iopen;
            return;
        }
        outputfile.open(ofname.c_str(), std::ios::out | std::ios::binary);
        if (!outputfile)
        {
            error = oopen;
            return;
        }

        const enc_types detected = getBOM();
        if (enc == detect)
            encoding = detected;
        else if (enc != detected)
            std::cerr << "Warning: UTF-BOM doesn't match encoding setting, but given encoding forced" << std::endl;
    }

    /** Returns the most recently recorded error, or `none`. */
    err_types getError()
    {
        return error;
    }

    /**
     * Reads up to four bytes from the input into the read-ahead buffer and
     * checks them for a BOM. Bytes that belong to a recognised BOM are
     * skipped; the rest stay queued for getByte().
     *
     * @return the encoding indicated by the BOM, or utf8 when there is none.
     */
    enc_types getBOM()
    {
        index = 0;
        inputfile.read(reinterpret_cast<char*>(buffer), 4);
        fill = static_cast<unsigned char>(inputfile.gcount());

        // UTF-8 BOM: EF BB BF
        if ((fill > 2) &&
            (buffer[0] == 0xef) &&
            (buffer[1] == 0xbb) &&
            (buffer[2] == 0xbf))
        {
            index += 3;
            fill -= 3;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-8 BOM found" << std::endl;
#endif
            return utf8;
        }

        // UTF-16BE BOM: FE FF
        if ((fill > 1) &&
            (buffer[0] == 0xfe) &&
            (buffer[1] == 0xff))
        {
            index += 2;
            fill -= 2;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-16BE BOM found" << std::endl;
#endif
            return utf16be;
        }

        // FF FE: UTF-16LE BOM, or the first half of the UTF-32LE BOM
        if ((fill > 1) &&
            (buffer[0] == 0xff) &&
            (buffer[1] == 0xfe))
        {
            if ((fill == 4) &&
                (buffer[2] == 0x00) &&
                (buffer[3] == 0x00))
            {
                std::cerr << "UTF Error: ambiguous BOM UTF-16 or UTF-32; assume UTF-32" << std::endl;
                fill = 0;   // all four bytes belong to the BOM
                index = 0;
                return utf32le;
            }
            fill -= 2;
            index += 2;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-16LE BOM found" << std::endl;
#endif
            return utf16le;
        }

        // UTF-32BE BOM: 00 00 FE FF
        if ((fill == 4) &&
            (buffer[0] == 0x00) &&
            (buffer[1] == 0x00) &&
            (buffer[2] == 0xfe) &&
            (buffer[3] == 0xff))
        {
            fill = 0;
            index = 0;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-32BE BOM found" << std::endl;
#endif
            return utf32be;
        }

        return utf8; // no valid BOM, so use UTF-8 as default
    }

    /**
     * Fetches the next input byte, serving queued read-ahead bytes before
     * reading from the file.
     * @return 1 when a byte was stored in `c`, 0 when none is available.
     */
    int getByte(unsigned char &c)
    {
        if (fill)
        {
            index %= 4;
            --fill;
            c = buffer[index++];
            return 1;
        }

        inputfile.read(reinterpret_cast<char*>(&c), 1);
        return static_cast<int>(inputfile.gcount());
    }

    /**
     * Reads one 16-bit unit; little endian when `encoding` is utf16le,
     * big endian otherwise.
     * @return number of bytes obtained (0, 1 or 2); `w` is only written
     *         when it is 2.
     */
    int getWord(unsigned short &w)
    {
        unsigned char raw[2];
        if (!getByte(raw[0]))
            return 0;
        if (!getByte(raw[1]))
            return 1;

        if (encoding == utf16le)
            w = static_cast<unsigned short>(raw[0] | (raw[1] << 8));
        else
            w = static_cast<unsigned short>(raw[1] | (raw[0] << 8));
        return 2;
    }

    /**
     * Reads one 32-bit unit; little endian when `encoding` is utf32le,
     * big endian otherwise.
     * @return number of bytes obtained (0..4); `d` is only written when
     *         it is 4.
     */
    int getDWord(wchar_t &d)
    {
        unsigned long value = 0;
        const int got = readDWord(value);
        if (got == 4)
            d = static_cast<wchar_t>(value);
        return got;
    }

    /**
     * Decodes the next character from the input.
     * @return the character, or wchar_t(-1) at end of input or on a
     *         malformed sequence.
     */
    wchar_t get_wchar_t()
    {
        unsigned long cp = 0;
        if (readCodePoint(cp) != rs_ok)
            return wchar_t(-1);
        return static_cast<wchar_t>(cp);
    }

    /**
     * Converts the whole remaining input to UTF-16LE and writes it to the
     * output file, preceded by a BOM when requested at construction.
     *
     * - The loop ends when the decoder reports end of input rather than on
     *   the stream's eof flag; that flag is already set after the BOM probe
     *   for inputs shorter than four bytes, whose bytes are still queued.
     * - Code points above U+FFFF are written as a surrogate pair.
     * - Malformed sequences and values above U+10FFFF are written as U+FFFD
     *   and `decode` is recorded in the error state.
     * - `write` is recorded when the output stream has failed after the
     *   final flush.
     */
    void convert2utf16le()
    {
        if (bom_type == bom)
            putUnit(0xfeff); // stored as FF FE

        for (;;)
        {
            unsigned long cp = 0;
            const read_status status = readCodePoint(cp);
            if (status == rs_end)
                break;

            if (status == rs_invalid)
            {
                error = decode;
                cp = 0xfffd;
            }
            else if (cp > 0x10ffff)
            {
                std::cerr << "UTF Error: code point beyond U+10FFFF cannot be encoded as UTF-16" << std::endl;
                error = decode;
                cp = 0xfffd;
            }

            if (cp >= 0x10000)
            {
                const unsigned long offset = cp - 0x10000;
                putUnit(0xd800 | (offset >> 10));    // high surrogate
                putUnit(0xdc00 | (offset & 0x3ff));  // low surrogate
            }
            else
            {
                putUnit(cp);
            }
        }

        outputfile.flush();
        if (!outputfile)
            error = write;
    }

    /** Closes whichever of the two files are still open. */
    ~utf_converter()
    {
        if (inputfile.is_open())
            inputfile.close();
        if (outputfile.is_open())
            outputfile.close();
    }
};

// Lookup table for UTF-8 lead bytes 0xC0..0xFF, indexed by (byte & 0x3f).
// Each entry is (payload bits of the lead byte << 3) | number of
// continuation bytes.
const unsigned char utf_converter::utf8table[64] = {
    1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
    129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249,
    2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
    3, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20, 28, 5, 13, 6, 7
};
272
273
274 int main(int argc, char* argv[])
275 {
276 utf_converter::err_types err;
277
278 if (argc < 3)
279 {
280 cout << "usage: " << argv[0] << " inputfile outputfile" << endl;
281 return -1;
282 }
283
284 utf_converter::bom_types bom_type = utf_converter::bom;
285
286 if (argc == 4 && strcasecmp(argv[3], "nobom") == 0)
287 {
288 bom_type = utf_converter::nobom;
289 }
290
291 utf_converter conv(argv[1], argv[2], bom_type);
292
293 if ((err = conv.getError())!=utf_converter::none)
294 {
295 switch (err)
296 {
297 case utf_converter::iopen:
298 cerr << "Couldn't open input file." << endl;
299 break;
300 case utf_converter::oopen:
301 cerr << "Couldn't open output file." << endl;
302 break;
303 default:
304 cerr << "Unknown error." << endl;
305 }
306 return -1;
307 }
308 else
309 {
310 conv.convert2utf16le();
311 }
312
313 return 0;
314 }