2 * Usage: utf16le inputfile outputfile
4 * This is a tool and is compiled using the host compiler,
5 * i.e. on Linux gcc and not mingw-gcc (cross-compiler).
6 * It's a converter from utf-8, utf-16 (LE/BE) and utf-32 (LE/BE)
7 * to utf-16 LE and especially made for automatic conversions of
8 * INF-files from utf-8 to utf-16LE (so we can furthermore
9 * store the INF files in utf-8 for subversion).
11 * Author: Matthias Kupfer (mkupfer@reactos.org)
18 //#define DISPLAY_DETECTED_UNICODE
23 #define strcasecmp _stricmp
29 // 'detect' recognizes utf-8 and both utf-16 variants by their BOM; utf-32 can only
30 // be assumed, because its BOM is ambiguous with the utf-16 one
31 enum enc_types // input encodings; the constructor's 'enc' argument defaults to detect
{ detect // not decided yet: settled from the BOM, otherwise utf8 is used as the default
, utf8 // UTF-8 input
, utf16le // UTF-16 input, low byte of each 16-bit unit first
, utf16be // UTF-16 input, high byte of each 16-bit unit first
, utf32le // UTF-32 input, lowest byte of each 32-bit unit first
, utf32be // UTF-32 input, highest byte of each 32-bit unit first
};
32 enum err_types // converter status codes; main() compares getError() against none
{ none // no error; the constructor initializes the error state to this value
, iopen // reported by main() as "Couldn't open input file."
, oopen // reported by main() as "Couldn't open output file."
, eof // NOTE(review): presumably end of input reached; where it is set is not visible here
, read // NOTE(review): presumably a failed read from the input file; confirm
, write // NOTE(review): presumably a failed write to the output file; confirm
, decode // NOTE(review): presumably a malformed input sequence; confirm
};
33 enum bom_types // output BOM policy handed to the constructor ('ofbom' argument)
{ bom // default value; NOTE(review): presumably makes convert2utf16le() write the FF FE BOM, the guarding condition is not visible here
, nobom // chosen by main() when the third command-line argument is "nobom" (compared case-insensitively)
};
38 unsigned char buffer
[4], fill
, index
; // need 4 char buffer for optional BOM handling
39 fstream inputfile
,outputfile
;
40 static const unsigned char utf8table
[64];
42 utf_converter(string ifname
, string ofname
, bom_types ofbom
= bom
, enc_types enc
= detect
) : error(none
), bom_type(ofbom
), encoding(enc
), fill(0), index(0)
45 inputfile
.open(ifname
.c_str(), ios::in
| ios::binary
);
51 outputfile
.open(ofname
.c_str(), ios::out
| ios::binary
);
61 cerr
<< "Warning: UTF-BOM doesn't match encoding setting, but given encoding forced" << endl
;
73 /* the first byte can also be checked with:
74 if ((buffer[0] & 0x11) || !buffer[0])
75 valid values are 0xef, 0xff, 0xfe, 0x00
77 inputfile
.read(reinterpret_cast<char*>(&buffer
),4);
78 fill
=inputfile
.gcount();
81 (buffer
[0] == 0xef) &&
82 (buffer
[1] == 0xbb) &&
87 #ifdef DISPLAY_DETECTED_UNICODE
88 cerr
<< "UTF-8 BOM found" << endl
;
93 (buffer
[0] == 0xfe) &&
98 #ifdef DISPLAY_DETECTED_UNICODE
99 cerr
<< "UTF-16BE BOM found" << endl
;
104 (buffer
[0] == 0xff) &&
108 (buffer
[2] == 0x00) &&
111 cerr
<< "UTF Error: ambiguous BOM UTF-16 or UTF-32; assume UTF-32" << endl
;
118 #ifdef DISPLAY_DETECTED_UNICODE
119 cerr
<< "UTF-16LE BOM found" << endl
;
124 (buffer
[0] == 0x00) &&
125 (buffer
[1] == 0x00) &&
126 (buffer
[2] == 0xfe) &&
131 #ifdef DISPLAY_DETECTED_UNICODE
132 cerr
<< "UTF-32BE BOM found" << endl
;
136 return utf8
; // no valid bom so use utf8 as default
138 int getByte(unsigned char &c
)
148 inputfile
.read(reinterpret_cast<char*>(&c
),1);
149 return inputfile
.gcount();
152 int getWord(unsigned short &w
)
159 if (encoding
== utf16le
)
160 w
= c
[0] | (c
[1] << 8);
162 w
= c
[1] | (c
[0] << 8);
165 int getDWord(wchar_t &d
)
168 for (int i
=0;i
<4;i
++)
171 if (encoding
== utf32le
)
172 d
= c
[0] | (c
[1] << 8) | (c
[2] << 16) | (c
[3] << 24);
174 d
= c
[3] | (c
[2] << 8) | (c
[1] << 16) | (c
[0] << 24);
177 wchar_t get_wchar_t()
179 wchar_t ret
= (wchar_t)-1;
182 case detect
: // if still unknown
183 encoding
= utf8
; // assume utf8 as default
185 unsigned char c
, tmp
;
188 // table for 64 bytes (all 11xxxxxx resp. >=192)
189 // resulting byte is determined:
190 // lower 3 bits: number of following bytes (max. 7), 0=error
191 // upper 5 bits: data filled with 0
194 if ((tmp
& 0xc0) != 0xc0)
196 cerr
<< "UTF-8 Error: invalid data byte" << endl
;
199 unsigned char i
= utf8table
[tmp
& 0x3f];
218 if ((w
& 0xfc00) == 0xd800) // high surrogate first
220 if (getWord(w2
) != 2)
222 if ((w2
& 0xfc00) != 0xdc00)
224 cerr
<< "UTF-16 Error: invalid low surrogate" << endl
;
227 return (((w
& 0x3ff) + 0x40) << 10) | (w2
& 0x3ff);
232 if (getDWord(ret
) != 4)
238 void convert2utf16le()
240 unsigned char buffer
[2] = { 0xff, 0xfe };
244 outputfile
.write(reinterpret_cast<char*>(&buffer
), 2); // write BOM
247 wchar_t c
= get_wchar_t();
249 while (!inputfile
.eof())
251 buffer
[0] = c
& 0xff;
252 buffer
[1] = (c
>> 8) & 0xff; // create utf16-le char
253 outputfile
.write(reinterpret_cast<char*>(&buffer
),2); // write char
266 const unsigned char utf_converter::utf8table
[64] = {
267 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
268 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249,
269 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
270 3, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20, 28, 5, 13, 6, 7
274 int main(int argc
, char* argv
[])
276 utf_converter::err_types err
;
280 cout
<< "usage: " << argv
[0] << " inputfile outputfile" << endl
;
284 utf_converter::bom_types bom_type
= utf_converter::bom
;
286 if (argc
== 4 && strcasecmp(argv
[3], "nobom") == 0)
288 bom_type
= utf_converter::nobom
;
291 utf_converter
conv(argv
[1], argv
[2], bom_type
);
293 if ((err
= conv
.getError())!=utf_converter::none
)
297 case utf_converter::iopen
:
298 cerr
<< "Couldn't open input file." << endl
;
300 case utf_converter::oopen
:
301 cerr
<< "Couldn't open output file." << endl
;
304 cerr
<< "Unknown error." << endl
;
310 conv
.convert2utf16le();