2 * Usage: utf16le inputfile outputfile
4 * This is a tool and is compiled using the host compiler,
5 * i.e. on Linux gcc and not mingw-gcc (cross-compiler).
6 * It's a converter from utf-8, utf-16 (LE/BE) and utf-32 (LE/BE)
7 * to utf-16 LE, made especially for the automatic conversion of
8 * INF files from utf-8 to utf-16LE (so that we can keep
9 * storing the INF files as utf-8 in subversion).
11 * Author: Matthias Kupfer (mkupfer@reactos.org)
17 //#define DISPLAY_DETECTED_UNICODE
24 // detection recognises utf-8 and both utf-16 variants from their BOM; utf-32
25 // is only assumed, because its BOM is ambiguous with the utf-16 one
/**
 * Input encodings the converter can be told to read.
 *
 * The enumerator order (and therefore the numeric values 0..5) is kept
 * exactly as before so existing comparisons and switch statements still work.
 */
enum enc_types
{
    detect,   ///< not decided yet; the constructor's default, later treated as utf8 if nothing else was determined
    utf8,     ///< UTF-8
    utf16le,  ///< UTF-16, little endian
    utf16be,  ///< UTF-16, big endian
    utf32le,  ///< UTF-32, little endian
    utf32be   ///< UTF-32, big endian
};
/**
 * Error states reported by the converter.
 *
 * The enumerator order (numeric values 0..6) is unchanged. `none` is the
 * state the constructor starts with; `iopen` and `oopen` are the two states
 * main() translates into "Couldn't open input/output file." messages.
 */
enum err_types
{
    none,    ///< no error recorded
    iopen,   ///< input file could not be opened
    oopen,   ///< output file could not be opened
    eof,     ///< presumably: end of input reached - usage not visible here, confirm
    read,    ///< presumably: read failure - usage not visible here, confirm
    write,   ///< presumably: write failure - usage not visible here, confirm
    decode   ///< presumably: malformed input data - usage not visible here, confirm
};
// Read-ahead state: the BOM detection reads up to 4 bytes into `buffer` and
// stores the number of bytes actually read in `fill`; the constructor starts
// both `fill` and `index` at 0.
// NOTE(review): presumably `index` is the next unconsumed position in `buffer`;
// the code that advances it is not visible here - confirm.
31 unsigned char buffer
[4], fill
, index
; // need 4 char buffer for optional BOM handling
// Input and output streams; both are opened by the constructor.
32 fstream inputfile
,outputfile
;
// Lookup table indexed with the low 6 bits of a UTF-8 lead byte
// (get_wchar_t uses utf8table[tmp & 0x3f]); defined at file scope.
33 static const unsigned char utf8table
[64];
// Constructor.
//   ifname - path of the file to read
//   ofname - path of the file to write
//   enc    - encoding of the input; defaults to `detect`
// Starts with error == none, encoding == enc and an empty read-ahead buffer
// (fill == 0, index == 0), then opens both streams.
35 utf_converter(string ifname
, string ofname
, enc_types enc
= detect
) : error(none
), encoding(enc
), fill(0), index(0)
// Open the input stream for reading.
// NOTE(review): neither open() call passes ios::binary. On hosts whose text
// mode translates line endings this would alter the byte stream - confirm
// this tool is only ever built on hosts where text and binary mode agree.
38 inputfile
.open(ifname
.c_str(), ios::in
);
// Open the output stream for writing.
44 outputfile
.open(ofname
.c_str(), ios::out
);
// Per the message text: the BOM found in the file disagrees with the encoding
// requested by the caller, and the requested encoding is kept. The condition
// guarding this warning is not visible here.
54 cerr
<< "Warning: UTF-BOM doesn't match encoding setting, but given encoding forced" << endl
;
66 /* the first byte can also be checked with:
67 if ((buffer[0] & 0x11) || !buffer[0])
68 valid values are 0xef, 0xff, 0xfe, 0x00
// BOM detection (the function header is not visible in this excerpt).
// Read up to 4 bytes from the start of the input into the member buffer.
70 inputfile
.read(reinterpret_cast<char*>(&buffer
),4);
// Remember how many bytes were really delivered (may be fewer than 4).
71 fill
=inputfile
.gcount();
// UTF-8 BOM: starts with 0xef 0xbb (any further comparison is on lines not
// visible here).
74 (buffer
[0] == 0xef) &&
75 (buffer
[1] == 0xbb) &&
// Diagnostic output, only compiled in when DISPLAY_DETECTED_UNICODE is defined.
80 #ifdef DISPLAY_DETECTED_UNICODE
81 cerr
<< "UTF-8 BOM found" << endl
;
// UTF-16BE BOM: first byte 0xfe.
86 (buffer
[0] == 0xfe) &&
91 #ifdef DISPLAY_DETECTED_UNICODE
92 cerr
<< "UTF-16BE BOM found" << endl
;
// First byte 0xff: either the UTF-16LE BOM or the start of a UTF-32 BOM.
// A zero third byte is reported as ambiguous and, per the message, treated
// as UTF-32.
97 (buffer
[0] == 0xff) &&
101 (buffer
[2] == 0x00) &&
104 cerr
<< "UTF Error: ambiguous BOM UTF-16 or UTF-32; assume UTF-32" << endl
;
// Otherwise the 0xff-led BOM is taken as UTF-16LE.
111 #ifdef DISPLAY_DETECTED_UNICODE
112 cerr
<< "UTF-16LE BOM found" << endl
;
// UTF-32BE BOM: 0x00 0x00 0xfe followed by a fourth byte checked on a line
// not visible here.
117 (buffer
[0] == 0x00) &&
118 (buffer
[1] == 0x00) &&
119 (buffer
[2] == 0xfe) &&
124 #ifdef DISPLAY_DETECTED_UNICODE
125 cerr
<< "UTF-32BE BOM found" << endl
;
// Fallback when none of the BOM patterns matched.
129 return utf8
; // no valid bom so use utf8 as default
// Reads one byte of input into `c`.
// Returns the number of bytes delivered by the stream read (inputfile.gcount()).
// NOTE(review): the lines between the signature and the stream read are not
// visible; presumably they first hand out bytes still held in the member
// `buffer` from the BOM read - confirm.
131 int getByte(unsigned char &c
)
// Fetch a single byte directly from the input stream.
141 inputfile
.read(reinterpret_cast<char*>(&c
),1);
// gcount() reports how many bytes the read above produced.
142 return inputfile
.gcount();
// Reads one 16-bit code unit into `w`, honouring the byte order selected by
// `encoding`. get_wchar_t treats a return value of 2 as success; the return
// statement itself is not visible here.
// NOTE(review): `c` is declared on lines outside this excerpt; from its use it
// holds the two raw input bytes in file order - confirm.
145 int getWord(unsigned short &w
)
// Little endian: first byte is the low half.
152 if (encoding
== utf16le
)
153 w
= c
[0] | (c
[1] << 8);
// Otherwise (big endian): first byte is the high half.
155 w
= c
[1] | (c
[0] << 8);
// Reads one 32-bit code unit into `d`, honouring the byte order selected by
// `encoding`. get_wchar_t treats a return value of 4 as success; the return
// statement itself is not visible here.
// NOTE(review): `d` is a wchar_t, so this relies on wchar_t being at least
// 32 bits wide on the build host - confirm for every host this tool is built on.
158 int getDWord(wchar_t &d
)
// Four iterations, one per input byte (the loop body is not visible here).
161 for (int i
=0;i
<4;i
++)
// Little endian: c[0] is the least significant byte.
164 if (encoding
== utf32le
)
// NOTE(review): the element type of `c` is declared outside this excerpt; if it
// promotes to int, a top byte >= 0x80 shifts into the sign bit. Valid UTF-32
// never has such a byte, so this only matters for malformed input.
165 d
= c
[0] | (c
[1] << 8) | (c
[2] << 16) | (c
[3] << 24);
// Otherwise (big endian): c[0] is the most significant byte.
167 d
= c
[3] | (c
[2] << 8) | (c
[1] << 16) | (c
[0] << 24);
// Decodes and returns the next character from the input according to
// `encoding`. The switch head and several branches are not visible in this
// excerpt; the comments below describe only the visible statements.
170 wchar_t get_wchar_t()
// Start from (wchar_t)-1.
// NOTE(review): presumably this is the value returned on failure - the final
// return is not visible here, confirm.
172 wchar_t ret
= (wchar_t)-1;
175 case detect
: // if still unknown
176 encoding
= utf8
; // assume utf8 as default
178 unsigned char c
, tmp
;
181 // table for the 64 possible lead bytes (all 11xxxxxx, i.e. >= 192)
182 // each table entry is built as follows:
183 // lower 3 bits: number of following bytes (3 bits, so at most 7), 0=error
184 // upper 5 bits: data bits of the lead byte, padded with 0
// A byte whose top two bits are not both set is rejected at this point.
187 if ((tmp
& 0xc0) != 0xc0)
189 cerr
<< "UTF-8 Error: invalid data byte" << endl
;
// Look up the packed entry for this byte using its low 6 bits.
192 unsigned char i
= utf8table
[tmp
& 0x3f];
// UTF-16: a unit in 0xd800..0xdbff is a high surrogate and needs a partner.
211 if ((w
& 0xfc00) == 0xd800) // high surrogate first
// Fetch the second unit; anything but 2 bytes read means the pair is incomplete.
213 if (getWord(w2
) != 2)
// The partner must lie in 0xdc00..0xdfff.
215 if ((w2
& 0xfc00) != 0xdc00)
217 cerr
<< "UTF-16 Error: invalid low surrogate" << endl
;
// Combine the pair: adding 0x40 before the shift by 10 contributes the
// 0x10000 offset, so the result is 0x10000 + (high10 << 10) + low10.
220 return (((w
& 0x3ff) + 0x40) << 10) | (w2
& 0x3ff);
// UTF-32: read the code unit straight into `ret`; 4 bytes are expected.
225 if (getDWord(ret
) != 4)
// Writes the output file: a 0xff 0xfe byte-order mark followed by one 2-byte
// little-endian unit per iteration of the read loop.
231 void convert2utf16le()
// Local 2-byte scratch buffer, pre-loaded with the BOM bytes. It shadows the
// 4-byte member `buffer` for the rest of this function.
234 unsigned char buffer
[2] = {0xff, 0xfe};
235 outputfile
.write(reinterpret_cast<char*>(&buffer
),2); // write BOM
// Loop until the input stream reports end-of-file.
// NOTE(review): eof() only becomes true after a read has failed; how `c` is
// obtained and validated inside the loop is not visible here - confirm the
// last iteration does not emit a stale or error value.
237 while (!inputfile
.eof())
// Split `c` into low byte first, then high byte.
// NOTE(review): only the low 16 bits of `c` are written. get_wchar_t can
// return values above 0xffff (decoded surrogate pairs), which would be
// truncated here unless a line outside this excerpt handles them - confirm.
239 buffer
[0] = c
& 0xff;
240 buffer
[1] = (c
>> 8) & 0xff; // create utf16-le char
241 outputfile
.write(reinterpret_cast<char*>(&buffer
),2); // write char
// UTF-8 lead-byte table, indexed with (lead byte & 0x3f), i.e. for bytes
// 0xC0..0xFF. Each entry packs two fields:
//   bits 0..2 - number of following bytes
//   bits 3..7 - the data bits carried by the lead byte itself
// Rows 1-2 (index 0..31):  1 following byte,  data 0..31
// Row 3    (index 32..47): 2 following bytes, data 0..15
// Row 4    (index 48..63): 3 following bytes (data 0..7), then 4 (data 0..3),
//                          5 (data 0..1), 6 and 7 (data 0)
// NOTE(review): the entries with 4..7 following bytes describe sequences
// longer than the 4-byte maximum of RFC 3629; whether the decoder rejects
// them is not visible here - confirm.
254 const unsigned char utf_converter::utf8table
[64] = {
255 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
256 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249,
257 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
258 3, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20, 28, 5, 13, 6, 7
// Program entry point: expects an input and an output file name, builds a
// converter for them, reports open failures, and otherwise runs the
// conversion. The argument-count check and return statements are not visible
// in this excerpt.
262 int main(int argc
, char* argv
[])
// Holds the converter's error state for the switch below.
264 utf_converter::err_types err
;
// Usage text, printed with the program name taken from argv[0].
267 cout
<< "usage: " << argv
[0] << " inputfile outputfile" << endl
;
// argv[1] is the input path, argv[2] the output path; the encoding argument
// is left at its default.
270 utf_converter
conv(argv
[1],argv
[2]);
// Anything other than `none` after construction is reported.
271 if ((err
= conv
.getError())!=utf_converter::none
)
275 case utf_converter::iopen
:
276 cerr
<< "Couldn't open input file." << endl
;
278 case utf_converter::oopen
:
279 cerr
<< "Couldn't open output file." << endl
;
// Any other error state gets the generic message.
282 cerr
<< "Unknown error." << endl
;
// Run the conversion.
286 conv
.convert2utf16le();
290 // vim:set ts=4 sw=4: