sdk/tools/utf16le/utf16le.cpp

/*
 * Usage: utf16le inputfile outputfile [nobom]
 *
 * This is a tool and is compiled using the host compiler,
 * i.e. on Linux gcc and not mingw-gcc (cross-compiler).
 * It's a converter from utf-8, utf-16 (LE/BE) and utf-32 (LE/BE)
 * to utf-16 LE, made especially for the automatic conversion of
 * INF files from utf-8 to utf-16LE (so that the INF files can be
 * stored in utf-8 in subversion).
 *
 * Author: Matthias Kupfer (mkupfer@reactos.org)
 */
  13
  14 #include <fstream>
  15 #include <iostream>
  16 #include <string.h>
  17
  18 //#define DISPLAY_DETECTED_UNICODE
  19
  20 using namespace std;
  21
  22 #ifdef _MSC_VER
  23 #define strcasecmp _stricmp
  24 #endif
  25
  /**
   * Converts a text file encoded as UTF-8, UTF-16 (LE/BE) or UTF-32 (LE/BE)
   * into UTF-16LE.
   *
   * The constructor opens both files and inspects the first bytes of the input
   * for a byte order mark (BOM). convert2utf16le() then performs the actual
   * conversion. Failures are reported through getError().
   */
  class utf_converter
  {
  public:
      // detect can detect utf-8 and both utf-16 variants, but assume utf-32 only
      // due to ambiguous BOM
      enum enc_types { detect, utf8, utf16le, utf16be, utf32le, utf32be };
      enum err_types { none, iopen, oopen, eof, read, write, decode };
      enum bom_types { bom, nobom };
  protected:
      // Outcome of a single decoding step, see decodeNext().
      enum decode_status { decoded, end_of_input, malformed };

      err_types error;
      enc_types encoding;
      bom_types bom_type;
      // 4 byte look-ahead filled by getBOM(); "fill" is the number of bytes
      // still pending in it and "index" the position of the next pending byte.
      unsigned char buffer[4], fill, index;
      std::fstream inputfile, outputfile;
      static const unsigned char utf8table[64];
  public:
      /**
       * Opens the input and output files and determines the input encoding.
       *
       * @param ifname  path of the file to read
       * @param ofname  path of the file to write (created/truncated)
       * @param ofbom   whether convert2utf16le() writes a UTF-16LE BOM first
       * @param enc     input encoding; "detect" uses the encoding reported by
       *                getBOM(), any other value is forced (with a warning on
       *                stderr if it differs from what getBOM() reported)
       *
       * On failure getError() returns iopen or oopen.
       */
      utf_converter(std::string ifname, std::string ofname, bom_types ofbom = bom, enc_types enc = detect)
          : error(none), encoding(enc), bom_type(ofbom), fill(0), index(0) // same order as the declarations
      {
          inputfile.open(ifname.c_str(), std::ios::in | std::ios::binary);
          if (!inputfile)
          {
              error = iopen;
              return;
          }
          outputfile.open(ofname.c_str(), std::ios::out | std::ios::binary);
          if (!outputfile)
          {
              error = oopen;
              return;
          }
          const enc_types detected = getBOM();
          if (enc == detect)
              encoding = detected;
          else if (enc != detected)
              std::cerr << "Warning: UTF-BOM doesn't match encoding setting, but given encoding forced" << std::endl;
      }
      /** Returns the last recorded error, or none. */
      err_types getError() const
      {
          return error;
      }
      /**
       * Reads up to 4 bytes from the input and checks them for a BOM.
       *
       * Bytes belonging to a recognised BOM are consumed; all other bytes that
       * were read stay pending in "buffer" and are handed out by getByte().
       *
       * @return the encoding announced by the BOM, or utf8 if there is none
       */
      enc_types getBOM()
      {
          index = 0;
          // A recognised BOM starts with 0xef, 0xfe, 0xff or 0x00.
          inputfile.read(reinterpret_cast<char*>(buffer), 4);
          fill = static_cast<unsigned char>(inputfile.gcount());
          // UTF-8 BOM: EF BB BF
          if ((fill > 2) &&
              (buffer[0] == 0xef) &&
              (buffer[1] == 0xbb) &&
              (buffer[2] == 0xbf))
          {
              index += 3;
              fill -= 3;
  #ifdef DISPLAY_DETECTED_UNICODE
              std::cerr << "UTF-8 BOM found" << std::endl;
  #endif
              return utf8;
          }
          // UTF-16BE BOM: FE FF
          if ((fill > 1) &&
              (buffer[0] == 0xfe) &&
              (buffer[1] == 0xff))
          {
              index += 2;
              fill -= 2;
  #ifdef DISPLAY_DETECTED_UNICODE
              std::cerr << "UTF-16BE BOM found" << std::endl;
  #endif
              return utf16be;
          }
          // FF FE is the UTF-16LE BOM, FF FE 00 00 the UTF-32LE BOM
          if ((fill > 1) &&
              (buffer[0] == 0xff) &&
              (buffer[1] == 0xfe))
          {
              if ((fill == 4) &&
                  (buffer[2] == 0x00) &&
                  (buffer[3] == 0x00))
              {
                  std::cerr << "UTF Error: ambiguous BOM UTF-16 or UTF-32; assume UTF-32" << std::endl;
                  fill = 0; // all 4 bytes belong to the BOM
                  index = 0;
                  return utf32le;
              }
              fill -= 2;
              index += 2;
  #ifdef DISPLAY_DETECTED_UNICODE
              std::cerr << "UTF-16LE BOM found" << std::endl;
  #endif
              return utf16le;
          }
          // UTF-32BE BOM: 00 00 FE FF
          if ((fill == 4) &&
              (buffer[0] == 0x00) &&
              (buffer[1] == 0x00) &&
              (buffer[2] == 0xfe) &&
              (buffer[3] == 0xff))
          {
              fill = 0;
              index = 0;
  #ifdef DISPLAY_DETECTED_UNICODE
              std::cerr << "UTF-32BE BOM found" << std::endl;
  #endif
              return utf32be;
          }
          return utf8; // no valid bom so use utf8 as default
      }
      /**
       * Fetches the next input byte, serving bytes left over from getBOM()
       * before reading from the file.
       *
       * @return 1 if a byte was stored in c, 0 if the input is exhausted
       */
      int getByte(unsigned char &c)
      {
          if (fill)
          {
              index %= 4;
              --fill;
              c = buffer[index++];
              return 1;
          }
          inputfile.read(reinterpret_cast<char*>(&c), 1);
          return static_cast<int>(inputfile.gcount());
      }
      /**
       * Reads one 16 bit unit; little endian if the encoding is utf16le,
       * big endian otherwise.
       *
       * @return number of bytes that could be read; w is only set for 2
       */
      int getWord(unsigned short &w)
      {
          unsigned char c[2];
          if (!getByte(c[0]))
              return 0;
          if (!getByte(c[1]))
              return 1;
          if (encoding == utf16le)
              w = static_cast<unsigned short>(c[0] | (c[1] << 8));
          else
              w = static_cast<unsigned short>(c[1] | (c[0] << 8));
          return 2;
      }
      /**
       * Reads one 32 bit unit; little endian if the encoding is utf32le,
       * big endian otherwise.
       *
       * @return number of bytes that could be read; d is only set for 4
       */
      int getDWord(wchar_t &d)
      {
          unsigned long value = 0;
          const int count = readDWord(value);
          if (count == 4)
              d = static_cast<wchar_t>(value);
          return count;
      }
      /**
       * Decodes the next character of the input.
       *
       * @return the character converted to wchar_t, or wchar_t(-1) at the end
       *         of the input or on a decoding error
       */
      wchar_t get_wchar_t()
      {
          unsigned long cp = 0;
          if (decodeNext(cp) != decoded)
              return wchar_t(-1);
          return static_cast<wchar_t>(cp);
      }
      /**
       * Converts the whole input to UTF-16LE and writes it to the output file,
       * preceded by a BOM if requested in the constructor.
       *
       * Code points above U+FFFF are written as surrogate pairs. Input that
       * cannot be decoded (or lies beyond U+10FFFF) is replaced by U+FFFD and
       * recorded as a decode error; a failing output stream is recorded as a
       * write error. Does nothing if opening a file failed.
       */
      void convert2utf16le()
      {
          if (error == iopen || error == oopen)
              return;

          if (bom_type == bom)
              putWord(0xfeff); // write BOM

          // The loop ends when the decoder runs out of input. The stream's
          // eof flag is not used because getBOM() already sets it for inputs
          // shorter than 4 bytes while their bytes are still pending.
          for (;;)
          {
              unsigned long cp = 0;
              const decode_status status = decodeNext(cp);
              if (status == end_of_input)
                  break;
              if (status == malformed || cp > 0x10ffffUL)
              {
                  error = decode;
                  cp = 0xfffd; // REPLACEMENT CHARACTER
              }
              if (cp >= 0x10000UL)
              {
                  cp -= 0x10000UL;
                  putWord(0xd800 | (cp >> 10));   // high surrogate
                  putWord(0xdc00 | (cp & 0x3ff)); // low surrogate
              }
              else
                  putWord(cp);
          }

          outputfile.flush();
          if (!outputfile)
              error = write;
      }
      ~utf_converter()
      {
          // is_open() instead of the stream state: the input is always in a
          // failed state once its end has been reached.
          if (inputfile.is_open())
              inputfile.close();
          if (outputfile.is_open())
              outputfile.close();
      }
  protected:
      // Reads 4 bytes and combines them according to the encoding (utf32le or
      // big endian otherwise); returns the number of bytes that could be read.
      int readDWord(unsigned long &value)
      {
          unsigned char c[4];
          for (int i = 0; i < 4; i++)
              if (!getByte(c[i]))
                  return i;
          const unsigned long b0 = c[0], b1 = c[1], b2 = c[2], b3 = c[3];
          if (encoding == utf32le)
              value = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
          else
              value = b3 | (b2 << 8) | (b1 << 16) | (b0 << 24);
          return 4;
      }
      // Decodes the next code point of the input into cp.
      decode_status decodeNext(unsigned long &cp)
      {
          switch (encoding)
          {
              case detect: // if still unknown
                  encoding = utf8; // assume utf8 as default
                  /* fall through */
              case utf8:
              {
                  unsigned char lead;
                  if (!getByte(lead))
                      return end_of_input;
                  if (!(lead & 0x80))
                  {
                      cp = lead; // plain ASCII
                      return decoded;
                  }
                  if ((lead & 0xc0) != 0xc0)
                  {
                      // 10xxxxxx is only valid after a lead byte
                      std::cerr << "UTF-8 Error: invalid data byte" << std::endl;
                      return malformed;
                  }
                  // table for 64 bytes (all 11xxxxxx resp. >=192)
                  // lower 3 bits: number of following bytes
                  // upper 5 bits: data bits of the lead byte
                  const unsigned char entry = utf8table[lead & 0x3f];
                  cp = entry >> 3;
                  for (unsigned char pending = entry & 7; pending; --pending)
                  {
                      unsigned char next;
                      if (!getByte(next))
                          return end_of_input;
                      cp = (cp << 6) | (next & 0x3f);
                  }
                  return decoded;
              }
              case utf16le:
              case utf16be:
              {
                  unsigned short w, w2;
                  if (getWord(w) != 2)
                      return end_of_input;
                  if ((w & 0xfc00) != 0xd800)
                  {
                      cp = w;
                      return decoded;
                  }
                  // high surrogate first, a low surrogate has to follow
                  if (getWord(w2) != 2)
                      return end_of_input;
                  if ((w2 & 0xfc00) != 0xdc00)
                  {
                      std::cerr << "UTF-16 Error: invalid low surrogate" << std::endl;
                      return malformed;
                  }
                  cp = 0x10000UL + ((w & 0x3ffUL) << 10) + (w2 & 0x3ffUL);
                  return decoded;
              }
              case utf32le:
              case utf32be:
                  return (readDWord(cp) == 4) ? decoded : end_of_input;
          }
          return end_of_input;
      }
      // Writes the low 16 bits of w to the output, least significant byte first.
      void putWord(unsigned long w)
      {
          unsigned char bytes[2];
          bytes[0] = static_cast<unsigned char>(w & 0xff);
          bytes[1] = static_cast<unsigned char>((w >> 8) & 0xff);
          outputfile.write(reinterpret_cast<char*>(bytes), 2);
      }
  };

  // Indexed by (lead byte & 0x3f) for lead bytes 0xc0..0xff.
  // Each entry is (data bits of the lead byte << 3) | number of following bytes.
  const unsigned char utf_converter::utf8table[64] = {
  // 0xc0..0xcf: 1 following byte
  1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
  // 0xd0..0xdf: 1 following byte
  129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249,
  // 0xe0..0xef: 2 following bytes
  2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
  // 0xf0..0xf7: 3, 0xf8..0xfb: 4, 0xfc..0xfd: 5, 0xfe: 6, 0xff: 7 following bytes
  3, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20, 28, 5, 13, 6, 7
  };
 272
 273
 274 int main(int argc, char* argv[])
 275 {
 276     utf_converter::err_types err;
 277
 278     if (argc < 3)
 279     {
 280         cout << "usage: " << argv[0] << " inputfile outputfile" << endl;
 281         return -1;
 282     }
 283
 284     utf_converter::bom_types bom_type = utf_converter::bom;
 285
 286     if (argc == 4 && strcasecmp(argv[3], "nobom") == 0)
 287     {
 288         bom_type = utf_converter::nobom;
 289     }
 290
 291     utf_converter conv(argv[1], argv[2], bom_type);
 292
 293     if ((err = conv.getError())!=utf_converter::none)
 294     {
 295         switch (err)
 296         {
 297             case utf_converter::iopen:
 298                 cerr << "Couldn't open input file." << endl;
 299                 break;
 300             case utf_converter::oopen:
 301                 cerr << "Couldn't open output file." << endl;
 302                 break;
 303             default:
 304                 cerr << "Unknown error." << endl;
 305         }
 306         return -1;
 307     }
 308     else
 309     {
 310         conv.convert2utf16le();
 311     }
 312
 313     return 0;
 314 }