#include <fstream>
#include <iostream>
+#include <string.h>
//#define DISPLAY_DETECTED_UNICODE
using namespace std;
+#ifdef _MSC_VER
+#define strcasecmp _stricmp // when built with MSVC, route the strcasecmp() call in main() to _stricmp
+#endif // _MSC_VER
+
class utf_converter
{
public:
// due to ambiguous BOM
enum enc_types { detect, utf8, utf16le, utf16be, utf32le, utf32be };
enum err_types { none, iopen, oopen, eof, read, write, decode };
+ enum bom_types { bom, nobom }; // bom: convert2utf16le() writes the 0xff 0xfe mark before the data; nobom: that write is skipped
protected:
err_types error;
enc_types encoding;
- unsigned char buffer[4], fill, index; // need 4 char buffer for optional BOM handling
+ bom_types bom_type; // output BOM choice, initialised from the constructor's ofbom argument
+ unsigned char buffer[4], index; // need 4 char buffer for optional BOM handling
+ std::streamsize fill; // holds inputfile.gcount() after the initial 4-byte read into buffer
fstream inputfile,outputfile;
static const unsigned char utf8table[64];
public:
- utf_converter(string ifname, string ofname, enc_types enc = detect) : error(none), encoding(enc), fill(0), index(0)
+ utf_converter(string ifname, string ofname, bom_types ofbom = bom, enc_types enc = detect) : error(none), bom_type(ofbom), encoding(enc), fill(0), index(0)
{
enc_types tmp_enc;
inputfile.open(ifname.c_str(), ios::in | ios::binary);
valid values are 0xef, 0xff, 0xfe, 0x00
*/
inputfile.read(reinterpret_cast<char*>(&buffer),4);
- fill =inputfile.gcount();
+ fill = inputfile.gcount();
// stupid utf8 bom
if ((fill > 2) &&
(buffer[0] == 0xef) &&
}
return utf8; // no valid bom so use utf8 as default
}
- int getByte(unsigned char &c)
+ std::streamsize getByte(unsigned char &c)
{
if (fill)
{
return inputfile.gcount();
}
}
- int getWord(unsigned short &w)
+ std::streamsize getWord(unsigned short &w)
{
unsigned char c[2];
if (!getByte(c[0]))
w = c[1] | (c[0] << 8);
return 2;
}
- int getDWord(wchar_t &d)
+ std::streamsize getDWord(wchar_t &d)
{
unsigned char c[4];
for (int i=0;i<4;i++)
}
void convert2utf16le()
{
- wchar_t c;
- unsigned char buffer[2] = {0xff, 0xfe};
- outputfile.write(reinterpret_cast<char*>(&buffer),2); // write BOM
- c = get_wchar_t();
+ unsigned char buffer[2] = { 0xff, 0xfe };
+
+ if (bom_type == bom)
+ {
+ outputfile.write(reinterpret_cast<char*>(&buffer), 2); // write BOM
+ }
+
+ wchar_t c = get_wchar_t();
+
while (!inputfile.eof())
{
buffer[0] = c & 0xff;
int main(int argc, char* argv[])
{
utf_converter::err_types err;
+
if (argc < 3)
{
cout << "usage: " << argv[0] << " inputfile outputfile" << endl;
return -1;
}
- utf_converter conv(argv[1],argv[2]);
+
+ utf_converter::bom_types bom_type = utf_converter::bom; // default when no third argument is given: the BOM is written
+
+ if (argc == 4 && strcasecmp(argv[3], "nobom") == 0) // optional third argument "nobom", compared case-insensitively; NOTE(review): the usage text above does not mention it, and it is ignored if further arguments follow
+ {
+ bom_type = utf_converter::nobom;
+ }
+
+ utf_converter conv(argv[1], argv[2], bom_type); // encoding parameter is left at its default (detect)
+
if ((err = conv.getError())!=utf_converter::none)
{
switch (err)
cerr << "Unknown error." << endl;
}
return -1;
- } else
- conv.convert2utf16le();
+ }
+ else
+ {
+ conv.convert2utf16le(); // reached only when getError() reported none
+ }
+
return 0;
}