reactos/tools/unicode/mbtowc.c

   1 /*
   2  * MultiByteToWideChar implementation
   3  *
   4  * Copyright 2000 Alexandre Julliard
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
  19  */
  20
  21 #include <string.h>
  22
  23 #include "wine/unicode.h"
  24
  25 /* get the decomposition of a Unicode char */
  26 static int get_decomposition( WCHAR src, WCHAR *dst, unsigned int dstlen )
  27 {
  28     extern const WCHAR unicode_decompose_table[];
  29     const WCHAR *ptr = unicode_decompose_table;
  30     int res;
  31
  32     *dst = src;
  33     ptr = unicode_decompose_table + ptr[src >> 8];
  34     ptr = unicode_decompose_table + ptr[(src >> 4) & 0x0f] + 2 * (src & 0x0f);
  35     if (!*ptr) return 1;
  36     if (dstlen <= 1) return 0;
  37     /* apply the decomposition recursively to the first char */
  38     if ((res = get_decomposition( *ptr, dst, dstlen-1 ))) dst[res++] = ptr[1];
  39     return res;
  40 }
  41
  42 /* check the code whether it is in Unicode Private Use Area (PUA). */
  43 /* MB_ERR_INVALID_CHARS raises an error converting from 1-byte character to PUA. */
  44 static inline int is_private_use_area_char(WCHAR code)
  45 {
  46     return (code >= 0xe000 && code <= 0xf8ff);
  47 }
  48
  49 /* check src string for invalid chars; return non-zero if invalid char found */
  50 static inline int check_invalid_chars_sbcs( const struct sbcs_table *table, int flags,
  51                                             const unsigned char *src, unsigned int srclen )
  52 {
  53     const WCHAR * const cp2uni = (flags & MB_USEGLYPHCHARS) ? table->cp2uni_glyphs : table->cp2uni;
  54     const WCHAR def_unicode_char = table->info.def_unicode_char;
  55     const unsigned char def_char = table->uni2cp_low[table->uni2cp_high[def_unicode_char >> 8]
  56                                                      + (def_unicode_char & 0xff)];
  57     while (srclen)
  58     {
  59         if ((cp2uni[*src] == def_unicode_char && *src != def_char) ||
  60             is_private_use_area_char(cp2uni[*src])) break;
  61         src++;
  62         srclen--;
  63     }
  64     return srclen;
  65 }
  66
  67 /* mbstowcs for single-byte code page */
  68 /* all lengths are in characters, not bytes */
  69 static inline int mbstowcs_sbcs( const struct sbcs_table *table, int flags,
  70                                  const unsigned char *src, unsigned int srclen,
  71                                  WCHAR *dst, unsigned int dstlen )
  72 {
  73     const WCHAR * const cp2uni = (flags & MB_USEGLYPHCHARS) ? table->cp2uni_glyphs : table->cp2uni;
  74     int ret = srclen;
  75
  76     if (dstlen < srclen)
  77     {
  78         /* buffer too small: fill it up to dstlen and return error */
  79         srclen = dstlen;
  80         ret = -1;
  81     }
  82
  83     for (;;)
  84     {
  85         switch(srclen)
  86         {
  87         default:
  88         case 16: dst[15] = cp2uni[src[15]];
  89         case 15: dst[14] = cp2uni[src[14]];
  90         case 14: dst[13] = cp2uni[src[13]];
  91         case 13: dst[12] = cp2uni[src[12]];
  92         case 12: dst[11] = cp2uni[src[11]];
  93         case 11: dst[10] = cp2uni[src[10]];
  94         case 10: dst[9]  = cp2uni[src[9]];
  95         case 9:  dst[8]  = cp2uni[src[8]];
  96         case 8:  dst[7]  = cp2uni[src[7]];
  97         case 7:  dst[6]  = cp2uni[src[6]];
  98         case 6:  dst[5]  = cp2uni[src[5]];
  99         case 5:  dst[4]  = cp2uni[src[4]];
 100         case 4:  dst[3]  = cp2uni[src[3]];
 101         case 3:  dst[2]  = cp2uni[src[2]];
 102         case 2:  dst[1]  = cp2uni[src[1]];
 103         case 1:  dst[0]  = cp2uni[src[0]];
 104         case 0: break;
 105         }
 106         if (srclen < 16) return ret;
 107         dst += 16;
 108         src += 16;
 109         srclen -= 16;
 110     }
 111 }
 112
 113 /* mbstowcs for single-byte code page with char decomposition */
 114 static int mbstowcs_sbcs_decompose( const struct sbcs_table *table, int flags,
 115                                     const unsigned char *src, unsigned int srclen,
 116                                     WCHAR *dst, unsigned int dstlen )
 117 {
 118     const WCHAR * const cp2uni = (flags & MB_USEGLYPHCHARS) ? table->cp2uni_glyphs : table->cp2uni;
 119     unsigned int len;
 120
 121     if (!dstlen)  /* compute length */
 122     {
 123         WCHAR dummy[4]; /* no decomposition is larger than 4 chars */
 124         for (len = 0; srclen; srclen--, src++)
 125             len += get_decomposition( cp2uni[*src], dummy, 4 );
 126         return len;
 127     }
 128
 129     for (len = dstlen; srclen && len; srclen--, src++)
 130     {
 131         int res = get_decomposition( cp2uni[*src], dst, len );
 132         if (!res) break;
 133         len -= res;
 134         dst += res;
 135     }
 136     if (srclen) return -1;  /* overflow */
 137     return dstlen - len;
 138 }
 139
 140 /* query necessary dst length for src string */
 141 static inline int get_length_dbcs( const struct dbcs_table *table,
 142                                    const unsigned char *src, unsigned int srclen )
 143 {
 144     const unsigned char * const cp2uni_lb = table->cp2uni_leadbytes;
 145     int len;
 146
 147     for (len = 0; srclen; srclen--, src++, len++)
 148     {
 149         if (cp2uni_lb[*src])
 150         {
 151             if (!--srclen) break;  /* partial char, ignore it */
 152             src++;
 153         }
 154     }
 155     return len;
 156 }
 157
 158 /* check src string for invalid chars; return non-zero if invalid char found */
 159 static inline int check_invalid_chars_dbcs( const struct dbcs_table *table,
 160                                             const unsigned char *src, unsigned int srclen )
 161 {
 162     const WCHAR * const cp2uni = table->cp2uni;
 163     const unsigned char * const cp2uni_lb = table->cp2uni_leadbytes;
 164     const WCHAR def_unicode_char = table->info.def_unicode_char;
 165     const unsigned short def_char = table->uni2cp_low[table->uni2cp_high[def_unicode_char >> 8]
 166                                                       + (def_unicode_char & 0xff)];
 167     while (srclen)
 168     {
 169         unsigned char off = cp2uni_lb[*src];
 170         if (off)  /* multi-byte char */
 171         {
 172             if (srclen == 1) break;  /* partial char, error */
 173             if (cp2uni[(off << 8) + src[1]] == def_unicode_char &&
 174                 ((src[0] << 8) | src[1]) != def_char) break;
 175             src++;
 176             srclen--;
 177         }
 178         else if ((cp2uni[*src] == def_unicode_char && *src != def_char) ||
 179                  is_private_use_area_char(cp2uni[*src])) break;
 180         src++;
 181         srclen--;
 182     }
 183     return srclen;
 184 }
 185
 186 /* mbstowcs for double-byte code page */
 187 /* all lengths are in characters, not bytes */
 188 static inline int mbstowcs_dbcs( const struct dbcs_table *table,
 189                                  const unsigned char *src, unsigned int srclen,
 190                                  WCHAR *dst, unsigned int dstlen )
 191 {
 192     const WCHAR * const cp2uni = table->cp2uni;
 193     const unsigned char * const cp2uni_lb = table->cp2uni_leadbytes;
 194     unsigned int len;
 195
 196     if (!dstlen) return get_length_dbcs( table, src, srclen );
 197
 198     for (len = dstlen; srclen && len; len--, srclen--, src++, dst++)
 199     {
 200         unsigned char off = cp2uni_lb[*src];
 201         if (off)
 202         {
 203             if (!--srclen) break;  /* partial char, ignore it */
 204             src++;
 205             *dst = cp2uni[(off << 8) + *src];
 206         }
 207         else *dst = cp2uni[*src];
 208     }
 209     if (srclen) return -1;  /* overflow */
 210     return dstlen - len;
 211 }
 212
 213
 214 /* mbstowcs for double-byte code page with character decomposition */
 215 static int mbstowcs_dbcs_decompose( const struct dbcs_table *table,
 216                                     const unsigned char *src, unsigned int srclen,
 217                                     WCHAR *dst, unsigned int dstlen )
 218 {
 219     const WCHAR * const cp2uni = table->cp2uni;
 220     const unsigned char * const cp2uni_lb = table->cp2uni_leadbytes;
 221     unsigned int len;
 222     WCHAR ch;
 223     int res;
 224
 225     if (!dstlen)  /* compute length */
 226     {
 227         WCHAR dummy[4]; /* no decomposition is larger than 4 chars */
 228         for (len = 0; srclen; srclen--, src++)
 229         {
 230             unsigned char off = cp2uni_lb[*src];
 231             if (off)
 232             {
 233                 if (!--srclen) break;  /* partial char, ignore it */
 234                 src++;
 235                 ch = cp2uni[(off << 8) + *src];
 236             }
 237             else ch = cp2uni[*src];
 238             len += get_decomposition( ch, dummy, 4 );
 239         }
 240         return len;
 241     }
 242
 243     for (len = dstlen; srclen && len; srclen--, src++)
 244     {
 245         unsigned char off = cp2uni_lb[*src];
 246         if (off)
 247         {
 248             if (!--srclen) break;  /* partial char, ignore it */
 249             src++;
 250             ch = cp2uni[(off << 8) + *src];
 251         }
 252         else ch = cp2uni[*src];
 253         if (!(res = get_decomposition( ch, dst, len ))) break;
 254         dst += res;
 255         len -= res;
 256     }
 257     if (srclen) return -1;  /* overflow */
 258     return dstlen - len;
 259 }
 260
 261
 262 /* return -1 on dst buffer overflow, -2 on invalid input char */
 263 int wine_cp_mbstowcs( const union cptable *table, int flags,
 264                       const char *s, int srclen,
 265                       WCHAR *dst, int dstlen )
 266 {
 267     const unsigned char *src = (const unsigned char*) s;
 268
 269     if (table->info.char_size == 1)
 270     {
 271         if (flags & MB_ERR_INVALID_CHARS)
 272         {
 273             if (check_invalid_chars_sbcs( &table->sbcs, flags, src, srclen )) return -2;
 274         }
 275         if (!(flags & MB_COMPOSITE))
 276         {
 277             if (!dstlen) return srclen;
 278             return mbstowcs_sbcs( &table->sbcs, flags, src, srclen, dst, dstlen );
 279         }
 280         return mbstowcs_sbcs_decompose( &table->sbcs, flags, src, srclen, dst, dstlen );
 281     }
 282     else /* mbcs */
 283     {
 284         if (flags & MB_ERR_INVALID_CHARS)
 285         {
 286             if (check_invalid_chars_dbcs( &table->dbcs, src, srclen )) return -2;
 287         }
 288         if (!(flags & MB_COMPOSITE))
 289             return mbstowcs_dbcs( &table->dbcs, src, srclen, dst, dstlen );
 290         else
 291             return mbstowcs_dbcs_decompose( &table->dbcs, src, srclen, dst, dstlen );
 292     }
 293 }
 294
 295 /* CP_SYMBOL implementation */
 296 /* return -1 on dst buffer overflow */
 297 int wine_cpsymbol_mbstowcs( const char *src, int srclen, WCHAR *dst, int dstlen)
 298 {
 299     int len, i;
 300     if( dstlen == 0) return srclen;
 301     len = dstlen > srclen ? srclen : dstlen;
 302     for( i = 0; i < len; i++)
 303     {
 304         unsigned char c = src [ i ];
 305         if( c < 0x20 )
 306             dst[i] = c;
 307         else
 308             dst[i] = c + 0xf000;
 309     }
 310     if( srclen > len) return -1;
 311     return len;
 312 }