68be134e7aefce212bacdfbec1849047bcf86343
[reactos.git] / reactos / tools / unicode / wctomb.c
1 /*
2 * WideCharToMultiByte implementation
3 *
4 * Copyright 2000 Alexandre Julliard
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20
21 #include <string.h>
22
23 #include "wine/unicode.h"
24
25 /* search for a character in the unicode_compose_table; helper for compose() */
26 static inline int binary_search( WCHAR ch, int low, int high )
27 {
28 extern const WCHAR unicode_compose_table[];
29 while (low <= high)
30 {
31 int pos = (low + high) / 2;
32 if (unicode_compose_table[2*pos] < ch)
33 {
34 low = pos + 1;
35 continue;
36 }
37 if (unicode_compose_table[2*pos] > ch)
38 {
39 high = pos - 1;
40 continue;
41 }
42 return pos;
43 }
44 return -1;
45 }
46
47 /* return the result of the composition of two Unicode chars, or 0 if none */
48 static WCHAR compose( const WCHAR *str )
49 {
50 extern const WCHAR unicode_compose_table[];
51 extern const unsigned int unicode_compose_table_size;
52
53 int idx = 1, low = 0, high = unicode_compose_table_size - 1;
54 for (;;)
55 {
56 int pos = binary_search( str[idx], low, high );
57 if (pos == -1) return 0;
58 if (!idx--) return unicode_compose_table[2*pos+1];
59 low = unicode_compose_table[2*pos+1];
60 high = unicode_compose_table[2*pos+3] - 1;
61 }
62 }
63
64
65 /****************************************************************/
66 /* sbcs support */
67
68 /* check if 'ch' is an acceptable sbcs mapping for 'wch' */
69 static inline int is_valid_sbcs_mapping( const struct sbcs_table *table, int flags,
70 WCHAR wch, unsigned char ch )
71 {
72 if (flags & WC_NO_BEST_FIT_CHARS) return (table->cp2uni[ch] == wch);
73 if (ch != (unsigned char)table->info.def_char) return 1;
74 return (wch == table->info.def_unicode_char);
75 }
76
77 /* query necessary dst length for src string */
78 static int get_length_sbcs( const struct sbcs_table *table, int flags,
79 const WCHAR *src, unsigned int srclen, int *used )
80 {
81 const unsigned char * const uni2cp_low = table->uni2cp_low;
82 const unsigned short * const uni2cp_high = table->uni2cp_high;
83 int ret, tmp;
84 WCHAR composed;
85
86 if (!used) used = &tmp; /* avoid checking on every char */
87 *used = 0;
88
89 for (ret = 0; srclen; ret++, src++, srclen--)
90 {
91 WCHAR wch = *src;
92 unsigned char ch;
93
94 if ((flags & WC_COMPOSITECHECK) && (srclen > 1) && (composed = compose(src)))
95 {
96 /* now check if we can use the composed char */
97 ch = uni2cp_low[uni2cp_high[composed >> 8] + (composed & 0xff)];
98 if (is_valid_sbcs_mapping( table, flags, composed, ch ))
99 {
100 /* we have a good mapping, use it */
101 src++;
102 srclen--;
103 continue;
104 }
105 /* no mapping for the composed char, check the other flags */
106 if (flags & WC_DEFAULTCHAR) /* use the default char instead */
107 {
108 *used = 1;
109 src++; /* skip the non-spacing char */
110 srclen--;
111 continue;
112 }
113 if (flags & WC_DISCARDNS) /* skip the second char of the composition */
114 {
115 src++;
116 srclen--;
117 }
118 /* WC_SEPCHARS is the default */
119 }
120 if (!*used)
121 {
122 ch = uni2cp_low[uni2cp_high[wch >> 8] + (wch & 0xff)];
123 *used = !is_valid_sbcs_mapping( table, flags, wch, ch );
124 }
125 }
126 return ret;
127 }
128
129 /* wcstombs for single-byte code page */
130 static inline int wcstombs_sbcs( const struct sbcs_table *table,
131 const WCHAR *src, unsigned int srclen,
132 char *dst, unsigned int dstlen )
133 {
134 const unsigned char * const uni2cp_low = table->uni2cp_low;
135 const unsigned short * const uni2cp_high = table->uni2cp_high;
136 int ret = srclen;
137
138 if (dstlen < srclen)
139 {
140 /* buffer too small: fill it up to dstlen and return error */
141 srclen = dstlen;
142 ret = -1;
143 }
144
145 if (dst <= (const char *)src && dst + 16 > (const char *)src)
146 {
147 /* overlapping buffers, do it char by char */
148 while (srclen--)
149 {
150 *dst++ = uni2cp_low[uni2cp_high[*src >> 8] + (*src & 0xff)];
151 src++;
152 }
153 return ret;
154 }
155
156 for (;;)
157 {
158 switch(srclen)
159 {
160 default:
161 case 16: dst[15] = uni2cp_low[uni2cp_high[src[15] >> 8] + (src[15] & 0xff)];
162 case 15: dst[14] = uni2cp_low[uni2cp_high[src[14] >> 8] + (src[14] & 0xff)];
163 case 14: dst[13] = uni2cp_low[uni2cp_high[src[13] >> 8] + (src[13] & 0xff)];
164 case 13: dst[12] = uni2cp_low[uni2cp_high[src[12] >> 8] + (src[12] & 0xff)];
165 case 12: dst[11] = uni2cp_low[uni2cp_high[src[11] >> 8] + (src[11] & 0xff)];
166 case 11: dst[10] = uni2cp_low[uni2cp_high[src[10] >> 8] + (src[10] & 0xff)];
167 case 10: dst[9] = uni2cp_low[uni2cp_high[src[9] >> 8] + (src[9] & 0xff)];
168 case 9: dst[8] = uni2cp_low[uni2cp_high[src[8] >> 8] + (src[8] & 0xff)];
169 case 8: dst[7] = uni2cp_low[uni2cp_high[src[7] >> 8] + (src[7] & 0xff)];
170 case 7: dst[6] = uni2cp_low[uni2cp_high[src[6] >> 8] + (src[6] & 0xff)];
171 case 6: dst[5] = uni2cp_low[uni2cp_high[src[5] >> 8] + (src[5] & 0xff)];
172 case 5: dst[4] = uni2cp_low[uni2cp_high[src[4] >> 8] + (src[4] & 0xff)];
173 case 4: dst[3] = uni2cp_low[uni2cp_high[src[3] >> 8] + (src[3] & 0xff)];
174 case 3: dst[2] = uni2cp_low[uni2cp_high[src[2] >> 8] + (src[2] & 0xff)];
175 case 2: dst[1] = uni2cp_low[uni2cp_high[src[1] >> 8] + (src[1] & 0xff)];
176 case 1: dst[0] = uni2cp_low[uni2cp_high[src[0] >> 8] + (src[0] & 0xff)];
177 case 0: break;
178 }
179 if (srclen < 16) return ret;
180 dst += 16;
181 src += 16;
182 srclen -= 16;
183 }
184 }
185
186 /* slow version of wcstombs_sbcs that handles the various flags */
187 static int wcstombs_sbcs_slow( const struct sbcs_table *table, int flags,
188 const WCHAR *src, unsigned int srclen,
189 char *dst, unsigned int dstlen,
190 const char *defchar, int *used )
191 {
192 const unsigned char * const uni2cp_low = table->uni2cp_low;
193 const unsigned short * const uni2cp_high = table->uni2cp_high;
194 const unsigned char table_default = table->info.def_char & 0xff;
195 unsigned int len;
196 int tmp;
197 WCHAR composed;
198
199 if (!defchar) defchar = &table_default;
200 if (!used) used = &tmp; /* avoid checking on every char */
201 *used = 0;
202
203 for (len = dstlen; srclen && len; dst++, len--, src++, srclen--)
204 {
205 WCHAR wch = *src;
206
207 if ((flags & WC_COMPOSITECHECK) && (srclen > 1) && (composed = compose(src)))
208 {
209 /* now check if we can use the composed char */
210 *dst = uni2cp_low[uni2cp_high[composed >> 8] + (composed & 0xff)];
211 if (is_valid_sbcs_mapping( table, flags, composed, *dst ))
212 {
213 /* we have a good mapping, use it */
214 src++;
215 srclen--;
216 continue;
217 }
218 /* no mapping for the composed char, check the other flags */
219 if (flags & WC_DEFAULTCHAR) /* use the default char instead */
220 {
221 *dst = *defchar;
222 *used = 1;
223 src++; /* skip the non-spacing char */
224 srclen--;
225 continue;
226 }
227 if (flags & WC_DISCARDNS) /* skip the second char of the composition */
228 {
229 src++;
230 srclen--;
231 }
232 /* WC_SEPCHARS is the default */
233 }
234
235 *dst = uni2cp_low[uni2cp_high[wch >> 8] + (wch & 0xff)];
236 if (!is_valid_sbcs_mapping( table, flags, wch, *dst ))
237 {
238 *dst = *defchar;
239 *used = 1;
240 }
241 }
242 if (srclen) return -1; /* overflow */
243 return dstlen - len;
244 }
245
246
247 /****************************************************************/
248 /* dbcs support */
249
250 /* check if 'ch' is an acceptable dbcs mapping for 'wch' */
251 static inline int is_valid_dbcs_mapping( const struct dbcs_table *table, int flags,
252 WCHAR wch, unsigned short ch )
253 {
254 if (ch == table->info.def_char && wch != table->info.def_unicode_char) return 0;
255 if (flags & WC_NO_BEST_FIT_CHARS)
256 {
257 /* check if char maps back to the same Unicode value */
258 if (ch & 0xff00)
259 {
260 unsigned char off = table->cp2uni_leadbytes[ch >> 8];
261 return (table->cp2uni[(off << 8) + (ch & 0xff)] == wch);
262 }
263 return (table->cp2uni[ch & 0xff] == wch);
264 }
265 return 1;
266 }
267
268 /* query necessary dst length for src string */
269 static int get_length_dbcs( const struct dbcs_table *table, int flags,
270 const WCHAR *src, unsigned int srclen,
271 const char *defchar, int *used )
272 {
273 const unsigned short * const uni2cp_low = table->uni2cp_low;
274 const unsigned short * const uni2cp_high = table->uni2cp_high;
275 WCHAR defchar_value = table->info.def_char;
276 WCHAR composed;
277 int len, tmp;
278
279 if (!defchar && !used && !(flags & WC_COMPOSITECHECK))
280 {
281 for (len = 0; srclen; srclen--, src++, len++)
282 {
283 if (uni2cp_low[uni2cp_high[*src >> 8] + (*src & 0xff)] & 0xff00) len++;
284 }
285 return len;
286 }
287
288 if (defchar) defchar_value = defchar[1] ? ((defchar[0] << 8) | defchar[1]) : defchar[0];
289 if (!used) used = &tmp; /* avoid checking on every char */
290 *used = 0;
291 for (len = 0; srclen; len++, srclen--, src++)
292 {
293 unsigned short res;
294 WCHAR wch = *src;
295
296 if ((flags & WC_COMPOSITECHECK) && (srclen > 1) && (composed = compose(src)))
297 {
298 /* now check if we can use the composed char */
299 res = uni2cp_low[uni2cp_high[composed >> 8] + (composed & 0xff)];
300
301 if (is_valid_dbcs_mapping( table, flags, composed, res ))
302 {
303 /* we have a good mapping for the composed char, use it */
304 if (res & 0xff00) len++;
305 src++;
306 srclen--;
307 continue;
308 }
309 /* no mapping for the composed char, check the other flags */
310 if (flags & WC_DEFAULTCHAR) /* use the default char instead */
311 {
312 if (defchar_value & 0xff00) len++;
313 *used = 1;
314 src++; /* skip the non-spacing char */
315 srclen--;
316 continue;
317 }
318 if (flags & WC_DISCARDNS) /* skip the second char of the composition */
319 {
320 src++;
321 srclen--;
322 }
323 /* WC_SEPCHARS is the default */
324 }
325
326 res = uni2cp_low[uni2cp_high[wch >> 8] + (wch & 0xff)];
327 if (!is_valid_dbcs_mapping( table, flags, wch, res ))
328 {
329 res = defchar_value;
330 *used = 1;
331 }
332 if (res & 0xff00) len++;
333 }
334 return len;
335 }
336
337 /* wcstombs for double-byte code page */
338 static inline int wcstombs_dbcs( const struct dbcs_table *table,
339 const WCHAR *src, unsigned int srclen,
340 char *dst, unsigned int dstlen )
341 {
342 const unsigned short * const uni2cp_low = table->uni2cp_low;
343 const unsigned short * const uni2cp_high = table->uni2cp_high;
344 int len;
345
346 for (len = dstlen; srclen && len; len--, srclen--, src++)
347 {
348 unsigned short res = uni2cp_low[uni2cp_high[*src >> 8] + (*src & 0xff)];
349 if (res & 0xff00)
350 {
351 if (len == 1) break; /* do not output a partial char */
352 len--;
353 *dst++ = res >> 8;
354 }
355 *dst++ = (char)res;
356 }
357 if (srclen) return -1; /* overflow */
358 return dstlen - len;
359 }
360
361 /* slow version of wcstombs_dbcs that handles the various flags */
362 static int wcstombs_dbcs_slow( const struct dbcs_table *table, int flags,
363 const WCHAR *src, unsigned int srclen,
364 char *dst, unsigned int dstlen,
365 const char *defchar, int *used )
366 {
367 const unsigned short * const uni2cp_low = table->uni2cp_low;
368 const unsigned short * const uni2cp_high = table->uni2cp_high;
369 WCHAR defchar_value = table->info.def_char;
370 WCHAR composed;
371 int len, tmp;
372
373 if (defchar) defchar_value = defchar[1] ? ((defchar[0] << 8) | defchar[1]) : defchar[0];
374 if (!used) used = &tmp; /* avoid checking on every char */
375 *used = 0;
376
377 for (len = dstlen; srclen && len; len--, srclen--, src++)
378 {
379 unsigned short res;
380 WCHAR wch = *src;
381
382 if ((flags & WC_COMPOSITECHECK) && (srclen > 1) && (composed = compose(src)))
383 {
384 /* now check if we can use the composed char */
385 res = uni2cp_low[uni2cp_high[composed >> 8] + (composed & 0xff)];
386
387 if (is_valid_dbcs_mapping( table, flags, composed, res ))
388 {
389 /* we have a good mapping for the composed char, use it */
390 src++;
391 srclen--;
392 goto output_char;
393 }
394 /* no mapping for the composed char, check the other flags */
395 if (flags & WC_DEFAULTCHAR) /* use the default char instead */
396 {
397 res = defchar_value;
398 *used = 1;
399 src++; /* skip the non-spacing char */
400 srclen--;
401 goto output_char;
402 }
403 if (flags & WC_DISCARDNS) /* skip the second char of the composition */
404 {
405 src++;
406 srclen--;
407 }
408 /* WC_SEPCHARS is the default */
409 }
410
411 res = uni2cp_low[uni2cp_high[wch >> 8] + (wch & 0xff)];
412 if (!is_valid_dbcs_mapping( table, flags, wch, res ))
413 {
414 res = defchar_value;
415 *used = 1;
416 }
417
418 output_char:
419 if (res & 0xff00)
420 {
421 if (len == 1) break; /* do not output a partial char */
422 len--;
423 *dst++ = res >> 8;
424 }
425 *dst++ = (char)res;
426 }
427 if (srclen) return -1; /* overflow */
428 return dstlen - len;
429 }
430
431 /* wide char to multi byte string conversion */
432 /* return -1 on dst buffer overflow */
433 int wine_cp_wcstombs( const union cptable *table, int flags,
434 const WCHAR *src, int srclen,
435 char *dst, int dstlen, const char *defchar, int *used )
436 {
437 if (table->info.char_size == 1)
438 {
439 if (flags || defchar || used)
440 {
441 if (!dstlen) return get_length_sbcs( &table->sbcs, flags, src, srclen, used );
442 return wcstombs_sbcs_slow( &table->sbcs, flags, src, srclen,
443 dst, dstlen, defchar, used );
444 }
445 if (!dstlen) return srclen;
446 return wcstombs_sbcs( &table->sbcs, src, srclen, dst, dstlen );
447 }
448 else /* mbcs */
449 {
450 if (!dstlen) return get_length_dbcs( &table->dbcs, flags, src, srclen, defchar, used );
451 if (flags || defchar || used)
452 return wcstombs_dbcs_slow( &table->dbcs, flags, src, srclen,
453 dst, dstlen, defchar, used );
454 return wcstombs_dbcs( &table->dbcs, src, srclen, dst, dstlen );
455 }
456 }
457
458 /* CP_SYMBOL implementation */
459 /* return -1 on dst buffer overflow, -2 on invalid character */
460 int wine_cpsymbol_wcstombs( const WCHAR *src, int srclen, char *dst, int dstlen)
461 {
462 int len, i;
463 if( dstlen == 0) return srclen;
464 len = dstlen > srclen ? srclen : dstlen;
465 for( i = 0; i < len; i++)
466 {
467 WCHAR w = src [ i ];
468 if( w < 0x20 )
469 dst[i] = w;
470 else if( w >= 0xf020 && w < 0xf100)
471 dst[i] = w - 0xf000;
472 else
473 return -2;
474 }
475 if( srclen > len) return -1;
476 return len;
477 }