- support of [Strings.LanguageID]-sections for inf-files added in setupapi
[reactos.git] / reactos / lib / 3rdparty / icu4ros / icu / source / common / ustrtrns.c
1 /*
2 ******************************************************************************
3 *
4 * Copyright (C) 2001-2007, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ******************************************************************************
8 *
9 * File ustrtrns.c
10 *
11 * Modification History:
12 *
13 * Date Name Description
14 * 9/10/2001 Ram Creation.
15 ******************************************************************************
16 */
17
18 /*******************************************************************************
19 *
20 * u_strTo* and u_strFrom* APIs
21 * WCS functions moved to ustr_wcs.c for better modularization
22 *
23 *******************************************************************************
24 */
25
26
27 #include "unicode/putil.h"
28 #include "unicode/ustring.h"
29 #include "cstring.h"
30 #include "cmemory.h"
31 #include "ustr_imp.h"
32
33 U_CAPI UChar* U_EXPORT2
34 u_strFromUTF32(UChar *dest,
35 int32_t destCapacity,
36 int32_t *pDestLength,
37 const UChar32 *src,
38 int32_t srcLength,
39 UErrorCode *pErrorCode)
40 {
41 int32_t reqLength = 0;
42 uint32_t ch =0;
43 UChar *pDestLimit =dest+destCapacity;
44 UChar *pDest = dest;
45 const uint32_t *pSrc = (const uint32_t *)src;
46
47 /* args check */
48 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
49 return NULL;
50 }
51
52 if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
53 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
54 return NULL;
55 }
56
57 /* Check if the source is null terminated */
58 if(srcLength == -1 ){
59 while(((ch=*pSrc)!=0) && (pDest < pDestLimit)){
60 ++pSrc;
61 if(ch<=0xFFFF){
62 *(pDest++)=(UChar)ch;
63 }else if(ch<=0x10ffff){
64 *(pDest++)=UTF16_LEAD(ch);
65 if(pDest<pDestLimit){
66 *(pDest++)=UTF16_TRAIL(ch);
67 }else{
68 reqLength++;
69 break;
70 }
71 }else{
72 *pErrorCode = U_INVALID_CHAR_FOUND;
73 return NULL;
74 }
75 }
76 while((ch=*pSrc++) != 0){
77 reqLength+=UTF_CHAR_LENGTH(ch);
78 }
79 }else{
80 const uint32_t* pSrcLimit = ((const uint32_t*)pSrc) + srcLength;
81 while((pSrc < pSrcLimit) && (pDest < pDestLimit)){
82 ch = *pSrc++;
83 if(ch<=0xFFFF){
84 *(pDest++)=(UChar)ch;
85 }else if(ch<=0x10FFFF){
86 *(pDest++)=UTF16_LEAD(ch);
87 if(pDest<pDestLimit){
88 *(pDest++)=UTF16_TRAIL(ch);
89 }else{
90 reqLength++;
91 break;
92 }
93 }else{
94 *pErrorCode = U_INVALID_CHAR_FOUND;
95 return NULL;
96 }
97 }
98 while(pSrc <pSrcLimit){
99 ch = *pSrc++;
100 reqLength+=UTF_CHAR_LENGTH(ch);
101 }
102 }
103
104 reqLength += (int32_t)(pDest - dest);
105 if(pDestLength){
106 *pDestLength = reqLength;
107 }
108
109 /* Terminate the buffer */
110 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
111
112 return dest;
113 }
114
115
116 U_CAPI UChar32* U_EXPORT2
117 u_strToUTF32(UChar32 *dest,
118 int32_t destCapacity,
119 int32_t *pDestLength,
120 const UChar *src,
121 int32_t srcLength,
122 UErrorCode *pErrorCode)
123 {
124 const UChar* pSrc = src;
125 const UChar* pSrcLimit;
126 int32_t reqLength=0;
127 uint32_t ch=0;
128 uint32_t *pDest = (uint32_t *)dest;
129 uint32_t *pDestLimit = pDest + destCapacity;
130 UChar ch2=0;
131
132 /* args check */
133 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
134 return NULL;
135 }
136
137
138 if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
139 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
140 return NULL;
141 }
142
143 if(srcLength==-1) {
144 while((ch=*pSrc)!=0 && pDest!=pDestLimit) {
145 ++pSrc;
146 /*need not check for NUL because NUL fails UTF_IS_TRAIL() anyway*/
147 if(UTF_IS_LEAD(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
148 ++pSrc;
149 ch=UTF16_GET_PAIR_VALUE(ch, ch2);
150 }
151 *(pDest++)= ch;
152 }
153 while((ch=*pSrc++)!=0) {
154 if(UTF_IS_LEAD(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
155 ++pSrc;
156 }
157 ++reqLength;
158 }
159 } else {
160 pSrcLimit = pSrc+srcLength;
161 while(pSrc<pSrcLimit && pDest<pDestLimit) {
162 ch=*pSrc++;
163 if(UTF_IS_LEAD(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
164 ++pSrc;
165 ch=UTF16_GET_PAIR_VALUE(ch, ch2);
166 }
167 *(pDest++)= ch;
168 }
169 while(pSrc!=pSrcLimit) {
170 ch=*pSrc++;
171 if(UTF_IS_LEAD(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
172 ++pSrc;
173 }
174 ++reqLength;
175 }
176 }
177
178 reqLength+=(int32_t)(pDest - (uint32_t *)dest);
179 if(pDestLength){
180 *pDestLength = reqLength;
181 }
182
183 /* Terminate the buffer */
184 u_terminateUChar32s(dest,destCapacity,reqLength,pErrorCode);
185
186 return dest;
187 }
188
189 /* for utf8_nextCharSafeBodyTerminated() */
190 static const UChar32
191 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
192
193 /*
194 * Version of utf8_nextCharSafeBody() with the following differences:
195 * - checks for NUL termination instead of length
196 * - works with pointers instead of indexes
197 * - always strict (strict==-1)
198 *
199 * *ps points to after the lead byte and will be moved to after the last trail byte.
200 * c is the lead byte.
201 * @return the code point, or U_SENTINEL
202 */
203 static UChar32
204 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
205 const uint8_t *s=*ps;
206 uint8_t trail, illegal=0;
207 uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
208 UTF8_MASK_LEAD_BYTE((c), count);
209 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
210 switch(count) {
211 /* each branch falls through to the next one */
212 case 5:
213 case 4:
214 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
215 illegal=1;
216 break;
217 case 3:
218 trail=(uint8_t)(*s++ - 0x80);
219 c=(c<<6)|trail;
220 if(trail>0x3f || c>=0x110) {
221 /* not a trail byte, or code point>0x10ffff (outside Unicode) */
222 illegal=1;
223 break;
224 }
225 case 2:
226 trail=(uint8_t)(*s++ - 0x80);
227 if(trail>0x3f) {
228 /* not a trail byte */
229 illegal=1;
230 break;
231 }
232 c=(c<<6)|trail;
233 case 1:
234 trail=(uint8_t)(*s++ - 0x80);
235 if(trail>0x3f) {
236 /* not a trail byte */
237 illegal=1;
238 }
239 c=(c<<6)|trail;
240 break;
241 case 0:
242 return U_SENTINEL;
243 /* no default branch to optimize switch() - all values are covered */
244 }
245
246 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
247 /* illegal is also set if count>=4 */
248 if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) {
249 /* error handling */
250 /* don't go beyond this sequence */
251 s=*ps;
252 while(count>0 && UTF8_IS_TRAIL(*s)) {
253 ++s;
254 --count;
255 }
256 c=U_SENTINEL;
257 }
258 *ps=s;
259 return c;
260 }
261
262 /*
263 * Version of utf8_nextCharSafeBody() with the following differences:
264 * - works with pointers instead of indexes
265 * - always strict (strict==-1)
266 *
267 * *ps points to after the lead byte and will be moved to after the last trail byte.
268 * c is the lead byte.
269 * @return the code point, or U_SENTINEL
270 */
271 static UChar32
272 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
273 const uint8_t *s=*ps;
274 uint8_t trail, illegal=0;
275 uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
276 if((limit-s)>=count) {
277 UTF8_MASK_LEAD_BYTE((c), count);
278 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
279 switch(count) {
280 /* each branch falls through to the next one */
281 case 5:
282 case 4:
283 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
284 illegal=1;
285 break;
286 case 3:
287 trail=*s++;
288 c=(c<<6)|(trail&0x3f);
289 if(c<0x110) {
290 illegal|=(trail&0xc0)^0x80;
291 } else {
292 /* code point>0x10ffff, outside Unicode */
293 illegal=1;
294 break;
295 }
296 case 2:
297 trail=*s++;
298 c=(c<<6)|(trail&0x3f);
299 illegal|=(trail&0xc0)^0x80;
300 case 1:
301 trail=*s++;
302 c=(c<<6)|(trail&0x3f);
303 illegal|=(trail&0xc0)^0x80;
304 break;
305 case 0:
306 return U_SENTINEL;
307 /* no default branch to optimize switch() - all values are covered */
308 }
309 } else {
310 illegal=1; /* too few bytes left */
311 }
312
313 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
314 /* illegal is also set if count>=4 */
315 if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) {
316 /* error handling */
317 /* don't go beyond this sequence */
318 s=*ps;
319 while(count>0 && s<limit && UTF8_IS_TRAIL(*s)) {
320 ++s;
321 --count;
322 }
323 c=U_SENTINEL;
324 }
325 *ps=s;
326 return c;
327 }
328
329 U_CAPI UChar* U_EXPORT2
330 u_strFromUTF8WithSub(UChar *dest,
331 int32_t destCapacity,
332 int32_t *pDestLength,
333 const char* src,
334 int32_t srcLength,
335 UChar32 subchar, int32_t *pNumSubstitutions,
336 UErrorCode *pErrorCode){
337
338 UChar *pDest = dest;
339 UChar *pDestLimit = dest+destCapacity;
340 UChar32 ch;
341 int32_t reqLength = 0;
342 const uint8_t* pSrc = (const uint8_t*) src;
343 uint8_t t1, t2; /* trail bytes */
344 int32_t numSubstitutions;
345
346 /* args check */
347 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
348 return NULL;
349 }
350
351 if( (src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0) ||
352 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
353 ) {
354 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
355 return NULL;
356 }
357
358 numSubstitutions=0;
359
360 /*
361 * Inline processing of UTF-8 byte sequences:
362 *
363 * Byte sequences for the most common characters are handled inline in
364 * the conversion loops. In order to reduce the path lengths for those
365 * characters, the tests are arranged in a kind of binary search.
366 * ASCII (<=0x7f) is checked first, followed by the dividing point
367 * between 2- and 3-byte sequences (0xe0).
368 * The 3-byte branch is tested first to speed up CJK text.
369 * The compiler should combine the subtractions for the two tests for 0xe0.
370 * Each branch then tests for the other end of its range.
371 */
372
373 if(srcLength < 0){
374 /*
375 * Transform a NUL-terminated string.
376 * The code explicitly checks for NULs only in the lead byte position.
377 * A NUL byte in the trail byte position fails the trail byte range check anyway.
378 */
379 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
380 if(ch <= 0x7f){
381 *pDest++=(UChar)ch;
382 ++pSrc;
383 } else {
384 if(ch > 0xe0) {
385 if( /* handle U+1000..U+CFFF inline */
386 ch <= 0xec &&
387 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
388 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
389 ) {
390 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
391 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
392 pSrc += 3;
393 continue;
394 }
395 } else if(ch < 0xe0) {
396 if( /* handle U+0080..U+07FF inline */
397 ch >= 0xc2 &&
398 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
399 ) {
400 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
401 pSrc += 2;
402 continue;
403 }
404 }
405
406 /* function call for "complicated" and error cases */
407 ++pSrc; /* continue after the lead byte */
408 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
409 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
410 *pErrorCode = U_INVALID_CHAR_FOUND;
411 return NULL;
412 } else if(ch<=0xFFFF) {
413 *(pDest++)=(UChar)ch;
414 } else {
415 *(pDest++)=UTF16_LEAD(ch);
416 if(pDest<pDestLimit) {
417 *(pDest++)=UTF16_TRAIL(ch);
418 } else {
419 reqLength++;
420 break;
421 }
422 }
423 }
424 }
425
426 /* Pre-flight the rest of the string. */
427 while((ch = *pSrc) != 0) {
428 if(ch <= 0x7f){
429 ++reqLength;
430 ++pSrc;
431 } else {
432 if(ch > 0xe0) {
433 if( /* handle U+1000..U+CFFF inline */
434 ch <= 0xec &&
435 (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
436 (uint8_t)(pSrc[2] - 0x80) <= 0x3f
437 ) {
438 ++reqLength;
439 pSrc += 3;
440 continue;
441 }
442 } else if(ch < 0xe0) {
443 if( /* handle U+0080..U+07FF inline */
444 ch >= 0xc2 &&
445 (uint8_t)(pSrc[1] - 0x80) <= 0x3f
446 ) {
447 ++reqLength;
448 pSrc += 2;
449 continue;
450 }
451 }
452
453 /* function call for "complicated" and error cases */
454 ++pSrc; /* continue after the lead byte */
455 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
456 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
457 *pErrorCode = U_INVALID_CHAR_FOUND;
458 return NULL;
459 }
460 reqLength += U16_LENGTH(ch);
461 }
462 }
463 } else /* srcLength >= 0 */ {
464 const uint8_t *pSrcLimit = pSrc + srcLength;
465 int32_t count;
466
467 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
468 for(;;) {
469 /*
470 * Each iteration of the inner loop progresses by at most 3 UTF-8
471 * bytes and one UChar, for most characters.
472 * For supplementary code points (4 & 2), which are rare,
473 * there is an additional adjustment.
474 */
475 count = (int32_t)(pDestLimit - pDest);
476 srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
477 if(count > srcLength) {
478 count = srcLength; /* min(remaining dest, remaining src/3) */
479 }
480 if(count < 3) {
481 /*
482 * Too much overhead if we get near the end of the string,
483 * continue with the next loop.
484 */
485 break;
486 }
487
488 do {
489 ch = *pSrc;
490 if(ch <= 0x7f){
491 *pDest++=(UChar)ch;
492 ++pSrc;
493 } else {
494 if(ch > 0xe0) {
495 if( /* handle U+1000..U+CFFF inline */
496 ch <= 0xec &&
497 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
498 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
499 ) {
500 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
501 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
502 pSrc += 3;
503 continue;
504 }
505 } else if(ch < 0xe0) {
506 if( /* handle U+0080..U+07FF inline */
507 ch >= 0xc2 &&
508 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
509 ) {
510 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
511 pSrc += 2;
512 continue;
513 }
514 }
515
516 if(ch >= 0xf0 || subchar > 0xffff) {
517 /*
518 * We may read up to six bytes and write up to two UChars,
519 * which we didn't account for with computing count,
520 * so we adjust it here.
521 */
522 if(--count == 0) {
523 break;
524 }
525 }
526
527 /* function call for "complicated" and error cases */
528 ++pSrc; /* continue after the lead byte */
529 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
530 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
531 *pErrorCode = U_INVALID_CHAR_FOUND;
532 return NULL;
533 }else if(ch<=0xFFFF){
534 *(pDest++)=(UChar)ch;
535 }else{
536 *(pDest++)=UTF16_LEAD(ch);
537 if(pDest<pDestLimit){
538 *(pDest++)=UTF16_TRAIL(ch);
539 }else{
540 reqLength++;
541 break;
542 }
543 }
544 }
545 } while(--count > 0);
546 }
547
548 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
549 ch = *pSrc;
550 if(ch <= 0x7f){
551 *pDest++=(UChar)ch;
552 ++pSrc;
553 } else {
554 if(ch > 0xe0) {
555 if( /* handle U+1000..U+CFFF inline */
556 ch <= 0xec &&
557 ((pSrcLimit - pSrc) >= 3) &&
558 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
559 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
560 ) {
561 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
562 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
563 pSrc += 3;
564 continue;
565 }
566 } else if(ch < 0xe0) {
567 if( /* handle U+0080..U+07FF inline */
568 ch >= 0xc2 &&
569 ((pSrcLimit - pSrc) >= 2) &&
570 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
571 ) {
572 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
573 pSrc += 2;
574 continue;
575 }
576 }
577
578 /* function call for "complicated" and error cases */
579 ++pSrc; /* continue after the lead byte */
580 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
581 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
582 *pErrorCode = U_INVALID_CHAR_FOUND;
583 return NULL;
584 }else if(ch<=0xFFFF){
585 *(pDest++)=(UChar)ch;
586 }else{
587 *(pDest++)=UTF16_LEAD(ch);
588 if(pDest<pDestLimit){
589 *(pDest++)=UTF16_TRAIL(ch);
590 }else{
591 reqLength++;
592 break;
593 }
594 }
595 }
596 }
597 /* donot fill the dest buffer just count the UChars needed */
598 while(pSrc < pSrcLimit){
599 ch = *pSrc;
600 if(ch <= 0x7f){
601 reqLength++;
602 ++pSrc;
603 } else {
604 if(ch > 0xe0) {
605 if( /* handle U+1000..U+CFFF inline */
606 ch <= 0xec &&
607 ((pSrcLimit - pSrc) >= 3) &&
608 (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
609 (uint8_t)(pSrc[2] - 0x80) <= 0x3f
610 ) {
611 reqLength++;
612 pSrc += 3;
613 continue;
614 }
615 } else if(ch < 0xe0) {
616 if( /* handle U+0080..U+07FF inline */
617 ch >= 0xc2 &&
618 ((pSrcLimit - pSrc) >= 2) &&
619 (uint8_t)(pSrc[1] - 0x80) <= 0x3f
620 ) {
621 reqLength++;
622 pSrc += 2;
623 continue;
624 }
625 }
626
627 /* function call for "complicated" and error cases */
628 ++pSrc; /* continue after the lead byte */
629 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
630 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
631 *pErrorCode = U_INVALID_CHAR_FOUND;
632 return NULL;
633 }
634 reqLength+=UTF_CHAR_LENGTH(ch);
635 }
636 }
637 }
638
639 reqLength+=(int32_t)(pDest - dest);
640
641 if(pNumSubstitutions!=NULL) {
642 *pNumSubstitutions=numSubstitutions;
643 }
644
645 if(pDestLength){
646 *pDestLength = reqLength;
647 }
648
649 /* Terminate the buffer */
650 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
651
652 return dest;
653 }
654
655 U_CAPI UChar* U_EXPORT2
656 u_strFromUTF8(UChar *dest,
657 int32_t destCapacity,
658 int32_t *pDestLength,
659 const char* src,
660 int32_t srcLength,
661 UErrorCode *pErrorCode){
662 return u_strFromUTF8WithSub(
663 dest, destCapacity, pDestLength,
664 src, srcLength,
665 U_SENTINEL, NULL,
666 pErrorCode);
667 }
668
669 U_CAPI UChar * U_EXPORT2
670 u_strFromUTF8Lenient(UChar *dest,
671 int32_t destCapacity,
672 int32_t *pDestLength,
673 const char *src,
674 int32_t srcLength,
675 UErrorCode *pErrorCode) {
676
677 UChar *pDest = dest;
678 UChar32 ch;
679 int32_t reqLength = 0;
680 uint8_t* pSrc = (uint8_t*) src;
681
682 /* args check */
683 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
684 return NULL;
685 }
686
687 if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)) {
688 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
689 return NULL;
690 }
691
692 if(srcLength < 0) {
693 /* Transform a NUL-terminated string. */
694 UChar *pDestLimit = dest+destCapacity;
695 uint8_t t1, t2, t3; /* trail bytes */
696
697 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
698 if(ch < 0xc0) {
699 /*
700 * ASCII, or a trail byte in lead position which is treated like
701 * a single-byte sequence for better character boundary
702 * resynchronization after illegal sequences.
703 */
704 *pDest++=(UChar)ch;
705 ++pSrc;
706 continue;
707 } else if(ch < 0xe0) { /* U+0080..U+07FF */
708 if((t1 = pSrc[1]) != 0) {
709 /* 0x3080 = (0xc0 << 6) + 0x80 */
710 *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
711 pSrc += 2;
712 continue;
713 }
714 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
715 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
716 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
717 /* 0x2080 = (0x80 << 6) + 0x80 */
718 *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
719 pSrc += 3;
720 continue;
721 }
722 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
723 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
724 pSrc += 4;
725 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
726 ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
727 *(pDest++) = U16_LEAD(ch);
728 if(pDest < pDestLimit) {
729 *(pDest++) = U16_TRAIL(ch);
730 } else {
731 reqLength = 1;
732 break;
733 }
734 continue;
735 }
736 }
737
738 /* truncated character at the end */
739 *pDest++ = 0xfffd;
740 while(*++pSrc != 0) {}
741 break;
742 }
743
744 /* Pre-flight the rest of the string. */
745 while((ch = *pSrc) != 0) {
746 if(ch < 0xc0) {
747 /*
748 * ASCII, or a trail byte in lead position which is treated like
749 * a single-byte sequence for better character boundary
750 * resynchronization after illegal sequences.
751 */
752 ++reqLength;
753 ++pSrc;
754 continue;
755 } else if(ch < 0xe0) { /* U+0080..U+07FF */
756 if(pSrc[1] != 0) {
757 ++reqLength;
758 pSrc += 2;
759 continue;
760 }
761 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
762 if(pSrc[1] != 0 && pSrc[2] != 0) {
763 ++reqLength;
764 pSrc += 3;
765 continue;
766 }
767 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
768 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
769 reqLength += 2;
770 pSrc += 4;
771 continue;
772 }
773 }
774
775 /* truncated character at the end */
776 ++reqLength;
777 break;
778 }
779 } else /* srcLength >= 0 */ {
780 const uint8_t *pSrcLimit = pSrc + srcLength;
781
782 /*
783 * This function requires that if srcLength is given, then it must be
784 * destCapatity >= srcLength so that we need not check for
785 * destination buffer overflow in the loop.
786 */
787 if(destCapacity < srcLength) {
788 if(pDestLength != NULL) {
789 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
790 }
791 *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
792 return NULL;
793 }
794
795 if((pSrcLimit - pSrc) >= 4) {
796 pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
797
798 /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
799 do {
800 ch = *pSrc++;
801 if(ch < 0xc0) {
802 /*
803 * ASCII, or a trail byte in lead position which is treated like
804 * a single-byte sequence for better character boundary
805 * resynchronization after illegal sequences.
806 */
807 *pDest++=(UChar)ch;
808 } else if(ch < 0xe0) { /* U+0080..U+07FF */
809 /* 0x3080 = (0xc0 << 6) + 0x80 */
810 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
811 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
812 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
813 /* 0x2080 = (0x80 << 6) + 0x80 */
814 ch = (ch << 12) + (*pSrc++ << 6);
815 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
816 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
817 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
818 ch = (ch << 18) + (*pSrc++ << 12);
819 ch += *pSrc++ << 6;
820 ch += *pSrc++ - 0x3c82080;
821 *(pDest++) = U16_LEAD(ch);
822 *(pDest++) = U16_TRAIL(ch);
823 }
824 } while(pSrc < pSrcLimit);
825
826 pSrcLimit += 3; /* restore original pSrcLimit */
827 }
828
829 while(pSrc < pSrcLimit) {
830 ch = *pSrc++;
831 if(ch < 0xc0) {
832 /*
833 * ASCII, or a trail byte in lead position which is treated like
834 * a single-byte sequence for better character boundary
835 * resynchronization after illegal sequences.
836 */
837 *pDest++=(UChar)ch;
838 continue;
839 } else if(ch < 0xe0) { /* U+0080..U+07FF */
840 if(pSrc < pSrcLimit) {
841 /* 0x3080 = (0xc0 << 6) + 0x80 */
842 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
843 continue;
844 }
845 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
846 if((pSrcLimit - pSrc) >= 2) {
847 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
848 /* 0x2080 = (0x80 << 6) + 0x80 */
849 ch = (ch << 12) + (*pSrc++ << 6);
850 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
851 pSrc += 3;
852 continue;
853 }
854 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
855 if((pSrcLimit - pSrc) >= 3) {
856 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
857 ch = (ch << 18) + (*pSrc++ << 12);
858 ch += *pSrc++ << 6;
859 ch += *pSrc++ - 0x3c82080;
860 *(pDest++) = U16_LEAD(ch);
861 *(pDest++) = U16_TRAIL(ch);
862 pSrc += 4;
863 continue;
864 }
865 }
866
867 /* truncated character at the end */
868 *pDest++ = 0xfffd;
869 break;
870 }
871 }
872
873 reqLength+=(int32_t)(pDest - dest);
874
875 if(pDestLength){
876 *pDestLength = reqLength;
877 }
878
879 /* Terminate the buffer */
880 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
881
882 return dest;
883 }
884
885 static U_INLINE uint8_t *
886 _appendUTF8(uint8_t *pDest, UChar32 c) {
887 /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
888 if((c)<=0x7f) {
889 *pDest++=(uint8_t)c;
890 } else if(c<=0x7ff) {
891 *pDest++=(uint8_t)((c>>6)|0xc0);
892 *pDest++=(uint8_t)((c&0x3f)|0x80);
893 } else if(c<=0xffff) {
894 *pDest++=(uint8_t)((c>>12)|0xe0);
895 *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
896 *pDest++=(uint8_t)(((c)&0x3f)|0x80);
897 } else /* if((uint32_t)(c)<=0x10ffff) */ {
898 *pDest++=(uint8_t)(((c)>>18)|0xf0);
899 *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
900 *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
901 *pDest++=(uint8_t)(((c)&0x3f)|0x80);
902 }
903 return pDest;
904 }
905
906
907 U_CAPI char* U_EXPORT2
908 u_strToUTF8WithSub(char *dest,
909 int32_t destCapacity,
910 int32_t *pDestLength,
911 const UChar *pSrc,
912 int32_t srcLength,
913 UChar32 subchar, int32_t *pNumSubstitutions,
914 UErrorCode *pErrorCode){
915
916 int32_t reqLength=0;
917 uint32_t ch=0,ch2=0;
918 uint8_t *pDest = (uint8_t *)dest;
919 uint8_t *pDestLimit = pDest + destCapacity;
920 int32_t numSubstitutions;
921
922 /* args check */
923 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
924 return NULL;
925 }
926
927 if( (pSrc==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0) ||
928 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
929 ) {
930 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
931 return NULL;
932 }
933
934 numSubstitutions=0;
935
936 if(srcLength==-1) {
937 while((ch=*pSrc)!=0) {
938 ++pSrc;
939 if(ch <= 0x7f) {
940 if(pDest<pDestLimit) {
941 *pDest++ = (char)ch;
942 } else {
943 reqLength = 1;
944 break;
945 }
946 } else if(ch <= 0x7ff) {
947 if((pDestLimit - pDest) >= 2) {
948 *pDest++=(uint8_t)((ch>>6)|0xc0);
949 *pDest++=(uint8_t)((ch&0x3f)|0x80);
950 } else {
951 reqLength = 2;
952 break;
953 }
954 } else if(ch <= 0xd7ff || ch >= 0xe000) {
955 if((pDestLimit - pDest) >= 3) {
956 *pDest++=(uint8_t)((ch>>12)|0xe0);
957 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
958 *pDest++=(uint8_t)((ch&0x3f)|0x80);
959 } else {
960 reqLength = 3;
961 break;
962 }
963 } else /* ch is a surrogate */ {
964 int32_t length;
965
966 /*need not check for NUL because NUL fails UTF_IS_TRAIL() anyway*/
967 if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
968 ++pSrc;
969 ch=UTF16_GET_PAIR_VALUE(ch, ch2);
970 } else if(subchar>=0) {
971 ch=subchar;
972 ++numSubstitutions;
973 } else {
974 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
975 *pErrorCode = U_INVALID_CHAR_FOUND;
976 return NULL;
977 }
978
979 length = U8_LENGTH(ch);
980 if((pDestLimit - pDest) >= length) {
981 /* convert and append*/
982 pDest=_appendUTF8(pDest, ch);
983 } else {
984 reqLength = length;
985 break;
986 }
987 }
988 }
989 while((ch=*pSrc++)!=0) {
990 if(ch<=0x7f) {
991 ++reqLength;
992 } else if(ch<=0x7ff) {
993 reqLength+=2;
994 } else if(!UTF_IS_SURROGATE(ch)) {
995 reqLength+=3;
996 } else if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
997 ++pSrc;
998 reqLength+=4;
999 } else if(subchar>=0) {
1000 reqLength+=U8_LENGTH(subchar);
1001 ++numSubstitutions;
1002 } else {
1003 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1004 *pErrorCode = U_INVALID_CHAR_FOUND;
1005 return NULL;
1006 }
1007 }
1008 } else {
1009 const UChar *pSrcLimit = pSrc+srcLength;
1010 int32_t count;
1011
1012 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1013 for(;;) {
1014 /*
1015 * Each iteration of the inner loop progresses by at most 3 UTF-8
1016 * bytes and one UChar, for most characters.
1017 * For supplementary code points (4 & 2), which are rare,
1018 * there is an additional adjustment.
1019 */
1020 count = (int32_t)((pDestLimit - pDest) / 3);
1021 srcLength = (int32_t)(pSrcLimit - pSrc);
1022 if(count > srcLength) {
1023 count = srcLength; /* min(remaining dest/3, remaining src) */
1024 }
1025 if(count < 3) {
1026 /*
1027 * Too much overhead if we get near the end of the string,
1028 * continue with the next loop.
1029 */
1030 break;
1031 }
1032 do {
1033 ch=*pSrc++;
1034 if(ch <= 0x7f) {
1035 *pDest++ = (char)ch;
1036 } else if(ch <= 0x7ff) {
1037 *pDest++=(uint8_t)((ch>>6)|0xc0);
1038 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1039 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1040 *pDest++=(uint8_t)((ch>>12)|0xe0);
1041 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1042 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1043 } else /* ch is a surrogate */ {
1044 /*
1045 * We will read two UChars and probably output four bytes,
1046 * which we didn't account for with computing count,
1047 * so we adjust it here.
1048 */
1049 if(--count == 0) {
1050 --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
1051 break; /* recompute count */
1052 }
1053
1054 if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
1055 ++pSrc;
1056 ch=UTF16_GET_PAIR_VALUE(ch, ch2);
1057
1058 /* writing 4 bytes per 2 UChars is ok */
1059 *pDest++=(uint8_t)((ch>>18)|0xf0);
1060 *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
1061 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1062 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1063 } else {
1064 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1065 if(subchar>=0) {
1066 ch=subchar;
1067 ++numSubstitutions;
1068 } else {
1069 *pErrorCode = U_INVALID_CHAR_FOUND;
1070 return NULL;
1071 }
1072
1073 /* convert and append*/
1074 pDest=_appendUTF8(pDest, ch);
1075 }
1076 }
1077 } while(--count > 0);
1078 }
1079
1080 while(pSrc<pSrcLimit) {
1081 ch=*pSrc++;
1082 if(ch <= 0x7f) {
1083 if(pDest<pDestLimit) {
1084 *pDest++ = (char)ch;
1085 } else {
1086 reqLength = 1;
1087 break;
1088 }
1089 } else if(ch <= 0x7ff) {
1090 if((pDestLimit - pDest) >= 2) {
1091 *pDest++=(uint8_t)((ch>>6)|0xc0);
1092 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1093 } else {
1094 reqLength = 2;
1095 break;
1096 }
1097 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1098 if((pDestLimit - pDest) >= 3) {
1099 *pDest++=(uint8_t)((ch>>12)|0xe0);
1100 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1101 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1102 } else {
1103 reqLength = 3;
1104 break;
1105 }
1106 } else /* ch is a surrogate */ {
1107 int32_t length;
1108
1109 if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
1110 ++pSrc;
1111 ch=UTF16_GET_PAIR_VALUE(ch, ch2);
1112 } else if(subchar>=0) {
1113 ch=subchar;
1114 ++numSubstitutions;
1115 } else {
1116 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1117 *pErrorCode = U_INVALID_CHAR_FOUND;
1118 return NULL;
1119 }
1120
1121 length = U8_LENGTH(ch);
1122 if((pDestLimit - pDest) >= length) {
1123 /* convert and append*/
1124 pDest=_appendUTF8(pDest, ch);
1125 } else {
1126 reqLength = length;
1127 break;
1128 }
1129 }
1130 }
1131 while(pSrc<pSrcLimit) {
1132 ch=*pSrc++;
1133 if(ch<=0x7f) {
1134 ++reqLength;
1135 } else if(ch<=0x7ff) {
1136 reqLength+=2;
1137 } else if(!UTF_IS_SURROGATE(ch)) {
1138 reqLength+=3;
1139 } else if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
1140 ++pSrc;
1141 reqLength+=4;
1142 } else if(subchar>=0) {
1143 reqLength+=U8_LENGTH(subchar);
1144 ++numSubstitutions;
1145 } else {
1146 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1147 *pErrorCode = U_INVALID_CHAR_FOUND;
1148 return NULL;
1149 }
1150 }
1151 }
1152
1153 reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1154
1155 if(pNumSubstitutions!=NULL) {
1156 *pNumSubstitutions=numSubstitutions;
1157 }
1158
1159 if(pDestLength){
1160 *pDestLength = reqLength;
1161 }
1162
1163 /* Terminate the buffer */
1164 u_terminateChars((char*)dest,destCapacity,reqLength,pErrorCode);
1165
1166 return (char*)dest;
1167 }
1168
1169 U_CAPI char* U_EXPORT2
1170 u_strToUTF8(char *dest,
1171 int32_t destCapacity,
1172 int32_t *pDestLength,
1173 const UChar *pSrc,
1174 int32_t srcLength,
1175 UErrorCode *pErrorCode){
1176 return u_strToUTF8WithSub(
1177 dest, destCapacity, pDestLength,
1178 pSrc, srcLength,
1179 U_SENTINEL, NULL,
1180 pErrorCode);
1181 }