[LIBXML2] Update to version 2.9.8. CORE-15280
[reactos.git] / sdk / lib / 3rdparty / libxml2 / encoding.c
1 /*
2 * encoding.c : implements the encoding conversion functions needed for XML
3 *
4 * Related specs:
5 * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6 * rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
7 * [ISO-10646] UTF-8 and UTF-16 in Annexes
8 * [ISO-8859-1] ISO Latin-1 characters codes.
9 * [UNICODE] The Unicode Consortium, "The Unicode Standard --
10 * Worldwide Character Encoding -- Version 1.0", Addison-
11 * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
12 * described in Unicode Technical Report #4.
13 * [US-ASCII] Coded Character Set--7-bit American Standard Code for
14 * Information Interchange, ANSI X3.4-1986.
15 *
16 * See Copyright for the status of this software.
17 *
18 * daniel@veillard.com
19 *
20 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
21 */
22
23 #define IN_LIBXML
24 #include "libxml.h"
25
26 #include <string.h>
27 #include <limits.h>
28
29 #ifdef HAVE_CTYPE_H
30 #include <ctype.h>
31 #endif
32 #ifdef HAVE_STDLIB_H
33 #include <stdlib.h>
34 #endif
35 #ifdef LIBXML_ICONV_ENABLED
36 #ifdef HAVE_ERRNO_H
37 #include <errno.h>
38 #endif
39 #endif
40 #include <libxml/encoding.h>
41 #include <libxml/xmlmemory.h>
42 #ifdef LIBXML_HTML_ENABLED
43 #include <libxml/HTMLparser.h>
44 #endif
45 #include <libxml/globals.h>
46 #include <libxml/xmlerror.h>
47
48 #include "buf.h"
49 #include "enc.h"
50
/*
 * File-local encoding handler pointers, both initially NULL.
 * NOTE(review): presumably these hold the UTF-16LE / UTF-16BE handlers
 * once they are registered; the assignment is not visible in this part
 * of the file -- confirm.
 */
51 static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
52 static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
53
/*
 * One entry of the encoding alias table: a pair of strings mapping an
 * alias to the encoding name it stands for.
 */
54 typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
55 typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
56 struct _xmlCharEncodingAlias {
57 const char *name; /* encoding name returned when the alias matches */
58 const char *alias; /* alias string; the lookup key compared with strcmp() */
59 };
60
/*
 * The alias table itself: a heap array (NULL while empty/unallocated),
 * the number of entries in use, and the number of entries allocated.
 */
61 static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
62 static int xmlCharEncodingAliasesNb = 0;
63 static int xmlCharEncodingAliasesMax = 0;
64
65 #if defined(LIBXML_ICONV_ENABLED) || defined(LIBXML_ICU_ENABLED)
66 #if 0
67 #define DEBUG_ENCODING /* Define this to get encoding traces */
68 #endif
69 #else
70 #ifdef LIBXML_ISO8859X_ENABLED
71 static void xmlRegisterCharEncodingHandlersISO8859x (void);
72 #endif
73 #endif
74
/*
 * Host byte-order flag; non-zero is treated as "little endian".
 * Defaults to 1.
 * NOTE(review): presumably corrected at library initialization on
 * big-endian hosts; that code is not visible in this part of the file.
 */
75 static int xmlLittleEndian = 1;
76
77 /**
78 * xmlEncodingErrMemory:
79 * @extra: extra information
80 *
81 * Handle an out of memory condition
82 */
/*
 * Report an out-of-memory condition for the encoding module: forwards
 * @extra to __xmlSimpleError() with domain XML_FROM_I18N and code
 * XML_ERR_NO_MEMORY. No node is attached (both pointer arguments NULL).
 */
83 static void
84 xmlEncodingErrMemory(const char *extra)
85 {
86 __xmlSimpleError(XML_FROM_I18N, XML_ERR_NO_MEMORY, NULL, NULL, extra);
87 }
88
89 /**
90 * xmlEncodingErr:
91 * @error: the error number
92 * @msg: the error message
93 *
94 * Handle an encoding error
95 */
/*
 * Raise an encoding error through __xmlRaiseError() with domain
 * XML_FROM_I18N and level XML_ERR_FATAL; no context, node or file
 * information is supplied.
 * @val is passed twice: once among the string arguments and once as the
 * trailing argument after @msg (presumably the value substituted into the
 * printf-style @msg -- confirm against __xmlRaiseError()).
 */
96 static void LIBXML_ATTR_FORMAT(2,0)
97 xmlEncodingErr(xmlParserErrors error, const char *msg, const char *val)
98 {
99 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL,
100 XML_FROM_I18N, error, XML_ERR_FATAL,
101 NULL, 0, val, NULL, NULL, 0, 0, msg, val);
102 }
103
104 #ifdef LIBXML_ICU_ENABLED
105 static uconv_t*
106 openIcuConverter(const char* name, int toUnicode)
107 {
108 UErrorCode status = U_ZERO_ERROR;
109 uconv_t *conv = (uconv_t *) xmlMalloc(sizeof(uconv_t));
110 if (conv == NULL)
111 return NULL;
112
113 conv->pivot_source = conv->pivot_buf;
114 conv->pivot_target = conv->pivot_buf;
115
116 conv->uconv = ucnv_open(name, &status);
117 if (U_FAILURE(status))
118 goto error;
119
120 status = U_ZERO_ERROR;
121 if (toUnicode) {
122 ucnv_setToUCallBack(conv->uconv, UCNV_TO_U_CALLBACK_STOP,
123 NULL, NULL, NULL, &status);
124 }
125 else {
126 ucnv_setFromUCallBack(conv->uconv, UCNV_FROM_U_CALLBACK_STOP,
127 NULL, NULL, NULL, &status);
128 }
129 if (U_FAILURE(status))
130 goto error;
131
132 status = U_ZERO_ERROR;
133 conv->utf8 = ucnv_open("UTF-8", &status);
134 if (U_SUCCESS(status))
135 return conv;
136
137 error:
138 if (conv->uconv)
139 ucnv_close(conv->uconv);
140 xmlFree(conv);
141 return NULL;
142 }
143
144 static void
145 closeIcuConverter(uconv_t *conv)
146 {
147 if (conv != NULL) {
148 ucnv_close(conv->uconv);
149 ucnv_close(conv->utf8);
150 xmlFree(conv);
151 }
152 }
153 #endif /* LIBXML_ICU_ENABLED */
154
155 /************************************************************************
156 * *
157 * Conversions To/From UTF8 encoding *
158 * *
159 ************************************************************************/
160
/**
 * asciiToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ASCII chars
 * @inlen:  the length of @in
 *
 * Take a block of ASCII chars in and try to convert it to an UTF-8
 * block of chars out. ASCII maps one to one onto UTF-8, so the conversion
 * copies bytes until the input is exhausted, the output is full, or a
 * byte >= 0x80 is met.
 *
 * Returns the number of bytes written if success, or -1 if @out, @outlen
 *     or @inlen is NULL or if a non-ASCII byte is found. A NULL @in is
 *     treated as an empty input and yields 0.
 * The value of @inlen after return is the number of octets consumed
 *     (on a -1 caused by a non-ASCII byte: the octets before that byte).
 * The value of @outlen after return is the number of octets produced.
 */
static int
asciiToUTF8(unsigned char* out, int *outlen,
            const unsigned char* in, int *inlen) {
    unsigned char *outstart = out;
    unsigned char *outend;
    const unsigned char *base = in;
    const unsigned char *inend;

    if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
        return(-1);
    if (in == NULL) {
        /* nothing to convert */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }

    outend = out + *outlen;
    inend = in + *inlen;

    /*
     * Each input byte needs exactly one output byte, so the only space
     * requirement is one free byte; no extra head-room is reserved, which
     * lets the conversion make progress even with very small buffers.
     */
    while ((in < inend) && (out < outend)) {
        if (*in >= 0x80) {
            /* not ASCII: report what was converted so far */
            *outlen = out - outstart;
            *inlen = in - base;
            return(-1);
        }
        *out++ = *in++;
    }

    *outlen = out - outstart;
    *inlen = in - base;
    return(*outlen);
}
205
206 #ifdef LIBXML_OUTPUT_ENABLED
/**
 * UTF8Toascii:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
 * block of chars out.
 *
 * Returns the number of bytes written if success, -2 if the transcoding
 *     fails (malformed UTF-8 or a character outside ASCII), or -1 if @out,
 *     @outlen or @inlen is NULL. A NULL @in is an initialization call and
 *     yields 0.
 * The value of @inlen after return is the number of octets consumed
 *     (only complete, successfully converted characters are counted).
 * The value of @outlen after return is the number of octets produced.
 */
static int
UTF8Toascii(unsigned char* out, int *outlen,
            const unsigned char* in, int *inlen) {
    const unsigned char* processed = in;
    const unsigned char* outend;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;

    if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            goto transcoding_failed;
        }
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in Ascii */
            goto transcoding_failed;
        }

        /* incomplete sequence at the end of the input: wait for more */
        if (inend - in < trailing)
            break;

        for ( ; trailing > 0; trailing--) {
            d = *in++;
            /*
             * A byte that is not a continuation byte makes the sequence
             * malformed; fail instead of emitting a partial code point.
             */
            if ((d & 0xC0) != 0x80)
                goto transcoding_failed;
            c = (c << 6) | (d & 0x3F);
        }

        /* assertion: c is a single UCS-4 value */
        if (c >= 0x80) {
            /* no chance for this in Ascii */
            goto transcoding_failed;
        }
        if (out >= outend)
            break;
        *out++ = c;
        processed = in;
    }
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(*outlen);

transcoding_failed:
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(-2);
}
290 #endif /* LIBXML_OUTPUT_ENABLED */
291
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Bytes below 0x80 are copied unchanged, every other
 * byte becomes a two byte UTF-8 sequence. Conversion stops at the first
 * character that does not fit entirely in the remaining output.
 *
 * Returns the number of bytes written if success, or -1 if one of the
 *     arguments is NULL
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char *dst;
    unsigned char *dstlimit;
    const unsigned char *src;
    const unsigned char *srclimit;

    if ((out == NULL) || (in == NULL) || (outlen == NULL) || (inlen == NULL))
        return(-1);

    dst = out;
    dstlimit = out + *outlen;
    src = in;
    srclimit = in + *inlen;

    while (src < srclimit) {
        unsigned char ch = *src;

        if (ch < 0x80) {
            /* plain ASCII: one byte in, one byte out */
            if (dstlimit - dst < 1)
                break;
            *dst++ = ch;
        } else {
            /* 0x80..0xFF: two byte sequence 110000xx 10xxxxxx */
            if (dstlimit - dst < 2)
                break;
            *dst++ = ((ch >> 6) & 0x1F) | 0xC0;
            *dst++ = (ch & 0x3F) | 0x80;
        }
        src++;
    }

    *outlen = dst - out;
    *inlen = src - in;
    return(*outlen);
}
340
/**
 * UTF8ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-8 chars
 * @inlenb:  the length of @inb in bytes
 *
 * No op copy operation for UTF8 handling: copies as many bytes as fit,
 * i.e. the smaller of *@outlen and *@inlenb. The data is not inspected.
 *
 * Returns the number of bytes written, or -1 if @out, @outlen or @inlenb
 *     is NULL or the number of bytes to copy would be negative. A NULL
 *     @inb is an initialization call and yields 0.
 * On success both *@outlen and *@inlenb are set to the number of bytes
 *     copied.
 */
static int
UTF8ToUTF8(unsigned char* out, int *outlen,
           const unsigned char* inb, int *inlenb)
{
    int count;

    if ((out == NULL) || (outlen == NULL) || (inlenb == NULL))
        return(-1);

    if (inb == NULL) {
        /* inb == NULL means output is initialized. */
        *outlen = 0;
        *inlenb = 0;
        return(0);
    }

    count = (*outlen > *inlenb) ? *inlenb : *outlen;
    if (count < 0)
        return(-1);

    memcpy(out, inb, count);

    *outlen = count;
    *inlenb = count;
    return(count);
}
382
383
384 #ifdef LIBXML_OUTPUT_ENABLED
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out.
 *
 * Returns the number of bytes written if success, -2 if the transcoding
 *     fails (bad lead or continuation byte, or a character above 0xFF),
 *     or -1 if @out, @outlen or @inlen is NULL. A NULL @in is an
 *     initialization call and yields 0.
 * The value of @inlen after return is the number of octets consumed;
 *     only completely converted characters are counted.
 * The value of @outlen after return is the number of octets produced.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char *src = in;        /* read cursor */
    const unsigned char *committed = in;  /* end of last converted char */
    const unsigned char *srcend;
    unsigned char *dst = out;             /* write cursor */
    const unsigned char *dstend;
    unsigned int cp, byte;
    int extra;

    if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
        return(-1);
    if (in == NULL) {
        /* initialization: nothing to emit */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }

    srcend = in + (*inlen);
    dstend = out + (*outlen);

    while (src < srcend) {
        byte = *src++;

        /* classify the lead byte */
        if (byte < 0x80) {
            cp = byte;
            extra = 0;
        } else if ((byte < 0xC0) || (byte >= 0xF8)) {
            /* stray continuation byte, or a lead byte that can never
             * introduce a Latin 1 character */
            goto untranslatable;
        } else if (byte < 0xE0) {
            cp = byte & 0x1F;
            extra = 1;
        } else if (byte < 0xF0) {
            cp = byte & 0x0F;
            extra = 2;
        } else {
            cp = byte & 0x07;
            extra = 3;
        }

        /* sequence cut by the end of the input: stop before it */
        if (srcend - src < extra)
            break;

        while (extra > 0) {
            byte = *src++;
            if ((byte & 0xC0) != 0x80)
                goto untranslatable;
            cp = (cp << 6) | (byte & 0x3F);
            extra--;
        }

        /* cp now holds one complete code point */
        if (cp > 0xFF)
            goto untranslatable;
        if (dst >= dstend)
            break;
        *dst++ = cp;
        committed = src;
    }

    *outlen = dst - out;
    *inlen = committed - in;
    return(*outlen);

untranslatable:
    *outlen = dst - out;
    *inlen = committed - in;
    return(-2);
}
474 #endif /* LIBXML_OUTPUT_ENABLED */
475
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16LE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16LE code units in and try to convert it to an UTF-8
 * block of chars out. The input is read byte by byte, so the result does
 * not depend on the byte order or alignment rules of the host.
 *
 * A trailing odd byte is ignored. Conversion stops, without error, before
 * the first character whose UTF-8 form does not fit completely in the
 * remaining output and before a surrogate pair whose second half has not
 * arrived yet.
 *
 * Returns the number of bytes written, -1 if one of the arguments is NULL,
 *     or -2 if the transcoding fails (a high surrogate is not followed by
 *     a low surrogate)
 * The value of *@inlenb after return is the number of octets consumed;
 *     only completely converted characters are counted.
 * The value of *@outlen after return is the number of octets produced.
 */
static int
UTF16LEToUTF8(unsigned char* out, int *outlen,
              const unsigned char* inb, int *inlenb)
{
    unsigned char *outstart = out;
    unsigned char *outend;
    const unsigned char *in = inb;
    const unsigned char *inend;
    const unsigned char *processed = inb;
    unsigned int c, d;
    int need, bits;

    if ((out == NULL) || (outlen == NULL) || (inb == NULL) || (inlenb == NULL))
        return(-1);

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;
    outend = out + *outlen;

    while (inend - in >= 2) {
        /* little endian: low byte first */
        c = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
        in += 2;

        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (inend - in < 2) {
                /* the low surrogate is in the next chunk */
                break;
            }
            d = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c = 0x10000 + (((c & 0x03FF) << 10) | (d & 0x03FF));
        }

        /* assertion: c is a single UCS-4 value */
        if      (c <    0x80) need = 1;
        else if (c <   0x800) need = 2;
        else if (c < 0x10000) need = 3;
        else                  need = 4;

        /* never emit a truncated UTF-8 sequence */
        if (outend - out < need)
            break;

        if      (c <    0x80) { *out++ = c;                          bits = -6; }
        else if (c <   0x800) { *out++ = ((c >>  6) & 0x1F) | 0xC0;  bits =  0; }
        else if (c < 0x10000) { *out++ = ((c >> 12) & 0x0F) | 0xE0;  bits =  6; }
        else                  { *out++ = ((c >> 18) & 0x07) | 0xF0;  bits = 12; }

        for ( ; bits >= 0; bits -= 6)
            *out++ = ((c >> bits) & 0x3F) | 0x80;

        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(*outlen);
}
563
564 #ifdef LIBXML_OUTPUT_ENABLED
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out. No byte order mark is written. The output is
 * written byte by byte, so the result does not depend on the byte order
 * or alignment rules of the host.
 *
 * Conversion stops, without error, at a UTF-8 sequence cut by the end of
 * the input, at a character that does not fit in the remaining output,
 * and at a code point of 0x110000 or above (which UTF-16 cannot encode).
 *
 * Returns the number of bytes written, -1 if @outb, @outlen or @inlen is
 *     NULL, or -2 if the transcoding failed (invalid lead or continuation
 *     byte). A NULL @in is an initialization call and yields 0.
 * The value of *@inlen after return is the number of octets consumed;
 *     only completely converted characters are counted.
 * The value of *@outlen after return is the number of octets produced.
 */
static int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
              const unsigned char* in, int *inlen)
{
    unsigned char *out = outb;
    unsigned char *outend;
    const unsigned char *processed = in;
    const unsigned char *const instart = in;
    const unsigned char *inend;
    unsigned int c, d, high, low;
    int trailing;

    /* UTF16LE encoding has no BOM */
    if ((outb == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
    if (in == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units are usable */
    outend = outb + (*outlen / 2) * 2;

    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            goto transcoding_failed;
        }
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in UTF-16 */
            goto transcoding_failed;
        }

        /* incomplete sequence at the end of the input: wait for more */
        if (inend - in < trailing)
            break;

        for ( ; trailing > 0; trailing--) {
            d = *in++;
            /*
             * A byte that is not a continuation byte makes the sequence
             * malformed; fail instead of emitting a partial code point.
             */
            if ((d & 0xC0) != 0x80)
                goto transcoding_failed;
            c = (c << 6) | (d & 0x3F);
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            out[0] = (unsigned char) (c & 0xFF);
            out[1] = (unsigned char) (c >> 8);
            out += 2;
        }
        else if (c < 0x110000) {
            /* needs a surrogate pair: two 16-bit units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            high = 0xD800 | (c >> 10);
            low  = 0xDC00 | (c & 0x03FF);
            out[0] = (unsigned char) (high & 0xFF);
            out[1] = (unsigned char) (high >> 8);
            out[2] = (unsigned char) (low & 0xFF);
            out[3] = (unsigned char) (low >> 8);
            out += 4;
        }
        else
            break;
        processed = in;
    }
    *outlen = out - outb;
    *inlen = processed - instart;
    return(*outlen);

transcoding_failed:
    *outlen = out - outb;
    *inlen = processed - instart;
    return(-2);
}
673
674 /**
675 * UTF8ToUTF16:
676 * @outb: a pointer to an array of bytes to store the result
677 * @outlen: the length of @outb
678 * @in: a pointer to an array of UTF-8 chars
679 * @inlen: the length of @in
680 *
681 * Take a block of UTF-8 chars in and try to convert it to an UTF-16
682 * block of chars out.
683 *
684 * Returns the number of bytes written, or -1 if lack of space, or -2
685 * if the transcoding failed.
686 */
687 static int
688 UTF8ToUTF16(unsigned char* outb, int *outlen,
689 const unsigned char* in, int *inlen)
690 {
691 if (in == NULL) {
692 /*
693 * initialization, add the Byte Order Mark for UTF-16LE
694 */
695 if (*outlen >= 2) {
696 outb[0] = 0xFF;
697 outb[1] = 0xFE;
698 *outlen = 2;
699 *inlen = 0;
700 #ifdef DEBUG_ENCODING
701 xmlGenericError(xmlGenericErrorContext,
702 "Added FFFE Byte Order Mark\n");
703 #endif
704 return(2);
705 }
706 *outlen = 0;
707 *inlen = 0;
708 return(0);
709 }
710 return (UTF8ToUTF16LE(outb, outlen, in, inlen));
711 }
712 #endif /* LIBXML_OUTPUT_ENABLED */
713
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16BE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16BE code units in and try to convert it to an UTF-8
 * block of chars out. The input is read byte by byte, so the result does
 * not depend on the byte order or alignment rules of the host.
 *
 * A trailing odd byte is ignored. Conversion stops, without error, before
 * the first character whose UTF-8 form does not fit completely in the
 * remaining output and before a surrogate pair whose second half has not
 * arrived yet.
 *
 * Returns the number of bytes written, -1 if one of the arguments is NULL,
 *     or -2 if the transcoding fails (a high surrogate is not followed by
 *     a low surrogate)
 * The value of *@inlenb after return is the number of octets consumed;
 *     only completely converted characters are counted.
 * The value of *@outlen after return is the number of octets produced.
 */
static int
UTF16BEToUTF8(unsigned char* out, int *outlen,
              const unsigned char* inb, int *inlenb)
{
    unsigned char *outstart = out;
    unsigned char *outend;
    const unsigned char *in = inb;
    const unsigned char *inend;
    const unsigned char *processed = inb;
    unsigned int c, d;
    int need, bits;

    if ((out == NULL) || (outlen == NULL) || (inb == NULL) || (inlenb == NULL))
        return(-1);

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;
    outend = out + *outlen;

    while (inend - in >= 2) {
        /* big endian: high byte first */
        c = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
        in += 2;

        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (inend - in < 2) {
                /*
                 * The low surrogate is in the next chunk: stop here and
                 * leave the high surrogate unconsumed rather than failing.
                 */
                break;
            }
            d = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c = 0x10000 + (((c & 0x03FF) << 10) | (d & 0x03FF));
        }

        /* assertion: c is a single UCS-4 value */
        if      (c <    0x80) need = 1;
        else if (c <   0x800) need = 2;
        else if (c < 0x10000) need = 3;
        else                  need = 4;

        /*
         * Never emit a truncated UTF-8 sequence: a character is written
         * (and its input marked as consumed) only if it fits entirely.
         */
        if (outend - out < need)
            break;

        if      (c <    0x80) { *out++ = c;                          bits = -6; }
        else if (c <   0x800) { *out++ = ((c >>  6) & 0x1F) | 0xC0;  bits =  0; }
        else if (c < 0x10000) { *out++ = ((c >> 12) & 0x0F) | 0xE0;  bits =  6; }
        else                  { *out++ = ((c >> 18) & 0x07) | 0xF0;  bits = 12; }

        for ( ; bits >= 0; bits -= 6)
            *out++ = ((c >> bits) & 0x3F) | 0x80;

        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(*outlen);
}
805
806 #ifdef LIBXML_OUTPUT_ENABLED
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out. No byte order mark is written. The output is
 * written byte by byte, so the result does not depend on the byte order
 * or alignment rules of the host.
 *
 * Conversion stops, without error, at a UTF-8 sequence cut by the end of
 * the input, at a character that does not fit in the remaining output,
 * and at a code point of 0x110000 or above (which UTF-16 cannot encode).
 *
 * Returns the number of bytes written, -1 if @outb, @outlen or @inlen is
 *     NULL, or -2 if the transcoding failed (invalid lead or continuation
 *     byte). A NULL @in is an initialization call and yields 0.
 * The value of *@inlen after return is the number of octets consumed;
 *     only completely converted characters are counted.
 * The value of *@outlen after return is the number of octets produced,
 *     on the error paths as well.
 */
static int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
              const unsigned char* in, int *inlen)
{
    unsigned char *out = outb;
    unsigned char *outend;
    const unsigned char *processed = in;
    const unsigned char *const instart = in;
    const unsigned char *inend;
    unsigned int c, d, high, low;
    int trailing;

    /* UTF-16BE has no BOM */
    if ((outb == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
    if (in == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units are usable */
    outend = outb + (*outlen / 2) * 2;

    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            goto transcoding_failed;
        }
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in UTF-16 */
            goto transcoding_failed;
        }

        /* incomplete sequence at the end of the input: wait for more */
        if (inend - in < trailing)
            break;

        for ( ; trailing > 0; trailing--) {
            d = *in++;
            /*
             * A byte that is not a continuation byte makes the sequence
             * malformed; fail instead of emitting a partial code point.
             */
            if ((d & 0xC0) != 0x80)
                goto transcoding_failed;
            c = (c << 6) | (d & 0x3F);
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            out[0] = (unsigned char) (c >> 8);
            out[1] = (unsigned char) (c & 0xFF);
            out += 2;
        }
        else if (c < 0x110000) {
            /* needs a surrogate pair: two 16-bit units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            high = 0xD800 | (c >> 10);
            low  = 0xDC00 | (c & 0x03FF);
            out[0] = (unsigned char) (high >> 8);
            out[1] = (unsigned char) (high & 0xFF);
            out[2] = (unsigned char) (low >> 8);
            out[3] = (unsigned char) (low & 0xFF);
            out += 4;
        }
        else
            break;
        processed = in;
    }
    *outlen = out - outb;
    *inlen = processed - instart;
    return(*outlen);

transcoding_failed:
    /* byte count, like the success path */
    *outlen = out - outb;
    *inlen = processed - instart;
    return(-2);
}
912 #endif /* LIBXML_OUTPUT_ENABLED */
913
914 /************************************************************************
915 * *
916 * Generic encoding handling routines *
917 * *
918 ************************************************************************/
919
920 /**
921 * xmlDetectCharEncoding:
922 * @in: a pointer to the first bytes of the XML entity, must be at least
923 * 2 bytes long (at least 4 if encoding is UTF4 variant).
924 * @len: pointer to the length of the buffer
925 *
926 * Guess the encoding of the entity using the first bytes of the entity content
927 * according to the non-normative appendix F of the XML-1.0 recommendation.
928 *
929 * Returns one of the XML_CHAR_ENCODING_... values.
930 */
931 xmlCharEncoding
932 xmlDetectCharEncoding(const unsigned char* in, int len)
933 {
934 if (in == NULL)
935 return(XML_CHAR_ENCODING_NONE);
936 if (len >= 4) {
937 if ((in[0] == 0x00) && (in[1] == 0x00) &&
938 (in[2] == 0x00) && (in[3] == 0x3C))
939 return(XML_CHAR_ENCODING_UCS4BE);
940 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
941 (in[2] == 0x00) && (in[3] == 0x00))
942 return(XML_CHAR_ENCODING_UCS4LE);
943 if ((in[0] == 0x00) && (in[1] == 0x00) &&
944 (in[2] == 0x3C) && (in[3] == 0x00))
945 return(XML_CHAR_ENCODING_UCS4_2143);
946 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
947 (in[2] == 0x00) && (in[3] == 0x00))
948 return(XML_CHAR_ENCODING_UCS4_3412);
949 if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
950 (in[2] == 0xA7) && (in[3] == 0x94))
951 return(XML_CHAR_ENCODING_EBCDIC);
952 if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
953 (in[2] == 0x78) && (in[3] == 0x6D))
954 return(XML_CHAR_ENCODING_UTF8);
955 /*
956 * Although not part of the recommendation, we also
957 * attempt an "auto-recognition" of UTF-16LE and
958 * UTF-16BE encodings.
959 */
960 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
961 (in[2] == 0x3F) && (in[3] == 0x00))
962 return(XML_CHAR_ENCODING_UTF16LE);
963 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
964 (in[2] == 0x00) && (in[3] == 0x3F))
965 return(XML_CHAR_ENCODING_UTF16BE);
966 }
967 if (len >= 3) {
968 /*
969 * Errata on XML-1.0 June 20 2001
970 * We now allow an UTF8 encoded BOM
971 */
972 if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
973 (in[2] == 0xBF))
974 return(XML_CHAR_ENCODING_UTF8);
975 }
976 /* For UTF-16 we can recognize by the BOM */
977 if (len >= 2) {
978 if ((in[0] == 0xFE) && (in[1] == 0xFF))
979 return(XML_CHAR_ENCODING_UTF16BE);
980 if ((in[0] == 0xFF) && (in[1] == 0xFE))
981 return(XML_CHAR_ENCODING_UTF16LE);
982 }
983 return(XML_CHAR_ENCODING_NONE);
984 }
985
986 /**
987 * xmlCleanupEncodingAliases:
988 *
989 * Unregisters all aliases
990 */
991 void
992 xmlCleanupEncodingAliases(void) {
993 int i;
994
995 if (xmlCharEncodingAliases == NULL)
996 return;
997
998 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
999 if (xmlCharEncodingAliases[i].name != NULL)
1000 xmlFree((char *) xmlCharEncodingAliases[i].name);
1001 if (xmlCharEncodingAliases[i].alias != NULL)
1002 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1003 }
1004 xmlCharEncodingAliasesNb = 0;
1005 xmlCharEncodingAliasesMax = 0;
1006 xmlFree(xmlCharEncodingAliases);
1007 xmlCharEncodingAliases = NULL;
1008 }
1009
1010 /**
1011 * xmlGetEncodingAlias:
1012 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1013 *
1014 * Lookup an encoding name for the given alias.
1015 *
1016 * Returns NULL if not found, otherwise the original name
1017 */
1018 const char *
1019 xmlGetEncodingAlias(const char *alias) {
1020 int i;
1021 char upper[100];
1022
1023 if (alias == NULL)
1024 return(NULL);
1025
1026 if (xmlCharEncodingAliases == NULL)
1027 return(NULL);
1028
1029 for (i = 0;i < 99;i++) {
1030 upper[i] = toupper(alias[i]);
1031 if (upper[i] == 0) break;
1032 }
1033 upper[i] = 0;
1034
1035 /*
1036 * Walk down the list looking for a definition of the alias
1037 */
1038 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1039 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1040 return(xmlCharEncodingAliases[i].name);
1041 }
1042 }
1043 return(NULL);
1044 }
1045
1046 /**
1047 * xmlAddEncodingAlias:
1048 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1049 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1050 *
1051 * Registers an alias @alias for an encoding named @name. Existing alias
1052 * will be overwritten.
1053 *
1054 * Returns 0 in case of success, -1 in case of error
1055 */
1056 int
1057 xmlAddEncodingAlias(const char *name, const char *alias) {
1058 int i;
1059 char upper[100];
1060
1061 if ((name == NULL) || (alias == NULL))
1062 return(-1);
1063
1064 for (i = 0;i < 99;i++) {
1065 upper[i] = toupper(alias[i]);
1066 if (upper[i] == 0) break;
1067 }
1068 upper[i] = 0;
1069
1070 if (xmlCharEncodingAliases == NULL) {
1071 xmlCharEncodingAliasesNb = 0;
1072 xmlCharEncodingAliasesMax = 20;
1073 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1074 xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1075 if (xmlCharEncodingAliases == NULL)
1076 return(-1);
1077 } else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
1078 xmlCharEncodingAliasesMax *= 2;
1079 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1080 xmlRealloc(xmlCharEncodingAliases,
1081 xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1082 }
1083 /*
1084 * Walk down the list looking for a definition of the alias
1085 */
1086 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1087 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1088 /*
1089 * Replace the definition.
1090 */
1091 xmlFree((char *) xmlCharEncodingAliases[i].name);
1092 xmlCharEncodingAliases[i].name = xmlMemStrdup(name);
1093 return(0);
1094 }
1095 }
1096 /*
1097 * Add the definition
1098 */
1099 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name);
1100 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper);
1101 xmlCharEncodingAliasesNb++;
1102 return(0);
1103 }
1104
1105 /**
1106 * xmlDelEncodingAlias:
1107 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1108 *
1109 * Unregisters an encoding alias @alias
1110 *
1111 * Returns 0 in case of success, -1 in case of error
1112 */
1113 int
1114 xmlDelEncodingAlias(const char *alias) {
1115 int i;
1116
1117 if (alias == NULL)
1118 return(-1);
1119
1120 if (xmlCharEncodingAliases == NULL)
1121 return(-1);
1122 /*
1123 * Walk down the list looking for a definition of the alias
1124 */
1125 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1126 if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
1127 xmlFree((char *) xmlCharEncodingAliases[i].name);
1128 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1129 xmlCharEncodingAliasesNb--;
1130 memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
1131 sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
1132 return(0);
1133 }
1134 }
1135 return(-1);
1136 }
1137
1138 /**
1139 * xmlParseCharEncoding:
1140 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1141 *
1142 * Compare the string to the encoding schemes already known. Note
1143 * that the comparison is case insensitive accordingly to the section
1144 * [XML] 4.3.3 Character Encoding in Entities.
1145 *
1146 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
1147 * if not recognized.
1148 */
1149 xmlCharEncoding
1150 xmlParseCharEncoding(const char* name)
1151 {
1152 const char *alias;
1153 char upper[500];
1154 int i;
1155
1156 if (name == NULL)
1157 return(XML_CHAR_ENCODING_NONE);
1158
1159 /*
1160 * Do the alias resolution
1161 */
1162 alias = xmlGetEncodingAlias(name);
1163 if (alias != NULL)
1164 name = alias;
1165
1166 for (i = 0;i < 499;i++) {
1167 upper[i] = toupper(name[i]);
1168 if (upper[i] == 0) break;
1169 }
1170 upper[i] = 0;
1171
1172 if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
1173 if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
1174 if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
1175
1176 /*
1177 * NOTE: if we were able to parse this, the endianness of UTF16 is
1178 * already found and in use
1179 */
1180 if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
1181 if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
1182
1183 if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1184 if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1185 if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
1186
1187 /*
1188 * NOTE: if we were able to parse this, the endianness of UCS4 is
1189 * already found and in use
1190 */
1191 if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1192 if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1193 if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
1194
1195
1196 if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
1197 if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
1198 if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
1199
1200 if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
1201 if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
1202 if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
1203
1204 if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
1205 if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
1206 if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
1207 if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
1208 if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
1209 if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
1210 if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
1211
1212 if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1213 if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1214 if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1215
1216 #ifdef DEBUG_ENCODING
1217 xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name);
1218 #endif
1219 return(XML_CHAR_ENCODING_ERROR);
1220 }
1221
1222 /**
1223 * xmlGetCharEncodingName:
1224 * @enc: the encoding
1225 *
1226 * The "canonical" name for XML encoding.
1227 * C.f. http://www.w3.org/TR/REC-xml#charencoding
1228 * Section 4.3.3 Character Encoding in Entities
1229 *
1230 * Returns the canonical name for the given encoding
1231 */
1232
1233 const char*
1234 xmlGetCharEncodingName(xmlCharEncoding enc) {
1235 switch (enc) {
1236 case XML_CHAR_ENCODING_ERROR:
1237 return(NULL);
1238 case XML_CHAR_ENCODING_NONE:
1239 return(NULL);
1240 case XML_CHAR_ENCODING_UTF8:
1241 return("UTF-8");
1242 case XML_CHAR_ENCODING_UTF16LE:
1243 return("UTF-16");
1244 case XML_CHAR_ENCODING_UTF16BE:
1245 return("UTF-16");
1246 case XML_CHAR_ENCODING_EBCDIC:
1247 return("EBCDIC");
1248 case XML_CHAR_ENCODING_UCS4LE:
1249 return("ISO-10646-UCS-4");
1250 case XML_CHAR_ENCODING_UCS4BE:
1251 return("ISO-10646-UCS-4");
1252 case XML_CHAR_ENCODING_UCS4_2143:
1253 return("ISO-10646-UCS-4");
1254 case XML_CHAR_ENCODING_UCS4_3412:
1255 return("ISO-10646-UCS-4");
1256 case XML_CHAR_ENCODING_UCS2:
1257 return("ISO-10646-UCS-2");
1258 case XML_CHAR_ENCODING_8859_1:
1259 return("ISO-8859-1");
1260 case XML_CHAR_ENCODING_8859_2:
1261 return("ISO-8859-2");
1262 case XML_CHAR_ENCODING_8859_3:
1263 return("ISO-8859-3");
1264 case XML_CHAR_ENCODING_8859_4:
1265 return("ISO-8859-4");
1266 case XML_CHAR_ENCODING_8859_5:
1267 return("ISO-8859-5");
1268 case XML_CHAR_ENCODING_8859_6:
1269 return("ISO-8859-6");
1270 case XML_CHAR_ENCODING_8859_7:
1271 return("ISO-8859-7");
1272 case XML_CHAR_ENCODING_8859_8:
1273 return("ISO-8859-8");
1274 case XML_CHAR_ENCODING_8859_9:
1275 return("ISO-8859-9");
1276 case XML_CHAR_ENCODING_2022_JP:
1277 return("ISO-2022-JP");
1278 case XML_CHAR_ENCODING_SHIFT_JIS:
1279 return("Shift-JIS");
1280 case XML_CHAR_ENCODING_EUC_JP:
1281 return("EUC-JP");
1282 case XML_CHAR_ENCODING_ASCII:
1283 return(NULL);
1284 }
1285 return(NULL);
1286 }
1287
1288 /************************************************************************
1289 * *
1290 * Char encoding handlers *
1291 * *
1292 ************************************************************************/
1293
1294
/*
 * Table of registered encoding handlers. It is allocated once with room
 * for MAX_ENCODING_HANDLERS entries by xmlInitCharEncodingHandlers() and
 * registration is refused once it is full.
 * The size should be growable, but it's not a big deal ...
 */
#define MAX_ENCODING_HANDLERS 50
static xmlCharEncodingHandlerPtr *handlers = NULL;
/* Number of slots of handlers[] currently in use. */
static int nbCharEncodingHandler = 0;

/*
 * The default is UTF-8 for XML, that's also the default used for the
 * parser internals, so the default encoding handler is NULL
 */

static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
1306
1307 /**
1308 * xmlNewCharEncodingHandler:
1309 * @name: the encoding name, in UTF-8 format (ASCII actually)
1310 * @input: the xmlCharEncodingInputFunc to read that encoding
1311 * @output: the xmlCharEncodingOutputFunc to write that encoding
1312 *
1313 * Create and registers an xmlCharEncodingHandler.
1314 *
1315 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
1316 */
1317 xmlCharEncodingHandlerPtr
1318 xmlNewCharEncodingHandler(const char *name,
1319 xmlCharEncodingInputFunc input,
1320 xmlCharEncodingOutputFunc output) {
1321 xmlCharEncodingHandlerPtr handler;
1322 const char *alias;
1323 char upper[500];
1324 int i;
1325 char *up = NULL;
1326
1327 /*
1328 * Do the alias resolution
1329 */
1330 alias = xmlGetEncodingAlias(name);
1331 if (alias != NULL)
1332 name = alias;
1333
1334 /*
1335 * Keep only the uppercase version of the encoding.
1336 */
1337 if (name == NULL) {
1338 xmlEncodingErr(XML_I18N_NO_NAME,
1339 "xmlNewCharEncodingHandler : no name !\n", NULL);
1340 return(NULL);
1341 }
1342 for (i = 0;i < 499;i++) {
1343 upper[i] = toupper(name[i]);
1344 if (upper[i] == 0) break;
1345 }
1346 upper[i] = 0;
1347 up = xmlMemStrdup(upper);
1348 if (up == NULL) {
1349 xmlEncodingErrMemory("xmlNewCharEncodingHandler : out of memory !\n");
1350 return(NULL);
1351 }
1352
1353 /*
1354 * allocate and fill-up an handler block.
1355 */
1356 handler = (xmlCharEncodingHandlerPtr)
1357 xmlMalloc(sizeof(xmlCharEncodingHandler));
1358 if (handler == NULL) {
1359 xmlFree(up);
1360 xmlEncodingErrMemory("xmlNewCharEncodingHandler : out of memory !\n");
1361 return(NULL);
1362 }
1363 memset(handler, 0, sizeof(xmlCharEncodingHandler));
1364 handler->input = input;
1365 handler->output = output;
1366 handler->name = up;
1367
1368 #ifdef LIBXML_ICONV_ENABLED
1369 handler->iconv_in = NULL;
1370 handler->iconv_out = NULL;
1371 #endif
1372 #ifdef LIBXML_ICU_ENABLED
1373 handler->uconv_in = NULL;
1374 handler->uconv_out = NULL;
1375 #endif
1376
1377 /*
1378 * registers and returns the handler.
1379 */
1380 xmlRegisterCharEncodingHandler(handler);
1381 #ifdef DEBUG_ENCODING
1382 xmlGenericError(xmlGenericErrorContext,
1383 "Registered encoding handler for %s\n", name);
1384 #endif
1385 return(handler);
1386 }
1387
1388 /**
1389 * xmlInitCharEncodingHandlers:
1390 *
1391 * Initialize the char encoding support, it registers the default
1392 * encoding supported.
1393 * NOTE: while public, this function usually doesn't need to be called
1394 * in normal processing.
1395 */
1396 void
1397 xmlInitCharEncodingHandlers(void) {
1398 unsigned short int tst = 0x1234;
1399 unsigned char *ptr = (unsigned char *) &tst;
1400
1401 if (handlers != NULL) return;
1402
1403 handlers = (xmlCharEncodingHandlerPtr *)
1404 xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1405
1406 if (*ptr == 0x12) xmlLittleEndian = 0;
1407 else if (*ptr == 0x34) xmlLittleEndian = 1;
1408 else {
1409 xmlEncodingErr(XML_ERR_INTERNAL_ERROR,
1410 "Odd problem at endianness detection\n", NULL);
1411 }
1412
1413 if (handlers == NULL) {
1414 xmlEncodingErrMemory("xmlInitCharEncodingHandlers : out of memory !\n");
1415 return;
1416 }
1417 xmlNewCharEncodingHandler("UTF-8", UTF8ToUTF8, UTF8ToUTF8);
1418 #ifdef LIBXML_OUTPUT_ENABLED
1419 xmlUTF16LEHandler =
1420 xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
1421 xmlUTF16BEHandler =
1422 xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1423 xmlNewCharEncodingHandler("UTF-16", UTF16LEToUTF8, UTF8ToUTF16);
1424 xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1425 xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
1426 xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, UTF8Toascii);
1427 #ifdef LIBXML_HTML_ENABLED
1428 xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
1429 #endif
1430 #else
1431 xmlUTF16LEHandler =
1432 xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, NULL);
1433 xmlUTF16BEHandler =
1434 xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, NULL);
1435 xmlNewCharEncodingHandler("UTF-16", UTF16LEToUTF8, NULL);
1436 xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, NULL);
1437 xmlNewCharEncodingHandler("ASCII", asciiToUTF8, NULL);
1438 xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, NULL);
1439 #endif /* LIBXML_OUTPUT_ENABLED */
1440 #if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED)
1441 #ifdef LIBXML_ISO8859X_ENABLED
1442 xmlRegisterCharEncodingHandlersISO8859x ();
1443 #endif
1444 #endif
1445
1446 }
1447
1448 /**
1449 * xmlCleanupCharEncodingHandlers:
1450 *
1451 * Cleanup the memory allocated for the char encoding support, it
1452 * unregisters all the encoding handlers and the aliases.
1453 */
1454 void
1455 xmlCleanupCharEncodingHandlers(void) {
1456 xmlCleanupEncodingAliases();
1457
1458 if (handlers == NULL) return;
1459
1460 for (;nbCharEncodingHandler > 0;) {
1461 nbCharEncodingHandler--;
1462 if (handlers[nbCharEncodingHandler] != NULL) {
1463 if (handlers[nbCharEncodingHandler]->name != NULL)
1464 xmlFree(handlers[nbCharEncodingHandler]->name);
1465 xmlFree(handlers[nbCharEncodingHandler]);
1466 }
1467 }
1468 xmlFree(handlers);
1469 handlers = NULL;
1470 nbCharEncodingHandler = 0;
1471 xmlDefaultCharEncodingHandler = NULL;
1472 }
1473
1474 /**
1475 * xmlRegisterCharEncodingHandler:
1476 * @handler: the xmlCharEncodingHandlerPtr handler block
1477 *
1478 * Register the char encoding handler, surprising, isn't it ?
1479 */
1480 void
1481 xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
1482 if (handlers == NULL) xmlInitCharEncodingHandlers();
1483 if ((handler == NULL) || (handlers == NULL)) {
1484 xmlEncodingErr(XML_I18N_NO_HANDLER,
1485 "xmlRegisterCharEncodingHandler: NULL handler !\n", NULL);
1486 return;
1487 }
1488
1489 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
1490 xmlEncodingErr(XML_I18N_EXCESS_HANDLER,
1491 "xmlRegisterCharEncodingHandler: Too many handler registered, see %s\n",
1492 "MAX_ENCODING_HANDLERS");
1493 return;
1494 }
1495 handlers[nbCharEncodingHandler++] = handler;
1496 }
1497
1498 /**
1499 * xmlGetCharEncodingHandler:
1500 * @enc: an xmlCharEncoding value.
1501 *
1502 * Search in the registered set the handler able to read/write that encoding.
1503 *
1504 * Returns the handler or NULL if not found
1505 */
1506 xmlCharEncodingHandlerPtr
1507 xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1508 xmlCharEncodingHandlerPtr handler;
1509
1510 if (handlers == NULL) xmlInitCharEncodingHandlers();
1511 switch (enc) {
1512 case XML_CHAR_ENCODING_ERROR:
1513 return(NULL);
1514 case XML_CHAR_ENCODING_NONE:
1515 return(NULL);
1516 case XML_CHAR_ENCODING_UTF8:
1517 return(NULL);
1518 case XML_CHAR_ENCODING_UTF16LE:
1519 return(xmlUTF16LEHandler);
1520 case XML_CHAR_ENCODING_UTF16BE:
1521 return(xmlUTF16BEHandler);
1522 case XML_CHAR_ENCODING_EBCDIC:
1523 handler = xmlFindCharEncodingHandler("EBCDIC");
1524 if (handler != NULL) return(handler);
1525 handler = xmlFindCharEncodingHandler("ebcdic");
1526 if (handler != NULL) return(handler);
1527 handler = xmlFindCharEncodingHandler("EBCDIC-US");
1528 if (handler != NULL) return(handler);
1529 handler = xmlFindCharEncodingHandler("IBM-037");
1530 if (handler != NULL) return(handler);
1531 break;
1532 case XML_CHAR_ENCODING_UCS4BE:
1533 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1534 if (handler != NULL) return(handler);
1535 handler = xmlFindCharEncodingHandler("UCS-4");
1536 if (handler != NULL) return(handler);
1537 handler = xmlFindCharEncodingHandler("UCS4");
1538 if (handler != NULL) return(handler);
1539 break;
1540 case XML_CHAR_ENCODING_UCS4LE:
1541 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1542 if (handler != NULL) return(handler);
1543 handler = xmlFindCharEncodingHandler("UCS-4");
1544 if (handler != NULL) return(handler);
1545 handler = xmlFindCharEncodingHandler("UCS4");
1546 if (handler != NULL) return(handler);
1547 break;
1548 case XML_CHAR_ENCODING_UCS4_2143:
1549 break;
1550 case XML_CHAR_ENCODING_UCS4_3412:
1551 break;
1552 case XML_CHAR_ENCODING_UCS2:
1553 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
1554 if (handler != NULL) return(handler);
1555 handler = xmlFindCharEncodingHandler("UCS-2");
1556 if (handler != NULL) return(handler);
1557 handler = xmlFindCharEncodingHandler("UCS2");
1558 if (handler != NULL) return(handler);
1559 break;
1560
1561 /*
1562 * We used to keep ISO Latin encodings native in the
1563 * generated data. This led to so many problems that
1564 * this has been removed. One can still change this
1565 * back by registering no-ops encoders for those
1566 */
1567 case XML_CHAR_ENCODING_8859_1:
1568 handler = xmlFindCharEncodingHandler("ISO-8859-1");
1569 if (handler != NULL) return(handler);
1570 break;
1571 case XML_CHAR_ENCODING_8859_2:
1572 handler = xmlFindCharEncodingHandler("ISO-8859-2");
1573 if (handler != NULL) return(handler);
1574 break;
1575 case XML_CHAR_ENCODING_8859_3:
1576 handler = xmlFindCharEncodingHandler("ISO-8859-3");
1577 if (handler != NULL) return(handler);
1578 break;
1579 case XML_CHAR_ENCODING_8859_4:
1580 handler = xmlFindCharEncodingHandler("ISO-8859-4");
1581 if (handler != NULL) return(handler);
1582 break;
1583 case XML_CHAR_ENCODING_8859_5:
1584 handler = xmlFindCharEncodingHandler("ISO-8859-5");
1585 if (handler != NULL) return(handler);
1586 break;
1587 case XML_CHAR_ENCODING_8859_6:
1588 handler = xmlFindCharEncodingHandler("ISO-8859-6");
1589 if (handler != NULL) return(handler);
1590 break;
1591 case XML_CHAR_ENCODING_8859_7:
1592 handler = xmlFindCharEncodingHandler("ISO-8859-7");
1593 if (handler != NULL) return(handler);
1594 break;
1595 case XML_CHAR_ENCODING_8859_8:
1596 handler = xmlFindCharEncodingHandler("ISO-8859-8");
1597 if (handler != NULL) return(handler);
1598 break;
1599 case XML_CHAR_ENCODING_8859_9:
1600 handler = xmlFindCharEncodingHandler("ISO-8859-9");
1601 if (handler != NULL) return(handler);
1602 break;
1603
1604
1605 case XML_CHAR_ENCODING_2022_JP:
1606 handler = xmlFindCharEncodingHandler("ISO-2022-JP");
1607 if (handler != NULL) return(handler);
1608 break;
1609 case XML_CHAR_ENCODING_SHIFT_JIS:
1610 handler = xmlFindCharEncodingHandler("SHIFT-JIS");
1611 if (handler != NULL) return(handler);
1612 handler = xmlFindCharEncodingHandler("SHIFT_JIS");
1613 if (handler != NULL) return(handler);
1614 handler = xmlFindCharEncodingHandler("Shift_JIS");
1615 if (handler != NULL) return(handler);
1616 break;
1617 case XML_CHAR_ENCODING_EUC_JP:
1618 handler = xmlFindCharEncodingHandler("EUC-JP");
1619 if (handler != NULL) return(handler);
1620 break;
1621 default:
1622 break;
1623 }
1624
1625 #ifdef DEBUG_ENCODING
1626 xmlGenericError(xmlGenericErrorContext,
1627 "No handler found for encoding %d\n", enc);
1628 #endif
1629 return(NULL);
1630 }
1631
1632 /**
1633 * xmlFindCharEncodingHandler:
1634 * @name: a string describing the char encoding.
1635 *
1636 * Search in the registered set the handler able to read/write that encoding.
1637 *
1638 * Returns the handler or NULL if not found
1639 */
1640 xmlCharEncodingHandlerPtr
1641 xmlFindCharEncodingHandler(const char *name) {
1642 const char *nalias;
1643 const char *norig;
1644 xmlCharEncoding alias;
1645 #ifdef LIBXML_ICONV_ENABLED
1646 xmlCharEncodingHandlerPtr enc;
1647 iconv_t icv_in, icv_out;
1648 #endif /* LIBXML_ICONV_ENABLED */
1649 #ifdef LIBXML_ICU_ENABLED
1650 xmlCharEncodingHandlerPtr encu;
1651 uconv_t *ucv_in, *ucv_out;
1652 #endif /* LIBXML_ICU_ENABLED */
1653 char upper[100];
1654 int i;
1655
1656 if (handlers == NULL) xmlInitCharEncodingHandlers();
1657 if (name == NULL) return(xmlDefaultCharEncodingHandler);
1658 if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
1659
1660 /*
1661 * Do the alias resolution
1662 */
1663 norig = name;
1664 nalias = xmlGetEncodingAlias(name);
1665 if (nalias != NULL)
1666 name = nalias;
1667
1668 /*
1669 * Check first for directly registered encoding names
1670 */
1671 for (i = 0;i < 99;i++) {
1672 upper[i] = toupper(name[i]);
1673 if (upper[i] == 0) break;
1674 }
1675 upper[i] = 0;
1676
1677 if (handlers != NULL) {
1678 for (i = 0;i < nbCharEncodingHandler; i++) {
1679 if (!strcmp(upper, handlers[i]->name)) {
1680 #ifdef DEBUG_ENCODING
1681 xmlGenericError(xmlGenericErrorContext,
1682 "Found registered handler for encoding %s\n", name);
1683 #endif
1684 return(handlers[i]);
1685 }
1686 }
1687 }
1688
1689 #ifdef LIBXML_ICONV_ENABLED
1690 /* check whether iconv can handle this */
1691 icv_in = iconv_open("UTF-8", name);
1692 icv_out = iconv_open(name, "UTF-8");
1693 if (icv_in == (iconv_t) -1) {
1694 icv_in = iconv_open("UTF-8", upper);
1695 }
1696 if (icv_out == (iconv_t) -1) {
1697 icv_out = iconv_open(upper, "UTF-8");
1698 }
1699 if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1700 enc = (xmlCharEncodingHandlerPtr)
1701 xmlMalloc(sizeof(xmlCharEncodingHandler));
1702 if (enc == NULL) {
1703 iconv_close(icv_in);
1704 iconv_close(icv_out);
1705 return(NULL);
1706 }
1707 memset(enc, 0, sizeof(xmlCharEncodingHandler));
1708 enc->name = xmlMemStrdup(name);
1709 enc->input = NULL;
1710 enc->output = NULL;
1711 enc->iconv_in = icv_in;
1712 enc->iconv_out = icv_out;
1713 #ifdef DEBUG_ENCODING
1714 xmlGenericError(xmlGenericErrorContext,
1715 "Found iconv handler for encoding %s\n", name);
1716 #endif
1717 return enc;
1718 } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
1719 xmlEncodingErr(XML_ERR_INTERNAL_ERROR,
1720 "iconv : problems with filters for '%s'\n", name);
1721 }
1722 #endif /* LIBXML_ICONV_ENABLED */
1723 #ifdef LIBXML_ICU_ENABLED
1724 /* check whether icu can handle this */
1725 ucv_in = openIcuConverter(name, 1);
1726 ucv_out = openIcuConverter(name, 0);
1727 if (ucv_in != NULL && ucv_out != NULL) {
1728 encu = (xmlCharEncodingHandlerPtr)
1729 xmlMalloc(sizeof(xmlCharEncodingHandler));
1730 if (encu == NULL) {
1731 closeIcuConverter(ucv_in);
1732 closeIcuConverter(ucv_out);
1733 return(NULL);
1734 }
1735 memset(encu, 0, sizeof(xmlCharEncodingHandler));
1736 encu->name = xmlMemStrdup(name);
1737 encu->input = NULL;
1738 encu->output = NULL;
1739 encu->uconv_in = ucv_in;
1740 encu->uconv_out = ucv_out;
1741 #ifdef DEBUG_ENCODING
1742 xmlGenericError(xmlGenericErrorContext,
1743 "Found ICU converter handler for encoding %s\n", name);
1744 #endif
1745 return encu;
1746 } else if (ucv_in != NULL || ucv_out != NULL) {
1747 closeIcuConverter(ucv_in);
1748 closeIcuConverter(ucv_out);
1749 xmlEncodingErr(XML_ERR_INTERNAL_ERROR,
1750 "ICU converter : problems with filters for '%s'\n", name);
1751 }
1752 #endif /* LIBXML_ICU_ENABLED */
1753
1754 #ifdef DEBUG_ENCODING
1755 xmlGenericError(xmlGenericErrorContext,
1756 "No handler found for encoding %s\n", name);
1757 #endif
1758
1759 /*
1760 * Fallback using the canonical names
1761 */
1762 alias = xmlParseCharEncoding(norig);
1763 if (alias != XML_CHAR_ENCODING_ERROR) {
1764 const char* canon;
1765 canon = xmlGetCharEncodingName(alias);
1766 if ((canon != NULL) && (strcmp(name, canon))) {
1767 return(xmlFindCharEncodingHandler(canon));
1768 }
1769 }
1770
1771 /* If "none of the above", give up */
1772 return(NULL);
1773 }
1774
1775 /************************************************************************
1776 * *
1777 * ICONV based generic conversion functions *
1778 * *
1779 ************************************************************************/
1780
#ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:  iconv converter data structure
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of input bytes
 * @inlen:  the length of @in
 *
 * Runs one iconv() call over the given buffers.
 *
 * Returns 0 if success, or
 *     -1 by lack of space (or when one of the pointer arguments is NULL), or
 *     -2 if the transcoding fails (the input is not valid in the source
 *        encoding or cannot be represented in the target encoding), or
 *     -3 for an incomplete trailing sequence or any other failure.
 *
 * On return @inlen has been reduced by the input byte count iconv left
 * unconverted and @outlen by the output room it left unused.
 */
static int
xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen,
                const unsigned char *in, int *inlen) {
    size_t inLeft, outLeft;
    const char *src = (const char *) in;
    char *dst = (char *) out;
    int rc;

    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
        if (outlen != NULL) *outlen = 0;
        return(-1);
    }

    inLeft = *inlen;
    outLeft = *outlen;
    rc = iconv(cd, (ICONV_CONST char **) &src, &inLeft, &dst, &outLeft);
    *inlen -= inLeft;
    *outlen -= outLeft;

    /* Everything consumed and no error reported: done. */
    if ((inLeft == 0) && (rc != -1))
        return 0;

    /* Translate errno into the converter return codes. */
#ifdef EILSEQ
    if (errno == EILSEQ)
        return -2;
#endif
#ifdef E2BIG
    if (errno == E2BIG)
        return -1;
#endif
    /* EINVAL and anything else */
    return -3;
}
#endif /* LIBXML_ICONV_ENABLED */
1840
1841 /************************************************************************
1842 * *
1843 * ICU based generic conversion functions *
1844 * *
1845 ************************************************************************/
1846
#ifdef LIBXML_ICU_ENABLED
/**
 * xmlUconvWrapper:
 * @cd:  ICU uconverter data structure
 * @toUnicode:  non-zero if toUnicode. 0 otherwise.
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of input bytes
 * @inlen:  the length of @in
 * @flush:  if true, indicates end of input
 *
 * Runs one ucnv_convertEx() call through the pivot buffer held in @cd.
 *
 * Returns 0 if success, or
 *     -1 by lack of space (or when one of the pointer arguments is NULL), or
 *     -2 if the transcoding fails (invalid or illegal character found), or
 *     -3 for any other ICU error.
 *
 * On return @inlen holds the number of input bytes consumed and @outlen
 * the number of output bytes written.
 */
static int
xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
                const unsigned char *in, int *inlen, int flush) {
    const char *src = (const char *) in;
    char *dst = (char *) out;
    UErrorCode status = U_ZERO_ERROR;

    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
        if (outlen != NULL) *outlen = 0;
        return(-1);
    }

    if (!toUnicode) {
        /* UTF-8 => UTF-16 => encoding */
        ucnv_convertEx(cd->uconv, cd->utf8, &dst, dst + *outlen,
                       &src, src + *inlen, cd->pivot_buf,
                       &cd->pivot_source, &cd->pivot_target,
                       cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, flush, &status);
    } else {
        /* encoding => UTF-16 => UTF-8 */
        ucnv_convertEx(cd->utf8, cd->uconv, &dst, dst + *outlen,
                       &src, src + *inlen, cd->pivot_buf,
                       &cd->pivot_source, &cd->pivot_target,
                       cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, flush, &status);
    }

    /* The advanced pointers tell how much was consumed and produced. */
    *inlen = src - (const char*) in;
    *outlen = dst - (char *) out;

    if (U_SUCCESS(status)) {
        /* reset pivot buf if this is the last call for input (flush==TRUE) */
        if (flush)
            cd->pivot_source = cd->pivot_target = cd->pivot_buf;
        return 0;
    }

    switch (status) {
        case U_BUFFER_OVERFLOW_ERROR:
            return -1;
        case U_INVALID_CHAR_FOUND:
        case U_ILLEGAL_CHAR_FOUND:
            return -2;
        default:
            return -3;
    }
}
#endif /* LIBXML_ICU_ENABLED */
1908
1909 /************************************************************************
1910 * *
1911 * The real API used by libxml for on-the-fly conversion *
1912 * *
1913 ************************************************************************/
1914
1915 static int
1916 xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
1917 int *outlen, const unsigned char *in, int *inlen, int flush) {
1918 int ret;
1919 (void)flush;
1920
1921 if (handler->input != NULL) {
1922 ret = handler->input(out, outlen, in, inlen);
1923 }
1924 #ifdef LIBXML_ICONV_ENABLED
1925 else if (handler->iconv_in != NULL) {
1926 ret = xmlIconvWrapper(handler->iconv_in, out, outlen, in, inlen);
1927 }
1928 #endif /* LIBXML_ICONV_ENABLED */
1929 #ifdef LIBXML_ICU_ENABLED
1930 else if (handler->uconv_in != NULL) {
1931 ret = xmlUconvWrapper(handler->uconv_in, 1, out, outlen, in, inlen,
1932 flush);
1933 }
1934 #endif /* LIBXML_ICU_ENABLED */
1935 else {
1936 *outlen = 0;
1937 *inlen = 0;
1938 ret = -2;
1939 }
1940
1941 return(ret);
1942 }
1943
1944 /* Returns -4 if no output function was found. */
1945 static int
1946 xmlEncOutputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
1947 int *outlen, const unsigned char *in, int *inlen) {
1948 int ret;
1949
1950 if (handler->output != NULL) {
1951 ret = handler->output(out, outlen, in, inlen);
1952 }
1953 #ifdef LIBXML_ICONV_ENABLED
1954 else if (handler->iconv_out != NULL) {
1955 ret = xmlIconvWrapper(handler->iconv_out, out, outlen, in, inlen);
1956 }
1957 #endif /* LIBXML_ICONV_ENABLED */
1958 #ifdef LIBXML_ICU_ENABLED
1959 else if (handler->uconv_out != NULL) {
1960 ret = xmlUconvWrapper(handler->uconv_out, 0, out, outlen, in, inlen,
1961 TRUE);
1962 }
1963 #endif /* LIBXML_ICU_ENABLED */
1964 else {
1965 *outlen = 0;
1966 *inlen = 0;
1967 ret = -4;
1968 }
1969
1970 return(ret);
1971 }
1972
1973 /**
1974 * xmlCharEncFirstLineInt:
1975 * @handler: char enconding transformation data structure
1976 * @out: an xmlBuffer for the output.
1977 * @in: an xmlBuffer for the input
1978 * @len: number of bytes to convert for the first line, or -1
1979 *
1980 * Front-end for the encoding handler input function, but handle only
1981 * the very first line, i.e. limit itself to 45 chars.
1982 *
1983 * Returns the number of byte written if success, or
1984 * -1 general error
1985 * -2 if the transcoding fails (for *in is not valid utf8 string or
1986 * the result of transformation can't fit into the encoding we want), or
1987 */
1988 int
1989 xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1990 xmlBufferPtr in, int len) {
1991 int ret;
1992 int written;
1993 int toconv;
1994
1995 if (handler == NULL) return(-1);
1996 if (out == NULL) return(-1);
1997 if (in == NULL) return(-1);
1998
1999 /* calculate space available */
2000 written = out->size - out->use - 1; /* count '\0' */
2001 toconv = in->use;
2002 /*
2003 * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
2004 * 45 chars should be sufficient to reach the end of the encoding
2005 * declaration without going too far inside the document content.
2006 * on UTF-16 this means 90bytes, on UCS4 this means 180
2007 * The actual value depending on guessed encoding is passed as @len
2008 * if provided
2009 */
2010 if (len >= 0) {
2011 if (toconv > len)
2012 toconv = len;
2013 } else {
2014 if (toconv > 180)
2015 toconv = 180;
2016 }
2017 if (toconv * 2 >= written) {
2018 xmlBufferGrow(out, toconv * 2);
2019 written = out->size - out->use - 1;
2020 }
2021
2022 ret = xmlEncInputChunk(handler, &out->content[out->use], &written,
2023 in->content, &toconv, 0);
2024 xmlBufferShrink(in, toconv);
2025 out->use += written;
2026 out->content[out->use] = 0;
2027 if (ret == -1) ret = -3;
2028
2029 #ifdef DEBUG_ENCODING
2030 switch (ret) {
2031 case 0:
2032 xmlGenericError(xmlGenericErrorContext,
2033 "converted %d bytes to %d bytes of input\n",
2034 toconv, written);
2035 break;
2036 case -1:
2037 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2038 toconv, written, in->use);
2039 break;
2040 case -2:
2041 xmlGenericError(xmlGenericErrorContext,
2042 "input conversion failed due to input error\n");
2043 break;
2044 case -3:
2045 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2046 toconv, written, in->use);
2047 break;
2048 default:
2049 xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret);
2050 }
2051 #endif /* DEBUG_ENCODING */
2052 /*
2053 * Ignore when input buffer is not on a boundary
2054 */
2055 if (ret == -3) ret = 0;
2056 if (ret == -1) ret = 0;
2057 return(ret);
2058 }
2059
2060 /**
2061 * xmlCharEncFirstLine:
2062 * @handler: char enconding transformation data structure
2063 * @out: an xmlBuffer for the output.
2064 * @in: an xmlBuffer for the input
2065 *
2066 * Front-end for the encoding handler input function, but handle only
2067 * the very first line, i.e. limit itself to 45 chars.
2068 *
2069 * Returns the number of byte written if success, or
2070 * -1 general error
2071 * -2 if the transcoding fails (for *in is not valid utf8 string or
2072 * the result of transformation can't fit into the encoding we want), or
2073 */
2074 int
2075 xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
2076 xmlBufferPtr in) {
2077 return(xmlCharEncFirstLineInt(handler, out, in, -1));
2078 }
2079
2080 /**
2081 * xmlCharEncFirstLineInput:
2082 * @input: a parser input buffer
2083 * @len: number of bytes to convert for the first line, or -1
2084 *
2085 * Front-end for the encoding handler input function, but handle only
2086 * the very first line. Point is that this is based on autodetection
2087 * of the encoding and once that first line is converted we may find
2088 * out that a different decoder is needed to process the input.
2089 *
2090 * Returns the number of byte written if success, or
2091 * -1 general error
2092 * -2 if the transcoding fails (for *in is not valid utf8 string or
2093 * the result of transformation can't fit into the encoding we want), or
2094 */
2095 int
2096 xmlCharEncFirstLineInput(xmlParserInputBufferPtr input, int len)
2097 {
2098 int ret;
2099 size_t written;
2100 size_t toconv;
2101 int c_in;
2102 int c_out;
2103 xmlBufPtr in;
2104 xmlBufPtr out;
2105
2106 if ((input == NULL) || (input->encoder == NULL) ||
2107 (input->buffer == NULL) || (input->raw == NULL))
2108 return (-1);
2109 out = input->buffer;
2110 in = input->raw;
2111
2112 toconv = xmlBufUse(in);
2113 if (toconv == 0)
2114 return (0);
2115 written = xmlBufAvail(out) - 1; /* count '\0' */
2116 /*
2117 * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
2118 * 45 chars should be sufficient to reach the end of the encoding
2119 * declaration without going too far inside the document content.
2120 * on UTF-16 this means 90bytes, on UCS4 this means 180
2121 * The actual value depending on guessed encoding is passed as @len
2122 * if provided
2123 */
2124 if (len >= 0) {
2125 if (toconv > (unsigned int) len)
2126 toconv = len;
2127 } else {
2128 if (toconv > 180)
2129 toconv = 180;
2130 }
2131 if (toconv * 2 >= written) {
2132 xmlBufGrow(out, toconv * 2);
2133 written = xmlBufAvail(out) - 1;
2134 }
2135 if (written > 360)
2136 written = 360;
2137
2138 c_in = toconv;
2139 c_out = written;
2140 ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
2141 xmlBufContent(in), &c_in, 0);
2142 xmlBufShrink(in, c_in);
2143 xmlBufAddLen(out, c_out);
2144 if (ret == -1)
2145 ret = -3;
2146
2147 switch (ret) {
2148 case 0:
2149 #ifdef DEBUG_ENCODING
2150 xmlGenericError(xmlGenericErrorContext,
2151 "converted %d bytes to %d bytes of input\n",
2152 c_in, c_out);
2153 #endif
2154 break;
2155 case -1:
2156 #ifdef DEBUG_ENCODING
2157 xmlGenericError(xmlGenericErrorContext,
2158 "converted %d bytes to %d bytes of input, %d left\n",
2159 c_in, c_out, (int)xmlBufUse(in));
2160 #endif
2161 break;
2162 case -3:
2163 #ifdef DEBUG_ENCODING
2164 xmlGenericError(xmlGenericErrorContext,
2165 "converted %d bytes to %d bytes of input, %d left\n",
2166 c_in, c_out, (int)xmlBufUse(in));
2167 #endif
2168 break;
2169 case -2: {
2170 char buf[50];
2171 const xmlChar *content = xmlBufContent(in);
2172
2173 snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X",
2174 content[0], content[1],
2175 content[2], content[3]);
2176 buf[49] = 0;
2177 xmlEncodingErr(XML_I18N_CONV_FAILED,
2178 "input conversion failed due to input error, bytes %s\n",
2179 buf);
2180 }
2181 }
2182 /*
2183 * Ignore when input buffer is not on a boundary
2184 */
2185 if (ret == -3) ret = 0;
2186 if (ret == -1) ret = 0;
2187 return(ret);
2188 }
2189
2190 /**
2191 * xmlCharEncInput:
2192 * @input: a parser input buffer
2193 * @flush: try to flush all the raw buffer
2194 *
2195 * Generic front-end for the encoding handler on parser input
2196 *
2197 * Returns the number of byte written if success, or
2198 * -1 general error
2199 * -2 if the transcoding fails (for *in is not valid utf8 string or
2200 * the result of transformation can't fit into the encoding we want), or
2201 */
2202 int
2203 xmlCharEncInput(xmlParserInputBufferPtr input, int flush)
2204 {
2205 int ret;
2206 size_t written;
2207 size_t toconv;
2208 int c_in;
2209 int c_out;
2210 xmlBufPtr in;
2211 xmlBufPtr out;
2212
2213 if ((input == NULL) || (input->encoder == NULL) ||
2214 (input->buffer == NULL) || (input->raw == NULL))
2215 return (-1);
2216 out = input->buffer;
2217 in = input->raw;
2218
2219 toconv = xmlBufUse(in);
2220 if (toconv == 0)
2221 return (0);
2222 if ((toconv > 64 * 1024) && (flush == 0))
2223 toconv = 64 * 1024;
2224 written = xmlBufAvail(out);
2225 if (written > 0)
2226 written--; /* count '\0' */
2227 if (toconv * 2 >= written) {
2228 xmlBufGrow(out, toconv * 2);
2229 written = xmlBufAvail(out);
2230 if (written > 0)
2231 written--; /* count '\0' */
2232 }
2233 if ((written > 128 * 1024) && (flush == 0))
2234 written = 128 * 1024;
2235
2236 c_in = toconv;
2237 c_out = written;
2238 ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
2239 xmlBufContent(in), &c_in, flush);
2240 xmlBufShrink(in, c_in);
2241 xmlBufAddLen(out, c_out);
2242 if (ret == -1)
2243 ret = -3;
2244
2245 switch (ret) {
2246 case 0:
2247 #ifdef DEBUG_ENCODING
2248 xmlGenericError(xmlGenericErrorContext,
2249 "converted %d bytes to %d bytes of input\n",
2250 c_in, c_out);
2251 #endif
2252 break;
2253 case -1:
2254 #ifdef DEBUG_ENCODING
2255 xmlGenericError(xmlGenericErrorContext,
2256 "converted %d bytes to %d bytes of input, %d left\n",
2257 c_in, c_out, (int)xmlBufUse(in));
2258 #endif
2259 break;
2260 case -3:
2261 #ifdef DEBUG_ENCODING
2262 xmlGenericError(xmlGenericErrorContext,
2263 "converted %d bytes to %d bytes of input, %d left\n",
2264 c_in, c_out, (int)xmlBufUse(in));
2265 #endif
2266 break;
2267 case -2: {
2268 char buf[50];
2269 const xmlChar *content = xmlBufContent(in);
2270
2271 snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X",
2272 content[0], content[1],
2273 content[2], content[3]);
2274 buf[49] = 0;
2275 xmlEncodingErr(XML_I18N_CONV_FAILED,
2276 "input conversion failed due to input error, bytes %s\n",
2277 buf);
2278 }
2279 }
2280 /*
2281 * Ignore when input buffer is not on a boundary
2282 */
2283 if (ret == -3)
2284 ret = 0;
2285 return (c_out? c_out : ret);
2286 }
2287
2288 /**
2289 * xmlCharEncInFunc:
2290 * @handler: char encoding transformation data structure
2291 * @out: an xmlBuffer for the output.
2292 * @in: an xmlBuffer for the input
2293 *
2294 * Generic front-end for the encoding handler input function
2295 *
2296 * Returns the number of byte written if success, or
2297 * -1 general error
2298 * -2 if the transcoding fails (for *in is not valid utf8 string or
2299 * the result of transformation can't fit into the encoding we want), or
2300 */
2301 int
2302 xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
2303 xmlBufferPtr in)
2304 {
2305 int ret;
2306 int written;
2307 int toconv;
2308
2309 if (handler == NULL)
2310 return (-1);
2311 if (out == NULL)
2312 return (-1);
2313 if (in == NULL)
2314 return (-1);
2315
2316 toconv = in->use;
2317 if (toconv == 0)
2318 return (0);
2319 written = out->size - out->use -1; /* count '\0' */
2320 if (toconv * 2 >= written) {
2321 xmlBufferGrow(out, out->size + toconv * 2);
2322 written = out->size - out->use - 1;
2323 }
2324 ret = xmlEncInputChunk(handler, &out->content[out->use], &written,
2325 in->content, &toconv, 1);
2326 xmlBufferShrink(in, toconv);
2327 out->use += written;
2328 out->content[out->use] = 0;
2329 if (ret == -1)
2330 ret = -3;
2331
2332 switch (ret) {
2333 case 0:
2334 #ifdef DEBUG_ENCODING
2335 xmlGenericError(xmlGenericErrorContext,
2336 "converted %d bytes to %d bytes of input\n",
2337 toconv, written);
2338 #endif
2339 break;
2340 case -1:
2341 #ifdef DEBUG_ENCODING
2342 xmlGenericError(xmlGenericErrorContext,
2343 "converted %d bytes to %d bytes of input, %d left\n",
2344 toconv, written, in->use);
2345 #endif
2346 break;
2347 case -3:
2348 #ifdef DEBUG_ENCODING
2349 xmlGenericError(xmlGenericErrorContext,
2350 "converted %d bytes to %d bytes of input, %d left\n",
2351 toconv, written, in->use);
2352 #endif
2353 break;
2354 case -2: {
2355 char buf[50];
2356
2357 snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X",
2358 in->content[0], in->content[1],
2359 in->content[2], in->content[3]);
2360 buf[49] = 0;
2361 xmlEncodingErr(XML_I18N_CONV_FAILED,
2362 "input conversion failed due to input error, bytes %s\n",
2363 buf);
2364 }
2365 }
2366 /*
2367 * Ignore when input buffer is not on a boundary
2368 */
2369 if (ret == -3)
2370 ret = 0;
2371 return (written? written : ret);
2372 }
2373
2374 #ifdef LIBXML_OUTPUT_ENABLED
2375 /**
2376 * xmlCharEncOutput:
2377 * @output: a parser output buffer
2378 * @init: is this an initialization call without data
2379 *
2380 * Generic front-end for the encoding handler on parser output
2381 * a first call with @init == 1 has to be made first to initiate the
2382 * output in case of non-stateless encoding needing to initiate their
2383 * state or the output (like the BOM in UTF16).
2384 * In case of UTF8 sequence conversion errors for the given encoder,
2385 * the content will be automatically remapped to a CharRef sequence.
2386 *
2387 * Returns the number of byte written if success, or
2388 * -1 general error
2389 * -2 if the transcoding fails (for *in is not valid utf8 string or
2390 * the result of transformation can't fit into the encoding we want), or
2391 */
2392 int
2393 xmlCharEncOutput(xmlOutputBufferPtr output, int init)
2394 {
2395 int ret;
2396 size_t written;
2397 size_t writtentot = 0;
2398 size_t toconv;
2399 int c_in;
2400 int c_out;
2401 xmlBufPtr in;
2402 xmlBufPtr out;
2403
2404 if ((output == NULL) || (output->encoder == NULL) ||
2405 (output->buffer == NULL) || (output->conv == NULL))
2406 return (-1);
2407 out = output->conv;
2408 in = output->buffer;
2409
2410 retry:
2411
2412 written = xmlBufAvail(out);
2413 if (written > 0)
2414 written--; /* count '\0' */
2415
2416 /*
2417 * First specific handling of the initialization call
2418 */
2419 if (init) {
2420 c_in = 0;
2421 c_out = written;
2422 /* TODO: Check return value. */
2423 xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
2424 NULL, &c_in);
2425 xmlBufAddLen(out, c_out);
2426 #ifdef DEBUG_ENCODING
2427 xmlGenericError(xmlGenericErrorContext,
2428 "initialized encoder\n");
2429 #endif
2430 return(0);
2431 }
2432
2433 /*
2434 * Conversion itself.
2435 */
2436 toconv = xmlBufUse(in);
2437 if (toconv == 0)
2438 return (0);
2439 if (toconv > 64 * 1024)
2440 toconv = 64 * 1024;
2441 if (toconv * 4 >= written) {
2442 xmlBufGrow(out, toconv * 4);
2443 written = xmlBufAvail(out) - 1;
2444 }
2445 if (written > 256 * 1024)
2446 written = 256 * 1024;
2447
2448 c_in = toconv;
2449 c_out = written;
2450 ret = xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
2451 xmlBufContent(in), &c_in);
2452 xmlBufShrink(in, c_in);
2453 xmlBufAddLen(out, c_out);
2454 writtentot += c_out;
2455 if (ret == -1) {
2456 if (c_out > 0) {
2457 /* Can be a limitation of iconv or uconv */
2458 goto retry;
2459 }
2460 ret = -3;
2461 }
2462
2463 if (ret >= 0) output += ret;
2464
2465 /*
2466 * Attempt to handle error cases
2467 */
2468 switch (ret) {
2469 case 0:
2470 #ifdef DEBUG_ENCODING
2471 xmlGenericError(xmlGenericErrorContext,
2472 "converted %d bytes to %d bytes of output\n",
2473 c_in, c_out);
2474 #endif
2475 break;
2476 case -1:
2477 #ifdef DEBUG_ENCODING
2478 xmlGenericError(xmlGenericErrorContext,
2479 "output conversion failed by lack of space\n");
2480 #endif
2481 break;
2482 case -3:
2483 #ifdef DEBUG_ENCODING
2484 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
2485 c_in, c_out, (int) xmlBufUse(in));
2486 #endif
2487 break;
2488 case -4:
2489 xmlEncodingErr(XML_I18N_NO_OUTPUT,
2490 "xmlCharEncOutFunc: no output function !\n", NULL);
2491 ret = -1;
2492 break;
2493 case -2: {
2494 xmlChar charref[20];
2495 int len = (int) xmlBufUse(in);
2496 xmlChar *content = xmlBufContent(in);
2497 int cur, charrefLen;
2498
2499 cur = xmlGetUTF8Char(content, &len);
2500 if (cur <= 0)
2501 break;
2502
2503 #ifdef DEBUG_ENCODING
2504 xmlGenericError(xmlGenericErrorContext,
2505 "handling output conversion error\n");
2506 xmlGenericError(xmlGenericErrorContext,
2507 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2508 content[0], content[1],
2509 content[2], content[3]);
2510 #endif
2511 /*
2512 * Removes the UTF8 sequence, and replace it by a charref
2513 * and continue the transcoding phase, hoping the error
2514 * did not mangle the encoder state.
2515 */
2516 charrefLen = snprintf((char *) &charref[0], sizeof(charref),
2517 "&#%d;", cur);
2518 xmlBufShrink(in, len);
2519 xmlBufGrow(out, charrefLen * 4);
2520 c_out = xmlBufAvail(out) - 1;
2521 c_in = charrefLen;
2522 ret = xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
2523 charref, &c_in);
2524
2525 if ((ret < 0) || (c_in != charrefLen)) {
2526 char buf[50];
2527
2528 snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X",
2529 content[0], content[1],
2530 content[2], content[3]);
2531 buf[49] = 0;
2532 xmlEncodingErr(XML_I18N_CONV_FAILED,
2533 "output conversion failed due to conv error, bytes %s\n",
2534 buf);
2535 if (xmlBufGetAllocationScheme(in) != XML_BUFFER_ALLOC_IMMUTABLE)
2536 content[0] = ' ';
2537 break;
2538 }
2539
2540 xmlBufAddLen(out, c_out);
2541 writtentot += c_out;
2542 goto retry;
2543 }
2544 }
2545 return(ret);
2546 }
2547 #endif
2548
2549 /**
2550 * xmlCharEncOutFunc:
2551 * @handler: char enconding transformation data structure
2552 * @out: an xmlBuffer for the output.
2553 * @in: an xmlBuffer for the input
2554 *
2555 * Generic front-end for the encoding handler output function
2556 * a first call with @in == NULL has to be made firs to initiate the
2557 * output in case of non-stateless encoding needing to initiate their
2558 * state or the output (like the BOM in UTF16).
2559 * In case of UTF8 sequence conversion errors for the given encoder,
2560 * the content will be automatically remapped to a CharRef sequence.
2561 *
2562 * Returns the number of byte written if success, or
2563 * -1 general error
2564 * -2 if the transcoding fails (for *in is not valid utf8 string or
2565 * the result of transformation can't fit into the encoding we want), or
2566 */
2567 int
2568 xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
2569 xmlBufferPtr in) {
2570 int ret;
2571 int written;
2572 int writtentot = 0;
2573 int toconv;
2574 int output = 0;
2575
2576 if (handler == NULL) return(-1);
2577 if (out == NULL) return(-1);
2578
2579 retry:
2580
2581 written = out->size - out->use;
2582
2583 if (written > 0)
2584 written--; /* Gennady: count '/0' */
2585
2586 /*
2587 * First specific handling of in = NULL, i.e. the initialization call
2588 */
2589 if (in == NULL) {
2590 toconv = 0;
2591 /* TODO: Check return value. */
2592 xmlEncOutputChunk(handler, &out->content[out->use], &written,
2593 NULL, &toconv);
2594 out->use += written;
2595 out->content[out->use] = 0;
2596 #ifdef DEBUG_ENCODING
2597 xmlGenericError(xmlGenericErrorContext,
2598 "initialized encoder\n");
2599 #endif
2600 return(0);
2601 }
2602
2603 /*
2604 * Conversion itself.
2605 */
2606 toconv = in->use;
2607 if (toconv == 0)
2608 return(0);
2609 if (toconv * 4 >= written) {
2610 xmlBufferGrow(out, toconv * 4);
2611 written = out->size - out->use - 1;
2612 }
2613 ret = xmlEncOutputChunk(handler, &out->content[out->use], &written,
2614 in->content, &toconv);
2615 xmlBufferShrink(in, toconv);
2616 out->use += written;
2617 writtentot += written;
2618 out->content[out->use] = 0;
2619 if (ret == -1) {
2620 if (written > 0) {
2621 /* Can be a limitation of iconv or uconv */
2622 goto retry;
2623 }
2624 ret = -3;
2625 }
2626
2627 if (ret >= 0) output += ret;
2628
2629 /*
2630 * Attempt to handle error cases
2631 */
2632 switch (ret) {
2633 case 0:
2634 #ifdef DEBUG_ENCODING
2635 xmlGenericError(xmlGenericErrorContext,
2636 "converted %d bytes to %d bytes of output\n",
2637 toconv, written);
2638 #endif
2639 break;
2640 case -1:
2641 #ifdef DEBUG_ENCODING
2642 xmlGenericError(xmlGenericErrorContext,
2643 "output conversion failed by lack of space\n");
2644 #endif
2645 break;
2646 case -3:
2647 #ifdef DEBUG_ENCODING
2648 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
2649 toconv, written, in->use);
2650 #endif
2651 break;
2652 case -4:
2653 xmlEncodingErr(XML_I18N_NO_OUTPUT,
2654 "xmlCharEncOutFunc: no output function !\n", NULL);
2655 ret = -1;
2656 break;
2657 case -2: {
2658 xmlChar charref[20];
2659 int len = in->use;
2660 const xmlChar *utf = (const xmlChar *) in->content;
2661 int cur, charrefLen;
2662
2663 cur = xmlGetUTF8Char(utf, &len);
2664 if (cur <= 0)
2665 break;
2666
2667 #ifdef DEBUG_ENCODING
2668 xmlGenericError(xmlGenericErrorContext,
2669 "handling output conversion error\n");
2670 xmlGenericError(xmlGenericErrorContext,
2671 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2672 in->content[0], in->content[1],
2673 in->content[2], in->content[3]);
2674 #endif
2675 /*
2676 * Removes the UTF8 sequence, and replace it by a charref
2677 * and continue the transcoding phase, hoping the error
2678 * did not mangle the encoder state.
2679 */
2680 charrefLen = snprintf((char *) &charref[0], sizeof(charref),
2681 "&#%d;", cur);
2682 xmlBufferShrink(in, len);
2683 xmlBufferGrow(out, charrefLen * 4);
2684 written = out->size - out->use - 1;
2685 toconv = charrefLen;
2686 ret = xmlEncOutputChunk(handler, &out->content[out->use], &written,
2687 charref, &toconv);
2688
2689 if ((ret < 0) || (toconv != charrefLen)) {
2690 char buf[50];
2691
2692 snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X",
2693 in->content[0], in->content[1],
2694 in->content[2], in->content[3]);
2695 buf[49] = 0;
2696 xmlEncodingErr(XML_I18N_CONV_FAILED,
2697 "output conversion failed due to conv error, bytes %s\n",
2698 buf);
2699 if (in->alloc != XML_BUFFER_ALLOC_IMMUTABLE)
2700 in->content[0] = ' ';
2701 break;
2702 }
2703
2704 out->use += written;
2705 writtentot += written;
2706 out->content[out->use] = 0;
2707 goto retry;
2708 }
2709 }
2710 return(ret);
2711 }
2712
2713 /**
2714 * xmlCharEncCloseFunc:
2715 * @handler: char enconding transformation data structure
2716 *
2717 * Generic front-end for encoding handler close function
2718 *
2719 * Returns 0 if success, or -1 in case of error
2720 */
2721 int
2722 xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
2723 int ret = 0;
2724 int tofree = 0;
2725 int i, handler_in_list = 0;
2726
2727 if (handler == NULL) return(-1);
2728 if (handler->name == NULL) return(-1);
2729 if (handlers != NULL) {
2730 for (i = 0;i < nbCharEncodingHandler; i++) {
2731 if (handler == handlers[i]) {
2732 handler_in_list = 1;
2733 break;
2734 }
2735 }
2736 }
2737 #ifdef LIBXML_ICONV_ENABLED
2738 /*
2739 * Iconv handlers can be used only once, free the whole block.
2740 * and the associated icon resources.
2741 */
2742 if ((handler_in_list == 0) &&
2743 ((handler->iconv_out != NULL) || (handler->iconv_in != NULL))) {
2744 tofree = 1;
2745 if (handler->iconv_out != NULL) {
2746 if (iconv_close(handler->iconv_out))
2747 ret = -1;
2748 handler->iconv_out = NULL;
2749 }
2750 if (handler->iconv_in != NULL) {
2751 if (iconv_close(handler->iconv_in))
2752 ret = -1;
2753 handler->iconv_in = NULL;
2754 }
2755 }
2756 #endif /* LIBXML_ICONV_ENABLED */
2757 #ifdef LIBXML_ICU_ENABLED
2758 if ((handler_in_list == 0) &&
2759 ((handler->uconv_out != NULL) || (handler->uconv_in != NULL))) {
2760 tofree = 1;
2761 if (handler->uconv_out != NULL) {
2762 closeIcuConverter(handler->uconv_out);
2763 handler->uconv_out = NULL;
2764 }
2765 if (handler->uconv_in != NULL) {
2766 closeIcuConverter(handler->uconv_in);
2767 handler->uconv_in = NULL;
2768 }
2769 }
2770 #endif
2771 if (tofree) {
2772 /* free up only dynamic handlers iconv/uconv */
2773 if (handler->name != NULL)
2774 xmlFree(handler->name);
2775 handler->name = NULL;
2776 xmlFree(handler);
2777 }
2778 #ifdef DEBUG_ENCODING
2779 if (ret)
2780 xmlGenericError(xmlGenericErrorContext,
2781 "failed to close the encoding handler\n");
2782 else
2783 xmlGenericError(xmlGenericErrorContext,
2784 "closed the encoding handler\n");
2785 #endif
2786
2787 return(ret);
2788 }
2789
2790 /**
2791 * xmlByteConsumed:
2792 * @ctxt: an XML parser context
2793 *
2794 * This function provides the current index of the parser relative
2795 * to the start of the current entity. This function is computed in
2796 * bytes from the beginning starting at zero and finishing at the
2797 * size in byte of the file if parsing a file. The function is
2798 * of constant cost if the input is UTF-8 but can be costly if run
2799 * on non-UTF-8 input.
2800 *
2801 * Returns the index in bytes from the beginning of the entity or -1
2802 * in case the index could not be computed.
2803 */
2804 long
2805 xmlByteConsumed(xmlParserCtxtPtr ctxt) {
2806 xmlParserInputPtr in;
2807
2808 if (ctxt == NULL) return(-1);
2809 in = ctxt->input;
2810 if (in == NULL) return(-1);
2811 if ((in->buf != NULL) && (in->buf->encoder != NULL)) {
2812 unsigned int unused = 0;
2813 xmlCharEncodingHandler * handler = in->buf->encoder;
2814 /*
2815 * Encoding conversion, compute the number of unused original
2816 * bytes from the input not consumed and substract that from
2817 * the raw consumed value, this is not a cheap operation
2818 */
2819 if (in->end - in->cur > 0) {
2820 unsigned char convbuf[32000];
2821 const unsigned char *cur = (const unsigned char *)in->cur;
2822 int toconv = in->end - in->cur, written = 32000;
2823
2824 int ret;
2825
2826 do {
2827 toconv = in->end - cur;
2828 written = 32000;
2829 ret = xmlEncOutputChunk(handler, &convbuf[0], &written,
2830 cur, &toconv);
2831 if (ret < 0) {
2832 if (written > 0)
2833 ret = -2;
2834 else
2835 return(-1);
2836 }
2837 unused += written;
2838 cur += toconv;
2839 } while (ret == -2);
2840 }
2841 if (in->buf->rawconsumed < unused)
2842 return(-1);
2843 return(in->buf->rawconsumed - unused);
2844 }
2845 return(in->consumed + (in->cur - in->base));
2846 }
2847
2848 #if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED)
2849 #ifdef LIBXML_ISO8859X_ENABLED
2850
/**
 * UTF8ToISO8859x:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 * @xlattable: the 2-level transcoding table
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO 8859-*
 * block of chars out. At most *@outlen bytes are written to @out.
 *
 * Layout of @xlattable as used below: bytes 0..31 are indexed by the low
 * five bits of a 2-byte lead byte, bytes 32..47 by the low four bits of a
 * 3-byte lead byte; each entry selects a 64-byte block starting at offset
 * 48 that is indexed by the low six bits of the next byte. A final entry
 * of 0 means "not in the character set".
 *
 * Returns the number of bytes written to @out if success,
 *     -1 if an argument is NULL or if @out is full while input remains,
 *     -2 if the transcoding fails (malformed UTF-8, character not in the
 *        target set, or character >= U+010000),
 *     -3 if @in ends in the middle of a multi-byte sequence.
 * Except when an argument is NULL, the value of @inlen after return is the
 * number of octets consumed (whole characters only) and the value of
 * @outlen is the number of octets produced.
 */
static int
UTF8ToISO8859x(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen,
              unsigned char const *xlattable) {
    const unsigned char* outstart = out;
    const unsigned char* outend;
    const unsigned char* inend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    int ret = 0;

    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) ||
        (xlattable == NULL))
        return(-1);
    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    /* A negative capacity is treated as "no room at all". */
    outend = out + ((*outlen > 0) ? *outlen : 0);
    inend = in + (*inlen);
    while (in < inend) {
        unsigned char d;

        /* Every character yields exactly one byte: check room first. */
        if (out >= outend) {
            ret = -1;
            goto done;
        }
        d = *in++;
        if (d < 0x80)  {
            *out++ = d;
        } else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            goto done;
        } else if (d < 0xE0) {
            unsigned char c;

            if (!(in < inend)) {
                /* trailing byte not in input buffer */
                ret = -3;
                goto done;
            }
            c = *in++;
            if ((c & 0xC0) != 0x80) {
                /* not a trailing byte */
                ret = -2;
                goto done;
            }
            c = c & 0x3F;
            d = d & 0x1F;
            d = xlattable [48 + c + xlattable [d] * 64];
            if (d == 0) {
                /* not in character set */
                ret = -2;
                goto done;
            }
            *out++ = d;
        } else if (d < 0xF0) {
            unsigned char c1;
            unsigned char c2;

            if (!(in < inend - 1)) {
                /* trailing bytes not in input buffer */
                ret = -3;
                goto done;
            }
            c1 = *in++;
            if ((c1 & 0xC0) != 0x80) {
                /* not a trailing byte (c1) */
                ret = -2;
                goto done;
            }
            c2 = *in++;
            if ((c2 & 0xC0) != 0x80) {
                /* not a trailing byte (c2) */
                ret = -2;
                goto done;
            }
            c1 = c1 & 0x3F;
            c2 = c2 & 0x3F;
            d = d & 0x0F;
            d = xlattable [48 + c2 + xlattable [48 + c1 +
                        xlattable [32 + d] * 64] * 64];
            if (d == 0) {
                /* not in character set */
                ret = -2;
                goto done;
            }
            *out++ = d;
        } else {
            /* cannot transcode >= U+010000 */
            ret = -2;
            goto done;
        }
        /* The character is fully converted: commit the input position. */
        processed = in;
    }

done:
    *outlen = out - outstart;
    *inlen = processed - instart;
    if (ret < 0)
        return(ret);
    return(*outlen);
}
2969
/**
 * ISO8859xToUTF8
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO 8859-* chars
 * @inlen:  the length of @in
 * @unicodetable: the code points of the bytes 0x80..0xFF, indexed by
 *                (byte - 0x80); an entry of 0 marks an undefined byte
 *
 * Take a block of ISO 8859-* chars in and try to convert it to an UTF-8
 * block of chars out. Conversion stops when @in is exhausted or when @out
 * has no room left for the next character.
 *
 * Returns the number of bytes written to @out if success, or -1 if an
 *     argument is NULL or an undefined code point is met
 * The value of @inlen after return is the number of octets consumed
 * The value of @outlen after return is the number of octets produced.
 * (Neither is updated when an argument is NULL.)
 */
static int
ISO8859xToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen,
              unsigned short const *unicodetable) {
    unsigned char* outstart = out;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* inend;
    const unsigned char* instop;
    unsigned int c;

    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) ||
        (in == NULL) || (unicodetable == NULL))
        return(-1);
    outend = out + *outlen;
    inend = in + *inlen;
    instop = inend;

    /*
     * Main loop: needs room for a 3-byte sequence. The test is written on
     * the remaining room so that no pointer before @out is ever formed.
     */
    while ((in < inend) && ((outend - out) > 2)) {
        if (*in >= 0x80) {
            c = unicodetable [*in - 0x80];
            if (c == 0) {
                /* undefined code point */
                *outlen = out - outstart;
                *inlen = in - instart;
                return (-1);
            }
            if (c < 0x800) {
                *out++ = ((c >>  6) & 0x1F) | 0xC0;
                *out++ = (c & 0x3F) | 0x80;
            } else {
                *out++ = ((c >>  12) & 0x0F) | 0xE0;
                *out++ = ((c >>  6) & 0x3F) | 0x80;
                *out++ = (c & 0x3F) | 0x80;
            }
            ++in;
        }
        /* Fast path for a run of ASCII, limited by the room left. */
        if (instop - in > outend - out) instop = in + (outend - out);
        /* Bound check first: *in must not be read at the end of input. */
        while ((in < instop) && (*in < 0x80)) {
            *out++ = *in++;
        }
    }
    /* Up to two more ASCII bytes may fit in the room the loop left. */
    if ((in < inend) && (out < outend) && (*in < 0x80)) {
        *out++ = *in++;
    }
    if ((in < inend) && (out < outend) && (*in < 0x80)) {
        *out++ = *in++;
    }
    *outlen = out - outstart;
    *inlen = in - instart;
    return (*outlen);
}
3035
3036
3037 /************************************************************************
3038 * Lookup tables for ISO-8859-2..ISO-8859-16 transcoding *
3039 ************************************************************************/
3040
/*
 * ISO-8859-2 to Unicode: code point of each byte 0x80..0xFF, indexed by
 * (byte - 0x80). Each row below covers eight consecutive byte values.
 */
static const unsigned short xmlunicodetable_ISO8859_2[128] = {
    /* 0x80 */ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
    /* 0x88 */ 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
    /* 0x90 */ 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
    /* 0x98 */ 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
    /* 0xa0 */ 0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7,
    /* 0xa8 */ 0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b,
    /* 0xb0 */ 0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7,
    /* 0xb8 */ 0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c,
    /* 0xc0 */ 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7,
    /* 0xc8 */ 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
    /* 0xd0 */ 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7,
    /* 0xd8 */ 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
    /* 0xe0 */ 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7,
    /* 0xe8 */ 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
    /* 0xf0 */ 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7,
    /* 0xf8 */ 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9
};
3059
3060 static unsigned char const xmltranscodetable_ISO8859_2 [48 + 6 * 64] = {
3061 "\x00\x00\x01\x05\x02\x04\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00"
3062 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
3063 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
3064 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
3065 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
3066 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
3067 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
3068 "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
3069 "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
3070 "\xa0\x00\x00\x00\xa4\x00\x00\xa7\xa8\x00\x00\x00\x00\xad\x00\x00"
3071 "\xb0\x00\x00\x00\xb4\x00\x00\x00\xb8\x00\x00\x00\x00\x00\x00\x00"
3072 "\x00\x00\xc3\xe3\xa1\xb1\xc6\xe6\x00\x00\x00\x00\xc8\xe8\xcf\xef"
3073 "\xd0\xf0\x00\x00\x00\x00\x00\x00\xca\xea\xcc\xec\x00\x00\x00\x00"
3074 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
3075 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\xc5\xe5\x00\x00\xa5\xb5\x00"
3076 "\x00\x00\x00\x00\x00\x00\x00\xb7\x00\x00\x00\x00\x00\x00\x00\x00"
3077 "\x00\x00\x00\x00\x00\x00\x00\x00\xa2\xff\x00\xb2\x00\xbd\x00\x00"
3078 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"