2 **********************************************************************
3 * Copyright (C) 2000-2007, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv2022.c
8 * tab size: 8 (not used)
11 * created on: 2000feb03
12 * created by: Markus W. Scherer
16 * 06/29/2000 helena Major rewrite of the callback APIs.
17 * 08/08/2000 Ram Included support for ISO-2022-JP-2
18 * Changed implementation of toUnicode
20 * 08/21/2000 Ram Added support for ISO-2022-KR
21 * 08/29/2000 Ram Separated implementation of EBCDIC to
23 * 09/20/2000 Ram Added support for ISO-2022-CN
24 * Added implementations for getNextUChar()
25 * for specific 2022 country variants.
26 * 10/31/2000 Ram Implemented offsets logic functions
29 #include "unicode/utypes.h"
31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
33 #include "unicode/ucnv.h"
34 #include "unicode/uset.h"
35 #include "unicode/ucnv_err.h"
36 #include "unicode/ucnv_cb.h"
44 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
46 #ifdef U_ENABLE_GENERIC_ISO_2022
48 * I am disabling the generic ISO-2022 converter after proposing to do so on
49 * the icu mailing list two days ago.
52 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
53 * its designation sequences, single shifts with return to the previous state,
54 * switch-with-no-return to UTF-16BE or similar, etc.
55 * This is unlike the language-specific variants like ISO-2022-JP which
56 * require a much smaller repertoire of ISO-2022 features.
57 * These variants continue to be supported.
58 * 2. I believe that no one is really using the generic ISO-2022 converter
59 * but rather always one of the language-specific variants.
60 * Note that ICU's generic ISO-2022 converter has always output one escape
61 * sequence followed by UTF-8 for the whole stream.
62 * 3. Switching between subcharsets is extremely slow, because each time
63 * the previous converter is closed and a new one opened,
64 * without any kind of caching, least-recently-used list, etc.
65 * 4. The code is currently buggy, and given the above it does not seem
66 * reasonable to spend the time on maintenance.
67 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
68 * This means, for example, that when ISO-8859-7 is designated, the following
69 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
70 * The ICU ISO-2022 converter does not handle this - and has no information
71 * about which subconverter would have to be shifted vs. which is designed
74 * Markus Scherer 2003-dec-03
78 static const char SHIFT_IN_STR
[] = "\x0F";
79 static const char SHIFT_OUT_STR
[] = "\x0E";
93 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
94 * as bytes 21..7E. (Subtract 0x80.)
95 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
96 * as bytes 20..7F. (Subtract 0x80.)
97 * Do not encode C1 control codes with native bytes 80..9F
98 * as bytes 00..1F (C0 control codes).
108 * ISO 2022 control codes must not be converted from Unicode
109 * because they would mess up the byte stream.
110 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
111 * corresponding to SO, SI, and ESC.
113 #define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
115 /* for ISO-2022-JP and -CN implementations */
132 HWKANA_7BIT
=8, /* Halfwidth Katakana 7 bit */
135 /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
141 * these are used in StateEnum and ISO2022State variables,
142 * but CNS_11643 must be used to index into myConverterArray[]
154 /* is the StateEnum charset value for a DBCS charset? */
155 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
157 #define CSM(cs) ((uint16_t)1<<(cs))
160 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
161 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
163 * Note: The converter uses some leniency:
164 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
165 * all versions, not just JIS7 and JIS8.
166 * - ICU does not distinguish between different versions of JIS X 0208.
168 static const uint16_t jpCharsetMasks
[5]={
169 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
),
170 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
)|CSM(JISX212
),
171 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
)|CSM(JISX212
)|CSM(GB2312
)|CSM(KSC5601
)|CSM(ISO8859_1
)|CSM(ISO8859_7
),
172 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
)|CSM(JISX212
)|CSM(GB2312
)|CSM(KSC5601
)|CSM(ISO8859_1
)|CSM(ISO8859_7
),
173 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
)|CSM(JISX212
)|CSM(GB2312
)|CSM(KSC5601
)|CSM(ISO8859_1
)|CSM(ISO8859_7
)
185 typedef struct ISO2022State
{
186 int8_t cs
[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
187 int8_t g
; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
188 int8_t prevG
; /* g before single shift (SS2 or SS3) */
191 #define UCNV_OPTIONS_VERSION_MASK 0xf
192 #define UCNV_2022_MAX_CONVERTERS 10
195 UConverterSharedData
*myConverterArray
[UCNV_2022_MAX_CONVERTERS
];
196 UConverter
*currentConverter
;
197 Cnv2022Type currentType
;
198 ISO2022State toU2022State
, fromU2022State
;
201 #ifdef U_ENABLE_GENERIC_ISO_2022
206 }UConverterDataISO2022
;
209 /* ISO-2022 ----------------------------------------------------------------- */
211 /*Forward declaration */
213 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs
* args
,
216 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs
* args
,
219 #define ESC_2022 0x1B /*ESC*/
223 INVALID_2022
= -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
224 VALID_NON_TERMINAL_2022
= 0, /*so far corresponds to a valid iso 2022 escape sequence*/
225 VALID_TERMINAL_2022
= 1, /*corresponds to a valid iso 2022 escape sequence*/
226 VALID_MAYBE_TERMINAL_2022
= 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
227 } UCNV_TableStates_2022
;
230 * The way these state transition arrays work is:
231 * ex : ESC$B is the sequence for JISX208
232 * a) First Iteration: char is ESC
233 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
234 * int x = normalize_esq_chars_2022[27] which is equal to 1
235 * ii) Search for this value in escSeqStateTable_Key_2022[]
236 * value of x is stored at escSeqStateTable_Key_2022[0]
237 * iii) Save this index as offset
238 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
239 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
240 * b) Switch on this state and continue to next char
241 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
242 * which is normalize_esq_chars_2022[36] == 4
243 * ii) x is currently 1(from above)
244 * x<<=5 -- x is now 32
245 * x+=normalize_esq_chars_2022[36]
247 * iii) Search for this value in escSeqStateTable_Key_2022[]
248 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
249 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
250 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
251 * c) Switch on this state and continue to next char
252 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
253 * ii) x is currently 36 (from above)
254 * x<<=5 -- x is now 1152
255 * x+=normalize_esq_chars_2022[66]
257 * iii) Search for this value in escSeqStateTable_Key_2022[]
258 * value of x is stored at escSeqStateTable_Key_2022[21], so offset is 21
259 * iv) Get state of this sequence from escSeqStateTable_Value_2022[21]
260 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
261 * v) Get the converter name from escSeqStateTable_Result_2022[21] which is JISX208
265 /*Below are the 3 arrays depicting a state transition table*/
266 static const int8_t normalize_esq_chars_2022
[256] = {
267 /* 0 1 2 3 4 5 6 7 8 9 */
269 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
270 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
271 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0
272 ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0
273 ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0
274 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
275 ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12
276 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28
277 ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0
278 ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
279 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
280 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
281 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
282 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
283 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
284 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
285 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
286 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
287 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
288 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
289 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
290 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
291 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
292 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
293 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
297 #ifdef U_ENABLE_GENERIC_ISO_2022
299 * When the generic ISO-2022 converter is completely removed, not just disabled
300 * per #ifdef, then the following state table and the associated tables that are
301 * dimensioned with MAX_STATES_2022 should be trimmed.
303 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
304 * the associated escape sequences starting with ESC ( B should be removed.
305 * This includes the ones with key values 1097 and all of the ones above 1000000.
307 * For the latter, the tables can simply be truncated.
308 * For the former, since the tables must be kept parallel, it is probably best
309 * to simply duplicate an adjacent table cell, parallel in all tables.
311 * It may make sense to restructure the tables, especially by using small search
312 * tables for the variants instead of indexing them parallel to the table here.
316 #define MAX_STATES_2022 74
317 static const int32_t escSeqStateTable_Key_2022
[MAX_STATES_2022
] = {
318 /* 0 1 2 3 4 5 6 7 8 9 */
320 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096
321 ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106
322 ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257
323 ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940
324 ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644
325 ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138
326 ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630
327 ,35947631 ,35947635 ,35947636 ,35947638
330 #ifdef U_ENABLE_GENERIC_ISO_2022
332 static const char* const escSeqStateTable_Result_2022
[MAX_STATES_2022
] = {
333 /* 0 1 2 3 4 5 6 7 8 9 */
335 NULL
,NULL
,NULL
,NULL
,NULL
,NULL
,NULL
,NULL
,"latin1" ,"latin1"
336 ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1"
337 ,"latin1" ,NULL
,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL
,NULL
,NULL
,NULL
,"UTF8"
338 ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL
,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383"
339 ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165"
340 ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
341 ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL
,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089"
342 ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1"
347 static const UCNV_TableStates_2022 escSeqStateTable_Value_2022
[MAX_STATES_2022
] = {
348 /* 0 1 2 3 4 5 6 7 8 9 */
349 VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
350 ,VALID_MAYBE_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
351 ,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
352 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
353 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
354 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
355 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
356 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
360 /* Type def for refactoring changeState_2022 code*/
362 #ifdef U_ENABLE_GENERIC_ISO_2022
370 /*********** ISO 2022 Converter Protos ***********/
372 _ISO2022Open(UConverter
*cnv
, const char *name
, const char *locale
,uint32_t options
, UErrorCode
*errorCode
);
375 _ISO2022Close(UConverter
*converter
);
378 _ISO2022Reset(UConverter
*converter
, UConverterResetChoice choice
);
381 _ISO2022getName(const UConverter
* cnv
);
384 _ISO_2022_WriteSub(UConverterFromUnicodeArgs
*args
, int32_t offsetIndex
, UErrorCode
*err
);
387 _ISO_2022_SafeClone(const UConverter
*cnv
, void *stackBuffer
, int32_t *pBufferSize
, UErrorCode
*status
);
389 #ifdef U_ENABLE_GENERIC_ISO_2022
391 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs
* args
, UErrorCode
* err
);
394 /*const UConverterSharedData _ISO2022Data;*/
395 static const UConverterSharedData _ISO2022JPData
;
396 static const UConverterSharedData _ISO2022KRData
;
397 static const UConverterSharedData _ISO2022CNData
;
399 /*************** Converter implementations ******************/
401 /* The purpose of this function is to get around gcc compiler warnings. */
403 fromUWriteUInt8(UConverter
*cnv
,
404 const char *bytes
, int32_t length
,
405 uint8_t **target
, const char *targetLimit
,
408 UErrorCode
*pErrorCode
)
410 char *targetChars
= (char *)*target
;
411 ucnv_fromUWriteBytes(cnv
, bytes
, length
, &targetChars
, targetLimit
,
412 offsets
, sourceIndex
, pErrorCode
);
413 *target
= (uint8_t*)targetChars
;
418 setInitialStateToUnicodeKR(UConverter
* converter
, UConverterDataISO2022
*myConverterData
){
419 if(myConverterData
->version
== 1) {
420 UConverter
*cnv
= myConverterData
->currentConverter
;
422 cnv
->toUnicodeStatus
=0; /* offset */
423 cnv
->mode
=0; /* state */
424 cnv
->toULength
=0; /* byteIndex */
429 setInitialStateFromUnicodeKR(UConverter
* converter
,UConverterDataISO2022
*myConverterData
){
430 /* in ISO-2022-KR the designator sequence appears only once
431 * in a file so we append it only once
433 if( converter
->charErrorBufferLength
==0){
435 converter
->charErrorBufferLength
= 4;
436 converter
->charErrorBuffer
[0] = 0x1b;
437 converter
->charErrorBuffer
[1] = 0x24;
438 converter
->charErrorBuffer
[2] = 0x29;
439 converter
->charErrorBuffer
[3] = 0x43;
441 if(myConverterData
->version
== 1) {
442 UConverter
*cnv
= myConverterData
->currentConverter
;
445 cnv
->fromUnicodeStatus
=1; /* prevLength */
450 _ISO2022Open(UConverter
*cnv
, const char *name
, const char *locale
,uint32_t options
, UErrorCode
*errorCode
){
452 char myLocale
[6]={' ',' ',' ',' ',' ',' '};
454 cnv
->extraInfo
= uprv_malloc (sizeof (UConverterDataISO2022
));
455 if(cnv
->extraInfo
!= NULL
) {
456 UConverterDataISO2022
*myConverterData
=(UConverterDataISO2022
*) cnv
->extraInfo
;
459 uprv_memset(myConverterData
, 0, sizeof(UConverterDataISO2022
));
460 myConverterData
->currentType
= ASCII1
;
461 cnv
->fromUnicodeStatus
=FALSE
;
463 uprv_strncpy(myLocale
, locale
, sizeof(myLocale
));
465 version
= options
& UCNV_OPTIONS_VERSION_MASK
;
466 myConverterData
->version
= version
;
467 if(myLocale
[0]=='j' && (myLocale
[1]=='a'|| myLocale
[1]=='p') &&
468 (myLocale
[2]=='_' || myLocale
[2]=='\0'))
471 /* open the required converters and cache them */
472 if(jpCharsetMasks
[version
]&CSM(ISO8859_7
)) {
473 myConverterData
->myConverterArray
[ISO8859_7
]= ucnv_loadSharedData("ISO8859_7", NULL
, errorCode
);
475 myConverterData
->myConverterArray
[JISX201
] = ucnv_loadSharedData("JISX0201", NULL
, errorCode
);
476 myConverterData
->myConverterArray
[JISX208
] = ucnv_loadSharedData("jisx-208", NULL
, errorCode
);
477 if(jpCharsetMasks
[version
]&CSM(JISX212
)) {
478 myConverterData
->myConverterArray
[JISX212
] = ucnv_loadSharedData("jisx-212", NULL
, errorCode
);
480 if(jpCharsetMasks
[version
]&CSM(GB2312
)) {
481 myConverterData
->myConverterArray
[GB2312
] = ucnv_loadSharedData("ibm-5478", NULL
, errorCode
); /* gb_2312_80-1 */
483 if(jpCharsetMasks
[version
]&CSM(KSC5601
)) {
484 myConverterData
->myConverterArray
[KSC5601
] = ucnv_loadSharedData("ksc_5601", NULL
, errorCode
);
487 /* set the function pointers to appropriate functions */
488 cnv
->sharedData
=(UConverterSharedData
*)(&_ISO2022JPData
);
489 uprv_strcpy(myConverterData
->locale
,"ja");
491 (void)uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=ja,version=");
492 len
= uprv_strlen(myConverterData
->name
);
493 myConverterData
->name
[len
]=(char)(myConverterData
->version
+(int)'0');
494 myConverterData
->name
[len
+1]='\0';
496 else if(myLocale
[0]=='k' && (myLocale
[1]=='o'|| myLocale
[1]=='r') &&
497 (myLocale
[2]=='_' || myLocale
[2]=='\0'))
500 myConverterData
->currentConverter
=
501 ucnv_open("icu-internal-25546",errorCode
);
503 if (U_FAILURE(*errorCode
)) {
508 (void)uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=ko,version=1");
509 uprv_memcpy(cnv
->subChars
, myConverterData
->currentConverter
->subChars
, 4);
510 cnv
->subCharLen
= myConverterData
->currentConverter
->subCharLen
;
512 myConverterData
->currentConverter
=ucnv_open("ibm-949",errorCode
);
514 if (U_FAILURE(*errorCode
)) {
519 myConverterData
->version
= 0;
520 (void)uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=ko,version=0");
523 /* initialize the state variables */
524 setInitialStateToUnicodeKR(cnv
, myConverterData
);
525 setInitialStateFromUnicodeKR(cnv
, myConverterData
);
527 /* set the function pointers to appropriate functions */
528 cnv
->sharedData
=(UConverterSharedData
*)&_ISO2022KRData
;
529 uprv_strcpy(myConverterData
->locale
,"ko");
531 else if(((myLocale
[0]=='z' && myLocale
[1]=='h') || (myLocale
[0]=='c'&& myLocale
[1]=='n'))&&
532 (myLocale
[2]=='_' || myLocale
[2]=='\0'))
535 /* open the required converters and cache them */
536 myConverterData
->myConverterArray
[GB2312_1
] = ucnv_loadSharedData("ibm-5478", NULL
, errorCode
);
538 myConverterData
->myConverterArray
[ISO_IR_165
] = ucnv_loadSharedData("iso-ir-165", NULL
, errorCode
);
540 myConverterData
->myConverterArray
[CNS_11643
] = ucnv_loadSharedData("cns-11643-1992", NULL
, errorCode
);
543 /* set the function pointers to appropriate functions */
544 cnv
->sharedData
=(UConverterSharedData
*)&_ISO2022CNData
;
545 uprv_strcpy(myConverterData
->locale
,"cn");
548 (void)uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=zh,version=1");
550 myConverterData
->version
= 0;
551 (void)uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=zh,version=0");
555 #ifdef U_ENABLE_GENERIC_ISO_2022
556 myConverterData
->isFirstBuffer
= TRUE
;
558 /* append the UTF-8 escape sequence */
559 cnv
->charErrorBufferLength
= 3;
560 cnv
->charErrorBuffer
[0] = 0x1b;
561 cnv
->charErrorBuffer
[1] = 0x25;
562 cnv
->charErrorBuffer
[2] = 0x42;
564 cnv
->sharedData
=(UConverterSharedData
*)&_ISO2022Data
;
565 /* initialize the state variables */
566 uprv_strcpy(myConverterData
->name
,"ISO_2022");
568 *errorCode
= U_UNSUPPORTED_ERROR
;
573 cnv
->maxBytesPerUChar
=cnv
->sharedData
->staticData
->maxBytesPerChar
;
575 if(U_FAILURE(*errorCode
)) {
579 *errorCode
= U_MEMORY_ALLOCATION_ERROR
;
585 _ISO2022Close(UConverter
*converter
) {
586 UConverterDataISO2022
* myData
=(UConverterDataISO2022
*) (converter
->extraInfo
);
587 UConverterSharedData
**array
= myData
->myConverterArray
;
590 if (converter
->extraInfo
!= NULL
) {
591 /*close the array of converter pointers and free the memory*/
592 for (i
=0; i
<UCNV_2022_MAX_CONVERTERS
; i
++) {
594 ucnv_unloadSharedDataIfReady(array
[i
]);
598 ucnv_close(myData
->currentConverter
);
600 if(!converter
->isExtraLocal
){
601 uprv_free (converter
->extraInfo
);
602 converter
->extraInfo
= NULL
;
608 _ISO2022Reset(UConverter
*converter
, UConverterResetChoice choice
) {
609 UConverterDataISO2022
*myConverterData
=(UConverterDataISO2022
*) (converter
->extraInfo
);
610 if(choice
<=UCNV_RESET_TO_UNICODE
) {
611 uprv_memset(&myConverterData
->toU2022State
, 0, sizeof(ISO2022State
));
612 myConverterData
->key
= 0;
614 if(choice
!=UCNV_RESET_TO_UNICODE
) {
615 uprv_memset(&myConverterData
->fromU2022State
, 0, sizeof(ISO2022State
));
617 #ifdef U_ENABLE_GENERIC_ISO_2022
618 if(myConverterData
->locale
[0] == 0){
619 if(choice
<=UCNV_RESET_TO_UNICODE
) {
620 myConverterData
->isFirstBuffer
= TRUE
;
621 myConverterData
->key
= 0;
622 if (converter
->mode
== UCNV_SO
){
623 ucnv_close (myConverterData
->currentConverter
);
624 myConverterData
->currentConverter
=NULL
;
626 converter
->mode
= UCNV_SI
;
628 if(choice
!=UCNV_RESET_TO_UNICODE
) {
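629 /* re-append UTF-8 escape sequence; NOTE(review): the bytes written below are ESC ( B (0x1b 0x28 0x42), whereas _ISO2022Open appends ESC % B (0x1b 0x25 0x42) - confirm which is intended */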
630 converter
->charErrorBufferLength
= 3;
631 converter
->charErrorBuffer
[0] = 0x1b;
632 converter
->charErrorBuffer
[1] = 0x28;
633 converter
->charErrorBuffer
[2] = 0x42;
639 /* reset the state variables */
640 if(myConverterData
->locale
[0] == 'k'){
641 if(choice
<=UCNV_RESET_TO_UNICODE
) {
642 setInitialStateToUnicodeKR(converter
, myConverterData
);
644 if(choice
!=UCNV_RESET_TO_UNICODE
) {
645 setInitialStateFromUnicodeKR(converter
, myConverterData
);
652 _ISO2022getName(const UConverter
* cnv
){
654 UConverterDataISO2022
* myData
= (UConverterDataISO2022
*)cnv
->extraInfo
;
661 /*************** to unicode *******************/
662 /****************************************************************************
663 * Recognized escape sequences are
675 static const StateEnum nextStateToUnicodeJP
[MAX_STATES_2022
]= {
676 /* 0 1 2 3 4 5 6 7 8 9 */
677 INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,SS2_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
678 ,ASCII
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,JISX201
,HWKANA_7BIT
,JISX201
,INVALID_STATE
679 ,INVALID_STATE
,INVALID_STATE
,JISX208
,GB2312
,JISX208
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
680 ,ISO8859_1
,ISO8859_7
,JISX208
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,KSC5601
,JISX212
,INVALID_STATE
681 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
682 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
683 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
684 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
687 /*************** to unicode *******************/
688 static const StateEnum nextStateToUnicodeCN
[MAX_STATES_2022
]= {
689 /* 0 1 2 3 4 5 6 7 8 9 */
690 INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,SS2_STATE
,SS3_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
691 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
692 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
693 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
694 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,GB2312_1
,INVALID_STATE
,ISO_IR_165
695 ,CNS_11643_1
,CNS_11643_2
,CNS_11643_3
,CNS_11643_4
,CNS_11643_5
,CNS_11643_6
,CNS_11643_7
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
696 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
697 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
701 static UCNV_TableStates_2022
702 getKey_2022(char c
,int32_t* key
,int32_t* offset
){
705 int32_t hi
= MAX_STATES_2022
;
708 togo
= normalize_esq_chars_2022
[(uint8_t)c
];
710 /* not a valid character anywhere in an escape sequence */
715 togo
= (*key
<< 5) + togo
;
717 while (hi
!= low
) /*binary search*/{
719 register int32_t mid
= (hi
+low
) >> 1; /*Finds median*/
724 if (escSeqStateTable_Key_2022
[mid
] > togo
){
727 else if (escSeqStateTable_Key_2022
[mid
] < togo
){
730 else /*we found it*/{
733 return escSeqStateTable_Value_2022
[mid
];
744 /*runs through a state machine to determine the escape sequence - codepage correspondence
747 changeState_2022(UConverter
* _this
,
749 const char* sourceLimit
,
752 UCNV_TableStates_2022 value
;
753 UConverterDataISO2022
* myData2022
= ((UConverterDataISO2022
*)_this
->extraInfo
);
754 uint32_t key
= myData2022
->key
;
758 value
= VALID_NON_TERMINAL_2022
;
759 while (*source
< sourceLimit
) {
761 _this
->toUBytes
[_this
->toULength
++]=(uint8_t)c
;
762 value
= getKey_2022(c
,(int32_t *) &key
, &offset
);
766 case VALID_NON_TERMINAL_2022
:
767 /* continue with the loop */
770 case VALID_TERMINAL_2022
:
777 case VALID_MAYBE_TERMINAL_2022
:
778 #ifdef U_ENABLE_GENERIC_ISO_2022
779 /* ESC ( B is ambiguous only for ISO_2022 itself */
780 if(var
== ISO_2022
) {
781 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
782 _this
->toULength
= 0;
784 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
786 /* continue with the loop */
787 value
= VALID_NON_TERMINAL_2022
;
792 /* not ISO_2022 itself, finish here */
793 value
= VALID_TERMINAL_2022
;
801 myData2022
->key
= key
;
803 if (value
== VALID_NON_TERMINAL_2022
) {
804 /* indicate that the escape sequence is incomplete: key!=0 */
806 } else if (value
== INVALID_2022
) {
807 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
809 } else /* value == VALID_TERMINAL_2022 */ {
811 #ifdef U_ENABLE_GENERIC_ISO_2022
814 const char *chosenConverterName
= escSeqStateTable_Result_2022
[offset
];
815 if(chosenConverterName
== NULL
) {
817 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
821 _this
->mode
= UCNV_SI
;
822 ucnv_close(myData2022
->currentConverter
);
823 myData2022
->currentConverter
= myUConverter
= ucnv_open(chosenConverterName
, err
);
824 if(U_SUCCESS(*err
)) {
825 myUConverter
->fromCharErrorBehaviour
= UCNV_TO_U_CALLBACK_STOP
;
826 _this
->mode
= UCNV_SO
;
833 StateEnum tempState
=nextStateToUnicodeJP
[offset
];
836 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
839 if(myData2022
->toU2022State
.cs
[2]!=0) {
840 if(myData2022
->toU2022State
.g
<2) {
841 myData2022
->toU2022State
.prevG
=myData2022
->toU2022State
.g
;
843 myData2022
->toU2022State
.g
=2;
845 /* illegal to have SS2 before a matching designator */
846 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
849 /* case SS3_STATE: not used in ISO-2022-JP-x */
852 if((jpCharsetMasks
[myData2022
->version
] & CSM(tempState
)) == 0) {
853 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
855 /* G2 charset for SS2 */
856 myData2022
->toU2022State
.cs
[2]=(int8_t)tempState
;
860 if((jpCharsetMasks
[myData2022
->version
] & CSM(tempState
)) == 0) {
861 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
864 myData2022
->toU2022State
.cs
[0]=(int8_t)tempState
;
872 StateEnum tempState
=nextStateToUnicodeCN
[offset
];
875 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
878 if(myData2022
->toU2022State
.cs
[2]!=0) {
879 if(myData2022
->toU2022State
.g
<2) {
880 myData2022
->toU2022State
.prevG
=myData2022
->toU2022State
.g
;
882 myData2022
->toU2022State
.g
=2;
884 /* illegal to have SS2 before a matching designator */
885 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
889 if(myData2022
->toU2022State
.cs
[3]!=0) {
890 if(myData2022
->toU2022State
.g
<2) {
891 myData2022
->toU2022State
.prevG
=myData2022
->toU2022State
.g
;
893 myData2022
->toU2022State
.g
=3;
895 /* illegal to have SS3 before a matching designator */
896 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
900 if(myData2022
->version
==0) {
901 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
908 myData2022
->toU2022State
.cs
[1]=(int8_t)tempState
;
911 myData2022
->toU2022State
.cs
[2]=(int8_t)tempState
;
914 /* other CNS 11643 planes */
915 if(myData2022
->version
==0) {
916 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
918 myData2022
->toU2022State
.cs
[3]=(int8_t)tempState
;
926 /* nothing to be done, just accept this one escape sequence */
928 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
933 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
937 if(U_SUCCESS(*err
)) {
938 _this
->toULength
= 0;
942 /*Checks the characters of the buffer against valid 2022 escape sequences
943 *if they match we return a pointer to the initial start of the sequence; otherwise
944 *we return sourceLimit
946 /*for 2022 looks ahead in the stream
947 *to determine the longest possible convertible
950 static U_INLINE
const char*
951 getEndOfBuffer_2022(const char** source
,
952 const char* sourceLimit
,
955 const char* mySource
= *source
;
957 #ifdef U_ENABLE_GENERIC_ISO_2022
958 if (*source
>= sourceLimit
)
963 if (*mySource
== ESC_2022
){
967 UCNV_TableStates_2022 value
= VALID_NON_TERMINAL_2022
;
969 /* Kludge: I could not
970 * figure out the reason for validating an escape sequence
971 * twice - once here and once in changeState_2022().
972 * is it possible to have an ESC character in a ISO2022
973 * byte stream which is valid in a code page? Is it legal?
976 (mySource
+i
< sourceLimit
)&&(value
== VALID_NON_TERMINAL_2022
);
978 value
= getKey_2022(*(mySource
+i
), &key
, &offset
);
980 if (value
> 0 || *mySource
==ESC_2022
)
983 if ((value
== VALID_NON_TERMINAL_2022
)&&(!flush
) )
986 }while (++mySource
< sourceLimit
);
990 while(mySource
< sourceLimit
&& *mySource
!= ESC_2022
) {
998 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
999 * any future change in _MBCSFromUChar32() function should be reflected here.
1000 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1002 static U_INLINE
int32_t
1003 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData
* sharedData
,
1010 const uint16_t *table
;
1011 uint32_t stage2Entry
;
1016 * TODO(markus): Use and require new, faster MBCS conversion table structures.
1017 * Use internal version of ucnv_open() that verifies that the new structures are available,
1018 * else U_INTERNAL_PROGRAM_ERROR.
1020 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1021 if(c
<0x10000 || (sharedData
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
1022 table
=sharedData
->mbcs
.fromUnicodeTable
;
1023 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, c
);
1024 /* get the bytes and the length for the output */
1025 if(outputType
==MBCS_OUTPUT_2
){
1026 myValue
=MBCS_VALUE_2_FROM_STAGE_2(sharedData
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
1032 } else /* outputType==MBCS_OUTPUT_3 */ {
1033 p
=MBCS_POINTER_3_FROM_STAGE_2(sharedData
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
1034 myValue
=((uint32_t)*p
<<16)|((uint32_t)p
[1]<<8)|p
[2];
1037 } else if(myValue
<=0xffff) {
1044 * TODO(markus): Use Shift-JIS table for JIS X 0208, to save mapping table space.
1045 * Pass in parameter for type of output bytes, for validation and shifting:
1046 * - Direct: Pass bytes through, but forbid control codes 00-1F (except SI/SO/ESC) and space 20?
1047 * (Need to allow some (TAB/LF/CR) or most of them for ASCII and maybe JIS X 0201.)
1048 * - A1-FE: Subtract 80 after range check.
1049 * - SJIS: Shift DBCS result to 21-7E x 21-7E.
1051 /* is this code point assigned, or do we use fallbacks? */
1052 if((stage2Entry
&(1<<(16+(c
&0xf))))!=0) {
1056 } else if(FROM_U_USE_FALLBACK(useFallback
, c
) && myValue
!=0) {
1058 * We allow a 0 byte output if the "assigned" bit is set for this entry.
1059 * There is no way with this data structure for fallback output
1060 * to be a zero byte.
1067 cx
=sharedData
->mbcs
.extIndexes
;
1069 return ucnv_extSimpleMatchFromU(cx
, c
, value
, useFallback
);
1076 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1077 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1078 * @param retval pointer to output byte
1079 * @return 1 roundtrip byte 0 no mapping -1 fallback byte
1081 static U_INLINE
int32_t
1082 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData
* sharedData
,
1087 const uint16_t *table
;
1089 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1090 if(c
>=0x10000 && !(sharedData
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
1093 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1094 table
=sharedData
->mbcs
.fromUnicodeTable
;
1095 /* get the byte for the output */
1096 value
=MBCS_SINGLE_RESULT_FROM_U(table
, (uint16_t *)sharedData
->mbcs
.fromUnicodeBytes
, c
);
1097 /* is this code point assigned, or do we use fallbacks? */
1098 *retval
=(uint32_t)(value
&0xff);
1100 return 1; /* roundtrip */
1101 } else if(useFallback
? value
>=0x800 : value
>=0xc00) {
1102 return -1; /* fallback taken */
1104 return 0; /* no mapping */
1108 #ifdef U_ENABLE_GENERIC_ISO_2022
1110 /**********************************************************************************
1111 * ISO-2022 Converter
1117 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs
* args
,
1119 const char* mySourceLimit
, *realSourceLimit
;
1120 const char* sourceStart
;
1121 const UChar
* myTargetStart
;
1122 UConverter
* saveThis
;
1123 UConverterDataISO2022
* myData
;
1126 saveThis
= args
->converter
;
1127 myData
=((UConverterDataISO2022
*)(saveThis
->extraInfo
));
1129 realSourceLimit
= args
->sourceLimit
;
1130 while (args
->source
< realSourceLimit
) {
1131 if(myData
->key
== 0) { /* are we in the middle of an escape sequence? */
1132 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1133 mySourceLimit
= getEndOfBuffer_2022(&(args
->source
), realSourceLimit
, args
->flush
);
1135 if(args
->source
< mySourceLimit
) {
1136 if(myData
->currentConverter
==NULL
) {
1137 myData
->currentConverter
= ucnv_open("ASCII",err
);
1138 if(U_FAILURE(*err
)){
1142 myData
->currentConverter
->fromCharErrorBehaviour
= UCNV_TO_U_CALLBACK_STOP
;
1143 saveThis
->mode
= UCNV_SO
;
1146 /* convert to before the ESC or until the end of the buffer */
1147 myData
->isFirstBuffer
=FALSE
;
1148 sourceStart
= args
->source
;
1149 myTargetStart
= args
->target
;
1150 args
->converter
= myData
->currentConverter
;
1151 ucnv_toUnicode(args
->converter
,
1157 (UBool
)(args
->flush
&& mySourceLimit
== realSourceLimit
),
1159 args
->converter
= saveThis
;
1161 if (*err
== U_BUFFER_OVERFLOW_ERROR
) {
1162 /* move the overflow buffer */
1163 length
= saveThis
->UCharErrorBufferLength
= myData
->currentConverter
->UCharErrorBufferLength
;
1164 myData
->currentConverter
->UCharErrorBufferLength
= 0;
1166 uprv_memcpy(saveThis
->UCharErrorBuffer
,
1167 myData
->currentConverter
->UCharErrorBuffer
,
1168 length
*U_SIZEOF_UCHAR
);
1175 * -Error while converting
1176 * -Done with entire buffer
1177 * -Need to write offsets or update the current offset
1178 * (leave that up to the code in ucnv.c)
1180 * or else we just stopped at an ESC byte and continue with changeState_2022()
1182 if (U_FAILURE(*err
) ||
1183 (args
->source
== realSourceLimit
) ||
1184 (args
->offsets
!= NULL
&& (args
->target
!= myTargetStart
|| args
->source
!= sourceStart
) ||
1185 (mySourceLimit
< realSourceLimit
&& myData
->currentConverter
->toULength
> 0))
1187 /* copy partial or error input for truncated detection and error handling */
1188 if(U_FAILURE(*err
)) {
1189 length
= saveThis
->invalidCharLength
= myData
->currentConverter
->invalidCharLength
;
1191 uprv_memcpy(saveThis
->invalidCharBuffer
, myData
->currentConverter
->invalidCharBuffer
, length
);
1194 length
= saveThis
->toULength
= myData
->currentConverter
->toULength
;
1196 uprv_memcpy(saveThis
->toUBytes
, myData
->currentConverter
->toUBytes
, length
);
1197 if(args
->source
< mySourceLimit
) {
1198 *err
= U_TRUNCATED_CHAR_FOUND
; /* truncated input before ESC */
1207 sourceStart
= args
->source
;
1208 changeState_2022(args
->converter
,
1213 if (U_FAILURE(*err
) || (args
->source
!= sourceStart
&& args
->offsets
!= NULL
)) {
1214 /* let the ucnv.c code update its current offset */
1223 * To Unicode Callback helper function
1226 toUnicodeCallback(UConverter
*cnv
,
1227 const uint32_t sourceChar
, const uint32_t targetUniChar
,
1229 if(sourceChar
>0xff){
1230 cnv
->toUBytes
[0] = (uint8_t)(sourceChar
>>8);
1231 cnv
->toUBytes
[1] = (uint8_t)sourceChar
;
1235 cnv
->toUBytes
[0] =(char) sourceChar
;
1239 if(targetUniChar
== (missingCharMarker
-1/*0xfffe*/)){
1240 *err
= U_INVALID_CHAR_FOUND
;
1243 *err
= U_ILLEGAL_CHAR_FOUND
;
1247 /**************************************ISO-2022-JP*************************************************/
1249 /************************************** IMPORTANT **************************************************
1250 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1251 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1252 * The converter iterates over each Unicode codepoint
1253 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1254 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1255 * would do as far as possible.
1257 * If the implementation of these macros or structure of sharedData struct change in the future, make
1258 * sure that ISO-2022 is also changed.
1259 ***************************************************************************************************
1262 /***************************************************************************************************
1263 * Rules for ISO-2022-jp encoding
1264 * (i) Escape sequences must be fully contained within a line they should not
1265 * span new lines or CRs
1266 * (ii) If the last character on a line is represented by two bytes then an ASCII or
1267 * JIS-Roman character escape sequence should follow before the line terminates
1268 * (iii) If the first character on the line is represented by two bytes then a two
1269 * byte character escape sequence should precede it
1270 * (iv) If no escape sequence is encountered then the characters are ASCII
1271 * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1272 * and invoked with SS2 (ESC N).
1273 * (vi) If there is any G0 designation in text, there must be a switch to
1274 * ASCII or to JIS X 0201-Roman before a space character (but not
1275 * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1276 * characters such as tab or CRLF.
1277 * (vii) Supported encodings:
1278 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1282 * JISX201, JISX208,JISX212 : new .cnv data files created
1283 * KSC5601 : alias to ibm-949 mapping table
1284 * GB2312 : alias to ibm-1386 mapping table
1285 * ISO-8859-1 : Algorithmically implemented as LATIN1 case
1286 * ISO-8859-7 : alias to ibm-9409 mapping table
1289 /* preference order of JP charsets */
1290 static const StateEnum jpCharsetPref
[]={
1303 * The escape sequences must be in order of the enum constants like JISX201 = 3,
1304 * not in order of jpCharsetPref[]!
1306 static const char escSeqChars
[][6] ={
1307 "\x1B\x28\x42", /* <ESC>(B ASCII */
1308 "\x1B\x2E\x41", /* <ESC>.A ISO-8859-1 */
1309 "\x1B\x2E\x46", /* <ESC>.F ISO-8859-7 */
1310 "\x1B\x28\x4A", /* <ESC>(J JISX-201 */
1311 "\x1B\x24\x42", /* <ESC>$B JISX-208 */
1312 "\x1B\x24\x28\x44", /* <ESC>$(D JISX-212 */
1313 "\x1B\x24\x41", /* <ESC>$A GB2312 */
1314 "\x1B\x24\x28\x43", /* <ESC>$(C KSC5601 */
1315 "\x1B\x28\x49" /* <ESC>(I HWKANA_7BIT */
1318 static const int32_t escSeqCharsLen
[] ={
1319 3, /* length of <ESC>(B ASCII */
1320 3, /* length of <ESC>.A ISO-8859-1 */
1321 3, /* length of <ESC>.F ISO-8859-7 */
1322 3, /* length of <ESC>(J JISX-201 */
1323 3, /* length of <ESC>$B JISX-208 */
1324 4, /* length of <ESC>$(D JISX-212 */
1325 3, /* length of <ESC>$A GB2312 */
1326 4, /* length of <ESC>$(C KSC5601 */
1327 3 /* length of <ESC>(I HWKANA_7BIT */
1331 * The iteration over various code pages works this way:
1332 * i) Get the currentState from myConverterData->currentState
1333 * ii) Check if the character is mapped to a valid character in the currentState
1334 * Yes -> a) set the initIterState to currentState
1335 * b) remain in this state until an invalid character is found
1336 * No -> a) go to the next code page and find the character
1337 * iii) Before changing the state, increment the current state and check if the current state
1338 * is equal to the initIterState
1339 * Yes -> A character that cannot be represented in any of the supported encodings
1340 * break and return a U_INVALID_CHAR_FOUND error
1341 * No -> Continue and find the character in next code page
1344 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1348 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs
* args
, UErrorCode
* err
) {
1349 UConverter
*cnv
= args
->converter
;
1350 UConverterDataISO2022
*converterData
;
1351 ISO2022State
*pFromU2022State
;
1352 uint8_t *target
= (uint8_t *) args
->target
;
1353 const uint8_t *targetLimit
= (const uint8_t *) args
->targetLimit
;
1354 const UChar
* source
= args
->source
;
1355 const UChar
* sourceLimit
= args
->sourceLimit
;
1356 int32_t* offsets
= args
->offsets
;
1359 int32_t len
, outLen
;
1361 int32_t choiceCount
;
1362 uint32_t targetValue
= 0;
1368 /* set up the state */
1369 converterData
= (UConverterDataISO2022
*)cnv
->extraInfo
;
1370 pFromU2022State
= &converterData
->fromU2022State
;
1374 /* check if the last codepoint of previous buffer was a lead surrogate*/
1375 if((sourceChar
= cnv
->fromUChar32
)!=0 && target
< targetLimit
) {
1379 while(source
< sourceLimit
) {
1380 if(target
< targetLimit
) {
1382 sourceChar
= *(source
++);
1383 /*check if the char is a First surrogate*/
1384 if(UTF_IS_SURROGATE(sourceChar
)) {
1385 if(UTF_IS_SURROGATE_FIRST(sourceChar
)) {
1387 /*look ahead to find the trail surrogate*/
1388 if(source
< sourceLimit
) {
1389 /* test the following code unit */
1390 UChar trail
=(UChar
) *source
;
1391 if(UTF_IS_SECOND_SURROGATE(trail
)) {
1393 sourceChar
=UTF16_GET_PAIR_VALUE(sourceChar
, trail
);
1394 cnv
->fromUChar32
=0x00;
1395 /* convert this supplementary code point */
1396 /* exit this condition tree */
1398 /* this is an unmatched lead code unit (1st surrogate) */
1399 /* callback(illegal) */
1400 *err
=U_ILLEGAL_CHAR_FOUND
;
1401 cnv
->fromUChar32
=sourceChar
;
1406 cnv
->fromUChar32
=sourceChar
;
1410 /* this is an unmatched trail code unit (2nd surrogate) */
1411 /* callback(illegal) */
1412 *err
=U_ILLEGAL_CHAR_FOUND
;
1413 cnv
->fromUChar32
=sourceChar
;
1418 /* do not convert SO/SI/ESC */
1419 if(IS_2022_CONTROL(sourceChar
)) {
1420 /* callback(illegal) */
1421 *err
=U_ILLEGAL_CHAR_FOUND
;
1422 cnv
->fromUChar32
=sourceChar
;
1426 /* do the conversion */
1428 if(choiceCount
== 0) {
1432 * The csm variable keeps track of which charsets are allowed
1433 * and not used yet while building the choices[].
1435 csm
= jpCharsetMasks
[converterData
->version
];
1438 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1439 if(converterData
->version
== 3 || converterData
->version
== 4) {
1440 choices
[choiceCount
++] = (int8_t)HWKANA_7BIT
;
1442 /* Do not try single-byte half-width Katakana for other versions. */
1443 csm
&= ~CSM(HWKANA_7BIT
);
1445 /* try the current G0 charset */
1446 choices
[choiceCount
++] = cs
= pFromU2022State
->cs
[0];
1449 /* try the current G2 charset */
1450 if((cs
= pFromU2022State
->cs
[2]) != 0) {
1451 choices
[choiceCount
++] = cs
;
1455 /* try all the other possible charsets */
1456 for(i
= 0; i
< LENGTHOF(jpCharsetPref
); ++i
) {
1457 cs
= (int8_t)jpCharsetPref
[i
];
1459 choices
[choiceCount
++] = cs
;
1467 * len==0: no mapping found yet
1468 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1469 * len>0: found a roundtrip result, done
1473 * We will turn off useFallback after finding a fallback,
1474 * but we still get fallbacks from PUA code points as usual.
1475 * Therefore, we will also need to check that we don't overwrite
1476 * an early fallback with a later one.
1478 useFallback
= cnv
->useFallback
;
1480 for(i
= 0; i
< choiceCount
&& len
<= 0; ++i
) {
1483 int8_t cs0
= choices
[i
];
1486 if(sourceChar
<= 0x7f) {
1487 targetValue
= (uint32_t)sourceChar
;
1494 if(GR96_START
<= sourceChar
&& sourceChar
<= GR96_END
) {
1495 targetValue
= (uint32_t)sourceChar
- 0x80;
1502 if((uint32_t)(HWKANA_END
-sourceChar
)<=(HWKANA_END
-HWKANA_START
)) {
1503 if(converterData
->version
==3) {
1504 /* JIS7: use G1 (SO) */
1505 /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1506 targetValue
= (uint32_t)(sourceChar
- (HWKANA_START
- 0x21));
1508 pFromU2022State
->cs
[1] = cs
= cs0
; /* do not output an escape sequence */
1510 } else if(converterData
->version
==4) {
1511 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1512 /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1513 targetValue
= (uint32_t)(sourceChar
- (HWKANA_START
- 0xa1));
1516 cs
= pFromU2022State
->cs
[0];
1517 if(IS_JP_DBCS(cs
)) {
1518 /* switch from a DBCS charset to JISX201 */
1519 cs
= (int8_t)JISX201
;
1521 /* else stay in the current G0 charset */
1524 /* else do not use HWKANA_7BIT with other versions */
1529 len2
= MBCS_SINGLE_FROM_UCHAR32(
1530 converterData
->myConverterArray
[cs0
],
1533 if(len2
!= 0 && !(len2
< 0 && len
!= 0) && value
<= 0x7f) {
1534 targetValue
= value
;
1538 useFallback
= FALSE
;
1542 /* G0 SBCS forced to 7-bit output */
1543 len2
= MBCS_SINGLE_FROM_UCHAR32(
1544 converterData
->myConverterArray
[cs0
],
1547 if(len2
!= 0 && !(len2
< 0 && len
!= 0) && GR96_START
<= value
&& value
<= GR96_END
) {
1548 targetValue
= value
- 0x80;
1552 useFallback
= FALSE
;
1557 len2
= MBCS_FROM_UCHAR32_ISO2022(
1558 converterData
->myConverterArray
[cs0
],
1560 useFallback
, MBCS_OUTPUT_2
);
1561 if(len2
== 2 || (len2
== -2 && len
== 0)) { /* only accept DBCS: abs(len)==2 */
1562 if(cs0
== KSC5601
) {
1564 * Check for valid bytes for the encoding scheme.
1565 * This is necessary because the sub-converter (windows-949)
1566 * has a broader encoding scheme than is valid for 2022.
1568 * Check that the result is a 2-byte value with each byte in the range A1..FE
1569 * (strict EUC-KR DBCS) before accepting it and subtracting 0x80 from each byte
1570 * to move it to the ISO 2022 range 21..7E.
1572 if( (uint16_t)(value
- 0xa1a1) <= (0xfefe - 0xa1a1) &&
1573 (uint8_t)(value
- 0xa1) <= (0xfe - 0xa1)
1575 value
-= 0x8080; /* shift down to 21..7e byte range */
1577 break; /* not valid for ISO 2022 */
1580 targetValue
= value
;
1584 useFallback
= FALSE
;
1592 len
= -len
; /* fallback */
1594 outLen
= 0; /* count output bytes */
1596 /* write SI if necessary (only for JIS7) */
1597 if(pFromU2022State
->g
== 1 && g
== 0) {
1598 buffer
[outLen
++] = UCNV_SI
;
1599 pFromU2022State
->g
= 0;
1602 /* write the designation sequence if necessary */
1603 if(cs
!= pFromU2022State
->cs
[g
]) {
1604 int32_t escLen
= escSeqCharsLen
[cs
];
1605 uprv_memcpy(buffer
+ outLen
, escSeqChars
[cs
], escLen
);
1607 pFromU2022State
->cs
[g
] = cs
;
1609 /* invalidate the choices[] */
1613 /* write the shift sequence if necessary */
1614 if(g
!= pFromU2022State
->g
) {
1616 /* case 0 handled before writing escapes */
1618 buffer
[outLen
++] = UCNV_SO
;
1619 pFromU2022State
->g
= 1;
1621 default: /* case 2 */
1622 buffer
[outLen
++] = 0x1b;
1623 buffer
[outLen
++] = 0x4e;
1625 /* no case 3: no SS3 in ISO-2022-JP-x */
1629 /* write the output bytes */
1631 buffer
[outLen
++] = (char)targetValue
;
1632 } else /* len == 2 */ {
1633 buffer
[outLen
++] = (char)(targetValue
>> 8);
1634 buffer
[outLen
++] = (char)targetValue
;
1638 * if we cannot find the character after checking all codepages
1639 * then this is an error
1641 *err
= U_INVALID_CHAR_FOUND
;
1642 cnv
->fromUChar32
=sourceChar
;
1646 if(sourceChar
== CR
|| sourceChar
== LF
) {
1647 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1648 pFromU2022State
->cs
[2] = 0;
1652 /* output outLen>0 bytes in buffer[] */
1654 *target
++ = buffer
[0];
1656 *offsets
++ = (int32_t)(source
- args
->source
- 1); /* -1: known to be ASCII */
1658 } else if(outLen
== 2 && (target
+ 2) <= targetLimit
) {
1659 *target
++ = buffer
[0];
1660 *target
++ = buffer
[1];
1662 int32_t sourceIndex
= (int32_t)(source
- args
->source
- U16_LENGTH(sourceChar
));
1663 *offsets
++ = sourceIndex
;
1664 *offsets
++ = sourceIndex
;
1670 &target
, (const char *)targetLimit
,
1671 &offsets
, (int32_t)(source
- args
->source
- U16_LENGTH(sourceChar
)),
1673 if(U_FAILURE(*err
)) {
1677 } /* end if(myTargetIndex<myTargetLength) */
1679 *err
=U_BUFFER_OVERFLOW_ERROR
;
1683 }/* end while(mySourceIndex<mySourceLength) */
1686 * the end of the input stream and detection of truncated input
1687 * are handled by the framework, but for ISO-2022-JP conversion
1688 * we need to be in ASCII mode at the very end
1692 * in SO mode or not in ASCII mode
1693 * end of input and no truncated input
1695 if( U_SUCCESS(*err
) &&
1696 (pFromU2022State
->g
!=0 || pFromU2022State
->cs
[0]!=ASCII
) &&
1697 args
->flush
&& source
>=sourceLimit
&& cnv
->fromUChar32
==0
1699 int32_t sourceIndex
;
1703 if(pFromU2022State
->g
!= 0) {
1704 buffer
[outLen
++] = UCNV_SI
;
1705 pFromU2022State
->g
= 0;
1708 if(pFromU2022State
->cs
[0] != ASCII
) {
1709 int32_t escLen
= escSeqCharsLen
[ASCII
];
1710 uprv_memcpy(buffer
+ outLen
, escSeqChars
[ASCII
], escLen
);
1712 pFromU2022State
->cs
[0] = (int8_t)ASCII
;
1715 /* get the source index of the last input character */
1717 * TODO this would be simpler and more reliable if we used a pair
1718 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
1719 * so that we could simply use the prevSourceIndex here;
1720 * this code gives an incorrect result for the rare case of an unmatched
1721 * trail surrogate that is alone in the last buffer of the text stream
1723 sourceIndex
=(int32_t)(source
-args
->source
);
1726 if( U16_IS_TRAIL(args
->source
[sourceIndex
]) &&
1727 (sourceIndex
==0 || U16_IS_LEAD(args
->source
[sourceIndex
-1]))
1738 &target
, (const char *)targetLimit
,
1739 &offsets
, sourceIndex
,
1743 /*save the state and return */
1744 args
->source
= source
;
1745 args
->target
= (char*)target
;
1748 /*************** to unicode *******************/
1751 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs
*args
,
1754 const char *mySource
= (char *) args
->source
;
1755 UChar
*myTarget
= args
->target
;
1756 const char *mySourceLimit
= args
->sourceLimit
;
1757 uint32_t targetUniChar
= 0x0000;
1758 uint32_t mySourceChar
= 0x0000;
1759 UConverterDataISO2022
* myData
;
1760 ISO2022State
*pToU2022State
;
1763 myData
=(UConverterDataISO2022
*)(args
->converter
->extraInfo
);
1764 pToU2022State
= &myData
->toU2022State
;
1766 if(myData
->key
!= 0) {
1767 /* continue with a partial escape sequence */
1769 } else if(args
->converter
->toULength
== 1 && mySource
< mySourceLimit
&& myTarget
< args
->targetLimit
) {
1770 /* continue with a partial double-byte character */
1771 mySourceChar
= args
->converter
->toUBytes
[0];
1772 args
->converter
->toULength
= 0;
1773 cs
= (StateEnum
)pToU2022State
->cs
[pToU2022State
->g
];
1777 while(mySource
< mySourceLimit
){
1779 targetUniChar
=missingCharMarker
;
1781 if(myTarget
< args
->targetLimit
){
1783 mySourceChar
= (unsigned char) *mySource
++;
1785 switch(mySourceChar
) {
1787 if(myData
->version
==3) {
1791 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
1796 if(myData
->version
==3) {
1797 /* JIS7: switch to G1 half-width Katakana */
1798 pToU2022State
->cs
[1] = (int8_t)HWKANA_7BIT
;
1802 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
1809 changeState_2022(args
->converter
,&(mySource
),
1810 mySourceLimit
, ISO_2022_JP
,err
);
1812 /* invalid or illegal escape sequence */
1813 if(U_FAILURE(*err
)){
1814 args
->target
= myTarget
;
1815 args
->source
= mySource
;
1820 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
1825 /* automatically reset to single-byte mode */
1826 if((StateEnum
)pToU2022State
->cs
[0] != ASCII
&& (StateEnum
)pToU2022State
->cs
[0] != JISX201
) {
1827 pToU2022State
->cs
[0] = (int8_t)ASCII
;
1829 pToU2022State
->cs
[2] = 0;
1830 pToU2022State
->g
= 0;
1833 /* convert one or two bytes */
1834 cs
= (StateEnum
)pToU2022State
->cs
[pToU2022State
->g
];
1835 if( (uint8_t)(mySourceChar
- 0xa1) <= (0xdf - 0xa1) && myData
->version
==4 &&
1838 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
1839 targetUniChar
= mySourceChar
+ (HWKANA_START
- 0xa1);
1841 /* return from a single-shift state to the previous one */
1842 if(pToU2022State
->g
>= 2) {
1843 pToU2022State
->g
=pToU2022State
->prevG
;
1847 if(mySourceChar
<= 0x7f) {
1848 targetUniChar
= mySourceChar
;
1852 if(mySourceChar
<= 0x7f) {
1853 targetUniChar
= mySourceChar
+ 0x80;
1855 /* return from a single-shift state to the previous one */
1856 pToU2022State
->g
=pToU2022State
->prevG
;
1859 if(mySourceChar
<= 0x7f) {
1860 /* convert mySourceChar+0x80 to use a normal 8-bit table */
1862 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
1863 myData
->myConverterArray
[cs
],
1864 mySourceChar
+ 0x80);
1866 /* return from a single-shift state to the previous one */
1867 pToU2022State
->g
=pToU2022State
->prevG
;
1870 if(mySourceChar
<= 0x7f) {
1872 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
1873 myData
->myConverterArray
[cs
],
1878 if((uint8_t)(mySourceChar
- 0x21) <= (0x5f - 0x21)) {
1879 /* 7-bit halfwidth Katakana */
1880 targetUniChar
= mySourceChar
+ (HWKANA_START
- 0x21);
1885 if(mySource
< mySourceLimit
) {
1888 tempBuf
[0] = (char) (mySourceChar
);
1889 tempBuf
[1] = trailByte
= *mySource
++;
1890 mySourceChar
= (mySourceChar
<< 8) | (uint8_t)(trailByte
);
1891 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(myData
->myConverterArray
[cs
], tempBuf
, 2, FALSE
);
1893 args
->converter
->toUBytes
[0] = (uint8_t)mySourceChar
;
1894 args
->converter
->toULength
= 1;
1897 } /* End of inner switch */
1899 } /* End of outer switch */
1900 if(targetUniChar
< (missingCharMarker
-1/*0xfffe*/)){
1902 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
1904 *(myTarget
++)=(UChar
)targetUniChar
;
1906 else if(targetUniChar
> missingCharMarker
){
1907 /* disassemble the surrogate pair and write to output*/
1908 targetUniChar
-=0x0010000;
1909 *myTarget
= (UChar
)(0xd800+(UChar
)(targetUniChar
>>10));
1911 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
1914 if(myTarget
< args
->targetLimit
){
1915 *myTarget
= (UChar
)(0xdc00+(UChar
)(targetUniChar
&0x3ff));
1917 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
1921 args
->converter
->UCharErrorBuffer
[args
->converter
->UCharErrorBufferLength
++]=
1922 (UChar
)(0xdc00+(UChar
)(targetUniChar
&0x3ff));
1927 /* Call the callback function*/
1928 toUnicodeCallback(args
->converter
,mySourceChar
,targetUniChar
,err
);
1932 else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
1933 *err
=U_BUFFER_OVERFLOW_ERROR
;
1938 args
->target
= myTarget
;
1939 args
->source
= mySource
;
1943 /***************************************************************
1944 * Rules for ISO-2022-KR encoding
1945 * i) The KSC5601 designator sequence should appear only once in a file,
1946 * at the beginning of a line before any KSC5601 characters. This usually
1947 * means that it appears by itself on the first line of the file
1948 * ii) There are only 2 shifting sequences SO to shift into double byte mode
1949 * and SI to shift into single byte mode
1952 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs
* args
, UErrorCode
* err
){
1954 UConverter
* saveConv
= args
->converter
;
1955 UConverterDataISO2022
*myConverterData
=(UConverterDataISO2022
*)saveConv
->extraInfo
;
1956 args
->converter
=myConverterData
->currentConverter
;
1958 myConverterData
->currentConverter
->fromUChar32
= saveConv
->fromUChar32
;
1959 ucnv_MBCSFromUnicodeWithOffsets(args
,err
);
1960 saveConv
->fromUChar32
= myConverterData
->currentConverter
->fromUChar32
;
1962 if(*err
== U_BUFFER_OVERFLOW_ERROR
) {
1963 if(myConverterData
->currentConverter
->charErrorBufferLength
> 0) {
1965 saveConv
->charErrorBuffer
,
1966 myConverterData
->currentConverter
->charErrorBuffer
,
1967 myConverterData
->currentConverter
->charErrorBufferLength
);
1969 saveConv
->charErrorBufferLength
= myConverterData
->currentConverter
->charErrorBufferLength
;
1970 myConverterData
->currentConverter
->charErrorBufferLength
= 0;
1972 args
->converter
=saveConv
;
1976 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs
* args
, UErrorCode
* err
){
1978 const UChar
*source
= args
->source
;
1979 const UChar
*sourceLimit
= args
->sourceLimit
;
1980 unsigned char *target
= (unsigned char *) args
->target
;
1981 unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
1982 int32_t* offsets
= args
->offsets
;
1983 uint32_t targetByteUnit
= 0x0000;
1984 UChar32 sourceChar
= 0x0000;
1985 UBool isTargetByteDBCS
;
1986 UBool oldIsTargetByteDBCS
;
1987 UConverterDataISO2022
*converterData
;
1988 UConverterSharedData
* sharedData
;
1992 converterData
=(UConverterDataISO2022
*)args
->converter
->extraInfo
;
1993 /* if the version is 1 then the user is requesting
1994 * conversion with ibm-25546 pass the arguments to
1995 * MBCS converter and return
1997 if(converterData
->version
==1){
1998 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args
,err
);
2002 /* initialize data */
2003 sharedData
= converterData
->currentConverter
->sharedData
;
2004 useFallback
= args
->converter
->useFallback
;
2005 isTargetByteDBCS
=(UBool
)args
->converter
->fromUnicodeStatus
;
2006 oldIsTargetByteDBCS
= isTargetByteDBCS
;
2008 isTargetByteDBCS
= (UBool
) args
->converter
->fromUnicodeStatus
;
2009 if((sourceChar
= args
->converter
->fromUChar32
)!=0 && target
<targetLimit
) {
2012 while(source
< sourceLimit
){
2014 targetByteUnit
= missingCharMarker
;
2016 if(target
< (unsigned char*) args
->targetLimit
){
2017 sourceChar
= *source
++;
2019 /* do not convert SO/SI/ESC */
2020 if(IS_2022_CONTROL(sourceChar
)) {
2021 /* callback(illegal) */
2022 *err
=U_ILLEGAL_CHAR_FOUND
;
2023 args
->converter
->fromUChar32
=sourceChar
;
2027 length
= MBCS_FROM_UCHAR32_ISO2022(sharedData
,sourceChar
,&targetByteUnit
,useFallback
,MBCS_OUTPUT_2
);
2029 length
= -length
; /* fallback */
2031 /* only DBCS or SBCS characters are expected*/
2032 /* DB characters with high bit set to 1 are expected */
2033 if(length
> 2 || length
==0 ||(((targetByteUnit
& 0x8080) != 0x8080)&& length
==2)){
2034 targetByteUnit
=missingCharMarker
;
2036 if (targetByteUnit
!= missingCharMarker
){
2038 oldIsTargetByteDBCS
= isTargetByteDBCS
;
2039 isTargetByteDBCS
= (UBool
)(targetByteUnit
>0x00FF);
2040 /* append the shift sequence */
2041 if (oldIsTargetByteDBCS
!= isTargetByteDBCS
){
2043 if (isTargetByteDBCS
)
2044 *target
++ = UCNV_SO
;
2046 *target
++ = UCNV_SI
;
2048 *(offsets
++) = (int32_t)(source
- args
->source
-1);
2050 /* write the targetUniChar to target */
2051 if(targetByteUnit
<= 0x00FF){
2052 if( target
< targetLimit
){
2053 *(target
++) = (unsigned char) targetByteUnit
;
2055 *(offsets
++) = (int32_t)(source
- args
->source
-1);
2059 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (unsigned char) (targetByteUnit
);
2060 *err
= U_BUFFER_OVERFLOW_ERROR
;
2063 if(target
< targetLimit
){
2064 *(target
++) =(unsigned char) ((targetByteUnit
>>8) -0x80);
2066 *(offsets
++) = (int32_t)(source
- args
->source
-1);
2068 if(target
< targetLimit
){
2069 *(target
++) =(unsigned char) (targetByteUnit
-0x80);
2071 *(offsets
++) = (int32_t)(source
- args
->source
-1);
2074 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (unsigned char) (targetByteUnit
-0x80);
2075 *err
= U_BUFFER_OVERFLOW_ERROR
;
2078 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (unsigned char) ((targetByteUnit
>>8) -0x80);
2079 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (unsigned char) (targetByteUnit
-0x80);
2080 *err
= U_BUFFER_OVERFLOW_ERROR
;
2086 /* oops.. the code point is unassigned
2087 * set the error and reason
2090 /*check if the char is a First surrogate*/
2091 if(UTF_IS_SURROGATE(sourceChar
)) {
2092 if(UTF_IS_SURROGATE_FIRST(sourceChar
)) {
2094 /*look ahead to find the trail surrogate*/
2095 if(source
< sourceLimit
) {
2096 /* test the following code unit */
2097 UChar trail
=(UChar
) *source
;
2098 if(UTF_IS_SECOND_SURROGATE(trail
)) {
2100 sourceChar
=UTF16_GET_PAIR_VALUE(sourceChar
, trail
);
2101 *err
= U_INVALID_CHAR_FOUND
;
2102 /* convert this surrogate code point */
2103 /* exit this condition tree */
2105 /* this is an unmatched lead code unit (1st surrogate) */
2106 /* callback(illegal) */
2107 *err
=U_ILLEGAL_CHAR_FOUND
;
2111 *err
= U_ZERO_ERROR
;
2114 /* this is an unmatched trail code unit (2nd surrogate) */
2115 /* callback(illegal) */
2116 *err
=U_ILLEGAL_CHAR_FOUND
;
2119 /* callback(unassigned) for a BMP code point */
2120 *err
= U_INVALID_CHAR_FOUND
;
2123 args
->converter
->fromUChar32
=sourceChar
;
2126 } /* end if(myTargetIndex<myTargetLength) */
2128 *err
=U_BUFFER_OVERFLOW_ERROR
;
2132 }/* end while(mySourceIndex<mySourceLength) */
2135 * the end of the input stream and detection of truncated input
2136 * are handled by the framework, but for ISO-2022-KR conversion
2137 * we need to be in ASCII mode at the very end
2142 * end of input and no truncated input
2144 if( U_SUCCESS(*err
) &&
2146 args
->flush
&& source
>=sourceLimit
&& args
->converter
->fromUChar32
==0
2148 int32_t sourceIndex
;
2150 /* we are switching to ASCII */
2151 isTargetByteDBCS
=FALSE
;
2153 /* get the source index of the last input character */
2155 * TODO this would be simpler and more reliable if we used a pair
2156 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2157 * so that we could simply use the prevSourceIndex here;
2158 * this code gives an incorrect result for the rare case of an unmatched
2159 * trail surrogate that is alone in the last buffer of the text stream
2161 sourceIndex
=(int32_t)(source
-args
->source
);
2164 if( U16_IS_TRAIL(args
->source
[sourceIndex
]) &&
2165 (sourceIndex
==0 || U16_IS_LEAD(args
->source
[sourceIndex
-1]))
2176 &target
, (const char *)targetLimit
,
2177 &offsets
, sourceIndex
,
2181 /*save the state and return */
2182 args
->source
= source
;
2183 args
->target
= (char*)target
;
2184 args
->converter
->fromUnicodeStatus
= (uint32_t)isTargetByteDBCS
;
2187 /************************ To Unicode ***************************************/
2190 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs
*args
,
2192 char const* sourceStart
;
2193 UConverterDataISO2022
* myData
=(UConverterDataISO2022
*)(args
->converter
->extraInfo
);
2195 UConverterToUnicodeArgs subArgs
;
2196 int32_t minArgsSize
;
2198 /* set up the subconverter arguments */
2199 if(args
->size
<sizeof(UConverterToUnicodeArgs
)) {
2200 minArgsSize
= args
->size
;
2202 minArgsSize
= (int32_t)sizeof(UConverterToUnicodeArgs
);
2205 uprv_memcpy(&subArgs
, args
, minArgsSize
);
2206 subArgs
.size
= (uint16_t)minArgsSize
;
2207 subArgs
.converter
= myData
->currentConverter
;
2209 /* remember the original start of the input for offsets */
2210 sourceStart
= args
->source
;
2212 if(myData
->key
!= 0) {
2213 /* continue with a partial escape sequence */
2217 while(U_SUCCESS(*err
) && args
->source
< args
->sourceLimit
) {
2218 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2219 subArgs
.source
= args
->source
;
2220 subArgs
.sourceLimit
= getEndOfBuffer_2022(&(args
->source
), args
->sourceLimit
, args
->flush
);
2221 if(subArgs
.source
!= subArgs
.sourceLimit
) {
2223 * get the current partial byte sequence
2225 * it needs to be moved between the public and the subconverter
2226 * so that the conversion framework, which only sees the public
2227 * converter, can handle truncated and illegal input etc.
2229 if(args
->converter
->toULength
> 0) {
2230 uprv_memcpy(subArgs
.converter
->toUBytes
, args
->converter
->toUBytes
, args
->converter
->toULength
);
2232 subArgs
.converter
->toULength
= args
->converter
->toULength
;
2235 * Convert up to the end of the input, or to before the next escape character.
2236 * Does not handle conversion extensions because the preToU[] state etc.
2239 ucnv_MBCSToUnicodeWithOffsets(&subArgs
, err
);
2241 if(args
->offsets
!= NULL
&& sourceStart
!= args
->source
) {
2242 /* update offsets to base them on the actual start of the input */
2243 int32_t *offsets
= args
->offsets
;
2244 UChar
*target
= args
->target
;
2245 int32_t delta
= (int32_t)(args
->source
- sourceStart
);
2246 while(target
< subArgs
.target
) {
2254 args
->source
= subArgs
.source
;
2255 args
->target
= subArgs
.target
;
2256 args
->offsets
= subArgs
.offsets
;
2258 /* copy input/error/overflow buffers */
2259 if(subArgs
.converter
->toULength
> 0) {
2260 uprv_memcpy(args
->converter
->toUBytes
, subArgs
.converter
->toUBytes
, subArgs
.converter
->toULength
);
2262 args
->converter
->toULength
= subArgs
.converter
->toULength
;
2264 if(*err
== U_BUFFER_OVERFLOW_ERROR
) {
2265 if(subArgs
.converter
->UCharErrorBufferLength
> 0) {
2266 uprv_memcpy(args
->converter
->UCharErrorBuffer
, subArgs
.converter
->UCharErrorBuffer
,
2267 subArgs
.converter
->UCharErrorBufferLength
);
2269 args
->converter
->UCharErrorBufferLength
=subArgs
.converter
->UCharErrorBufferLength
;
2270 subArgs
.converter
->UCharErrorBufferLength
= 0;
2274 if (U_FAILURE(*err
) || (args
->source
== args
->sourceLimit
)) {
2279 changeState_2022(args
->converter
,
2288 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs
*args
,
2291 const char *mySource
= ( char *) args
->source
;
2292 UChar
*myTarget
= args
->target
;
2293 const char *mySourceLimit
= args
->sourceLimit
;
2294 UChar32 targetUniChar
= 0x0000;
2295 UChar mySourceChar
= 0x0000;
2296 UConverterDataISO2022
* myData
;
2297 UConverterSharedData
* sharedData
;
2300 myData
=(UConverterDataISO2022
*)(args
->converter
->extraInfo
);
2301 if(myData
->version
==1){
2302 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args
,err
);
2306 /* initialize state */
2307 sharedData
= myData
->currentConverter
->sharedData
;
2308 useFallback
= args
->converter
->useFallback
;
2310 if(myData
->key
!= 0) {
2311 /* continue with a partial escape sequence */
2313 } else if(args
->converter
->toULength
== 1 && mySource
< mySourceLimit
&& myTarget
< args
->targetLimit
) {
2314 /* continue with a partial double-byte character */
2315 mySourceChar
= args
->converter
->toUBytes
[0];
2316 args
->converter
->toULength
= 0;
2320 while(mySource
< mySourceLimit
){
2322 if(myTarget
< args
->targetLimit
){
2324 mySourceChar
= (unsigned char) *mySource
++;
2326 if(mySourceChar
==UCNV_SI
){
2327 myData
->toU2022State
.g
= 0;
2328 /*consume the source */
2330 }else if(mySourceChar
==UCNV_SO
){
2331 myData
->toU2022State
.g
= 1;
2332 /*consume the source */
2334 }else if(mySourceChar
==ESC_2022
){
2337 changeState_2022(args
->converter
,&(mySource
),
2338 mySourceLimit
, ISO_2022_KR
, err
);
2339 if(U_FAILURE(*err
)){
2340 args
->target
= myTarget
;
2341 args
->source
= mySource
;
2347 if(myData
->toU2022State
.g
== 1) {
2348 if(mySource
< mySourceLimit
) {
2351 trailByte
= *mySource
++;
2352 tempBuf
[0] = (char)(mySourceChar
+ 0x80);
2353 tempBuf
[1] = (char)(trailByte
+ 0x80);
2354 mySourceChar
= (mySourceChar
<< 8) | (uint8_t)(trailByte
);
2355 if((mySourceChar
& 0x8080) == 0) {
2356 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(sharedData
, tempBuf
, 2, useFallback
);
2358 /* illegal bytes > 0x7f */
2359 targetUniChar
= missingCharMarker
;
2362 args
->converter
->toUBytes
[0] = (uint8_t)mySourceChar
;
2363 args
->converter
->toULength
= 1;
2368 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(sharedData
, mySource
- 1, 1, useFallback
);
2370 if(targetUniChar
< 0xfffe){
2372 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
2374 *(myTarget
++)=(UChar
)targetUniChar
;
2377 /* Call the callback function*/
2378 toUnicodeCallback(args
->converter
,mySourceChar
,targetUniChar
,err
);
2383 *err
=U_BUFFER_OVERFLOW_ERROR
;
2387 args
->target
= myTarget
;
2388 args
->source
= mySource
;
2391 /*************************** END ISO2022-KR *********************************/
2393 /*************************** ISO-2022-CN *********************************
2395 * Rules for ISO-2022-CN Encoding:
2396 * i) The designator sequence must appear once on a line before any instance
2397 * of character set it designates.
2398 * ii) If two lines contain characters from the same character set, both lines
2399 * must include the designator sequence.
2400 * iii) Once the designator sequence is known, a shifting sequence has to be found
2401 * to invoke the shifting
2402 * iv) All lines start in ASCII and end in ASCII.
2403 * v) Four shifting sequences are employed for this purpose:
2405 * Sequcence ASCII Eq Charsets
2406 * ---------- ------- ---------
2408 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2409 * SS2 <ESC>N CNS-11643-1992 Plane 2
2410 * SS3 <ESC>O CNS-11643-1992 Planes 3-7
2413 * SOdesignator : ESC "$" ")" finalchar_for_SO
2414 * SS2designator : ESC "$" "*" finalchar_for_SS2
2415 * SS3designator : ESC "$" "+" finalchar_for_SS3
2417 * ESC $ ) A Indicates the bytes following SO are Chinese
2418 * characters as defined in GB 2312-80, until
2419 * another SOdesignation appears
2422 * ESC $ ) E Indicates the bytes following SO are as defined
2423 * in ISO-IR-165 (for details, see section 2.1),
2424 * until another SOdesignation appears
2426 * ESC $ ) G Indicates the bytes following SO are as defined
2427 * in CNS 11643-plane-1, until another
2428 * SOdesignation appears
2430 * ESC $ * H Indicates the two bytes immediately following
2431 * SS2 is a Chinese character as defined in CNS
2432 * 11643-plane-2, until another SS2designation
2434 * (Meaning <ESC>N must preceed every 2 byte
2437 * ESC $ + I Indicates the immediate two bytes following SS3
2438 * is a Chinese character as defined in CNS
2439 * 11643-plane-3, until another SS3designation
2441 * (Meaning <ESC>O must preceed every 2 byte
2444 * ESC $ + J Indicates the immediate two bytes following SS3
2445 * is a Chinese character as defined in CNS
2446 * 11643-plane-4, until another SS3designation
2448 * (In English: <ESC>O must preceed every 2 byte
2451 * ESC $ + K Indicates the immediate two bytes following SS3
2452 * is a Chinese character as defined in CNS
2453 * 11643-plane-5, until another SS3designation
2456 * ESC $ + L Indicates the immediate two bytes following SS3
2457 * is a Chinese character as defined in CNS
2458 * 11643-plane-6, until another SS3designation
2461 * ESC $ + M Indicates the immediate two bytes following SS3
2462 * is a Chinese character as defined in CNS
2463 * 11643-plane-7, until another SS3designation
2466 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2467 * has its own designation information before any Chinese characters
2472 /* The following are defined this way to make the strings truely readonly */
2473 static const char GB_2312_80_STR
[] = "\x1B\x24\x29\x41";
2474 static const char ISO_IR_165_STR
[] = "\x1B\x24\x29\x45";
2475 static const char CNS_11643_1992_Plane_1_STR
[] = "\x1B\x24\x29\x47";
2476 static const char CNS_11643_1992_Plane_2_STR
[] = "\x1B\x24\x2A\x48";
2477 static const char CNS_11643_1992_Plane_3_STR
[] = "\x1B\x24\x2B\x49";
2478 static const char CNS_11643_1992_Plane_4_STR
[] = "\x1B\x24\x2B\x4A";
2479 static const char CNS_11643_1992_Plane_5_STR
[] = "\x1B\x24\x2B\x4B";
2480 static const char CNS_11643_1992_Plane_6_STR
[] = "\x1B\x24\x2B\x4C";
2481 static const char CNS_11643_1992_Plane_7_STR
[] = "\x1B\x24\x2B\x4D";
2483 /********************** ISO2022-CN Data **************************/
2484 static const char* const escSeqCharsCN
[10] ={
2485 SHIFT_IN_STR
, /* ASCII */
2488 CNS_11643_1992_Plane_1_STR
,
2489 CNS_11643_1992_Plane_2_STR
,
2490 CNS_11643_1992_Plane_3_STR
,
2491 CNS_11643_1992_Plane_4_STR
,
2492 CNS_11643_1992_Plane_5_STR
,
2493 CNS_11643_1992_Plane_6_STR
,
2494 CNS_11643_1992_Plane_7_STR
2498 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs
* args
, UErrorCode
* err
){
2499 UConverter
*cnv
= args
->converter
;
2500 UConverterDataISO2022
*converterData
;
2501 ISO2022State
*pFromU2022State
;
2502 uint8_t *target
= (uint8_t *) args
->target
;
2503 const uint8_t *targetLimit
= (const uint8_t *) args
->targetLimit
;
2504 const UChar
* source
= args
->source
;
2505 const UChar
* sourceLimit
= args
->sourceLimit
;
2506 int32_t* offsets
= args
->offsets
;
2511 int32_t choiceCount
;
2512 uint32_t targetValue
= 0;
2515 /* set up the state */
2516 converterData
= (UConverterDataISO2022
*)cnv
->extraInfo
;
2517 pFromU2022State
= &converterData
->fromU2022State
;
2521 /* check if the last codepoint of previous buffer was a lead surrogate*/
2522 if((sourceChar
= cnv
->fromUChar32
)!=0 && target
< targetLimit
) {
2526 while( source
< sourceLimit
){
2527 if(target
< targetLimit
){
2529 sourceChar
= *(source
++);
2530 /*check if the char is a First surrogate*/
2531 if(UTF_IS_SURROGATE(sourceChar
)) {
2532 if(UTF_IS_SURROGATE_FIRST(sourceChar
)) {
2534 /*look ahead to find the trail surrogate*/
2535 if(source
< sourceLimit
) {
2536 /* test the following code unit */
2537 UChar trail
=(UChar
) *source
;
2538 if(UTF_IS_SECOND_SURROGATE(trail
)) {
2540 sourceChar
=UTF16_GET_PAIR_VALUE(sourceChar
, trail
);
2541 cnv
->fromUChar32
=0x00;
2542 /* convert this supplementary code point */
2543 /* exit this condition tree */
2545 /* this is an unmatched lead code unit (1st surrogate) */
2546 /* callback(illegal) */
2547 *err
=U_ILLEGAL_CHAR_FOUND
;
2548 cnv
->fromUChar32
=sourceChar
;
2553 cnv
->fromUChar32
=sourceChar
;
2557 /* this is an unmatched trail code unit (2nd surrogate) */
2558 /* callback(illegal) */
2559 *err
=U_ILLEGAL_CHAR_FOUND
;
2560 cnv
->fromUChar32
=sourceChar
;
2565 /* do the conversion */
2566 if(sourceChar
<= 0x007f ){
2567 /* do not convert SO/SI/ESC */
2568 if(IS_2022_CONTROL(sourceChar
)) {
2569 /* callback(illegal) */
2570 *err
=U_ILLEGAL_CHAR_FOUND
;
2571 cnv
->fromUChar32
=sourceChar
;
2576 if(pFromU2022State
->g
== 0) {
2577 buffer
[0] = (char)sourceChar
;
2580 buffer
[0] = UCNV_SI
;
2581 buffer
[1] = (char)sourceChar
;
2583 pFromU2022State
->g
= 0;
2586 if(sourceChar
== CR
|| sourceChar
== LF
) {
2587 /* reset the state at the end of a line */
2588 uprv_memset(pFromU2022State
, 0, sizeof(ISO2022State
));
2593 /* convert U+0080..U+10ffff */
2597 if(choiceCount
== 0) {
2598 /* try the current SO/G1 converter first */
2599 choices
[0] = pFromU2022State
->cs
[1];
2601 /* default to GB2312_1 if none is designated yet */
2602 if(choices
[0] == 0) {
2603 choices
[0] = GB2312_1
;
2606 if(converterData
->version
== 0) {
2609 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2610 if(choices
[0] == GB2312_1
) {
2611 choices
[1] = (int8_t)CNS_11643_1
;
2613 choices
[1] = (int8_t)GB2312_1
;
2618 /* ISO-2022-CN-EXT */
2620 /* try one of the other converters */
2621 switch(choices
[0]) {
2623 choices
[1] = (int8_t)CNS_11643_1
;
2624 choices
[2] = (int8_t)ISO_IR_165
;
2627 choices
[1] = (int8_t)GB2312_1
;
2628 choices
[2] = (int8_t)CNS_11643_1
;
2630 default: /* CNS_11643_x */
2631 choices
[1] = (int8_t)GB2312_1
;
2632 choices
[2] = (int8_t)ISO_IR_165
;
2642 * len==0: no mapping found yet
2643 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
2644 * len>0: found a roundtrip result, done
2648 * We will turn off useFallback after finding a fallback,
2649 * but we still get fallbacks from PUA code points as usual.
2650 * Therefore, we will also need to check that we don't overwrite
2651 * an early fallback with a later one.
2653 useFallback
= cnv
->useFallback
;
2655 for(i
= 0; i
< choiceCount
&& len
<= 0; ++i
) {
2656 int8_t cs0
= choices
[i
];
2660 if(cs0
> CNS_11643_0
) {
2661 len2
= MBCS_FROM_UCHAR32_ISO2022(
2662 converterData
->myConverterArray
[CNS_11643
],
2667 if(len2
== 3 || (len2
== -3 && len
== 0)) {
2668 targetValue
= value
;
2669 cs
= (int8_t)(CNS_11643_0
+ (value
>> 16) - 0x80);
2674 useFallback
= FALSE
;
2676 if(cs
== CNS_11643_1
) {
2678 } else if(cs
== CNS_11643_2
) {
2680 } else /* plane 3..7 */ if(converterData
->version
== 1) {
2683 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
2688 /* GB2312_1 or ISO-IR-165 */
2689 len2
= MBCS_FROM_UCHAR32_ISO2022(
2690 converterData
->myConverterArray
[cs0
],
2695 if(len2
== 2 || (len2
== -2 && len
== 0)) {
2696 targetValue
= value
;
2700 useFallback
= FALSE
;
2707 len
= 0; /* count output bytes; it must have been abs(len) == 2 */
2709 /* write the designation sequence if necessary */
2710 if(cs
!= pFromU2022State
->cs
[g
]) {
2711 if(cs
< CNS_11643
) {
2712 uprv_memcpy(buffer
, escSeqCharsCN
[cs
], 4);
2714 uprv_memcpy(buffer
, escSeqCharsCN
[CNS_11643
+ (cs
- CNS_11643_1
)], 4);
2717 pFromU2022State
->cs
[g
] = cs
;
2719 /* changing the SO/G1 charset invalidates the choices[] */
2724 /* write the shift sequence if necessary */
2725 if(g
!= pFromU2022State
->g
) {
2728 buffer
[len
++] = UCNV_SO
;
2730 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
2731 pFromU2022State
->g
= 1;
2734 buffer
[len
++] = 0x1b;
2735 buffer
[len
++] = 0x4e;
2737 default: /* case 3 */
2738 buffer
[len
++] = 0x1b;
2739 buffer
[len
++] = 0x4f;
2744 /* write the two output bytes */
2745 buffer
[len
++] = (char)(targetValue
>> 8);
2746 buffer
[len
++] = (char)targetValue
;
2748 /* if we cannot find the character after checking all codepages
2749 * then this is an error
2751 *err
= U_INVALID_CHAR_FOUND
;
2752 cnv
->fromUChar32
=sourceChar
;
2757 /* output len>0 bytes in buffer[] */
2759 *target
++ = buffer
[0];
2761 *offsets
++ = (int32_t)(source
- args
->source
- 1); /* -1: known to be ASCII */
2763 } else if(len
== 2 && (target
+ 2) <= targetLimit
) {
2764 *target
++ = buffer
[0];
2765 *target
++ = buffer
[1];
2767 int32_t sourceIndex
= (int32_t)(source
- args
->source
- U16_LENGTH(sourceChar
));
2768 *offsets
++ = sourceIndex
;
2769 *offsets
++ = sourceIndex
;
2775 &target
, (const char *)targetLimit
,
2776 &offsets
, (int32_t)(source
- args
->source
- U16_LENGTH(sourceChar
)),
2778 if(U_FAILURE(*err
)) {
2782 } /* end if(myTargetIndex<myTargetLength) */
2784 *err
=U_BUFFER_OVERFLOW_ERROR
;
2788 }/* end while(mySourceIndex<mySourceLength) */
2791 * the end of the input stream and detection of truncated input
2792 * are handled by the framework, but for ISO-2022-CN conversion
2793 * we need to be in ASCII mode at the very end
2798 * end of input and no truncated input
2800 if( U_SUCCESS(*err
) &&
2801 pFromU2022State
->g
!=0 &&
2802 args
->flush
&& source
>=sourceLimit
&& cnv
->fromUChar32
==0
2804 int32_t sourceIndex
;
2806 /* we are switching to ASCII */
2807 pFromU2022State
->g
=0;
2809 /* get the source index of the last input character */
2811 * TODO this would be simpler and more reliable if we used a pair
2812 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2813 * so that we could simply use the prevSourceIndex here;
2814 * this code gives an incorrect result for the rare case of an unmatched
2815 * trail surrogate that is alone in the last buffer of the text stream
2817 sourceIndex
=(int32_t)(source
-args
->source
);
2820 if( U16_IS_TRAIL(args
->source
[sourceIndex
]) &&
2821 (sourceIndex
==0 || U16_IS_LEAD(args
->source
[sourceIndex
-1]))
2832 &target
, (const char *)targetLimit
,
2833 &offsets
, sourceIndex
,
2837 /*save the state and return */
2838 args
->source
= source
;
2839 args
->target
= (char*)target
;
2844 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs
*args
,
2847 const char *mySource
= (char *) args
->source
;
2848 UChar
*myTarget
= args
->target
;
2849 const char *mySourceLimit
= args
->sourceLimit
;
2850 uint32_t targetUniChar
= 0x0000;
2851 uint32_t mySourceChar
= 0x0000;
2852 UConverterDataISO2022
* myData
;
2853 ISO2022State
*pToU2022State
;
2855 myData
=(UConverterDataISO2022
*)(args
->converter
->extraInfo
);
2856 pToU2022State
= &myData
->toU2022State
;
2858 if(myData
->key
!= 0) {
2859 /* continue with a partial escape sequence */
2861 } else if(args
->converter
->toULength
== 1 && mySource
< mySourceLimit
&& myTarget
< args
->targetLimit
) {
2862 /* continue with a partial double-byte character */
2863 mySourceChar
= args
->converter
->toUBytes
[0];
2864 args
->converter
->toULength
= 0;
2868 while(mySource
< mySourceLimit
){
2870 targetUniChar
=missingCharMarker
;
2872 if(myTarget
< args
->targetLimit
){
2874 mySourceChar
= (unsigned char) *mySource
++;
2876 switch(mySourceChar
){
2882 if(pToU2022State
->cs
[1] != 0) {
2886 /* illegal to have SO before a matching designator */
2893 changeState_2022(args
->converter
,&(mySource
),
2894 mySourceLimit
, ISO_2022_CN
,err
);
2896 /* invalid or illegal escape sequence */
2897 if(U_FAILURE(*err
)){
2898 args
->target
= myTarget
;
2899 args
->source
= mySource
;
2904 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
2909 uprv_memset(pToU2022State
, 0, sizeof(ISO2022State
));
2912 /* convert one or two bytes */
2913 if(pToU2022State
->g
!= 0) {
2914 if(mySource
< mySourceLimit
) {
2915 UConverterSharedData
*cnv
;
2916 StateEnum tempState
;
2920 trailByte
= *mySource
++;
2921 tempState
= (StateEnum
)pToU2022State
->cs
[pToU2022State
->g
];
2922 if(tempState
> CNS_11643_0
) {
2923 cnv
= myData
->myConverterArray
[CNS_11643
];
2924 tempBuf
[0] = (char) (0x80+(tempState
-CNS_11643_0
));
2925 tempBuf
[1] = (char) (mySourceChar
);
2926 tempBuf
[2] = trailByte
;
2930 cnv
= myData
->myConverterArray
[tempState
];
2931 tempBuf
[0] = (char) (mySourceChar
);
2932 tempBuf
[1] = trailByte
;
2935 mySourceChar
= (mySourceChar
<< 8) | (uint8_t)(trailByte
);
2936 if(pToU2022State
->g
>=2) {
2937 /* return from a single-shift state to the previous one */
2938 pToU2022State
->g
=pToU2022State
->prevG
;
2940 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(cnv
, tempBuf
, tempBufLen
, FALSE
);
2942 args
->converter
->toUBytes
[0] = (uint8_t)mySourceChar
;
2943 args
->converter
->toULength
= 1;
2948 if(mySourceChar
<= 0x7f) {
2949 targetUniChar
= (UChar
) mySourceChar
;
2954 if(targetUniChar
< (missingCharMarker
-1/*0xfffe*/)){
2956 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
2958 *(myTarget
++)=(UChar
)targetUniChar
;
2960 else if(targetUniChar
> missingCharMarker
){
2961 /* disassemble the surrogate pair and write to output*/
2962 targetUniChar
-=0x0010000;
2963 *myTarget
= (UChar
)(0xd800+(UChar
)(targetUniChar
>>10));
2965 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
2968 if(myTarget
< args
->targetLimit
){
2969 *myTarget
= (UChar
)(0xdc00+(UChar
)(targetUniChar
&0x3ff));
2971 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
2975 args
->converter
->UCharErrorBuffer
[args
->converter
->UCharErrorBufferLength
++]=
2976 (UChar
)(0xdc00+(UChar
)(targetUniChar
&0x3ff));
2981 /* Call the callback function*/
2982 toUnicodeCallback(args
->converter
,mySourceChar
,targetUniChar
,err
);
2987 *err
=U_BUFFER_OVERFLOW_ERROR
;
2992 args
->target
= myTarget
;
2993 args
->source
= mySource
;
2997 _ISO_2022_WriteSub(UConverterFromUnicodeArgs
*args
, int32_t offsetIndex
, UErrorCode
*err
) {
2998 UConverter
*cnv
= args
->converter
;
2999 UConverterDataISO2022
*myConverterData
=(UConverterDataISO2022
*) cnv
->extraInfo
;
3000 ISO2022State
*pFromU2022State
=&myConverterData
->fromU2022State
;
3005 subchar
=(char *)cnv
->subChars
;
3006 length
=cnv
->subCharLen
; /* assume length==1 for most variants */
3009 switch(myConverterData
->locale
[0]){
3014 if(pFromU2022State
->g
== 1) {
3015 /* JIS7: switch from G1 to G0 */
3016 pFromU2022State
->g
= 0;
3020 cs
= pFromU2022State
->cs
[0];
3021 if(cs
!= ASCII
&& cs
!= JISX201
) {
3022 /* not in ASCII or JIS X 0201: switch to ASCII */
3023 pFromU2022State
->cs
[0] = (int8_t)ASCII
;
3033 if(pFromU2022State
->g
!= 0) {
3034 /* not in ASCII mode: switch to ASCII */
3035 pFromU2022State
->g
= 0;
3041 if(myConverterData
->version
== 0) {
3043 if((UBool
)args
->converter
->fromUnicodeStatus
) {
3044 /* in DBCS mode: switch to SBCS */
3045 args
->converter
->fromUnicodeStatus
= 0;
3049 } else /* length == 2*/ {
3050 if(!(UBool
)args
->converter
->fromUnicodeStatus
) {
3051 /* in SBCS mode: switch to DBCS */
3052 args
->converter
->fromUnicodeStatus
= 1;
3060 /* save the subconverter's substitution string */
3061 uint8_t *currentSubChars
= myConverterData
->currentConverter
->subChars
;
3062 int8_t currentSubCharLen
= myConverterData
->currentConverter
->subCharLen
;
3064 /* set our substitution string into the subconverter */
3065 myConverterData
->currentConverter
->subChars
= (uint8_t *)subchar
;
3066 myConverterData
->currentConverter
->subCharLen
= (int8_t)length
;
3068 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3069 args
->converter
= myConverterData
->currentConverter
;
3070 myConverterData
->currentConverter
->fromUChar32
= cnv
->fromUChar32
;
3071 ucnv_cbFromUWriteSub(args
, 0, err
);
3072 cnv
->fromUChar32
= myConverterData
->currentConverter
->fromUChar32
;
3073 args
->converter
= cnv
;
3075 /* restore the subconverter's substitution string */
3076 myConverterData
->currentConverter
->subChars
= currentSubChars
;
3077 myConverterData
->currentConverter
->subCharLen
= currentSubCharLen
;
3079 if(*err
== U_BUFFER_OVERFLOW_ERROR
) {
3080 if(myConverterData
->currentConverter
->charErrorBufferLength
> 0) {
3082 cnv
->charErrorBuffer
,
3083 myConverterData
->currentConverter
->charErrorBuffer
,
3084 myConverterData
->currentConverter
->charErrorBufferLength
);
3086 cnv
->charErrorBufferLength
= myConverterData
->currentConverter
->charErrorBufferLength
;
3087 myConverterData
->currentConverter
->charErrorBufferLength
= 0;
3095 ucnv_cbFromUWriteBytes(args
,
3096 buffer
, (int32_t)(p
- buffer
),
3101 * Structure for cloning an ISO 2022 converter into a single memory block.
3102 * ucnv_safeClone() of the converter will align the entire cloneStruct,
3103 * and then ucnv_safeClone() of the sub-converter may additionally align
3104 * currentConverter inside the cloneStruct, for which we need the deadSpace
3105 * after currentConverter.
3106 * This is because UAlignedMemory may be larger than the actually
3107 * necessary alignment size for the platform.
3108 * The other cloneStruct fields will not be moved around,
3109 * and are aligned properly with cloneStruct's alignment.
3114 UConverter currentConverter
;
3115 UAlignedMemory deadSpace
;
3116 UConverterDataISO2022 mydata
;
3121 _ISO_2022_SafeClone(
3122 const UConverter
*cnv
,
3124 int32_t *pBufferSize
,
3127 struct cloneStruct
* localClone
;
3128 UConverterDataISO2022
*cnvData
;
3131 if (*pBufferSize
== 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3132 *pBufferSize
= (int32_t)sizeof(struct cloneStruct
);
3136 cnvData
= (UConverterDataISO2022
*)cnv
->extraInfo
;
3137 localClone
= (struct cloneStruct
*)stackBuffer
;
3139 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3141 uprv_memcpy(&localClone
->mydata
, cnvData
, sizeof(UConverterDataISO2022
));
3142 localClone
->cnv
.extraInfo
= &localClone
->mydata
; /* set pointer to extra data */
3143 localClone
->cnv
.isExtraLocal
= TRUE
;
3145 /* share the subconverters */
3147 if(cnvData
->currentConverter
!= NULL
) {
3148 size
= (int32_t)(sizeof(UConverter
) + sizeof(UAlignedMemory
)); /* include size of padding */
3149 localClone
->mydata
.currentConverter
=
3150 ucnv_safeClone(cnvData
->currentConverter
,
3151 &localClone
->currentConverter
,
3153 if(U_FAILURE(*status
)) {
3158 for(i
=0; i
<UCNV_2022_MAX_CONVERTERS
; ++i
) {
3159 if(cnvData
->myConverterArray
[i
] != NULL
) {
3160 ucnv_incrementRefCount(cnvData
->myConverterArray
[i
]);
3164 return &localClone
->cnv
;
3168 _ISO_2022_GetUnicodeSet(const UConverter
*cnv
,
3169 const USetAdder
*sa
,
3170 UConverterUnicodeSet which
,
3171 UErrorCode
*pErrorCode
)
3174 UConverterDataISO2022
* cnvData
;
3176 if (U_FAILURE(*pErrorCode
)) {
3179 #ifdef U_ENABLE_GENERIC_ISO_2022
3180 if (cnv
->sharedData
== &_ISO2022Data
) {
3181 /* We use UTF-8 in this case */
3182 sa
->addRange(sa
->set
, 0, 0xd7FF);
3183 sa
->addRange(sa
->set
, 0xE000, 0x10FFFF);
3188 cnvData
= (UConverterDataISO2022
*)cnv
->extraInfo
;
3190 /* open a set and initialize it with code points that are algorithmically round-tripped */
3191 switch(cnvData
->locale
[0]){
3193 if(jpCharsetMasks
[cnvData
->version
]&CSM(ISO8859_1
)) {
3194 /* include Latin-1 for some variants of JP */
3195 sa
->addRange(sa
->set
, 0, 0xff);
3197 /* include ASCII for JP */
3198 sa
->addRange(sa
->set
, 0, 0x7f);
3200 if(jpCharsetMasks
[cnvData
->version
]&CSM(HWKANA_7BIT
)) {
3201 /* include half-width Katakana for JP */
3202 sa
->addRange(sa
->set
, HWKANA_START
, HWKANA_END
);
3207 /* include ASCII for CN */
3208 sa
->addRange(sa
->set
, 0, 0x7f);
3211 /* there is only one converter for KR, and it is not in the myConverterArray[] */
3212 cnvData
->currentConverter
->sharedData
->impl
->getUnicodeSet(
3213 cnvData
->currentConverter
, sa
, which
, pErrorCode
);
3214 /* the loop over myConverterArray[] will simply not find another converter */
3221 * Version-specific for CN:
3222 * CN version 0 does not map CNS planes 3..7 although
3223 * they are all available in the CNS conversion table;
3224 * CN version 1 does map them all.
3225 * The two versions create different Unicode sets.
3227 for (i
=0; i
<UCNV_2022_MAX_CONVERTERS
; i
++) {
3228 if(cnvData
->myConverterArray
[i
]!=NULL
) {
3229 if( (cnvData
->locale
[0]=='c' || cnvData
->locale
[0]=='z') &&
3230 cnvData
->version
==0 && i
==CNS_11643
3232 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3233 ucnv_MBCSGetUnicodeSetForBytes(
3234 cnvData
->myConverterArray
[i
],
3235 sa
, UCNV_ROUNDTRIP_SET
,
3239 ucnv_MBCSGetUnicodeSetForUnicode(cnvData
->myConverterArray
[i
], sa
, which
, pErrorCode
);
3245 * ISO 2022 converters must not convert SO/SI/ESC despite what
3246 * sub-converters do by themselves.
3247 * Remove these characters from the set.
3249 sa
->remove(sa
->set
, 0x0e);
3250 sa
->remove(sa
->set
, 0x0f);
3251 sa
->remove(sa
->set
, 0x1b);
3254 static const UConverterImpl _ISO2022Impl
={
3264 #ifdef U_ENABLE_GENERIC_ISO_2022
3265 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC
,
3266 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC
,
3267 ucnv_fromUnicode_UTF8
,
3268 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC
,
3280 _ISO_2022_SafeClone
,
3281 _ISO_2022_GetUnicodeSet
3283 static const UConverterStaticData _ISO2022StaticData
={
3284 sizeof(UConverterStaticData
),
3290 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3297 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3299 const UConverterSharedData _ISO2022Data
={
3300 sizeof(UConverterSharedData
),
3304 &_ISO2022StaticData
,
3310 /*************JP****************/
3311 static const UConverterImpl _ISO2022JPImpl
={
3321 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC
,
3322 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC
,
3323 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC
,
3324 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC
,
3330 _ISO_2022_SafeClone
,
3331 _ISO_2022_GetUnicodeSet
3333 static const UConverterStaticData _ISO2022JPStaticData
={
3334 sizeof(UConverterStaticData
),
3340 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3347 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3349 static const UConverterSharedData _ISO2022JPData
={
3350 sizeof(UConverterSharedData
),
3354 &_ISO2022JPStaticData
,
3360 /************* KR ***************/
3361 static const UConverterImpl _ISO2022KRImpl
={
3371 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC
,
3372 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC
,
3373 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC
,
3374 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC
,
3380 _ISO_2022_SafeClone
,
3381 _ISO_2022_GetUnicodeSet
3383 static const UConverterStaticData _ISO2022KRStaticData
={
3384 sizeof(UConverterStaticData
),
3390 3, /* max 3 bytes per UChar: SO+DBCS */
3397 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3399 static const UConverterSharedData _ISO2022KRData
={
3400 sizeof(UConverterSharedData
),
3404 &_ISO2022KRStaticData
,
3410 /*************** CN ***************/
3411 static const UConverterImpl _ISO2022CNImpl
={
3422 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC
,
3423 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC
,
3424 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC
,
3425 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC
,
3431 _ISO_2022_SafeClone
,
3432 _ISO_2022_GetUnicodeSet
3434 static const UConverterStaticData _ISO2022CNStaticData
={
3435 sizeof(UConverterStaticData
),
3441 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3448 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3450 static const UConverterSharedData _ISO2022CNData
={
3451 sizeof(UConverterSharedData
),
3455 &_ISO2022CNStaticData
,
3463 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */