- support of [Strings.LanguageID]-sections for inf-files added in setupapi
[reactos.git] / reactos / lib / 3rdparty / icu4ros / icu / source / common / ucnv2022.c
1 /*
2 **********************************************************************
3 * Copyright (C) 2000-2007, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv2022.c
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2000feb03
12 * created by: Markus W. Scherer
13 *
14 * Change history:
15 *
16 * 06/29/2000 helena Major rewrite of the callback APIs.
17 * 08/08/2000 Ram Included support for ISO-2022-JP-2
18 * Changed implementation of toUnicode
19 * function
20 * 08/21/2000 Ram Added support for ISO-2022-KR
21 * 08/29/2000 Ram Separated implementation of EBCDIC to
22 * ucnvebdc.c
23 * 09/20/2000 Ram Added support for ISO-2022-CN
24 * Added implementations for getNextUChar()
25 * for specific 2022 country variants.
26 * 10/31/2000 Ram Implemented offsets logic functions
27 */
28
29 #include "unicode/utypes.h"
30
31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
32
33 #include "unicode/ucnv.h"
34 #include "unicode/uset.h"
35 #include "unicode/ucnv_err.h"
36 #include "unicode/ucnv_cb.h"
37 #include "ucnv_imp.h"
38 #include "ucnv_bld.h"
39 #include "ucnv_cnv.h"
40 #include "ucnvmbcs.h"
41 #include "cstring.h"
42 #include "cmemory.h"
43
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro behaves like a single
 * primary expression next to any operator (for example `sizeof LENGTHOF(a)`).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
45
46 #ifdef U_ENABLE_GENERIC_ISO_2022
47 /*
48 * I am disabling the generic ISO-2022 converter after proposing to do so on
49 * the icu mailing list two days ago.
50 *
51 * Reasons:
52 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
53 * its designation sequences, single shifts with return to the previous state,
54 * switch-with-no-return to UTF-16BE or similar, etc.
55 * This is unlike the language-specific variants like ISO-2022-JP which
56 * require a much smaller repertoire of ISO-2022 features.
57 * These variants continue to be supported.
58 * 2. I believe that no one is really using the generic ISO-2022 converter
59 * but rather always one of the language-specific variants.
60 * Note that ICU's generic ISO-2022 converter has always output one escape
61 * sequence followed by UTF-8 for the whole stream.
62 * 3. Switching between subcharsets is extremely slow, because each time
63 * the previous converter is closed and a new one opened,
64 * without any kind of caching, least-recently-used list, etc.
65 * 4. The code is currently buggy, and given the above it does not seem
66 * reasonable to spend the time on maintenance.
67 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
68 * This means, for example, that when ISO-8859-7 is designated, the following
69 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
70 * The ICU ISO-2022 converter does not handle this - and has no information
71 * about which subconverter would have to be shifted vs. which is designed
72 * for 7-bit ISO-2022.
73 *
74 * Markus Scherer 2003-dec-03
75 */
76 #endif
77
/* One-byte, NUL-terminated strings holding the shift controls 0x0F and 0x0E. */
static const char SHIFT_IN_STR[]  = { 0x0F, 0 };
static const char SHIFT_OUT_STR[] = { 0x0E, 0 };

/* C0 control / whitespace code values, listed in ascending order */
#define H_TAB 0x09
#define LF    0x0A
#define V_TAB 0x0B
#define CR    0x0D
#define SPACE 0x20
86
/* First and last code point of the range named HWKANA (presumably halfwidth Katakana - confirm). */
enum {
    HWKANA_START = 0xff61,
    HWKANA_END   = 0xff9f
};
91
92 /*
93 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
94 * as bytes 21..7E. (Subtract 0x80.)
95 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
96 * as bytes 20..7F. (Subtract 0x80.)
97 * Do not encode C1 control codes with native bytes 80..9F
98 * as bytes 00..1F (C0 control codes).
99 */
/* Native (8-bit) byte ranges of 94- and 96-character sets; subtract 0x80 for the 7-bit ISO 2022 form. */
enum {
    GR96_START = 0xa0,
    GR94_START = 0xa1,
    GR94_END   = 0xfe,
    GR96_END   = 0xff
};
106
107 /*
108 * ISO 2022 control codes must not be converted from Unicode
109 * because they would mess up the byte stream.
110 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
111 * corresponding to SO, SI, and ESC.
112 */
/* Nonzero when c is below 0x20 and its bit is set in the mask 0x0800c000 (bits 0xe, 0xf, 0x1b). */
#define IS_2022_CONTROL(c) ((c)<0x20 && (0x0800c000&((uint32_t)1<<(c)))!=0)
114
115 /* for ISO-2022-JP and -CN implementations */
/*
 * Charset/state numbers for the ISO-2022-JP and -CN implementations.
 * The JP and CN groups deliberately reuse the small values 1..3; every
 * constant is spelled out with its value so that the overlap is visible.
 */
typedef enum {
    /* shared values */
    INVALID_STATE = -1,
    ASCII         = 0,

    /* JP */
    ISO8859_1     = 1,
    ISO8859_7     = 2,
    JISX201       = 3,
    JISX208       = 4,
    JISX212       = 5,
    GB2312        = 6,
    KSC5601       = 7,
    HWKANA_7BIT   = 8,      /* Halfwidth Katakana 7 bit */

    /* CN */
    /* these first constants must keep their values because they correspond to myConverterArray[] */
    GB2312_1      = 1,
    ISO_IR_165    = 2,
    CNS_11643     = 3,

    /* shared single-shift states */
    SS2_STATE     = 0x10,
    SS3_STATE     = 0x11,

    /*
     * these are used in StateEnum and ISO2022State variables,
     * but CNS_11643 must be used to index into myConverterArray[]
     */
    CNS_11643_0   = 0x20,
    CNS_11643_1   = 0x21,
    CNS_11643_2   = 0x22,
    CNS_11643_3   = 0x23,
    CNS_11643_4   = 0x24,
    CNS_11643_5   = 0x25,
    CNS_11643_6   = 0x26,
    CNS_11643_7   = 0x27
} StateEnum;
153
/* bit mask with only the bit at position cs (a StateEnum charset number) set */
#define CSM(cs) ((uint16_t)1<<(cs))

/* does the StateEnum charset value lie in the range JISX208..KSC5601 (the DBCS charsets)? */
#define IS_JP_DBCS(cs) ((cs)>=JISX208 && (cs)<=KSC5601)
158
159 /*
160 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
161 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
162 *
163 * Note: The converter uses some leniency:
164 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
165 * all versions, not just JIS7 and JIS8.
166 * - ICU does not distinguish between different versions of JIS X 0208.
167 */
168 static const uint16_t jpCharsetMasks[5]={
169 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
170 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
171 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
172 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
173 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
174 };
175
/* Kind of the currently selected sub-converter (stored in UConverterDataISO2022.currentType). */
typedef enum {
    ASCII1 = 0,
    LATIN1 = 1,
    SBCS   = 2,
    DBCS   = 3,
    MBCS   = 4,
    HWKANA = 5
} Cnv2022Type;
184
/* Designation and shift state for one conversion direction. */
typedef struct ISO2022State {
    /* charset number designated to each of SI (G0), SO (G1), SS2 (G2), SS3 (G3) */
    int8_t cs[4];
    /* currently active element: 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t g;
    /* value of g before a single shift (SS2 or SS3) */
    int8_t prevG;
} ISO2022State;
190
191 #define UCNV_OPTIONS_VERSION_MASK 0xf
192 #define UCNV_2022_MAX_CONVERTERS 10
193
194 typedef struct{
195 UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
196 UConverter *currentConverter;
197 Cnv2022Type currentType;
198 ISO2022State toU2022State, fromU2022State;
199 uint32_t key;
200 uint32_t version;
201 #ifdef U_ENABLE_GENERIC_ISO_2022
202 UBool isFirstBuffer;
203 #endif
204 char name[30];
205 char locale[3];
206 }UConverterDataISO2022;
207
208 /* Protos */
209 /* ISO-2022 ----------------------------------------------------------------- */
210
211 /*Forward declaration */
212 U_CFUNC void
213 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
214 UErrorCode * err);
215 U_CFUNC void
216 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
217 UErrorCode * err);
218
/* the ESC byte */
#define ESC_2022 0x1B

/* Classification of the bytes read so far as a (partial) escape sequence. */
typedef enum {
    /* does not correspond to a valid ISO 2022 escape sequence */
    INVALID_2022              = -1,
    /* so far corresponds to a valid ISO 2022 escape sequence */
    VALID_NON_TERMINAL_2022   = 0,
    /* corresponds to a complete, valid ISO 2022 escape sequence */
    VALID_TERMINAL_2022       = 1,
    /* matches one escape sequence so far, but more characters might match another one */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
228
229 /*
230 * The way these state transition arrays work is:
231 * ex : ESC$B is the sequence for JISX208
232 * a) First Iteration: char is ESC
233 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
234 * int x = normalize_esq_chars_2022[27] which is equal to 1
235 * ii) Search for this value in escSeqStateTable_Key_2022[]
236 * value of x is stored at escSeqStateTable_Key_2022[0]
237 * iii) Save this index as offset
238 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
239 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
240 * b) Switch on this state and continue to next char
241 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
242 * which is normalize_esq_chars_2022[36] == 4
243 * ii) x is currently 1(from above)
244 * x<<=5 -- x is now 32
245 * x+=normalize_esq_chars_2022[36]
246 * now x is 36
247 * iii) Search for this value in escSeqStateTable_Key_2022[]
248 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
249 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
250 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
251 * c) Switch on this state and continue to next char
252 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
253 * ii) x is currently 36 (from above)
254 * x<<=5 -- x is now 1152
255 * x+=normalize_esq_chars_2022[66]
256 * now x is 1161
257 * iii) Search for this value in escSeqStateTable_Key_2022[]
258 * value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
259 * iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
260 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
261 * v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
262 */
263
264
265 /*Below are the 3 arrays depicting a state transition table*/
/*
 * Maps an input byte (used as index, 0..255) to the small code that getKey_2022()
 * folds into an escape-sequence key via (key << 5) + code.
 * A value of 0 marks a byte that cannot occur anywhere in a recognized escape
 * sequence; the non-zero codes in this table range from 1 to 29.
 * The table is laid out with 10 entries per row, so the entry for byte b sits in
 * row b/10 at column b%10.
 */
266 static const int8_t normalize_esq_chars_2022[256] = {
267 /* 0 1 2 3 4 5 6 7 8 9 */
268
269 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
270 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
271 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0
272 ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0
273 ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0
274 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
275 ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12
276 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28
277 ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0
278 ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
279 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
280 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
281 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
282 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
283 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
284 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
285 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
286 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
287 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
288 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
289 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
290 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
291 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
292 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
293 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
294 ,0 ,0 ,0 ,0 ,0 ,0
295 };
296
297 #ifdef U_ENABLE_GENERIC_ISO_2022
298 /*
299 * When the generic ISO-2022 converter is completely removed, not just disabled
300 * per #ifdef, then the following state table and the associated tables that are
301 * dimensioned with MAX_STATES_2022 should be trimmed.
302 *
303 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
304 * the associated escape sequences starting with ESC ( B should be removed.
305 * This includes the ones with key values 1097 and all of the ones above 1000000.
306 *
307 * For the latter, the tables can simply be truncated.
308 * For the former, since the tables must be kept parallel, it is probably best
309 * to simply duplicate an adjacent table cell, parallel in all tables.
310 *
311 * It may make sense to restructure the tables, especially by using small search
312 * tables for the variants instead of indexing them parallel to the table here.
313 */
314 #endif
315
/*
 * Keys of all recognized (partial and complete) escape sequences.
 * A key is accumulated one byte at a time in getKey_2022() as
 * (previous key << 5) + normalize_esq_chars_2022[byte].
 * The entries must stay in ascending order because getKey_2022() binary-searches
 * this table. The index at which a key is found is the "offset" used with the
 * other tables dimensioned with MAX_STATES_2022, so all of them must be kept
 * parallel to this one.
 */
316 #define MAX_STATES_2022 74
317 static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
318 /* 0 1 2 3 4 5 6 7 8 9 */
319
320 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096
321 ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106
322 ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257
323 ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940
324 ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644
325 ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138
326 ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630
327 ,35947631 ,35947635 ,35947636 ,35947638
328 };
329
330 #ifdef U_ENABLE_GENERIC_ISO_2022
331
/*
 * Converter name for each entry of escSeqStateTable_Key_2022[] (same index).
 * Only compiled for the generic ISO-2022 converter: changeState_2022() passes the
 * name to ucnv_open(), and reports U_UNSUPPORTED_ESCAPE_SEQUENCE when the entry
 * reached by a terminal sequence is NULL.
 */
332 static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
333 /* 0 1 2 3 4 5 6 7 8 9 */
334
335 NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1"
336 ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1"
337 ,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8"
338 ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383"
339 ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165"
340 ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
341 ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL ,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089"
342 ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1"
343 };
344
345 #endif
346
/*
 * State of the escape sequence for each entry of escSeqStateTable_Key_2022[]
 * (same index): whether the bytes read so far form a prefix, a complete sequence,
 * or a complete sequence that may also be a prefix of a longer one.
 * getKey_2022() returns the entry at the index where it found the key.
 */
347 static const UCNV_TableStates_2022 escSeqStateTable_Value_2022[MAX_STATES_2022] = {
348 /* 0 1 2 3 4 5 6 7 8 9 */
349 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
350 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
351 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
352 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
353 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
354 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
355 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
356 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
357 };
358
359
360 /* Type def for refactoring changeState_2022 code*/
/* Selects which ISO-2022 variant changeState_2022() interprets an escape sequence for. */
typedef enum {
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022    = 0,    /* generic converter, only when enabled */
#endif
    ISO_2022_JP = 1,
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
} Variant2022;
369
370 /*********** ISO 2022 Converter Protos ***********/
371 static void
372 _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode);
373
374 static void
375 _ISO2022Close(UConverter *converter);
376
377 static void
378 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
379
380 static const char*
381 _ISO2022getName(const UConverter* cnv);
382
383 static void
384 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
385
386 static UConverter *
387 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
388
389 #ifdef U_ENABLE_GENERIC_ISO_2022
390 static void
391 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
392 #endif
393
394 /*const UConverterSharedData _ISO2022Data;*/
395 static const UConverterSharedData _ISO2022JPData;
396 static const UConverterSharedData _ISO2022KRData;
397 static const UConverterSharedData _ISO2022CNData;
398
399 /*************** Converter implementations ******************/
400
401 /* The purpose of this function is to get around gcc compiler warnings. */
402 static U_INLINE void
403 fromUWriteUInt8(UConverter *cnv,
404 const char *bytes, int32_t length,
405 uint8_t **target, const char *targetLimit,
406 int32_t **offsets,
407 int32_t sourceIndex,
408 UErrorCode *pErrorCode)
409 {
410 char *targetChars = (char *)*target;
411 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
412 offsets, sourceIndex, pErrorCode);
413 *target = (uint8_t*)targetChars;
414
415 }
416
417 static U_INLINE void
418 setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){
419 if(myConverterData->version == 1) {
420 UConverter *cnv = myConverterData->currentConverter;
421
422 cnv->toUnicodeStatus=0; /* offset */
423 cnv->mode=0; /* state */
424 cnv->toULength=0; /* byteIndex */
425 }
426 }
427
428 static U_INLINE void
429 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
430 /* in ISO-2022-KR the designator sequence appears only once
431 * in a file so we append it only once
432 */
433 if( converter->charErrorBufferLength==0){
434
435 converter->charErrorBufferLength = 4;
436 converter->charErrorBuffer[0] = 0x1b;
437 converter->charErrorBuffer[1] = 0x24;
438 converter->charErrorBuffer[2] = 0x29;
439 converter->charErrorBuffer[3] = 0x43;
440 }
441 if(myConverterData->version == 1) {
442 UConverter *cnv = myConverterData->currentConverter;
443
444 cnv->fromUChar32=0;
445 cnv->fromUnicodeStatus=1; /* prevLength */
446 }
447 }
448
449 static void
450 _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode){
451
452 char myLocale[6]={' ',' ',' ',' ',' ',' '};
453
454 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
455 if(cnv->extraInfo != NULL) {
456 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
457 uint32_t version;
458
459 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
460 myConverterData->currentType = ASCII1;
461 cnv->fromUnicodeStatus =FALSE;
462 if(locale){
463 uprv_strncpy(myLocale, locale, sizeof(myLocale));
464 }
465 version = options & UCNV_OPTIONS_VERSION_MASK;
466 myConverterData->version = version;
467 if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
468 (myLocale[2]=='_' || myLocale[2]=='\0'))
469 {
470 size_t len=0;
471 /* open the required converters and cache them */
472 if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
473 myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode);
474 }
475 myConverterData->myConverterArray[JISX201] = ucnv_loadSharedData("JISX0201", NULL, errorCode);
476 myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("jisx-208", NULL, errorCode);
477 if(jpCharsetMasks[version]&CSM(JISX212)) {
478 myConverterData->myConverterArray[JISX212] = ucnv_loadSharedData("jisx-212", NULL, errorCode);
479 }
480 if(jpCharsetMasks[version]&CSM(GB2312)) {
481 myConverterData->myConverterArray[GB2312] = ucnv_loadSharedData("ibm-5478", NULL, errorCode); /* gb_2312_80-1 */
482 }
483 if(jpCharsetMasks[version]&CSM(KSC5601)) {
484 myConverterData->myConverterArray[KSC5601] = ucnv_loadSharedData("ksc_5601", NULL, errorCode);
485 }
486
487 /* set the function pointers to appropriate funtions */
488 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
489 uprv_strcpy(myConverterData->locale,"ja");
490
491 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
492 len = uprv_strlen(myConverterData->name);
493 myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
494 myConverterData->name[len+1]='\0';
495 }
496 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
497 (myLocale[2]=='_' || myLocale[2]=='\0'))
498 {
499 if (version==1){
500 myConverterData->currentConverter=
501 ucnv_open("icu-internal-25546",errorCode);
502
503 if (U_FAILURE(*errorCode)) {
504 _ISO2022Close(cnv);
505 return;
506 }
507
508 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
509 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
510 cnv->subCharLen = myConverterData->currentConverter->subCharLen;
511 }else{
512 myConverterData->currentConverter=ucnv_open("ibm-949",errorCode);
513
514 if (U_FAILURE(*errorCode)) {
515 _ISO2022Close(cnv);
516 return;
517 }
518
519 myConverterData->version = 0;
520 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
521 }
522
523 /* initialize the state variables */
524 setInitialStateToUnicodeKR(cnv, myConverterData);
525 setInitialStateFromUnicodeKR(cnv, myConverterData);
526
527 /* set the function pointers to appropriate funtions */
528 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
529 uprv_strcpy(myConverterData->locale,"ko");
530 }
531 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
532 (myLocale[2]=='_' || myLocale[2]=='\0'))
533 {
534
535 /* open the required converters and cache them */
536 myConverterData->myConverterArray[GB2312_1] = ucnv_loadSharedData("ibm-5478", NULL, errorCode);
537 if(version==1) {
538 myConverterData->myConverterArray[ISO_IR_165] = ucnv_loadSharedData("iso-ir-165", NULL, errorCode);
539 }
540 myConverterData->myConverterArray[CNS_11643] = ucnv_loadSharedData("cns-11643-1992", NULL, errorCode);
541
542
543 /* set the function pointers to appropriate funtions */
544 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
545 uprv_strcpy(myConverterData->locale,"cn");
546
547 if (version==1){
548 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
549 }else{
550 myConverterData->version = 0;
551 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
552 }
553 }
554 else{
555 #ifdef U_ENABLE_GENERIC_ISO_2022
556 myConverterData->isFirstBuffer = TRUE;
557
558 /* append the UTF-8 escape sequence */
559 cnv->charErrorBufferLength = 3;
560 cnv->charErrorBuffer[0] = 0x1b;
561 cnv->charErrorBuffer[1] = 0x25;
562 cnv->charErrorBuffer[2] = 0x42;
563
564 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
565 /* initialize the state variables */
566 uprv_strcpy(myConverterData->name,"ISO_2022");
567 #else
568 *errorCode = U_UNSUPPORTED_ERROR;
569 return;
570 #endif
571 }
572
573 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
574
575 if(U_FAILURE(*errorCode)) {
576 _ISO2022Close(cnv);
577 }
578 } else {
579 *errorCode = U_MEMORY_ALLOCATION_ERROR;
580 }
581 }
582
583
584 static void
585 _ISO2022Close(UConverter *converter) {
586 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
587 UConverterSharedData **array = myData->myConverterArray;
588 int32_t i;
589
590 if (converter->extraInfo != NULL) {
591 /*close the array of converter pointers and free the memory*/
592 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
593 if(array[i]!=NULL) {
594 ucnv_unloadSharedDataIfReady(array[i]);
595 }
596 }
597
598 ucnv_close(myData->currentConverter);
599
600 if(!converter->isExtraLocal){
601 uprv_free (converter->extraInfo);
602 converter->extraInfo = NULL;
603 }
604 }
605 }
606
607 static void
608 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
609 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
610 if(choice<=UCNV_RESET_TO_UNICODE) {
611 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
612 myConverterData->key = 0;
613 }
614 if(choice!=UCNV_RESET_TO_UNICODE) {
615 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
616 }
617 #ifdef U_ENABLE_GENERIC_ISO_2022
618 if(myConverterData->locale[0] == 0){
619 if(choice<=UCNV_RESET_TO_UNICODE) {
620 myConverterData->isFirstBuffer = TRUE;
621 myConverterData->key = 0;
622 if (converter->mode == UCNV_SO){
623 ucnv_close (myConverterData->currentConverter);
624 myConverterData->currentConverter=NULL;
625 }
626 converter->mode = UCNV_SI;
627 }
628 if(choice!=UCNV_RESET_TO_UNICODE) {
629 /* re-append UTF-8 escape sequence */
630 converter->charErrorBufferLength = 3;
631 converter->charErrorBuffer[0] = 0x1b;
632 converter->charErrorBuffer[1] = 0x28;
633 converter->charErrorBuffer[2] = 0x42;
634 }
635 }
636 else
637 #endif
638 {
639 /* reset the state variables */
640 if(myConverterData->locale[0] == 'k'){
641 if(choice<=UCNV_RESET_TO_UNICODE) {
642 setInitialStateToUnicodeKR(converter, myConverterData);
643 }
644 if(choice!=UCNV_RESET_TO_UNICODE) {
645 setInitialStateFromUnicodeKR(converter, myConverterData);
646 }
647 }
648 }
649 }
650
651 static const char*
652 _ISO2022getName(const UConverter* cnv){
653 if(cnv->extraInfo){
654 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
655 return myData->name;
656 }
657 return NULL;
658 }
659
660
661 /*************** to unicode *******************/
662 /****************************************************************************
663 * Recognized escape sequences are
664 * <ESC>(B ASCII
665 * <ESC>.A ISO-8859-1
666 * <ESC>.F ISO-8859-7
667 * <ESC>(J JISX-201
668 * <ESC>(I JISX-201
669 * <ESC>$B JISX-208
670 * <ESC>$@ JISX-208
671 * <ESC>$(D JISX-212
672 * <ESC>$A GB2312
673 * <ESC>$(C KSC5601
674 */
/*
 * ISO-2022-JP meaning of each entry of escSeqStateTable_Key_2022[] (same index).
 * changeState_2022() looks up the offset of a complete escape sequence here:
 * INVALID_STATE makes it report U_UNSUPPORTED_ESCAPE_SEQUENCE, SS2_STATE requests
 * a single shift, ISO8859_1/ISO8859_7 are stored as the G2 charset, and any other
 * value is stored as the G0 charset (subject to the jpCharsetMasks[] version check).
 */
675 static const StateEnum nextStateToUnicodeJP[MAX_STATES_2022]= {
676 /* 0 1 2 3 4 5 6 7 8 9 */
677 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
678 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
679 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
680 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
681 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
682 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
683 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
684 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
685 };
686
687 /*************** to unicode *******************/
/*
 * ISO-2022-CN meaning of each entry of escSeqStateTable_Key_2022[] (same index).
 * changeState_2022() looks up the offset of a complete escape sequence here:
 * INVALID_STATE makes it report U_UNSUPPORTED_ESCAPE_SEQUENCE, SS2_STATE and
 * SS3_STATE request a single shift, and the remaining values are charset
 * designations.
 */
688 static const StateEnum nextStateToUnicodeCN[MAX_STATES_2022]= {
689 /* 0 1 2 3 4 5 6 7 8 9 */
690 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
691 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
692 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
693 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
694 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
695 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
696 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
697 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
698 };
699
700
701 static UCNV_TableStates_2022
702 getKey_2022(char c,int32_t* key,int32_t* offset){
703 int32_t togo;
704 int32_t low = 0;
705 int32_t hi = MAX_STATES_2022;
706 int32_t oldmid=0;
707
708 togo = normalize_esq_chars_2022[(uint8_t)c];
709 if(togo == 0) {
710 /* not a valid character anywhere in an escape sequence */
711 *key = 0;
712 *offset = 0;
713 return INVALID_2022;
714 }
715 togo = (*key << 5) + togo;
716
717 while (hi != low) /*binary search*/{
718
719 register int32_t mid = (hi+low) >> 1; /*Finds median*/
720
721 if (mid == oldmid)
722 break;
723
724 if (escSeqStateTable_Key_2022[mid] > togo){
725 hi = mid;
726 }
727 else if (escSeqStateTable_Key_2022[mid] < togo){
728 low = mid;
729 }
730 else /*we found it*/{
731 *key = togo;
732 *offset = mid;
733 return escSeqStateTable_Value_2022[mid];
734 }
735 oldmid = mid;
736
737 }
738
739 *key = 0;
740 *offset = 0;
741 return INVALID_2022;
742 }
743
744 /*runs through a state machine to determine the escape sequence - codepage correspondence
745 */
746 static void
747 changeState_2022(UConverter* _this,
748 const char** source,
749 const char* sourceLimit,
750 Variant2022 var,
751 UErrorCode* err){
752 UCNV_TableStates_2022 value;
753 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
754 uint32_t key = myData2022->key;
755 int32_t offset = 0;
756 char c;
757
758 value = VALID_NON_TERMINAL_2022;
759 while (*source < sourceLimit) {
760 c = *(*source)++;
761 _this->toUBytes[_this->toULength++]=(uint8_t)c;
762 value = getKey_2022(c,(int32_t *) &key, &offset);
763
764 switch (value){
765
766 case VALID_NON_TERMINAL_2022 :
767 /* continue with the loop */
768 break;
769
770 case VALID_TERMINAL_2022:
771 key = 0;
772 goto DONE;
773
774 case INVALID_2022:
775 goto DONE;
776
777 case VALID_MAYBE_TERMINAL_2022:
778 #ifdef U_ENABLE_GENERIC_ISO_2022
779 /* ESC ( B is ambiguous only for ISO_2022 itself */
780 if(var == ISO_2022) {
781 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
782 _this->toULength = 0;
783
784 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
785
786 /* continue with the loop */
787 value = VALID_NON_TERMINAL_2022;
788 break;
789 } else
790 #endif
791 {
792 /* not ISO_2022 itself, finish here */
793 value = VALID_TERMINAL_2022;
794 key = 0;
795 goto DONE;
796 }
797 }
798 }
799
800 DONE:
801 myData2022->key = key;
802
803 if (value == VALID_NON_TERMINAL_2022) {
804 /* indicate that the escape sequence is incomplete: key!=0 */
805 return;
806 } else if (value == INVALID_2022 ) {
807 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
808 return;
809 } else /* value == VALID_TERMINAL_2022 */ {
810 switch(var){
811 #ifdef U_ENABLE_GENERIC_ISO_2022
812 case ISO_2022:
813 {
814 const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
815 if(chosenConverterName == NULL) {
816 /* SS2 or SS3 */
817 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
818 return;
819 }
820
821 _this->mode = UCNV_SI;
822 ucnv_close(myData2022->currentConverter);
823 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
824 if(U_SUCCESS(*err)) {
825 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
826 _this->mode = UCNV_SO;
827 }
828 break;
829 }
830 #endif
831 case ISO_2022_JP:
832 {
833 StateEnum tempState=nextStateToUnicodeJP[offset];
834 switch(tempState) {
835 case INVALID_STATE:
836 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
837 break;
838 case SS2_STATE:
839 if(myData2022->toU2022State.cs[2]!=0) {
840 if(myData2022->toU2022State.g<2) {
841 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
842 }
843 myData2022->toU2022State.g=2;
844 } else {
845 /* illegal to have SS2 before a matching designator */
846 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
847 }
848 break;
849 /* case SS3_STATE: not used in ISO-2022-JP-x */
850 case ISO8859_1:
851 case ISO8859_7:
852 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
853 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
854 } else {
855 /* G2 charset for SS2 */
856 myData2022->toU2022State.cs[2]=(int8_t)tempState;
857 }
858 break;
859 default:
860 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
861 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
862 } else {
863 /* G0 charset */
864 myData2022->toU2022State.cs[0]=(int8_t)tempState;
865 }
866 break;
867 }
868 }
869 break;
870 case ISO_2022_CN:
871 {
872 StateEnum tempState=nextStateToUnicodeCN[offset];
873 switch(tempState) {
874 case INVALID_STATE:
875 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
876 break;
877 case SS2_STATE:
878 if(myData2022->toU2022State.cs[2]!=0) {
879 if(myData2022->toU2022State.g<2) {
880 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
881 }
882 myData2022->toU2022State.g=2;
883 } else {
884 /* illegal to have SS2 before a matching designator */
885 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
886 }
887 break;
888 case SS3_STATE:
889 if(myData2022->toU2022State.cs[3]!=0) {
890 if(myData2022->toU2022State.g<2) {
891 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
892 }
893 myData2022->toU2022State.g=3;
894 } else {
895 /* illegal to have SS3 before a matching designator */
896 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
897 }
898 break;
899 case ISO_IR_165:
900 if(myData2022->version==0) {
901 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
902 break;
903 }
904 /*fall through*/
905 case GB2312_1:
906 /*fall through*/
907 case CNS_11643_1:
908 myData2022->toU2022State.cs[1]=(int8_t)tempState;
909 break;
910 case CNS_11643_2:
911 myData2022->toU2022State.cs[2]=(int8_t)tempState;
912 break;
913 default:
914 /* other CNS 11643 planes */
915 if(myData2022->version==0) {
916 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
917 } else {
918 myData2022->toU2022State.cs[3]=(int8_t)tempState;
919 }
920 break;
921 }
922 }
923 break;
924 case ISO_2022_KR:
925 if(offset==0x30){
926 /* nothing to be done, just accept this one escape sequence */
927 } else {
928 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
929 }
930 break;
931
932 default:
933 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
934 break;
935 }
936 }
937 if(U_SUCCESS(*err)) {
938 _this->toULength = 0;
939 }
940 }
941
942 /*Checks the characters of the buffer against valid 2022 escape sequences
943 *if the match we return a pointer to the initial start of the sequence otherwise
944 *we return sourceLimit
945 */
946 /*for 2022 looks ahead in the stream
947 *to determine the longest possible convertible
948 *data stream
949 */
950 static U_INLINE const char*
951 getEndOfBuffer_2022(const char** source,
952 const char* sourceLimit,
953 UBool flush){
954
955 const char* mySource = *source;
956
957 #ifdef U_ENABLE_GENERIC_ISO_2022
958 if (*source >= sourceLimit)
959 return sourceLimit;
960
961 do{
962
963 if (*mySource == ESC_2022){
964 int8_t i;
965 int32_t key = 0;
966 int32_t offset;
967 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
968
969 /* Kludge: I could not
970 * figure out the reason for validating an escape sequence
971 * twice - once here and once in changeState_2022().
972 * is it possible to have an ESC character in a ISO2022
973 * byte stream which is valid in a code page? Is it legal?
974 */
975 for (i=0;
976 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
977 i++) {
978 value = getKey_2022(*(mySource+i), &key, &offset);
979 }
980 if (value > 0 || *mySource==ESC_2022)
981 return mySource;
982
983 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
984 return sourceLimit;
985 }
986 }while (++mySource < sourceLimit);
987
988 return sourceLimit;
989 #else
990 while(mySource < sourceLimit && *mySource != ESC_2022) {
991 ++mySource;
992 }
993 return mySource;
994 #endif
995 }
996
997
998 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
999 * any future change in _MBCSFromUChar32() function should be reflected here.
1000 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1001 */
1002 static U_INLINE int32_t
1003 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1004 UChar32 c,
1005 uint32_t* value,
1006 UBool useFallback,
1007 int outputType)
1008 {
1009 const int32_t *cx;
1010 const uint16_t *table;
1011 uint32_t stage2Entry;
1012 uint32_t myValue;
1013 int32_t length;
1014 const uint8_t *p;
1015 /*
1016 * TODO(markus): Use and require new, faster MBCS conversion table structures.
1017 * Use internal version of ucnv_open() that verifies that the new structures are available,
1018 * else U_INTERNAL_PROGRAM_ERROR.
1019 */
1020 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1021 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1022 table=sharedData->mbcs.fromUnicodeTable;
1023 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1024 /* get the bytes and the length for the output */
1025 if(outputType==MBCS_OUTPUT_2){
1026 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1027 if(myValue<=0xff) {
1028 length=1;
1029 } else {
1030 length=2;
1031 }
1032 } else /* outputType==MBCS_OUTPUT_3 */ {
1033 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1034 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1035 if(myValue<=0xff) {
1036 length=1;
1037 } else if(myValue<=0xffff) {
1038 length=2;
1039 } else {
1040 length=3;
1041 }
1042 }
1043 /*
1044 * TODO(markus): Use Shift-JIS table for JIS X 0208, to save mapping table space.
1045 * Pass in parameter for type of output bytes, for validation and shifting:
1046 * - Direct: Pass bytes through, but forbid control codes 00-1F (except SI/SO/ESC) and space 20?
1047 * (Need to allow some (TAB/LF/CR) or most of them for ASCII and maybe JIS X 0201.)
1048 * - A1-FE: Subtract 80 after range check.
1049 * - SJIS: Shift DBCS result to 21-7E x 21-7E.
1050 */
1051 /* is this code point assigned, or do we use fallbacks? */
1052 if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1053 /* assigned */
1054 *value=myValue;
1055 return length;
1056 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1057 /*
1058 * We allow a 0 byte output if the "assigned" bit is set for this entry.
1059 * There is no way with this data structure for fallback output
1060 * to be a zero byte.
1061 */
1062 *value=myValue;
1063 return -length;
1064 }
1065 }
1066
1067 cx=sharedData->mbcs.extIndexes;
1068 if(cx!=NULL) {
1069 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1070 }
1071
1072 /* unassigned */
1073 return 0;
1074 }
1075
1076 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1077 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1078 * @param retval pointer to output byte
1079 * @return 1 roundtrip byte 0 no mapping -1 fallback byte
1080 */
1081 static U_INLINE int32_t
1082 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1083 UChar32 c,
1084 uint32_t* retval,
1085 UBool useFallback)
1086 {
1087 const uint16_t *table;
1088 int32_t value;
1089 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1090 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1091 return 0;
1092 }
1093 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1094 table=sharedData->mbcs.fromUnicodeTable;
1095 /* get the byte for the output */
1096 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1097 /* is this code point assigned, or do we use fallbacks? */
1098 *retval=(uint32_t)(value&0xff);
1099 if(value>=0xf00) {
1100 return 1; /* roundtrip */
1101 } else if(useFallback ? value>=0x800 : value>=0xc00) {
1102 return -1; /* fallback taken */
1103 } else {
1104 return 0; /* no mapping */
1105 }
1106 }
1107
1108 #ifdef U_ENABLE_GENERIC_ISO_2022
1109
1110 /**********************************************************************************
1111 * ISO-2022 Converter
1112 *
1113 *
1114 */
1115
/*
 * toUnicode implementation of the generic ISO-2022 converter; only compiled
 * when U_ENABLE_GENERIC_ISO_2022 is defined.
 *
 * Repeats two steps until args->source reaches args->sourceLimit:
 *  1. If no escape sequence is in progress (myData->key == 0), convert the
 *     bytes up to the limit returned by getEndOfBuffer_2022() with
 *     myData->currentConverter, which is opened as "ASCII" on first use.
 *  2. Pass the input to changeState_2022() with the ISO_2022 variant.
 *
 * The function returns early on any failure, on target overflow (after the
 * sub-converter's overflow buffer has been moved to this converter), when the
 * input is used up, and when offsets are requested and input was consumed or
 * output was written.
 *
 * @param args conversion arguments; source/target are advanced in place
 * @param err  in/out error code
 */
1116 static void
1117 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
1118 UErrorCode* err){
1119 const char* mySourceLimit, *realSourceLimit;
1120 const char* sourceStart;
1121 const UChar* myTargetStart;
1122 UConverter* saveThis;
1123 UConverterDataISO2022* myData;
1124 int8_t length;
1125
/* remember the ISO-2022 converter itself: args->converter is swapped temporarily below */
1126 saveThis = args->converter;
1127 myData=((UConverterDataISO2022*)(saveThis->extraInfo));
1128
1129 realSourceLimit = args->sourceLimit;
1130 while (args->source < realSourceLimit) {
1131 if(myData->key == 0) { /* are we in the middle of an escape sequence? */
1132 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1133 mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
1134
1135 if(args->source < mySourceLimit) {
/* no charset has been selected by an escape sequence yet: start out with ASCII */
1136 if(myData->currentConverter==NULL) {
1137 myData->currentConverter = ucnv_open("ASCII",err);
1138 if(U_FAILURE(*err)){
1139 return;
1140 }
1141
1142 myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
1143 saveThis->mode = UCNV_SO;
1144 }
1145
1146 /* convert to before the ESC or until the end of the buffer */
1147 myData->isFirstBuffer=FALSE;
1148 sourceStart = args->source;
1149 myTargetStart = args->target;
/* run the sub-converter on this stretch, then restore args->converter */
1150 args->converter = myData->currentConverter;
1151 ucnv_toUnicode(args->converter,
1152 &args->target,
1153 args->targetLimit,
1154 &args->source,
1155 mySourceLimit,
1156 args->offsets,
1157 (UBool)(args->flush && mySourceLimit == realSourceLimit),
1158 err);
1159 args->converter = saveThis;
1160
1161 if (*err == U_BUFFER_OVERFLOW_ERROR) {
1162 /* move the overflow buffer */
1163 length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
1164 myData->currentConverter->UCharErrorBufferLength = 0;
1165 if(length > 0) {
1166 uprv_memcpy(saveThis->UCharErrorBuffer,
1167 myData->currentConverter->UCharErrorBuffer,
1168 length*U_SIZEOF_UCHAR);
1169 }
1170 return;
1171 }
1172
1173 /*
1174 * At least one of:
1175 * -Error while converting
1176 * -Done with entire buffer
1177 * -Need to write offsets or update the current offset
1178 * (leave that up to the code in ucnv.c)
1179 *
1180 * or else we just stopped at an ESC byte and continue with changeState_2022()
1181 */
1182 if (U_FAILURE(*err) ||
1183 (args->source == realSourceLimit) ||
1184 (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
1185 (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
1186 ) {
1187 /* copy partial or error input for truncated detection and error handling */
1188 if(U_FAILURE(*err)) {
1189 length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
1190 if(length > 0) {
1191 uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
1192 }
1193 } else {
1194 length = saveThis->toULength = myData->currentConverter->toULength;
1195 if(length > 0) {
1196 uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
1197 if(args->source < mySourceLimit) {
1198 *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
1199 }
1200 }
1201 }
1202 return;
1203 }
1204 }
1205 }
1206
/* at an ESC byte, or continuing a partial escape sequence (key != 0) */
1207 sourceStart = args->source;
1208 changeState_2022(args->converter,
1209 &(args->source),
1210 realSourceLimit,
1211 ISO_2022,
1212 err);
1213 if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
1214 /* let the ucnv.c code update its current offset */
1215 return;
1216 }
1217 }
1218 }
1219
1220 #endif
1221
1222 /*
1223 * To Unicode Callback helper function
1224 */
1225 static void
1226 toUnicodeCallback(UConverter *cnv,
1227 const uint32_t sourceChar, const uint32_t targetUniChar,
1228 UErrorCode* err){
1229 if(sourceChar>0xff){
1230 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1231 cnv->toUBytes[1] = (uint8_t)sourceChar;
1232 cnv->toULength = 2;
1233 }
1234 else{
1235 cnv->toUBytes[0] =(char) sourceChar;
1236 cnv->toULength = 2;
1237 }
1238
1239 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1240 *err = U_INVALID_CHAR_FOUND;
1241 }
1242 else{
1243 *err = U_ILLEGAL_CHAR_FOUND;
1244 }
1245 }
1246
1247 /**************************************ISO-2022-JP*************************************************/
1248
1249 /************************************** IMPORTANT **************************************************
1250 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1251 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1252 * The converter iterates over each Unicode codepoint
1253 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1254 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1255 * would do as far as possible.
1256 *
1257 * If the implementation of these macros or structure of sharedData struct change in the future, make
1258 * sure that ISO-2022 is also changed.
1259 ***************************************************************************************************
1260 */
1261
1262 /***************************************************************************************************
1263 * Rules for ISO-2022-jp encoding
1264 * (i) Escape sequences must be fully contained within a line they should not
1265 * span new lines or CRs
1266 * (ii) If the last character on a line is represented by two bytes then an ASCII or
1267 * JIS-Roman character escape sequence should follow before the line terminates
1268 * (iii) If the first character on the line is represented by two bytes then a two
1269 * byte character escape sequence should precede it
1270 * (iv) If no escape sequence is encountered then the characters are ASCII
1271 * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1272 * and invoked with SS2 (ESC N).
1273 * (vi) If there is any G0 designation in text, there must be a switch to
1274 * ASCII or to JIS X 0201-Roman before a space character (but not
1275 * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1276 * characters such as tab or CRLF.
1277 * (vii) Supported encodings:
1278 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1279 *
1280 * source : RFC-1554
1281 *
1282 * JISX201, JISX208,JISX212 : new .cnv data files created
1283 * KSC5601 : alias to ibm-949 mapping table
1284 * GB2312 : alias to ibm-1386 mapping table
1285 * ISO-8859-1 : Algorithmic implemented as LATIN1 case
1286 * ISO-8859-7 : alias to ibm-9409 mapping table
1287 */
1288
1289 /* preference order of JP charsets */
1290 static const StateEnum jpCharsetPref[]={
1291 ASCII,
1292 JISX201,
1293 ISO8859_1,
1294 ISO8859_7,
1295 JISX208,
1296 JISX212,
1297 GB2312,
1298 KSC5601,
1299 HWKANA_7BIT
1300 };
1301
/*
 * Designation escape sequences, one row per charset.
 *
 * The rows are indexed by the charset's enum constant (for example
 * JISX201 = 3), so they must stay in enum order and NOT in the order of
 * jpCharsetPref[]!
 */
static const char escSeqChars[][6] ={
    /* ASCII:       <ESC>(B  */ "\x1B\x28\x42",
    /* ISO-8859-1:  <ESC>.A  */ "\x1B\x2E\x41",
    /* ISO-8859-7:  <ESC>.F  */ "\x1B\x2E\x46",
    /* JISX-201:    <ESC>(J  */ "\x1B\x28\x4A",
    /* JISX-208:    <ESC>$B  */ "\x1B\x24\x42",
    /* JISX-212:    <ESC>$(D */ "\x1B\x24\x28\x44",
    /* GB2312:      <ESC>$A  */ "\x1B\x24\x41",
    /* KSC5601:     <ESC>$(C */ "\x1B\x24\x28\x43",
    /* HWKANA_7BIT: <ESC>(I  */ "\x1B\x28\x49"
};
/*
 * Byte length of each designation escape sequence, indexed by the same
 * charset value as the escape sequence table.
 */
static const int32_t escSeqCharsLen[] ={
    /* ASCII       <ESC>(B  */ 3,
    /* ISO-8859-1  <ESC>.A  */ 3,
    /* ISO-8859-7  <ESC>.F  */ 3,
    /* JISX-201    <ESC>(J  */ 3,
    /* JISX-208    <ESC>$B  */ 3,
    /* JISX-212    <ESC>$(D */ 4,
    /* GB2312      <ESC>$A  */ 3,
    /* KSC5601     <ESC>$(C */ 4,
    /* HWKANA_7BIT <ESC>(I  */ 3
};
1329
1330 /*
1331 * The iteration over various code pages works this way:
1332 * i) Get the currentState from myConverterData->currentState
1333 * ii) Check if the character is mapped to a valid character in the currentState
1334 * Yes -> a) set the initIterState to currentState
1335 * b) remain in this state until an invalid character is found
1336 * No -> a) go to the next code page and find the character
1337 * iii) Before changing the state increment the current state check if the current state
1338 is equal to the initIterState
1339 * Yes -> A character that cannot be represented in any of the supported encodings
1340 * break and return a U_INVALID_CHARACTER error
1341 * No -> Continue and find the character in next code page
1342 *
1343 *
1344 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1345 */
1346
/*
 * fromUnicode implementation for the ISO-2022-JP family: converts UTF-16 from
 * args->source into bytes at args->target, one code point at a time.
 *
 * For each code point the function
 *  - assembles surrogate pairs, resuming with a lead surrogate that was saved
 *    in cnv->fromUChar32 at the end of the previous buffer;
 *  - rejects SO/SI/ESC (IS_2022_CONTROL) with U_ILLEGAL_CHAR_FOUND;
 *  - tries the candidate charsets in choices[] until a roundtrip mapping is
 *    found, remembering the first fallback mapping in case there is none;
 *  - writes SI, a designation escape sequence and/or a shift sequence as
 *    needed, followed by the one or two output bytes.
 *
 * When args->flush is set and all input was consumed without error, the
 * output is brought back to ASCII in G0 (SI and/or <ESC>(B).
 *
 * Errors set here: U_ILLEGAL_CHAR_FOUND (unpaired surrogate or 2022 control),
 * U_INVALID_CHAR_FOUND (no charset maps the code point) and
 * U_BUFFER_OVERFLOW_ERROR (target full). In the error cases the code point
 * is left in cnv->fromUChar32.
 *
 * @param args conversion arguments; source and target are written back on exit
 * @param err  in/out error code
 */
1347 static void
1348 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1349 UConverter *cnv = args->converter;
1350 UConverterDataISO2022 *converterData;
1351 ISO2022State *pFromU2022State;
1352 uint8_t *target = (uint8_t *) args->target;
1353 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1354 const UChar* source = args->source;
1355 const UChar* sourceLimit = args->sourceLimit;
1356 int32_t* offsets = args->offsets;
1357 UChar32 sourceChar;
/*
 * Staging area for the bytes of one code point. As the code below is written,
 * SI is only emitted together with a G0 result, so the longest content is
 * SI + a 4-byte designation + 2 data bytes = 7 bytes.
 */
1358 char buffer[8];
1359 int32_t len, outLen;
/* candidate charsets for the current state; rebuilt whenever choiceCount is reset to 0 */
1360 int8_t choices[10];
1361 int32_t choiceCount;
1362 uint32_t targetValue = 0;
1363 UBool useFallback;
1364
1365 int32_t i;
1366 int8_t cs, g;
1367
1368 /* set up the state */
1369 converterData = (UConverterDataISO2022*)cnv->extraInfo;
1370 pFromU2022State = &converterData->fromU2022State;
1371
1372 choiceCount = 0;
1373
1374 /* check if the last codepoint of previous buffer was a lead surrogate*/
/* note: this jumps into the middle of the loop body below, past the read of the next code unit */
1375 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1376 goto getTrail;
1377 }
1378
1379 while(source < sourceLimit) {
1380 if(target < targetLimit) {
1381
1382 sourceChar = *(source++);
1383 /*check if the char is a First surrogate*/
1384 if(UTF_IS_SURROGATE(sourceChar)) {
1385 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
1386 getTrail:
1387 /*look ahead to find the trail surrogate*/
1388 if(source < sourceLimit) {
1389 /* test the following code unit */
1390 UChar trail=(UChar) *source;
1391 if(UTF_IS_SECOND_SURROGATE(trail)) {
1392 source++;
1393 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
1394 cnv->fromUChar32=0x00;
1395 /* convert this supplementary code point */
1396 /* exit this condition tree */
1397 } else {
1398 /* this is an unmatched lead code unit (1st surrogate) */
1399 /* callback(illegal) */
1400 *err=U_ILLEGAL_CHAR_FOUND;
1401 cnv->fromUChar32=sourceChar;
1402 break;
1403 }
1404 } else {
1405 /* no more input */
1406 cnv->fromUChar32=sourceChar;
1407 break;
1408 }
1409 } else {
1410 /* this is an unmatched trail code unit (2nd surrogate) */
1411 /* callback(illegal) */
1412 *err=U_ILLEGAL_CHAR_FOUND;
1413 cnv->fromUChar32=sourceChar;
1414 break;
1415 }
1416 }
1417
1418 /* do not convert SO/SI/ESC */
1419 if(IS_2022_CONTROL(sourceChar)) {
1420 /* callback(illegal) */
1421 *err=U_ILLEGAL_CHAR_FOUND;
1422 cnv->fromUChar32=sourceChar;
1423 break;
1424 }
1425
1426 /* do the conversion */
1427
/* (re)build the list of candidate charsets after a state change */
1428 if(choiceCount == 0) {
1429 uint16_t csm;
1430
1431 /*
1432 * The csm variable keeps track of which charsets are allowed
1433 * and not used yet while building the choices[].
1434 */
1435 csm = jpCharsetMasks[converterData->version];
1436 choiceCount = 0;
1437
1438 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1439 if(converterData->version == 3 || converterData->version == 4) {
1440 choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1441 }
1442 /* Do not try single-byte half-width Katakana for other versions. */
1443 csm &= ~CSM(HWKANA_7BIT);
1444
1445 /* try the current G0 charset */
1446 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1447 csm &= ~CSM(cs);
1448
1449 /* try the current G2 charset */
1450 if((cs = pFromU2022State->cs[2]) != 0) {
1451 choices[choiceCount++] = cs;
1452 csm &= ~CSM(cs);
1453 }
1454
1455 /* try all the other possible charsets */
1456 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
1457 cs = (int8_t)jpCharsetPref[i];
1458 if(CSM(cs) & csm) {
1459 choices[choiceCount++] = cs;
1460 csm &= ~CSM(cs);
1461 }
1462 }
1463 }
1464
/* cs and g receive the charset and the G0..G2 slot of the mapping that is finally used */
1465 cs = g = 0;
1466 /*
1467 * len==0: no mapping found yet
1468 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1469 * len>0: found a roundtrip result, done
1470 */
1471 len = 0;
1472 /*
1473 * We will turn off useFallback after finding a fallback,
1474 * but we still get fallbacks from PUA code points as usual.
1475 * Therefore, we will also need to check that we don't overwrite
1476 * an early fallback with a later one.
1477 */
1478 useFallback = cnv->useFallback;
1479
1480 for(i = 0; i < choiceCount && len <= 0; ++i) {
1481 uint32_t value;
1482 int32_t len2;
1483 int8_t cs0 = choices[i];
1484 switch(cs0) {
1485 case ASCII:
1486 if(sourceChar <= 0x7f) {
1487 targetValue = (uint32_t)sourceChar;
1488 len = 1;
1489 cs = cs0;
1490 g = 0;
1491 }
1492 break;
1493 case ISO8859_1:
1494 if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1495 targetValue = (uint32_t)sourceChar - 0x80;
1496 len = 1;
1497 cs = cs0;
1498 g = 2;
1499 }
1500 break;
1501 case HWKANA_7BIT:
1502 if((uint32_t)(HWKANA_END-sourceChar)<=(HWKANA_END-HWKANA_START)) {
1503 if(converterData->version==3) {
1504 /* JIS7: use G1 (SO) */
1505 /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1506 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1507 len = 1;
1508 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1509 g = 1;
1510 } else if(converterData->version==4) {
1511 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1512 /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1513 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1514 len = 1;
1515
1516 cs = pFromU2022State->cs[0];
1517 if(IS_JP_DBCS(cs)) {
1518 /* switch from a DBCS charset to JISX201 */
1519 cs = (int8_t)JISX201;
1520 }
1521 /* else stay in the current G0 charset */
1522 g = 0;
1523 }
1524 /* else do not use HWKANA_7BIT with other versions */
1525 }
1526 break;
1527 case JISX201:
1528 /* G0 SBCS */
1529 len2 = MBCS_SINGLE_FROM_UCHAR32(
1530 converterData->myConverterArray[cs0],
1531 sourceChar, &value,
1532 useFallback);
/* accept a roundtrip always, a fallback only if none was recorded before; 7-bit results only */
1533 if(len2 != 0 && !(len2 < 0 && len != 0) && value <= 0x7f) {
1534 targetValue = value;
1535 len = len2;
1536 cs = cs0;
1537 g = 0;
1538 useFallback = FALSE;
1539 }
1540 break;
1541 case ISO8859_7:
1542 /* G0 SBCS forced to 7-bit output */
1543 len2 = MBCS_SINGLE_FROM_UCHAR32(
1544 converterData->myConverterArray[cs0],
1545 sourceChar, &value,
1546 useFallback);
1547 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1548 targetValue = value - 0x80;
1549 len = len2;
1550 cs = cs0;
1551 g = 2;
1552 useFallback = FALSE;
1553 }
1554 break;
1555 default:
1556 /* G0 DBCS */
1557 len2 = MBCS_FROM_UCHAR32_ISO2022(
1558 converterData->myConverterArray[cs0],
1559 sourceChar, &value,
1560 useFallback, MBCS_OUTPUT_2);
1561 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1562 if(cs0 == KSC5601) {
1563 /*
1564 * Check for valid bytes for the encoding scheme.
1565 * This is necessary because the sub-converter (windows-949)
1566 * has a broader encoding scheme than is valid for 2022.
1567 *
1568 * Check that the result is a 2-byte value with each byte in the range A1..FE
1569 * (strict EUC-KR DBCS) before accepting it and subtracting 0x80 from each byte
1570 * to move it to the ISO 2022 range 21..7E.
1571 */
1572 if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
1573 (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
1574 ) {
1575 value -= 0x8080; /* shift down to 21..7e byte range */
1576 } else {
1577 break; /* not valid for ISO 2022 */
1578 }
1579 }
1580 targetValue = value;
1581 len = len2;
1582 cs = cs0;
1583 g = 0;
1584 useFallback = FALSE;
1585 }
1586 break;
1587 }
1588 }
1589
1590 if(len != 0) {
1591 if(len < 0) {
1592 len = -len; /* fallback */
1593 }
1594 outLen = 0; /* count output bytes */
1595
1596 /* write SI if necessary (only for JIS7) */
1597 if(pFromU2022State->g == 1 && g == 0) {
1598 buffer[outLen++] = UCNV_SI;
1599 pFromU2022State->g = 0;
1600 }
1601
1602 /* write the designation sequence if necessary */
1603 if(cs != pFromU2022State->cs[g]) {
1604 int32_t escLen = escSeqCharsLen[cs];
1605 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1606 outLen += escLen;
1607 pFromU2022State->cs[g] = cs;
1608
1609 /* invalidate the choices[] */
1610 choiceCount = 0;
1611 }
1612
1613 /* write the shift sequence if necessary */
1614 if(g != pFromU2022State->g) {
1615 switch(g) {
1616 /* case 0 handled before writing escapes */
1617 case 1:
1618 buffer[outLen++] = UCNV_SO;
1619 pFromU2022State->g = 1;
1620 break;
/* bytes 1B 4E: the SS2 sequence "ESC N"; the stored g is deliberately left unchanged here */
1621 default: /* case 2 */
1622 buffer[outLen++] = 0x1b;
1623 buffer[outLen++] = 0x4e;
1624 break;
1625 /* no case 3: no SS3 in ISO-2022-JP-x */
1626 }
1627 }
1628
1629 /* write the output bytes */
1630 if(len == 1) {
1631 buffer[outLen++] = (char)targetValue;
1632 } else /* len == 2 */ {
1633 buffer[outLen++] = (char)(targetValue >> 8);
1634 buffer[outLen++] = (char)targetValue;
1635 }
1636 } else {
1637 /*
1638 * if we cannot find the character after checking all codepages
1639 * then this is an error
1640 */
1641 *err = U_INVALID_CHAR_FOUND;
1642 cnv->fromUChar32=sourceChar;
1643 break;
1644 }
1645
1646 if(sourceChar == CR || sourceChar == LF) {
1647 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1648 pFromU2022State->cs[2] = 0;
1649 choiceCount = 0;
1650 }
1651
1652 /* output outLen>0 bytes in buffer[] */
/* fast paths for 1 or 2 bytes that fit; everything else goes through fromUWriteUInt8() */
1653 if(outLen == 1) {
1654 *target++ = buffer[0];
1655 if(offsets) {
1656 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1657 }
1658 } else if(outLen == 2 && (target + 2) <= targetLimit) {
1659 *target++ = buffer[0];
1660 *target++ = buffer[1];
1661 if(offsets) {
1662 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1663 *offsets++ = sourceIndex;
1664 *offsets++ = sourceIndex;
1665 }
1666 } else {
1667 fromUWriteUInt8(
1668 cnv,
1669 buffer, outLen,
1670 &target, (const char *)targetLimit,
1671 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1672 err);
1673 if(U_FAILURE(*err)) {
1674 break;
1675 }
1676 }
1677 } /* end if(target < targetLimit) */
1678 else{
1679 *err =U_BUFFER_OVERFLOW_ERROR;
1680 break;
1681 }
1682
1683 }/* end while(source < sourceLimit) */
1684
1685 /*
1686 * the end of the input stream and detection of truncated input
1687 * are handled by the framework, but for ISO-2022-JP conversion
1688 * we need to be in ASCII mode at the very end
1689 *
1690 * conditions:
1691 * successful
1692 * in SO mode or not in ASCII mode
1693 * end of input and no truncated input
1694 */
1695 if( U_SUCCESS(*err) &&
1696 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
1697 args->flush && source>=sourceLimit && cnv->fromUChar32==0
1698 ) {
1699 int32_t sourceIndex;
1700
1701 outLen = 0;
1702
1703 if(pFromU2022State->g != 0) {
1704 buffer[outLen++] = UCNV_SI;
1705 pFromU2022State->g = 0;
1706 }
1707
1708 if(pFromU2022State->cs[0] != ASCII) {
1709 int32_t escLen = escSeqCharsLen[ASCII];
1710 uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
1711 outLen += escLen;
1712 pFromU2022State->cs[0] = (int8_t)ASCII;
1713 }
1714
1715 /* get the source index of the last input character */
1716 /*
1717 * TODO this would be simpler and more reliable if we used a pair
1718 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
1719 * so that we could simply use the prevSourceIndex here;
1720 * this code gives an incorrect result for the rare case of an unmatched
1721 * trail surrogate that is alone in the last buffer of the text stream
1722 */
1723 sourceIndex=(int32_t)(source-args->source);
1724 if(sourceIndex>0) {
1725 --sourceIndex;
1726 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
1727 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
1728 ) {
1729 --sourceIndex;
1730 }
1731 } else {
1732 sourceIndex=-1;
1733 }
1734
1735 fromUWriteUInt8(
1736 cnv,
1737 buffer, outLen,
1738 &target, (const char *)targetLimit,
1739 &offsets, sourceIndex,
1740 err);
1741 }
1742
1743 /*save the state and return */
1744 args->source = source;
1745 args->target = (char*)target;
1746 }
1747
1748 /*************** to unicode *******************/
1749
1750 static void
1751 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
1752 UErrorCode* err){
1753 char tempBuf[3];
1754 const char *mySource = (char *) args->source;
1755 UChar *myTarget = args->target;
1756 const char *mySourceLimit = args->sourceLimit;
1757 uint32_t targetUniChar = 0x0000;
1758 uint32_t mySourceChar = 0x0000;
1759 UConverterDataISO2022* myData;
1760 ISO2022State *pToU2022State;
1761 StateEnum cs;
1762
1763 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
1764 pToU2022State = &myData->toU2022State;
1765
1766 if(myData->key != 0) {
1767 /* continue with a partial escape sequence */
1768 goto escape;
1769 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
1770 /* continue with a partial double-byte character */
1771 mySourceChar = args->converter->toUBytes[0];
1772 args->converter->toULength = 0;
1773 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
1774 goto getTrailByte;
1775 }
1776
1777 while(mySource < mySourceLimit){
1778
1779 targetUniChar =missingCharMarker;
1780
1781 if(myTarget < args->targetLimit){
1782
1783 mySourceChar= (unsigned char) *mySource++;
1784
1785 switch(mySourceChar) {
1786 case UCNV_SI:
1787 if(myData->version==3) {
1788 pToU2022State->g=0;
1789 continue;
1790 } else {
1791 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
1792 break;
1793 }
1794
1795 case UCNV_SO:
1796 if(myData->version==3) {
1797 /* JIS7: switch to G1 half-width Katakana */
1798 pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
1799 pToU2022State->g=1;
1800 continue;
1801 } else {
1802 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
1803 break;
1804 }
1805
1806 case ESC_2022:
1807 mySource--;
1808 escape:
1809 changeState_2022(args->converter,&(mySource),
1810 mySourceLimit, ISO_2022_JP,err);
1811
1812 /* invalid or illegal escape sequence */
1813 if(U_FAILURE(*err)){
1814 args->target = myTarget;
1815 args->source = mySource;
1816 return;
1817 }
1818 continue;
1819
1820 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
1821
1822 case CR:
1823 /*falls through*/
1824 case LF:
1825 /* automatically reset to single-byte mode */
1826 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
1827 pToU2022State->cs[0] = (int8_t)ASCII;
1828 }
1829 pToU2022State->cs[2] = 0;
1830 pToU2022State->g = 0;
1831 /* falls through */
1832 default:
1833 /* convert one or two bytes */
1834 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
1835 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
1836 !IS_JP_DBCS(cs)
1837 ) {
1838 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
1839 targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
1840
1841 /* return from a single-shift state to the previous one */
1842 if(pToU2022State->g >= 2) {
1843 pToU2022State->g=pToU2022State->prevG;
1844 }
1845 } else switch(cs) {
1846 case ASCII:
1847 if(mySourceChar <= 0x7f) {
1848 targetUniChar = mySourceChar;
1849 }
1850 break;
1851 case ISO8859_1:
1852 if(mySourceChar <= 0x7f) {
1853 targetUniChar = mySourceChar + 0x80;
1854 }
1855 /* return from a single-shift state to the previous one */
1856 pToU2022State->g=pToU2022State->prevG;
1857 break;
1858 case ISO8859_7:
1859 if(mySourceChar <= 0x7f) {
1860 /* convert mySourceChar+0x80 to use a normal 8-bit table */
1861 targetUniChar =
1862 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
1863 myData->myConverterArray[cs],
1864 mySourceChar + 0x80);
1865 }
1866 /* return from a single-shift state to the previous one */
1867 pToU2022State->g=pToU2022State->prevG;
1868 break;
1869 case JISX201:
1870 if(mySourceChar <= 0x7f) {
1871 targetUniChar =
1872 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
1873 myData->myConverterArray[cs],
1874 mySourceChar);
1875 }
1876 break;
1877 case HWKANA_7BIT:
1878 if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
1879 /* 7-bit halfwidth Katakana */
1880 targetUniChar = mySourceChar + (HWKANA_START - 0x21);
1881 }
1882 break;
1883 default:
1884 /* G0 DBCS */
1885 if(mySource < mySourceLimit) {
1886 char trailByte;
1887 getTrailByte:
1888 tempBuf[0] = (char) (mySourceChar);
1889 tempBuf[1] = trailByte = *mySource++;
1890 mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
1891 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
1892 } else {
1893 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
1894 args->converter->toULength = 1;
1895 goto endloop;
1896 }
1897 } /* End of inner switch */
1898 break;
1899 } /* End of outer switch */
1900 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
1901 if(args->offsets){
1902 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
1903 }
1904 *(myTarget++)=(UChar)targetUniChar;
1905 }
1906 else if(targetUniChar > missingCharMarker){
1907 /* disassemble the surrogate pair and write to output*/
1908 targetUniChar-=0x0010000;
1909 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
1910 if(args->offsets){
1911 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
1912 }
1913 ++myTarget;
1914 if(myTarget< args->targetLimit){
1915 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
1916 if(args->offsets){
1917 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
1918 }
1919 ++myTarget;
1920 }else{
1921 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
1922 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
1923 }
1924
1925 }
1926 else{
1927 /* Call the callback function*/
1928 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
1929 break;
1930 }
1931 }
1932 else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
1933 *err =U_BUFFER_OVERFLOW_ERROR;
1934 break;
1935 }
1936 }
1937 endloop:
1938 args->target = myTarget;
1939 args->source = mySource;
1940 }
1941
1942
1943 /***************************************************************
1944 * Rules for ISO-2022-KR encoding
1945 * i) The KSC5601 designator sequence should appear only once in a file,
1946 * at the beginning of a line before any KSC5601 characters. This usually
1947 * means that it appears by itself on the first line of the file
1948 * ii) There are only 2 shifting sequences SO to shift into double byte mode
1949 * and SI to shift into single byte mode
1950 */
1951 static void
1952 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
1953
1954 UConverter* saveConv = args->converter;
1955 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
1956 args->converter=myConverterData->currentConverter;
1957
1958 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
1959 ucnv_MBCSFromUnicodeWithOffsets(args,err);
1960 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
1961
1962 if(*err == U_BUFFER_OVERFLOW_ERROR) {
1963 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
1964 uprv_memcpy(
1965 saveConv->charErrorBuffer,
1966 myConverterData->currentConverter->charErrorBuffer,
1967 myConverterData->currentConverter->charErrorBufferLength);
1968 }
1969 saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
1970 myConverterData->currentConverter->charErrorBufferLength = 0;
1971 }
1972 args->converter=saveConv;
1973 }
1974
1975 static void
1976 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
1977
1978 const UChar *source = args->source;
1979 const UChar *sourceLimit = args->sourceLimit;
1980 unsigned char *target = (unsigned char *) args->target;
1981 unsigned char *targetLimit = (unsigned char *) args->targetLimit;
1982 int32_t* offsets = args->offsets;
1983 uint32_t targetByteUnit = 0x0000;
1984 UChar32 sourceChar = 0x0000;
1985 UBool isTargetByteDBCS;
1986 UBool oldIsTargetByteDBCS;
1987 UConverterDataISO2022 *converterData;
1988 UConverterSharedData* sharedData;
1989 UBool useFallback;
1990 int32_t length =0;
1991
1992 converterData=(UConverterDataISO2022*)args->converter->extraInfo;
1993 /* if the version is 1 then the user is requesting
1994 * conversion with ibm-25546 pass the arguments to
1995 * MBCS converter and return
1996 */
1997 if(converterData->version==1){
1998 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
1999 return;
2000 }
2001
2002 /* initialize data */
2003 sharedData = converterData->currentConverter->sharedData;
2004 useFallback = args->converter->useFallback;
2005 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2006 oldIsTargetByteDBCS = isTargetByteDBCS;
2007
2008 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
2009 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2010 goto getTrail;
2011 }
2012 while(source < sourceLimit){
2013
2014 targetByteUnit = missingCharMarker;
2015
2016 if(target < (unsigned char*) args->targetLimit){
2017 sourceChar = *source++;
2018
2019 /* do not convert SO/SI/ESC */
2020 if(IS_2022_CONTROL(sourceChar)) {
2021 /* callback(illegal) */
2022 *err=U_ILLEGAL_CHAR_FOUND;
2023 args->converter->fromUChar32=sourceChar;
2024 break;
2025 }
2026
2027 length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2028 if(length < 0) {
2029 length = -length; /* fallback */
2030 }
2031 /* only DBCS or SBCS characters are expected*/
2032 /* DB characters with high bit set to 1 are expected */
2033 if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){
2034 targetByteUnit=missingCharMarker;
2035 }
2036 if (targetByteUnit != missingCharMarker){
2037
2038 oldIsTargetByteDBCS = isTargetByteDBCS;
2039 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2040 /* append the shift sequence */
2041 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2042
2043 if (isTargetByteDBCS)
2044 *target++ = UCNV_SO;
2045 else
2046 *target++ = UCNV_SI;
2047 if(offsets)
2048 *(offsets++) = (int32_t)(source - args->source-1);
2049 }
2050 /* write the targetUniChar to target */
2051 if(targetByteUnit <= 0x00FF){
2052 if( target < targetLimit){
2053 *(target++) = (unsigned char) targetByteUnit;
2054 if(offsets){
2055 *(offsets++) = (int32_t)(source - args->source-1);
2056 }
2057
2058 }else{
2059 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2060 *err = U_BUFFER_OVERFLOW_ERROR;
2061 }
2062 }else{
2063 if(target < targetLimit){
2064 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2065 if(offsets){
2066 *(offsets++) = (int32_t)(source - args->source-1);
2067 }
2068 if(target < targetLimit){
2069 *(target++) =(unsigned char) (targetByteUnit -0x80);
2070 if(offsets){
2071 *(offsets++) = (int32_t)(source - args->source-1);
2072 }
2073 }else{
2074 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2075 *err = U_BUFFER_OVERFLOW_ERROR;
2076 }
2077 }else{
2078 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2079 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2080 *err = U_BUFFER_OVERFLOW_ERROR;
2081 }
2082 }
2083
2084 }
2085 else{
2086 /* oops.. the code point is unassingned
2087 * set the error and reason
2088 */
2089
2090 /*check if the char is a First surrogate*/
2091 if(UTF_IS_SURROGATE(sourceChar)) {
2092 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2093 getTrail:
2094 /*look ahead to find the trail surrogate*/
2095 if(source < sourceLimit) {
2096 /* test the following code unit */
2097 UChar trail=(UChar) *source;
2098 if(UTF_IS_SECOND_SURROGATE(trail)) {
2099 source++;
2100 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2101 *err = U_INVALID_CHAR_FOUND;
2102 /* convert this surrogate code point */
2103 /* exit this condition tree */
2104 } else {
2105 /* this is an unmatched lead code unit (1st surrogate) */
2106 /* callback(illegal) */
2107 *err=U_ILLEGAL_CHAR_FOUND;
2108 }
2109 } else {
2110 /* no more input */
2111 *err = U_ZERO_ERROR;
2112 }
2113 } else {
2114 /* this is an unmatched trail code unit (2nd surrogate) */
2115 /* callback(illegal) */
2116 *err=U_ILLEGAL_CHAR_FOUND;
2117 }
2118 } else {
2119 /* callback(unassigned) for a BMP code point */
2120 *err = U_INVALID_CHAR_FOUND;
2121 }
2122
2123 args->converter->fromUChar32=sourceChar;
2124 break;
2125 }
2126 } /* end if(myTargetIndex<myTargetLength) */
2127 else{
2128 *err =U_BUFFER_OVERFLOW_ERROR;
2129 break;
2130 }
2131
2132 }/* end while(mySourceIndex<mySourceLength) */
2133
2134 /*
2135 * the end of the input stream and detection of truncated input
2136 * are handled by the framework, but for ISO-2022-KR conversion
2137 * we need to be in ASCII mode at the very end
2138 *
2139 * conditions:
2140 * successful
2141 * not in ASCII mode
2142 * end of input and no truncated input
2143 */
2144 if( U_SUCCESS(*err) &&
2145 isTargetByteDBCS &&
2146 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2147 ) {
2148 int32_t sourceIndex;
2149
2150 /* we are switching to ASCII */
2151 isTargetByteDBCS=FALSE;
2152
2153 /* get the source index of the last input character */
2154 /*
2155 * TODO this would be simpler and more reliable if we used a pair
2156 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2157 * so that we could simply use the prevSourceIndex here;
2158 * this code gives an incorrect result for the rare case of an unmatched
2159 * trail surrogate that is alone in the last buffer of the text stream
2160 */
2161 sourceIndex=(int32_t)(source-args->source);
2162 if(sourceIndex>0) {
2163 --sourceIndex;
2164 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2165 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2166 ) {
2167 --sourceIndex;
2168 }
2169 } else {
2170 sourceIndex=-1;
2171 }
2172
2173 fromUWriteUInt8(
2174 args->converter,
2175 SHIFT_IN_STR, 1,
2176 &target, (const char *)targetLimit,
2177 &offsets, sourceIndex,
2178 err);
2179 }
2180
2181 /*save the state and return */
2182 args->source = source;
2183 args->target = (char*)target;
2184 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2185 }
2186
2187 /************************ To Unicode ***************************************/
2188
2189 static void
2190 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2191 UErrorCode* err){
2192 char const* sourceStart;
2193 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2194
2195 UConverterToUnicodeArgs subArgs;
2196 int32_t minArgsSize;
2197
2198 /* set up the subconverter arguments */
2199 if(args->size<sizeof(UConverterToUnicodeArgs)) {
2200 minArgsSize = args->size;
2201 } else {
2202 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2203 }
2204
2205 uprv_memcpy(&subArgs, args, minArgsSize);
2206 subArgs.size = (uint16_t)minArgsSize;
2207 subArgs.converter = myData->currentConverter;
2208
2209 /* remember the original start of the input for offsets */
2210 sourceStart = args->source;
2211
2212 if(myData->key != 0) {
2213 /* continue with a partial escape sequence */
2214 goto escape;
2215 }
2216
2217 while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2218 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2219 subArgs.source = args->source;
2220 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2221 if(subArgs.source != subArgs.sourceLimit) {
2222 /*
2223 * get the current partial byte sequence
2224 *
2225 * it needs to be moved between the public and the subconverter
2226 * so that the conversion framework, which only sees the public
2227 * converter, can handle truncated and illegal input etc.
2228 */
2229 if(args->converter->toULength > 0) {
2230 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2231 }
2232 subArgs.converter->toULength = args->converter->toULength;
2233
2234 /*
2235 * Convert up to the end of the input, or to before the next escape character.
2236 * Does not handle conversion extensions because the preToU[] state etc.
2237 * is not copied.
2238 */
2239 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2240
2241 if(args->offsets != NULL && sourceStart != args->source) {
2242 /* update offsets to base them on the actual start of the input */
2243 int32_t *offsets = args->offsets;
2244 UChar *target = args->target;
2245 int32_t delta = (int32_t)(args->source - sourceStart);
2246 while(target < subArgs.target) {
2247 if(*offsets >= 0) {
2248 *offsets += delta;
2249 }
2250 ++offsets;
2251 ++target;
2252 }
2253 }
2254 args->source = subArgs.source;
2255 args->target = subArgs.target;
2256 args->offsets = subArgs.offsets;
2257
2258 /* copy input/error/overflow buffers */
2259 if(subArgs.converter->toULength > 0) {
2260 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2261 }
2262 args->converter->toULength = subArgs.converter->toULength;
2263
2264 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2265 if(subArgs.converter->UCharErrorBufferLength > 0) {
2266 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2267 subArgs.converter->UCharErrorBufferLength);
2268 }
2269 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2270 subArgs.converter->UCharErrorBufferLength = 0;
2271 }
2272 }
2273
2274 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2275 return;
2276 }
2277
2278 escape:
2279 changeState_2022(args->converter,
2280 &(args->source),
2281 args->sourceLimit,
2282 ISO_2022_KR,
2283 err);
2284 }
2285 }
2286
2287 static void
2288 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2289 UErrorCode* err){
2290 char tempBuf[2];
2291 const char *mySource = ( char *) args->source;
2292 UChar *myTarget = args->target;
2293 const char *mySourceLimit = args->sourceLimit;
2294 UChar32 targetUniChar = 0x0000;
2295 UChar mySourceChar = 0x0000;
2296 UConverterDataISO2022* myData;
2297 UConverterSharedData* sharedData ;
2298 UBool useFallback;
2299
2300 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2301 if(myData->version==1){
2302 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2303 return;
2304 }
2305
2306 /* initialize state */
2307 sharedData = myData->currentConverter->sharedData;
2308 useFallback = args->converter->useFallback;
2309
2310 if(myData->key != 0) {
2311 /* continue with a partial escape sequence */
2312 goto escape;
2313 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2314 /* continue with a partial double-byte character */
2315 mySourceChar = args->converter->toUBytes[0];
2316 args->converter->toULength = 0;
2317 goto getTrailByte;
2318 }
2319
2320 while(mySource< mySourceLimit){
2321
2322 if(myTarget < args->targetLimit){
2323
2324 mySourceChar= (unsigned char) *mySource++;
2325
2326 if(mySourceChar==UCNV_SI){
2327 myData->toU2022State.g = 0;
2328 /*consume the source */
2329 continue;
2330 }else if(mySourceChar==UCNV_SO){
2331 myData->toU2022State.g = 1;
2332 /*consume the source */
2333 continue;
2334 }else if(mySourceChar==ESC_2022){
2335 mySource--;
2336 escape:
2337 changeState_2022(args->converter,&(mySource),
2338 mySourceLimit, ISO_2022_KR, err);
2339 if(U_FAILURE(*err)){
2340 args->target = myTarget;
2341 args->source = mySource;
2342 return;
2343 }
2344 continue;
2345 }
2346
2347 if(myData->toU2022State.g == 1) {
2348 if(mySource < mySourceLimit) {
2349 char trailByte;
2350 getTrailByte:
2351 trailByte = *mySource++;
2352 tempBuf[0] = (char)(mySourceChar + 0x80);
2353 tempBuf[1] = (char)(trailByte + 0x80);
2354 mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
2355 if((mySourceChar & 0x8080) == 0) {
2356 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2357 } else {
2358 /* illegal bytes > 0x7f */
2359 targetUniChar = missingCharMarker;
2360 }
2361 } else {
2362 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2363 args->converter->toULength = 1;
2364 break;
2365 }
2366 }
2367 else{
2368 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2369 }
2370 if(targetUniChar < 0xfffe){
2371 if(args->offsets) {
2372 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2373 }
2374 *(myTarget++)=(UChar)targetUniChar;
2375 }
2376 else {
2377 /* Call the callback function*/
2378 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2379 break;
2380 }
2381 }
2382 else{
2383 *err =U_BUFFER_OVERFLOW_ERROR;
2384 break;
2385 }
2386 }
2387 args->target = myTarget;
2388 args->source = mySource;
2389 }
2390
2391 /*************************** END ISO2022-KR *********************************/
2392
2393 /*************************** ISO-2022-CN *********************************
2394 *
2395 * Rules for ISO-2022-CN Encoding:
2396 * i) The designator sequence must appear once on a line before any instance
2397 * of character set it designates.
2398 * ii) If two lines contain characters from the same character set, both lines
2399 * must include the designator sequence.
2400 * iii) Once the designator sequence is known, a shifting sequence has to be found
2401 * to invoke the shifting
2402 * iv) All lines start in ASCII and end in ASCII.
2403 * v) Four shifting sequences are employed for this purpose:
2404 *
2405 * Sequence ASCII Eq Charsets
2406 * ---------- ------- ---------
2407 * SI <SI> US-ASCII
2408 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2409 * SS2 <ESC>N CNS-11643-1992 Plane 2
2410 * SS3 <ESC>O CNS-11643-1992 Planes 3-7
2411 *
2412 * vi)
2413 * SOdesignator : ESC "$" ")" finalchar_for_SO
2414 * SS2designator : ESC "$" "*" finalchar_for_SS2
2415 * SS3designator : ESC "$" "+" finalchar_for_SS3
2416 *
2417 * ESC $ ) A Indicates the bytes following SO are Chinese
2418 * characters as defined in GB 2312-80, until
2419 * another SOdesignation appears
2420 *
2421 *
2422 * ESC $ ) E Indicates the bytes following SO are as defined
2423 * in ISO-IR-165 (for details, see section 2.1),
2424 * until another SOdesignation appears
2425 *
2426 * ESC $ ) G Indicates the bytes following SO are as defined
2427 * in CNS 11643-plane-1, until another
2428 * SOdesignation appears
2429 *
2430 * ESC $ * H Indicates the two bytes immediately following
2431 * SS2 is a Chinese character as defined in CNS
2432 * 11643-plane-2, until another SS2designation
2433 * appears
2434 * (Meaning <ESC>N must precede every 2 byte
2435 * sequence.)
2436 *
2437 * ESC $ + I Indicates the immediate two bytes following SS3
2438 * is a Chinese character as defined in CNS
2439 * 11643-plane-3, until another SS3designation
2440 * appears
2441 * (Meaning <ESC>O must precede every 2 byte
2442 * sequence.)
2443 *
2444 * ESC $ + J Indicates the immediate two bytes following SS3
2445 * is a Chinese character as defined in CNS
2446 * 11643-plane-4, until another SS3designation
2447 * appears
2448 * (In English: <ESC>O must precede every 2 byte
2449 * sequence.)
2450 *
2451 * ESC $ + K Indicates the immediate two bytes following SS3
2452 * is a Chinese character as defined in CNS
2453 * 11643-plane-5, until another SS3designation
2454 * appears
2455 *
2456 * ESC $ + L Indicates the immediate two bytes following SS3
2457 * is a Chinese character as defined in CNS
2458 * 11643-plane-6, until another SS3designation
2459 * appears
2460 *
2461 * ESC $ + M Indicates the immediate two bytes following SS3
2462 * is a Chinese character as defined in CNS
2463 * 11643-plane-7, until another SS3designation
2464 * appears
2465 *
2466 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2467 * has its own designation information before any Chinese characters
2468 * appear
2469 *
2470 */
2471
/*
 * ISO-2022-CN designation sequences, 4 bytes each plus the terminating NUL.
 * They are const arrays rather than macros so that the strings are truly read-only.
 */

/* SO designations: ESC $ ) F */
static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";             /* ESC $ ) A */
static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";             /* ESC $ ) E */
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */

/* SS2 designation: ESC $ * F */
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */

/* SS3 designations: ESC $ + F */
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */
2482
2483 /********************** ISO2022-CN Data **************************/
2484 static const char* const escSeqCharsCN[10] ={
2485 SHIFT_IN_STR, /* ASCII */
2486 GB_2312_80_STR,
2487 ISO_IR_165_STR,
2488 CNS_11643_1992_Plane_1_STR,
2489 CNS_11643_1992_Plane_2_STR,
2490 CNS_11643_1992_Plane_3_STR,
2491 CNS_11643_1992_Plane_4_STR,
2492 CNS_11643_1992_Plane_5_STR,
2493 CNS_11643_1992_Plane_6_STR,
2494 CNS_11643_1992_Plane_7_STR
2495 };
2496
2497 static void
2498 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2499 UConverter *cnv = args->converter;
2500 UConverterDataISO2022 *converterData;
2501 ISO2022State *pFromU2022State;
2502 uint8_t *target = (uint8_t *) args->target;
2503 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2504 const UChar* source = args->source;
2505 const UChar* sourceLimit = args->sourceLimit;
2506 int32_t* offsets = args->offsets;
2507 UChar32 sourceChar;
2508 char buffer[8];
2509 int32_t len;
2510 int8_t choices[3];
2511 int32_t choiceCount;
2512 uint32_t targetValue = 0;
2513 UBool useFallback;
2514
2515 /* set up the state */
2516 converterData = (UConverterDataISO2022*)cnv->extraInfo;
2517 pFromU2022State = &converterData->fromU2022State;
2518
2519 choiceCount = 0;
2520
2521 /* check if the last codepoint of previous buffer was a lead surrogate*/
2522 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2523 goto getTrail;
2524 }
2525
2526 while( source < sourceLimit){
2527 if(target < targetLimit){
2528
2529 sourceChar = *(source++);
2530 /*check if the char is a First surrogate*/
2531 if(UTF_IS_SURROGATE(sourceChar)) {
2532 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2533 getTrail:
2534 /*look ahead to find the trail surrogate*/
2535 if(source < sourceLimit) {
2536 /* test the following code unit */
2537 UChar trail=(UChar) *source;
2538 if(UTF_IS_SECOND_SURROGATE(trail)) {
2539 source++;
2540 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2541 cnv->fromUChar32=0x00;
2542 /* convert this supplementary code point */
2543 /* exit this condition tree */
2544 } else {
2545 /* this is an unmatched lead code unit (1st surrogate) */
2546 /* callback(illegal) */
2547 *err=U_ILLEGAL_CHAR_FOUND;
2548 cnv->fromUChar32=sourceChar;
2549 break;
2550 }
2551 } else {
2552 /* no more input */
2553 cnv->fromUChar32=sourceChar;
2554 break;
2555 }
2556 } else {
2557 /* this is an unmatched trail code unit (2nd surrogate) */
2558 /* callback(illegal) */
2559 *err=U_ILLEGAL_CHAR_FOUND;
2560 cnv->fromUChar32=sourceChar;
2561 break;
2562 }
2563 }
2564
2565 /* do the conversion */
2566 if(sourceChar <= 0x007f ){
2567 /* do not convert SO/SI/ESC */
2568 if(IS_2022_CONTROL(sourceChar)) {
2569 /* callback(illegal) */
2570 *err=U_ILLEGAL_CHAR_FOUND;
2571 cnv->fromUChar32=sourceChar;
2572 break;
2573 }
2574
2575 /* US-ASCII */
2576 if(pFromU2022State->g == 0) {
2577 buffer[0] = (char)sourceChar;
2578 len = 1;
2579 } else {
2580 buffer[0] = UCNV_SI;
2581 buffer[1] = (char)sourceChar;
2582 len = 2;
2583 pFromU2022State->g = 0;
2584 choiceCount = 0;
2585 }
2586 if(sourceChar == CR || sourceChar == LF) {
2587 /* reset the state at the end of a line */
2588 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2589 choiceCount = 0;
2590 }
2591 }
2592 else{
2593 /* convert U+0080..U+10ffff */
2594 int32_t i;
2595 int8_t cs, g;
2596
2597 if(choiceCount == 0) {
2598 /* try the current SO/G1 converter first */
2599 choices[0] = pFromU2022State->cs[1];
2600
2601 /* default to GB2312_1 if none is designated yet */
2602 if(choices[0] == 0) {
2603 choices[0] = GB2312_1;
2604 }
2605
2606 if(converterData->version == 0) {
2607 /* ISO-2022-CN */
2608
2609 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2610 if(choices[0] == GB2312_1) {
2611 choices[1] = (int8_t)CNS_11643_1;
2612 } else {
2613 choices[1] = (int8_t)GB2312_1;
2614 }
2615
2616 choiceCount = 2;
2617 } else {
2618 /* ISO-2022-CN-EXT */
2619
2620 /* try one of the other converters */
2621 switch(choices[0]) {
2622 case GB2312_1:
2623 choices[1] = (int8_t)CNS_11643_1;
2624 choices[2] = (int8_t)ISO_IR_165;
2625 break;
2626 case ISO_IR_165:
2627 choices[1] = (int8_t)GB2312_1;
2628 choices[2] = (int8_t)CNS_11643_1;
2629 break;
2630 default: /* CNS_11643_x */
2631 choices[1] = (int8_t)GB2312_1;
2632 choices[2] = (int8_t)ISO_IR_165;
2633 break;
2634 }
2635
2636 choiceCount = 3;
2637 }
2638 }
2639
2640 cs = g = 0;
2641 /*
2642 * len==0: no mapping found yet
2643 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
2644 * len>0: found a roundtrip result, done
2645 */
2646 len = 0;
2647 /*
2648 * We will turn off useFallback after finding a fallback,
2649 * but we still get fallbacks from PUA code points as usual.
2650 * Therefore, we will also need to check that we don't overwrite
2651 * an early fallback with a later one.
2652 */
2653 useFallback = cnv->useFallback;
2654
2655 for(i = 0; i < choiceCount && len <= 0; ++i) {
2656 int8_t cs0 = choices[i];
2657 if(cs0 > 0) {
2658 uint32_t value;
2659 int32_t len2;
2660 if(cs0 > CNS_11643_0) {
2661 len2 = MBCS_FROM_UCHAR32_ISO2022(
2662 converterData->myConverterArray[CNS_11643],
2663 sourceChar,
2664 &value,
2665 useFallback,
2666 MBCS_OUTPUT_3);
2667 if(len2 == 3 || (len2 == -3 && len == 0)) {
2668 targetValue = value;
2669 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
2670 if(len2 >= 0) {
2671 len = 2;
2672 } else {
2673 len = -2;
2674 useFallback = FALSE;
2675 }
2676 if(cs == CNS_11643_1) {
2677 g = 1;
2678 } else if(cs == CNS_11643_2) {
2679 g = 2;
2680 } else /* plane 3..7 */ if(converterData->version == 1) {
2681 g = 3;
2682 } else {
2683 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
2684 len = 0;
2685 }
2686 }
2687 } else {
2688 /* GB2312_1 or ISO-IR-165 */
2689 len2 = MBCS_FROM_UCHAR32_ISO2022(
2690 converterData->myConverterArray[cs0],
2691 sourceChar,
2692 &value,
2693 useFallback,
2694 MBCS_OUTPUT_2);
2695 if(len2 == 2 || (len2 == -2 && len == 0)) {
2696 targetValue = value;
2697 len = len2;
2698 cs = cs0;
2699 g = 1;
2700 useFallback = FALSE;
2701 }
2702 }
2703 }
2704 }
2705
2706 if(len != 0) {
2707 len = 0; /* count output bytes; it must have been abs(len) == 2 */
2708
2709 /* write the designation sequence if necessary */
2710 if(cs != pFromU2022State->cs[g]) {
2711 if(cs < CNS_11643) {
2712 uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
2713 } else {
2714 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
2715 }
2716 len = 4;
2717 pFromU2022State->cs[g] = cs;
2718 if(g == 1) {
2719 /* changing the SO/G1 charset invalidates the choices[] */
2720 choiceCount = 0;
2721 }
2722 }
2723
2724 /* write the shift sequence if necessary */
2725 if(g != pFromU2022State->g) {
2726 switch(g) {
2727 case 1:
2728 buffer[len++] = UCNV_SO;
2729
2730 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
2731 pFromU2022State->g = 1;
2732 break;
2733 case 2:
2734 buffer[len++] = 0x1b;
2735 buffer[len++] = 0x4e;
2736 break;
2737 default: /* case 3 */
2738 buffer[len++] = 0x1b;
2739 buffer[len++] = 0x4f;
2740 break;
2741 }
2742 }
2743
2744 /* write the two output bytes */
2745 buffer[len++] = (char)(targetValue >> 8);
2746 buffer[len++] = (char)targetValue;
2747 } else {
2748 /* if we cannot find the character after checking all codepages
2749 * then this is an error
2750 */
2751 *err = U_INVALID_CHAR_FOUND;
2752 cnv->fromUChar32=sourceChar;
2753 break;
2754 }
2755 }
2756
2757 /* output len>0 bytes in buffer[] */
2758 if(len == 1) {
2759 *target++ = buffer[0];
2760 if(offsets) {
2761 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
2762 }
2763 } else if(len == 2 && (target + 2) <= targetLimit) {
2764 *target++ = buffer[0];
2765 *target++ = buffer[1];
2766 if(offsets) {
2767 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
2768 *offsets++ = sourceIndex;
2769 *offsets++ = sourceIndex;
2770 }
2771 } else {
2772 fromUWriteUInt8(
2773 cnv,
2774 buffer, len,
2775 &target, (const char *)targetLimit,
2776 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
2777 err);
2778 if(U_FAILURE(*err)) {
2779 break;
2780 }
2781 }
2782 } /* end if(myTargetIndex<myTargetLength) */
2783 else{
2784 *err =U_BUFFER_OVERFLOW_ERROR;
2785 break;
2786 }
2787
2788 }/* end while(mySourceIndex<mySourceLength) */
2789
2790 /*
2791 * the end of the input stream and detection of truncated input
2792 * are handled by the framework, but for ISO-2022-CN conversion
2793 * we need to be in ASCII mode at the very end
2794 *
2795 * conditions:
2796 * successful
2797 * not in ASCII mode
2798 * end of input and no truncated input
2799 */
2800 if( U_SUCCESS(*err) &&
2801 pFromU2022State->g!=0 &&
2802 args->flush && source>=sourceLimit && cnv->fromUChar32==0
2803 ) {
2804 int32_t sourceIndex;
2805
2806 /* we are switching to ASCII */
2807 pFromU2022State->g=0;
2808
2809 /* get the source index of the last input character */
2810 /*
2811 * TODO this would be simpler and more reliable if we used a pair
2812 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2813 * so that we could simply use the prevSourceIndex here;
2814 * this code gives an incorrect result for the rare case of an unmatched
2815 * trail surrogate that is alone in the last buffer of the text stream
2816 */
2817 sourceIndex=(int32_t)(source-args->source);
2818 if(sourceIndex>0) {
2819 --sourceIndex;
2820 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2821 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2822 ) {
2823 --sourceIndex;
2824 }
2825 } else {
2826 sourceIndex=-1;
2827 }
2828
2829 fromUWriteUInt8(
2830 cnv,
2831 SHIFT_IN_STR, 1,
2832 &target, (const char *)targetLimit,
2833 &offsets, sourceIndex,
2834 err);
2835 }
2836
2837 /*save the state and return */
2838 args->source = source;
2839 args->target = (char*)target;
2840 }
2841
2842
/*
 * Converts ISO-2022-CN bytes from args->source into UTF-16 code units at
 * args->target, handling one control byte, escape sequence, single byte or
 * byte pair per loop iteration. When args->offsets is non-NULL, the source
 * index of the first byte of each character is stored for every UChar written.
 *
 * State carried between calls (as used in this function):
 *  - myData->key != 0: an escape sequence is incomplete; processing resumes
 *    at the escape label by calling changeState_2022() again.
 *  - args->converter->toULength == 1: the lead byte of a double-byte
 *    character is saved in toUBytes[0]; processing resumes at getTrailByte,
 *    but only when there is both input and room in the target.
 *  - pToU2022State->g indexes pToU2022State->cs[]; g == 0 selects the
 *    single-byte path, which accepts only values <= 0x7f.
 *
 * On return args->source and args->target are advanced to the positions
 * reached. *err is set to U_BUFFER_OVERFLOW_ERROR when input remains but the
 * target is full; a failure reported by changeState_2022() is returned as-is.
 */
2843 static void
2844 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2845 UErrorCode* err){
2846 char tempBuf[3];
2847 const char *mySource = (char *) args->source;
2848 UChar *myTarget = args->target;
2849 const char *mySourceLimit = args->sourceLimit;
2850 uint32_t targetUniChar = 0x0000;
2851 uint32_t mySourceChar = 0x0000;
2852 UConverterDataISO2022* myData;
2853 ISO2022State *pToU2022State;
2854
2855 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2856 pToU2022State = &myData->toU2022State;
2857
/* resume work that an earlier call had to leave unfinished */
2858 if(myData->key != 0) {
2859 /* continue with a partial escape sequence */
2860 goto escape;
2861 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2862 /* continue with a partial double-byte character */
2863 mySourceChar = args->converter->toUBytes[0];
2864 args->converter->toULength = 0;
2865 goto getTrailByte;
2866 }
2867
2868 while(mySource < mySourceLimit){
2869
/* stays at missingCharMarker unless a mapping is found below */
2870 targetUniChar =missingCharMarker;
2871
2872 if(myTarget < args->targetLimit){
2873
2874 mySourceChar= (unsigned char) *mySource++;
2875
2876 switch(mySourceChar){
/* SI: select slot 0, i.e. the single-byte path in the default case */
2877 case UCNV_SI:
2878 pToU2022State->g=0;
2879 continue;
2880
/* SO: select slot 1, accepted only when cs[1] is non-zero */
2881 case UCNV_SO:
2882 if(pToU2022State->cs[1] != 0) {
2883 pToU2022State->g=1;
2884 continue;
2885 } else {
2886 /* illegal to have SO before a matching designator */
/* targetUniChar is still missingCharMarker, so the callback branch below runs */
2887 break;
2888 }
2889
2890 case ESC_2022:
/* step back so that changeState_2022() starts at the ESC byte itself */
2891 mySource--;
2892 escape:
2893 changeState_2022(args->converter,&(mySource),
2894 mySourceLimit, ISO_2022_CN,err);
2895
2896 /* invalid or illegal escape sequence */
2897 if(U_FAILURE(*err)){
2898 args->target = myTarget;
2899 args->source = mySource;
2900 return;
2901 }
2902 continue;
2903
2904 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
2905
2906 case CR:
2907 /*falls through*/
2908 case LF:
/* a line end zeroes the whole ISO2022State (including g and cs[]); the CR/LF byte itself is then converted by the default case */
2909 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
2910 /* falls through */
2911 default:
2912 /* convert one or two bytes */
2913 if(pToU2022State->g != 0) {
2914 if(mySource < mySourceLimit) {
2915 UConverterSharedData *cnv;
2916 StateEnum tempState;
2917 int32_t tempBufLen;
2918 char trailByte;
2919 getTrailByte:
2920 trailByte = *mySource++;
2921 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
/*
 * States above CNS_11643_0 all use the single CNS_11643 table, which is
 * addressed with three bytes: 0x80 + (state - CNS_11643_0), lead, trail.
 * Every other state indexes its own table with the two input bytes.
 */
2922 if(tempState > CNS_11643_0) {
2923 cnv = myData->myConverterArray[CNS_11643];
2924 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
2925 tempBuf[1] = (char) (mySourceChar);
2926 tempBuf[2] = trailByte;
2927 tempBufLen = 3;
2928
2929 }else{
2930 cnv = myData->myConverterArray[tempState];
2931 tempBuf[0] = (char) (mySourceChar);
2932 tempBuf[1] = trailByte;
2933 tempBufLen = 2;
2934 }
/* combine both bytes; a value > 0xff is what later selects the 2-byte offset adjustment */
2935 mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
2936 if(pToU2022State->g>=2) {
2937 /* return from a single-shift state to the previous one */
2938 pToU2022State->g=pToU2022State->prevG;
2939 }
2940 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
2941 } else {
/* the lead byte is the last input byte: save it and leave; the next call resumes at getTrailByte */
2942 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2943 args->converter->toULength = 1;
2944 goto endloop;
2945 }
2946 }
2947 else{
/* single-byte path: values up to 0x7f map to themselves, anything else stays unmapped */
2948 if(mySourceChar <= 0x7f) {
2949 targetUniChar = (UChar) mySourceChar;
2950 }
2951 }
2952 break;
2953 }
/* result below 0xfffe: write one UChar; the offset is the index of the character's first source byte */
2954 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2955 if(args->offsets){
2956 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2957 }
2958 *(myTarget++)=(UChar)targetUniChar;
2959 }
2960 else if(targetUniChar > missingCharMarker){
2961 /* disassemble the surrogate pair and write to output*/
2962 targetUniChar-=0x0010000;
2963 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2964 if(args->offsets){
2965 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2966 }
2967 ++myTarget;
2968 if(myTarget< args->targetLimit){
2969 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2970 if(args->offsets){
2971 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2972 }
2973 ++myTarget;
2974 }else{
/* no room left for the trail surrogate: append it to the converter's UCharErrorBuffer instead */
2975 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2976 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2977 }
2978
2979 }
2980 else{
2981 /* Call the callback function*/
/* reached for the two remaining values (missingCharMarker-1 and missingCharMarker); the loop stops afterwards */
2982 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2983 break;
2984 }
2985 }
2986 else{
/* input remains but the target is full */
2987 *err =U_BUFFER_OVERFLOW_ERROR;
2988 break;
2989 }
2990 }
2991 endloop:
/* publish the positions reached back to the caller */
2992 args->target = myTarget;
2993 args->source = mySource;
2994 }
2995
2996 static void
2997 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
2998 UConverter *cnv = args->converter;
2999 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3000 ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3001 char *p, *subchar;
3002 char buffer[8];
3003 int32_t length;
3004
3005 subchar=(char *)cnv->subChars;
3006 length=cnv->subCharLen; /* assume length==1 for most variants */
3007
3008 p = buffer;
3009 switch(myConverterData->locale[0]){
3010 case 'j':
3011 {
3012 int8_t cs;
3013
3014 if(pFromU2022State->g == 1) {
3015 /* JIS7: switch from G1 to G0 */
3016 pFromU2022State->g = 0;
3017 *p++ = UCNV_SI;
3018 }
3019
3020 cs = pFromU2022State->cs[0];
3021 if(cs != ASCII && cs != JISX201) {
3022 /* not in ASCII or JIS X 0201: switch to ASCII */
3023 pFromU2022State->cs[0] = (int8_t)ASCII;
3024 *p++ = '\x1b';
3025 *p++ = '\x28';
3026 *p++ = '\x42';
3027 }
3028
3029 *p++ = subchar[0];
3030 break;
3031 }
3032 case 'c':
3033 if(pFromU2022State->g != 0) {
3034 /* not in ASCII mode: switch to ASCII */
3035 pFromU2022State->g = 0;
3036 *p++ = UCNV_SI;
3037 }
3038 *p++ = subchar[0];
3039 break;
3040 case 'k':
3041 if(myConverterData->version == 0) {
3042 if(length == 1) {
3043 if((UBool)args->converter->fromUnicodeStatus) {
3044 /* in DBCS mode: switch to SBCS */
3045 args->converter->fromUnicodeStatus = 0;
3046 *p++ = UCNV_SI;
3047 }
3048 *p++ = subchar[0];
3049 } else /* length == 2*/ {
3050 if(!(UBool)args->converter->fromUnicodeStatus) {
3051 /* in SBCS mode: switch to DBCS */
3052 args->converter->fromUnicodeStatus = 1;
3053 *p++ = UCNV_SO;
3054 }
3055 *p++ = subchar[0];
3056 *p++ = subchar[1];
3057 }
3058 break;
3059 } else {
3060 /* save the subconverter's substitution string */
3061 uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3062 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3063
3064 /* set our substitution string into the subconverter */
3065 myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3066 myConverterData->currentConverter->subCharLen = (int8_t)length;
3067
3068 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3069 args->converter = myConverterData->currentConverter;
3070 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3071 ucnv_cbFromUWriteSub(args, 0, err);
3072 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3073 args->converter = cnv;
3074
3075 /* restore the subconverter's substitution string */
3076 myConverterData->currentConverter->subChars = currentSubChars;
3077 myConverterData->currentConverter->subCharLen = currentSubCharLen;
3078
3079 if(*err == U_BUFFER_OVERFLOW_ERROR) {
3080 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3081 uprv_memcpy(
3082 cnv->charErrorBuffer,
3083 myConverterData->currentConverter->charErrorBuffer,
3084 myConverterData->currentConverter->charErrorBufferLength);
3085 }
3086 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3087 myConverterData->currentConverter->charErrorBufferLength = 0;
3088 }
3089 return;
3090 }
3091 default:
3092 /* not expected */
3093 break;
3094 }
3095 ucnv_cbFromUWriteBytes(args,
3096 buffer, (int32_t)(p - buffer),
3097 offsetIndex, err);
3098 }
3099
3100 /*
3101 * Structure for cloning an ISO 2022 converter into a single memory block.
3102 * ucnv_safeClone() of the converter will align the entire cloneStruct,
3103 * and then ucnv_safeClone() of the sub-converter may additionally align
3104 * currentConverter inside the cloneStruct, for which we need the deadSpace
3105 * after currentConverter.
3106 * This is because UAlignedMemory may be larger than the actually
3107 * necessary alignment size for the platform.
3108 * The other cloneStruct fields will not be moved around,
3109 * and are aligned properly with cloneStruct's alignment.
3110 */
/*
 * Layout of the single memory block used by _ISO_2022_SafeClone(), which casts
 * its stackBuffer argument to this type and reports sizeof(struct cloneStruct)
 * as the required size. NOTE(review): the member order appears to be
 * significant (cnv first, deadSpace directly after currentConverter) - do not
 * reorder without checking ucnv_safeClone().
 */
3111 struct cloneStruct
3112 {
/* the cloned ISO-2022 converter itself; _ISO_2022_SafeClone() returns its address */
3113 UConverter cnv;
/* storage handed to ucnv_safeClone() for the clone of the current sub-converter */
3114 UConverter currentConverter;
/* slack counted into the size passed to ucnv_safeClone() for the sub-converter clone */
3115 UAlignedMemory deadSpace;
/* private copy of the source converter's UConverterDataISO2022 (cnv.extraInfo points here) */
3116 UConverterDataISO2022 mydata;
3117 };
3118
3119
3120 static UConverter *
3121 _ISO_2022_SafeClone(
3122 const UConverter *cnv,
3123 void *stackBuffer,
3124 int32_t *pBufferSize,
3125 UErrorCode *status)
3126 {
3127 struct cloneStruct * localClone;
3128 UConverterDataISO2022 *cnvData;
3129 int32_t i, size;
3130
3131 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3132 *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3133 return NULL;
3134 }
3135
3136 cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3137 localClone = (struct cloneStruct *)stackBuffer;
3138
3139 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3140
3141 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3142 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3143 localClone->cnv.isExtraLocal = TRUE;
3144
3145 /* share the subconverters */
3146
3147 if(cnvData->currentConverter != NULL) {
3148 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3149 localClone->mydata.currentConverter =
3150 ucnv_safeClone(cnvData->currentConverter,
3151 &localClone->currentConverter,
3152 &size, status);
3153 if(U_FAILURE(*status)) {
3154 return NULL;
3155 }
3156 }
3157
3158 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3159 if(cnvData->myConverterArray[i] != NULL) {
3160 ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3161 }
3162 }
3163
3164 return &localClone->cnv;
3165 }
3166
3167 static void
3168 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
3169 const USetAdder *sa,
3170 UConverterUnicodeSet which,
3171 UErrorCode *pErrorCode)
3172 {
3173 int32_t i;
3174 UConverterDataISO2022* cnvData;
3175
3176 if (U_FAILURE(*pErrorCode)) {
3177 return;
3178 }
3179 #ifdef U_ENABLE_GENERIC_ISO_2022
3180 if (cnv->sharedData == &_ISO2022Data) {
3181 /* We use UTF-8 in this case */
3182 sa->addRange(sa->set, 0, 0xd7FF);
3183 sa->addRange(sa->set, 0xE000, 0x10FFFF);
3184 return;
3185 }
3186 #endif
3187
3188 cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3189
3190 /* open a set and initialize it with code points that are algorithmically round-tripped */
3191 switch(cnvData->locale[0]){
3192 case 'j':
3193 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3194 /* include Latin-1 for some variants of JP */
3195 sa->addRange(sa->set, 0, 0xff);
3196 } else {
3197 /* include ASCII for JP */
3198 sa->addRange(sa->set, 0, 0x7f);
3199 }
3200 if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) {
3201 /* include half-width Katakana for JP */
3202 sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3203 }
3204 break;
3205 case 'c':
3206 case 'z':
3207 /* include ASCII for CN */
3208 sa->addRange(sa->set, 0, 0x7f);
3209 break;
3210 case 'k':
3211 /* there is only one converter for KR, and it is not in the myConverterArray[] */
3212 cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3213 cnvData->currentConverter, sa, which, pErrorCode);
3214 /* the loop over myConverterArray[] will simply not find another converter */
3215 break;
3216 default:
3217 break;
3218 }
3219
3220 /*
3221 * Version-specific for CN:
3222 * CN version 0 does not map CNS planes 3..7 although
3223 * they are all available in the CNS conversion table;
3224 * CN version 1 does map them all.
3225 * The two versions create different Unicode sets.
3226 */
3227 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3228 if(cnvData->myConverterArray[i]!=NULL) {
3229 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3230 cnvData->version==0 && i==CNS_11643
3231 ) {
3232 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3233 ucnv_MBCSGetUnicodeSetForBytes(
3234 cnvData->myConverterArray[i],
3235 sa, UCNV_ROUNDTRIP_SET,
3236 0, 0x81, 0x82,
3237 pErrorCode);
3238 } else {
3239 ucnv_MBCSGetUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, pErrorCode);
3240 }
3241 }
3242 }
3243
3244 /*
3245 * ISO 2022 converters must not convert SO/SI/ESC despite what
3246 * sub-converters do by themselves.
3247 * Remove these characters from the set.
3248 */
3249 sa->remove(sa->set, 0x0e);
3250 sa->remove(sa->set, 0x0f);
3251 sa->remove(sa->set, 0x1b);
3252 }
3253
/*
 * Implementation table for the generic ISO-2022 converter.
 * The initializers are positional. The slot descriptions below are inferred
 * from the names of the functions installed here and in the JP/KR/CN tables;
 * confirm them against the UConverterImpl declaration before relying on them.
 * Conversion functions are only installed when U_ENABLE_GENERIC_ISO_2022 is
 * defined; otherwise all conversion slots are NULL.
 */
3254 static const UConverterImpl _ISO2022Impl={
3255 UCNV_ISO_2022,
3256
3257 NULL,
3258 NULL,
3259
/* open / close / reset, shared by all ISO-2022 tables in this file */
3260 _ISO2022Open,
3261 _ISO2022Close,
3262 _ISO2022Reset,
3263
/* presumably toUnicode, toUnicode-with-offsets, fromUnicode, fromUnicode-with-offsets */
3264 #ifdef U_ENABLE_GENERIC_ISO_2022
3265 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3266 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3267 ucnv_fromUnicode_UTF8,
3268 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3269 #else
3270 NULL,
3271 NULL,
3272 NULL,
3273 NULL,
3274 #endif
3275 NULL,
3276
/* remaining slots: one unused entry, then name, substitution, clone and Unicode-set hooks */
3277 NULL,
3278 _ISO2022getName,
3279 _ISO_2022_WriteSub,
3280 _ISO_2022_SafeClone,
3281 _ISO_2022_GetUnicodeSet
3282 };
/*
 * Static description of the generic ISO-2022 converter (positional
 * initializers; see the UConverterStaticData declaration for field order).
 * Visible here: the structure's own size, the name "ISO_2022", and a maximum
 * of 3 bytes per UChar as noted on that line.
 * NOTE(review): 2022 looks like a codepage number, and { 0x1a, 0, 0, 0 }
 * followed by 1 looks like the substitution bytes and their length - confirm.
 */
3283 static const UConverterStaticData _ISO2022StaticData={
3284 sizeof(UConverterStaticData),
3285 "ISO_2022",
3286 2022,
3287 UCNV_IBM,
3288 UCNV_ISO_2022,
3289 1,
3290 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3291 { 0x1a, 0, 0, 0 },
3292 1,
3293 FALSE,
3294 FALSE,
3295 0,
3296 0,
3297 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3298 };
/*
 * Shared-data record for the generic ISO-2022 converter: it ties
 * _ISO2022StaticData to the _ISO2022Impl function table.
 * Unlike the JP/KR/CN records below it is not declared static;
 * _ISO_2022_GetUnicodeSet() compares cnv->sharedData against its address.
 * NOTE(review): ~((uint32_t) 0) is presumably a reference counter value that
 * marks the record as never freed - confirm against UConverterSharedData.
 */
3299 const UConverterSharedData _ISO2022Data={
3300 sizeof(UConverterSharedData),
3301 ~((uint32_t) 0),
3302 NULL,
3303 NULL,
3304 &_ISO2022StaticData,
3305 FALSE,
3306 &_ISO2022Impl,
3307 0
3308 };
3309
3310 /*************JP****************/
/*
 * Implementation table for ISO-2022-JP (positional initializers).
 * It combines the open/close/reset, name, substitution, clone and Unicode-set
 * functions used by every ISO-2022 table in this file with the JP-specific
 * conversion functions. Each conversion function is listed twice in a row;
 * NOTE(review): presumably once for the plain slot and once for the
 * with-offsets slot - confirm against the UConverterImpl declaration.
 */
3311 static const UConverterImpl _ISO2022JPImpl={
3312 UCNV_ISO_2022,
3313
3314 NULL,
3315 NULL,
3316
3317 _ISO2022Open,
3318 _ISO2022Close,
3319 _ISO2022Reset,
3320
3321 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3322 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3323 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3324 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3325 NULL,
3326
3327 NULL,
3328 _ISO2022getName,
3329 _ISO_2022_WriteSub,
3330 _ISO_2022_SafeClone,
3331 _ISO_2022_GetUnicodeSet
3332 };
/*
 * Static description of the ISO-2022-JP converter (positional initializers;
 * see the UConverterStaticData declaration for field order).
 * Visible here: the structure's own size, the name "ISO_2022_JP", and a
 * maximum of 6 bytes per UChar (4-byte escape sequence plus a DBCS pair).
 * NOTE(review): { 0x1a, 0, 0, 0 } followed by 1 looks like the substitution
 * bytes and their length - confirm.
 */
3333 static const UConverterStaticData _ISO2022JPStaticData={
3334 sizeof(UConverterStaticData),
3335 "ISO_2022_JP",
3336 0,
3337 UCNV_IBM,
3338 UCNV_ISO_2022,
3339 1,
3340 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3341 { 0x1a, 0, 0, 0 },
3342 1,
3343 FALSE,
3344 FALSE,
3345 0,
3346 0,
3347 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3348 };
/*
 * Shared-data record for ISO-2022-JP: it ties _ISO2022JPStaticData to the
 * _ISO2022JPImpl function table.
 * NOTE(review): ~((uint32_t) 0) is presumably a reference counter value that
 * marks the record as never freed - confirm against UConverterSharedData.
 */
3349 static const UConverterSharedData _ISO2022JPData={
3350 sizeof(UConverterSharedData),
3351 ~((uint32_t) 0),
3352 NULL,
3353 NULL,
3354 &_ISO2022JPStaticData,
3355 FALSE,
3356 &_ISO2022JPImpl,
3357 0
3358 };
3359
3360 /************* KR ***************/
/*
 * Implementation table for ISO-2022-KR (positional initializers).
 * It combines the open/close/reset, name, substitution, clone and Unicode-set
 * functions used by every ISO-2022 table in this file with the KR-specific
 * conversion functions. Each conversion function is listed twice in a row;
 * NOTE(review): presumably once for the plain slot and once for the
 * with-offsets slot - confirm against the UConverterImpl declaration.
 */
3361 static const UConverterImpl _ISO2022KRImpl={
3362 UCNV_ISO_2022,
3363
3364 NULL,
3365 NULL,
3366
3367 _ISO2022Open,
3368 _ISO2022Close,
3369 _ISO2022Reset,
3370
3371 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3372 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3373 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3374 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3375 NULL,
3376
3377 NULL,
3378 _ISO2022getName,
3379 _ISO_2022_WriteSub,
3380 _ISO_2022_SafeClone,
3381 _ISO_2022_GetUnicodeSet
3382 };
/*
 * Static description of the ISO-2022-KR converter (positional initializers;
 * see the UConverterStaticData declaration for field order).
 * Visible here: the structure's own size, the name "ISO_2022_KR", and a
 * maximum of 3 bytes per UChar (SO plus a DBCS pair).
 * NOTE(review): { 0x1a, 0, 0, 0 } followed by 1 looks like the substitution
 * bytes and their length - confirm.
 */
3383 static const UConverterStaticData _ISO2022KRStaticData={
3384 sizeof(UConverterStaticData),
3385 "ISO_2022_KR",
3386 0,
3387 UCNV_IBM,
3388 UCNV_ISO_2022,
3389 1,
3390 3, /* max 3 bytes per UChar: SO+DBCS */
3391 { 0x1a, 0, 0, 0 },
3392 1,
3393 FALSE,
3394 FALSE,
3395 0,
3396 0,
3397 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3398 };
/*
 * Shared-data record for ISO-2022-KR: it ties _ISO2022KRStaticData to the
 * _ISO2022KRImpl function table.
 * NOTE(review): ~((uint32_t) 0) is presumably a reference counter value that
 * marks the record as never freed - confirm against UConverterSharedData.
 */
3399 static const UConverterSharedData _ISO2022KRData={
3400 sizeof(UConverterSharedData),
3401 ~((uint32_t) 0),
3402 NULL,
3403 NULL,
3404 &_ISO2022KRStaticData,
3405 FALSE,
3406 &_ISO2022KRImpl,
3407 0
3408 };
3409
3410 /*************** CN ***************/
/*
 * Implementation table for ISO-2022-CN (positional initializers).
 * It combines the open/close/reset, name, substitution, clone and Unicode-set
 * functions used by every ISO-2022 table in this file with the CN-specific
 * conversion functions. Each conversion function is listed twice in a row;
 * NOTE(review): presumably once for the plain slot and once for the
 * with-offsets slot - confirm against the UConverterImpl declaration.
 */
3411 static const UConverterImpl _ISO2022CNImpl={
3412
3413 UCNV_ISO_2022,
3414
3415 NULL,
3416 NULL,
3417
3418 _ISO2022Open,
3419 _ISO2022Close,
3420 _ISO2022Reset,
3421
3422 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3423 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3424 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3425 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3426 NULL,
3427
3428 NULL,
3429 _ISO2022getName,
3430 _ISO_2022_WriteSub,
3431 _ISO_2022_SafeClone,
3432 _ISO_2022_GetUnicodeSet
3433 };
/*
 * Static description of the ISO-2022-CN converter (positional initializers;
 * see the UConverterStaticData declaration for field order).
 * Visible here: the structure's own size, the name "ISO_2022_CN", and a
 * maximum of 8 bytes per UChar (4-byte CNS designator, 2 bytes for SS2/SS3
 * and a DBCS pair).
 * NOTE(review): { 0x1a, 0, 0, 0 } followed by 1 looks like the substitution
 * bytes and their length - confirm.
 */
3434 static const UConverterStaticData _ISO2022CNStaticData={
3435 sizeof(UConverterStaticData),
3436 "ISO_2022_CN",
3437 0,
3438 UCNV_IBM,
3439 UCNV_ISO_2022,
3440 1,
3441 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3442 { 0x1a, 0, 0, 0 },
3443 1,
3444 FALSE,
3445 FALSE,
3446 0,
3447 0,
3448 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3449 };
/*
 * Shared-data record for ISO-2022-CN: it ties _ISO2022CNStaticData to the
 * _ISO2022CNImpl function table.
 * NOTE(review): ~((uint32_t) 0) is presumably a reference counter value that
 * marks the record as never freed - confirm against UConverterSharedData.
 */
3450 static const UConverterSharedData _ISO2022CNData={
3451 sizeof(UConverterSharedData),
3452 ~((uint32_t) 0),
3453 NULL,
3454 NULL,
3455 &_ISO2022CNStaticData,
3456 FALSE,
3457 &_ISO2022CNImpl,
3458 0
3459 };
3460
3461
3462
3463 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */