2 **********************************************************************
3 * Copyright (C) 2000-2006, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
8 * tab size: 8 (not used)
11 * created on: 2000oct16
12 * created by: Ram Viswanadha
13 * 10/31/2000 Ram Implemented offsets logic function
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
22 #include "unicode/ucnv.h"
23 #include "unicode/ucnv_cb.h"
24 #include "unicode/uset.h"
28 #define UCNV_TILDE 0x7E /* ~ */
29 #define UCNV_OPEN_BRACE 0x7B /* { */
30 #define UCNV_CLOSE_BRACE 0x7D /* } */
31 #define SB_ESCAPE "\x7E\x7D" /* "~}": escape-from-GB sequence, switches the mode from GB back to ASCII */
32 #define DB_ESCAPE "\x7E\x7B" /* "~{": escape-to-GB sequence, switches the mode from ASCII to GB */
33 #define TILDE_ESCAPE "\x7E\x7E" /* "~~": escape sequence that stands for a single literal '~' */
37 #define CONCAT_ESCAPE_MACRO( args, targetIndex,targetLength,strToAppend, err, len,sourceIndex){ \
39 if(targetIndex < targetLength){ \
40 args->target[targetIndex] = (unsigned char) *strToAppend; \
41 if(args->offsets!=NULL){ \
42 *(offsets++) = sourceIndex-1; \
47 args->converter->charErrorBuffer[(int)args->converter->charErrorBufferLength++] = (unsigned char) *strToAppend; \
48 *err =U_BUFFER_OVERFLOW_ERROR; \
56 UConverter
* gbConverter
;
59 UBool isEscapeAppended
;
61 UBool isTargetUCharDBCS
;
67 _HZOpen(UConverter
*cnv
, const char *name
,const char *locale
,uint32_t options
, UErrorCode
*errorCode
){
68 cnv
->toUnicodeStatus
= 0;
69 cnv
->fromUnicodeStatus
= 0;
71 cnv
->fromUChar32
=0x0000;
72 cnv
->extraInfo
= uprv_malloc(sizeof(UConverterDataHZ
));
73 if(cnv
->extraInfo
!= NULL
){
74 uprv_memset(cnv
->extraInfo
, 0, sizeof(UConverterDataHZ
));
75 ((UConverterDataHZ
*)cnv
->extraInfo
)->gbConverter
= ucnv_open("ibm-1386",errorCode
);
78 *errorCode
= U_MEMORY_ALLOCATION_ERROR
;
84 _HZClose(UConverter
*cnv
){
85 if(cnv
->extraInfo
!= NULL
) {
86 ucnv_close (((UConverterDataHZ
*) (cnv
->extraInfo
))->gbConverter
);
87 if(!cnv
->isExtraLocal
) {
88 uprv_free(cnv
->extraInfo
);
90 cnv
->extraInfo
= NULL
;
95 _HZReset(UConverter
*cnv
, UConverterResetChoice choice
){
96 if(choice
<=UCNV_RESET_TO_UNICODE
) {
97 cnv
->toUnicodeStatus
= 0;
99 if(cnv
->extraInfo
!= NULL
){
100 ((UConverterDataHZ
*)cnv
->extraInfo
)->isStateDBCS
= FALSE
;
103 if(choice
!=UCNV_RESET_TO_UNICODE
) {
104 cnv
->fromUnicodeStatus
= 0;
105 cnv
->fromUChar32
=0x0000;
106 if(cnv
->extraInfo
!= NULL
){
107 ((UConverterDataHZ
*)cnv
->extraInfo
)->isEscapeAppended
= FALSE
;
108 ((UConverterDataHZ
*)cnv
->extraInfo
)->targetIndex
= 0;
109 ((UConverterDataHZ
*)cnv
->extraInfo
)->sourceIndex
= 0;
110 ((UConverterDataHZ
*)cnv
->extraInfo
)->isTargetUCharDBCS
= FALSE
;
115 /**************************************HZ Encoding*************************************************
116 * Rules for HZ encoding
118 * In ASCII mode, a byte is interpreted as an ASCII character, unless a
119 * '~' is encountered. The character '~' is an escape character. By
120 * convention, it must be immediately followed ONLY by '~', '{' or '\n'
121 * (<LF>), with the following special meaning.
123 * 1. The escape sequence '~~' is interpreted as a '~'.
124 * 2. The escape-to-GB sequence '~{' switches the mode from ASCII to GB.
125 * 3. The escape sequence '~\n' is a line-continuation marker to be
126 * consumed with no output produced.
127 * In GB mode, characters are interpreted two bytes at a time as (pure)
128 * GB codes until the escape-from-GB code '~}' is read. This code
129 * switches the mode from GB back to ASCII. (Note that the escape-
130 * from-GB code '~}' ($7E7D) is outside the defined GB range.)
137 UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs
*args
,
140 const char *mySource
= ( char *) args
->source
;
141 UChar
*myTarget
= args
->target
;
142 const char *mySourceLimit
= args
->sourceLimit
;
143 UChar32 targetUniChar
= 0x0000;
144 UChar mySourceChar
= 0x0000;
145 UConverterDataHZ
* myData
=(UConverterDataHZ
*)(args
->converter
->extraInfo
);
148 if ((args
->converter
== NULL
) || (args
->targetLimit
< args
->target
) || (mySourceLimit
< args
->source
)){
149 *err
= U_ILLEGAL_ARGUMENT_ERROR
;
153 while(mySource
< mySourceLimit
){
155 if(myTarget
< args
->targetLimit
){
157 mySourceChar
= (unsigned char) *mySource
++;
159 switch(mySourceChar
){
161 if(args
->converter
->mode
==UCNV_TILDE
){
162 args
->converter
->mode
=0;
165 *(myTarget
++)=(UChar
)mySourceChar
;
169 if(args
->converter
->mode
==UCNV_TILDE
){
170 *(myTarget
++)=(UChar
)mySourceChar
;
171 args
->converter
->mode
=0;
175 else if(args
->converter
->toUnicodeStatus
!=0){
176 args
->converter
->mode
=0;
180 args
->converter
->mode
= UCNV_TILDE
;
185 case UCNV_OPEN_BRACE
:
186 if(args
->converter
->mode
== UCNV_TILDE
){
187 args
->converter
->mode
=0;
188 myData
->isStateDBCS
= TRUE
;
196 case UCNV_CLOSE_BRACE
:
197 if(args
->converter
->mode
== UCNV_TILDE
){
198 args
->converter
->mode
=0;
199 myData
->isStateDBCS
= FALSE
;
207 /* if the first byte is equal to TILDE and the trail byte
208 * is not a valid byte then it is an error condition
210 if(args
->converter
->mode
== UCNV_TILDE
){
211 args
->converter
->mode
=0;
212 mySourceChar
= (UChar
)(((UCNV_TILDE
+0x80) << 8) | ((mySourceChar
& 0x00ff)+0x80));
220 if(myData
->isStateDBCS
){
221 if(args
->converter
->toUnicodeStatus
== 0x00){
222 args
->converter
->toUnicodeStatus
= (UChar
) mySourceChar
;
226 tempBuf
[0] = (char) (args
->converter
->toUnicodeStatus
+0x80) ;
227 tempBuf
[1] = (char) (mySourceChar
+0x80);
228 mySourceChar
= (UChar
)(((args
->converter
->toUnicodeStatus
+0x80) << 8) | ((mySourceChar
& 0x00ff)+0x80));
229 args
->converter
->toUnicodeStatus
=0x00;
230 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(myData
->gbConverter
->sharedData
,
231 tempBuf
, 2, args
->converter
->useFallback
);
235 if(args
->converter
->fromUnicodeStatus
== 0x00){
236 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(myData
->gbConverter
->sharedData
,
237 mySource
- 1, 1, args
->converter
->useFallback
);
244 if(targetUniChar
< 0xfffe){
246 args
->offsets
[myTarget
- args
->target
]=(int32_t)(mySource
- args
->source
- 1-(myData
->isStateDBCS
));
249 *(myTarget
++)=(UChar
)targetUniChar
;
251 else if(targetUniChar
>=0xfffe){
253 if(targetUniChar
== 0xfffe){
254 *err
= U_INVALID_CHAR_FOUND
;
257 *err
= U_ILLEGAL_CHAR_FOUND
;
259 if(myData
->isStateDBCS
){
260 /* this should never occur since isStateDBCS is set to true
261 * only after tempBuf[0] and tempBuf[1]
262 * are set to the input .. just to please BEAM
264 if(tempBuf
[0]==0 || tempBuf
[1]==0){
265 *err
= U_INTERNAL_PROGRAM_ERROR
;
267 args
->converter
->toUBytes
[0] = (uint8_t)(tempBuf
[0]-0x80);
268 args
->converter
->toUBytes
[1] = (uint8_t)(tempBuf
[1]-0x80);
269 args
->converter
->toULength
=2;
273 args
->converter
->toUBytes
[0] = (uint8_t)mySourceChar
;
274 args
->converter
->toULength
=1;
280 *err
=U_BUFFER_OVERFLOW_ERROR
;
285 args
->target
= myTarget
;
286 args
->source
= mySource
;
291 UConverter_fromUnicode_HZ_OFFSETS_LOGIC (UConverterFromUnicodeArgs
* args
,
293 const UChar
*mySource
= args
->source
;
294 char *myTarget
= args
->target
;
295 int32_t* offsets
= args
->offsets
;
296 int32_t mySourceIndex
= 0;
297 int32_t myTargetIndex
= 0;
298 int32_t targetLength
= (int32_t)(args
->targetLimit
- myTarget
);
299 int32_t mySourceLength
= (int32_t)(args
->sourceLimit
- args
->source
);
301 uint32_t targetUniChar
= 0x0000;
302 UChar32 mySourceChar
= 0x0000;
303 UConverterDataHZ
*myConverterData
=(UConverterDataHZ
*)args
->converter
->extraInfo
;
304 UBool isTargetUCharDBCS
= (UBool
) myConverterData
->isTargetUCharDBCS
;
305 UBool oldIsTargetUCharDBCS
= isTargetUCharDBCS
;
307 const char* escSeq
=NULL
;
309 if ((args
->converter
== NULL
) || (args
->targetLimit
< myTarget
) || (args
->sourceLimit
< args
->source
)){
310 *err
= U_ILLEGAL_ARGUMENT_ERROR
;
313 if(args
->converter
->fromUChar32
!=0 && myTargetIndex
< targetLength
) {
316 /*writing the char to the output stream */
317 while (mySourceIndex
< mySourceLength
){
318 targetUniChar
= missingCharMarker
;
319 if (myTargetIndex
< targetLength
){
321 mySourceChar
= (UChar
) mySource
[mySourceIndex
++];
324 oldIsTargetUCharDBCS
= isTargetUCharDBCS
;
325 if(mySourceChar
==UCNV_TILDE
){
326 /*concatEscape(args, &myTargetIndex, &targetLength,"\x7E\x7E",err,2,&mySourceIndex);*/
328 escSeq
= TILDE_ESCAPE
;
329 CONCAT_ESCAPE_MACRO(args
, myTargetIndex
, targetLength
, escSeq
,err
,len
,mySourceIndex
);
333 length
= ucnv_MBCSFromUChar32(myConverterData
->gbConverter
->sharedData
,
334 mySourceChar
,&targetUniChar
,args
->converter
->useFallback
);
337 /* only DBCS or SBCS characters are expected*/
338 /* DB characters with high bit set to 1 are expected */
339 if(length
> 2 || length
==0 ||(((targetUniChar
& 0x8080) != 0x8080)&& length
==2)){
340 targetUniChar
= missingCharMarker
;
342 if (targetUniChar
!= missingCharMarker
){
343 myConverterData
->isTargetUCharDBCS
= isTargetUCharDBCS
= (UBool
)(targetUniChar
>0x00FF);
344 if(oldIsTargetUCharDBCS
!= isTargetUCharDBCS
|| !myConverterData
->isEscapeAppended
){
345 /*Shifting from a double byte to single byte mode*/
346 if(!isTargetUCharDBCS
){
349 CONCAT_ESCAPE_MACRO(args
, myTargetIndex
, targetLength
, escSeq
,err
,len
,mySourceIndex
);
350 myConverterData
->isEscapeAppended
= TRUE
;
352 else{ /* Shifting from a single byte to double byte mode*/
355 CONCAT_ESCAPE_MACRO(args
, myTargetIndex
, targetLength
, escSeq
,err
,len
,mySourceIndex
);
356 myConverterData
->isEscapeAppended
= TRUE
;
361 if(isTargetUCharDBCS
){
362 if( myTargetIndex
<targetLength
){
363 myTarget
[myTargetIndex
++] =(char) ((targetUniChar
>> 8) -0x80);
365 *(offsets
++) = mySourceIndex
-1;
367 if(myTargetIndex
< targetLength
){
368 myTarget
[myTargetIndex
++] =(char) ((targetUniChar
& 0x00FF) -0x80);
370 *(offsets
++) = mySourceIndex
-1;
373 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (char) ((targetUniChar
& 0x00FF) -0x80);
374 *err
= U_BUFFER_OVERFLOW_ERROR
;
377 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] =(char) ((targetUniChar
>> 8) -0x80);
378 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (char) ((targetUniChar
& 0x00FF) -0x80);
379 *err
= U_BUFFER_OVERFLOW_ERROR
;
383 if( myTargetIndex
<targetLength
){
384 myTarget
[myTargetIndex
++] = (char) (targetUniChar
);
386 *(offsets
++) = mySourceIndex
-1;
390 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (char) targetUniChar
;
391 *err
= U_BUFFER_OVERFLOW_ERROR
;
397 /* oops.. the code point is unassigned */
398 /*Handle surrogates */
399 /*check if the char is a First surrogate*/
400 if(UTF_IS_SURROGATE(mySourceChar
)) {
401 if(UTF_IS_SURROGATE_FIRST(mySourceChar
)) {
402 args
->converter
->fromUChar32
=mySourceChar
;
404 /*look ahead to find the trail surrogate*/
405 if(mySourceIndex
< mySourceLength
) {
406 /* test the following code unit */
407 UChar trail
=(UChar
) args
->source
[mySourceIndex
];
408 if(UTF_IS_SECOND_SURROGATE(trail
)) {
410 mySourceChar
=UTF16_GET_PAIR_VALUE(args
->converter
->fromUChar32
, trail
);
411 args
->converter
->fromUChar32
=0x00;
412 /* there are no surrogates in GB2312*/
413 *err
= U_INVALID_CHAR_FOUND
;
414 /* exit this condition tree */
416 /* this is an unmatched lead code unit (1st surrogate) */
417 /* callback(illegal) */
418 *err
=U_ILLEGAL_CHAR_FOUND
;
425 /* this is an unmatched trail code unit (2nd surrogate) */
426 /* callback(illegal) */
427 *err
=U_ILLEGAL_CHAR_FOUND
;
430 /* callback(unassigned) for a BMP code point */
431 *err
= U_INVALID_CHAR_FOUND
;
434 args
->converter
->fromUChar32
=mySourceChar
;
439 *err
= U_BUFFER_OVERFLOW_ERROR
;
442 targetUniChar
=missingCharMarker
;
445 args
->target
+= myTargetIndex
;
446 args
->source
+= mySourceIndex
;
447 myConverterData
->isTargetUCharDBCS
= isTargetUCharDBCS
;
451 _HZ_WriteSub(UConverterFromUnicodeArgs
*args
, int32_t offsetIndex
, UErrorCode
*err
) {
452 UConverter
*cnv
= args
->converter
;
453 UConverterDataHZ
*convData
=(UConverterDataHZ
*) cnv
->extraInfo
;
458 if( convData
->isTargetUCharDBCS
){
460 *p
++= UCNV_CLOSE_BRACE
;
461 convData
->isTargetUCharDBCS
=FALSE
;
463 *p
++= (char)cnv
->subChars
[0];
465 ucnv_cbFromUWriteBytes(args
,
466 buffer
, (int32_t)(p
- buffer
),
471 * Structure for cloning an HZ converter into a single memory block.
472 * ucnv_safeClone() of the HZ converter will align the entire cloneHZStruct,
473 * and then ucnv_safeClone() of the sub-converter may additionally align
474 * subCnv inside the cloneHZStruct, for which we need the deadSpace after
475 * subCnv. This is because UAlignedMemory may be larger than the actually
476 * necessary alignment size for the platform.
477 * The other cloneHZStruct fields will not be moved around,
478 * and are aligned properly with cloneHZStruct's alignment.
484 UAlignedMemory deadSpace
;
485 UConverterDataHZ mydata
;
490 _HZ_SafeClone(const UConverter
*cnv
,
492 int32_t *pBufferSize
,
495 struct cloneHZStruct
* localClone
;
496 int32_t size
, bufferSizeNeeded
= sizeof(struct cloneHZStruct
);
498 if (U_FAILURE(*status
)){
502 if (*pBufferSize
== 0){ /* 'preflighting' request - set needed size into *pBufferSize */
503 *pBufferSize
= bufferSizeNeeded
;
507 localClone
= (struct cloneHZStruct
*)stackBuffer
;
508 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
510 uprv_memcpy(&localClone
->mydata
, cnv
->extraInfo
, sizeof(UConverterDataHZ
));
511 localClone
->cnv
.extraInfo
= &localClone
->mydata
;
512 localClone
->cnv
.isExtraLocal
= TRUE
;
514 /* deep-clone the sub-converter */
515 size
= (int32_t)(sizeof(UConverter
) + sizeof(UAlignedMemory
)); /* include size of padding */
516 ((UConverterDataHZ
*)localClone
->cnv
.extraInfo
)->gbConverter
=
517 ucnv_safeClone(((UConverterDataHZ
*)cnv
->extraInfo
)->gbConverter
, &localClone
->subCnv
, &size
, status
);
519 return &localClone
->cnv
;
523 _HZ_GetUnicodeSet(const UConverter
*cnv
,
525 UConverterUnicodeSet which
,
526 UErrorCode
*pErrorCode
) {
527 /* the tilde '~' is hardcoded in the converter */
528 sa
->add(sa
->set
, 0x7e);
530 /* add all of the code points that the sub-converter handles */
531 ((UConverterDataHZ
*)cnv
->extraInfo
)->
532 gbConverter
->sharedData
->impl
->
533 getUnicodeSet(((UConverterDataHZ
*)cnv
->extraInfo
)->gbConverter
,
534 sa
, which
, pErrorCode
);
537 static const UConverterImpl _HZImpl
={
548 UConverter_toUnicode_HZ_OFFSETS_LOGIC
,
549 UConverter_toUnicode_HZ_OFFSETS_LOGIC
,
550 UConverter_fromUnicode_HZ_OFFSETS_LOGIC
,
551 UConverter_fromUnicode_HZ_OFFSETS_LOGIC
,
561 static const UConverterStaticData _HZStaticData
={
562 sizeof(UConverterStaticData
),
575 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, /* reserved */
580 const UConverterSharedData _HZData
={
581 sizeof(UConverterSharedData
),
591 #endif /* #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION */