- support of [Strings.LanguageID]-sections for inf-files added in setupapi
[reactos.git] / reactos / nls / 3rdparty / icu / source / common / utext.cpp
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2005-2007, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: utext.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2005apr12
14 * created by: Markus W. Scherer
15 */
16
17 #include "unicode/utypes.h"
18 #include "unicode/ustring.h"
19 #include "unicode/unistr.h"
20 #include "unicode/chariter.h"
21 #include "unicode/utext.h"
22 #include "ustr_imp.h"
23 #include "cmemory.h"
24 #include "cstring.h"
25 #include "uassert.h"
26
27 U_NAMESPACE_USE
28
// I32_FLAG   Produce an int32_t mask in which only bit number bitIndex is set.
//            Used below to test and clear bits of UText.providerProperties.
#define I32_FLAG(bitIndex) (((int32_t)1) << (bitIndex))
30
31
32 static UBool
33 utext_access(UText *ut, int64_t index, UBool forward) {
34 return ut->pFuncs->access(ut, index, forward);
35 }
36
37
38
39 U_CAPI UBool U_EXPORT2
40 utext_moveIndex32(UText *ut, int32_t delta) {
41 UChar32 c;
42 if (delta > 0) {
43 do {
44 if(ut->chunkOffset>=ut->chunkLength && !utext_access(ut, ut->chunkNativeLimit, TRUE)) {
45 return FALSE;
46 }
47 c = ut->chunkContents[ut->chunkOffset];
48 if (U16_IS_SURROGATE(c)) {
49 c = utext_next32(ut);
50 if (c == U_SENTINEL) {
51 return FALSE;
52 }
53 } else {
54 ut->chunkOffset++;
55 }
56 } while(--delta>0);
57
58 } else if (delta<0) {
59 do {
60 if(ut->chunkOffset<=0 && !utext_access(ut, ut->chunkNativeStart, FALSE)) {
61 return FALSE;
62 }
63 c = ut->chunkContents[ut->chunkOffset-1];
64 if (U16_IS_SURROGATE(c)) {
65 c = utext_previous32(ut);
66 if (c == U_SENTINEL) {
67 return FALSE;
68 }
69 } else {
70 ut->chunkOffset--;
71 }
72 } while(++delta<0);
73 }
74
75 return TRUE;
76 }
77
78
79 U_CAPI int64_t U_EXPORT2
80 utext_nativeLength(UText *ut) {
81 return ut->pFuncs->nativeLength(ut);
82 }
83
84
85 U_CAPI UBool U_EXPORT2
86 utext_isLengthExpensive(const UText *ut) {
87 UBool r = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE)) != 0;
88 return r;
89 }
90
91
92 U_CAPI int64_t U_EXPORT2
93 utext_getNativeIndex(const UText *ut) {
94 if(ut->chunkOffset <= ut->nativeIndexingLimit) {
95 return ut->chunkNativeStart+ut->chunkOffset;
96 } else {
97 return ut->pFuncs->mapOffsetToNative(ut);
98 }
99 }
100
101
102 U_CAPI void U_EXPORT2
103 utext_setNativeIndex(UText *ut, int64_t index) {
104 if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) {
105 // The desired position is outside of the current chunk.
106 // Access the new position. Assume a forward iteration from here,
107 // which will also be optimimum for a single random access.
108 // Reverse iterations may suffer slightly.
109 ut->pFuncs->access(ut, index, TRUE);
110 } else if((int32_t)(index - ut->chunkNativeStart) <= ut->nativeIndexingLimit) {
111 // utf-16 indexing.
112 ut->chunkOffset=(int32_t)(index-ut->chunkNativeStart);
113 } else {
114 ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index);
115 }
116 // The convention is that the index must always be on a code point boundary.
117 // Adjust the index position if it is in the middle of a surrogate pair.
118 if (ut->chunkOffset<ut->chunkLength) {
119 UChar c= ut->chunkContents[ut->chunkOffset];
120 if (UTF16_IS_TRAIL(c)) {
121 if (ut->chunkOffset==0) {
122 ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE);
123 }
124 if (ut->chunkOffset>0) {
125 UChar lead = ut->chunkContents[ut->chunkOffset-1];
126 if (UTF16_IS_LEAD(lead)) {
127 ut->chunkOffset--;
128 }
129 }
130 }
131 }
132 }
133
134
135
136 U_CAPI int64_t U_EXPORT2
137 utext_getPreviousNativeIndex(UText *ut) {
138 //
139 // Fast-path the common case.
140 // Common means current position is not at the beginning of a chunk
141 // and the preceding character is not supplementary.
142 //
143 int32_t i = ut->chunkOffset - 1;
144 int64_t result;
145 if (i >= 0) {
146 UChar c = ut->chunkContents[i];
147 if (U16_IS_TRAIL(c) == FALSE) {
148 if (i <= ut->nativeIndexingLimit) {
149 result = ut->chunkNativeStart + i;
150 } else {
151 ut->chunkOffset = i;
152 result = ut->pFuncs->mapOffsetToNative(ut);
153 ut->chunkOffset++;
154 }
155 return result;
156 }
157 }
158
159 // If at the start of text, simply return 0.
160 if (ut->chunkOffset==0 && ut->chunkNativeStart==0) {
161 return 0;
162 }
163
164 // Harder, less common cases. We are at a chunk boundary, or on a surrogate.
165 // Keep it simple, use other functions to handle the edges.
166 //
167 utext_previous32(ut);
168 result = UTEXT_GETNATIVEINDEX(ut);
169 utext_next32(ut);
170 return result;
171 }
172
173
174 //
175 // utext_current32. Get the UChar32 at the current position.
176 // UText iteration position is always on a code point boundary,
177 // never on the trail half of a surrogate pair.
178 //
179 U_CAPI UChar32 U_EXPORT2
180 utext_current32(UText *ut) {
181 UChar32 c;
182 if (ut->chunkOffset==ut->chunkLength) {
183 // Current position is just off the end of the chunk.
184 if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
185 // Off the end of the text.
186 return U_SENTINEL;
187 }
188 }
189
190 c = ut->chunkContents[ut->chunkOffset];
191 if (U16_IS_LEAD(c) == FALSE) {
192 // Normal, non-supplementary case.
193 return c;
194 }
195
196 //
197 // Possible supplementary char.
198 //
199 UChar32 trail = 0;
200 UChar32 supplementaryC = c;
201 if ((ut->chunkOffset+1) < ut->chunkLength) {
202 // The trail surrogate is in the same chunk.
203 trail = ut->chunkContents[ut->chunkOffset+1];
204 } else {
205 // The trail surrogate is in a different chunk.
206 // Because we must maintain the iteration position, we need to switch forward
207 // into the new chunk, get the trail surrogate, then revert the chunk back to the
208 // original one.
209 // An edge case to be careful of: the entire text may end with an unpaired
210 // leading surrogate. The attempt to access the trail will fail, but
211 // the original position before the unpaired lead still needs to be restored.
212 int64_t nativePosition = ut->chunkNativeLimit;
213 int32_t originalOffset = ut->chunkOffset;
214 if (ut->pFuncs->access(ut, nativePosition, TRUE)) {
215 trail = ut->chunkContents[ut->chunkOffset];
216 }
217 UBool r = ut->pFuncs->access(ut, nativePosition, FALSE); // reverse iteration flag loads preceding chunk
218 U_ASSERT(r==TRUE);
219 ut->chunkOffset = originalOffset;
220 if(!r) {
221 return U_SENTINEL;
222 }
223 }
224
225 if (U16_IS_TRAIL(trail)) {
226 supplementaryC = U16_GET_SUPPLEMENTARY(c, trail);
227 }
228 return supplementaryC;
229
230 }
231
232
233 U_CAPI UChar32 U_EXPORT2
234 utext_char32At(UText *ut, int64_t nativeIndex) {
235 UChar32 c = U_SENTINEL;
236
237 // Fast path the common case.
238 if (nativeIndex>=ut->chunkNativeStart && nativeIndex < ut->chunkNativeStart + ut->nativeIndexingLimit) {
239 ut->chunkOffset = (int32_t)(nativeIndex - ut->chunkNativeStart);
240 c = ut->chunkContents[ut->chunkOffset];
241 if (U16_IS_SURROGATE(c) == FALSE) {
242 return c;
243 }
244 }
245
246
247 utext_setNativeIndex(ut, nativeIndex);
248 if (nativeIndex>=ut->chunkNativeStart && ut->chunkOffset<ut->chunkLength) {
249 c = ut->chunkContents[ut->chunkOffset];
250 if (U16_IS_SURROGATE(c)) {
251 // For surrogates, let current32() deal with the complications
252 // of supplementaries that may span chunk boundaries.
253 c = utext_current32(ut);
254 }
255 }
256 return c;
257 }
258
259
260 U_CAPI UChar32 U_EXPORT2
261 utext_next32(UText *ut) {
262 UChar32 c;
263
264 if (ut->chunkOffset >= ut->chunkLength) {
265 if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
266 return U_SENTINEL;
267 }
268 }
269
270 c = ut->chunkContents[ut->chunkOffset++];
271 if (U16_IS_LEAD(c) == FALSE) {
272 // Normal case, not supplementary.
273 // (A trail surrogate seen here is just returned as is, as a surrogate value.
274 // It cannot be part of a pair.)
275 return c;
276 }
277
278 if (ut->chunkOffset >= ut->chunkLength) {
279 if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
280 // c is an unpaired lead surrogate at the end of the text.
281 // return it as it is.
282 return c;
283 }
284 }
285 UChar32 trail = ut->chunkContents[ut->chunkOffset];
286 if (U16_IS_TRAIL(trail) == FALSE) {
287 // c was an unpaired lead surrogate, not at the end of the text.
288 // return it as it is (unpaired). Iteration position is on the
289 // following character, possibly in the next chunk, where the
290 // trail surrogate would have been if it had existed.
291 return c;
292 }
293
294 UChar32 supplementary = U16_GET_SUPPLEMENTARY(c, trail);
295 ut->chunkOffset++; // move iteration position over the trail surrogate.
296 return supplementary;
297 }
298
299
300 U_CAPI UChar32 U_EXPORT2
301 utext_previous32(UText *ut) {
302 UChar32 c;
303
304 if (ut->chunkOffset <= 0) {
305 if (ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE) == FALSE) {
306 return U_SENTINEL;
307 }
308 }
309 ut->chunkOffset--;
310 c = ut->chunkContents[ut->chunkOffset];
311 if (U16_IS_TRAIL(c) == FALSE) {
312 // Normal case, not supplementary.
313 // (A lead surrogate seen here is just returned as is, as a surrogate value.
314 // It cannot be part of a pair.)
315 return c;
316 }
317
318 if (ut->chunkOffset <= 0) {
319 if (ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE) == FALSE) {
320 // c is an unpaired trail surrogate at the start of the text.
321 // return it as it is.
322 return c;
323 }
324 }
325
326 UChar32 lead = ut->chunkContents[ut->chunkOffset-1];
327 if (U16_IS_LEAD(lead) == FALSE) {
328 // c was an unpaired trail surrogate, not at the end of the text.
329 // return it as it is (unpaired). Iteration position is at c
330 return c;
331 }
332
333 UChar32 supplementary = U16_GET_SUPPLEMENTARY(lead, c);
334 ut->chunkOffset--; // move iteration position over the lead surrogate.
335 return supplementary;
336 }
337
338
339
340 U_CAPI UChar32 U_EXPORT2
341 utext_next32From(UText *ut, int64_t index) {
342 UChar32 c = U_SENTINEL;
343
344 if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) {
345 // Desired position is outside of the current chunk.
346 if(!ut->pFuncs->access(ut, index, TRUE)) {
347 // no chunk available here
348 return U_SENTINEL;
349 }
350 } else if (index - ut->chunkNativeStart <= (int64_t)ut->nativeIndexingLimit) {
351 // Desired position is in chunk, with direct 1:1 native to UTF16 indexing
352 ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
353 } else {
354 // Desired position is in chunk, with non-UTF16 indexing.
355 ut->chunkOffset = ut->pFuncs->mapNativeIndexToUTF16(ut, index);
356 }
357
358 c = ut->chunkContents[ut->chunkOffset++];
359 if (U16_IS_SURROGATE(c)) {
360 // Surrogates. Many edge cases. Use other functions that already
361 // deal with the problems.
362 utext_setNativeIndex(ut, index);
363 c = utext_next32(ut);
364 }
365 return c;
366 }
367
368
369 U_CAPI UChar32 U_EXPORT2
370 utext_previous32From(UText *ut, int64_t index) {
371 //
372 // Return the character preceding the specified index.
373 // Leave the iteration position at the start of the character that was returned.
374 //
375 UChar32 cPrev; // The character preceding cCurr, which is what we will return.
376
377 // Address the chunk containg the position preceding the incoming index
378 // A tricky edge case:
379 // We try to test the requested native index against the chunkNativeStart to determine
380 // whether the character preceding the one at the index is in the current chunk.
381 // BUT, this test can fail with UTF-8 (or any other multibyte encoding), when the
382 // requested index is on something other than the first position of the first char.
383 //
384 if(index<=ut->chunkNativeStart || index>ut->chunkNativeLimit) {
385 // Requested native index is outside of the current chunk.
386 if(!ut->pFuncs->access(ut, index, FALSE)) {
387 // no chunk available here
388 return U_SENTINEL;
389 }
390 } else if(index - ut->chunkNativeStart <= (int64_t)ut->nativeIndexingLimit) {
391 // Direct UTF-16 indexing.
392 ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
393 } else {
394 ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index);
395 if (ut->chunkOffset==0 && !ut->pFuncs->access(ut, index, FALSE)) {
396 // no chunk available here
397 return U_SENTINEL;
398 }
399 }
400
401 //
402 // Simple case with no surrogates.
403 //
404 ut->chunkOffset--;
405 cPrev = ut->chunkContents[ut->chunkOffset];
406
407 if (U16_IS_SURROGATE(cPrev)) {
408 // Possible supplementary. Many edge cases.
409 // Let other functions do the heavy lifting.
410 utext_setNativeIndex(ut, index);
411 cPrev = utext_previous32(ut);
412 }
413 return cPrev;
414 }
415
416
417 U_CAPI int32_t U_EXPORT2
418 utext_extract(UText *ut,
419 int64_t start, int64_t limit,
420 UChar *dest, int32_t destCapacity,
421 UErrorCode *status) {
422 return ut->pFuncs->extract(ut, start, limit, dest, destCapacity, status);
423 }
424
425
426
427 U_CAPI UBool U_EXPORT2
428 utext_equals(const UText *a, const UText *b) {
429 if (a==NULL || b==NULL ||
430 a->magic != UTEXT_MAGIC ||
431 b->magic != UTEXT_MAGIC) {
432 // Null or invalid arguments don't compare equal to anything.
433 return FALSE;
434 }
435
436 if (a->pFuncs != b->pFuncs) {
437 // Different types of text providers.
438 return FALSE;
439 }
440
441 if (a->context != b->context) {
442 // Different sources (different strings)
443 return FALSE;
444 }
445 if (utext_getNativeIndex(a) != utext_getNativeIndex(b)) {
446 // Different current position in the string.
447 return FALSE;
448 }
449
450 return TRUE;
451 }
452
453 U_CAPI UBool U_EXPORT2
454 utext_isWritable(const UText *ut)
455 {
456 UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) != 0;
457 return b;
458 }
459
460
461 U_CAPI void U_EXPORT2
462 utext_freeze(UText *ut) {
463 // Zero out the WRITABLE flag.
464 ut->providerProperties &= ~(I32_FLAG(UTEXT_PROVIDER_WRITABLE));
465 }
466
467
468 U_CAPI UBool U_EXPORT2
469 utext_hasMetaData(const UText *ut)
470 {
471 UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA)) != 0;
472 return b;
473 }
474
475
476
477 U_CAPI int32_t U_EXPORT2
478 utext_replace(UText *ut,
479 int64_t nativeStart, int64_t nativeLimit,
480 const UChar *replacementText, int32_t replacementLength,
481 UErrorCode *status)
482 {
483 if (U_FAILURE(*status)) {
484 return 0;
485 }
486 if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == 0) {
487 *status = U_NO_WRITE_PERMISSION;
488 return 0;
489 }
490 int32_t i = ut->pFuncs->replace(ut, nativeStart, nativeLimit, replacementText, replacementLength, status);
491 return i;
492 }
493
494 U_CAPI void U_EXPORT2
495 utext_copy(UText *ut,
496 int64_t nativeStart, int64_t nativeLimit,
497 int64_t destIndex,
498 UBool move,
499 UErrorCode *status)
500 {
501 if (U_FAILURE(*status)) {
502 return;
503 }
504 if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == 0) {
505 *status = U_NO_WRITE_PERMISSION;
506 return;
507 }
508 ut->pFuncs->copy(ut, nativeStart, nativeLimit, destIndex, move, status);
509 }
510
511
512
513 U_CAPI UText * U_EXPORT2
514 utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status) {
515 UText *result;
516 result = src->pFuncs->clone(dest, src, deep, status);
517 if (readOnly) {
518 utext_freeze(result);
519 }
520 return result;
521 }
522
523
524
525 //------------------------------------------------------------------------------
526 //
527 // UText common functions implementation
528 //
529 //------------------------------------------------------------------------------
530
//
//  Bits used in UText.flags.  Each enumerator is a single-bit mask.
//
enum {
    // Set when ICU allocated the UText struct itself on the heap;
    // clear when the caller supplied the storage for the UText.
    UTEXT_HEAP_ALLOCATED       = 1 << 0,

    // Set when ICU allocated the extra storage as a separate heap block;
    // clear when there is no separate allocation, either because no extra
    // storage was requested or because it is appended to the end of the
    // main UText storage.
    UTEXT_EXTRA_HEAP_ALLOCATED = 1 << 1,

    // Set while this UText is open; clear when it is not open.
    UTEXT_OPEN                 = 1 << 2
};
547
548
//
//  Extended form of a UText.  The purpose is to aid in computing the total size required
//    when a provider asks for a UText to be allocated with extra storage.
//
//  utext_setup() derives the size of a combined heap allocation from
//  sizeof(ExtendedUText) and points UText.pExtra at the 'extension' member,
//  so that the extra storage directly follows the UText proper.
//

struct ExtendedUText {
    UText               ut;          // The UText proper; the allocation is used as a UText *.
    UAlignedMemory      extension;   // Start of the extra storage appended to the UText.
};
557
// Initial value copied by utext_setup() into each UText that it heap-allocates.
static const UText emptyText = UTEXT_INITIALIZER;
559
560 U_CAPI UText * U_EXPORT2
561 utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) {
562 if (U_FAILURE(*status)) {
563 return ut;
564 }
565
566 if (ut == NULL) {
567 // We need to heap-allocate storage for the new UText
568 int32_t spaceRequired = sizeof(UText);
569 if (extraSpace > 0) {
570 spaceRequired = sizeof(ExtendedUText) + extraSpace - sizeof(UAlignedMemory);
571 }
572 ut = (UText *)uprv_malloc(spaceRequired);
573 if (ut == NULL) {
574 *status = U_MEMORY_ALLOCATION_ERROR;
575 } else {
576 *ut = emptyText;
577 ut->flags |= UTEXT_HEAP_ALLOCATED;
578 if (spaceRequired>0) {
579 ut->extraSize = extraSpace;
580 ut->pExtra = &((ExtendedUText *)ut)->extension;
581 }
582 }
583 } else {
584 // We have been supplied with an already existing UText.
585 // Verify that it really appears to be a UText.
586 if (ut->magic != UTEXT_MAGIC) {
587 *status = U_ILLEGAL_ARGUMENT_ERROR;
588 return ut;
589 }
590 // If the ut is already open and there's a provider supplied close
591 // function, call it.
592 if ((ut->flags & UTEXT_OPEN) && ut->pFuncs->close != NULL) {
593 ut->pFuncs->close(ut);
594 }
595 ut->flags &= ~UTEXT_OPEN;
596
597 // If extra space was requested by our caller, check whether
598 // sufficient already exists, and allocate new if needed.
599 if (extraSpace > ut->extraSize) {
600 // Need more space. If there is existing separately allocated space,
601 // delete it first, then allocate new space.
602 if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) {
603 uprv_free(ut->pExtra);
604 ut->extraSize = 0;
605 }
606 ut->pExtra = uprv_malloc(extraSpace);
607 if (ut->pExtra == NULL) {
608 *status = U_MEMORY_ALLOCATION_ERROR;
609 } else {
610 ut->extraSize = extraSpace;
611 ut->flags |= UTEXT_EXTRA_HEAP_ALLOCATED;
612 }
613 }
614 }
615 if (U_SUCCESS(*status)) {
616 ut->flags |= UTEXT_OPEN;
617
618 // Initialize all remaining fields of the UText.
619 //
620 ut->context = NULL;
621 ut->chunkContents = NULL;
622 ut->p = NULL;
623 ut->q = NULL;
624 ut->r = NULL;
625 ut->a = 0;
626 ut->b = 0;
627 ut->c = 0;
628 ut->chunkOffset = 0;
629 ut->chunkLength = 0;
630 ut->chunkNativeStart = 0;
631 ut->chunkNativeLimit = 0;
632 ut->nativeIndexingLimit = 0;
633 ut->providerProperties = 0;
634 ut->privA = 0;
635 ut->privB = 0;
636 ut->privC = 0;
637 ut->privP = NULL;
638 if (ut->pExtra!=NULL && ut->extraSize>0)
639 uprv_memset(ut->pExtra, 0, ut->extraSize);
640
641 }
642 return ut;
643 }
644
645
646 U_CAPI UText * U_EXPORT2
647 utext_close(UText *ut) {
648 if (ut==NULL ||
649 ut->magic != UTEXT_MAGIC ||
650 (ut->flags & UTEXT_OPEN) == 0)
651 {
652 // The supplied ut is not an open UText.
653 // Do nothing.
654 return ut;
655 }
656
657 // If the provider gave us a close function, call it now.
658 // This will clean up anything allocated specifically by the provider.
659 if (ut->pFuncs->close != NULL) {
660 ut->pFuncs->close(ut);
661 }
662 ut->flags &= ~UTEXT_OPEN;
663
664 // If we (the framework) allocated the UText or subsidiary storage,
665 // delete it.
666 if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) {
667 uprv_free(ut->pExtra);
668 ut->pExtra = NULL;
669 ut->flags &= ~UTEXT_EXTRA_HEAP_ALLOCATED;
670 ut->extraSize = 0;
671 }
672
673 // Zero out function table of the closed UText. This is a defensive move,
674 // inteded to cause applications that inadvertantly use a closed
675 // utext to crash with null pointer errors.
676 ut->pFuncs = NULL;
677
678 if (ut->flags & UTEXT_HEAP_ALLOCATED) {
679 // This UText was allocated by UText setup. We need to free it.
680 // Clear magic, so we can detect if the user messes up and immediately
681 // tries to reopen another UText using the deleted storage.
682 ut->magic = 0;
683 uprv_free(ut);
684 ut = NULL;
685 }
686 return ut;
687 }
688
689
690
691
692 //
693 // invalidateChunk Reset a chunk to have no contents, so that the next call
694 // to access will cause new data to load.
695 // This is needed when copy/move/replace operate directly on the
696 // backing text, potentially putting it out of sync with the
697 // contents in the chunk.
698 //
699 static void
700 invalidateChunk(UText *ut) {
701 ut->chunkLength = 0;
702 ut->chunkNativeLimit = 0;
703 ut->chunkNativeStart = 0;
704 ut->chunkOffset = 0;
705 ut->nativeIndexingLimit = 0;
706 }
707
//
// pinIndex   Pin a native index parameter to the range 0..limit.
//            The 64 bit index is adjusted in place.  A negative index always
//            becomes 0, without regard to limit.
//            The pinned value, truncated to 32 bits, is returned as a
//            convenience for providers that don't need 64 bits.
//
static int32_t
pinIndex(int64_t &index, int64_t limit) {
    if (index < 0) {
        index = 0;
        return 0;
    }
    if (index > limit) {
        index = limit;
    }
    return (int32_t)index;
}
722
723
724 U_CDECL_BEGIN
725
726 //
727 // Pointer relocation function,
728 // a utility used by shallow clone.
729 // Adjust a pointer that refers to something within one UText (the source)
730 // to refer to the same relative offset within a another UText (the target)
731 //
732 static void adjustPointer(UText *dest, const void **destPtr, const UText *src) {
733 // convert all pointers to (char *) so that byte address arithmetic will work.
734 char *dptr = (char *)*destPtr;
735 char *dUText = (char *)dest;
736 char *sUText = (char *)src;
737
738 if (dptr >= (char *)src->pExtra && dptr < ((char*)src->pExtra)+src->extraSize) {
739 // target ptr was to something within the src UText's pExtra storage.
740 // relocate it into the target UText's pExtra region.
741 *destPtr = ((char *)dest->pExtra) + (dptr - (char *)src->pExtra);
742 } else if (dptr>=sUText && dptr < sUText+src->sizeOfStruct) {
743 // target ptr was pointing to somewhere within the source UText itself.
744 // Move it to the same offset within the target UText.
745 *destPtr = dUText + (dptr-sUText);
746 }
747 }
748
749
750 //
751 // Clone. This is a generic copy-the-utext-by-value clone function that can be
752 // used as-is with some utext types, and as a helper by other clones.
753 //
754 static UText * U_CALLCONV
755 shallowTextClone(UText * dest, const UText * src, UErrorCode * status) {
756 if (U_FAILURE(*status)) {
757 return NULL;
758 }
759 int32_t srcExtraSize = src->extraSize;
760
761 //
762 // Use the generic text_setup to allocate storage if required.
763 //
764 dest = utext_setup(dest, srcExtraSize, status);
765 if (U_FAILURE(*status)) {
766 return dest;
767 }
768
769 //
770 // flags (how the UText was allocated) and the pointer to the
771 // extra storage must retain the values in the cloned utext that
772 // were set up by utext_setup. Save them separately before
773 // copying the whole struct.
774 //
775 void *destExtra = dest->pExtra;
776 int32_t flags = dest->flags;
777
778
779 //
780 // Copy the whole UText struct by value.
781 // Any "Extra" storage is copied also.
782 //
783 int sizeToCopy = src->sizeOfStruct;
784 if (sizeToCopy > dest->sizeOfStruct) {
785 sizeToCopy = dest->sizeOfStruct;
786 }
787 uprv_memcpy(dest, src, sizeToCopy);
788 dest->pExtra = destExtra;
789 dest->flags = flags;
790 if (srcExtraSize > 0) {
791 uprv_memcpy(dest->pExtra, src->pExtra, srcExtraSize);
792 }
793
794 //
795 // Relocate any pointers in the target that refer to the UText itself
796 // to point to the cloned copy rather than the original source.
797 //
798 adjustPointer(dest, &dest->context, src);
799 adjustPointer(dest, &dest->p, src);
800 adjustPointer(dest, &dest->q, src);
801 adjustPointer(dest, &dest->r, src);
802 adjustPointer(dest, (const void **)&dest->chunkContents, src);
803
804 return dest;
805 }
806
807
808 U_CDECL_END
809
810
811
812 //------------------------------------------------------------------------------
813 //
814 // UText implementation for UTF-8 char * strings (read-only)
815 // Limitation: string length must be <= 0x7fffffff in length.
//         (length must fit in an int32_t variable)
817 //
818 // Use of UText data members:
819 // context pointer to UTF-8 string
820 // utext.b is the input string length (bytes).
821 // utext.c Length scanned so far in string
822 // (for optimizing finding length of zero terminated strings.)
823 // utext.p pointer to the current buffer
824 // utext.q pointer to the other buffer.
825 //
826 //------------------------------------------------------------------------------
827
// Chunk size.
//     Must be less than 85, because of byte mapping from UChar indexes to native indexes.
//     Worst case is three native bytes to one UChar.  (Supplementaries are 4 native bytes
//     to two UChars.)  The offset maps in UTF8Buf have uint8_t entries, and
//     3 * 85 = 255 is the largest value such an entry can hold.
//
enum { UTF8_TEXT_CHUNK_SIZE=32 };
834
835 //
836 // UTF8Buf Two of these structs will be set up in the UText's extra allocated space.
837 // Each contains the UChar chunk buffer, the to and from native maps, and
838 // header info.
839 //
840 // because backwards iteration fills the buffers starting at the end and
841 // working towards the front, the filled part of the buffers may not begin
842 // at the start of the available storage for the buffers.
843 //
844 // Buffer size is one bigger than the specified UTF8_TEXT_CHUNK_SIZE to allow for
845 // the last character added being a supplementary, and thus requiring a surrogate
846 // pair. Doing this is simpler than checking for the edge case.
847 //
848
// One chunk buffer of the UTF-8 provider: the UChars of the chunk, the maps
// between UChar offsets and native (byte) offsets, and the bookkeeping
// needed to locate the filled part of the buffer.
struct UTF8Buf {
    int32_t   bufNativeStart;                        // Native index of first char in UChar buf
    int32_t   bufNativeLimit;                        // Native index following last char in buf.
    int32_t   bufStartIdx;                           // First filled position in buf.
    int32_t   bufLimitIdx;                           // Limit of filled range in buf.
    int32_t   bufNILimit;                            // Limit of native indexing part of buf
    int32_t   toUCharsMapStart;                      // Native index corresponding to
                                                     //   mapToUChars[0].
                                                     //   Set to bufNativeStart when filling forwards.
                                                     //   Set to computed value when filling backwards.

    UChar     buf[UTF8_TEXT_CHUNK_SIZE+4];           // The UChar buffer.  Needs room beyond the chunk
                                                     //   size, to allow for a surrogate at the end.
                                                     //   Length must be identical to mapToNative array, below,
                                                     //   because of the way indexing works when the array is
                                                     //   filled backwards during a reverse iteration.  Thus,
                                                     //   the additional extra size.
    uint8_t   mapToNative[UTF8_TEXT_CHUNK_SIZE+4];   // Map UChar index in buf to
                                                     //   native offset from bufNativeStart.
                                                     //   Needs extra slots beyond the chunk size:
                                                     //   one for a supplementary starting in the last normal position,
                                                     //   and one for an entry for the buffer limit position.
    uint8_t   mapToUChars[UTF8_TEXT_CHUNK_SIZE*3+6]; // Map native offset from bufNativeStart to
                                                     //   corresponding offset in filled part of buf.
    int32_t   align;                                 // NOTE(review): not referenced in the code visible
                                                     //   here; presumably padding for alignment - confirm.
};
875
876 U_CDECL_BEGIN
877
878 //
879 // utf8TextLength
880 //
881 // Get the length of the string. If we don't already know it,
882 // we'll need to scan for the trailing nul.
883 //
884 static int64_t U_CALLCONV
885 utf8TextLength(UText *ut) {
886 if (ut->b < 0) {
887 // Zero terminated string, and we haven't scanned to the end yet.
888 // Scan it now.
889 const char *r = (const char *)ut->context + ut->c;
890 while (*r != 0) {
891 r++;
892 }
893 if ((r - (const char *)ut->context) < 0x7fffffff) {
894 ut->b = (int32_t)(r - (const char *)ut->context);
895 } else {
896 // Actual string was bigger (more than 2 gig) than we
897 // can handle. Clip it to 2 GB.
898 ut->b = 0x7fffffff;
899 }
900 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
901 }
902 return ut->b;
903 }
904
905
906
907
908
909
910 static UBool U_CALLCONV
911 utf8TextAccess(UText *ut, int64_t index, UBool forward) {
912 //
913 // Apologies to those who are allergic to goto statements.
914 // Consider each goto to a labelled block to be the equivalent of
915 // call the named block as if it were a function();
916 // return;
917 //
918 const uint8_t *s8=(const uint8_t *)ut->context;
919 UTF8Buf *u8b = NULL;
920 int32_t length = ut->b; // Length of original utf-8
921 int32_t ix= (int32_t)index; // Requested index, trimmed to 32 bits.
922 int32_t mapIndex = 0;
923 if (index<0) {
924 ix=0;
925 } else if (index > 0x7fffffff) {
926 // Strings with 64 bit lengths not supported by this UTF-8 provider.
927 ix = 0x7fffffff;
928 }
929
930 // Pin requested index to the string length.
931 if (ix>length) {
932 if (length>=0) {
933 ix=length;
934 } else if (ix>ut->c) {
935 // Zero terminated string, and requested index is beyond
936 // the region that has already been scanned.
937 // Scan up to either the end of the string or to the
938 // requested position, whichever comes first.
939 while (ut->c<ix && s8[ut->c]!=0) {
940 ut->c++;
941 }
942 // TODO: support for null terminated string length > 32 bits.
943 if (s8[ut->c] == 0) {
944 // We just found the actual length of the string.
945 // Trim the requested index back to that.
946 ix = ut->c;
947 ut->b = ut->c;
948 length = ut->c;
949 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
950 }
951 }
952 }
953
954 //
955 // Dispatch to the appropriate action for a forward iteration request.
956 //
957 if (forward) {
958 if (ix==ut->chunkNativeLimit) {
959 // Check for normal sequential iteration cases first.
960 if (ix==length) {
961 // Just reached end of string
962 // Don't swap buffers, but do set the
963 // current buffer position.
964 ut->chunkOffset = ut->chunkLength;
965 return FALSE;
966 } else {
967 // End of current buffer.
968 // check whether other buffer already has what we need.
969 UTF8Buf *altB = (UTF8Buf *)ut->q;
970 if (ix>=altB->bufNativeStart && ix<altB->bufNativeLimit) {
971 goto swapBuffers;
972 }
973 }
974 }
975
976 // A random access. Desired index could be in either or niether buf.
977 // For optimizing the order of testing, first check for the index
978 // being in the other buffer. This will be the case for uses that
979 // move back and forth over a fairly limited range
980 {
981 u8b = (UTF8Buf *)ut->q; // the alternate buffer
982 if (ix>=u8b->bufNativeStart && ix<u8b->bufNativeLimit) {
983 // Requested index is in the other buffer.
984 goto swapBuffers;
985 }
986 if (ix == length) {
987 // Requested index is end-of-string.
988 // (this is the case of randomly seeking to the end.
989 // The case of iterating off the end is handled earlier.)
990 if (ix == ut->chunkNativeLimit) {
991 // Current buffer extends up to the end of the string.
992 // Leave it as the current buffer.
993 ut->chunkOffset = ut->chunkLength;
994 return FALSE;
995 }
996 if (ix == u8b->bufNativeLimit) {
997 // Alternate buffer extends to the end of string.
998 // Swap it in as the current buffer.
999 goto swapBuffersAndFail;
1000 }
1001
1002 // Neither existing buffer extends to the end of the string.
1003 goto makeStubBuffer;
1004 }
1005
1006 if (ix<ut->chunkNativeStart || ix>=ut->chunkNativeLimit) {
1007 // Requested index is in neither buffer.
1008 goto fillForward;
1009 }
1010
1011 // Requested index is in this buffer.
1012 u8b = (UTF8Buf *)ut->p; // the current buffer
1013 mapIndex = ix - u8b->toUCharsMapStart;
1014 ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
1015 return TRUE;
1016
1017 }
1018 }
1019
1020
1021 //
1022 // Dispatch to the appropriate action for a
1023 // Backwards Diretion iteration request.
1024 //
1025 if (ix==ut->chunkNativeStart) {
1026 // Check for normal sequential iteration cases first.
1027 if (ix==0) {
1028 // Just reached the start of string
1029 // Don't swap buffers, but do set the
1030 // current buffer position.
1031 ut->chunkOffset = 0;
1032 return FALSE;
1033 } else {
1034 // Start of current buffer.
1035 // check whether other buffer already has what we need.
1036 UTF8Buf *altB = (UTF8Buf *)ut->q;
1037 if (ix>altB->bufNativeStart && ix<=altB->bufNativeLimit) {
1038 goto swapBuffers;
1039 }
1040 }
1041 }
1042
1043 // A random access. Desired index could be in either or niether buf.
1044 // For optimizing the order of testing,
1045 // Most likely case: in the other buffer.
1046 // Second most likely: in neither buffer.
1047 // Unlikely, but must work: in the current buffer.
1048 u8b = (UTF8Buf *)ut->q; // the alternate buffer
1049 if (ix>u8b->bufNativeStart && ix<=u8b->bufNativeLimit) {
1050 // Requested index is in the other buffer.
1051 goto swapBuffers;
1052 }
1053 // Requested index is start-of-string.
1054 // (this is the case of randomly seeking to the start.
1055 // The case of iterating off the start is handled earlier.)
1056 if (ix==0) {
1057 if (u8b->bufNativeStart==0) {
1058 // Alternate buffer contains the data for the start string.
1059 // Make it be the current buffer.
1060 goto swapBuffersAndFail;
1061 } else {
1062 // Request for data before the start of string,
1063 // neither buffer is usable.
1064 // set up a zero-length buffer.
1065 goto makeStubBuffer;
1066 }
1067 }
1068
1069 if (ix<=ut->chunkNativeStart || ix>ut->chunkNativeLimit) {
1070 // Requested index is in neither buffer.
1071 goto fillReverse;
1072 }
1073
1074 // Requested index is in this buffer.
1075 // Set the utf16 buffer index.
1076 u8b = (UTF8Buf *)ut->p;
1077 mapIndex = ix - u8b->toUCharsMapStart;
1078 ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
1079 if (ut->chunkOffset==0) {
1080 // This occurs when the first character in the text is
1081 // a multi-byte UTF-8 char, and the requested index is to
1082 // one of the trailing bytes. Because there is no preceding ,
1083 // character, this access fails. We can't pick up on the
1084 // situation sooner because the requested index is not zero.
1085 return FALSE;
1086 } else {
1087 return TRUE;
1088 }
1089
1090
1091
1092 swapBuffers:
1093 // The alternate buffer (ut->q) has the string data that was requested.
1094 // Swap the primary and alternate buffers, and set the
1095 // chunk index into the new primary buffer.
1096 {
1097 u8b = (UTF8Buf *)ut->q;
1098 ut->q = ut->p;
1099 ut->p = u8b;
1100 ut->chunkContents = &u8b->buf[u8b->bufStartIdx];
1101 ut->chunkLength = u8b->bufLimitIdx - u8b->bufStartIdx;
1102 ut->chunkNativeStart = u8b->bufNativeStart;
1103 ut->chunkNativeLimit = u8b->bufNativeLimit;
1104 ut->nativeIndexingLimit = u8b->bufNILimit;
1105
1106 // Index into the (now current) chunk
1107 // Use the map to set the chunk index. It's more trouble than it's worth
1108 // to check whether native indexing can be used.
1109 U_ASSERT(ix>=u8b->bufNativeStart);
1110 U_ASSERT(ix<=u8b->bufNativeLimit);
1111 mapIndex = ix - u8b->toUCharsMapStart;
1112 U_ASSERT(mapIndex>=0);
1113 U_ASSERT(mapIndex<(int32_t)sizeof(u8b->mapToUChars));
1114 ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
1115
1116 return TRUE;
1117 }
1118
1119
1120 swapBuffersAndFail:
1121 // We got a request for either the start or end of the string,
1122 // with iteration continuing in the out-of-bounds direction.
1123 // The alternate buffer already contains the data up to the
1124 // start/end.
1125 // Swap the buffers, then return failure, indicating that we couldn't
1126 // make things correct for continuing the iteration in the requested
1127 // direction. The position & buffer are correct should the
1128 // user decide to iterate in the opposite direction.
1129 u8b = (UTF8Buf *)ut->q;
1130 ut->q = ut->p;
1131 ut->p = u8b;
1132 ut->chunkContents = &u8b->buf[u8b->bufStartIdx];
1133 ut->chunkLength = u8b->bufLimitIdx - u8b->bufStartIdx;
1134 ut->chunkNativeStart = u8b->bufNativeStart;
1135 ut->chunkNativeLimit = u8b->bufNativeLimit;
1136 ut->nativeIndexingLimit = u8b->bufNILimit;
1137
1138 // Index into the (now current) chunk
1139 // For this function (swapBuffersAndFail), the requested index
1140 // will always be at either the start or end of the chunk.
1141 if (ix==u8b->bufNativeLimit) {
1142 ut->chunkOffset = ut->chunkLength;
1143 } else {
1144 ut->chunkOffset = 0;
1145 U_ASSERT(ix == u8b->bufNativeStart);
1146 }
1147 return FALSE;
1148
1149 makeStubBuffer:
1150 // The user has done a seek/access past the start or end
1151 // of the string. Rather than loading data that is likely
1152 // to never be used, just set up a zero-length buffer at
1153 // the position.
1154 u8b = (UTF8Buf *)ut->q;
1155 u8b->bufNativeStart = ix;
1156 u8b->bufNativeLimit = ix;
1157 u8b->bufStartIdx = 0;
1158 u8b->bufLimitIdx = 0;
1159 u8b->bufNILimit = 0;
1160 u8b->toUCharsMapStart = ix;
1161 u8b->mapToNative[0] = 0;
1162 u8b->mapToUChars[0] = 0;
1163 goto swapBuffersAndFail;
1164
1165
1166
1167 fillForward:
1168 {
1169 // Move the incoming index to a code point boundary.
1170 U8_SET_CP_START(s8, 0, ix);
1171
1172 // Swap the UText buffers.
1173 // We want to fill what was previously the alternate buffer,
1174 // and make what was the current buffer be the new alternate.
1175 UTF8Buf *u8b = (UTF8Buf *)ut->q;
1176 ut->q = ut->p;
1177 ut->p = u8b;
1178
1179 int32_t strLen = ut->b;
1180 UBool nulTerminated = FALSE;
1181 if (strLen < 0) {
1182 strLen = 0x7fffffff;
1183 nulTerminated = TRUE;
1184 }
1185
1186 UChar *buf = u8b->buf;
1187 uint8_t *mapToNative = u8b->mapToNative;
1188 uint8_t *mapToUChars = u8b->mapToUChars;
1189 int32_t destIx = 0;
1190 int32_t srcIx = ix;
1191 UBool seenNonAscii = FALSE;
1192 UChar32 c;
1193
1194 // Fill the chunk buffer and mapping arrays.
1195 while (destIx<UTF8_TEXT_CHUNK_SIZE) {
1196 c = s8[srcIx];
1197 if (c>0 && c<0x80) {
1198 // Special case ASCII range for speed.
1199 // zero is excluded to simplify bounds checking.
1200 buf[destIx] = c;
1201 mapToNative[destIx] = srcIx - ix;
1202 mapToUChars[srcIx-ix] = destIx;
1203 srcIx++;
1204 destIx++;
1205 } else {
1206 // General case, handle everything.
1207 if (seenNonAscii == FALSE) {
1208 seenNonAscii = TRUE;
1209 u8b->bufNILimit = destIx;
1210 }
1211
1212 int32_t cIx = srcIx;
1213 int32_t dIx = destIx;
1214 int32_t dIxSaved = destIx;
1215 U8_NEXT(s8, srcIx, strLen, c);
1216 if (c==0 && nulTerminated) {
1217 srcIx--;
1218 break;
1219 }
1220 if (c<0) {
1221 // Illegal UTF-8. Replace with sub character.
1222 c = 0x0fffd;
1223 }
1224
1225 U16_APPEND_UNSAFE(buf, destIx, c);
1226 do {
1227 mapToNative[dIx++] = cIx - ix;
1228 } while (dIx < destIx);
1229
1230 do {
1231 mapToUChars[cIx++ - ix] = dIxSaved;
1232 } while (cIx < srcIx);
1233 }
1234 if (srcIx>=strLen) {
1235 break;
1236 }
1237
1238 }
1239
1240 // store Native <--> Chunk Map entries for the end of the buffer.
1241 // There is no actual character here, but the index position is valid.
1242 mapToNative[destIx] = srcIx - ix;
1243 mapToUChars[srcIx - ix] = destIx;
1244
1245 // fill in Buffer descriptor
1246 u8b->bufNativeStart = ix;
1247 u8b->bufNativeLimit = srcIx;
1248 u8b->bufStartIdx = 0;
1249 u8b->bufLimitIdx = destIx;
1250 if (seenNonAscii == FALSE) {
1251 u8b->bufNILimit = destIx;
1252 }
1253 u8b->toUCharsMapStart = u8b->bufNativeStart;
1254
1255 // Set UText chunk to refer to this buffer.
1256 ut->chunkContents = buf;
1257 ut->chunkOffset = 0;
1258 ut->chunkLength = u8b->bufLimitIdx;
1259 ut->chunkNativeStart = u8b->bufNativeStart;
1260 ut->chunkNativeLimit = u8b->bufNativeLimit;
1261 ut->nativeIndexingLimit = u8b->bufNILimit;
1262
1263 // For zero terminated strings, keep track of the maximum point
1264 // scanned so far.
1265 if (nulTerminated && srcIx>ut->c) {
1266 ut->c = srcIx;
1267 if (c==0) {
1268 // We scanned to the end.
1269 // Remember the actual length.
1270 ut->b = srcIx;
1271 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
1272 }
1273 }
1274 return TRUE;
1275 }
1276
1277
1278 fillReverse:
1279 {
1280 // Move the incoming index to a code point boundary.
1281 // Can only do this if the incoming index is somewhere in the interior of the string.
1282 // If index is at the end, there is no character there to look at.
1283 if (ix != ut->b) {
1284 U8_SET_CP_START(s8, 0, ix);
1285 }
1286
1287 // Swap the UText buffers.
1288 // We want to fill what was previously the alternate buffer,
1289 // and make what was the current buffer be the new alternate.
1290 UTF8Buf *u8b = (UTF8Buf *)ut->q;
1291 ut->q = ut->p;
1292 ut->p = u8b;
1293
1294 UChar *buf = u8b->buf;
1295 uint8_t *mapToNative = u8b->mapToNative;
1296 uint8_t *mapToUChars = u8b->mapToUChars;
1297 int32_t toUCharsMapStart = ix - (UTF8_TEXT_CHUNK_SIZE*3 + 1);
1298 int32_t destIx = UTF8_TEXT_CHUNK_SIZE+2; // Start in the overflow region
1299 // at end of buffer to leave room
1300 // for a surrogate pair at the
1301 // buffer start.
1302 int32_t srcIx = ix;
1303 int32_t bufNILimit = destIx;
1304 UChar32 c;
1305
1306 // Map to/from Native Indexes, fill in for the position at the end of
1307 // the buffer.
1308 //
1309 mapToNative[destIx] = srcIx - toUCharsMapStart;
1310 mapToUChars[srcIx - toUCharsMapStart] = destIx;
1311
1312 // Fill the chunk buffer
1313 // Work backwards, filling from the end of the buffer towards the front.
1314 //
1315 while (destIx>2 && (srcIx - toUCharsMapStart > 5) && (srcIx > 0)) {
1316 srcIx--;
1317 destIx--;
1318
1319 // Get last byte of the UTF-8 character
1320 c = s8[srcIx];
1321 if (c<0x80) {
1322 // Special case ASCII range for speed.
1323 buf[destIx] = c;
1324 mapToUChars[srcIx - toUCharsMapStart] = destIx;
1325 mapToNative[destIx] = srcIx - toUCharsMapStart;
1326 } else {
1327 // General case, handle everything non-ASCII.
1328
1329 int32_t sIx = srcIx; // ix of last byte of multi-byte u8 char
1330
1331 // Get the full character from the UTF8 string.
1332 // use code derived from tbe macros in utf.8
1333 // Leaves srcIx pointing at the first byte of the UTF-8 char.
1334 //
1335 if (c<=0xbf) {
1336 c=utf8_prevCharSafeBody(s8, 0, &srcIx, c, -1);
1337 // leaves srcIx at first byte of the multi-byte char.
1338 } else {
1339 c=0x0fffd;
1340 }
1341
1342 // Store the character in UTF-16 buffer.
1343 if (c<0x10000) {
1344 buf[destIx] = c;
1345 mapToNative[destIx] = srcIx - toUCharsMapStart;
1346 } else {
1347 buf[destIx] = U16_TRAIL(c);
1348 mapToNative[destIx] = srcIx - toUCharsMapStart;
1349 buf[--destIx] = U16_LEAD(c);
1350 mapToNative[destIx] = srcIx - toUCharsMapStart;
1351 }
1352
1353 // Fill in the map from native indexes to UChars buf index.
1354 do {
1355 mapToUChars[sIx-- - toUCharsMapStart] = destIx;
1356 } while (sIx >= srcIx);
1357
1358 // Set native indexing limit to be the current position.
1359 // We are processing a non-ascii, non-native-indexing char now;
1360 // the limit will be here if the rest of the chars to be
1361 // added to this buffer are ascii.
1362 bufNILimit = destIx;
1363 }
1364 }
1365 u8b->bufNativeStart = srcIx;
1366 u8b->bufNativeLimit = ix;
1367 u8b->bufStartIdx = destIx;
1368 u8b->bufLimitIdx = UTF8_TEXT_CHUNK_SIZE+2;
1369 u8b->bufNILimit = bufNILimit - u8b->bufStartIdx;
1370 u8b->toUCharsMapStart = toUCharsMapStart;
1371
1372 ut->chunkContents = &buf[u8b->bufStartIdx];
1373 ut->chunkLength = u8b->bufLimitIdx - u8b->bufStartIdx;
1374 ut->chunkOffset = ut->chunkLength;
1375 ut->chunkNativeStart = u8b->bufNativeStart;
1376 ut->chunkNativeLimit = u8b->bufNativeLimit;
1377 ut->nativeIndexingLimit = u8b->bufNILimit;
1378 return TRUE;
1379 }
1380
1381 }
1382
1383
1384
1385 //
1386 // This is a slightly modified copy of u_strFromUTF8,
1387 // Inserts a Replacement Char rather than failing on invalid UTF-8
1388 // Removes unnecessary features.
1389 //
1390 static UChar*
1391 utext_strFromUTF8(UChar *dest,
1392 int32_t destCapacity,
1393 int32_t *pDestLength,
1394 const char* src,
1395 int32_t srcLength, // required. NUL terminated not supported.
1396 UErrorCode *pErrorCode
1397 )
1398 {
1399
1400 UChar *pDest = dest;
1401 UChar *pDestLimit = dest+destCapacity;
1402 UChar32 ch=0;
1403 int32_t index = 0;
1404 int32_t reqLength = 0;
1405 uint8_t* pSrc = (uint8_t*) src;
1406
1407
1408 while((index < srcLength)&&(pDest<pDestLimit)){
1409 ch = pSrc[index++];
1410 if(ch <=0x7f){
1411 *pDest++=(UChar)ch;
1412 }else{
1413 ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
1414 if(ch<0){
1415 ch = 0xfffd;
1416 }
1417 if(ch<=0xFFFF){
1418 *(pDest++)=(UChar)ch;
1419 }else{
1420 *(pDest++)=UTF16_LEAD(ch);
1421 if(pDest<pDestLimit){
1422 *(pDest++)=UTF16_TRAIL(ch);
1423 }else{
1424 reqLength++;
1425 break;
1426 }
1427 }
1428 }
1429 }
1430 /* donot fill the dest buffer just count the UChars needed */
1431 while(index < srcLength){
1432 ch = pSrc[index++];
1433 if(ch <= 0x7f){
1434 reqLength++;
1435 }else{
1436 ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
1437 if(ch<0){
1438 ch = 0xfffd;
1439 }
1440 reqLength+=UTF_CHAR_LENGTH(ch);
1441 }
1442 }
1443
1444 reqLength+=(int32_t)(pDest - dest);
1445
1446 if(pDestLength){
1447 *pDestLength = reqLength;
1448 }
1449
1450 /* Terminate the buffer */
1451 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
1452
1453 return dest;
1454 }
1455
1456
1457
1458 static int32_t U_CALLCONV
1459 utf8TextExtract(UText *ut,
1460 int64_t start, int64_t limit,
1461 UChar *dest, int32_t destCapacity,
1462 UErrorCode *pErrorCode) {
1463 if(U_FAILURE(*pErrorCode)) {
1464 return 0;
1465 }
1466 if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
1467 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1468 return 0;
1469 }
1470 int32_t length = ut->b;
1471 int32_t start32 = pinIndex(start, length);
1472 int32_t limit32 = pinIndex(limit, length);
1473
1474 if(start32>limit32) {
1475 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1476 return 0;
1477 }
1478
1479
1480 // adjust the incoming indexes to land on code point boundaries if needed.
1481 // adjust by no more than three, because that is the largest number of trail bytes
1482 // in a well formed UTF8 character.
1483 const uint8_t *buf = (const uint8_t *)ut->context;
1484 int i;
1485 if (start32 < ut->chunkNativeLimit) {
1486 for (i=0; i<3; i++) {
1487 if (U8_IS_LEAD(buf[start32]) || start32==0) {
1488 break;
1489 }
1490 start32--;
1491 }
1492 }
1493
1494 if (limit32 < ut->chunkNativeLimit) {
1495 for (i=0; i<3; i++) {
1496 if (U8_IS_LEAD(buf[limit32]) || limit32==0) {
1497 break;
1498 }
1499 limit32--;
1500 }
1501 }
1502
1503 // Do the actual extract.
1504 int32_t destLength=0;
1505 utext_strFromUTF8(dest, destCapacity, &destLength,
1506 (const char *)ut->context+start32, limit32-start32,
1507 pErrorCode);
1508 return destLength;
1509 }
1510
1511 //
1512 // utf8TextMapOffsetToNative
1513 //
1514 // Map a chunk (UTF-16) offset to a native index.
1515 static int64_t U_CALLCONV
1516 utf8TextMapOffsetToNative(const UText *ut) {
1517 //
1518 UTF8Buf *u8b = (UTF8Buf *)ut->p;
1519 U_ASSERT(ut->chunkOffset>ut->nativeIndexingLimit && ut->chunkOffset<=ut->chunkLength);
1520 int32_t nativeOffset = u8b->mapToNative[ut->chunkOffset + u8b->bufStartIdx] + u8b->toUCharsMapStart;
1521 U_ASSERT(nativeOffset >= ut->chunkNativeStart && nativeOffset <= ut->chunkNativeLimit);
1522 return nativeOffset;
1523 }
1524
1525 //
1526 // Map a native index to the corrsponding chunk offset
1527 //
1528 static int32_t U_CALLCONV
1529 utf8TextMapIndexToUTF16(const UText *ut, int64_t index64) {
1530 U_ASSERT(index64 <= 0x7fffffff);
1531 int32_t index = (int32_t)index64;
1532 UTF8Buf *u8b = (UTF8Buf *)ut->p;
1533 U_ASSERT(index>=ut->chunkNativeStart+ut->nativeIndexingLimit);
1534 U_ASSERT(index<=ut->chunkNativeLimit);
1535 int32_t mapIndex = index - u8b->toUCharsMapStart;
1536 int32_t offset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
1537 U_ASSERT(offset>=0 && offset<=ut->chunkLength);
1538 return offset;
1539 }
1540
1541 static UText * U_CALLCONV
1542 utf8TextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status)
1543 {
1544 // First do a generic shallow clone. Does everything needed for the UText struct itself.
1545 dest = shallowTextClone(dest, src, status);
1546
1547 // For deep clones, make a copy of the string.
1548 // The copied storage is owned by the newly created clone.
1549 //
1550 // TODO: There is an isssue with using utext_nativeLength().
1551 // That function is non-const in cases where the input was NUL terminated
1552 // and the length has not yet been determined.
1553 // This function (clone()) is const.
1554 // There potentially a thread safety issue lurking here.
1555 //
1556 if (deep && U_SUCCESS(*status)) {
1557 int32_t len = (int32_t)utext_nativeLength((UText *)src);
1558 char *copyStr = (char *)uprv_malloc(len+1);
1559 if (copyStr == NULL) {
1560 *status = U_MEMORY_ALLOCATION_ERROR;
1561 } else {
1562 uprv_memcpy(copyStr, src->context, len+1);
1563 dest->context = copyStr;
1564 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
1565 }
1566 }
1567 return dest;
1568 }
1569
1570
1571 static void U_CALLCONV
1572 utf8TextClose(UText *ut) {
1573 // Most of the work of close is done by the generic UText framework close.
1574 // All that needs to be done here is to delete the UTF8 string if the UText
1575 // owns it. This occurs if the UText was created by cloning.
1576 if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
1577 char *s = (char *)ut->context;
1578 uprv_free(s);
1579 ut->context = NULL;
1580 }
1581 }
1582
1583 U_CDECL_END
1584
1585
1586 static struct UTextFuncs utf8Funcs =
1587 {
1588 sizeof(UTextFuncs),
1589 0, 0, 0, // Reserved alignment padding
1590 utf8TextClone,
1591 utf8TextLength,
1592 utf8TextAccess,
1593 utf8TextExtract,
1594 NULL, /* replace*/
1595 NULL, /* copy */
1596 utf8TextMapOffsetToNative,
1597 utf8TextMapIndexToUTF16,
1598 utf8TextClose,
1599 NULL, // spare 1
1600 NULL, // spare 2
1601 NULL // spare 3
1602 };
1603
1604
1605 U_CAPI UText * U_EXPORT2
1606 utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode *status) {
1607 if(U_FAILURE(*status)) {
1608 return NULL;
1609 }
1610 if(s==NULL || length<-1 || length>INT32_MAX) {
1611 *status=U_ILLEGAL_ARGUMENT_ERROR;
1612 return NULL;
1613 }
1614
1615 ut = utext_setup(ut, sizeof(UTF8Buf) * 2, status);
1616 if (U_FAILURE(*status)) {
1617 return ut;
1618 }
1619
1620 ut->pFuncs = &utf8Funcs;
1621 ut->context = s;
1622 ut->b = (int32_t)length;
1623 ut->c = (int32_t)length;
1624 if (ut->c < 0) {
1625 ut->c = 0;
1626 ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
1627 }
1628 ut->p = ut->pExtra;
1629 ut->q = (char *)ut->pExtra + sizeof(UTF8Buf);
1630 return ut;
1631
1632 }
1633
1634
1635
1636
1637
1638
1639
1640
1641 //------------------------------------------------------------------------------
1642 //
1643 // UText implementation wrapper for Replaceable (read/write)
1644 //
1645 // Use of UText data members:
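//      context    pointer to Replaceable.
//                 Owned by the UText, and deleted by repTextClose(), when
//                 UTEXT_PROVIDER_OWNS_TEXT is set in providerProperties
//                 (a deep clone sets it).  Ownership is not signalled via p.
//      pExtra     ReplExtra, holding the chunk buffer.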
1648 //
1649 //------------------------------------------------------------------------------
1650
1651
1652
1653 // minimum chunk size for this implementation: 3
1654 // to allow for possible trimming for code point boundaries
1655 enum { REP_TEXT_CHUNK_SIZE=10 };
1656
1657 struct ReplExtra {
1658 /*
1659 * Chunk UChars.
1660 * +1 to simplify filling with surrogate pair at the end.
1661 */
1662 UChar s[REP_TEXT_CHUNK_SIZE+1];
1663 };
1664
1665
1666 U_CDECL_BEGIN
1667
1668 static UText * U_CALLCONV
1669 repTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) {
1670 // First do a generic shallow clone. Does everything needed for the UText struct itself.
1671 dest = shallowTextClone(dest, src, status);
1672
1673 // For deep clones, make a copy of the Replaceable.
1674 // The copied Replaceable storage is owned by the newly created UText clone.
1675 // A non-NULL pointer in UText.p is the signal to the close() function to delete
1676 // it.
1677 //
1678 if (deep && U_SUCCESS(*status)) {
1679 const Replaceable *replSrc = (const Replaceable *)src->context;
1680 dest->context = replSrc->clone();
1681 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
1682
1683 // with deep clone, the copy is writable, even when the source is not.
1684 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE);
1685 }
1686 return dest;
1687 }
1688
1689
1690 static void U_CALLCONV
1691 repTextClose(UText *ut) {
1692 // Most of the work of close is done by the generic UText framework close.
1693 // All that needs to be done here is delete the Replaceable if the UText
1694 // owns it. This occurs if the UText was created by cloning.
1695 if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
1696 Replaceable *rep = (Replaceable *)ut->context;
1697 delete rep;
1698 ut->context = NULL;
1699 }
1700 }
1701
1702
1703 static int64_t U_CALLCONV
1704 repTextLength(UText *ut) {
1705 const Replaceable *replSrc = (const Replaceable *)ut->context;
1706 int32_t len = replSrc->length();
1707 return len;
1708 }
1709
1710
1711 static UBool U_CALLCONV
1712 repTextAccess(UText *ut, int64_t index, UBool forward) {
1713 const Replaceable *rep=(const Replaceable *)ut->context;
1714 int32_t length=rep->length(); // Full length of the input text (bigger than a chunk)
1715
1716 // clip the requested index to the limits of the text.
1717 int32_t index32 = pinIndex(index, length);
1718 U_ASSERT(index<=INT32_MAX);
1719
1720
1721 /*
1722 * Compute start/limit boundaries around index, for a segment of text
1723 * to be extracted.
1724 * To allow for the possibility that our user gave an index to the trailing
1725 * half of a surrogate pair, we must request one extra preceding UChar when
1726 * going in the forward direction. This will ensure that the buffer has the
1727 * entire code point at the specified index.
1728 */
1729 if(forward) {
1730
1731 if (index32>=ut->chunkNativeStart && index32<ut->chunkNativeLimit) {
1732 // Buffer already contains the requested position.
1733 ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
1734 return TRUE;
1735 }
1736 if (index32>=length && ut->chunkNativeLimit==length) {
1737 // Request for end of string, and buffer already extends up to it.
1738 // Can't get the data, but don't change the buffer.
1739 ut->chunkOffset = length - (int32_t)ut->chunkNativeStart;
1740 return FALSE;
1741 }
1742
1743 ut->chunkNativeLimit = index + REP_TEXT_CHUNK_SIZE - 1;
1744 // Going forward, so we want to have the buffer with stuff at and beyond
1745 // the requested index. The -1 gets us one code point before the
1746 // requested index also, to handle the case of the index being on
1747 // a trail surrogate of a surrogate pair.
1748 if(ut->chunkNativeLimit > length) {
1749 ut->chunkNativeLimit = length;
1750 }
1751 // unless buffer ran off end, start is index-1.
1752 ut->chunkNativeStart = ut->chunkNativeLimit - REP_TEXT_CHUNK_SIZE;
1753 if(ut->chunkNativeStart < 0) {
1754 ut->chunkNativeStart = 0;
1755 }
1756 } else {
1757 // Reverse iteration. Fill buffer with data preceding the requested index.
1758 if (index32>ut->chunkNativeStart && index32<=ut->chunkNativeLimit) {
1759 // Requested position already in buffer.
1760 ut->chunkOffset = index32 - (int32_t)ut->chunkNativeStart;
1761 return TRUE;
1762 }
1763 if (index32==0 && ut->chunkNativeStart==0) {
1764 // Request for start, buffer already begins at start.
1765 // No data, but keep the buffer as is.
1766 ut->chunkOffset = 0;
1767 return FALSE;
1768 }
1769
1770 // Figure out the bounds of the chunk to extract for reverse iteration.
1771 // Need to worry about chunk not splitting surrogate pairs, and while still
1772 // containing the data we need.
1773 // Fix by requesting a chunk that includes an extra UChar at the end.
1774 // If this turns out to be a lead surrogate, we can lop it off and still have
1775 // the data we wanted.
1776 ut->chunkNativeStart = index32 + 1 - REP_TEXT_CHUNK_SIZE;
1777 if (ut->chunkNativeStart < 0) {
1778 ut->chunkNativeStart = 0;
1779 }
1780
1781 ut->chunkNativeLimit = index32 + 1;
1782 if (ut->chunkNativeLimit > length) {
1783 ut->chunkNativeLimit = length;
1784 }
1785 }
1786
1787 // Extract the new chunk of text from the Replaceable source.
1788 ReplExtra *ex = (ReplExtra *)ut->pExtra;
1789 // UnicodeString with its buffer a writable alias to the chunk buffer
1790 UnicodeString buffer(ex->s, 0 /*buffer length*/, REP_TEXT_CHUNK_SIZE /*buffer capacity*/);
1791 rep->extractBetween((int32_t)ut->chunkNativeStart, (int32_t)ut->chunkNativeLimit, buffer);
1792
1793 ut->chunkContents = ex->s;
1794 ut->chunkLength = (int32_t)(ut->chunkNativeLimit - ut->chunkNativeStart);
1795 ut->chunkOffset = (int32_t)(index32 - ut->chunkNativeStart);
1796
1797 // Surrogate pairs from the input text must not span chunk boundaries.
1798 // If end of chunk could be the start of a surrogate, trim it off.
1799 if (ut->chunkNativeLimit < length &&
1800 U16_IS_LEAD(ex->s[ut->chunkLength-1])) {
1801 ut->chunkLength--;
1802 ut->chunkNativeLimit--;
1803 if (ut->chunkOffset > ut->chunkLength) {
1804 ut->chunkOffset = ut->chunkLength;
1805 }
1806 }
1807
1808 // if the first UChar in the chunk could be the trailing half of a surrogate pair,
1809 // trim it off.
1810 if(ut->chunkNativeStart>0 && U16_IS_TRAIL(ex->s[0])) {
1811 ++(ut->chunkContents);
1812 ++(ut->chunkNativeStart);
1813 --(ut->chunkLength);
1814 --(ut->chunkOffset);
1815 }
1816
1817 // adjust the index/chunkOffset to a code point boundary
1818 U16_SET_CP_START(ut->chunkContents, 0, ut->chunkOffset);
1819
1820 // Use fast indexing for get/setNativeIndex()
1821 ut->nativeIndexingLimit = ut->chunkLength;
1822
1823 return TRUE;
1824 }
1825
1826
1827
1828 static int32_t U_CALLCONV
1829 repTextExtract(UText *ut,
1830 int64_t start, int64_t limit,
1831 UChar *dest, int32_t destCapacity,
1832 UErrorCode *status) {
1833 const Replaceable *rep=(const Replaceable *)ut->context;
1834 int32_t length=rep->length();
1835
1836 if(U_FAILURE(*status)) {
1837 return 0;
1838 }
1839 if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
1840 *status=U_ILLEGAL_ARGUMENT_ERROR;
1841 }
1842 if(start>limit) {
1843 *status=U_INDEX_OUTOFBOUNDS_ERROR;
1844 return 0;
1845 }
1846
1847 int32_t start32 = pinIndex(start, length);
1848 int32_t limit32 = pinIndex(limit, length);
1849
1850 // adjust start, limit if they point to trail half of surrogates
1851 if (start32<length && U16_IS_TRAIL(rep->charAt(start32)) &&
1852 U_IS_SUPPLEMENTARY(rep->char32At(start32))){
1853 start32--;
1854 }
1855 if (limit32<length && U16_IS_TRAIL(rep->charAt(limit32)) &&
1856 U_IS_SUPPLEMENTARY(rep->char32At(limit32))){
1857 limit32--;
1858 }
1859
1860 length=limit32-start32;
1861 if(length>destCapacity) {
1862 limit32 = start32 + destCapacity;
1863 }
1864 UnicodeString buffer(dest, 0, destCapacity); // writable alias
1865 rep->extractBetween(start32, limit32, buffer);
1866 return u_terminateUChars(dest, destCapacity, length, status);
1867 }
1868
1869 static int32_t U_CALLCONV
1870 repTextReplace(UText *ut,
1871 int64_t start, int64_t limit,
1872 const UChar *src, int32_t length,
1873 UErrorCode *status) {
1874 Replaceable *rep=(Replaceable *)ut->context;
1875 int32_t oldLength;
1876
1877 if(U_FAILURE(*status)) {
1878 return 0;
1879 }
1880 if(src==NULL && length!=0) {
1881 *status=U_ILLEGAL_ARGUMENT_ERROR;
1882 return 0;
1883 }
1884 oldLength=rep->length(); // will subtract from new length
1885 if(start>limit ) {
1886 *status=U_INDEX_OUTOFBOUNDS_ERROR;
1887 return 0;
1888 }
1889
1890 int32_t start32 = pinIndex(start, oldLength);
1891 int32_t limit32 = pinIndex(limit, oldLength);
1892
1893 // Snap start & limit to code point boundaries.
1894 if (start32<oldLength && U16_IS_TRAIL(rep->charAt(start32)) &&
1895 start32>0 && U16_IS_LEAD(rep->charAt(start32-1)))
1896 {
1897 start32--;
1898 }
1899 if (limit32<oldLength && U16_IS_LEAD(rep->charAt(limit32-1)) &&
1900 U16_IS_TRAIL(rep->charAt(limit32)))
1901 {
1902 limit32++;
1903 }
1904
1905 // Do the actual replace operation using methods of the Replaceable class
1906 UnicodeString replStr((UBool)(length<0), src, length); // read-only alias
1907 rep->handleReplaceBetween(start32, limit32, replStr);
1908 int32_t newLength = rep->length();
1909 int32_t lengthDelta = newLength - oldLength;
1910
1911 // Is the UText chunk buffer OK?
1912 if (ut->chunkNativeLimit > start32) {
1913 // this replace operation may have impacted the current chunk.
1914 // invalidate it, which will force a reload on the next access.
1915 invalidateChunk(ut);
1916 }
1917
1918 // set the iteration position to the end of the newly inserted replacement text.
1919 int32_t newIndexPos = limit32 + lengthDelta;
1920 repTextAccess(ut, newIndexPos, TRUE);
1921
1922 return lengthDelta;
1923 }
1924
1925
1926 static void U_CALLCONV
1927 repTextCopy(UText *ut,
1928 int64_t start, int64_t limit,
1929 int64_t destIndex,
1930 UBool move,
1931 UErrorCode *status)
1932 {
1933 Replaceable *rep=(Replaceable *)ut->context;
1934 int32_t length=rep->length();
1935
1936 if(U_FAILURE(*status)) {
1937 return;
1938 }
1939 if (start>limit || (start<destIndex && destIndex<limit))
1940 {
1941 *status=U_INDEX_OUTOFBOUNDS_ERROR;
1942 return;
1943 }
1944
1945 int32_t start32 = pinIndex(start, length);
1946 int32_t limit32 = pinIndex(limit, length);
1947 int32_t destIndex32 = pinIndex(destIndex, length);
1948
1949 // TODO: snap input parameters to code point boundaries.
1950
1951 if(move) {
1952 // move: copy to destIndex, then replace original with nothing
1953 int32_t segLength=limit32-start32;
1954 rep->copy(start32, limit32, destIndex32);
1955 if(destIndex32<start32) {
1956 start32+=segLength;
1957 limit32+=segLength;
1958 }
1959 rep->handleReplaceBetween(start32, limit32, UnicodeString());
1960 } else {
1961 // copy
1962 rep->copy(start32, limit32, destIndex32);
1963 }
1964
1965 // If the change to the text touched the region in the chunk buffer,
1966 // invalidate the buffer.
1967 int32_t firstAffectedIndex = destIndex32;
1968 if (move && start32<firstAffectedIndex) {
1969 firstAffectedIndex = start32;
1970 }
1971 if (firstAffectedIndex < ut->chunkNativeLimit) {
1972 // changes may have affected range covered by the chunk
1973 invalidateChunk(ut);
1974 }
1975
1976 // Put iteration position at the newly inserted (moved) block,
1977 int32_t nativeIterIndex = destIndex32 + limit32 - start32;
1978 if (move && destIndex32>start32) {
1979 // moved a block of text towards the end of the string.
1980 nativeIterIndex = destIndex32;
1981 }
1982
1983 // Set position, reload chunk if needed.
1984 repTextAccess(ut, nativeIterIndex, TRUE);
1985 }
1986
1987 static struct UTextFuncs repFuncs =
1988 {
1989 sizeof(UTextFuncs),
1990 0, 0, 0, // Reserved alignment padding
1991 repTextClone,
1992 repTextLength,
1993 repTextAccess,
1994 repTextExtract,
1995 repTextReplace,
1996 repTextCopy,
1997 NULL, // MapOffsetToNative,
1998 NULL, // MapIndexToUTF16,
1999 repTextClose,
2000 NULL, // spare 1
2001 NULL, // spare 2
2002 NULL // spare 3
2003 };
2004
2005
2006 U_CAPI UText * U_EXPORT2
2007 utext_openReplaceable(UText *ut, Replaceable *rep, UErrorCode *status)
2008 {
2009 if(U_FAILURE(*status)) {
2010 return NULL;
2011 }
2012 if(rep==NULL) {
2013 *status=U_ILLEGAL_ARGUMENT_ERROR;
2014 return NULL;
2015 }
2016 ut = utext_setup(ut, sizeof(ReplExtra), status);
2017
2018 ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_WRITABLE);
2019 if(rep->hasMetaData()) {
2020 ut->providerProperties |=I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA);
2021 }
2022
2023 ut->pFuncs = &repFuncs;
2024 ut->context = rep;
2025 return ut;
2026 }
2027
2028 U_CDECL_END
2029
2030
2031
2032
2033
2034
2035
2036
2037 //------------------------------------------------------------------------------
2038 //
2039 // UText implementation for UnicodeString (read/write) and
2040 // for const UnicodeString (read only)
2041 // (same implementation, only the flags are different)
2042 //
2043 // Use of UText data members:
2044 // context pointer to UnicodeString
//                 The string is deleted by unistrTextClose() when
//                 UTEXT_PROVIDER_OWNS_TEXT is set in providerProperties
//                 (a deep clone sets it); that function does not consult p.
2047 //
2048 //------------------------------------------------------------------------------
2049
2050 U_CDECL_BEGIN
2051
2052
2053 static UText * U_CALLCONV
2054 unistrTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) {
2055 // First do a generic shallow clone. Does everything needed for the UText struct itself.
2056 dest = shallowTextClone(dest, src, status);
2057
2058 // For deep clones, make a copy of the UnicodeSring.
2059 // The copied UnicodeString storage is owned by the newly created UText clone.
2060 // A non-NULL pointer in UText.p is the signal to the close() function to delete
2061 // the UText.
2062 //
2063 if (deep && U_SUCCESS(*status)) {
2064 const UnicodeString *srcString = (const UnicodeString *)src->context;
2065 dest->context = new UnicodeString(*srcString);
2066 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
2067
2068 // with deep clone, the copy is writable, even when the source is not.
2069 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE);
2070 }
2071 return dest;
2072 }
2073
2074 static void U_CALLCONV
2075 unistrTextClose(UText *ut) {
2076 // Most of the work of close is done by the generic UText framework close.
2077 // All that needs to be done here is delete the UnicodeString if the UText
2078 // owns it. This occurs if the UText was created by cloning.
2079 if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
2080 UnicodeString *str = (UnicodeString *)ut->context;
2081 delete str;
2082 ut->context = NULL;
2083 }
2084 }
2085
2086
2087 static int64_t U_CALLCONV
2088 unistrTextLength(UText *t) {
2089 return ((const UnicodeString *)t->context)->length();
2090 }
2091
2092
2093 static UBool U_CALLCONV
2094 unistrTextAccess(UText *ut, int64_t index, UBool forward) {
2095 int32_t length = ut->chunkLength;
2096 ut->chunkOffset = pinIndex(index, length);
2097
2098 // Check whether request is at the start or end
2099 UBool retVal = (forward && index<length) || (!forward && index>0);
2100 return retVal;
2101 }
2102
2103
2104
2105 static int32_t U_CALLCONV
2106 unistrTextExtract(UText *t,
2107 int64_t start, int64_t limit,
2108 UChar *dest, int32_t destCapacity,
2109 UErrorCode *pErrorCode) {
2110 const UnicodeString *us=(const UnicodeString *)t->context;
2111 int32_t length=us->length();
2112
2113 if(U_FAILURE(*pErrorCode)) {
2114 return 0;
2115 }
2116 if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
2117 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
2118 }
2119 if(start<0 || start>limit) {
2120 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2121 return 0;
2122 }
2123
2124 int32_t start32 = start<length ? us->getChar32Start((int32_t)start) : length;
2125 int32_t limit32 = limit<length ? us->getChar32Start((int32_t)limit) : length;
2126
2127 length=limit32-start32;
2128 if (destCapacity>0 && dest!=NULL) {
2129 int32_t trimmedLength = length;
2130 if(trimmedLength>destCapacity) {
2131 trimmedLength=destCapacity;
2132 }
2133 us->extract(start32, trimmedLength, dest);
2134 }
2135 u_terminateUChars(dest, destCapacity, length, pErrorCode);
2136 return length;
2137 }
2138
2139 static int32_t U_CALLCONV
2140 unistrTextReplace(UText *ut,
2141 int64_t start, int64_t limit,
2142 const UChar *src, int32_t length,
2143 UErrorCode *pErrorCode) {
2144 UnicodeString *us=(UnicodeString *)ut->context;
2145 int32_t oldLength;
2146
2147 if(U_FAILURE(*pErrorCode)) {
2148 return 0;
2149 }
2150 if(src==NULL && length!=0) {
2151 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
2152 }
2153 if(start>limit) {
2154 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2155 return 0;
2156 }
2157 oldLength=us->length();
2158 int32_t start32 = pinIndex(start, oldLength);
2159 int32_t limit32 = pinIndex(limit, oldLength);
2160 if (start32 < oldLength) {
2161 start32 = us->getChar32Start(start32);
2162 }
2163 if (limit32 < oldLength) {
2164 limit32 = us->getChar32Start(limit32);
2165 }
2166
2167 // replace
2168 us->replace(start32, limit32-start32, src, length);
2169 int32_t newLength = us->length();
2170
2171 // Update the chunk description.
2172 ut->chunkContents = us->getBuffer();
2173 ut->chunkLength = newLength;
2174 ut->chunkNativeLimit = newLength;
2175 ut->nativeIndexingLimit = newLength;
2176
2177 // Set iteration position to the point just following the newly inserted text.
2178 int32_t lengthDelta = newLength - oldLength;
2179 ut->chunkOffset = limit32 + lengthDelta;
2180
2181 return lengthDelta;
2182 }
2183
2184 static void U_CALLCONV
2185 unistrTextCopy(UText *ut,
2186 int64_t start, int64_t limit,
2187 int64_t destIndex,
2188 UBool move,
2189 UErrorCode *pErrorCode) {
2190 UnicodeString *us=(UnicodeString *)ut->context;
2191 int32_t length=us->length();
2192
2193 if(U_FAILURE(*pErrorCode)) {
2194 return;
2195 }
2196 int32_t start32 = pinIndex(start, length);
2197 int32_t limit32 = pinIndex(limit, length);
2198 int32_t destIndex32 = pinIndex(destIndex, length);
2199
2200 if( start32>limit32 || (start32<destIndex32 && destIndex32<limit32)) {
2201 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2202 return;
2203 }
2204
2205 if(move) {
2206 // move: copy to destIndex, then replace original with nothing
2207 int32_t segLength=limit32-start32;
2208 us->copy(start32, limit32, destIndex32);
2209 if(destIndex32<start32) {
2210 start32+=segLength;
2211 }
2212 us->replace(start32, segLength, NULL, 0);
2213 } else {
2214 // copy
2215 us->copy(start32, limit32, destIndex32);
2216 }
2217
2218 // update chunk description, set iteration position.
2219 ut->chunkContents = us->getBuffer();
2220 if (move==FALSE) {
2221 // copy operation, string length grows
2222 ut->chunkLength += limit32-start32;
2223 ut->chunkNativeLimit = ut->chunkLength;
2224 ut->nativeIndexingLimit = ut->chunkLength;
2225 }
2226
2227 // Iteration position to end of the newly inserted text.
2228 ut->chunkOffset = destIndex32+limit32-start32;
2229 if (move && destIndex32>start32) {
2230 ut->chunkOffset = destIndex32;
2231 }
2232
2233 }
2234
2235 static struct UTextFuncs unistrFuncs =
2236 {
2237 sizeof(UTextFuncs),
2238 0, 0, 0, // Reserved alignment padding
2239 unistrTextClone,
2240 unistrTextLength,
2241 unistrTextAccess,
2242 unistrTextExtract,
2243 unistrTextReplace,
2244 unistrTextCopy,
2245 NULL, // MapOffsetToNative,
2246 NULL, // MapIndexToUTF16,
2247 unistrTextClose,
2248 NULL, // spare 1
2249 NULL, // spare 2
2250 NULL // spare 3
2251 };
2252
2253
2254
2255 U_CDECL_END
2256
2257
2258 U_CAPI UText * U_EXPORT2
2259 utext_openUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status) {
2260 // TODO: use openConstUnicodeString, then add in the differences.
2261 //
2262 ut = utext_setup(ut, 0, status);
2263 if (U_SUCCESS(*status)) {
2264 ut->pFuncs = &unistrFuncs;
2265 ut->context = s;
2266 ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS)|
2267 I32_FLAG(UTEXT_PROVIDER_WRITABLE);
2268
2269 ut->chunkContents = s->getBuffer();
2270 ut->chunkLength = s->length();
2271 ut->chunkNativeStart = 0;
2272 ut->chunkNativeLimit = ut->chunkLength;
2273 ut->nativeIndexingLimit = ut->chunkLength;
2274 }
2275 return ut;
2276 }
2277
2278
2279
2280 U_CAPI UText * U_EXPORT2
2281 utext_openConstUnicodeString(UText *ut, const UnicodeString *s, UErrorCode *status) {
2282 ut = utext_setup(ut, 0, status);
2283 // note: use the standard (writable) function table for UnicodeString.
2284 // The flag settings disable writing, so having the functions in
2285 // the table is harmless.
2286 if (U_SUCCESS(*status)) {
2287 ut->pFuncs = &unistrFuncs;
2288 ut->context = s;
2289 ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS);
2290 ut->chunkContents = s->getBuffer();
2291 ut->chunkLength = s->length();
2292 ut->chunkNativeStart = 0;
2293 ut->chunkNativeLimit = ut->chunkLength;
2294 ut->nativeIndexingLimit = ut->chunkLength;
2295 }
2296 return ut;
2297 }
2298
//------------------------------------------------------------------------------
//
//     UText implementation for const UChar * strings
//
//         Use of UText data members:
//            context    pointer to the UChar string
//            a          length of the string.  -1 if not yet known.
//
//         TODO:  support 64 bit lengths.
//
//------------------------------------------------------------------------------
2310
2311 U_CDECL_BEGIN
2312
2313
2314 static UText * U_CALLCONV
2315 ucstrTextClone(UText *dest, const UText * src, UBool deep, UErrorCode * status) {
2316 // First do a generic shallow clone.
2317 dest = shallowTextClone(dest, src, status);
2318
2319 // For deep clones, make a copy of the string.
2320 // The copied storage is owned by the newly created clone.
2321 // A non-NULL pointer in UText.p is the signal to the close() function to delete
2322 // it.
2323 //
2324 if (deep && U_SUCCESS(*status)) {
2325 U_ASSERT(utext_nativeLength(dest) < INT32_MAX);
2326 int32_t len = (int32_t)utext_nativeLength(dest);
2327
2328 // The cloned string IS going to be NUL terminated, whether or not the original was.
2329 const UChar *srcStr = (const UChar *)src->context;
2330 UChar *copyStr = (UChar *)uprv_malloc((len+1) * sizeof(UChar));
2331 if (copyStr == NULL) {
2332 *status = U_MEMORY_ALLOCATION_ERROR;
2333 } else {
2334 int64_t i;
2335 for (i=0; i<len; i++) {
2336 copyStr[i] = srcStr[i];
2337 }
2338 copyStr[len] = 0;
2339 dest->context = copyStr;
2340 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
2341 }
2342 }
2343 return dest;
2344 }
2345
2346
2347 static void U_CALLCONV
2348 ucstrTextClose(UText *ut) {
2349 // Most of the work of close is done by the generic UText framework close.
2350 // All that needs to be done here is delete the string if the UText
2351 // owns it. This occurs if the UText was created by cloning.
2352 if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
2353 UChar *s = (UChar *)ut->context;
2354 uprv_free(s);
2355 ut->context = NULL;
2356 }
2357 }
2358
2359
2360
2361 static int64_t U_CALLCONV
2362 ucstrTextLength(UText *ut) {
2363 if (ut->a < 0) {
2364 // null terminated, we don't yet know the length. Scan for it.
2365 // Access is not convenient for doing this
2366 // because the current interation postion can't be changed.
2367 const UChar *str = (const UChar *)ut->context;
2368 for (;;) {
2369 if (str[ut->chunkNativeLimit] == 0) {
2370 break;
2371 }
2372 ut->chunkNativeLimit++;
2373 }
2374 ut->a = ut->chunkNativeLimit;
2375 ut->chunkLength = (int32_t)ut->chunkNativeLimit;
2376 ut->nativeIndexingLimit = ut->chunkLength;
2377 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
2378 }
2379 return ut->a;
2380 }
2381
2382
2383 static UBool U_CALLCONV
2384 ucstrTextAccess(UText *ut, int64_t index, UBool forward) {
2385 const UChar *str = (const UChar *)ut->context;
2386
2387 // pin the requested index to the bounds of the string,
2388 // and set current iteration position.
2389 if (index<0) {
2390 index = 0;
2391 } else if (index < ut->chunkNativeLimit) {
2392 // The request data is within the chunk as it is known so far.
2393 // Put index on a code point boundary.
2394 U16_SET_CP_START(str, 0, index);
2395 } else if (ut->a >= 0) {
2396 // We know the length of this string, and the user is requesting something
2397 // at or beyond the length. Pin the requested index to the length.
2398 index = ut->a;
2399 } else {
2400 // Null terminated string, length not yet known, and the requested index
2401 // is beyond where we have scanned so far.
2402 // Scan to 32 UChars beyond the requested index. The strategy here is
2403 // to avoid fully scanning a long string when the caller only wants to
2404 // see a few characters at its beginning.
2405 int32_t scanLimit = (int32_t)index + 32;
2406 if ((index + 32)>INT32_MAX || (index + 32)<0 ) { // note: int64 expression
2407 scanLimit = INT32_MAX;
2408 }
2409
2410 int32_t chunkLimit = (int32_t)ut->chunkNativeLimit;
2411 for (; chunkLimit<scanLimit; chunkLimit++) {
2412 if (str[chunkLimit] == 0) {
2413 // We found the end of the string. Remember it, pin the requested index to it,
2414 // and bail out of here.
2415 ut->a = chunkLimit;
2416 ut->chunkLength = chunkLimit;
2417 ut->nativeIndexingLimit = chunkLimit;
2418 if (index >= chunkLimit) {
2419 index = chunkLimit;
2420 } else {
2421 U16_SET_CP_START(str, 0, index);
2422 }
2423
2424 ut->chunkNativeLimit = chunkLimit;
2425 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
2426 goto breakout;
2427 }
2428 }
2429 // We scanned through the next batch of UChars without finding the end.
2430 U16_SET_CP_START(str, 0, index);
2431 if (chunkLimit == INT32_MAX) {
2432 // Scanned to the limit of a 32 bit length.
2433 // Forceably trim the overlength string back so length fits in int32
2434 // TODO: add support for 64 bit strings.
2435 ut->a = chunkLimit;
2436 ut->chunkLength = chunkLimit;
2437 ut->nativeIndexingLimit = chunkLimit;
2438 if (index > chunkLimit) {
2439 index = chunkLimit;
2440 }
2441 ut->chunkNativeLimit = chunkLimit;
2442 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
2443 } else {
2444 // The endpoint of a chunk must not be left in the middle of a surrogate pair.
2445 // If the current end is on a lead surrogate, back the end up by one.
2446 // It doesn't matter if the end char happens to be an unpaired surrogate,
2447 // and it's simpler not to worry about it.
2448 if (U16_IS_LEAD(str[chunkLimit-1])) {
2449 --chunkLimit;
2450 }
2451 ut->chunkNativeLimit = chunkLimit;
2452 }
2453
2454 }
2455 breakout:
2456 U_ASSERT(index<=INT32_MAX);
2457 ut->chunkOffset = (int32_t)index;
2458
2459 // Check whether request is at the start or end
2460 UBool retVal = (forward && index<ut->chunkNativeLimit) || (!forward && index>0);
2461 return retVal;
2462 }
2463
2464
2465
2466 static int32_t U_CALLCONV
2467 ucstrTextExtract(UText *ut,
2468 int64_t start, int64_t limit,
2469 UChar *dest, int32_t destCapacity,
2470 UErrorCode *pErrorCode)
2471 {
2472 if(U_FAILURE(*pErrorCode)) {
2473 return 0;
2474 }
2475 if(destCapacity<0 || (dest==NULL && destCapacity>0) || start>limit) {
2476 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
2477 return 0;
2478 }
2479
2480 const UChar *s=(const UChar *)ut->context;
2481 int32_t si, di;
2482
2483 int32_t start32;
2484 int32_t limit32;
2485
2486 // Access the start. Does two things we need:
2487 // Pins 'start' to the length of the string, if it came in out-of-bounds.
2488 // Snaps 'start' to the beginning of a code point.
2489 ucstrTextAccess(ut, start, TRUE);
2490 U_ASSERT(start <= INT32_MAX);
2491 start32 = (int32_t)start;
2492
2493 int32_t strLength=(int32_t)ut->a;
2494 if (strLength >= 0) {
2495 limit32 = pinIndex(limit, strLength);
2496 } else {
2497 limit32 = pinIndex(limit, INT32_MAX);
2498 }
2499
2500 di = 0;
2501 for (si=start32; si<limit32; si++) {
2502 if (strLength<0 && s[si]==0) {
2503 // Just hit the end of a null-terminated string.
2504 ut->a = si; // set string length for this UText
2505 ut->chunkNativeLimit = si;
2506 ut->chunkLength = si;
2507 ut->nativeIndexingLimit = si;
2508 strLength = si;
2509 break;
2510 }
2511 if (di<destCapacity) {
2512 // only store if there is space.
2513 dest[di] = s[si];
2514 } else {
2515 if (strLength>=0) {
2516 // We have filled the destination buffer, and the string length is known.
2517 // Cut the loop short. There is no need to scan string termination.
2518 di = strLength;
2519 si = limit32;
2520 break;
2521 }
2522 }
2523 di++;
2524 }
2525
2526 // If the limit index points to a lead surrogate of a pair,
2527 // add the corresponding trail surrogate to the destination.
2528 if (si>0 && U16_IS_LEAD(s[si-1]) &&
2529 ((si<strLength || strLength<0) && U16_IS_TRAIL(s[si])))
2530 {
2531 if (di<destCapacity) {
2532 // store only if there is space in the output buffer.
2533 dest[di++] = s[si++];
2534 }
2535 }
2536
2537 // Put iteration position at the point just following the extracted text
2538 ut->chunkOffset = si;
2539
2540 // Add a terminating NUL if space in the buffer permits,
2541 // and set the error status as required.
2542 u_terminateUChars(dest, destCapacity, di, pErrorCode);
2543 return di;
2544 }
2545
2546 static struct UTextFuncs ucstrFuncs =
2547 {
2548 sizeof(UTextFuncs),
2549 0, 0, 0, // Reserved alignment padding
2550 ucstrTextClone,
2551 ucstrTextLength,
2552 ucstrTextAccess,
2553 ucstrTextExtract,
2554 NULL, // Replace
2555 NULL, // Copy
2556 NULL, // MapOffsetToNative,
2557 NULL, // MapIndexToUTF16,
2558 ucstrTextClose,
2559 NULL, // spare 1
2560 NULL, // spare 2
2561 NULL, // spare 3
2562 };
2563
2564 U_CDECL_END
2565
2566
2567 U_CAPI UText * U_EXPORT2
2568 utext_openUChars(UText *ut, const UChar *s, int64_t length, UErrorCode *status) {
2569 if (U_FAILURE(*status)) {
2570 return NULL;
2571 }
2572 if (length < -1 || length>INT32_MAX) {
2573 *status = U_ILLEGAL_ARGUMENT_ERROR;
2574 return NULL;
2575 }
2576 ut = utext_setup(ut, 0, status);
2577 if (U_SUCCESS(*status)) {
2578 ut->pFuncs = &ucstrFuncs;
2579 ut->context = s;
2580 ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS);
2581 if (length==-1) {
2582 ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
2583 }
2584 ut->a = length;
2585 ut->chunkContents = s;
2586 ut->chunkNativeStart = 0;
2587 ut->chunkNativeLimit = length>=0? length : 0;
2588 ut->chunkLength = (int32_t)ut->chunkNativeLimit;
2589 ut->chunkOffset = 0;
2590 ut->nativeIndexingLimit = ut->chunkLength;
2591 }
2592 return ut;
2593 }
2594
2595
2596 //------------------------------------------------------------------------------
2597 //
2598 // UText implementation for text from ICU CharacterIterators
2599 //
2600 // Use of UText data members:
2601 // context pointer to the CharacterIterator
2602 // a length of the full text.
2603 // p pointer to buffer 1
2604 // b start index of local buffer 1 contents
2605 // q pointer to buffer 2
2606 // c start index of local buffer 2 contents
2607 // r pointer to the character iterator if the UText owns it.
2608 // Null otherwise.
2609 //
2610 //------------------------------------------------------------------------------
2611 #define CIBufSize 16
2612
2613 U_CDECL_BEGIN
2614 static void U_CALLCONV
2615 charIterTextClose(UText *ut) {
2616 // Most of the work of close is done by the generic UText framework close.
2617 // All that needs to be done here is delete the CharacterIterator if the UText
2618 // owns it. This occurs if the UText was created by cloning.
2619 CharacterIterator *ci = (CharacterIterator *)ut->r;
2620 delete ci;
2621 ut->r = NULL;
2622 }
2623
2624 static int64_t U_CALLCONV
2625 charIterTextLength(UText *ut) {
2626 return (int32_t)ut->a;
2627 }
2628
2629 static UBool U_CALLCONV
2630 charIterTextAccess(UText *ut, int64_t index, UBool forward) {
2631 CharacterIterator *ci = (CharacterIterator *)ut->context;
2632
2633 int32_t clippedIndex = (int32_t)index;
2634 if (clippedIndex<0) {
2635 clippedIndex=0;
2636 } else if (clippedIndex>=ut->a) {
2637 clippedIndex=(int32_t)ut->a;
2638 }
2639 int32_t neededIndex = clippedIndex;
2640 if (!forward && neededIndex>0) {
2641 // reverse iteration, want the position just before what was asked for.
2642 neededIndex--;
2643 } else if (forward && neededIndex==ut->a && neededIndex>0) {
2644 // Forward iteration, don't ask for something past the end of the text.
2645 neededIndex--;
2646 }
2647
2648 // Find the native index of the start of the buffer containing what we want.
2649 neededIndex -= neededIndex % CIBufSize;
2650
2651 UChar *buf = NULL;
2652 UBool needChunkSetup = TRUE;
2653 int i;
2654 if (ut->chunkNativeStart == neededIndex) {
2655 // The buffer we want is already the current chunk.
2656 needChunkSetup = FALSE;
2657 } else if (ut->b == neededIndex) {
2658 // The first buffer (buffer p) has what we need.
2659 buf = (UChar *)ut->p;
2660 } else if (ut->c == neededIndex) {
2661 // The second buffer (buffer q) has what we need.
2662 buf = (UChar *)ut->q;
2663 } else {
2664 // Neither buffer already has what we need.
2665 // Load new data from the character iterator.
2666 // Use the buf that is not the current buffer.
2667 buf = (UChar *)ut->p;
2668 if (ut->p == ut->chunkContents) {
2669 buf = (UChar *)ut->q;
2670 }
2671 ci->setIndex(neededIndex);
2672 for (i=0; i<CIBufSize; i++) {
2673 buf[i] = ci->nextPostInc();
2674 if (i+neededIndex > ut->a) {
2675 break;
2676 }
2677 }
2678 }
2679
2680 // We have a buffer with the data we need.
2681 // Set it up as the current chunk, if it wasn't already.
2682 if (needChunkSetup) {
2683 ut->chunkContents = buf;
2684 ut->chunkLength = CIBufSize;
2685 ut->chunkNativeStart = neededIndex;
2686 ut->chunkNativeLimit = neededIndex + CIBufSize;
2687 if (ut->chunkNativeLimit > ut->a) {
2688 ut->chunkNativeLimit = ut->a;
2689 ut->chunkLength = (int32_t)(ut->chunkNativeLimit)-(int32_t)(ut->chunkNativeStart);
2690 }
2691 ut->nativeIndexingLimit = ut->chunkLength;
2692 U_ASSERT(ut->chunkOffset>=0 && ut->chunkOffset<=CIBufSize);
2693 }
2694 ut->chunkOffset = clippedIndex - (int32_t)ut->chunkNativeStart;
2695 UBool success = (forward? ut->chunkOffset<ut->chunkLength : ut->chunkOffset>0);
2696 return success;
2697 }
2698
2699 static UText * U_CALLCONV
2700 charIterTextClone(UText *dest, const UText *src, UBool deep, UErrorCode * status) {
2701 if (U_FAILURE(*status)) {
2702 return NULL;
2703 }
2704
2705 if (deep) {
2706 // There is no CharacterIterator API for cloning the underlying text storage.
2707 *status = U_UNSUPPORTED_ERROR;
2708 return NULL;
2709 } else {
2710 CharacterIterator *srcCI =(CharacterIterator *)src->context;
2711 srcCI = srcCI->clone();
2712 dest = utext_openCharacterIterator(dest, srcCI, status);
2713 // cast off const on getNativeIndex.
2714 // For CharacterIterator based UTexts, this is safe, the operation is const.
2715 int64_t ix = utext_getNativeIndex((UText *)src);
2716 utext_setNativeIndex(dest, ix);
2717 dest->r = srcCI; // flags that this UText owns the CharacterIterator
2718 }
2719 return dest;
2720 }
2721
2722 static int32_t U_CALLCONV
2723 charIterTextExtract(UText *ut,
2724 int64_t start, int64_t limit,
2725 UChar *dest, int32_t destCapacity,
2726 UErrorCode *status)
2727 {
2728 if(U_FAILURE(*status)) {
2729 return 0;
2730 }
2731 if(destCapacity<0 || (dest==NULL && destCapacity>0) || start>limit) {
2732 *status=U_ILLEGAL_ARGUMENT_ERROR;
2733 return 0;
2734 }
2735 int32_t length = (int32_t)ut->a;
2736 int32_t start32 = pinIndex(start, length);
2737 int32_t limit32 = pinIndex(limit, length);
2738 int32_t desti = 0;
2739 int32_t srci;
2740
2741 CharacterIterator *ci = (CharacterIterator *)ut->context;
2742 ci->setIndex32(start32); // Moves ix to lead of surrogate pair, if needed.
2743 srci = ci->getIndex();
2744 while (srci<limit32) {
2745 UChar32 c = ci->next32PostInc();
2746 int32_t len = U16_LENGTH(c);
2747 if (desti+len <= destCapacity) {
2748 U16_APPEND_UNSAFE(dest, desti, c);
2749 } else {
2750 desti += len;
2751 *status = U_BUFFER_OVERFLOW_ERROR;
2752 }
2753 srci += len;
2754 }
2755
2756 u_terminateUChars(dest, destCapacity, desti, status);
2757 return desti;
2758 }
2759
2760 static struct UTextFuncs charIterFuncs =
2761 {
2762 sizeof(UTextFuncs),
2763 0, 0, 0, // Reserved alignment padding
2764 charIterTextClone,
2765 charIterTextLength,
2766 charIterTextAccess,
2767 charIterTextExtract,
2768 NULL, // Replace
2769 NULL, // Copy
2770 NULL, // MapOffsetToNative,
2771 NULL, // MapIndexToUTF16,
2772 charIterTextClose,
2773 NULL, // spare 1
2774 NULL, // spare 2
2775 NULL // spare 3
2776 };
2777 U_CDECL_END
2778
2779
2780 U_CAPI UText * U_EXPORT2
2781 utext_openCharacterIterator(UText *ut, CharacterIterator *ci, UErrorCode *status) {
2782 if (U_FAILURE(*status)) {
2783 return NULL;
2784 }
2785
2786 if (ci->startIndex() > 0) {
2787 // No support for CharacterIterators that do not start indexing from zero.
2788 *status = U_UNSUPPORTED_ERROR;
2789 return NULL;
2790 }
2791
2792 // Extra space in UText for 2 buffers of CIBufSize UChars each.
2793 int32_t extraSpace = 2 * CIBufSize * sizeof(UChar);
2794 ut = utext_setup(ut, extraSpace, status);
2795 if (U_SUCCESS(*status)) {
2796 ut->pFuncs = &charIterFuncs;
2797 ut->context = ci;
2798 ut->providerProperties = 0;
2799 ut->a = ci->endIndex(); // Length of text
2800 ut->p = ut->pExtra; // First buffer
2801 ut->b = -1; // Native index of first buffer contents
2802 ut->q = (UChar*)ut->pExtra+CIBufSize; // Second buffer
2803 ut->c = -1; // Native index of second buffer contents
2804
2805 // Initialize current chunk contents to be empty.
2806 // First access will fault something in.
2807 // Note: The initial nativeStart and chunkOffset must sum to zero
2808 // so that getNativeIndex() will correctly compute to zero
2809 // if no call to Access() has ever been made. They can't be both
2810 // zero without Access() thinking that the chunk is valid.
2811 ut->chunkContents = (UChar *)ut->p;
2812 ut->chunkNativeStart = -1;
2813 ut->chunkOffset = 1;
2814 ut->chunkNativeLimit = 0;
2815 ut->chunkLength = 0;
2816 ut->nativeIndexingLimit = ut->chunkOffset; // enables native indexing
2817 }
2818 return ut;
2819 }
2820
2821
2822