2 * The CMap data structure here is constructed on the fly by
3 * adding simple range-to-range mappings. Then the data structure
4 * is optimized to contain both range-to-range and range-to-table
7 * Any one-to-many mappings are inserted as one-to-table
8 * lookups in the beginning, and are not affected by the optimization
11 * There is a special function to add a 256-length range-to-table mapping.
12 * The ranges do not have to be added in order.
14 * This code could be a lot simpler if we didn't care about wasting memory,
15 * or could trust the parser to give us optimal mappings.
21 typedef struct pdf_range_s pdf_range
;
23 enum { MAXCODESPACE
= 10 };
24 enum { SINGLE
, RANGE
, TABLE
, MULTI
};
30 int flag
; /* what kind of lookup is this */
31 int offset
; /* either range-delta or table-index */
35 cmprange(const void *va
, const void *vb
)
37 return ((const pdf_range
*)va
)->low
- ((const pdf_range
*)vb
)->low
;
55 } cspace
[MAXCODESPACE
];
65 * Allocate, destroy and simple parameters.
69 pdf_newcmap(pdf_cmap
**cmapp
)
73 cmap
= *cmapp
= fz_malloc(sizeof(pdf_cmap
));
78 strcpy(cmap
->cmapname
, "");
80 strcpy(cmap
->usecmapname
, "");
99 pdf_keepcmap(pdf_cmap
*cmap
)
106 pdf_dropcmap(pdf_cmap
*cmap
)
108 if (--cmap
->refs
== 0)
111 pdf_dropcmap(cmap
->usecmap
);
112 fz_free(cmap
->ranges
);
113 fz_free(cmap
->table
);
119 pdf_getusecmap(pdf_cmap
*cmap
)
121 return cmap
->usecmap
;
125 pdf_setusecmap(pdf_cmap
*cmap
, pdf_cmap
*usecmap
)
130 pdf_dropcmap(cmap
->usecmap
);
131 cmap
->usecmap
= pdf_keepcmap(usecmap
);
133 if (cmap
->ncspace
== 0)
135 cmap
->ncspace
= usecmap
->ncspace
;
136 for (i
= 0; i
< usecmap
->ncspace
; i
++)
137 cmap
->cspace
[i
] = usecmap
->cspace
[i
];
142 pdf_getwmode(pdf_cmap
*cmap
)
148 pdf_setwmode(pdf_cmap
*cmap
, int wmode
)
154 pdf_debugcmap(pdf_cmap
*cmap
)
158 printf("cmap $%p /%s {\n", cmap
, cmap
->cmapname
);
160 if (cmap
->usecmapname
[0])
161 printf(" usecmap /%s\n", cmap
->usecmapname
);
163 printf(" usecmap $%p\n", cmap
->usecmap
);
165 printf(" wmode %d\n", cmap
->wmode
);
167 printf(" codespaces {\n");
168 for (i
= 0; i
< cmap
->ncspace
; i
++)
171 for (k
= 0; k
< cmap
->cspace
[i
].n
; k
++)
172 printf("%02x", cmap
->cspace
[i
].lo
[k
]);
174 for (k
= 0; k
< cmap
->cspace
[i
].n
; k
++)
175 printf("%02x", cmap
->cspace
[i
].hi
[k
]);
180 printf(" ranges (%d,%d) {\n", cmap
->rlen
, cmap
->tlen
);
181 for (i
= 0; i
< cmap
->rlen
; i
++)
183 pdf_range
*r
= &cmap
->ranges
[i
];
184 printf(" <%04x> <%04x> ", r
->low
, r
->high
);
185 if (r
->flag
== TABLE
)
188 for (k
= 0; k
< r
->high
- r
->low
+ 1; k
++)
189 printf("%d ", cmap
->table
[r
->offset
+ k
]);
192 else if (r
->flag
== MULTI
)
195 n
= cmap
->table
[r
->offset
];
196 for (k
= 0; k
< n
; k
++)
197 printf("%04x ", cmap
->table
[r
->offset
+ 1 + k
]);
201 printf("%d\n", r
->offset
);
207 * Add a codespacerange section.
208 * These ranges are used by pdf_decodecmap to decode
209 * multi-byte encoded strings.
212 pdf_addcodespace(pdf_cmap
*cmap
, unsigned lo
, unsigned hi
, int n
)
216 if (cmap
->ncspace
+ 1 == MAXCODESPACE
)
217 return fz_throw("rangelimit: too many code space ranges");
219 cmap
->cspace
[cmap
->ncspace
].n
= n
;
221 for (i
= 0; i
< n
; i
++)
223 int o
= (n
- i
- 1) * 8;
224 cmap
->cspace
[cmap
->ncspace
].lo
[i
] = (lo
>> o
) & 0xFF;
225 cmap
->cspace
[cmap
->ncspace
].hi
[i
] = (hi
>> o
) & 0xFF;
234 * Add an integer to the table.
237 addtable(pdf_cmap
*cmap
, int value
)
239 if (cmap
->tlen
+ 1 > cmap
->tcap
)
241 int newcap
= cmap
->tcap
== 0 ? 256 : cmap
->tcap
* 2;
242 int *newtable
= fz_realloc(cmap
->table
, newcap
* sizeof(int));
246 cmap
->table
= newtable
;
249 cmap
->table
[cmap
->tlen
++] = value
;
258 addrange(pdf_cmap
*cmap
, int low
, int high
, int flag
, int offset
)
260 if (cmap
->rlen
+ 1 > cmap
->rcap
)
262 pdf_range
*newranges
;
263 int newcap
= cmap
->rcap
== 0 ? 256 : cmap
->rcap
* 2;
264 newranges
= fz_realloc(cmap
->ranges
, newcap
* sizeof(pdf_range
));
268 cmap
->ranges
= newranges
;
271 cmap
->ranges
[cmap
->rlen
].low
= low
;
272 cmap
->ranges
[cmap
->rlen
].high
= high
;
273 cmap
->ranges
[cmap
->rlen
].flag
= flag
;
274 cmap
->ranges
[cmap
->rlen
].offset
= offset
;
281 * Add a range-to-table mapping.
284 pdf_maprangetotable(pdf_cmap
*cmap
, int low
, int *table
, int len
)
294 for (i
= 0; i
< len
; i
++)
296 error
= addtable(cmap
, table
[i
]);
301 return addrange(cmap
, low
, high
, TABLE
, offset
);
305 * Add a range of contiguous one-to-one mappings (i.e. 1..5 maps to 21..25)
308 pdf_maprangetorange(pdf_cmap
*cmap
, int low
, int high
, int offset
)
310 return addrange(cmap
, low
, high
, high
- low
== 0 ? SINGLE
: RANGE
, offset
);
314 * Add a single one-to-many mapping.
317 pdf_maponetomany(pdf_cmap
*cmap
, int low
, int *values
, int len
)
324 return addrange(cmap
, low
, low
, SINGLE
, values
[0]);
328 error
= addtable(cmap
, len
);
332 for (i
= 0; i
< len
; i
++)
334 addtable(cmap
, values
[i
]);
339 return addrange(cmap
, low
, low
, MULTI
, offset
);
343 * Sort the input ranges.
344 * Merge contiguous input ranges to range-to-range if the output is contiguous.
345 * Merge contiguous input ranges to range-to-table if the output is random.
348 pdf_sortcmap(pdf_cmap
*cmap
)
351 pdf_range
*newranges
;
353 pdf_range
*a
; /* last written range on output */
354 pdf_range
*b
; /* current range examined on input */
356 qsort(cmap
->ranges
, cmap
->rlen
, sizeof(pdf_range
), cmprange
);
359 b
= cmap
->ranges
+ 1;
361 while (b
< cmap
->ranges
+ cmap
->rlen
)
363 /* ignore one-to-many mappings */
364 if (b
->flag
== MULTI
)
369 /* input contiguous */
370 else if (a
->high
+ 1 == b
->low
)
372 /* output contiguous */
373 if (a
->high
- a
->low
+ a
->offset
+ 1 == b
->offset
)
375 /* SR -> R and SS -> R and RR -> R and RS -> R */
376 if (a
->flag
== SINGLE
|| a
->flag
== RANGE
)
383 else if (a
->flag
== TABLE
&& b
->flag
== SINGLE
)
386 error
= addtable(cmap
, b
->offset
);
392 else if (a
->flag
== TABLE
&& b
->flag
== RANGE
)
404 /* output separated */
408 if (a
->flag
== SINGLE
&& b
->flag
== SINGLE
)
413 error
= addtable(cmap
, a
->offset
);
417 error
= addtable(cmap
, b
->offset
);
421 a
->offset
= cmap
->tlen
- 2;
425 else if (a
->flag
== TABLE
&& b
->flag
== SINGLE
)
428 error
= addtable(cmap
, b
->offset
);
441 /* input separated: XX -> XX */
450 cmap
->rlen
= a
- cmap
->ranges
+ 1;
452 assert(cmap
->rlen
> 0);
454 newranges
= fz_realloc(cmap
->ranges
, cmap
->rlen
* sizeof(pdf_range
));
457 cmap
->rcap
= cmap
->rlen
;
458 cmap
->ranges
= newranges
;
462 newtable
= fz_realloc(cmap
->table
, cmap
->tlen
* sizeof(int));
465 cmap
->tcap
= cmap
->tlen
;
466 cmap
->table
= newtable
;
473 * Lookup the mapping of a codepoint.
476 pdf_lookupcmap(pdf_cmap
*cmap
, int cpt
)
479 int r
= cmap
->rlen
- 1;
485 if (cpt
< cmap
->ranges
[m
].low
)
487 else if (cpt
> cmap
->ranges
[m
].high
)
491 int i
= cpt
- cmap
->ranges
[m
].low
+ cmap
->ranges
[m
].offset
;
492 if (cmap
->ranges
[m
].flag
== TABLE
)
493 return cmap
->table
[i
];
494 if (cmap
->ranges
[m
].flag
== MULTI
)
501 return pdf_lookupcmap(cmap
->usecmap
, cpt
);
507 * Use the codespace ranges to extract a codepoint from a
508 * multi-byte encoded string.
511 pdf_decodecmap(pdf_cmap
*cmap
, unsigned char *buf
, int *cpt
)
515 for (k
= 0; k
< cmap
->ncspace
; k
++)
517 unsigned char *lo
= cmap
->cspace
[k
].lo
;
518 unsigned char *hi
= cmap
->cspace
[k
].hi
;
519 int n
= cmap
->cspace
[k
].n
;
522 for (i
= 0; i
< n
; i
++)
524 if (lo
[i
] <= buf
[i
] && buf
[i
] <= hi
[i
])
525 c
= (c
<< 8) | buf
[i
];
546 TUSECMAP
= PDF_NTOKENS
,
547 TBEGINCODESPACERANGE
,
559 static int tokenfromkeyword(char *key
)
561 if (!strcmp(key
, "usecmap")) return TUSECMAP
;
562 if (!strcmp(key
, "begincodespacerange")) return TBEGINCODESPACERANGE
;
563 if (!strcmp(key
, "endcodespacerange")) return TENDCODESPACERANGE
;
564 if (!strcmp(key
, "beginbfchar")) return TBEGINBFCHAR
;
565 if (!strcmp(key
, "endbfchar")) return TENDBFCHAR
;
566 if (!strcmp(key
, "beginbfrange")) return TBEGINBFRANGE
;
567 if (!strcmp(key
, "endbfrange")) return TENDBFRANGE
;
568 if (!strcmp(key
, "begincidchar")) return TBEGINCIDCHAR
;
569 if (!strcmp(key
, "endcidchar")) return TENDCIDCHAR
;
570 if (!strcmp(key
, "begincidrange")) return TBEGINCIDRANGE
;
571 if (!strcmp(key
, "endcidrange")) return TENDCIDRANGE
;
575 static int codefromstring(unsigned char *buf
, int len
)
579 a
= (a
<< 8) | *buf
++;
583 static int mylex(fz_stream
*file
, char *buf
, int n
, int *sl
)
585 int token
= pdf_lex(file
, buf
, n
, sl
);
586 if (token
== PDF_TKEYWORD
)
587 token
= tokenfromkeyword(buf
);
591 static fz_error
*parsecmapname(pdf_cmap
*cmap
, fz_stream
*file
)
597 token
= mylex(file
, buf
, sizeof buf
, &len
);
598 if (token
== PDF_TNAME
) {
599 strlcpy(cmap
->cmapname
, buf
, sizeof(cmap
->cmapname
));
603 return fz_throw("syntaxerror in CMap after /CMapName");
606 static fz_error
*parsewmode(pdf_cmap
*cmap
, fz_stream
*file
)
612 token
= mylex(file
, buf
, sizeof buf
, &len
);
613 if (token
== PDF_TINT
) {
614 pdf_setwmode(cmap
, atoi(buf
));
618 return fz_throw("syntaxerror in CMap after /WMode");
621 static fz_error
*parsecodespacerange(pdf_cmap
*cmap
, fz_stream
*file
)
631 token
= mylex(file
, buf
, sizeof buf
, &len
);
633 if (token
== TENDCODESPACERANGE
)
636 else if (token
== PDF_TSTRING
)
638 lo
= codefromstring(buf
, len
);
639 token
= mylex(file
, buf
, sizeof buf
, &len
);
640 if (token
== PDF_TSTRING
)
642 hi
= codefromstring(buf
, len
);
643 error
= pdf_addcodespace(cmap
, lo
, hi
, len
);
653 return fz_throw("syntaxerror in CMap codespacerange section");
656 static fz_error
*parsecidrange(pdf_cmap
*cmap
, fz_stream
*file
)
666 token
= mylex(file
, buf
, sizeof buf
, &len
);
668 if (token
== TENDCIDRANGE
)
671 else if (token
!= PDF_TSTRING
)
674 lo
= codefromstring(buf
, len
);
676 token
= mylex(file
, buf
, sizeof buf
, &len
);
677 if (token
!= PDF_TSTRING
)
680 hi
= codefromstring(buf
, len
);
682 token
= mylex(file
, buf
, sizeof buf
, &len
);
683 if (token
!= PDF_TINT
)
688 error
= pdf_maprangetorange(cmap
, lo
, hi
, dst
);
694 return fz_throw("syntaxerror in CMap cidrange section");
697 static fz_error
*parsecidchar(pdf_cmap
*cmap
, fz_stream
*file
)
707 token
= mylex(file
, buf
, sizeof buf
, &len
);
709 if (token
== TENDCIDCHAR
)
712 else if (token
!= PDF_TSTRING
)
715 src
= codefromstring(buf
, len
);
717 token
= mylex(file
, buf
, sizeof buf
, &len
);
718 if (token
!= PDF_TINT
)
723 error
= pdf_maprangetorange(cmap
, src
, src
, dst
);
729 return fz_throw("syntaxerror in CMap cidchar section");
732 static fz_error
*parsebfrangearray(pdf_cmap
*cmap
, fz_stream
*file
, int lo
, int hi
)
743 token
= mylex(file
, buf
, sizeof buf
, &len
);
744 /* Note: does not handle [ /Name /Name ... ] */
746 if (token
== PDF_TCARRAY
)
749 else if (token
!= PDF_TSTRING
)
750 return fz_throw("syntaxerror in CMap bfrange array section");
754 for (i
= 0; i
< len
/ 2; i
++)
755 dst
[i
] = codefromstring(buf
+ i
* 2, 2);
757 error
= pdf_maponetomany(cmap
, lo
, dst
, len
/ 2);
766 static fz_error
*parsebfrange(pdf_cmap
*cmap
, fz_stream
*file
)
776 token
= mylex(file
, buf
, sizeof buf
, &len
);
778 if (token
== TENDBFRANGE
)
781 else if (token
!= PDF_TSTRING
)
784 lo
= codefromstring(buf
, len
);
786 token
= mylex(file
, buf
, sizeof buf
, &len
);
787 if (token
!= PDF_TSTRING
)
790 hi
= codefromstring(buf
, len
);
792 token
= mylex(file
, buf
, sizeof buf
, &len
);
794 if (token
== PDF_TSTRING
)
798 dst
= codefromstring(buf
, len
);
799 error
= pdf_maprangetorange(cmap
, lo
, hi
, dst
);
810 for (i
= 0; i
< len
/ 2; i
++)
811 dststr
[i
] = codefromstring(buf
+ i
* 2, 2);
816 error
= pdf_maponetomany(cmap
, lo
, dststr
, i
);
825 else if (token
== PDF_TOARRAY
)
827 error
= parsebfrangearray(cmap
, file
, lo
, hi
);
839 return fz_throw("syntaxerror in CMap bfrange section");
842 static fz_error
*parsebfchar(pdf_cmap
*cmap
, fz_stream
*file
)
854 token
= mylex(file
, buf
, sizeof buf
, &len
);
856 if (token
== TENDBFCHAR
)
859 else if (token
!= PDF_TSTRING
)
862 src
= codefromstring(buf
, len
);
864 token
= mylex(file
, buf
, sizeof buf
, &len
);
865 /* Note: does not handle /dstName */
866 if (token
!= PDF_TSTRING
)
871 for (i
= 0; i
< len
/ 2; i
++)
872 dst
[i
] = codefromstring(buf
+ i
* 2, 2);
874 error
= pdf_maponetomany(cmap
, src
, dst
, i
);
881 return fz_throw("syntaxerror in CMap bfchar section");
885 pdf_parsecmap(pdf_cmap
**cmapp
, fz_stream
*file
)
894 error
= pdf_newcmap(&cmap
);
898 strcpy(key
, ".notdef");
902 token
= mylex(file
, buf
, sizeof buf
, &len
);
904 if (token
== PDF_TEOF
)
907 else if (token
== PDF_TERROR
)
909 error
= fz_throw("syntaxerror in CMap");
913 else if (token
== PDF_TNAME
)
915 if (!strcmp(buf
, "CMapName"))
917 error
= parsecmapname(cmap
, file
);
921 else if (!strcmp(buf
, "WMode"))
923 error
= parsewmode(cmap
, file
);
928 strlcpy(key
, buf
, sizeof key
);
931 else if (token
== TUSECMAP
)
933 strlcpy(cmap
->usecmapname
, key
, sizeof(cmap
->usecmapname
));
936 else if (token
== TBEGINCODESPACERANGE
)
938 error
= parsecodespacerange(cmap
, file
);
943 else if (token
== TBEGINBFCHAR
)
945 error
= parsebfchar(cmap
, file
);
950 else if (token
== TBEGINCIDCHAR
)
952 error
= parsecidchar(cmap
, file
);
957 else if (token
== TBEGINBFRANGE
)
959 error
= parsebfrange(cmap
, file
);
964 else if (token
== TBEGINCIDRANGE
)
966 error
= parsecidrange(cmap
, file
);
971 /* ignore everything else */
974 error
= pdf_sortcmap(cmap
);
987 * Load CMap stream in PDF file
990 pdf_loadembeddedcmap(pdf_cmap
**cmapp
, pdf_xref
*xref
, fz_obj
*stmref
)
992 fz_obj
*stmobj
= stmref
;
993 fz_error
*error
= nil
;
995 pdf_cmap
*cmap
= nil
;
1000 if ((*cmapp
= pdf_finditem(xref
->store
, PDF_KCMAP
, stmref
)))
1002 pdf_keepcmap(*cmapp
);
1006 pdf_logfont("load embedded cmap %d %d {\n", fz_tonum(stmref
), fz_togen(stmref
));
1008 error
= pdf_resolve(&stmobj
, xref
);
1012 error
= pdf_openstream(&file
, xref
, fz_tonum(stmref
), fz_togen(stmref
));
1016 error
= pdf_parsecmap(&cmap
, file
);
1020 fz_dropstream(file
);
1022 wmode
= fz_dictgets(stmobj
, "WMode");
1023 if (fz_isint(wmode
))
1025 pdf_logfont("wmode %d\n", wmode
);
1026 pdf_setwmode(cmap
, fz_toint(wmode
));
1029 obj
= fz_dictgets(stmobj
, "UseCMap");
1032 pdf_logfont("usecmap /%s\n", fz_toname(obj
));
1033 error
= pdf_loadsystemcmap(&usecmap
, fz_toname(obj
));
1036 pdf_setusecmap(cmap
, usecmap
);
1037 pdf_dropcmap(usecmap
);
1039 else if (fz_isindirect(obj
))
1041 pdf_logfont("usecmap %d %d R\n", fz_tonum(obj
), fz_togen(obj
));
1042 error
= pdf_loadembeddedcmap(&usecmap
, xref
, obj
);
1045 pdf_setusecmap(cmap
, usecmap
);
1046 pdf_dropcmap(usecmap
);
1051 error
= pdf_storeitem(xref
->store
, PDF_KCMAP
, stmref
, cmap
);
1068 * Load predefined CMap from system
1071 pdf_loadsystemcmap(pdf_cmap
**cmapp
, char *name
)
1073 fz_error
*error
= nil
;
1084 pdf_logfont("load system cmap %s {\n", name
);
1086 cmapdir
= getenv("CMAPDIR");
1088 return fz_throw("ioerror: CMAPDIR environment not set");
1090 strlcpy(path
, cmapdir
, sizeof path
);
1091 strlcat(path
, "/", sizeof path
);
1092 strlcat(path
, name
, sizeof path
);
1094 error
= fz_openrfile(&file
, path
);
1098 error
= pdf_parsecmap(&cmap
, file
);
1102 fz_dropstream(file
);
1104 usecmapname
= cmap
->usecmapname
;
1107 pdf_logfont("usecmap %s\n", usecmapname
);
1108 error
= pdf_loadsystemcmap(&usecmap
, usecmapname
);
1111 pdf_setusecmap(cmap
, usecmap
);
1112 pdf_dropcmap(usecmap
);
1124 fz_dropstream(file
);
1129 * Create an Identity-* CMap (for both 1 and 2-byte encodings)
1132 pdf_newidentitycmap(pdf_cmap
**cmapp
, int wmode
, int bytes
)
1137 error
= pdf_newcmap(&cmap
);
1141 sprintf(cmap
->cmapname
, "Identity-%c", wmode
? 'V' : 'H');
1143 error
= pdf_addcodespace(cmap
, 0x0000, 0xffff, bytes
);
1149 error
= pdf_maprangetorange(cmap
, 0x0000, 0xffff, 0);
1155 error
= pdf_sortcmap(cmap
);
1161 pdf_setwmode(cmap
, wmode
);