rosapps/smartpdf/fitz/mupdf/pdf_open.c

   1 #include <fitz.h>
   2 #include <mupdf.h>
   3
   4 static inline int iswhite(int ch)
   5 {
   6         return  ch == '\000' || ch == '\011' || ch == '\012' ||
   7                         ch == '\014' || ch == '\015' || ch == '\040';
   8 }
   9
  10 /*
  11  * magic version tag and startxref
  12  */
  13
  14 static fz_error *
  15 loadversion(pdf_xref *xref)
  16 {
  17         char buf[20];
  18         int n;
  19
  20         n = fz_seek(xref->file, 0, 0);
  21         if (n < 0)
  22                 return fz_ioerror(xref->file);
  23
  24         fz_readline(xref->file, buf, sizeof buf);
  25         if (memcmp(buf, "%PDF-", 5) != 0)
  26                 return fz_throw("syntaxerror: corrupt version marker");
  27
  28         xref->version = atof(buf + 5);
  29
  30         pdf_logxref("version %g\n", xref->version);
  31
  32         return nil;
  33 }
  34
  35 static fz_error *
  36 readstartxref(pdf_xref *xref)
  37 {
  38         char buf[1024];
  39         int t, n;
  40         int i;
  41
  42         t = fz_seek(xref->file, 0, 2);
  43         if (t == -1)
  44                 return fz_ioerror(xref->file);
  45
  46         t = fz_seek(xref->file, MAX(0, t - ((int)sizeof buf)), 0);
  47         if (t == -1)
  48                 return fz_ioerror(xref->file);
  49
  50         n = fz_read(xref->file, buf, sizeof buf);
  51         if (n == -1)
  52                 return fz_ioerror(xref->file);
  53
  54         for (i = n - 9; i >= 0; i--)
  55         {
  56                 if (memcmp(buf + i, "startxref", 9) == 0)
  57                 {
  58                         i += 9;
  59                         while (iswhite(buf[i]) && i < n)
  60                                 i ++;
  61                         xref->startxref = atoi(buf + i);
  62                         return nil;
  63                 }
  64         }
  65
  66         return fz_throw("syntaxerror: could not find startxref");
  67 }
  68
  69 #define WHITE_SPACE_CHARS " \n\t\r"
  70
  71 static const char *str_find_char(const char *txt, char c)
  72 {
  73     while (*txt != c) {
  74         if (0 == *txt)
  75             return NULL;
  76         ++txt;
  77     }
  78     return txt;
  79 }
  80
  81 static int str_contains(const char *str, char c)
  82 {
  83     const char *pos = str_find_char(str, c);
  84     if (!pos)
  85         return 0;
  86     return 1;
  87 }
  88
  89 static void str_strip_right(char *txt, const char *to_strip)
  90 {
  91     char * new_end;
  92     char   c;
  93     if (!txt || !to_strip)
  94         return;
  95     if (0 == *txt)
  96         return;
  97     /* point at the last character in the string */
  98     new_end = txt + strlen(txt) - 1;
  99     for (;;) {
 100         c = *new_end;
 101         if (!str_contains(to_strip, c))
 102             break;
 103         if (txt == new_end)
 104             break;
 105         --new_end;
 106     }
 107     if (str_contains(to_strip, *new_end))
 108         new_end[0] = 0;
 109     else
 110         new_end[1] = 0;
 111 }
 112
 113 static void str_strip_ws_right(char *txt)
 114 {
 115     str_strip_right(txt, WHITE_SPACE_CHARS);
 116 }
 117
 118
 119 /*
 120  * trailer dictionary
 121  */
 122
 123 static fz_error *
 124 readoldtrailer(pdf_xref *xref, char *buf, int cap)
 125 {
 126         int ofs, len;
 127         char *s;
 128         int n;
 129         int t;
 130         int c;
 131
 132         pdf_logxref("load old xref format trailer\n");
 133
 134         fz_readline(xref->file, buf, cap);
 135     str_strip_ws_right(buf);
 136         if (strcmp(buf, "xref") != 0)
 137                 return fz_throw("ioerror: missing xref");
 138
 139         while (1)
 140         {
 141                 c = fz_peekbyte(xref->file);
 142                 if (!(c >= '0' && c <= '9'))
 143                         break;
 144
 145                 n = fz_readline(xref->file, buf, cap);
 146                 if (n < 0)
 147                         return fz_ioerror(xref->file);
 148
 149                 s = buf;
 150                 ofs = atoi(strsep(&s, " "));
 151                 len = atoi(strsep(&s, " "));
 152
 153                 /* broken pdfs where the section is not on a separate line */
 154                 if (s && *s != '\0')
 155                         fz_seek(xref->file, -(n + buf - s + 2), 1);
 156
 157                 t = fz_tell(xref->file);
 158                 if (t < 0)
 159                         return fz_ioerror(xref->file);
 160
 161                 n = fz_seek(xref->file, t + 20 * len, 0);
 162                 if (n < 0)
 163                         return fz_ioerror(xref->file);
 164         }
 165
 166         t = pdf_lex(xref->file, buf, cap, &n);
 167         if (t != PDF_TTRAILER)
 168                 return fz_throw("syntaxerror: expected trailer");
 169
 170         t = pdf_lex(xref->file, buf, cap, &n);
 171         if (t != PDF_TODICT)
 172                 return fz_throw("syntaxerror: expected trailer dictionary");
 173
 174         return pdf_parsedict(&xref->trailer, xref->file, buf, cap);
 175 }
 176
 177 static fz_error *
 178 readnewtrailer(pdf_xref *xref, char *buf, int cap)
 179 {
 180         pdf_logxref("load new xref format trailer\n");
 181         return pdf_parseindobj(&xref->trailer, xref->file, buf, cap, nil, nil, nil);
 182 }
 183
 184 static fz_error *
 185 readtrailer(pdf_xref *xref, char *buf, int cap)
 186 {
 187         int n;
 188         int c;
 189
 190         n = fz_seek(xref->file, xref->startxref, 0);
 191         if (n < 0)
 192                 return fz_ioerror(xref->file);
 193
 194         c = fz_peekbyte(xref->file);
 195         if (c == 'x')
 196                 return readoldtrailer(xref, buf, cap);
 197         else if (c >= '0' && c <= '9')
 198                 return readnewtrailer(xref, buf, cap);
 199
 200         return fz_throw("syntaxerror: could not find xref");
 201 }
 202
 203 /*
 204  * xref tables
 205  */
 206
 207 static fz_error *
 208 readoldxref(fz_obj **trailerp, pdf_xref *xref, char *buf, int cap)
 209 {
 210         int ofs, len;
 211         char *s;
 212         int n;
 213         int t;
 214         int i;
 215         int c;
 216
 217         pdf_logxref("load old xref format\n");
 218
 219         fz_readline(xref->file, buf, cap);
 220     str_strip_ws_right(buf);
 221         if (strcmp(buf, "xref") != 0)
 222                 return fz_throw("syntaxerror: expected xref");
 223
 224         while (1)
 225         {
 226                 c = fz_peekbyte(xref->file);
 227                 if (!(c >= '0' && c <= '9'))
 228                         break;
 229
 230                 n = fz_readline(xref->file, buf, cap);
 231                 if (n < 0)
 232                         return fz_ioerror(xref->file);
 233
 234                 s = buf;
 235                 ofs = atoi(strsep(&s, " "));
 236                 len = atoi(strsep(&s, " "));
 237
 238                 /* broken pdfs where the section is not on a separate line */
 239                 if (s && *s != '\0')
 240                 {
 241                         fz_warn("syntaxerror: broken xref section");
 242                         fz_seek(xref->file, -(n + buf - s + 2), 1);
 243                 }
 244
 245                 for (i = 0; i < len; i++)
 246                 {
 247                         n = fz_read(xref->file, buf, 20);
 248                         if (n < 0)
 249                                 return fz_ioerror(xref->file);
 250                         if (n != 20)
 251                                 return fz_throw("syntaxerror: truncated xref table");
 252                         if (!xref->table[ofs + i].type)
 253                         {
 254                                 s = buf;
 255                                 xref->table[ofs + i].ofs = atoi(s);
 256                                 xref->table[ofs + i].gen = atoi(s + 11);
 257                                 xref->table[ofs + i].type = s[17];
 258                         }
 259                 }
 260         }
 261
 262         t = pdf_lex(xref->file, buf, cap, &n);
 263         if (t != PDF_TTRAILER)
 264                 return fz_throw("syntaxerror: expected trailer");
 265         t = pdf_lex(xref->file, buf, cap, &n);
 266         if (t != PDF_TODICT)
 267                 return fz_throw("syntaxerror: expected trailer dictionary");
 268
 269         return pdf_parsedict(trailerp, xref->file, buf, cap);
 270 }
 271
 272 static fz_error *
 273 readnewxref(fz_obj **trailerp, pdf_xref *xref, char *buf, int cap)
 274 {
 275         fz_error *error;
 276         fz_stream *stm;
 277         fz_obj *trailer;
 278         fz_obj *obj;
 279         int oid, gen, stmofs;
 280         int size, w0, w1, w2, i0, i1;
 281         int i, n;
 282
 283         pdf_logxref("load new xref format\n");
 284
 285         error = pdf_parseindobj(&trailer, xref->file, buf, cap, &oid, &gen, &stmofs);
 286         if (error)
 287                 return error;
 288
 289         if (oid < 0 || oid >= xref->len) {
 290                 error = fz_throw("rangecheck: object id out of range");
 291                 goto cleanup;
 292         }
 293
 294         xref->table[oid].type = 'n';
 295         xref->table[oid].gen = gen;
 296         xref->table[oid].obj = fz_keepobj(trailer);
 297         xref->table[oid].stmofs = stmofs;
 298
 299         obj = fz_dictgets(trailer, "Size");
 300         if (!obj) {
 301                 error = fz_throw("syntaxerror: xref stream missing Size entry");
 302                 goto cleanup;
 303         }
 304         size = fz_toint(obj);
 305
 306         obj = fz_dictgets(trailer, "W");
 307         if (!obj) {
 308                 error = fz_throw("syntaxerror: xref stream missing W entry");
 309                 goto cleanup;
 310         }
 311         w0 = fz_toint(fz_arrayget(obj, 0));
 312         w1 = fz_toint(fz_arrayget(obj, 1));
 313         w2 = fz_toint(fz_arrayget(obj, 2));
 314
 315         obj = fz_dictgets(trailer, "Index");
 316         if (obj) {
 317                 i0 = fz_toint(fz_arrayget(obj, 0));
 318                 i1 = fz_toint(fz_arrayget(obj, 1));
 319         }
 320         else {
 321                 i0 = 0;
 322                 i1 = size;
 323         }
 324
 325         if (i0 < 0 || i1 > xref->len) {
 326                 error = fz_throw("syntaxerror: xref stream has too many entries");
 327                 goto cleanup;
 328         }
 329
 330         error = pdf_openstream(&stm, xref, oid, gen);
 331         if (error)
 332                 goto cleanup;
 333
 334         for (i = i0; i < i0 + i1; i++)
 335         {
 336                 int a = 0;
 337                 int b = 0;
 338                 int c = 0;
 339
 340                 if (fz_peekbyte(stm) == EOF)
 341                 {
 342                         error = fz_throw("syntaxerror: truncated xref stream");
 343                         fz_dropstream(stm);
 344                         goto cleanup;
 345                 }
 346
 347                 for (n = 0; n < w0; n++)
 348                         a = (a << 8) + fz_readbyte(stm);
 349                 for (n = 0; n < w1; n++)
 350                         b = (b << 8) + fz_readbyte(stm);
 351                 for (n = 0; n < w2; n++)
 352                         c = (c << 8) + fz_readbyte(stm);
 353
 354                 if (!xref->table[i].type)
 355                 {
 356                         int t = w0 ? a : 1;
 357                         xref->table[i].type = t == 0 ? 'f' : t == 1 ? 'n' : t == 2 ? 'o' : 0;
 358                         xref->table[i].ofs = w2 ? b : 0;
 359                         xref->table[i].gen = w1 ? c : 0;
 360                 }
 361         }
 362
 363         fz_dropstream(stm);
 364
 365         *trailerp = trailer;
 366
 367         return nil;
 368
 369 cleanup:
 370         fz_dropobj(trailer);
 371         return error;
 372 }
 373
 374 static fz_error *
 375 readxref(fz_obj **trailerp, pdf_xref *xref, int ofs, char *buf, int cap)
 376 {
 377         int n;
 378         int c;
 379
 380         n = fz_seek(xref->file, ofs, 0);
 381         if (n < 0)
 382                 return fz_ioerror(xref->file);
 383
 384         c = fz_peekbyte(xref->file);
 385         if (c == 'x')
 386                 return readoldxref(trailerp, xref, buf, cap);
 387         else if (c >= '0' && c <= '9')
 388                 return readnewxref(trailerp, xref, buf, cap);
 389
 390         return fz_throw("syntaxerror: expected xref");
 391 }
 392
 393 static fz_error *
 394 readxrefsections(pdf_xref *xref, int ofs, char *buf, int cap)
 395 {
 396         fz_error *error;
 397         fz_obj *trailer;
 398         fz_obj *prev;
 399         fz_obj *xrefstm;
 400
 401         error = readxref(&trailer, xref, ofs, buf, cap);
 402         if (error)
 403                 return error;
 404
 405         /* FIXME: do we overwrite free entries properly? */
 406         xrefstm = fz_dictgets(trailer, "XrefStm");
 407         if (xrefstm)
 408         {
 409                 pdf_logxref("load xrefstm\n");
 410                 error = readxrefsections(xref, fz_toint(xrefstm), buf, cap);
 411                 if (error)
 412                         goto cleanup;
 413         }
 414
 415         prev = fz_dictgets(trailer, "Prev");
 416         if (prev)
 417         {
 418                 pdf_logxref("load prev\n");
 419                 error = readxrefsections(xref, fz_toint(prev), buf, cap);
 420                 if (error)
 421                         goto cleanup;
 422         }
 423
 424         fz_dropobj(trailer);
 425         return nil;
 426
 427 cleanup:
 428         fz_dropobj(trailer);
 429         return error;
 430 }
 431
 432 /*
 433  * compressed object streams
 434  */
 435
 436 fz_error *
 437 pdf_loadobjstm(pdf_xref *xref, int oid, int gen, char *buf, int cap)
 438 {
 439         fz_error *error;
 440         fz_stream *stm;
 441         fz_obj *objstm;
 442         int *oidbuf;
 443         int *ofsbuf;
 444
 445         fz_obj *obj;
 446         int first;
 447         int count;
 448         int i, n, t;
 449
 450         pdf_logxref("loadobjstm %d %d\n", oid, gen);
 451
 452         error = pdf_loadobject(&objstm, xref, oid, gen);
 453         if (error)
 454                 return error;
 455
 456         count = fz_toint(fz_dictgets(objstm, "N"));
 457         first = fz_toint(fz_dictgets(objstm, "First"));
 458
 459         pdf_logxref("  count %d\n", count);
 460
 461         oidbuf = fz_malloc(count * sizeof(int));
 462         if (!oidbuf) { error = fz_outofmem; goto cleanupobj; }
 463
 464         ofsbuf = fz_malloc(count * sizeof(int));
 465         if (!ofsbuf) { error = fz_outofmem; goto cleanupoid; }
 466
 467         error = pdf_openstream(&stm, xref, oid, gen);
 468         if (error)
 469                 goto cleanupofs;
 470
 471         for (i = 0; i < count; i++)
 472         {
 473                 t = pdf_lex(stm, buf, cap, &n);
 474                 if (t != PDF_TINT)
 475                 {
 476                         error = fz_throw("syntaxerror: corrupt object stream");
 477                         goto cleanupstm;
 478                 }
 479                 oidbuf[i] = atoi(buf);
 480
 481                 t = pdf_lex(stm, buf, cap, &n);
 482                 if (t != PDF_TINT)
 483                 {
 484                         error = fz_throw("syntaxerror: corrupt object stream");
 485                         goto cleanupstm;
 486                 }
 487                 ofsbuf[i] = atoi(buf);
 488         }
 489
 490         n = fz_seek(stm, first, 0);
 491         if (n < 0)
 492         {
 493                 error = fz_ioerror(stm);
 494                 goto cleanupstm;
 495         }
 496
 497         for (i = 0; i < count; i++)
 498         {
 499                 /* FIXME: seek to first + ofsbuf[i] */
 500
 501                 error = pdf_parsestmobj(&obj, stm, buf, cap);
 502                 if (error)
 503                         goto cleanupstm;
 504
 505                 if (oidbuf[i] < 1 || oidbuf[i] >= xref->len)
 506                 {
 507                         error = fz_throw("rangecheck: object number out of range");
 508                         goto cleanupstm;
 509                 }
 510
 511                 if (xref->table[oidbuf[i]].obj)
 512                         fz_dropobj(xref->table[oidbuf[i]].obj);
 513                 xref->table[oidbuf[i]].obj = obj;
 514         }
 515
 516         fz_dropstream(stm);
 517         fz_free(ofsbuf);
 518         fz_free(oidbuf);
 519         fz_dropobj(objstm);
 520         return nil;
 521
 522 cleanupstm:
 523         fz_dropstream(stm);
 524 cleanupofs:
 525         fz_free(ofsbuf);
 526 cleanupoid:
 527         fz_free(oidbuf);
 528 cleanupobj:
 529         fz_dropobj(objstm);
 530         return error;
 531 }
 532
 533 /*
 534  * open and load xref tables from pdf
 535  */
 536
 537 fz_error *
 538 pdf_loadxref(pdf_xref *xref, char *filename)
 539 {
 540         fz_error *error;
 541         fz_obj *size;
 542         int i;
 543
 544         char buf[65536];        /* yeowch! */
 545
 546         pdf_logxref("loadxref '%s' %p\n", filename, xref);
 547
 548         error = fz_openrfile(&xref->file, filename);
 549         if (error)
 550                 return error;
 551
 552         error = loadversion(xref);
 553         if (error)
 554                 return error;
 555
 556         error = readstartxref(xref);
 557         if (error)
 558                 return error;
 559
 560         error = readtrailer(xref, buf, sizeof buf);
 561         if (error)
 562                 return error;
 563
 564         size = fz_dictgets(xref->trailer, "Size");
 565         if (!size)
 566                 return fz_throw("syntaxerror: trailer missing Size entry");
 567
 568         pdf_logxref("  size %d\n", fz_toint(size));
 569
 570         assert(xref->table == nil);
 571
 572         xref->cap = fz_toint(size);
 573         xref->len = fz_toint(size);
 574         xref->table = fz_malloc(xref->cap * sizeof(pdf_xrefentry));
 575         if (!xref->table)
 576                 return fz_outofmem;
 577
 578         for (i = 0; i < xref->len; i++)
 579         {
 580                 xref->table[i].ofs = 0;
 581                 xref->table[i].gen = 0;
 582                 xref->table[i].type = 0;
 583                 xref->table[i].mark = 0;
 584                 xref->table[i].stmbuf = nil;
 585                 xref->table[i].stmofs = 0;
 586                 xref->table[i].obj = nil;
 587         }
 588
 589         error = readxrefsections(xref, xref->startxref, buf, sizeof buf);
 590         if (error)
 591                 return error;
 592
 593         return nil;
 594 }
 595