1 //========================================================================
6 // Copyright 1999-2000 G. Ovtcharov
7 //========================================================================
10 #include <poppler-config.h>
17 #include "parseargs.h"
18 #include "goo/GooString.h"
28 #include "HtmlOutputDev.h"
29 #include "PSOutputDev.h"
30 #include "GlobalParams.h"
32 #include "UGooString.h"
33 #include "goo/gfile.h"
36 # define GHOSTSCRIPT "gs"
// ---------------------------------------------------------------------
// Command-line option state.
// The variables below are the destinations referenced by the argDesc
// option table; the initializers are the values used when the matching
// option is absent. Variables not marked `static` have external linkage
// (presumably read by other translation units -- TODO confirm).
// ---------------------------------------------------------------------
// -f: first page to convert.
39 static int firstPage
= 1;
// -l: last page to convert. main() replaces any value < 1 or greater than
// the page count with the document's page count.
40 static int lastPage
= 0;
// The "-raw" table entry is commented out; main() overwrites this with
// complexMode.
41 static GBool rawOrder
= gTrue
;
// main() sets this to gFalse after creating globalParams.
42 GBool printCommands
= gTrue
;
// -h / -help.
43 static GBool printHelp
= gFalse
;
// -p: "exchange .pdf links by .html".
44 GBool printHtml
= gFalse
;
// -c: "generate complex document".
45 GBool complexMode
=gFalse
;
47 //char extension[5]=".png";
// -noframes.
49 GBool noframes
=gFalse
;
// -q: forwarded to globalParams->setErrQuiet() in main().
52 GBool errQuiet
=gFalse
;
// -hidden: "output hidden text".
55 GBool showHidden
= gFalse
;
// -nomerge: "do not merge paragraphs".
56 GBool noMerge
= gFalse
;
// -opw / -upw: passwords for encrypted files; empty string means "not given".
57 static char ownerPassword
[33] = "";
58 static char userPassword
[33] = "";
// -dev: Ghostscript output device name used when building the gs command.
59 static char gsDevice
[33] = "png16m";
// -v.
60 static GBool printVersion
= gFalse
;
// Helpers defined after main(); both take the document info dictionary and
// a key name.
62 static GooString
* getInfoString(Dict
*infoDict
, char *key
);
63 static GooString
* getInfoDate(Dict
*infoDict
, char *key
);
// -enc: output text encoding name; empty string means "not given".
65 static char textEncName
[128] = "";
// Option table handed to parseArgs() and printUsage() in main().
// Each visible entry lists: the flag string, the argument kind
// (argInt / argFlag / argFP / argString), the destination variable,
// a size field (sizeof the buffer for argString entries, 0 otherwise),
// and the usage text.
// ignore, stout, scale, xml and noDrm are not declared in the visible part
// of this file -- presumably declared elsewhere (TODO confirm).
67 static ArgDesc argDesc
[] = {
68 {"-f", argInt
, &firstPage
, 0,
69 "first page to convert"},
70 {"-l", argInt
, &lastPage
, 0,
71 "last page to convert"},
72 /*{"-raw", argFlag, &rawOrder, 0,
73 "keep strings in content stream order"},*/
74 {"-q", argFlag
, &errQuiet
, 0,
75 "don't print any messages or errors"},
// -h and -help are aliases: both set printHelp.
76 {"-h", argFlag
, &printHelp
, 0,
77 "print usage information"},
78 {"-help", argFlag
, &printHelp
, 0,
79 "print usage information"},
80 {"-p", argFlag
, &printHtml
, 0,
81 "exchange .pdf links by .html"},
82 {"-c", argFlag
, &complexMode
, 0,
83 "generate complex document"},
// NOTE(review): the usage text and closing brace of the "-i" entry are not
// visible in this view; main() tests `ignore` to skip the Ghostscript pass.
84 {"-i", argFlag
, &ignore
, 0,
86 {"-noframes", argFlag
, &noframes
, 0,
87 "generate no frames"},
88 {"-stdout" ,argFlag
, &stout
, 0,
89 "use standard output"},
// main() clamps scale to the range [0.5, 3.0] regardless of what is given.
90 {"-zoom", argFP
, &scale
, 0,
91 "zoom the pdf document (default 1.5)"},
92 {"-xml", argFlag
, &xml
, 0,
93 "output for XML post-processing"},
94 {"-hidden", argFlag
, &showHidden
, 0,
95 "output hidden text"},
96 {"-nomerge", argFlag
, &noMerge
, 0,
97 "do not merge paragraphs"},
// String options pass the destination buffer and its size.
98 {"-enc", argString
, textEncName
, sizeof(textEncName
),
99 "output text encoding name"},
100 {"-dev", argString
, gsDevice
, sizeof(gsDevice
),
101 "output device name for Ghostscript (png16m, jpeg etc)"},
102 {"-v", argFlag
, &printVersion
, 0,
103 "print copyright and version info"},
104 {"-opw", argString
, ownerPassword
, sizeof(ownerPassword
),
105 "owner password (for encrypted files)"},
106 {"-upw", argString
, userPassword
, sizeof(userPassword
),
107 "user password (for encrypted files)"},
108 {"-nodrm", argFlag
, &noDrm
, 0,
109 "override document DRM settings"},
// Program entry point.
// Visible flow: parse options via argDesc, open the PDF named by argv[1],
// derive the output base name (from argv[2] or from the input name), read
// document metadata, write the pages through an HtmlOutputDev, and -- when
// complexMode is set and neither xml nor ignore is -- render the pages to a
// PostScript file, run Ghostscript on it to produce page images, and then
// unlink the PostScript file.
// NOTE(review): several statements of this function (closing braces, exit
// paths, some declarations) are not visible in this view.
113 int main(int argc
, char *argv
[]) {
115 GooString
*fileName
= NULL
;
116 GooString
*docTitle
= NULL
;
117 GooString
*author
= NULL
, *keywords
= NULL
, *subject
= NULL
, *date
= NULL
;
118 GooString
*htmlFileName
= NULL
;
119 GooString
*psFileName
= NULL
;
120 HtmlOutputDev
*htmlOut
= NULL
;
121 PSOutputDev
*psOut
= NULL
;
// File extension used for the Ghostscript output images; may be replaced
// below based on the -dev device name.
124 char extension
[16] = "png";
// NOTE(review): declared without initializers; confirm that every path
// assigns both before they are passed to the PDFDoc constructor.
125 GooString
*ownerPW
, *userPW
;
// NULL-terminated list of image-format names searched for inside gsDevice.
127 char * extsList
[] = {"png", "jpeg", "bmp", "pcx", "tiff", "pbm", NULL
};
// Parse the command line into the option variables.
130 ok
= parseArgs(argDesc
, &argc
, argv
);
// On a parse failure, an argc outside 2..3, or -h / -v: print version,
// copyright and usage to stderr.
131 if (!ok
|| argc
< 2 || argc
> 3 || printHelp
|| printVersion
) {
132 fprintf(stderr
, "pdftohtml version %s http://pdftohtml.sourceforge.net/, based on Xpdf version %s\n", "0.36", xpdfVersion
);
133 fprintf(stderr
, "%s\n", "Copyright 1999-2003 Gueorgui Ovtcharov and Rainer Dorsch");
134 fprintf(stderr
, "%s\n\n", xpdfCopyright
);
136 printUsage("pdftohtml", "<PDF-file> [<html-file> <xml-file>]", argDesc
);
// Create the global parameter object and apply the -q setting.
145 globalParams
= new GlobalParams("");
148 globalParams
->setErrQuiet(errQuiet
);
149 printCommands
= gFalse
; // I'm not 100% sure what the difference between them is
// Apply the -enc text encoding if one was given, then check that
// getTextEncoding() returns non-null (the handling of a null result is not
// visible in this view).
152 if (textEncName
[0]) {
153 globalParams
->setTextEncoding(textEncName
);
154 if( !globalParams
->getTextEncoding() ) {
// Wrap non-empty passwords in GooString objects for PDFDoc.
160 if (ownerPassword
[0]) {
161 ownerPW
= new GooString(ownerPassword
);
165 if (userPassword
[0]) {
166 userPW
= new GooString(userPassword
);
// Open the input document named by the first positional argument.
171 fileName
= new GooString(argv
[1]);
173 doc
= new PDFDoc(fileName
, ownerPW
, userPW
);
184 // check for copy permission
185 if (!doc
->okToCopy()) {
187 error(-1, "Copying of text from this document is not allowed.");
190 fprintf(stderr
, "Document has copy-protection bit set.\n");
193 // construct text file name
// Output base name from argv[2]: p addresses the last 5 characters of tmp
// so a trailing ".html"/".HTML" can be detected and stripped.
// NOTE(review): no length check is visible; for an argv[2] shorter than 5
// characters p would point before the start of the buffer.
195 GooString
* tmp
= new GooString(argv
[2]);
196 p
=tmp
->getCString()+tmp
->getLength()-5;
198 if (!strcmp(p
, ".html") || !strcmp(p
, ".HTML"))
199 htmlFileName
= new GooString(tmp
->getCString(),
200 tmp
->getLength() - 5);
201 else htmlFileName
=new GooString(tmp
);
// NOTE(review): as visible here p still addresses the last 5 characters
// when it is compared with the 4-character ".xml"/".XML", so the comparison
// cannot match, and 5 (not 4) characters would be stripped -- verify
// against the lines not shown.
203 if (!strcmp(p
, ".xml") || !strcmp(p
, ".XML"))
204 htmlFileName
= new GooString(tmp
->getCString(),
205 tmp
->getLength() - 5);
206 else htmlFileName
=new GooString(tmp
);
// Output base name from the input file name: strip a trailing
// ".pdf"/".PDF" if present, otherwise copy the name unchanged.
// NOTE(review): same missing length guard as above for names shorter than
// 4 characters.
210 p
= fileName
->getCString() + fileName
->getLength() - 4;
211 if (!strcmp(p
, ".pdf") || !strcmp(p
, ".PDF"))
212 htmlFileName
= new GooString(fileName
->getCString(),
213 fileName
->getLength() - 4);
215 htmlFileName
= fileName
->copy();
216 // htmlFileName->append(".html");
// Clamp the zoom factor to [0.5, 3.0].
219 if (scale
>3.0) scale
=3.0;
220 if (scale
<0.5) scale
=0.5;
// An unset or out-of-range last page means "through the final page".
242 if (lastPage
< 1 || lastPage
> doc
->getNumPages())
243 lastPage
= doc
->getNumPages();
// Read metadata from the document info dictionary. Each helper result is
// either a GooString or NULL.
245 doc
->getDocInfo(&info
);
247 docTitle
= getInfoString(info
.getDict(), "Title");
248 author
= getInfoString(info
.getDict(), "Author");
249 keywords
= getInfoString(info
.getDict(), "Keywords");
250 subject
= getInfoString(info
.getDict(), "Subject");
// date is taken from "ModDate" and from "CreationDate"; the condition
// selecting between the two is not visible here (presumably CreationDate is
// the fallback -- TODO confirm).
251 date
= getInfoDate(info
.getDict(), "ModDate");
253 date
= getInfoDate(info
.getDict(), "CreationDate");
// Fall back to the output base name when the document has no title.
256 if( !docTitle
) docTitle
= new GooString(htmlFileName
);
258 /* determine extensions of output background images */
// For each known format name that occurs as a substring of gsDevice, copy
// it into extension.
260 for(i
= 0; extsList
[i
]; i
++)
262 if( strstr(gsDevice
, extsList
[i
]) != (char *) NULL
)
264 strncpy(extension
, extsList
[i
], sizeof(extension
));
269 rawOrder
= complexMode
; // TODO: figure out what exactly rawOrder does
// Create the HTML output device; optional metadata strings are passed as
// NULL when absent. (Some constructor arguments are not visible here.)
272 htmlOut
= new HtmlOutputDev(htmlFileName
->getCString(),
273 docTitle
->getCString(),
274 author
? author
->getCString() : NULL
,
275 keywords
? keywords
->getCString() : NULL
,
276 subject
? subject
->getCString() : NULL
,
277 date
? date
->getCString() : NULL
,
281 doc
->getCatalog()->getOutline()->isDict());
// Render the selected page range to the HTML device (72, 72, 0 are
// presumably horizontal/vertical DPI and rotation -- TODO confirm).
302 doc
->displayPages(htmlOut
, firstPage
, lastPage
, 72, 72, 0,
303 gTrue
, gFalse
, gFalse
);
306 htmlOut
->dumpDocOutline(doc
->getCatalog());
// Complex mode without -xml and without -i: produce page images through
// PostScript + Ghostscript.
310 if( complexMode
&& !xml
&& !ignore
) {
// Page size reported by the HTML device, divided by the zoom factor.
311 int h
=xoutRound(htmlOut
->getPageHeight()/scale
);
312 int w
=xoutRound(htmlOut
->getPageWidth()/scale
);
313 //int h=xoutRound(doc->getPageHeight(1)/scale);
314 //int w=xoutRound(doc->getPageWidth(1)/scale);
// PostScript file name: output base name + ".ps".
316 psFileName
= new GooString(htmlFileName
->getCString());
317 psFileName
->append(".ps");
320 // globalParams->setPSNoText(gTrue);
321 psOut
= new PSOutputDev(psFileName
->getCString(), doc
->getXRef(),
322 doc
->getCatalog(), firstPage
, lastPage
, psModePS
, w
, h
);
323 doc
->displayPages(psOut
, firstPage
, lastPage
, 72, 72, 0,
324 gTrue
, gFalse
, gFalse
);
327 /*sprintf(buf, "%s -sDEVICE=png16m -dBATCH -dNOPROMPT -dNOPAUSE -r72 -sOutputFile=%s%%03d.png -g%dx%d -q %s", GHOSTSCRIPT, htmlFileName->getCString(), w, h,
328 psFileName->getCString());*/
// Build the Ghostscript command line piece by piece: device, batch flags,
// resolution (72*scale), output file pattern <base>%03d.<extension>,
// geometry (scale*w, scale*h), then the PostScript input file.
// The appends of sc, tw and th are not visible in this view.
// NOTE(review): the file names are placed between double quotes without
// escaping; if executeCommand goes through a shell, names containing quotes
// or shell metacharacters would alter the command -- TODO confirm.
330 GooString
*gsCmd
= new GooString(GHOSTSCRIPT
);
331 GooString
*tw
, *th
, *sc
;
332 gsCmd
->append(" -sDEVICE=");
333 gsCmd
->append(gsDevice
);
334 gsCmd
->append(" -dBATCH -dNOPROMPT -dNOPAUSE -r");
335 sc
= GooString::fromInt(static_cast<int>(72*scale
));
337 gsCmd
->append(" -sOutputFile=");
339 gsCmd
->append(htmlFileName
);
340 gsCmd
->append("%03d.");
341 gsCmd
->append(extension
);
342 gsCmd
->append("\" -g");
343 tw
= GooString::fromInt(static_cast<int>(scale
*w
));
346 th
= GooString::fromInt(static_cast<int>(scale
*h
));
348 gsCmd
->append(" -q \"");
349 gsCmd
->append(psFileName
);
351 // printf("running: %s\n", gsCmd->getCString());
// Report a failed launch unless -q was given.
352 if( !executeCommand(gsCmd
->getCString()) && !errQuiet
) {
353 error(-1, "Failed to launch Ghostscript!\n");
// Remove the intermediate PostScript file.
355 unlink(psFileName
->getCString());
// Cleanup of objects allocated above (only part of it is visible here).
368 if(globalParams
) delete globalParams
;
370 if(htmlFileName
) delete htmlFileName
;
373 // check for memory leaks
374 Object::memCheck(stderr
);
// Looks up `key` in the document info dictionary and, when the entry is a
// string, allocates a new GooString copy of it into s1.
// s1 stays NULL when the entry is not a string.
// The declaration of obj, any cleanup of it, and the return statement are
// not visible in this view; presumably s1 is the return value (TODO
// confirm). main() treats a NULL result as "field absent".
380 static GooString
* getInfoString(Dict
*infoDict
, char *key
) {
382 GooString
*s1
= NULL
;
384 if (infoDict
->lookup(key
, &obj
)->isString()) {
385 s1
= new GooString(obj
.getString());
// Looks up `key` in the document info dictionary and, when the entry is a
// string, converts it to a date string held in a new GooString.
// Visible logic: if the string starts with "D:", six numeric fields (year,
// month, day, hour, minute, second) are parsed with sscanf, normalized
// through mktime(), and formatted as "%Y-%m-%dT%H:%M:%S+00:00". Two
// fallback assignments copy the raw string s instead.
// result stays NULL when the entry is not a string.
// NOTE(review): declarations of obj, s, buf and tmStruct, the exact branch
// structure of the fallbacks, and the return statement are not visible in
// this view.
391 static GooString
* getInfoDate(Dict
*infoDict
, char *key
) {
394 int year
, mon
, day
, hour
, min
, sec
;
396 GooString
*result
= NULL
;
399 if (infoDict
->lookup(key
, &obj
)->isString()) {
400 s
= obj
.getString()->getCString();
// PDF date strings carry a "D:" prefix.
401 if (s
[0] == 'D' && s
[1] == ':') {
// All six fields must parse for the formatted path to be taken.
404 if (sscanf(s
, "%4d%2d%2d%2d%2d%2d",
405 &year
, &mon
, &day
, &hour
, &min
, &sec
) == 6) {
// struct tm counts years from 1900 and months from 0.
406 tmStruct
.tm_year
= year
- 1900;
407 tmStruct
.tm_mon
= mon
- 1;
408 tmStruct
.tm_mday
= day
;
409 tmStruct
.tm_hour
= hour
;
410 tmStruct
.tm_min
= min
;
411 tmStruct
.tm_sec
= sec
;
412 tmStruct
.tm_wday
= -1;
413 tmStruct
.tm_yday
= -1;
414 tmStruct
.tm_isdst
= -1;
415 mktime(&tmStruct
); // compute the tm_wday and tm_yday fields
// NOTE(review): the "+00:00" offset is a literal in the format string; no
// time-zone field is parsed from the input in the visible code.
416 if (strftime(buf
, sizeof(buf
), "%Y-%m-%dT%H:%M:%S+00:00", &tmStruct
)) {
417 result
= new GooString(buf
);
// Fallbacks: keep the raw string (presumably when strftime or sscanf fails
// -- TODO confirm which branch each belongs to).
419 result
= new GooString(s
);
422 result
= new GooString(s
);