1 //========================================================================
5 // Copyright 1997-2003 Glyph & Cog, LLC
7 //========================================================================
9 #ifndef TEXTOUTPUTDEV_H
10 #define TEXTOUTPUTDEV_H
12 #ifdef USE_GCC_PRAGMAS
16 #include "poppler-config.h"
18 #include "goo/gtypes.h"
21 #include "OutputDev.h"
38 class TextSelectionVisitor
;
40 //------------------------------------------------------------------------
// Pointer to a text output callback.  The callback receives an opaque
// <stream> pointer, a <text> buffer, and an int <len> (presumably the
// number of chars in <text> -- confirm against the callers).
42 typedef void (*TextOutputFunc
)(void *stream
, char *text
, int len
);
44 //------------------------------------------------------------------------
46 //------------------------------------------------------------------------
51 TextFontInfo(GfxState
*state
);
54 GBool
matches(GfxState
*state
);
63 friend class TextWord
;
64 friend class TextPage
;
65 friend class TextSelectionPainter
;
68 //------------------------------------------------------------------------
70 //------------------------------------------------------------------------
76 TextWord(GfxState
*state
, int rotA
, double x0
, double y0
,
77 int charPosA
, TextFontInfo
*fontA
, double fontSize
);
82 // Add a character to the word.
83 void addChar(GfxState
*state
, double x
, double y
,
84 double dx
, double dy
, CharCode c
, Unicode u
);
86 // Merge <word> onto the end of <this>.
87 void merge(TextWord
*word
);
89 // Compares <this> to <word>, returning -1 (<), 0 (=), or +1 (>),
90 // based on a primary-axis comparison, e.g., x ordering if rot=0.
91 int primaryCmp(TextWord
*word
);
93 // Return the distance along the primary axis between <this> and
// <word>.
95 double primaryDelta(TextWord
*word
);
97 static int cmpYX(const void *p1
, const void *p2
);
99 void visitSelection(TextSelectionVisitor
*visitor
,
100 PDFRectangle
*selection
);
102 #if TEXTOUT_WORD_LIST
103 int getLength() { return len
; }
104 const Unicode
*getChar(int idx
) { return &text
[idx
]; }
105 GooString
*getText();
106 GooString
*getFontName() { return font
->fontName
; }
107 void getColor(double *r
, double *g
, double *b
)
108 { *r
= colorR
; *g
= colorG
; *b
= colorB
; }
109 void getBBox(double *xMinA
, double *yMinA
, double *xMaxA
, double *yMaxA
)
110 { *xMinA
= xMin
; *yMinA
= yMin
; *xMaxA
= xMax
; *yMaxA
= yMax
; }
111 double getFontSize() { return fontSize
; }
112 int getRotation() { return rot
; }
113 int getCharPos() { return charPos
; }
114 int getCharLen() { return charLen
; }
116 double getEdge(int i
) { return edge
[i
]; }
117 double getBaseline () { return base
; }
118 GBool
hasSpaceAfter () { return spaceAfter
; }
119 TextWord
* nextWord () { return next
; };
122 int rot
; // rotation, multiple of 90 degrees
124 double xMin
, xMax
; // bounding box x coordinates
125 double yMin
, yMax
; // bounding box y coordinates
126 double base
; // baseline x or y coordinate
127 Unicode
*text
; // the text
128 CharCode
*charcode
; // glyph indices
129 double *edge
; // "near" edge x or y coord of each char
130 // (plus one extra entry for the last char)
131 int len
; // length of text and edge arrays
132 int size
; // size of text and edge arrays
133 int charPos
; // character position (within content stream)
134 int charLen
; // number of content stream characters in
// this word
136 TextFontInfo
*font
; // font information
137 double fontSize
; // font size
138 GBool spaceAfter
; // set if there is a space between this
139 // word and the next word on the line
140 TextWord
*next
; // next word in line
142 #if TEXTOUT_WORD_LIST
143 double colorR
, // word color
148 friend class TextPool
;
149 friend class TextLine
;
150 friend class TextBlock
;
151 friend class TextFlow
;
152 friend class TextWordList
;
153 friend class TextPage
;
155 friend class TextSelectionPainter
;
156 friend class TextSelectionDumper
;
159 //------------------------------------------------------------------------
161 //------------------------------------------------------------------------
169 TextWord
*getPool(int baseIdx
) { return pool
[baseIdx
- minBaseIdx
]; }
170 void setPool(int baseIdx
, TextWord
*p
) { pool
[baseIdx
- minBaseIdx
] = p
; }
172 int getBaseIdx(double base
);
174 void addWord(TextWord
*word
);
178 int minBaseIdx
; // min baseline bucket index
179 int maxBaseIdx
; // max baseline bucket index
180 TextWord
**pool
; // array of linked lists, one for each
181 // baseline value (multiple of 4 pts)
182 TextWord
*cursor
; // pointer to last-accessed word
183 int cursorBaseIdx
; // baseline bucket index of last-accessed word
185 friend class TextBlock
;
186 friend class TextPage
;
191 //------------------------------------------------------------------------
193 //------------------------------------------------------------------------
198 TextLine(TextBlock
*blkA
, int rotA
, double baseA
);
201 void addWord(TextWord
*word
);
203 // Return the distance along the primary axis between <this> and
// <line>.
205 double primaryDelta(TextLine
*line
);
207 // Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>),
208 // based on a primary-axis comparison, e.g., x ordering if rot=0.
209 int primaryCmp(TextLine
*line
);
211 // Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>),
212 // based on a secondary-axis comparison of the baselines, e.g., y
213 // ordering if rot=0.
214 int secondaryCmp(TextLine
*line
);
216 int cmpYX(TextLine
*line
);
218 static int cmpXY(const void *p1
, const void *p2
);
220 void coalesce(UnicodeMap
*uMap
);
222 void visitSelection(TextSelectionVisitor
*visitor
,
223 PDFRectangle
*selection
);
227 TextBlock
*blk
; // parent block
228 int rot
; // text rotation
229 double xMin
, xMax
; // bounding box x coordinates
230 double yMin
, yMax
; // bounding box y coordinates
231 double base
; // baseline x or y coordinate
232 TextWord
*words
; // words in this line
233 TextWord
*lastWord
; // last word in this line
234 Unicode
*text
; // Unicode text of the line, including
235 // spaces between words
236 double *edge
; // "near" edge x or y coord of each char
237 // (plus one extra entry for the last char)
238 int *col
; // starting column number of each Unicode char
239 int len
; // number of Unicode chars
240 int convertedLen
; // total number of converted characters
241 GBool hyphenated
; // set if last char is a hyphen
242 TextLine
*next
; // next line in block
243 Unicode
*normalized
; // normalized form of Unicode text
244 int normalized_len
; // number of normalized Unicode chars
245 int *normalized_idx
; // indices of normalized chars into Unicode text
247 friend class TextLineFrag
;
248 friend class TextBlock
;
249 friend class TextFlow
;
250 friend class TextWordList
;
251 friend class TextPage
;
253 friend class TextSelectionPainter
;
254 friend class TextSelectionSizer
;
255 friend class TextSelectionDumper
;
258 //------------------------------------------------------------------------
260 //------------------------------------------------------------------------
265 TextBlock(TextPage
*pageA
, int rotA
);
268 void addWord(TextWord
*word
);
270 void coalesce(UnicodeMap
*uMap
);
272 // Update this block's priMin and priMax values, looking at <blk>.
273 void updatePriMinMax(TextBlock
*blk
);
275 static int cmpXYPrimaryRot(const void *p1
, const void *p2
);
277 static int cmpYXPrimaryRot(const void *p1
, const void *p2
);
279 int primaryCmp(TextBlock
*blk
);
281 double secondaryDelta(TextBlock
*blk
);
283 // Returns true if <this> is below <blk>, relative to the page's
// primary rotation.
285 GBool
isBelow(TextBlock
*blk
);
287 void visitSelection(TextSelectionVisitor
*visitor
,
288 PDFRectangle
*selection
);
292 TextPage
*page
; // the parent page
293 int rot
; // text rotation
294 double xMin
, xMax
; // bounding box x coordinates
295 double yMin
, yMax
; // bounding box y coordinates
296 double priMin
, priMax
; // whitespace bounding box along primary axis
298 TextPool
*pool
; // pool of words (used only until lines
// are built -- NOTE(review): tail of this comment was cut off;
// wording presumed, confirm against the implementation)
300 TextLine
*lines
; // linked list of lines
301 TextLine
*curLine
; // most recently added line
302 int nLines
; // number of lines
303 int charCount
; // number of characters in the block
304 int col
; // starting column
305 int nColumns
; // number of columns in the block
308 TextBlock
*stackNext
;
310 friend class TextLine
;
311 friend class TextLineFrag
;
312 friend class TextFlow
;
313 friend class TextWordList
;
314 friend class TextPage
;
315 friend class TextSelectionPainter
;
318 //------------------------------------------------------------------------
320 //------------------------------------------------------------------------
325 TextFlow(TextPage
*pageA
, TextBlock
*blk
);
328 // Add a block to the end of this flow.
329 void addBlock(TextBlock
*blk
);
331 // Returns true if <blk> fits below <prevBlk> in the flow, i.e., (1)
332 // it uses a font no larger than the last block added to the flow,
333 // and (2) it fits within the flow's [priMin, priMax] along the
// primary axis.
335 GBool
blockFits(TextBlock
*blk
, TextBlock
*prevBlk
);
339 TextPage
*page
; // the parent page
340 double xMin
, xMax
; // bounding box x coordinates
341 double yMin
, yMax
; // bounding box y coordinates
342 double priMin
, priMax
; // whitespace bounding box along primary axis
343 TextBlock
*blocks
; // blocks in flow
344 TextBlock
*lastBlk
; // last block in this flow
347 friend class TextWordList
;
348 friend class TextPage
;
351 #if TEXTOUT_WORD_LIST
353 //------------------------------------------------------------------------
355 //------------------------------------------------------------------------
360 // Build a flat word list, in content stream order (if
361 // text->rawOrder is true), physical layout order (if <physLayout>
362 // is true and text->rawOrder is false), or reading order (if both
// flags are false).
364 TextWordList(TextPage
*text
, GBool physLayout
);
368 // Return the number of words on the list.
371 // Return the <idx>th word from the list.
372 TextWord
*get(int idx
);
379 #endif // TEXTOUT_WORD_LIST
381 //------------------------------------------------------------------------
383 //------------------------------------------------------------------------
389 TextPage(GBool rawOrderA
);
395 void startPage(GfxState
*state
);
397 // End the current page.
400 // Update the current font.
401 void updateFont(GfxState
*state
);
404 void beginWord(GfxState
*state
, double x0
, double y0
);
406 // Add a character to the current word.
407 void addChar(GfxState
*state
, double x
, double y
,
408 double dx
, double dy
,
409 CharCode c
, int nBytes
, Unicode
*u
, int uLen
);
411 // End the current word, sorting it into the list of words.
414 // Add a word, sorting it into the list of words.
415 void addWord(TextWord
*word
);
417 // Coalesce strings that look like parts of the same line.
418 void coalesce(GBool physLayout
);
420 // Find a string. If <startAtTop> is true, starts looking at the
421 // top of the page; else if <startAtLast> is true, starts looking
422 // immediately after the last find result; else starts looking at
423 // <xMin>,<yMin>. If <stopAtBottom> is true, stops looking at the
424 // bottom of the page; else if <stopAtLast> is true, stops looking
425 // just before the last find result; else stops looking at
// <xMax>,<yMax>.
// NOTE(review): <caseSensitive> and <backward> are not described
// here; presumably they select case-sensitive matching and a reverse
// search direction -- confirm against the implementation.
427 GBool
findText(Unicode
*s
, int len
,
428 GBool startAtTop
, GBool stopAtBottom
,
429 GBool startAtLast
, GBool stopAtLast
,
430 GBool caseSensitive
, GBool backward
,
431 double *xMin
, double *yMin
,
432 double *xMax
, double *yMax
);
434 // Get the text which is inside the specified rectangle.
435 GooString
*getText(double xMin
, double yMin
,
436 double xMax
, double yMax
);
438 void visitSelection(TextSelectionVisitor
*visitor
,
439 PDFRectangle
*selection
);
441 void drawSelection(OutputDev
*out
,
444 PDFRectangle
*selection
,
445 GfxColor
*glyph_color
, GfxColor
*box_color
);
447 GooList
*getSelectionRegion(PDFRectangle
*selection
, double scale
);
449 GooString
*getSelectionText(PDFRectangle
*selection
);
451 // Find a string by character position and length. If found, sets
452 // the text bounding rectangle and returns true; otherwise returns
// false.
454 GBool
findCharRange(int pos
, int length
,
455 double *xMin
, double *yMin
,
456 double *xMax
, double *yMax
);
458 // Dump contents of page to a file.
459 void dump(void *outputStream
, TextOutputFunc outputFunc
,
462 #if TEXTOUT_WORD_LIST
463 // Build a flat word list, in content stream order (if
464 // this->rawOrder is true), physical layout order (if <physLayout>
465 // is true and this->rawOrder is false), or reading order (if both
// flags are false).
467 TextWordList
*makeWordList(GBool physLayout
);
473 void assignColumns(TextLineFrag
*frags
, int nFrags
, int rot
);
474 int dumpFragment(Unicode
*text
, int len
, UnicodeMap
*uMap
, GooString
*s
);
476 GBool rawOrder
; // keep text in content stream order
478 double pageWidth
, pageHeight
; // width and height of current page
479 TextWord
*curWord
; // currently active string
480 int charPos
; // next character position (within content
// stream)
482 TextFontInfo
*curFont
; // current font
483 double curFontSize
; // current font size
484 int nest
; // current nesting level (for Type 3 fonts)
485 int nTinyChars
; // number of "tiny" chars seen so far
486 GBool lastCharOverlap
; // set if the last added char overlapped the
// previous char -- NOTE(review): tail of this comment was cut
// off; wording presumed, confirm against addChar()
489 TextPool
*pools
[4]; // a "pool" of TextWords for each rotation
490 TextFlow
*flows
; // linked list of flows
491 TextBlock
**blocks
; // array of blocks, in yx order
492 int nBlocks
; // number of blocks
493 int primaryRot
; // primary rotation
494 GBool primaryLR
; // primary direction (true means L-to-R,
495 // false means R-to-L)
496 TextWord
*rawWords
; // list of words, in raw order (only if
// rawOrder is set)
498 TextWord
*rawLastWord
; // last word on rawWords list
500 GooList
*fonts
; // all font info objects used on this
501 // page [TextFontInfo]
503 double lastFindXMin
, // coordinates of the last "find" result
507 friend class TextLine
;
508 friend class TextLineFrag
;
509 friend class TextBlock
;
510 friend class TextFlow
;
511 friend class TextWordList
;
512 friend class TextSelectionPainter
;
513 friend class TextSelectionDumper
;
516 //------------------------------------------------------------------------
518 //------------------------------------------------------------------------
520 class TextOutputDev
: public OutputDev
{
523 // Open a text output file. If <fileName> is NULL, no file is
524 // written (this is useful, e.g., for searching text). If
525 // <physLayoutA> is true, the original physical layout of the text
526 // is maintained. If <rawOrderA> is true, the text is kept in
527 // content stream order.
// NOTE(review): <append> is not described here; presumably the file
// is opened for appending instead of being truncated -- confirm.
528 TextOutputDev(char *fileName
, GBool physLayoutA
,
529 GBool rawOrderA
, GBool append
);
531 // Create a TextOutputDev which will write to a generic stream. If
532 // <physLayoutA> is true, the original physical layout of the text
533 // is maintained. If <rawOrderA> is true, the text is kept in
534 // content stream order.  <func> is the output callback and <stream>
// is the opaque pointer of the TextOutputFunc signature (presumably
// passed through to <func> unchanged -- confirm).
535 TextOutputDev(TextOutputFunc func
, void *stream
,
536 GBool physLayoutA
, GBool rawOrderA
);
539 virtual ~TextOutputDev();
541 // Check if file was successfully created.
542 virtual GBool
isOk() { return ok
; }
544 //---- get info about output device
546 // Does this device use upside-down coordinates?
547 // (Upside-down means (0,0) is the top left corner of the page.)
548 virtual GBool
upsideDown() { return gTrue
; }
550 // Does this device use drawChar() or drawString()?
551 virtual GBool
useDrawChar() { return gTrue
; }
553 // Does this device use beginType3Char/endType3Char? Otherwise,
554 // text in Type 3 fonts will be drawn with drawChar/drawString.
555 virtual GBool
interpretType3Chars() { return gFalse
; }
557 // Does this device need non-text content?
558 virtual GBool
needNonText() { return gFalse
; }
560 //----- initialization and control
563 virtual void startPage(int pageNum
, GfxState
*state
);
566 virtual void endPage();
568 //----- update text state
569 virtual void updateFont(GfxState
*state
);
572 virtual void beginString(GfxState
*state
, GooString
*s
);
573 virtual void endString(GfxState
*state
);
574 virtual void drawChar(GfxState
*state
, double x
, double y
,
575 double dx
, double dy
,
576 double originX
, double originY
,
577 CharCode c
, int nBytes
, Unicode
*u
, int uLen
);
579 //----- special access
581 // Find a string. If <startAtTop> is true, starts looking at the
582 // top of the page; else if <startAtLast> is true, starts looking
583 // immediately after the last find result; else starts looking at
584 // <xMin>,<yMin>. If <stopAtBottom> is true, stops looking at the
585 // bottom of the page; else if <stopAtLast> is true, stops looking
586 // just before the last find result; else stops looking at
// <xMax>,<yMax>.
// NOTE(review): <caseSensitive> and <backward> are not described
// here; presumably they select case-sensitive matching and a reverse
// search direction -- confirm against the implementation.
588 GBool
findText(Unicode
*s
, int len
,
589 GBool startAtTop
, GBool stopAtBottom
,
590 GBool startAtLast
, GBool stopAtLast
,
591 GBool caseSensitive
, GBool backward
,
592 double *xMin
, double *yMin
,
593 double *xMax
, double *yMax
);
595 // Get the text which is inside the specified rectangle.
596 GooString
*getText(double xMin
, double yMin
,
597 double xMax
, double yMax
);
599 // Find a string by character position and length. If found, sets
600 // the text bounding rectangle and returns true; otherwise returns
// false.
602 GBool
findCharRange(int pos
, int length
,
603 double *xMin
, double *yMin
,
604 double *xMax
, double *yMax
);
606 void drawSelection(OutputDev
*out
, double scale
, int rotation
,
607 PDFRectangle
*selection
,
608 GfxColor
*glyph_color
, GfxColor
*box_color
);
610 GooList
*getSelectionRegion(PDFRectangle
*selection
, double scale
);
612 GooString
*getSelectionText(PDFRectangle
*selection
);
614 #if TEXTOUT_WORD_LIST
615 // Build a flat word list, in content stream order (if
616 // this->rawOrder is true), physical layout order (if
617 // this->physLayout is true and this->rawOrder is false), or reading
618 // order (if both flags are false).
619 TextWordList
*makeWordList();
622 // Returns the TextPage object for the last rasterized page,
623 // transferring ownership to the caller.
624 TextPage
*takeText();
628 TextOutputFunc outputFunc
; // output function
629 void *outputStream
; // output stream
630 GBool needClose
; // need to close the output file?
631 // (only if outputStream is a FILE*)
632 TextPage
*text
; // text for the current page
633 GBool physLayout
; // maintain original physical layout when
// dumping text -- NOTE(review): tail of this comment was cut
// off; wording presumed, confirm against the implementation
635 GBool rawOrder
; // keep text in content stream order
636 GBool ok
; // set up ok?