/*
 * Summary: interface for an HTML 4.0 non-verifying parser
 * Description: this module implements an HTML 4.0 non-verifying parser
 *              with API compatible with the XML parser ones. It should
 *              be able to parse "real world" HTML, even if severely
 *              broken from a specification point of view.
 *
 * Copy: See Copyright for the status of this software.
 *
 * Author: Daniel Veillard
 */
13 #ifndef __HTML_PARSER_H__
14 #define __HTML_PARSER_H__
15 #include <libxml/xmlversion.h>
16 #include <libxml/parser.h>
18 #ifdef LIBXML_HTML_ENABLED
25 * Most of the back-end structures from XML and HTML are shared.
27 typedef xmlParserCtxt htmlParserCtxt
;
28 typedef xmlParserCtxtPtr htmlParserCtxtPtr
;
29 typedef xmlParserNodeInfo htmlParserNodeInfo
;
30 typedef xmlSAXHandler htmlSAXHandler
;
31 typedef xmlSAXHandlerPtr htmlSAXHandlerPtr
;
32 typedef xmlParserInput htmlParserInput
;
33 typedef xmlParserInputPtr htmlParserInputPtr
;
34 typedef xmlDocPtr htmlDocPtr
;
35 typedef xmlNodePtr htmlNodePtr
;
/*
 * Internal description of an HTML element, representing HTML 4.01
 * and XHTML 1.0 (which share the same structure).
 */
typedef struct _htmlElemDesc htmlElemDesc;
typedef htmlElemDesc *htmlElemDescPtr;
struct _htmlElemDesc {
    const char *name;       /* The tag name */
    char startTag;          /* Whether the start tag can be implied */
    char endTag;            /* Whether the end tag can be implied */
    char saveEndTag;        /* Whether the end tag should be saved */
    char empty;             /* Is this an empty element ? */
    char depr;              /* Is this a deprecated element ? */
    char dtd;               /* 1: only in Loose DTD, 2: only Frameset one */
    char isinline;          /* is this a block 0 or inline 1 element */
    const char *desc;       /* the description */

    /*
     * New fields encapsulating HTML structure
     *
     * This is a very limited representation.  It fails to tell us when
     * an element *requires* subelements (we only have whether they're
     * allowed or not), and it doesn't tell us where CDATA and PCDATA
     * are allowed.  Some element relationships are not fully represented:
     * these are flagged with the word MODIFIER
     */
    const char **subelts;       /* allowed sub-elements of this element */
    const char *defaultsubelt;  /* subelement for suggested auto-repair
                                   if necessary or NULL */
    const char **attrs_opt;     /* Optional Attributes */
    const char **attrs_depr;    /* Additional deprecated attributes */
    const char **attrs_req;     /* Required attributes */
};
/*
 * Internal description of an HTML entity.
 */
typedef struct _htmlEntityDesc htmlEntityDesc;
typedef htmlEntityDesc *htmlEntityDescPtr;
struct _htmlEntityDesc {
    unsigned int value;     /* the UNICODE value for the character */
    const char *name;       /* The entity name */
    const char *desc;       /* the description */
};
84 * There is only few public functions.
86 XMLPUBFUN
const htmlElemDesc
* XMLCALL
87 htmlTagLookup (const xmlChar
*tag
);
88 XMLPUBFUN
const htmlEntityDesc
* XMLCALL
89 htmlEntityLookup(const xmlChar
*name
);
90 XMLPUBFUN
const htmlEntityDesc
* XMLCALL
91 htmlEntityValueLookup(unsigned int value
);
94 htmlIsAutoClosed(htmlDocPtr doc
,
97 htmlAutoCloseTag(htmlDocPtr doc
,
100 XMLPUBFUN
const htmlEntityDesc
* XMLCALL
101 htmlParseEntityRef(htmlParserCtxtPtr ctxt
,
102 const xmlChar
**str
);
103 XMLPUBFUN
int XMLCALL
104 htmlParseCharRef(htmlParserCtxtPtr ctxt
);
105 XMLPUBFUN
void XMLCALL
106 htmlParseElement(htmlParserCtxtPtr ctxt
);
108 XMLPUBFUN htmlParserCtxtPtr XMLCALL
109 htmlNewParserCtxt(void);
111 XMLPUBFUN htmlParserCtxtPtr XMLCALL
112 htmlCreateMemoryParserCtxt(const char *buffer
,
115 XMLPUBFUN
int XMLCALL
116 htmlParseDocument(htmlParserCtxtPtr ctxt
);
117 XMLPUBFUN htmlDocPtr XMLCALL
118 htmlSAXParseDoc (xmlChar
*cur
,
119 const char *encoding
,
120 htmlSAXHandlerPtr sax
,
122 XMLPUBFUN htmlDocPtr XMLCALL
123 htmlParseDoc (xmlChar
*cur
,
124 const char *encoding
);
125 XMLPUBFUN htmlDocPtr XMLCALL
126 htmlSAXParseFile(const char *filename
,
127 const char *encoding
,
128 htmlSAXHandlerPtr sax
,
130 XMLPUBFUN htmlDocPtr XMLCALL
131 htmlParseFile (const char *filename
,
132 const char *encoding
);
133 XMLPUBFUN
int XMLCALL
134 UTF8ToHtml (unsigned char *out
,
136 const unsigned char *in
,
138 XMLPUBFUN
int XMLCALL
139 htmlEncodeEntities(unsigned char *out
,
141 const unsigned char *in
,
142 int *inlen
, int quoteChar
);
143 XMLPUBFUN
int XMLCALL
144 htmlIsScriptAttribute(const xmlChar
*name
);
145 XMLPUBFUN
int XMLCALL
146 htmlHandleOmittedElem(int val
);
148 #ifdef LIBXML_PUSH_ENABLED
150 * Interfaces for the Push mode.
152 XMLPUBFUN htmlParserCtxtPtr XMLCALL
153 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax
,
157 const char *filename
,
158 xmlCharEncoding enc
);
159 XMLPUBFUN
int XMLCALL
160 htmlParseChunk (htmlParserCtxtPtr ctxt
,
164 #endif /* LIBXML_PUSH_ENABLED */
166 XMLPUBFUN
void XMLCALL
167 htmlFreeParserCtxt (htmlParserCtxtPtr ctxt
);
170 * New set of simpler/more flexible APIs
/*
 * This is the set of HTML parser options that can be passed down
 * to the htmlReadDoc() and similar calls.
 */
179 HTML_PARSE_RECOVER
= 1<<0, /* Relaxed parsing */
180 HTML_PARSE_NOERROR
= 1<<5, /* suppress error reports */
181 HTML_PARSE_NOWARNING
= 1<<6, /* suppress warning reports */
182 HTML_PARSE_PEDANTIC
= 1<<7, /* pedantic error reporting */
183 HTML_PARSE_NOBLANKS
= 1<<8, /* remove blank nodes */
184 HTML_PARSE_NONET
= 1<<11,/* Forbid network access */
185 HTML_PARSE_COMPACT
= 1<<16 /* compact small text nodes */
188 XMLPUBFUN
void XMLCALL
189 htmlCtxtReset (htmlParserCtxtPtr ctxt
);
190 XMLPUBFUN
int XMLCALL
191 htmlCtxtUseOptions (htmlParserCtxtPtr ctxt
,
193 XMLPUBFUN htmlDocPtr XMLCALL
194 htmlReadDoc (const xmlChar
*cur
,
196 const char *encoding
,
198 XMLPUBFUN htmlDocPtr XMLCALL
199 htmlReadFile (const char *URL
,
200 const char *encoding
,
202 XMLPUBFUN htmlDocPtr XMLCALL
203 htmlReadMemory (const char *buffer
,
206 const char *encoding
,
208 XMLPUBFUN htmlDocPtr XMLCALL
211 const char *encoding
,
213 XMLPUBFUN htmlDocPtr XMLCALL
214 htmlReadIO (xmlInputReadCallback ioread
,
215 xmlInputCloseCallback ioclose
,
218 const char *encoding
,
220 XMLPUBFUN htmlDocPtr XMLCALL
221 htmlCtxtReadDoc (xmlParserCtxtPtr ctxt
,
224 const char *encoding
,
226 XMLPUBFUN htmlDocPtr XMLCALL
227 htmlCtxtReadFile (xmlParserCtxtPtr ctxt
,
228 const char *filename
,
229 const char *encoding
,
231 XMLPUBFUN htmlDocPtr XMLCALL
232 htmlCtxtReadMemory (xmlParserCtxtPtr ctxt
,
236 const char *encoding
,
238 XMLPUBFUN htmlDocPtr XMLCALL
239 htmlCtxtReadFd (xmlParserCtxtPtr ctxt
,
242 const char *encoding
,
244 XMLPUBFUN htmlDocPtr XMLCALL
245 htmlCtxtReadIO (xmlParserCtxtPtr ctxt
,
246 xmlInputReadCallback ioread
,
247 xmlInputCloseCallback ioclose
,
250 const char *encoding
,
253 /* NRK/Jan2003: further knowledge of HTML structure
256 HTML_NA
= 0 , /* something we don't check at all */
258 HTML_DEPRECATED
= 0x2 ,
260 HTML_REQUIRED
= 0xc /* VALID bit set so ( & HTML_VALID ) is TRUE */
263 /* Using htmlElemDesc rather than name here, to emphasise the fact
264 that otherwise there's a lookup overhead
266 XMLPUBFUN htmlStatus XMLCALL
htmlAttrAllowed(const htmlElemDesc
*, const xmlChar
*, int) ;
267 XMLPUBFUN
int XMLCALL
htmlElementAllowedHere(const htmlElemDesc
*, const xmlChar
*) ;
268 XMLPUBFUN htmlStatus XMLCALL
htmlElementStatusHere(const htmlElemDesc
*, const htmlElemDesc
*) ;
269 XMLPUBFUN htmlStatus XMLCALL
htmlNodeStatus(const htmlNodePtr
, int) ;
/**
 * htmlDefaultSubelement:
 * @elt: pointer expression to an element description
 *
 * Returns the default subelement for this element
 *
 * The argument and the whole expansion are parenthesized so that
 * pointer expressions such as (p + 1) expand correctly.
 */
#define htmlDefaultSubelement(elt) ((elt)->defaultsubelt)
/**
 * htmlElementAllowedHereDesc:
 * @parent: HTML parent element
 * @elt: pointer expression to the candidate child's element description
 *
 * Checks whether an HTML element description may be a
 * direct child of the specified element.
 *
 * Expands to a call of htmlElementAllowedHere() with @parent and the
 * name member of @elt.
 *
 * Returns 1 if allowed; 0 otherwise.
 */
#define htmlElementAllowedHereDesc(parent,elt) \
    htmlElementAllowedHere((parent), (elt)->name)
/**
 * htmlRequiredAttrs:
 * @elt: pointer expression to an element description
 *
 * Returns the attributes required for the specified element.
 *
 * The whole expansion is parenthesized so the macro behaves as a single
 * primary expression wherever it is used.
 */
#define htmlRequiredAttrs(elt) ((elt)->attrs_req)
302 #endif /* LIBXML_HTML_ENABLED */
303 #endif /* __HTML_PARSER_H__ */