/*
 * Summary: interface for an HTML 4.0 non-verifying parser
 * Description: this module implements an HTML 4.0 non-verifying parser
 *              with an API compatible with that of the XML parser. It should
 *              be able to parse "real world" HTML, even if severely
 *              broken from a specification point of view.
 *
 * Copy: See Copyright for the status of this software.
 *
 * Author: Daniel Veillard
 */
13 #ifndef __HTML_PARSER_H__
14 #define __HTML_PARSER_H__
15 #include <libxml/xmlversion.h>
16 #include <libxml/parser.h>
18 #ifdef LIBXML_HTML_ENABLED
/*
 * Most of the back-end structures from XML and HTML are shared.
 */
27 typedef xmlParserCtxt htmlParserCtxt
;
28 typedef xmlParserCtxtPtr htmlParserCtxtPtr
;
29 typedef xmlParserNodeInfo htmlParserNodeInfo
;
30 typedef xmlSAXHandler htmlSAXHandler
;
31 typedef xmlSAXHandlerPtr htmlSAXHandlerPtr
;
32 typedef xmlParserInput htmlParserInput
;
33 typedef xmlParserInputPtr htmlParserInputPtr
;
34 typedef xmlDocPtr htmlDocPtr
;
35 typedef xmlNodePtr htmlNodePtr
;
/*
 * Internal description of an HTML element, representing HTML 4.01
 * and XHTML 1.0 (which share the same structure).
 */
typedef struct _htmlElemDesc htmlElemDesc;
typedef htmlElemDesc *htmlElemDescPtr;
struct _htmlElemDesc {
    const char *name;		/* The tag name */
    char startTag;		/* Whether the start tag can be implied */
    char endTag;		/* Whether the end tag can be implied */
    char saveEndTag;		/* Whether the end tag should be saved */
    char empty;			/* Is this an empty element ? */
    char depr;			/* Is this a deprecated element ? */
    char dtd;			/* 1: only in Loose DTD, 2: only Frameset one */
    char isinline;		/* is this a block 0 or inline 1 element */
    const char *desc;		/* the description */

    /*
     * New fields encapsulating HTML structure
     *
     * This is a very limited representation.  It fails to tell us when
     * an element *requires* subelements (we only have whether they're
     * allowed or not), and it doesn't tell us where CDATA and PCDATA
     * are allowed.  Some element relationships are not fully represented:
     * these are flagged with the word MODIFIER
     */
    const char **subelts;	/* allowed sub-elements of this element */
    const char *defaultsubelt;	/* subelement for suggested auto-repair
				   if necessary or NULL */
    const char **attrs_opt;	/* Optional Attributes */
    const char **attrs_depr;	/* Additional deprecated attributes */
    const char **attrs_req;	/* Required attributes */
};
/*
 * Internal description of an HTML entity.
 */
typedef struct _htmlEntityDesc htmlEntityDesc;
typedef htmlEntityDesc *htmlEntityDescPtr;
struct _htmlEntityDesc {
    unsigned int value;	/* the UNICODE value for the character */
    const char *name;	/* The entity name */
    const char *desc;	/* the description */
};
/*
 * There are only a few public functions.
 */
86 XMLPUBFUN
const htmlElemDesc
* XMLCALL
87 htmlTagLookup (const xmlChar
*tag
);
88 XMLPUBFUN
const htmlEntityDesc
* XMLCALL
89 htmlEntityLookup(const xmlChar
*name
);
90 XMLPUBFUN
const htmlEntityDesc
* XMLCALL
91 htmlEntityValueLookup(unsigned int value
);
94 htmlIsAutoClosed(htmlDocPtr doc
,
97 htmlAutoCloseTag(htmlDocPtr doc
,
100 XMLPUBFUN
const htmlEntityDesc
* XMLCALL
101 htmlParseEntityRef(htmlParserCtxtPtr ctxt
,
102 const xmlChar
**str
);
103 XMLPUBFUN
int XMLCALL
104 htmlParseCharRef(htmlParserCtxtPtr ctxt
);
105 XMLPUBFUN
void XMLCALL
106 htmlParseElement(htmlParserCtxtPtr ctxt
);
108 XMLPUBFUN htmlParserCtxtPtr XMLCALL
109 htmlNewParserCtxt(void);
111 XMLPUBFUN htmlParserCtxtPtr XMLCALL
112 htmlCreateMemoryParserCtxt(const char *buffer
,
115 XMLPUBFUN
int XMLCALL
116 htmlParseDocument(htmlParserCtxtPtr ctxt
);
117 XMLPUBFUN htmlDocPtr XMLCALL
118 htmlSAXParseDoc (const xmlChar
*cur
,
119 const char *encoding
,
120 htmlSAXHandlerPtr sax
,
122 XMLPUBFUN htmlDocPtr XMLCALL
123 htmlParseDoc (const xmlChar
*cur
,
124 const char *encoding
);
125 XMLPUBFUN htmlDocPtr XMLCALL
126 htmlSAXParseFile(const char *filename
,
127 const char *encoding
,
128 htmlSAXHandlerPtr sax
,
130 XMLPUBFUN htmlDocPtr XMLCALL
131 htmlParseFile (const char *filename
,
132 const char *encoding
);
133 XMLPUBFUN
int XMLCALL
134 UTF8ToHtml (unsigned char *out
,
136 const unsigned char *in
,
138 XMLPUBFUN
int XMLCALL
139 htmlEncodeEntities(unsigned char *out
,
141 const unsigned char *in
,
142 int *inlen
, int quoteChar
);
143 XMLPUBFUN
int XMLCALL
144 htmlIsScriptAttribute(const xmlChar
*name
);
145 XMLPUBFUN
int XMLCALL
146 htmlHandleOmittedElem(int val
);
148 #ifdef LIBXML_PUSH_ENABLED
150 * Interfaces for the Push mode.
152 XMLPUBFUN htmlParserCtxtPtr XMLCALL
153 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax
,
157 const char *filename
,
158 xmlCharEncoding enc
);
159 XMLPUBFUN
int XMLCALL
160 htmlParseChunk (htmlParserCtxtPtr ctxt
,
164 #endif /* LIBXML_PUSH_ENABLED */
166 XMLPUBFUN
void XMLCALL
167 htmlFreeParserCtxt (htmlParserCtxtPtr ctxt
);
/*
 * New set of simpler/more flexible APIs
 *
 * This is the set of HTML parser options that can be passed down
 * to htmlReadDoc() and similar calls.
 */
179 HTML_PARSE_RECOVER
= 1<<0, /* Relaxed parsing */
180 HTML_PARSE_NODEFDTD
= 1<<2, /* do not default a doctype if not found */
181 HTML_PARSE_NOERROR
= 1<<5, /* suppress error reports */
182 HTML_PARSE_NOWARNING
= 1<<6, /* suppress warning reports */
183 HTML_PARSE_PEDANTIC
= 1<<7, /* pedantic error reporting */
184 HTML_PARSE_NOBLANKS
= 1<<8, /* remove blank nodes */
185 HTML_PARSE_NONET
= 1<<11,/* Forbid network access */
186 HTML_PARSE_NOIMPLIED
= 1<<13,/* Do not add implied html/body... elements */
187 HTML_PARSE_COMPACT
= 1<<16,/* compact small text nodes */
188 HTML_PARSE_IGNORE_ENC
=1<<21 /* ignore internal document encoding hint */
191 XMLPUBFUN
void XMLCALL
192 htmlCtxtReset (htmlParserCtxtPtr ctxt
);
193 XMLPUBFUN
int XMLCALL
194 htmlCtxtUseOptions (htmlParserCtxtPtr ctxt
,
196 XMLPUBFUN htmlDocPtr XMLCALL
197 htmlReadDoc (const xmlChar
*cur
,
199 const char *encoding
,
201 XMLPUBFUN htmlDocPtr XMLCALL
202 htmlReadFile (const char *URL
,
203 const char *encoding
,
205 XMLPUBFUN htmlDocPtr XMLCALL
206 htmlReadMemory (const char *buffer
,
209 const char *encoding
,
211 XMLPUBFUN htmlDocPtr XMLCALL
214 const char *encoding
,
216 XMLPUBFUN htmlDocPtr XMLCALL
217 htmlReadIO (xmlInputReadCallback ioread
,
218 xmlInputCloseCallback ioclose
,
221 const char *encoding
,
223 XMLPUBFUN htmlDocPtr XMLCALL
224 htmlCtxtReadDoc (xmlParserCtxtPtr ctxt
,
227 const char *encoding
,
229 XMLPUBFUN htmlDocPtr XMLCALL
230 htmlCtxtReadFile (xmlParserCtxtPtr ctxt
,
231 const char *filename
,
232 const char *encoding
,
234 XMLPUBFUN htmlDocPtr XMLCALL
235 htmlCtxtReadMemory (xmlParserCtxtPtr ctxt
,
239 const char *encoding
,
241 XMLPUBFUN htmlDocPtr XMLCALL
242 htmlCtxtReadFd (xmlParserCtxtPtr ctxt
,
245 const char *encoding
,
247 XMLPUBFUN htmlDocPtr XMLCALL
248 htmlCtxtReadIO (xmlParserCtxtPtr ctxt
,
249 xmlInputReadCallback ioread
,
250 xmlInputCloseCallback ioclose
,
253 const char *encoding
,
256 /* NRK/Jan2003: further knowledge of HTML structure
259 HTML_NA
= 0 , /* something we don't check at all */
261 HTML_DEPRECATED
= 0x2 ,
263 HTML_REQUIRED
= 0xc /* VALID bit set so ( & HTML_VALID ) is TRUE */
266 /* Using htmlElemDesc rather than name here, to emphasise the fact
267 that otherwise there's a lookup overhead
269 XMLPUBFUN htmlStatus XMLCALL
htmlAttrAllowed(const htmlElemDesc
*, const xmlChar
*, int) ;
270 XMLPUBFUN
int XMLCALL
htmlElementAllowedHere(const htmlElemDesc
*, const xmlChar
*) ;
271 XMLPUBFUN htmlStatus XMLCALL
htmlElementStatusHere(const htmlElemDesc
*, const htmlElemDesc
*) ;
272 XMLPUBFUN htmlStatus XMLCALL
htmlNodeStatus(const htmlNodePtr
, int) ;
/**
 * htmlDefaultSubelement:
 * @elt: pointer to an element description (any expression yielding one)
 *
 * Returns the default subelement for this element
 *
 * The argument and the whole expansion are parenthesized so that
 * expressions such as `p + 1` or `&table[i]` expand correctly.
 */
#define htmlDefaultSubelement(elt) ((elt)->defaultsubelt)
/**
 * htmlElementAllowedHereDesc:
 * @parent: HTML parent element
 * @elt: description of the candidate child element
 *
 * Checks whether an HTML element description may be a
 * direct child of the specified element, by forwarding the child's
 * name to htmlElementAllowedHere().
 *
 * Returns 1 if allowed; 0 otherwise.
 */
#define htmlElementAllowedHereDesc(parent,elt) \
	htmlElementAllowedHere((parent), (elt)->name)
/**
 * htmlRequiredAttrs:
 * @elt: pointer to an element description
 *
 * Returns the attributes required for the specified element.
 */
#define htmlRequiredAttrs(elt) ((elt)->attrs_req)
305 #endif /* LIBXML_HTML_ENABLED */
306 #endif /* __HTML_PARSER_H__ */