f63851185a3e3f962c7e6b8d2abd7ad74637f1e2
[reactos.git] / reactos / lib / 3rdparty / libxml2 / HTMLparser.c
1 /*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * daniel@veillard.com
7 */
8
9 #define IN_LIBXML
10 #include "libxml.h"
11 #ifdef LIBXML_HTML_ENABLED
12
13 #include <string.h>
14 #ifdef HAVE_CTYPE_H
15 #include <ctype.h>
16 #endif
17 #ifdef HAVE_STDLIB_H
18 #include <stdlib.h>
19 #endif
20 #ifdef HAVE_SYS_STAT_H
21 #include <sys/stat.h>
22 #endif
23 #ifdef HAVE_FCNTL_H
24 #include <fcntl.h>
25 #endif
26 #ifdef HAVE_UNISTD_H
27 #include <unistd.h>
28 #endif
29 #ifdef HAVE_ZLIB_H
30 #include <zlib.h>
31 #endif
32
33 #include <libxml/xmlmemory.h>
34 #include <libxml/tree.h>
35 #include <libxml/parser.h>
36 #include <libxml/parserInternals.h>
37 #include <libxml/xmlerror.h>
38 #include <libxml/HTMLparser.h>
39 #include <libxml/HTMLtree.h>
40 #include <libxml/entities.h>
41 #include <libxml/encoding.h>
42 #include <libxml/valid.h>
43 #include <libxml/xmlIO.h>
44 #include <libxml/globals.h>
45 #include <libxml/uri.h>
46
47 #define HTML_MAX_NAMELEN 1000
48 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
49 #define HTML_PARSER_BUFFER_SIZE 100
50
51 /* #define DEBUG */
52 /* #define DEBUG_PUSH */
53
54 static int htmlOmittedDefaultValue = 1;
55
56 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
58 static void htmlParseComment(htmlParserCtxtPtr ctxt);
59
60 /************************************************************************
61 * *
62 * Some factorized error routines *
63 * *
64 ************************************************************************/
65
66 /**
67 * htmlErrMemory:
68 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73 static void
74 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75 {
76 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
79 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
85 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
86 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
90 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
91 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93 }
94
95 /**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105 static void
106 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108 {
109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
112 if (ctxt != NULL)
113 ctxt->errNo = error;
114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
115 XML_ERR_ERROR, NULL, 0,
116 (const char *) str1, (const char *) str2,
117 NULL, 0, 0,
118 msg, str1, str2);
119 if (ctxt != NULL)
120 ctxt->wellFormed = 0;
121 }
122
123 /**
124 * htmlParseErrInt:
125 * @ctxt: an HTML parser context
126 * @error: the error number
127 * @msg: the error message
128 * @val: integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132 static void
133 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134 const char *msg, int val)
135 {
136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137 (ctxt->instate == XML_PARSER_EOF))
138 return;
139 if (ctxt != NULL)
140 ctxt->errNo = error;
141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val);
144 if (ctxt != NULL)
145 ctxt->wellFormed = 0;
146 }
147
148 /************************************************************************
149 * *
150 * Parser stacks related functions and macros *
151 * *
152 ************************************************************************/
153
154 /**
155 * htmlnamePush:
156 * @ctxt: an HTML parser context
157 * @value: the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
162 */
163 static int
164 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
165 {
166 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
167 ctxt->html = 3;
168 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
169 ctxt->html = 10;
170 if (ctxt->nameNr >= ctxt->nameMax) {
171 ctxt->nameMax *= 2;
172 ctxt->nameTab = (const xmlChar * *)
173 xmlRealloc((xmlChar * *)ctxt->nameTab,
174 ctxt->nameMax *
175 sizeof(ctxt->nameTab[0]));
176 if (ctxt->nameTab == NULL) {
177 htmlErrMemory(ctxt, NULL);
178 return (0);
179 }
180 }
181 ctxt->nameTab[ctxt->nameNr] = value;
182 ctxt->name = value;
183 return (ctxt->nameNr++);
184 }
185 /**
186 * htmlnamePop:
187 * @ctxt: an HTML parser context
188 *
189 * Pops the top element name from the name stack
190 *
191 * Returns the name just removed
192 */
193 static const xmlChar *
194 htmlnamePop(htmlParserCtxtPtr ctxt)
195 {
196 const xmlChar *ret;
197
198 if (ctxt->nameNr <= 0)
199 return (NULL);
200 ctxt->nameNr--;
201 if (ctxt->nameNr < 0)
202 return (NULL);
203 if (ctxt->nameNr > 0)
204 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
205 else
206 ctxt->name = NULL;
207 ret = ctxt->nameTab[ctxt->nameNr];
208 ctxt->nameTab[ctxt->nameNr] = NULL;
209 return (ret);
210 }
211
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.  All of them expect a variable named "ctxt" pointing to
 * the parser context to be in scope.
 *
 * Raw byte access, only meaningful for comparing against ASCII values:
 *
 *   CUR_PTR     the current input pointer.
 *   CUR         the byte at the current input pointer, as an int.
 *   CURRENT     same value as CUR.
 *   RAW         -1 if ctxt->token is set, otherwise the current byte.
 *   NXT(n)      the byte n positions after the current one.
 *   UPPER       the current byte converted with toupper().
 *   UPP(n)      the byte n positions ahead converted with toupper().
 *   SKIP(n)     advance the input pointer, the column and nbChars by n;
 *               the line counter is not touched, so it must not be used
 *               to skip over newlines.
 *
 * Character level helpers:
 *
 *   NEXT        advance through xmlNextChar().
 *   NEXTL(l)    advance the input pointer by l bytes, counting it as one
 *               character: updates line/col, clears ctxt->token and
 *               increments nbChars by one.
 *   CUR_CHAR(l) current character through htmlCurrentChar(), its length
 *               is stored in l.
 *   CUR_SCHAR(s, l) same on string s through xmlStringCurrentChar().
 *   COPY_BUF(l,b,i,v) append character v of length l to buffer b at
 *               index i, advancing i.
 *
 * Buffer management:
 *
 *   SHRINK      call xmlParserInputShrink() once more than 2 * INPUT_CHUNK
 *               bytes were consumed and less than 2 * INPUT_CHUNK remain.
 *   GROW        in non-progressive mode, call xmlParserInputGrow() when
 *               less than INPUT_CHUNK bytes remain.
 *   SKIP_BLANKS skip blanks through htmlSkipBlankChars().
 *
 * SHRINK, GROW and COPY_BUF expand to unbraced if statements: do not use
 * them as the unbraced body of an if that has an else.
 */

/* Raw byte access */

#define CUR_PTR ctxt->input->cur

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))

#define CURRENT ((int) (*ctxt->input->cur))

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NXT(val) ctxt->input->cur[(val)]

#define UPPER (toupper(*ctxt->input->cur))

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)

/* Character level helpers */

#define NEXT xmlNextChar(ctxt)

#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;		\
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

#define COPY_BUF(l,b,i,v)						\
    if (l == 1) b[i++] = (xmlChar) v;					\
    else i += xmlCopyChar(l,&b[i],v)

/* Buffer management */

#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
	xmlParserInputShrink(ctxt->input)

#define GROW if ((ctxt->progressive == 0) &&				\
		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))	\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)
293
294 /**
295 * htmlFindEncoding:
296 * @the HTML parser context
297 *
298 * Ty to find and encoding in the current data available in the input
299 * buffer this is needed to try to switch to the proper encoding when
300 * one face a character error.
301 * That's an heuristic, since it's operating outside of parsing it could
302 * try to use a meta which had been commented out, that's the reason it
303 * should only be used in case of error, not as a default.
304 *
305 * Returns an encoding string or NULL if not found, the string need to
306 * be freed
307 */
308 static xmlChar *
309 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
310 const xmlChar *start, *cur, *end;
311
312 if ((ctxt == NULL) || (ctxt->input == NULL) ||
313 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
314 (ctxt->input->buf->encoder != NULL))
315 return(NULL);
316 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
317 return(NULL);
318
319 start = ctxt->input->cur;
320 end = ctxt->input->end;
321 /* we also expect the input buffer to be zero terminated */
322 if (*end != 0)
323 return(NULL);
324
325 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
326 if (cur == NULL)
327 return(NULL);
328 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
329 if (cur == NULL)
330 return(NULL);
331 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
332 if (cur == NULL)
333 return(NULL);
334 cur += 8;
335 start = cur;
336 while (((*cur >= 'A') && (*cur <= 'Z')) ||
337 ((*cur >= 'a') && (*cur <= 'z')) ||
338 ((*cur >= '0') && (*cur <= '9')) ||
339 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
340 cur++;
341 if (cur == start)
342 return(NULL);
343 return(xmlStrndup(start, cur - start));
344 }
345
346 /**
347 * htmlCurrentChar:
348 * @ctxt: the HTML parser context
349 * @len: pointer to the length of the char read
350 *
351 * The current char value, if using UTF-8 this may actually span multiple
352 * bytes in the input buffer. Implement the end of line normalization:
353 * 2.11 End-of-Line Handling
354 * If the encoding is unspecified, in the case we find an ISO-Latin-1
355 * char, then the encoding converter is plugged in automatically.
356 *
357 * Returns the current char value and its length
358 */
359
360 static int
361 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
362 if (ctxt->instate == XML_PARSER_EOF)
363 return(0);
364
365 if (ctxt->token != 0) {
366 *len = 0;
367 return(ctxt->token);
368 }
369 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
370 /*
371 * We are supposed to handle UTF8, check it's valid
372 * From rfc2044: encoding of the Unicode values on UTF-8:
373 *
374 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
375 * 0000 0000-0000 007F 0xxxxxxx
376 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
377 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
378 *
379 * Check for the 0x110000 limit too
380 */
381 const unsigned char *cur = ctxt->input->cur;
382 unsigned char c;
383 unsigned int val;
384
385 c = *cur;
386 if (c & 0x80) {
387 if (cur[1] == 0) {
388 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
389 cur = ctxt->input->cur;
390 }
391 if ((cur[1] & 0xc0) != 0x80)
392 goto encoding_error;
393 if ((c & 0xe0) == 0xe0) {
394
395 if (cur[2] == 0) {
396 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
397 cur = ctxt->input->cur;
398 }
399 if ((cur[2] & 0xc0) != 0x80)
400 goto encoding_error;
401 if ((c & 0xf0) == 0xf0) {
402 if (cur[3] == 0) {
403 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
404 cur = ctxt->input->cur;
405 }
406 if (((c & 0xf8) != 0xf0) ||
407 ((cur[3] & 0xc0) != 0x80))
408 goto encoding_error;
409 /* 4-byte code */
410 *len = 4;
411 val = (cur[0] & 0x7) << 18;
412 val |= (cur[1] & 0x3f) << 12;
413 val |= (cur[2] & 0x3f) << 6;
414 val |= cur[3] & 0x3f;
415 } else {
416 /* 3-byte code */
417 *len = 3;
418 val = (cur[0] & 0xf) << 12;
419 val |= (cur[1] & 0x3f) << 6;
420 val |= cur[2] & 0x3f;
421 }
422 } else {
423 /* 2-byte code */
424 *len = 2;
425 val = (cur[0] & 0x1f) << 6;
426 val |= cur[1] & 0x3f;
427 }
428 if (!IS_CHAR(val)) {
429 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
430 "Char 0x%X out of allowed range\n", val);
431 }
432 return(val);
433 } else {
434 if ((*ctxt->input->cur == 0) &&
435 (ctxt->input->cur < ctxt->input->end)) {
436 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
437 "Char 0x%X out of allowed range\n", 0);
438 *len = 1;
439 return(' ');
440 }
441 /* 1-byte code */
442 *len = 1;
443 return((int) *ctxt->input->cur);
444 }
445 }
446 /*
447 * Assume it's a fixed length encoding (1) with
448 * a compatible encoding for the ASCII set, since
449 * XML constructs only use < 128 chars
450 */
451 *len = 1;
452 if ((int) *ctxt->input->cur < 0x80)
453 return((int) *ctxt->input->cur);
454
455 /*
456 * Humm this is bad, do an automatic flow conversion
457 */
458 {
459 xmlChar * guess;
460 xmlCharEncodingHandlerPtr handler;
461
462 guess = htmlFindEncoding(ctxt);
463 if (guess == NULL) {
464 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
465 } else {
466 if (ctxt->input->encoding != NULL)
467 xmlFree((xmlChar *) ctxt->input->encoding);
468 ctxt->input->encoding = guess;
469 handler = xmlFindCharEncodingHandler((const char *) guess);
470 if (handler != NULL) {
471 xmlSwitchToEncoding(ctxt, handler);
472 } else {
473 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
474 "Unsupported encoding %s", guess, NULL);
475 }
476 }
477 ctxt->charset = XML_CHAR_ENCODING_UTF8;
478 }
479
480 return(xmlCurrentChar(ctxt, len));
481
482 encoding_error:
483 /*
484 * If we detect an UTF8 error that probably mean that the
485 * input encoding didn't get properly advertized in the
486 * declaration header. Report the error and switch the encoding
487 * to ISO-Latin-1 (if you don't like this policy, just declare the
488 * encoding !)
489 */
490 {
491 char buffer[150];
492
493 if (ctxt->input->end - ctxt->input->cur >= 4) {
494 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
495 ctxt->input->cur[0], ctxt->input->cur[1],
496 ctxt->input->cur[2], ctxt->input->cur[3]);
497 } else {
498 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
499 }
500 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
501 "Input is not proper UTF-8, indicate encoding !\n",
502 BAD_CAST buffer, NULL);
503 }
504
505 ctxt->charset = XML_CHAR_ENCODING_8859_1;
506 *len = 1;
507 return((int) *ctxt->input->cur);
508 }
509
510 /**
511 * htmlSkipBlankChars:
512 * @ctxt: the HTML parser context
513 *
514 * skip all blanks character found at that point in the input streams.
515 *
516 * Returns the number of space chars skipped
517 */
518
519 static int
520 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
521 int res = 0;
522
523 while (IS_BLANK_CH(*(ctxt->input->cur))) {
524 if ((*ctxt->input->cur == 0) &&
525 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
526 xmlPopInput(ctxt);
527 } else {
528 if (*(ctxt->input->cur) == '\n') {
529 ctxt->input->line++; ctxt->input->col = 1;
530 } else ctxt->input->col++;
531 ctxt->input->cur++;
532 ctxt->nbChars++;
533 if (*ctxt->input->cur == 0)
534 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
535 }
536 res++;
537 }
538 return(res);
539 }
540
541
542
543 /************************************************************************
544 * *
545 * The list of HTML elements and their properties *
546 * *
547 ************************************************************************/
548
549 /*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
552 * 2 means it's forbidden (empty elements)
553 * 3 means the tag is stylistic and should be closed easily
554 * Depr: this element is deprecated
555 * DTD: 1 means that this element is valid only in the Loose DTD
556 * 2 means that this element is valid only in the Frameset DTD
557 *
558 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
559 , subElements , impliedsubelt , Attributes, userdata
560 */
561
/* Definitions and a couple of vars for HTML Elements */

/*
 * Each list macro expands to a comma separated run of string literals
 * meant to be dropped inside an array initializer; the matching NB_xxx
 * macro is the number of names in the list.
 *
 * Composite lists must join their parts with explicit commas: two
 * adjacent string literals are concatenated by the compiler, which would
 * silently merge the last name of one list with the first name of the
 * next (e.g. "small" "em" becomes the single entry "smallem").
 * PCDATA and MODIFIER expand to nothing, so they are the only ones which
 * must NOT be followed by a comma.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL terminated lists of element names */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
595
596
/* ... and for HTML Attributes */

/*
 * Same conventions as for the element lists: each macro expands to a
 * comma separated run of attribute names and NB_xxx is the number of
 * names in the list.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL terminated lists of attribute names */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
616
617
618 /* Other declarations that should go inline ... */
619 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
620 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
621 "tabindex", "onfocus", "onblur", NULL } ;
622 static const char* const target_attr[] = { "target", NULL } ;
623 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
624 static const char* const alt_attr[] = { "alt", NULL } ;
625 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
626 static const char* const href_attrs[] = { "href", NULL } ;
627 static const char* const clear_attrs[] = { "clear", NULL } ;
628 static const char* const inline_p[] = { INLINE, "p", NULL } ;
629
630 static const char* const flow_param[] = { FLOW, "param", NULL } ;
631 static const char* const applet_attrs[] = { COREATTRS , "codebase",
632 "archive", "alt", "name", "height", "width", "align",
633 "hspace", "vspace", NULL } ;
634 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
635 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
636 static const char* const basefont_attrs[] =
637 { "id", "size", "color", "face", NULL } ;
638 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
639 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
640 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
641 static const char* const body_depr[] = { "background", "bgcolor", "text",
642 "link", "vlink", "alink", NULL } ;
643 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
644 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
645
646
647 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
648 static const char* const col_elt[] = { "col", NULL } ;
649 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
650 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
651 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
652 static const char* const compact_attr[] = { "compact", NULL } ;
653 static const char* const label_attr[] = { "label", NULL } ;
654 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
655 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
656 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
657 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
658 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
659 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
660 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
661 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
662 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
663 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
664 static const char* const version_attr[] = { "version", NULL } ;
665 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
666 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
667 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
668 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
669 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
670 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
671 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
672 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
673 static const char* const align_attr[] = { "align", NULL } ;
674 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
675 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
676 static const char* const name_attr[] = { "name", NULL } ;
677 static const char* const action_attr[] = { "action", NULL } ;
678 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
679 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
680 static const char* const content_attr[] = { "content", NULL } ;
681 static const char* const type_attr[] = { "type", NULL } ;
682 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
683 static const char* const object_contents[] = { FLOW, "param", NULL } ;
684 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
685 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
686 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
687 static const char* const option_elt[] = { "option", NULL } ;
688 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
689 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
690 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
691 static const char* const width_attr[] = { "width", NULL } ;
692 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
693 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
694 static const char* const language_attr[] = { "language", NULL } ;
695 static const char* const select_content[] = { "optgroup", "option", NULL } ;
696 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
697 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
698 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
699 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
700 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
701 static const char* const tr_elt[] = { "tr", NULL } ;
702 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
703 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
704 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
705 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
706 static const char* const tr_contents[] = { "th", "td", NULL } ;
707 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
708 static const char* const li_elt[] = { "li", NULL } ;
709 static const char* const ul_depr[] = { "type", "compact", NULL} ;
710 static const char* const dir_attr[] = { "dir", NULL} ;
711
712 #define DECL (const char**)
713
714 static const htmlElemDesc
715 html40ElementTable[] = {
716 { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
717 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
718 },
719 { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
720 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
721 },
722 { "acronym", 0, 0, 0, 0, 0, 0, 1, "",
723 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
724 },
725 { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
726 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
727 },
728 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
729 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
730 },
731 { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
732 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
733 },
734 { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
735 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
736 },
737 { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
738 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
739 },
740 { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
741 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
742 },
743 { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
744 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
745 },
746 { "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
747 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
748 },
749 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
750 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
751 },
752 { "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
753 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
754 },
755 { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
756 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
757 },
758 { "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
759 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
760 },
761 { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
762 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
763 },
764 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
765 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
766 },
767 { "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
768 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
769 },
770 { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
771 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
772 },
773 { "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
774 EMPTY , NULL , DECL col_attrs , NULL, NULL
775 },
776 { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
777 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
778 },
779 { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
780 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
781 },
782 { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
783 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
784 },
785 { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
786 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
787 },
788 { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
789 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
790 },
791 { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
792 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
793 },
794 { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
795 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
796 },
797 { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
798 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
799 },
800 { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
801 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
802 },
803 { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
804 EMPTY, NULL, DECL embed_attrs, NULL, NULL
805 },
806 { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
807 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
808 },
809 { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
810 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
811 },
812 { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
813 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
814 },
815 { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
816 EMPTY, NULL, NULL, DECL frame_attrs, NULL
817 },
818 { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
819 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
820 },
821 { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
822 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
823 },
824 { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
825 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
826 },
827 { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
828 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
829 },
830 { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
831 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
832 },
833 { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
834 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
835 },
836 { "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
837 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
838 },
839 { "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
840 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
841 },
842 { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
843 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
844 },
845 { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
846 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
847 },
848 { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
849 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
850 },
851 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
852 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
853 },
854 { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
855 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
856 },
857 { "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
858 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
859 },
860 { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
861 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
862 },
863 { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
864 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
865 },
866 { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
867 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
868 },
869 { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
870 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
871 },
872 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
873 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
874 },
875 { "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
876 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
877 },
878 { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
879 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
880 },
881 { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
882 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
883 },
884 { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
885 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
886 },
887 { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
888 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
889 },
890 { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
891 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
892 },
893 { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
894 DECL html_flow, "div", DECL html_attrs, NULL, NULL
895 },
896 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
897 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
898 },
899 { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
900 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
901 },
902 { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
903 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
904 },
905 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
906 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
907 },
908 { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
909 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
910 },
911 { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
912 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
913 },
914 { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
915 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
916 },
917 { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
918 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
919 },
920 { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
921 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
922 },
923 { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
924 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
925 },
926 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
927 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
928 },
929 { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
930 DECL select_content, NULL, DECL select_attrs, NULL, NULL
931 },
932 { "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
933 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
934 },
935 { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
936 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
937 },
938 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
939 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
940 },
941 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
942 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
943 },
944 { "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
945 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
946 },
947 { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
948 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
949 },
950 { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
951 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
952 },
953 { "table", 0, 0, 0, 0, 0, 0, 0, "",
954 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
955 },
956 { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
957 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
958 },
959 { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
960 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
961 },
962 { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
963 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
964 },
965 { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
966 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
967 },
968 { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
969 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
970 },
971 { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
972 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
973 },
974 { "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
975 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
976 },
977 { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
978 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
979 },
980 { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
981 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
982 },
983 { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
984 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
985 },
986 { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
987 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
988 },
989 { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
990 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
991 }
992 };
993
994 /*
 * Start tags that imply the end of the current element.
 *
 * Layout: the array is a flat sequence of NULL-terminated groups. The
 * first string of a group is a start tag name; the strings that follow it
 * are the names of open elements which that start tag implicitly closes
 * (htmlCheckAutoClose() compares the first string with the new tag and the
 * remaining ones with the old tag). One extra NULL after the last group
 * ends the whole table; htmlInitAutoClose() relies on it to stop building
 * its index, and it indexes at most 99 groups.
 *
 * NOTE(review): FONTSTYLE is a macro defined outside this excerpt; it
 * presumably expands to a comma-separated list of tag name strings -
 * confirm at its definition.
 */
997 static const char * const htmlStartClose[] = {
998 "form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
999 "dl", "ul", "ol", "menu", "dir", "address", "pre",
1000 "listing", "xmp", "head", NULL,
1001 "head", "p", NULL,
1002 "title", "p", NULL,
1003 "body", "head", "style", "link", "title", "p", NULL,
1004 "frameset", "head", "style", "link", "title", "p", NULL,
1005 "li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
1006 "pre", "listing", "xmp", "head", "li", NULL,
1007 "hr", "p", "head", NULL,
1008 "h1", "p", "head", NULL,
1009 "h2", "p", "head", NULL,
1010 "h3", "p", "head", NULL,
1011 "h4", "p", "head", NULL,
1012 "h5", "p", "head", NULL,
1013 "h6", "p", "head", NULL,
1014 "dir", "p", "head", NULL,
1015 "address", "p", "head", "ul", NULL,
1016 "pre", "p", "head", "ul", NULL,
1017 "listing", "p", "head", NULL,
1018 "xmp", "p", "head", NULL,
1019 "blockquote", "p", "head", NULL,
1020 "dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
1021 "xmp", "head", NULL,
1022 "dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1023 "head", "dd", NULL,
1024 "dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1025 "head", "dt", NULL,
1026 "ul", "p", "head", "ol", "menu", "dir", "address", "pre",
1027 "listing", "xmp", NULL,
1028 "ol", "p", "head", "ul", NULL,
1029 "menu", "p", "head", "ul", NULL,
1030 "p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
1031 "div", "p", "head", NULL,
1032 "noscript", "p", "head", NULL,
1033 "center", "font", "b", "i", "p", "head", NULL,
1034 "a", "a", NULL,
1035 "caption", "p", NULL,
1036 "colgroup", "caption", "colgroup", "col", "p", NULL,
1037 "col", "caption", "col", "p", NULL,
1038 "table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
1039 "listing", "xmp", "a", NULL,
1040 "th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
1041 "td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
1042 "tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
1043 "thead", "caption", "col", "colgroup", NULL,
1044 "tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1045 "tbody", "p", NULL,
1046 "tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1047 "tfoot", "tbody", "p", NULL,
1048 "optgroup", "option", NULL,
1049 "option", "option", NULL,
1050 "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
1051 "pre", "listing", "xmp", "a", NULL,
1052 NULL
1053 };
1054
/*
 * Elements that are not supposed to hold character data directly.
 * When text shows up while one of them is the current element,
 * htmlCheckParagraph() opens an implied <p> first.
 *
 * The list is NULL-terminated.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html", "head",
    NULL                        /* end-of-list sentinel */
};
1067
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 *
 * The array has no sentinel: users iterate over it with
 * sizeof(htmlScriptAttributes) / sizeof(htmlScriptAttributes[0]).
 *
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",                  /* was misspelled "onrest" */
    "onchange",
    "onselect"
};
1093
/*
 * End tag priorities, used by the HTML parser to cope with broken pages.
 *
 * Each element name is given a priority; an extra (misplaced) end tag is
 * only allowed to close open elements whose priority is lower than or
 * equal to its own. Names that are not listed get the priority stored in
 * the terminating { NULL, ... } entry.
 */

typedef struct {
    const char *name;           /* element name, NULL in the last entry */
    int priority;               /* the higher, the harder to auto-close */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    /* generic block container */
    { "div",   150 },
    /* table cells, rows, row groups and the table itself */
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    /* document structure */
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    /* sentinel: default priority for every other element */
    { NULL,    100 }
};
1121
/*
 * Index over htmlStartClose: once htmlInitAutoClose() has run, each used
 * slot points at the first string of one group of that table and the
 * unused slots are NULL.
 */
static const char **htmlStartCloseIndex[100];
/* Set to 1 by htmlInitAutoClose() once the index above has been built. */
static int htmlStartCloseIndexinitialized = 0;
1124
1125 /************************************************************************
1126 * *
1127 * functions to handle HTML specific data *
1128 * *
1129 ************************************************************************/
1130
1131 /**
1132 * htmlInitAutoClose:
1133 *
1134 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1135 * This is not reentrant. Call xmlInitParser() once before processing in
1136 * case of use in multithreaded programs.
1137 */
1138 void
1139 htmlInitAutoClose(void) {
1140 int indx, i = 0;
1141
1142 if (htmlStartCloseIndexinitialized) return;
1143
1144 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1145 indx = 0;
1146 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1147 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1148 while (htmlStartClose[i] != NULL) i++;
1149 i++;
1150 }
1151 htmlStartCloseIndexinitialized = 1;
1152 }
1153
1154 /**
1155 * htmlTagLookup:
1156 * @tag: The tag name in lowercase
1157 *
1158 * Lookup the HTML tag in the ElementTable
1159 *
1160 * Returns the related htmlElemDescPtr or NULL if not found.
1161 */
1162 const htmlElemDesc *
1163 htmlTagLookup(const xmlChar *tag) {
1164 unsigned int i;
1165
1166 for (i = 0; i < (sizeof(html40ElementTable) /
1167 sizeof(html40ElementTable[0]));i++) {
1168 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1169 return((htmlElemDescPtr) &html40ElementTable[i]);
1170 }
1171 return(NULL);
1172 }
1173
1174 /**
1175 * htmlGetEndPriority:
1176 * @name: The name of the element to look up the priority for.
1177 *
1178 * Return value: The "endtag" priority.
1179 **/
1180 static int
1181 htmlGetEndPriority (const xmlChar *name) {
1182 int i = 0;
1183
1184 while ((htmlEndPriority[i].name != NULL) &&
1185 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1186 i++;
1187
1188 return(htmlEndPriority[i].priority);
1189 }
1190
1191
1192 /**
1193 * htmlCheckAutoClose:
1194 * @newtag: The new tag name
1195 * @oldtag: The old tag name
1196 *
1197 * Checks whether the new tag is one of the registered valid tags for
1198 * closing old.
1199 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1200 *
1201 * Returns 0 if no, 1 if yes.
1202 */
1203 static int
1204 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1205 {
1206 int i, indx;
1207 const char **closed = NULL;
1208
1209 if (htmlStartCloseIndexinitialized == 0)
1210 htmlInitAutoClose();
1211
1212 /* inefficient, but not a big deal */
1213 for (indx = 0; indx < 100; indx++) {
1214 closed = htmlStartCloseIndex[indx];
1215 if (closed == NULL)
1216 return (0);
1217 if (xmlStrEqual(BAD_CAST * closed, newtag))
1218 break;
1219 }
1220
1221 i = closed - htmlStartClose;
1222 i++;
1223 while (htmlStartClose[i] != NULL) {
1224 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1225 return (1);
1226 }
1227 i++;
1228 }
1229 return (0);
1230 }
1231
1232 /**
1233 * htmlAutoCloseOnClose:
1234 * @ctxt: an HTML parser context
1235 * @newtag: The new tag name
1236 * @force: force the tag closure
1237 *
1238 * The HTML DTD allows an ending tag to implicitly close other tags.
1239 */
1240 static void
1241 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1242 {
1243 const htmlElemDesc *info;
1244 int i, priority;
1245
1246 priority = htmlGetEndPriority(newtag);
1247
1248 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1249
1250 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1251 break;
1252 /*
1253 * A missplaced endtag can only close elements with lower
1254 * or equal priority, so if we find an element with higher
1255 * priority before we find an element with
1256 * matching name, we just ignore this endtag
1257 */
1258 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1259 return;
1260 }
1261 if (i < 0)
1262 return;
1263
1264 while (!xmlStrEqual(newtag, ctxt->name)) {
1265 info = htmlTagLookup(ctxt->name);
1266 if ((info != NULL) && (info->endTag == 3)) {
1267 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1268 "Opening and ending tag mismatch: %s and %s\n",
1269 newtag, ctxt->name);
1270 }
1271 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1272 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1273 htmlnamePop(ctxt);
1274 }
1275 }
1276
1277 /**
1278 * htmlAutoCloseOnEnd:
1279 * @ctxt: an HTML parser context
1280 *
1281 * Close all remaining tags at the end of the stream
1282 */
1283 static void
1284 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1285 {
1286 int i;
1287
1288 if (ctxt->nameNr == 0)
1289 return;
1290 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1291 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1292 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1293 htmlnamePop(ctxt);
1294 }
1295 }
1296
1297 /**
1298 * htmlAutoClose:
1299 * @ctxt: an HTML parser context
1300 * @newtag: The new tag name or NULL
1301 *
1302 * The HTML DTD allows a tag to implicitly close other tags.
1303 * The list is kept in htmlStartClose array. This function is
1304 * called when a new tag has been detected and generates the
1305 * appropriates closes if possible/needed.
1306 * If newtag is NULL this mean we are at the end of the resource
1307 * and we should check
1308 */
1309 static void
1310 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1311 {
1312 while ((newtag != NULL) && (ctxt->name != NULL) &&
1313 (htmlCheckAutoClose(newtag, ctxt->name))) {
1314 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1315 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1316 htmlnamePop(ctxt);
1317 }
1318 if (newtag == NULL) {
1319 htmlAutoCloseOnEnd(ctxt);
1320 return;
1321 }
1322 while ((newtag == NULL) && (ctxt->name != NULL) &&
1323 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1324 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1325 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1326 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1327 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1328 htmlnamePop(ctxt);
1329 }
1330 }
1331
1332 /**
1333 * htmlAutoCloseTag:
1334 * @doc: the HTML document
1335 * @name: The tag name
1336 * @elem: the HTML element
1337 *
1338 * The HTML DTD allows a tag to implicitly close other tags.
1339 * The list is kept in htmlStartClose array. This function checks
1340 * if the element or one of it's children would autoclose the
1341 * given tag.
1342 *
1343 * Returns 1 if autoclose, 0 otherwise
1344 */
1345 int
1346 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1347 htmlNodePtr child;
1348
1349 if (elem == NULL) return(1);
1350 if (xmlStrEqual(name, elem->name)) return(0);
1351 if (htmlCheckAutoClose(elem->name, name)) return(1);
1352 child = elem->children;
1353 while (child != NULL) {
1354 if (htmlAutoCloseTag(doc, name, child)) return(1);
1355 child = child->next;
1356 }
1357 return(0);
1358 }
1359
1360 /**
1361 * htmlIsAutoClosed:
1362 * @doc: the HTML document
1363 * @elem: the HTML element
1364 *
1365 * The HTML DTD allows a tag to implicitly close other tags.
1366 * The list is kept in htmlStartClose array. This function checks
1367 * if a tag is autoclosed by one of it's child
1368 *
1369 * Returns 1 if autoclosed, 0 otherwise
1370 */
1371 int
1372 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1373 htmlNodePtr child;
1374
1375 if (elem == NULL) return(1);
1376 child = elem->children;
1377 while (child != NULL) {
1378 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1379 child = child->next;
1380 }
1381 return(0);
1382 }
1383
1384 /**
1385 * htmlCheckImplied:
1386 * @ctxt: an HTML parser context
1387 * @newtag: The new tag name
1388 *
1389 * The HTML DTD allows a tag to exists only implicitly
1390 * called when a new tag has been detected and generates the
1391 * appropriates implicit tags if missing
1392 */
1393 static void
1394 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1395 int i;
1396
1397 if (!htmlOmittedDefaultValue)
1398 return;
1399 if (xmlStrEqual(newtag, BAD_CAST"html"))
1400 return;
1401 if (ctxt->nameNr <= 0) {
1402 htmlnamePush(ctxt, BAD_CAST"html");
1403 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1404 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1405 }
1406 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1407 return;
1408 if ((ctxt->nameNr <= 1) &&
1409 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1410 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1411 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1412 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1413 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1414 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1415 if (ctxt->html >= 3) {
1416 /* we already saw or generated an <head> before */
1417 return;
1418 }
1419 /*
1420 * dropped OBJECT ... i you put it first BODY will be
1421 * assumed !
1422 */
1423 htmlnamePush(ctxt, BAD_CAST"head");
1424 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1425 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1426 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1427 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1428 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1429 if (ctxt->html >= 10) {
1430 /* we already saw or generated a <body> before */
1431 return;
1432 }
1433 for (i = 0;i < ctxt->nameNr;i++) {
1434 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1435 return;
1436 }
1437 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1438 return;
1439 }
1440 }
1441
1442 htmlnamePush(ctxt, BAD_CAST"body");
1443 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1444 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1445 }
1446 }
1447
1448 /**
1449 * htmlCheckParagraph
1450 * @ctxt: an HTML parser context
1451 *
1452 * Check whether a p element need to be implied before inserting
1453 * characters in the current element.
1454 *
1455 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1456 * in case of error.
1457 */
1458
1459 static int
1460 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1461 const xmlChar *tag;
1462 int i;
1463
1464 if (ctxt == NULL)
1465 return(-1);
1466 tag = ctxt->name;
1467 if (tag == NULL) {
1468 htmlAutoClose(ctxt, BAD_CAST"p");
1469 htmlCheckImplied(ctxt, BAD_CAST"p");
1470 htmlnamePush(ctxt, BAD_CAST"p");
1471 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1472 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1473 return(1);
1474 }
1475 if (!htmlOmittedDefaultValue)
1476 return(0);
1477 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1478 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1479 htmlAutoClose(ctxt, BAD_CAST"p");
1480 htmlCheckImplied(ctxt, BAD_CAST"p");
1481 htmlnamePush(ctxt, BAD_CAST"p");
1482 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1483 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1484 return(1);
1485 }
1486 }
1487 return(0);
1488 }
1489
1490 /**
1491 * htmlIsScriptAttribute:
1492 * @name: an attribute name
1493 *
1494 * Check if an attribute is of content type Script
1495 *
1496 * Returns 1 is the attribute is a script 0 otherwise
1497 */
1498 int
1499 htmlIsScriptAttribute(const xmlChar *name) {
1500 unsigned int i;
1501
1502 if (name == NULL)
1503 return(0);
1504 /*
1505 * all script attributes start with 'on'
1506 */
1507 if ((name[0] != 'o') || (name[1] != 'n'))
1508 return(0);
1509 for (i = 0;
1510 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1511 i++) {
1512 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1513 return(1);
1514 }
1515 return(0);
1516 }
1517
1518 /************************************************************************
1519 * *
1520 * The list of HTML predefined entities *
1521 * *
1522 ************************************************************************/
1523
1524
1525 static const htmlEntityDesc html40EntitiesTable[] = {
1526 /*
1527 * the 4 absolute ones, plus apostrophe.
1528 */
1529 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1530 { 38, "amp", "ampersand, U+0026 ISOnum" },
1531 { 39, "apos", "single quote" },
1532 { 60, "lt", "less-than sign, U+003C ISOnum" },
1533 { 62, "gt", "greater-than sign, U+003E ISOnum" },
1534
1535 /*
1536 * A bunch still in the 128-255 range
1537 * Replacing them depend really on the charset used.
1538 */
1539 { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1540 { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1541 { 162, "cent", "cent sign, U+00A2 ISOnum" },
1542 { 163, "pound","pound sign, U+00A3 ISOnum" },
1543 { 164, "curren","currency sign, U+00A4 ISOnum" },
1544 { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1545 { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1546 { 167, "sect", "section sign, U+00A7 ISOnum" },
1547 { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1548 { 169, "copy", "copyright sign, U+00A9 ISOnum" },
1549 { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1550 { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1551 { 172, "not", "not sign, U+00AC ISOnum" },
1552 { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1553 { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1554 { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1555 { 176, "deg", "degree sign, U+00B0 ISOnum" },
1556 { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1557 { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1558 { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1559 { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1560 { 181, "micro","micro sign, U+00B5 ISOnum" },
1561 { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1562 { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1563 { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1564 { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1565 { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1566 { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1567 { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1568 { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1569 { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1570 { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1571 { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1572 { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1573 { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1574 { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1575 { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1576 { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1577 { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1578 { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1579 { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1580 { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1581 { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1582 { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1583 { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1584 { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1585 { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1586 { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1587 { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1588 { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1589 { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1590 { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1591 { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1592 { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1593 { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1594 { 215, "times","multiplication sign, U+00D7 ISOnum" },
1595 { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1596 { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1597 { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1598 { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1599 { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1600 { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1601 { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1602 { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1603 { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1604 { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1605 { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1606 { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1607 { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1608 { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1609 { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1610 { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1611 { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1612 { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1613 { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1614 { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1615 { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1616 { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1617 { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1618 { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1619 { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1620 { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1621 { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1622 { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1623 { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1624 { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1625 { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1626 { 247, "divide","division sign, U+00F7 ISOnum" },
1627 { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1628 { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1629 { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1630 { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1631 { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1632 { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1633 { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1634 { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1635
1636 { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1637 { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1638 { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1639 { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1640 { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1641
1642 /*
1643 * Anything below should really be kept as entities references
1644 */
1645 { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1646
1647 { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1648 { 732, "tilde","small tilde, U+02DC ISOdia" },
1649
1650 { 913, "Alpha","greek capital letter alpha, U+0391" },
1651 { 914, "Beta", "greek capital letter beta, U+0392" },
1652 { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1653 { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1654 { 917, "Epsilon","greek capital letter epsilon, U+0395" },
1655 { 918, "Zeta", "greek capital letter zeta, U+0396" },
1656 { 919, "Eta", "greek capital letter eta, U+0397" },
1657 { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1658 { 921, "Iota", "greek capital letter iota, U+0399" },
1659 { 922, "Kappa","greek capital letter kappa, U+039A" },
1660 { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1661 { 924, "Mu", "greek capital letter mu, U+039C" },
1662 { 925, "Nu", "greek capital letter nu, U+039D" },
1663 { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1664 { 927, "Omicron","greek capital letter omicron, U+039F" },
1665 { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1666 { 929, "Rho", "greek capital letter rho, U+03A1" },
1667 { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1668 { 932, "Tau", "greek capital letter tau, U+03A4" },
1669 { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1670 { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1671 { 935, "Chi", "greek capital letter chi, U+03A7" },
1672 { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1673 { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1674
1675 { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1676 { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1677 { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1678 { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1679 { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1680 { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1681 { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1682 { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1683 { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1684 { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1685 { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1686 { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1687 { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1688 { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1689 { 959, "omicron","greek small letter omicron, U+03BF NEW" },
1690 { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1691 { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1692 { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1693 { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1694 { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1695 { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1696 { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1697 { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1698 { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1699 { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1700 { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1701 { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1702 { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1703
1704 { 8194, "ensp", "en space, U+2002 ISOpub" },
1705 { 8195, "emsp", "em space, U+2003 ISOpub" },
1706 { 8201, "thinsp","thin space, U+2009 ISOpub" },
1707 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1708 { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1709 { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1710 { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1711 { 8211, "ndash","en dash, U+2013 ISOpub" },
1712 { 8212, "mdash","em dash, U+2014 ISOpub" },
1713 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1714 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1715 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1716 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1717 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1718 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1719 { 8224, "dagger","dagger, U+2020 ISOpub" },
1720 { 8225, "Dagger","double dagger, U+2021 ISOpub" },
1721
1722 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1723 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1724
1725 { 8240, "permil","per mille sign, U+2030 ISOtech" },
1726
1727 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1728 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1729
1730 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1731 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1732
1733 { 8254, "oline","overline = spacing overscore, U+203E NEW" },
1734 { 8260, "frasl","fraction slash, U+2044 NEW" },
1735
1736 { 8364, "euro", "euro sign, U+20AC NEW" },
1737
1738 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1739 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1740 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1741 { 8482, "trade","trade mark sign, U+2122 ISOnum" },
1742 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1743 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1744 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1745 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1746 { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1747 { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1748 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1749 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1750 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1751 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1752 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1753 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1754
1755 { 8704, "forall","for all, U+2200 ISOtech" },
1756 { 8706, "part", "partial differential, U+2202 ISOtech" },
1757 { 8707, "exist","there exists, U+2203 ISOtech" },
1758 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1759 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1760 { 8712, "isin", "element of, U+2208 ISOtech" },
1761 { 8713, "notin","not an element of, U+2209 ISOtech" },
1762 { 8715, "ni", "contains as member, U+220B ISOtech" },
1763 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1764 { 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1765 { 8722, "minus","minus sign, U+2212 ISOtech" },
1766 { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1767 { 8730, "radic","square root = radical sign, U+221A ISOtech" },
1768 { 8733, "prop", "proportional to, U+221D ISOtech" },
1769 { 8734, "infin","infinity, U+221E ISOtech" },
1770 { 8736, "ang", "angle, U+2220 ISOamso" },
1771 { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1772 { 8744, "or", "logical or = vee, U+2228 ISOtech" },
1773 { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1774 { 8746, "cup", "union = cup, U+222A ISOtech" },
1775 { 8747, "int", "integral, U+222B ISOtech" },
1776 { 8756, "there4","therefore, U+2234 ISOtech" },
1777 { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1778 { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1779 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1780 { 8800, "ne", "not equal to, U+2260 ISOtech" },
1781 { 8801, "equiv","identical to, U+2261 ISOtech" },
1782 { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1783 { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1784 { 8834, "sub", "subset of, U+2282 ISOtech" },
1785 { 8835, "sup", "superset of, U+2283 ISOtech" },
1786 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1787 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1788 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1789 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1790 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1791 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1792 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1793 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1794 { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1795 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1796 { 8971, "rfloor","right floor, U+230B ISOamsc" },
1797 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1798 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1799 { 9674, "loz", "lozenge, U+25CA ISOpub" },
1800
1801 { 9824, "spades","black spade suit, U+2660 ISOpub" },
1802 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1803 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1804 { 9830, "diams","black diamond suit, U+2666 ISOpub" },
1805
1806 };
1807
1808 /************************************************************************
1809 * *
1810 * Commodity functions to handle entities *
1811 * *
1812 ************************************************************************/
1813
/*
 * growBuffer:
 * Double the capacity of @buffer; the capacity is kept in the companion
 * variable named <buffer>_size.  The macro must be expanded inside a
 * function that has a `ctxt` variable and returns a pointer: when the
 * reallocation fails an out-of-memory error is raised on `ctxt`, the old
 * buffer is released and the enclosing function returns NULL.
 */
#define growBuffer(buffer) {						\
    xmlChar *grown;							\
    buffer##_size *= 2;							\
    grown = (xmlChar *) xmlRealloc(buffer,				\
                                   buffer##_size * sizeof(xmlChar));	\
    if (grown != NULL) {						\
        buffer = grown;							\
    } else {								\
        htmlErrMemory(ctxt, "growing buffer\n");			\
        xmlFree(buffer);						\
        return(NULL);							\
    }									\
}
1828
1829 /**
1830 * htmlEntityLookup:
1831 * @name: the entity name
1832 *
1833 * Lookup the given entity in EntitiesTable
1834 *
1835 * TODO: the linear scan is really ugly, an hash table is really needed.
1836 *
1837 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1838 */
1839 const htmlEntityDesc *
1840 htmlEntityLookup(const xmlChar *name) {
1841 unsigned int i;
1842
1843 for (i = 0;i < (sizeof(html40EntitiesTable)/
1844 sizeof(html40EntitiesTable[0]));i++) {
1845 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1846 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1847 }
1848 }
1849 return(NULL);
1850 }
1851
1852 /**
1853 * htmlEntityValueLookup:
1854 * @value: the entity's unicode value
1855 *
1856 * Lookup the given entity in EntitiesTable
1857 *
1858 * TODO: the linear scan is really ugly, an hash table is really needed.
1859 *
1860 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1861 */
1862 const htmlEntityDesc *
1863 htmlEntityValueLookup(unsigned int value) {
1864 unsigned int i;
1865
1866 for (i = 0;i < (sizeof(html40EntitiesTable)/
1867 sizeof(html40EntitiesTable[0]));i++) {
1868 if (html40EntitiesTable[i].value >= value) {
1869 if (html40EntitiesTable[i].value > value)
1870 break;
1871 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1872 }
1873 }
1874 return(NULL);
1875 }
1876
1877 /**
1878 * UTF8ToHtml:
1879 * @out: a pointer to an array of bytes to store the result
1880 * @outlen: the length of @out
1881 * @in: a pointer to an array of UTF-8 chars
1882 * @inlen: the length of @in
1883 *
1884 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1885 * plus HTML entities block of chars out.
1886 *
1887 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1888 * The value of @inlen after return is the number of octets consumed
1889 * as the return value is positive, else unpredictable.
1890 * The value of @outlen after return is the number of octets consumed.
1891 */
1892 int
1893 UTF8ToHtml(unsigned char* out, int *outlen,
1894 const unsigned char* in, int *inlen) {
1895 const unsigned char* processed = in;
1896 const unsigned char* outend;
1897 const unsigned char* outstart = out;
1898 const unsigned char* instart = in;
1899 const unsigned char* inend;
1900 unsigned int c, d;
1901 int trailing;
1902
1903 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
1904 if (in == NULL) {
1905 /*
1906 * initialization nothing to do
1907 */
1908 *outlen = 0;
1909 *inlen = 0;
1910 return(0);
1911 }
1912 inend = in + (*inlen);
1913 outend = out + (*outlen);
1914 while (in < inend) {
1915 d = *in++;
1916 if (d < 0x80) { c= d; trailing= 0; }
1917 else if (d < 0xC0) {
1918 /* trailing byte in leading position */
1919 *outlen = out - outstart;
1920 *inlen = processed - instart;
1921 return(-2);
1922 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1923 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1924 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1925 else {
1926 /* no chance for this in Ascii */
1927 *outlen = out - outstart;
1928 *inlen = processed - instart;
1929 return(-2);
1930 }
1931
1932 if (inend - in < trailing) {
1933 break;
1934 }
1935
1936 for ( ; trailing; trailing--) {
1937 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1938 break;
1939 c <<= 6;
1940 c |= d & 0x3F;
1941 }
1942
1943 /* assertion: c is a single UTF-4 value */
1944 if (c < 0x80) {
1945 if (out + 1 >= outend)
1946 break;
1947 *out++ = c;
1948 } else {
1949 int len;
1950 const htmlEntityDesc * ent;
1951 const char *cp;
1952 char nbuf[16];
1953
1954 /*
1955 * Try to lookup a predefined HTML entity for it
1956 */
1957
1958 ent = htmlEntityValueLookup(c);
1959 if (ent == NULL) {
1960 snprintf(nbuf, sizeof(nbuf), "#%u", c);
1961 cp = nbuf;
1962 }
1963 else
1964 cp = ent->name;
1965 len = strlen(cp);
1966 if (out + 2 + len >= outend)
1967 break;
1968 *out++ = '&';
1969 memcpy(out, cp, len);
1970 out += len;
1971 *out++ = ';';
1972 }
1973 processed = in;
1974 }
1975 *outlen = out - outstart;
1976 *inlen = processed - instart;
1977 return(0);
1978 }
1979
1980 /**
1981 * htmlEncodeEntities:
1982 * @out: a pointer to an array of bytes to store the result
1983 * @outlen: the length of @out
1984 * @in: a pointer to an array of UTF-8 chars
1985 * @inlen: the length of @in
1986 * @quoteChar: the quote character to escape (' or ") or zero.
1987 *
1988 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1989 * plus HTML entities block of chars out.
1990 *
1991 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1992 * The value of @inlen after return is the number of octets consumed
1993 * as the return value is positive, else unpredictable.
1994 * The value of @outlen after return is the number of octets consumed.
1995 */
1996 int
1997 htmlEncodeEntities(unsigned char* out, int *outlen,
1998 const unsigned char* in, int *inlen, int quoteChar) {
1999 const unsigned char* processed = in;
2000 const unsigned char* outend;
2001 const unsigned char* outstart = out;
2002 const unsigned char* instart = in;
2003 const unsigned char* inend;
2004 unsigned int c, d;
2005 int trailing;
2006
2007 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2008 return(-1);
2009 outend = out + (*outlen);
2010 inend = in + (*inlen);
2011 while (in < inend) {
2012 d = *in++;
2013 if (d < 0x80) { c= d; trailing= 0; }
2014 else if (d < 0xC0) {
2015 /* trailing byte in leading position */
2016 *outlen = out - outstart;
2017 *inlen = processed - instart;
2018 return(-2);
2019 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2020 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2021 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2022 else {
2023 /* no chance for this in Ascii */
2024 *outlen = out - outstart;
2025 *inlen = processed - instart;
2026 return(-2);
2027 }
2028
2029 if (inend - in < trailing)
2030 break;
2031
2032 while (trailing--) {
2033 if (((d= *in++) & 0xC0) != 0x80) {
2034 *outlen = out - outstart;
2035 *inlen = processed - instart;
2036 return(-2);
2037 }
2038 c <<= 6;
2039 c |= d & 0x3F;
2040 }
2041
2042 /* assertion: c is a single UTF-4 value */
2043 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2044 (c != '&') && (c != '<') && (c != '>')) {
2045 if (out >= outend)
2046 break;
2047 *out++ = c;
2048 } else {
2049 const htmlEntityDesc * ent;
2050 const char *cp;
2051 char nbuf[16];
2052 int len;
2053
2054 /*
2055 * Try to lookup a predefined HTML entity for it
2056 */
2057 ent = htmlEntityValueLookup(c);
2058 if (ent == NULL) {
2059 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2060 cp = nbuf;
2061 }
2062 else
2063 cp = ent->name;
2064 len = strlen(cp);
2065 if (out + 2 + len > outend)
2066 break;
2067 *out++ = '&';
2068 memcpy(out, cp, len);
2069 out += len;
2070 *out++ = ';';
2071 }
2072 processed = in;
2073 }
2074 *outlen = out - outstart;
2075 *inlen = processed - instart;
2076 return(0);
2077 }
2078
2079 /************************************************************************
2080 * *
2081 * Commodity functions to handle streams *
2082 * *
2083 ************************************************************************/
2084
2085 /**
2086 * htmlNewInputStream:
2087 * @ctxt: an HTML parser context
2088 *
2089 * Create a new input stream structure
2090 * Returns the new input stream or NULL
2091 */
2092 static htmlParserInputPtr
2093 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2094 htmlParserInputPtr input;
2095
2096 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2097 if (input == NULL) {
2098 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2099 return(NULL);
2100 }
2101 memset(input, 0, sizeof(htmlParserInput));
2102 input->filename = NULL;
2103 input->directory = NULL;
2104 input->base = NULL;
2105 input->cur = NULL;
2106 input->buf = NULL;
2107 input->line = 1;
2108 input->col = 1;
2109 input->buf = NULL;
2110 input->free = NULL;
2111 input->version = NULL;
2112 input->consumed = 0;
2113 input->length = 0;
2114 return(input);
2115 }
2116
2117
2118 /************************************************************************
2119 * *
2120 * Commodity functions, cleanup needed ? *
2121 * *
2122 ************************************************************************/
/*
 * All tags allowing PCDATA, taken from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but that would risk a
 *       binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2137
2138 /**
2139 * areBlanks:
2140 * @ctxt: an HTML parser context
2141 * @str: a xmlChar *
2142 * @len: the size of @str
2143 *
2144 * Is this a sequence of blank chars that one can ignore ?
2145 *
2146 * Returns 1 if ignorable 0 otherwise.
2147 */
2148
2149 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2150 unsigned int i;
2151 int j;
2152 xmlNodePtr lastChild;
2153 xmlDtdPtr dtd;
2154
2155 for (j = 0;j < len;j++)
2156 if (!(IS_BLANK_CH(str[j]))) return(0);
2157
2158 if (CUR == 0) return(1);
2159 if (CUR != '<') return(0);
2160 if (ctxt->name == NULL)
2161 return(1);
2162 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2163 return(1);
2164 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2165 return(1);
2166
2167 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2168 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2169 dtd = xmlGetIntSubset(ctxt->myDoc);
2170 if (dtd != NULL && dtd->ExternalID != NULL) {
2171 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2172 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2173 return(1);
2174 }
2175 }
2176
2177 if (ctxt->node == NULL) return(0);
2178 lastChild = xmlGetLastChild(ctxt->node);
2179 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2180 lastChild = lastChild->prev;
2181 if (lastChild == NULL) {
2182 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2183 (ctxt->node->content != NULL)) return(0);
2184 /* keep ws in constructs like ...<b> </b>...
2185 for all tags "b" allowing PCDATA */
2186 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2187 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2188 return(0);
2189 }
2190 }
2191 } else if (xmlNodeIsText(lastChild)) {
2192 return(0);
2193 } else {
2194 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2195 for all tags "p" allowing PCDATA */
2196 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2197 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2198 return(0);
2199 }
2200 }
2201 }
2202 return(1);
2203 }
2204
2205 /**
2206 * htmlNewDocNoDtD:
2207 * @URI: URI for the dtd, or NULL
2208 * @ExternalID: the external ID of the DTD, or NULL
2209 *
2210 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2211 * are NULL
2212 *
2213 * Returns a new document, do not initialize the DTD if not provided
2214 */
2215 htmlDocPtr
2216 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2217 xmlDocPtr cur;
2218
2219 /*
2220 * Allocate a new document and fill the fields.
2221 */
2222 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2223 if (cur == NULL) {
2224 htmlErrMemory(NULL, "HTML document creation failed\n");
2225 return(NULL);
2226 }
2227 memset(cur, 0, sizeof(xmlDoc));
2228
2229 cur->type = XML_HTML_DOCUMENT_NODE;
2230 cur->version = NULL;
2231 cur->intSubset = NULL;
2232 cur->doc = cur;
2233 cur->name = NULL;
2234 cur->children = NULL;
2235 cur->extSubset = NULL;
2236 cur->oldNs = NULL;
2237 cur->encoding = NULL;
2238 cur->standalone = 1;
2239 cur->compression = 0;
2240 cur->ids = NULL;
2241 cur->refs = NULL;
2242 cur->_private = NULL;
2243 cur->charset = XML_CHAR_ENCODING_UTF8;
2244 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2245 if ((ExternalID != NULL) ||
2246 (URI != NULL))
2247 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2248 return(cur);
2249 }
2250
2251 /**
2252 * htmlNewDoc:
2253 * @URI: URI for the dtd, or NULL
2254 * @ExternalID: the external ID of the DTD, or NULL
2255 *
2256 * Creates a new HTML document
2257 *
2258 * Returns a new document
2259 */
2260 htmlDocPtr
2261 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2262 if ((URI == NULL) && (ExternalID == NULL))
2263 return(htmlNewDocNoDtD(
2264 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2265 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2266
2267 return(htmlNewDocNoDtD(URI, ExternalID));
2268 }
2269
2270
2271 /************************************************************************
2272 * *
2273 * The parser itself *
2274 * Relates to http://www.w3.org/TR/html40 *
2275 * *
2276 ************************************************************************/
2277
2278 /************************************************************************
2279 * *
2280 * The parser itself *
2281 * *
2282 ************************************************************************/
2283
2284 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2285
2286 /**
2287 * htmlParseHTMLName:
2288 * @ctxt: an HTML parser context
2289 *
2290 * parse an HTML tag or attribute name, note that we convert it to lowercase
2291 * since HTML names are not case-sensitive.
2292 *
2293 * Returns the Tag Name parsed or NULL
2294 */
2295
2296 static const xmlChar *
2297 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2298 int i = 0;
2299 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2300
2301 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2302 (CUR != ':') && (CUR != '.')) return(NULL);
2303
2304 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2305 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2306 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2307 (CUR == '.'))) {
2308 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2309 else loc[i] = CUR;
2310 i++;
2311
2312 NEXT;
2313 }
2314
2315 return(xmlDictLookup(ctxt->dict, loc, i));
2316 }
2317
2318
2319 /**
2320 * htmlParseHTMLName_nonInvasive:
2321 * @ctxt: an HTML parser context
2322 *
2323 * parse an HTML tag or attribute name, note that we convert it to lowercase
2324 * since HTML names are not case-sensitive, this doesn't consume the data
2325 * from the stream, it's a look-ahead
2326 *
2327 * Returns the Tag Name parsed or NULL
2328 */
2329
2330 static const xmlChar *
2331 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2332 int i = 0;
2333 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2334
2335 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2336 (NXT(1) != ':')) return(NULL);
2337
2338 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2339 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2340 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2341 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2342 else loc[i] = NXT(1+i);
2343 i++;
2344 }
2345
2346 return(xmlDictLookup(ctxt->dict, loc, i));
2347 }
2348
2349
2350 /**
2351 * htmlParseName:
2352 * @ctxt: an HTML parser context
2353 *
2354 * parse an HTML name, this routine is case sensitive.
2355 *
2356 * Returns the Name parsed or NULL
2357 */
2358
2359 static const xmlChar *
2360 htmlParseName(htmlParserCtxtPtr ctxt) {
2361 const xmlChar *in;
2362 const xmlChar *ret;
2363 int count = 0;
2364
2365 GROW;
2366
2367 /*
2368 * Accelerator for simple ASCII names
2369 */
2370 in = ctxt->input->cur;
2371 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2372 ((*in >= 0x41) && (*in <= 0x5A)) ||
2373 (*in == '_') || (*in == ':')) {
2374 in++;
2375 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2376 ((*in >= 0x41) && (*in <= 0x5A)) ||
2377 ((*in >= 0x30) && (*in <= 0x39)) ||
2378 (*in == '_') || (*in == '-') ||
2379 (*in == ':') || (*in == '.'))
2380 in++;
2381 if ((*in > 0) && (*in < 0x80)) {
2382 count = in - ctxt->input->cur;
2383 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2384 ctxt->input->cur = in;
2385 ctxt->nbChars += count;
2386 ctxt->input->col += count;
2387 return(ret);
2388 }
2389 }
2390 return(htmlParseNameComplex(ctxt));
2391 }
2392
2393 static const xmlChar *
2394 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2395 int len = 0, l;
2396 int c;
2397 int count = 0;
2398
2399 /*
2400 * Handler for more complex cases
2401 */
2402 GROW;
2403 c = CUR_CHAR(l);
2404 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2405 (!IS_LETTER(c) && (c != '_') &&
2406 (c != ':'))) {
2407 return(NULL);
2408 }
2409
2410 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2411 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2412 (c == '.') || (c == '-') ||
2413 (c == '_') || (c == ':') ||
2414 (IS_COMBINING(c)) ||
2415 (IS_EXTENDER(c)))) {
2416 if (count++ > 100) {
2417 count = 0;
2418 GROW;
2419 }
2420 len += l;
2421 NEXTL(l);
2422 c = CUR_CHAR(l);
2423 }
2424 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2425 }
2426
2427
2428 /**
2429 * htmlParseHTMLAttribute:
2430 * @ctxt: an HTML parser context
2431 * @stop: a char stop value
2432 *
2433 * parse an HTML attribute value till the stop (quote), if
2434 * stop is 0 then it stops at the first space
2435 *
2436 * Returns the attribute parsed or NULL
2437 */
2438
2439 static xmlChar *
2440 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2441 xmlChar *buffer = NULL;
2442 int buffer_size = 0;
2443 xmlChar *out = NULL;
2444 const xmlChar *name = NULL;
2445 const xmlChar *cur = NULL;
2446 const htmlEntityDesc * ent;
2447
2448 /*
2449 * allocate a translation buffer.
2450 */
2451 buffer_size = HTML_PARSER_BUFFER_SIZE;
2452 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2453 if (buffer == NULL) {
2454 htmlErrMemory(ctxt, "buffer allocation failed\n");
2455 return(NULL);
2456 }
2457 out = buffer;
2458
2459 /*
2460 * Ok loop until we reach one of the ending chars
2461 */
2462 while ((CUR != 0) && (CUR != stop)) {
2463 if ((stop == 0) && (CUR == '>')) break;
2464 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2465 if (CUR == '&') {
2466 if (NXT(1) == '#') {
2467 unsigned int c;
2468 int bits;
2469
2470 c = htmlParseCharRef(ctxt);
2471 if (c < 0x80)
2472 { *out++ = c; bits= -6; }
2473 else if (c < 0x800)
2474 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2475 else if (c < 0x10000)
2476 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2477 else
2478 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2479
2480 for ( ; bits >= 0; bits-= 6) {
2481 *out++ = ((c >> bits) & 0x3F) | 0x80;
2482 }
2483
2484 if (out - buffer > buffer_size - 100) {
2485 int indx = out - buffer;
2486
2487 growBuffer(buffer);
2488 out = &buffer[indx];
2489 }
2490 } else {
2491 ent = htmlParseEntityRef(ctxt, &name);
2492 if (name == NULL) {
2493 *out++ = '&';
2494 if (out - buffer > buffer_size - 100) {
2495 int indx = out - buffer;
2496
2497 growBuffer(buffer);
2498 out = &buffer[indx];
2499 }
2500 } else if (ent == NULL) {
2501 *out++ = '&';
2502 cur = name;
2503 while (*cur != 0) {
2504 if (out - buffer > buffer_size - 100) {
2505 int indx = out - buffer;
2506
2507 growBuffer(buffer);
2508 out = &buffer[indx];
2509 }
2510 *out++ = *cur++;
2511 }
2512 } else {
2513 unsigned int c;
2514 int bits;
2515
2516 if (out - buffer > buffer_size - 100) {
2517 int indx = out - buffer;
2518
2519 growBuffer(buffer);
2520 out = &buffer[indx];
2521 }
2522 c = ent->value;
2523 if (c < 0x80)
2524 { *out++ = c; bits= -6; }
2525 else if (c < 0x800)
2526 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2527 else if (c < 0x10000)
2528 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2529 else
2530 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2531
2532 for ( ; bits >= 0; bits-= 6) {
2533 *out++ = ((c >> bits) & 0x3F) | 0x80;
2534 }
2535 }
2536 }
2537 } else {
2538 unsigned int c;
2539 int bits, l;
2540
2541 if (out - buffer > buffer_size - 100) {
2542 int indx = out - buffer;
2543
2544 growBuffer(buffer);
2545 out = &buffer[indx];
2546 }
2547 c = CUR_CHAR(l);
2548 if (c < 0x80)
2549 { *out++ = c; bits= -6; }
2550 else if (c < 0x800)
2551 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2552 else if (c < 0x10000)
2553 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2554 else
2555 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2556
2557 for ( ; bits >= 0; bits-= 6) {
2558 *out++ = ((c >> bits) & 0x3F) | 0x80;
2559 }
2560 NEXT;
2561 }
2562 }
2563 *out = 0;
2564 return(buffer);
2565 }
2566
2567 /**
2568 * htmlParseEntityRef:
2569 * @ctxt: an HTML parser context
2570 * @str: location to store the entity name
2571 *
2572 * parse an HTML ENTITY references
2573 *
2574 * [68] EntityRef ::= '&' Name ';'
2575 *
2576 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2577 * if non-NULL *str will have to be freed by the caller.
2578 */
2579 const htmlEntityDesc *
2580 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2581 const xmlChar *name;
2582 const htmlEntityDesc * ent = NULL;
2583
2584 if (str != NULL) *str = NULL;
2585 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2586
2587 if (CUR == '&') {
2588 NEXT;
2589 name = htmlParseName(ctxt);
2590 if (name == NULL) {
2591 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2592 "htmlParseEntityRef: no name\n", NULL, NULL);
2593 } else {
2594 GROW;
2595 if (CUR == ';') {
2596 if (str != NULL)
2597 *str = name;
2598
2599 /*
2600 * Lookup the entity in the table.
2601 */
2602 ent = htmlEntityLookup(name);
2603 if (ent != NULL) /* OK that's ugly !!! */
2604 NEXT;
2605 } else {
2606 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2607 "htmlParseEntityRef: expecting ';'\n",
2608 NULL, NULL);
2609 if (str != NULL)
2610 *str = name;
2611 }
2612 }
2613 }
2614 return(ent);
2615 }
2616
2617 /**
2618 * htmlParseAttValue:
2619 * @ctxt: an HTML parser context
2620 *
2621 * parse a value for an attribute
2622 * Note: the parser won't do substitution of entities here, this
2623 * will be handled later in xmlStringGetNodeList, unless it was
2624 * asked for ctxt->replaceEntities != 0
2625 *
2626 * Returns the AttValue parsed or NULL.
2627 */
2628
2629 static xmlChar *
2630 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2631 xmlChar *ret = NULL;
2632
2633 if (CUR == '"') {
2634 NEXT;
2635 ret = htmlParseHTMLAttribute(ctxt, '"');
2636 if (CUR != '"') {
2637 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2638 "AttValue: \" expected\n", NULL, NULL);
2639 } else
2640 NEXT;
2641 } else if (CUR == '\'') {
2642 NEXT;
2643 ret = htmlParseHTMLAttribute(ctxt, '\'');
2644 if (CUR != '\'') {
2645 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2646 "AttValue: ' expected\n", NULL, NULL);
2647 } else
2648 NEXT;
2649 } else {
2650 /*
2651 * That's an HTMLism, the attribute value may not be quoted
2652 */
2653 ret = htmlParseHTMLAttribute(ctxt, 0);
2654 if (ret == NULL) {
2655 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2656 "AttValue: no value found\n", NULL, NULL);
2657 }
2658 }
2659 return(ret);
2660 }
2661
2662 /**
2663 * htmlParseSystemLiteral:
2664 * @ctxt: an HTML parser context
2665 *
2666 * parse an HTML Literal
2667 *
2668 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2669 *
2670 * Returns the SystemLiteral parsed or NULL
2671 */
2672
2673 static xmlChar *
2674 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2675 const xmlChar *q;
2676 xmlChar *ret = NULL;
2677
2678 if (CUR == '"') {
2679 NEXT;
2680 q = CUR_PTR;
2681 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
2682 NEXT;
2683 if (!IS_CHAR_CH(CUR)) {
2684 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2685 "Unfinished SystemLiteral\n", NULL, NULL);
2686 } else {
2687 ret = xmlStrndup(q, CUR_PTR - q);
2688 NEXT;
2689 }
2690 } else if (CUR == '\'') {
2691 NEXT;
2692 q = CUR_PTR;
2693 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
2694 NEXT;
2695 if (!IS_CHAR_CH(CUR)) {
2696 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2697 "Unfinished SystemLiteral\n", NULL, NULL);
2698 } else {
2699 ret = xmlStrndup(q, CUR_PTR - q);
2700 NEXT;
2701 }
2702 } else {
2703 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2704 " or ' expected\n", NULL, NULL);
2705 }
2706
2707 return(ret);
2708 }
2709
2710 /**
2711 * htmlParsePubidLiteral:
2712 * @ctxt: an HTML parser context
2713 *
2714 * parse an HTML public literal
2715 *
2716 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2717 *
2718 * Returns the PubidLiteral parsed or NULL.
2719 */
2720
2721 static xmlChar *
2722 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2723 const xmlChar *q;
2724 xmlChar *ret = NULL;
2725 /*
2726 * Name ::= (Letter | '_') (NameChar)*
2727 */
2728 if (CUR == '"') {
2729 NEXT;
2730 q = CUR_PTR;
2731 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
2732 if (CUR != '"') {
2733 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2734 "Unfinished PubidLiteral\n", NULL, NULL);
2735 } else {
2736 ret = xmlStrndup(q, CUR_PTR - q);
2737 NEXT;
2738 }
2739 } else if (CUR == '\'') {
2740 NEXT;
2741 q = CUR_PTR;
2742 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
2743 NEXT;
2744 if (CUR != '\'') {
2745 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2746 "Unfinished PubidLiteral\n", NULL, NULL);
2747 } else {
2748 ret = xmlStrndup(q, CUR_PTR - q);
2749 NEXT;
2750 }
2751 } else {
2752 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2753 "PubidLiteral \" or ' expected\n", NULL, NULL);
2754 }
2755
2756 return(ret);
2757 }
2758
2759 /**
2760 * htmlParseScript:
2761 * @ctxt: an HTML parser context
2762 *
2763 * parse the content of an HTML SCRIPT or STYLE element
2764 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2765 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2766 * http://www.w3.org/TR/html4/types.html#type-script
2767 * http://www.w3.org/TR/html4/types.html#h-6.15
2768 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2769 *
2770 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2771 * element and the value of intrinsic event attributes. User agents must
2772 * not evaluate script data as HTML markup but instead must pass it on as
2773 * data to a script engine.
2774 * NOTES:
2775 * - The content is passed like CDATA
2776 * - the attributes for style and scripting "onXXX" are also described
2777 * as CDATA but SGML allows entities references in attributes so their
2778 * processing is identical as other attributes
2779 */
2780 static void
2781 htmlParseScript(htmlParserCtxtPtr ctxt) {
2782 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2783 int nbchar = 0;
2784 int cur,l;
2785
2786 SHRINK;
2787 cur = CUR_CHAR(l);
2788 while (IS_CHAR_CH(cur)) {
2789 if ((cur == '<') && (NXT(1) == '/')) {
2790 /*
2791 * One should break here, the specification is clear:
2792 * Authors should therefore escape "</" within the content.
2793 * Escape mechanisms are specific to each scripting or
2794 * style sheet language.
2795 *
2796 * In recovery mode, only break if end tag match the
2797 * current tag, effectively ignoring all tags inside the
2798 * script/style block and treating the entire block as
2799 * CDATA.
2800 */
2801 if (ctxt->recovery) {
2802 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2803 xmlStrlen(ctxt->name)) == 0)
2804 {
2805 break; /* while */
2806 } else {
2807 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2808 "Element %s embeds close tag\n",
2809 ctxt->name, NULL);
2810 }
2811 } else {
2812 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2813 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2814 {
2815 break; /* while */
2816 }
2817 }
2818 }
2819 COPY_BUF(l,buf,nbchar,cur);
2820 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2821 if (ctxt->sax->cdataBlock!= NULL) {
2822 /*
2823 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2824 */
2825 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2826 } else if (ctxt->sax->characters != NULL) {
2827 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2828 }
2829 nbchar = 0;
2830 }
2831 GROW;
2832 NEXTL(l);
2833 cur = CUR_CHAR(l);
2834 }
2835
2836 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
2837 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2838 "Invalid char in CDATA 0x%X\n", cur);
2839 NEXT;
2840 }
2841
2842 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2843 if (ctxt->sax->cdataBlock!= NULL) {
2844 /*
2845 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2846 */
2847 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2848 } else if (ctxt->sax->characters != NULL) {
2849 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2850 }
2851 }
2852 }
2853
2854
2855 /**
2856 * htmlParseCharData:
2857 * @ctxt: an HTML parser context
2858 *
2859 * parse a CharData section.
2860 * if we are within a CDATA section ']]>' marks an end of section.
2861 *
2862 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2863 */
2864
2865 static void
2866 htmlParseCharData(htmlParserCtxtPtr ctxt) {
2867 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2868 int nbchar = 0;
2869 int cur, l;
2870 int chunk = 0;
2871
2872 SHRINK;
2873 cur = CUR_CHAR(l);
2874 while (((cur != '<') || (ctxt->token == '<')) &&
2875 ((cur != '&') || (ctxt->token == '&')) &&
2876 (cur != 0)) {
2877 if (!(IS_CHAR(cur))) {
2878 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2879 "Invalid char in CDATA 0x%X\n", cur);
2880 } else {
2881 COPY_BUF(l,buf,nbchar,cur);
2882 }
2883 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2884 /*
2885 * Ok the segment is to be consumed as chars.
2886 */
2887 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2888 if (areBlanks(ctxt, buf, nbchar)) {
2889 if (ctxt->sax->ignorableWhitespace != NULL)
2890 ctxt->sax->ignorableWhitespace(ctxt->userData,
2891 buf, nbchar);
2892 } else {
2893 htmlCheckParagraph(ctxt);
2894 if (ctxt->sax->characters != NULL)
2895 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2896 }
2897 }
2898 nbchar = 0;
2899 }
2900 NEXTL(l);
2901 chunk++;
2902 if (chunk > HTML_PARSER_BUFFER_SIZE) {
2903 chunk = 0;
2904 SHRINK;
2905 GROW;
2906 }
2907 cur = CUR_CHAR(l);
2908 if (cur == 0) {
2909 SHRINK;
2910 GROW;
2911 cur = CUR_CHAR(l);
2912 }
2913 }
2914 if (nbchar != 0) {
2915 buf[nbchar] = 0;
2916
2917 /*
2918 * Ok the segment is to be consumed as chars.
2919 */
2920 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2921 if (areBlanks(ctxt, buf, nbchar)) {
2922 if (ctxt->sax->ignorableWhitespace != NULL)
2923 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2924 } else {
2925 htmlCheckParagraph(ctxt);
2926 if (ctxt->sax->characters != NULL)
2927 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2928 }
2929 }
2930 } else {
2931 /*
2932 * Loop detection
2933 */
2934 if (cur == 0)
2935 ctxt->instate = XML_PARSER_EOF;
2936 }
2937 }
2938
2939 /**
2940 * htmlParseExternalID:
2941 * @ctxt: an HTML parser context
2942 * @publicID: a xmlChar** receiving PubidLiteral
2943 *
2944 * Parse an External ID or a Public ID
2945 *
2946 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2947 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2948 *
2949 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2950 *
2951 * Returns the function returns SystemLiteral and in the second
2952 * case publicID receives PubidLiteral, is strict is off
2953 * it is possible to return NULL and have publicID set.
2954 */
2955
2956 static xmlChar *
2957 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
2958 xmlChar *URI = NULL;
2959
2960 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2961 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2962 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2963 SKIP(6);
2964 if (!IS_BLANK_CH(CUR)) {
2965 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2966 "Space required after 'SYSTEM'\n", NULL, NULL);
2967 }
2968 SKIP_BLANKS;
2969 URI = htmlParseSystemLiteral(ctxt);
2970 if (URI == NULL) {
2971 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2972 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
2973 }
2974 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2975 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2976 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2977 SKIP(6);
2978 if (!IS_BLANK_CH(CUR)) {
2979 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2980 "Space required after 'PUBLIC'\n", NULL, NULL);
2981 }
2982 SKIP_BLANKS;
2983 *publicID = htmlParsePubidLiteral(ctxt);
2984 if (*publicID == NULL) {
2985 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2986 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2987 NULL, NULL);
2988 }
2989 SKIP_BLANKS;
2990 if ((CUR == '"') || (CUR == '\'')) {
2991 URI = htmlParseSystemLiteral(ctxt);
2992 }
2993 }
2994 return(URI);
2995 }
2996
/**
 * htmlParsePI:
 * @ctxt:  an HTML parser context
 *
 * Parse a Processing Instruction found in HTML input.
 *
 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
 */
3005 static void
3006 htmlParsePI(htmlParserCtxtPtr ctxt) {
3007 xmlChar *buf = NULL;
3008 int len = 0;
3009 int size = HTML_PARSER_BUFFER_SIZE;
3010 int cur, l;
3011 const xmlChar *target;
3012 xmlParserInputState state;
3013 int count = 0;
3014
3015 if ((RAW == '<') && (NXT(1) == '?')) {
3016 state = ctxt->instate;
3017 ctxt->instate = XML_PARSER_PI;
3018 /*
3019 * this is a Processing Instruction.
3020 */
3021 SKIP(2);
3022 SHRINK;
3023
3024 /*
3025 * Parse the target name and check for special support like
3026 * namespace.
3027 */
3028 target = htmlParseName(ctxt);
3029 if (target != NULL) {
3030 if (RAW == '>') {
3031 SKIP(1);
3032
3033 /*
3034 * SAX: PI detected.
3035 */
3036 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3037 (ctxt->sax->processingInstruction != NULL))
3038 ctxt->sax->processingInstruction(ctxt->userData,
3039 target, NULL);
3040 ctxt->instate = state;
3041 return;
3042 }
3043 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3044 if (buf == NULL) {
3045 htmlErrMemory(ctxt, NULL);
3046 ctxt->instate = state;
3047 return;
3048 }
3049 cur = CUR;
3050 if (!IS_BLANK(cur)) {
3051 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3052 "ParsePI: PI %s space expected\n", target, NULL);
3053 }
3054 SKIP_BLANKS;
3055 cur = CUR_CHAR(l);
3056 while (IS_CHAR(cur) && (cur != '>')) {
3057 if (len + 5 >= size) {
3058 xmlChar *tmp;
3059
3060 size *= 2;
3061 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3062 if (tmp == NULL) {
3063 htmlErrMemory(ctxt, NULL);
3064 xmlFree(buf);
3065 ctxt->instate = state;
3066 return;
3067 }
3068 buf = tmp;
3069 }
3070 count++;
3071 if (count > 50) {
3072 GROW;
3073 count = 0;
3074 }
3075 COPY_BUF(l,buf,len,cur);
3076 NEXTL(l);
3077