Update libxml to 2.7.7
[reactos.git] / reactos / lib / 3rdparty / libxml2 / HTMLparser.c
1 /*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * daniel@veillard.com
7 */
8
9 #define IN_LIBXML
10 #include "libxml.h"
11 #ifdef LIBXML_HTML_ENABLED
12
13 #include <string.h>
14 #ifdef HAVE_CTYPE_H
15 #include <ctype.h>
16 #endif
17 #ifdef HAVE_STDLIB_H
18 #include <stdlib.h>
19 #endif
20 #ifdef HAVE_SYS_STAT_H
21 #include <sys/stat.h>
22 #endif
23 #ifdef HAVE_FCNTL_H
24 #include <fcntl.h>
25 #endif
26 #ifdef HAVE_UNISTD_H
27 #include <unistd.h>
28 #endif
29 #ifdef HAVE_ZLIB_H
30 #include <zlib.h>
31 #endif
32
33 #include <libxml/xmlmemory.h>
34 #include <libxml/tree.h>
35 #include <libxml/parser.h>
36 #include <libxml/parserInternals.h>
37 #include <libxml/xmlerror.h>
38 #include <libxml/HTMLparser.h>
39 #include <libxml/HTMLtree.h>
40 #include <libxml/entities.h>
41 #include <libxml/encoding.h>
42 #include <libxml/valid.h>
43 #include <libxml/xmlIO.h>
44 #include <libxml/globals.h>
45 #include <libxml/uri.h>
46
47 #define HTML_MAX_NAMELEN 1000
48 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
49 #define HTML_PARSER_BUFFER_SIZE 100
50
51 /* #define DEBUG */
52 /* #define DEBUG_PUSH */
53
54 static int htmlOmittedDefaultValue = 1;
55
56 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
58 static void htmlParseComment(htmlParserCtxtPtr ctxt);
59
60 /************************************************************************
61 * *
62 * Some factorized error routines *
63 * *
64 ************************************************************************/
65
66 /**
67 * htmlErrMemory:
68 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73 static void
74 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75 {
76 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
79 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
85 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
86 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
90 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
91 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93 }
94
95 /**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105 static void
106 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108 {
109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
112 if (ctxt != NULL)
113 ctxt->errNo = error;
114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
115 XML_ERR_ERROR, NULL, 0,
116 (const char *) str1, (const char *) str2,
117 NULL, 0, 0,
118 msg, str1, str2);
119 if (ctxt != NULL)
120 ctxt->wellFormed = 0;
121 }
122
123 /**
124 * htmlParseErrInt:
125 * @ctxt: an HTML parser context
126 * @error: the error number
127 * @msg: the error message
128 * @val: integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132 static void
133 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134 const char *msg, int val)
135 {
136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137 (ctxt->instate == XML_PARSER_EOF))
138 return;
139 if (ctxt != NULL)
140 ctxt->errNo = error;
141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val);
144 if (ctxt != NULL)
145 ctxt->wellFormed = 0;
146 }
147
148 /************************************************************************
149 * *
150 * Parser stacks related functions and macros *
151 * *
152 ************************************************************************/
153
154 /**
155 * htmlnamePush:
156 * @ctxt: an HTML parser context
157 * @value: the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
162 */
163 static int
164 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
165 {
166 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
167 ctxt->html = 3;
168 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
169 ctxt->html = 10;
170 if (ctxt->nameNr >= ctxt->nameMax) {
171 ctxt->nameMax *= 2;
172 ctxt->nameTab = (const xmlChar * *)
173 xmlRealloc((xmlChar * *)ctxt->nameTab,
174 ctxt->nameMax *
175 sizeof(ctxt->nameTab[0]));
176 if (ctxt->nameTab == NULL) {
177 htmlErrMemory(ctxt, NULL);
178 return (0);
179 }
180 }
181 ctxt->nameTab[ctxt->nameNr] = value;
182 ctxt->name = value;
183 return (ctxt->nameNr++);
184 }
185 /**
186 * htmlnamePop:
187 * @ctxt: an HTML parser context
188 *
189 * Pops the top element name from the name stack
190 *
191 * Returns the name just removed
192 */
193 static const xmlChar *
194 htmlnamePop(htmlParserCtxtPtr ctxt)
195 {
196 const xmlChar *ret;
197
198 if (ctxt->nameNr <= 0)
199 return (NULL);
200 ctxt->nameNr--;
201 if (ctxt->nameNr < 0)
202 return (NULL);
203 if (ctxt->nameNr > 0)
204 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
205 else
206 ctxt->name = NULL;
207 ret = ctxt->nameTab[ctxt->nameNr];
208 ctxt->nameTab[ctxt->nameNr] = NULL;
209 return (ret);
210 }
211
212 /**
213 * htmlNodeInfoPush:
214 * @ctxt: an HTML parser context
215 * @value: the node info
216 *
217 * Pushes a new element name on top of the node info stack
218 *
219 * Returns 0 in case of error, the index in the stack otherwise
220 */
221 static int
222 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
223 {
224 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
225 if (ctxt->nodeInfoMax == 0)
226 ctxt->nodeInfoMax = 5;
227 ctxt->nodeInfoMax *= 2;
228 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
229 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
230 ctxt->nodeInfoMax *
231 sizeof(ctxt->nodeInfoTab[0]));
232 if (ctxt->nodeInfoTab == NULL) {
233 htmlErrMemory(ctxt, NULL);
234 return (0);
235 }
236 }
237 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
238 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
239 return (ctxt->nodeInfoNr++);
240 }
241
242 /**
243 * htmlNodeInfoPop:
244 * @ctxt: an HTML parser context
245 *
246 * Pops the top element name from the node info stack
247 *
248 * Returns 0 in case of error, the pointer to NodeInfo otherwise
249 */
250 static htmlParserNodeInfo *
251 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
252 {
253 if (ctxt->nodeInfoNr <= 0)
254 return (NULL);
255 ctxt->nodeInfoNr--;
256 if (ctxt->nodeInfoNr < 0)
257 return (NULL);
258 if (ctxt->nodeInfoNr > 0)
259 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
260 else
261 ctxt->nodeInfo = NULL;
262 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
263 }
264
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.  All of them expect a parser context variable named
 * `ctxt' to be in scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used
 *           only to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser.
 *
 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value as an int (as defined below it
 *           reads a single xmlChar).
 *   NEXT    Skip to the next character through xmlNextChar().
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 *
 * SHRINK and GROW are statement macros: they are wrapped in
 * do { } while (0) so they behave as a single statement, including
 * inside an unbraced if/else.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) \
    (ctxt->nbChars += (val), ctxt->input->cur += (val), \
     ctxt->input->col += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK do {                                                     \
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&    \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))       \
        xmlParserInputShrink(ctxt->input);                              \
  } while (0)

#define GROW do {                                                       \
    if ((ctxt->progressive == 0) &&                                     \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))           \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);                   \
  } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {                                                   \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++; ctxt->input->col = 1;                      \
    } else ctxt->input->col++;                                          \
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;          \
  } while (0)
331
332 /************
333 \
334 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
335 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
336 ************/
337
/*
 * CUR_CHAR(l)      current character of the parser input through
 *                  htmlCurrentChar(); its length is stored in l.
 * CUR_SCHAR(s, l)  current character of the string s through
 *                  xmlStringCurrentChar(); its length is stored in l.
 * COPY_BUF(l,b,i,v) append character v, of encoded length l, to buffer b
 *                  at index i and advance i.  A length of 1 stores a single
 *                  xmlChar, anything else goes through xmlCopyChar().
 *                  Wrapped in do { } while (0) so that it is a single
 *                  statement, and its arguments are parenthesized.
 */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v) do {                                          \
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);                           \
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));                       \
  } while (0)
344
345 /**
346 * htmlFindEncoding:
347 * @the HTML parser context
348 *
349 * Ty to find and encoding in the current data available in the input
350 * buffer this is needed to try to switch to the proper encoding when
351 * one face a character error.
352 * That's an heuristic, since it's operating outside of parsing it could
353 * try to use a meta which had been commented out, that's the reason it
354 * should only be used in case of error, not as a default.
355 *
356 * Returns an encoding string or NULL if not found, the string need to
357 * be freed
358 */
359 static xmlChar *
360 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
361 const xmlChar *start, *cur, *end;
362
363 if ((ctxt == NULL) || (ctxt->input == NULL) ||
364 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
365 (ctxt->input->buf->encoder != NULL))
366 return(NULL);
367 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
368 return(NULL);
369
370 start = ctxt->input->cur;
371 end = ctxt->input->end;
372 /* we also expect the input buffer to be zero terminated */
373 if (*end != 0)
374 return(NULL);
375
376 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
377 if (cur == NULL)
378 return(NULL);
379 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
380 if (cur == NULL)
381 return(NULL);
382 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
383 if (cur == NULL)
384 return(NULL);
385 cur += 8;
386 start = cur;
387 while (((*cur >= 'A') && (*cur <= 'Z')) ||
388 ((*cur >= 'a') && (*cur <= 'z')) ||
389 ((*cur >= '0') && (*cur <= '9')) ||
390 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
391 cur++;
392 if (cur == start)
393 return(NULL);
394 return(xmlStrndup(start, cur - start));
395 }
396
397 /**
398 * htmlCurrentChar:
399 * @ctxt: the HTML parser context
400 * @len: pointer to the length of the char read
401 *
402 * The current char value, if using UTF-8 this may actually span multiple
403 * bytes in the input buffer. Implement the end of line normalization:
404 * 2.11 End-of-Line Handling
405 * If the encoding is unspecified, in the case we find an ISO-Latin-1
406 * char, then the encoding converter is plugged in automatically.
407 *
408 * Returns the current char value and its length
409 */
410
411 static int
412 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
413 if (ctxt->instate == XML_PARSER_EOF)
414 return(0);
415
416 if (ctxt->token != 0) {
417 *len = 0;
418 return(ctxt->token);
419 }
420 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
421 /*
422 * We are supposed to handle UTF8, check it's valid
423 * From rfc2044: encoding of the Unicode values on UTF-8:
424 *
425 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
426 * 0000 0000-0000 007F 0xxxxxxx
427 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
428 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
429 *
430 * Check for the 0x110000 limit too
431 */
432 const unsigned char *cur = ctxt->input->cur;
433 unsigned char c;
434 unsigned int val;
435
436 c = *cur;
437 if (c & 0x80) {
438 if (cur[1] == 0) {
439 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
440 cur = ctxt->input->cur;
441 }
442 if ((cur[1] & 0xc0) != 0x80)
443 goto encoding_error;
444 if ((c & 0xe0) == 0xe0) {
445
446 if (cur[2] == 0) {
447 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
448 cur = ctxt->input->cur;
449 }
450 if ((cur[2] & 0xc0) != 0x80)
451 goto encoding_error;
452 if ((c & 0xf0) == 0xf0) {
453 if (cur[3] == 0) {
454 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
455 cur = ctxt->input->cur;
456 }
457 if (((c & 0xf8) != 0xf0) ||
458 ((cur[3] & 0xc0) != 0x80))
459 goto encoding_error;
460 /* 4-byte code */
461 *len = 4;
462 val = (cur[0] & 0x7) << 18;
463 val |= (cur[1] & 0x3f) << 12;
464 val |= (cur[2] & 0x3f) << 6;
465 val |= cur[3] & 0x3f;
466 } else {
467 /* 3-byte code */
468 *len = 3;
469 val = (cur[0] & 0xf) << 12;
470 val |= (cur[1] & 0x3f) << 6;
471 val |= cur[2] & 0x3f;
472 }
473 } else {
474 /* 2-byte code */
475 *len = 2;
476 val = (cur[0] & 0x1f) << 6;
477 val |= cur[1] & 0x3f;
478 }
479 if (!IS_CHAR(val)) {
480 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
481 "Char 0x%X out of allowed range\n", val);
482 }
483 return(val);
484 } else {
485 if ((*ctxt->input->cur == 0) &&
486 (ctxt->input->cur < ctxt->input->end)) {
487 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
488 "Char 0x%X out of allowed range\n", 0);
489 *len = 1;
490 return(' ');
491 }
492 /* 1-byte code */
493 *len = 1;
494 return((int) *ctxt->input->cur);
495 }
496 }
497 /*
498 * Assume it's a fixed length encoding (1) with
499 * a compatible encoding for the ASCII set, since
500 * XML constructs only use < 128 chars
501 */
502 *len = 1;
503 if ((int) *ctxt->input->cur < 0x80)
504 return((int) *ctxt->input->cur);
505
506 /*
507 * Humm this is bad, do an automatic flow conversion
508 */
509 {
510 xmlChar * guess;
511 xmlCharEncodingHandlerPtr handler;
512
513 guess = htmlFindEncoding(ctxt);
514 if (guess == NULL) {
515 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
516 } else {
517 if (ctxt->input->encoding != NULL)
518 xmlFree((xmlChar *) ctxt->input->encoding);
519 ctxt->input->encoding = guess;
520 handler = xmlFindCharEncodingHandler((const char *) guess);
521 if (handler != NULL) {
522 xmlSwitchToEncoding(ctxt, handler);
523 } else {
524 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
525 "Unsupported encoding %s", guess, NULL);
526 }
527 }
528 ctxt->charset = XML_CHAR_ENCODING_UTF8;
529 }
530
531 return(xmlCurrentChar(ctxt, len));
532
533 encoding_error:
534 /*
535 * If we detect an UTF8 error that probably mean that the
536 * input encoding didn't get properly advertized in the
537 * declaration header. Report the error and switch the encoding
538 * to ISO-Latin-1 (if you don't like this policy, just declare the
539 * encoding !)
540 */
541 {
542 char buffer[150];
543
544 if (ctxt->input->end - ctxt->input->cur >= 4) {
545 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
546 ctxt->input->cur[0], ctxt->input->cur[1],
547 ctxt->input->cur[2], ctxt->input->cur[3]);
548 } else {
549 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
550 }
551 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
552 "Input is not proper UTF-8, indicate encoding !\n",
553 BAD_CAST buffer, NULL);
554 }
555
556 ctxt->charset = XML_CHAR_ENCODING_8859_1;
557 *len = 1;
558 return((int) *ctxt->input->cur);
559 }
560
561 /**
562 * htmlSkipBlankChars:
563 * @ctxt: the HTML parser context
564 *
565 * skip all blanks character found at that point in the input streams.
566 *
567 * Returns the number of space chars skipped
568 */
569
570 static int
571 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
572 int res = 0;
573
574 while (IS_BLANK_CH(*(ctxt->input->cur))) {
575 if ((*ctxt->input->cur == 0) &&
576 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
577 xmlPopInput(ctxt);
578 } else {
579 if (*(ctxt->input->cur) == '\n') {
580 ctxt->input->line++; ctxt->input->col = 1;
581 } else ctxt->input->col++;
582 ctxt->input->cur++;
583 ctxt->nbChars++;
584 if (*ctxt->input->cur == 0)
585 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
586 }
587 res++;
588 }
589 return(res);
590 }
591
592
593
594 /************************************************************************
595 * *
596 * The list of HTML elements and their properties *
597 * *
598 ************************************************************************/
599
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *      , subElements , impliedsubelt , Attributes, userdata
 */

/*
 * Definitions and a couple of vars for HTML Elements.
 *
 * Each group macro expands to a comma separated list of element names and
 * is paired with an NB_* macro giving the number of names in the list.
 * The NB_* sums are parenthesized so they can be used safely inside larger
 * expressions.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL terminated lists of the flow and inline element names */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
645 #define html_cdata html_pcdata
646
647
/*
 * ... and for HTML Attributes
 *
 * Each group macro expands to a comma separated list of attribute names
 * and is paired with an NB_* macro giving the number of names in the list.
 */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL terminated lists of the common attribute groups */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
667
668
669 /* Other declarations that should go inline ... */
670 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
671 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
672 "tabindex", "onfocus", "onblur", NULL } ;
673 static const char* const target_attr[] = { "target", NULL } ;
674 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
675 static const char* const alt_attr[] = { "alt", NULL } ;
676 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
677 static const char* const href_attrs[] = { "href", NULL } ;
678 static const char* const clear_attrs[] = { "clear", NULL } ;
679 static const char* const inline_p[] = { INLINE, "p", NULL } ;
680
681 static const char* const flow_param[] = { FLOW, "param", NULL } ;
682 static const char* const applet_attrs[] = { COREATTRS , "codebase",
683 "archive", "alt", "name", "height", "width", "align",
684 "hspace", "vspace", NULL } ;
685 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
686 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
687 static const char* const basefont_attrs[] =
688 { "id", "size", "color", "face", NULL } ;
689 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
690 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
691 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
692 static const char* const body_depr[] = { "background", "bgcolor", "text",
693 "link", "vlink", "alink", NULL } ;
694 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
695 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
696
697
698 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
699 static const char* const col_elt[] = { "col", NULL } ;
700 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
701 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
702 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
703 static const char* const compact_attr[] = { "compact", NULL } ;
704 static const char* const label_attr[] = { "label", NULL } ;
705 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
706 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
707 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
708 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
709 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
710 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
711 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
712 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
713 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
714 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
715 static const char* const version_attr[] = { "version", NULL } ;
716 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
717 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
718 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
719 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
720 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
721 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
722 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
723 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
724 static const char* const align_attr[] = { "align", NULL } ;
725 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
726 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
727 static const char* const name_attr[] = { "name", NULL } ;
728 static const char* const action_attr[] = { "action", NULL } ;
729 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
730 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
731 static const char* const content_attr[] = { "content", NULL } ;
732 static const char* const type_attr[] = { "type", NULL } ;
733 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
734 static const char* const object_contents[] = { FLOW, "param", NULL } ;
735 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
736 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
737 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
738 static const char* const option_elt[] = { "option", NULL } ;
739 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
740 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
741 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
742 static const char* const width_attr[] = { "width", NULL } ;
743 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
744 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
745 static const char* const language_attr[] = { "language", NULL } ;
746 static const char* const select_content[] = { "optgroup", "option", NULL } ;
747 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
748 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
749 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
750 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
751 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
752 static const char* const tr_elt[] = { "tr", NULL } ;
753 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
754 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
755 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
756 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
757 static const char* const tr_contents[] = { "th", "td", NULL } ;
758 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
759 static const char* const li_elt[] = { "li", NULL } ;
760 static const char* const ul_depr[] = { "type", "compact", NULL} ;
761 static const char* const dir_attr[] = { "dir", NULL} ;
762
763 #define DECL (const char**)
764
765 static const htmlElemDesc
766 html40ElementTable[] = {
767 { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
768 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
769 },
770 { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
771 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
772 },
773 { "acronym", 0, 0, 0, 0, 0, 0, 1, "",
774 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
775 },
776 { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
777 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
778 },
779 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
780 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
781 },
782 { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
783 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
784 },
785 { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
786 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
787 },
788 { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
789 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
790 },
791 { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
792 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
793 },
794 { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
795 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
796 },
797 { "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
798 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
799 },
800 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
801 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
802 },
803 { "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
804 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
805 },
806 { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
807 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
808 },
809 { "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
810 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
811 },
812 { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
813 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
814 },
815 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
816 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
817 },
818 { "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
819 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
820 },
821 { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
822 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
823 },
824 { "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
825 EMPTY , NULL , DECL col_attrs , NULL, NULL
826 },
827 { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
828 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
829 },
830 { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
831 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
832 },
833 { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
834 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
835 },
836 { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
837 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
838 },
839 { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
840 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
841 },
842 { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
843 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
844 },
845 { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
846 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
847 },
848 { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
849 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
850 },
851 { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
852 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
853 },
854 { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
855 EMPTY, NULL, DECL embed_attrs, NULL, NULL
856 },
857 { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
858 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
859 },
860 { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
861 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
862 },
863 { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
864 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
865 },
866 { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
867 EMPTY, NULL, NULL, DECL frame_attrs, NULL
868 },
869 { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
870 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
871 },
872 { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
873 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
874 },
875 { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
876 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
877 },
878 { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
879 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
880 },
881 { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
882 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
883 },
884 { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
885 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
886 },
887 { "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
888 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
889 },
890 { "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
891 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
892 },
893 { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
894 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
895 },
896 { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
897 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
898 },
899 { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
900 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
901 },
902 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
903 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
904 },
905 { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
906 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
907 },
908 { "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
909 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
910 },
911 { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
912 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
913 },
914 { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
915 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
916 },
917 { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
918 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
919 },
920 { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
921 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
922 },
923 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
924 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
925 },
926 { "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
927 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
928 },
929 { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
930 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
931 },
932 { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
933 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
934 },
935 { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
936 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
937 },
938 { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
939 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
940 },
941 { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
942 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
943 },
944 { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
945 DECL html_flow, "div", DECL html_attrs, NULL, NULL
946 },
947 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
948 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
949 },
950 { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
951 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
952 },
953 { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
954 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
955 },
956 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
957 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
958 },
959 { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
960 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
961 },
962 { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
963 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
964 },
965 { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
966 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
967 },
968 { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
969 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
970 },
971 { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
972 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
973 },
974 { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
975 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
976 },
977 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
978 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
979 },
980 { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
981 DECL select_content, NULL, DECL select_attrs, NULL, NULL
982 },
983 { "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
984 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
985 },
986 { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
987 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
988 },
989 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
990 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
991 },
992 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
993 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
994 },
995 { "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
996 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
997 },
998 { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
999 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1000 },
1001 { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
1002 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1003 },
1004 { "table", 0, 0, 0, 0, 0, 0, 0, "",
1005 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1006 },
1007 { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
1008 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1009 },
1010 { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
1011 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1012 },
1013 { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1014 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1015 },
1016 { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
1017 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1018 },
1019 { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
1020 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1021 },
1022 { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
1023 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1024 },
1025 { "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
1026 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1027 },
1028 { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
1029 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1030 },
1031 { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1032 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1033 },
1034 { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
1035 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1036 },
1037 { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
1038 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1039 },
1040 { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1041 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1042 }
1043 };
1044
/*
 * start tags that imply the end of current element
 *
 * The table is a flat sequence of NULL-terminated groups, itself ended
 * by one extra NULL. In each group the first string is the name of a
 * start tag; the strings that follow, up to the group's NULL, are the
 * names of the currently open elements which that start tag closes.
 * htmlInitAutoClose() records where each group begins and
 * htmlCheckAutoClose() scans a group to answer the query.
 */
static const char * const htmlStartClose[] = {
"form",		"form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
		"dl", "ul", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", "head", NULL,
"head",		"p", NULL,
"title",	"p", NULL,
"body",		"head", "style", "link", "title", "p", NULL,
"frameset",	"head", "style", "link", "title", "p", NULL,
"li",		"p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
		"pre", "listing", "xmp", "head", "li", NULL,
"hr",		"p", "head", NULL,
"h1",		"p", "head", NULL,
"h2",		"p", "head", NULL,
"h3",		"p", "head", NULL,
"h4",		"p", "head", NULL,
"h5",		"p", "head", NULL,
"h6",		"p", "head", NULL,
"dir",		"p", "head", NULL,
"address",	"p", "head", "ul", NULL,
"pre",		"p", "head", "ul", NULL,
"listing",	"p", "head", NULL,
"xmp",		"p", "head", NULL,
"blockquote",	"p", "head", NULL,
"dl",		"p", "dt", "menu", "dir", "address", "pre", "listing",
		"xmp", "head", NULL,
"dt",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
		"head", "dd", NULL,
"dd",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
		"head", "dt", NULL,
"ul",		"p", "head", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", NULL,
"ol",		"p", "head", "ul", NULL,
"menu",		"p", "head", "ul", NULL,
"p",		"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
"div",		"p", "head", NULL,
"noscript",	"p", "head", NULL,
"center",	"font", "b", "i", "p", "head", NULL,
"a",		"a", NULL,
"caption",	"p", NULL,
"colgroup",	"caption", "colgroup", "col", "p", NULL,
"col",		"caption", "col", "p", NULL,
"table",	"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
		"listing", "xmp", "a", NULL,
"th",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr",		"th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead",	"caption", "col", "colgroup", NULL,
"tfoot",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tbody", "p", NULL,
"tbody",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tfoot", "tbody", "p", NULL,
"optgroup",	"option", NULL,
"option",	"option", NULL,
"fieldset",	"legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
		"pre", "listing", "xmp", "a", NULL,
NULL	/* end of table: an empty group stops the scan */
};
1105
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * hold character data directly: when text shows up while one of them
 * is the current element, a <p> element gets implied first.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1118
/*
 * The list of HTML attributes which are of content %Script;
 * (the intrinsic event handlers of HTML 4).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The table has no terminator: users must derive the entry count
 * from sizeof.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",	/* was misspelled "onrest", so onreset was never matched */
    "onchange",
    "onselect"
};
1144
/*
 * End tag priorities, used by the HTML parser to recover from broken
 * pages. Each element name is given a priority; when an unexpected
 * end tag is met, it may only close open elements whose priority is
 * lower than or equal to its own. Names missing from the table get
 * the priority stored in the terminating { NULL, ... } entry.
 */

typedef struct {
    const char *name;	/* element name, NULL in the last entry */
    int priority;	/* the higher, the harder to close implicitly */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 }, { "th",    160 },
    { "tr",    170 },
    { "thead", 180 }, { "tbody", 180 }, { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 }, { "body",  200 },
    { "html",  220 },
    { NULL,    100 }	/* Default priority */
};
1172
/*
 * Index over htmlStartClose[]: htmlInitAutoClose() stores in each slot
 * a pointer to the first string of one group of that table, and leaves
 * the unused slots set to NULL.
 */
static const char** htmlStartCloseIndex[100];
/* set to 1 by htmlInitAutoClose() once the index above has been built */
static int htmlStartCloseIndexinitialized = 0;
1175
1176 /************************************************************************
1177 * *
1178 * functions to handle HTML specific data *
1179 * *
1180 ************************************************************************/
1181
1182 /**
1183 * htmlInitAutoClose:
1184 *
1185 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1186 * This is not reentrant. Call xmlInitParser() once before processing in
1187 * case of use in multithreaded programs.
1188 */
1189 void
1190 htmlInitAutoClose(void) {
1191 int indx, i = 0;
1192
1193 if (htmlStartCloseIndexinitialized) return;
1194
1195 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1196 indx = 0;
1197 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1198 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1199 while (htmlStartClose[i] != NULL) i++;
1200 i++;
1201 }
1202 htmlStartCloseIndexinitialized = 1;
1203 }
1204
1205 /**
1206 * htmlTagLookup:
1207 * @tag: The tag name in lowercase
1208 *
1209 * Lookup the HTML tag in the ElementTable
1210 *
1211 * Returns the related htmlElemDescPtr or NULL if not found.
1212 */
1213 const htmlElemDesc *
1214 htmlTagLookup(const xmlChar *tag) {
1215 unsigned int i;
1216
1217 for (i = 0; i < (sizeof(html40ElementTable) /
1218 sizeof(html40ElementTable[0]));i++) {
1219 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1220 return((htmlElemDescPtr) &html40ElementTable[i]);
1221 }
1222 return(NULL);
1223 }
1224
1225 /**
1226 * htmlGetEndPriority:
1227 * @name: The name of the element to look up the priority for.
1228 *
1229 * Return value: The "endtag" priority.
1230 **/
1231 static int
1232 htmlGetEndPriority (const xmlChar *name) {
1233 int i = 0;
1234
1235 while ((htmlEndPriority[i].name != NULL) &&
1236 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1237 i++;
1238
1239 return(htmlEndPriority[i].priority);
1240 }
1241
1242
1243 /**
1244 * htmlCheckAutoClose:
1245 * @newtag: The new tag name
1246 * @oldtag: The old tag name
1247 *
1248 * Checks whether the new tag is one of the registered valid tags for
1249 * closing old.
1250 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1251 *
1252 * Returns 0 if no, 1 if yes.
1253 */
1254 static int
1255 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1256 {
1257 int i, indx;
1258 const char **closed = NULL;
1259
1260 if (htmlStartCloseIndexinitialized == 0)
1261 htmlInitAutoClose();
1262
1263 /* inefficient, but not a big deal */
1264 for (indx = 0; indx < 100; indx++) {
1265 closed = htmlStartCloseIndex[indx];
1266 if (closed == NULL)
1267 return (0);
1268 if (xmlStrEqual(BAD_CAST * closed, newtag))
1269 break;
1270 }
1271
1272 i = closed - htmlStartClose;
1273 i++;
1274 while (htmlStartClose[i] != NULL) {
1275 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1276 return (1);
1277 }
1278 i++;
1279 }
1280 return (0);
1281 }
1282
1283 /**
1284 * htmlAutoCloseOnClose:
1285 * @ctxt: an HTML parser context
1286 * @newtag: The new tag name
1287 * @force: force the tag closure
1288 *
1289 * The HTML DTD allows an ending tag to implicitly close other tags.
1290 */
1291 static void
1292 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1293 {
1294 const htmlElemDesc *info;
1295 int i, priority;
1296
1297 priority = htmlGetEndPriority(newtag);
1298
1299 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1300
1301 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1302 break;
1303 /*
1304 * A missplaced endtag can only close elements with lower
1305 * or equal priority, so if we find an element with higher
1306 * priority before we find an element with
1307 * matching name, we just ignore this endtag
1308 */
1309 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1310 return;
1311 }
1312 if (i < 0)
1313 return;
1314
1315 while (!xmlStrEqual(newtag, ctxt->name)) {
1316 info = htmlTagLookup(ctxt->name);
1317 if ((info != NULL) && (info->endTag == 3)) {
1318 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1319 "Opening and ending tag mismatch: %s and %s\n",
1320 newtag, ctxt->name);
1321 }
1322 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1323 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1324 htmlnamePop(ctxt);
1325 }
1326 }
1327
1328 /**
1329 * htmlAutoCloseOnEnd:
1330 * @ctxt: an HTML parser context
1331 *
1332 * Close all remaining tags at the end of the stream
1333 */
1334 static void
1335 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1336 {
1337 int i;
1338
1339 if (ctxt->nameNr == 0)
1340 return;
1341 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1342 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1343 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1344 htmlnamePop(ctxt);
1345 }
1346 }
1347
1348 /**
1349 * htmlAutoClose:
1350 * @ctxt: an HTML parser context
1351 * @newtag: The new tag name or NULL
1352 *
1353 * The HTML DTD allows a tag to implicitly close other tags.
1354 * The list is kept in htmlStartClose array. This function is
1355 * called when a new tag has been detected and generates the
1356 * appropriates closes if possible/needed.
1357 * If newtag is NULL this mean we are at the end of the resource
1358 * and we should check
1359 */
1360 static void
1361 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1362 {
1363 while ((newtag != NULL) && (ctxt->name != NULL) &&
1364 (htmlCheckAutoClose(newtag, ctxt->name))) {
1365 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1366 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1367 htmlnamePop(ctxt);
1368 }
1369 if (newtag == NULL) {
1370 htmlAutoCloseOnEnd(ctxt);
1371 return;
1372 }
1373 while ((newtag == NULL) && (ctxt->name != NULL) &&
1374 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1375 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1376 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1377 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1378 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1379 htmlnamePop(ctxt);
1380 }
1381 }
1382
1383 /**
1384 * htmlAutoCloseTag:
1385 * @doc: the HTML document
1386 * @name: The tag name
1387 * @elem: the HTML element
1388 *
1389 * The HTML DTD allows a tag to implicitly close other tags.
1390 * The list is kept in htmlStartClose array. This function checks
1391 * if the element or one of it's children would autoclose the
1392 * given tag.
1393 *
1394 * Returns 1 if autoclose, 0 otherwise
1395 */
1396 int
1397 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1398 htmlNodePtr child;
1399
1400 if (elem == NULL) return(1);
1401 if (xmlStrEqual(name, elem->name)) return(0);
1402 if (htmlCheckAutoClose(elem->name, name)) return(1);
1403 child = elem->children;
1404 while (child != NULL) {
1405 if (htmlAutoCloseTag(doc, name, child)) return(1);
1406 child = child->next;
1407 }
1408 return(0);
1409 }
1410
1411 /**
1412 * htmlIsAutoClosed:
1413 * @doc: the HTML document
1414 * @elem: the HTML element
1415 *
1416 * The HTML DTD allows a tag to implicitly close other tags.
1417 * The list is kept in htmlStartClose array. This function checks
1418 * if a tag is autoclosed by one of it's child
1419 *
1420 * Returns 1 if autoclosed, 0 otherwise
1421 */
1422 int
1423 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1424 htmlNodePtr child;
1425
1426 if (elem == NULL) return(1);
1427 child = elem->children;
1428 while (child != NULL) {
1429 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1430 child = child->next;
1431 }
1432 return(0);
1433 }
1434
1435 /**
1436 * htmlCheckImplied:
1437 * @ctxt: an HTML parser context
1438 * @newtag: The new tag name
1439 *
1440 * The HTML DTD allows a tag to exists only implicitly
1441 * called when a new tag has been detected and generates the
1442 * appropriates implicit tags if missing
1443 */
1444 static void
1445 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1446 int i;
1447
1448 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1449 return;
1450 if (!htmlOmittedDefaultValue)
1451 return;
1452 if (xmlStrEqual(newtag, BAD_CAST"html"))
1453 return;
1454 if (ctxt->nameNr <= 0) {
1455 htmlnamePush(ctxt, BAD_CAST"html");
1456 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1457 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1458 }
1459 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1460 return;
1461 if ((ctxt->nameNr <= 1) &&
1462 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1463 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1464 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1465 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1466 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1467 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1468 if (ctxt->html >= 3) {
1469 /* we already saw or generated an <head> before */
1470 return;
1471 }
1472 /*
1473 * dropped OBJECT ... i you put it first BODY will be
1474 * assumed !
1475 */
1476 htmlnamePush(ctxt, BAD_CAST"head");
1477 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1478 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1479 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1480 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1481 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1482 if (ctxt->html >= 10) {
1483 /* we already saw or generated a <body> before */
1484 return;
1485 }
1486 for (i = 0;i < ctxt->nameNr;i++) {
1487 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1488 return;
1489 }
1490 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1491 return;
1492 }
1493 }
1494
1495 htmlnamePush(ctxt, BAD_CAST"body");
1496 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1497 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1498 }
1499 }
1500
1501 /**
1502 * htmlCheckParagraph
1503 * @ctxt: an HTML parser context
1504 *
1505 * Check whether a p element need to be implied before inserting
1506 * characters in the current element.
1507 *
1508 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1509 * in case of error.
1510 */
1511
1512 static int
1513 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1514 const xmlChar *tag;
1515 int i;
1516
1517 if (ctxt == NULL)
1518 return(-1);
1519 tag = ctxt->name;
1520 if (tag == NULL) {
1521 htmlAutoClose(ctxt, BAD_CAST"p");
1522 htmlCheckImplied(ctxt, BAD_CAST"p");
1523 htmlnamePush(ctxt, BAD_CAST"p");
1524 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1525 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1526 return(1);
1527 }
1528 if (!htmlOmittedDefaultValue)
1529 return(0);
1530 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1531 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1532 htmlAutoClose(ctxt, BAD_CAST"p");
1533 htmlCheckImplied(ctxt, BAD_CAST"p");
1534 htmlnamePush(ctxt, BAD_CAST"p");
1535 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1536 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1537 return(1);
1538 }
1539 }
1540 return(0);
1541 }
1542
1543 /**
1544 * htmlIsScriptAttribute:
1545 * @name: an attribute name
1546 *
1547 * Check if an attribute is of content type Script
1548 *
1549 * Returns 1 is the attribute is a script 0 otherwise
1550 */
1551 int
1552 htmlIsScriptAttribute(const xmlChar *name) {
1553 unsigned int i;
1554
1555 if (name == NULL)
1556 return(0);
1557 /*
1558 * all script attributes start with 'on'
1559 */
1560 if ((name[0] != 'o') || (name[1] != 'n'))
1561 return(0);
1562 for (i = 0;
1563 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1564 i++) {
1565 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1566 return(1);
1567 }
1568 return(0);
1569 }
1570
1571 /************************************************************************
1572 * *
1573 * The list of HTML predefined entities *
1574 * *
1575 ************************************************************************/
1576
1577
1578 static const htmlEntityDesc html40EntitiesTable[] = {
1579 /*
1580 * the 4 absolute ones, plus apostrophe.
1581 */
1582 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1583 { 38, "amp", "ampersand, U+0026 ISOnum" },
1584 { 39, "apos", "single quote" },
1585 { 60, "lt", "less-than sign, U+003C ISOnum" },
1586 { 62, "gt", "greater-than sign, U+003E ISOnum" },
1587
1588 /*
1589 * A bunch still in the 128-255 range
1590 * Replacing them depend really on the charset used.
1591 */
1592 { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1593 { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1594 { 162, "cent", "cent sign, U+00A2 ISOnum" },
1595 { 163, "pound","pound sign, U+00A3 ISOnum" },
1596 { 164, "curren","currency sign, U+00A4 ISOnum" },
1597 { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1598 { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1599 { 167, "sect", "section sign, U+00A7 ISOnum" },
1600 { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1601 { 169, "copy", "copyright sign, U+00A9 ISOnum" },
1602 { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1603 { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1604 { 172, "not", "not sign, U+00AC ISOnum" },
1605 { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1606 { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1607 { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1608 { 176, "deg", "degree sign, U+00B0 ISOnum" },
1609 { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1610 { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1611 { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1612 { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1613 { 181, "micro","micro sign, U+00B5 ISOnum" },
1614 { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1615 { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1616 { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1617 { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1618 { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1619 { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1620 { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1621 { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1622 { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1623 { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1624 { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1625 { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1626 { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1627 { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1628 { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1629 { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1630 { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1631 { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1632 { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1633 { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1634 { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1635 { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1636 { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1637 { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1638 { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1639 { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1640 { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1641 { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1642 { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1643 { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1644 { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1645 { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1646 { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1647 { 215, "times","multiplication sign, U+00D7 ISOnum" },
1648 { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1649 { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1650 { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1651 { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1652 { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1653 { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1654 { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1655 { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1656 { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1657 { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1658 { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1659 { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1660 { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1661 { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1662 { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1663 { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1664 { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1665 { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1666 { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1667 { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1668 { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1669 { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1670 { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1671 { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1672 { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1673 { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1674 { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1675 { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1676 { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1677 { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1678 { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1679 { 247, "divide","division sign, U+00F7 ISOnum" },
1680 { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1681 { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1682 { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1683 { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1684 { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1685 { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1686 { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1687 { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1688
1689 { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1690 { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1691 { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1692 { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1693 { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1694
1695 /*
1696 * Anything below should really be kept as entities references
1697 */
1698 { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1699
1700 { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1701 { 732, "tilde","small tilde, U+02DC ISOdia" },
1702
1703 { 913, "Alpha","greek capital letter alpha, U+0391" },
1704 { 914, "Beta", "greek capital letter beta, U+0392" },
1705 { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1706 { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1707 { 917, "Epsilon","greek capital letter epsilon, U+0395" },
1708 { 918, "Zeta", "greek capital letter zeta, U+0396" },
1709 { 919, "Eta", "greek capital letter eta, U+0397" },
1710 { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1711 { 921, "Iota", "greek capital letter iota, U+0399" },
1712 { 922, "Kappa","greek capital letter kappa, U+039A" },
1713 { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1714 { 924, "Mu", "greek capital letter mu, U+039C" },
1715 { 925, "Nu", "greek capital letter nu, U+039D" },
1716 { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1717 { 927, "Omicron","greek capital letter omicron, U+039F" },
1718 { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1719 { 929, "Rho", "greek capital letter rho, U+03A1" },
1720 { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1721 { 932, "Tau", "greek capital letter tau, U+03A4" },
1722 { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1723 { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1724 { 935, "Chi", "greek capital letter chi, U+03A7" },
1725 { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1726 { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1727
1728 { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1729 { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1730 { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1731 { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1732 { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1733 { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1734 { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1735 { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1736 { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1737 { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1738 { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1739 { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1740 { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1741 { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1742 { 959, "omicron","greek small letter omicron, U+03BF NEW" },
1743 { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1744 { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1745 { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1746 { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1747 { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1748 { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1749 { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1750 { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1751 { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1752 { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1753 { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1754 { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1755 { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1756
1757 { 8194, "ensp", "en space, U+2002 ISOpub" },
1758 { 8195, "emsp", "em space, U+2003 ISOpub" },
1759 { 8201, "thinsp","thin space, U+2009 ISOpub" },
1760 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1761 { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1762 { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1763 { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1764 { 8211, "ndash","en dash, U+2013 ISOpub" },
1765 { 8212, "mdash","em dash, U+2014 ISOpub" },
1766 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1767 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1768 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1769 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1770 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1771 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1772 { 8224, "dagger","dagger, U+2020 ISOpub" },
1773 { 8225, "Dagger","double dagger, U+2021 ISOpub" },
1774
1775 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1776 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1777
1778 { 8240, "permil","per mille sign, U+2030 ISOtech" },
1779
1780 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1781 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1782
1783 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1784 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1785
1786 { 8254, "oline","overline = spacing overscore, U+203E NEW" },
1787 { 8260, "frasl","fraction slash, U+2044 NEW" },
1788
1789 { 8364, "euro", "euro sign, U+20AC NEW" },
1790
1791 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1792 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1793 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1794 { 8482, "trade","trade mark sign, U+2122 ISOnum" },
1795 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1796 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1797 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1798 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1799 { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1800 { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1801 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1802 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1803 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1804 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1805 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1806 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1807
1808 { 8704, "forall","for all, U+2200 ISOtech" },
1809 { 8706, "part", "partial differential, U+2202 ISOtech" },
1810 { 8707, "exist","there exists, U+2203 ISOtech" },
1811 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1812 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1813 { 8712, "isin", "element of, U+2208 ISOtech" },
1814 { 8713, "notin","not an element of, U+2209 ISOtech" },
1815 { 8715, "ni", "contains as member, U+220B ISOtech" },
1816 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1817 { 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1818 { 8722, "minus","minus sign, U+2212 ISOtech" },
1819 { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1820 { 8730, "radic","square root = radical sign, U+221A ISOtech" },
1821 { 8733, "prop", "proportional to, U+221D ISOtech" },
1822 { 8734, "infin","infinity, U+221E ISOtech" },
1823 { 8736, "ang", "angle, U+2220 ISOamso" },
1824 { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1825 { 8744, "or", "logical or = vee, U+2228 ISOtech" },
1826 { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1827 { 8746, "cup", "union = cup, U+222A ISOtech" },
1828 { 8747, "int", "integral, U+222B ISOtech" },
1829 { 8756, "there4","therefore, U+2234 ISOtech" },
1830 { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1831 { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1832 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1833 { 8800, "ne", "not equal to, U+2260 ISOtech" },
1834 { 8801, "equiv","identical to, U+2261 ISOtech" },
1835 { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1836 { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1837 { 8834, "sub", "subset of, U+2282 ISOtech" },
1838 { 8835, "sup", "superset of, U+2283 ISOtech" },
1839 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1840 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1841 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1842 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1843 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1844 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1845 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1846 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1847 { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1848 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1849 { 8971, "rfloor","right floor, U+230B ISOamsc" },
1850 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1851 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1852 { 9674, "loz", "lozenge, U+25CA ISOpub" },
1853
1854 { 9824, "spades","black spade suit, U+2660 ISOpub" },
1855 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1856 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1857 { 9830, "diams","black diamond suit, U+2666 ISOpub" },
1858
1859 };
1860
1861 /************************************************************************
1862 * *
1863 * Commodity functions to handle entities *
1864 * *
1865 ************************************************************************/
1866
/*
 * Macro used to grow the current buffer.
 *
 * Doubles <buffer>_size and reallocates <buffer> to the new size with
 * xmlRealloc(). The macro is not self-contained: it expects a variable
 * named <buffer>_size and a parser context named ctxt to be in scope at
 * the expansion site.
 *
 * On allocation failure it reports the error through htmlErrMemory(),
 * frees the old buffer and executes return(NULL) in the *enclosing*
 * function, so it can only be used inside functions returning a pointer.
 * Any pointer into the old buffer must be recomputed by the caller after
 * the macro has run.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp == NULL) {							\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = tmp;							\
}
1881
1882 /**
1883 * htmlEntityLookup:
1884 * @name: the entity name
1885 *
1886 * Lookup the given entity in EntitiesTable
1887 *
1888 * TODO: the linear scan is really ugly, an hash table is really needed.
1889 *
1890 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1891 */
1892 const htmlEntityDesc *
1893 htmlEntityLookup(const xmlChar *name) {
1894 unsigned int i;
1895
1896 for (i = 0;i < (sizeof(html40EntitiesTable)/
1897 sizeof(html40EntitiesTable[0]));i++) {
1898 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1899 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1900 }
1901 }
1902 return(NULL);
1903 }
1904
1905 /**
1906 * htmlEntityValueLookup:
1907 * @value: the entity's unicode value
1908 *
1909 * Lookup the given entity in EntitiesTable
1910 *
1911 * TODO: the linear scan is really ugly, an hash table is really needed.
1912 *
1913 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1914 */
1915 const htmlEntityDesc *
1916 htmlEntityValueLookup(unsigned int value) {
1917 unsigned int i;
1918
1919 for (i = 0;i < (sizeof(html40EntitiesTable)/
1920 sizeof(html40EntitiesTable[0]));i++) {
1921 if (html40EntitiesTable[i].value >= value) {
1922 if (html40EntitiesTable[i].value > value)
1923 break;
1924 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1925 }
1926 }
1927 return(NULL);
1928 }
1929
1930 /**
1931 * UTF8ToHtml:
1932 * @out: a pointer to an array of bytes to store the result
1933 * @outlen: the length of @out
1934 * @in: a pointer to an array of UTF-8 chars
1935 * @inlen: the length of @in
1936 *
1937 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1938 * plus HTML entities block of chars out.
1939 *
1940 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1941 * The value of @inlen after return is the number of octets consumed
1942 * as the return value is positive, else unpredictable.
1943 * The value of @outlen after return is the number of octets consumed.
1944 */
1945 int
1946 UTF8ToHtml(unsigned char* out, int *outlen,
1947 const unsigned char* in, int *inlen) {
1948 const unsigned char* processed = in;
1949 const unsigned char* outend;
1950 const unsigned char* outstart = out;
1951 const unsigned char* instart = in;
1952 const unsigned char* inend;
1953 unsigned int c, d;
1954 int trailing;
1955
1956 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
1957 if (in == NULL) {
1958 /*
1959 * initialization nothing to do
1960 */
1961 *outlen = 0;
1962 *inlen = 0;
1963 return(0);
1964 }
1965 inend = in + (*inlen);
1966 outend = out + (*outlen);
1967 while (in < inend) {
1968 d = *in++;
1969 if (d < 0x80) { c= d; trailing= 0; }
1970 else if (d < 0xC0) {
1971 /* trailing byte in leading position */
1972 *outlen = out - outstart;
1973 *inlen = processed - instart;
1974 return(-2);
1975 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1976 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1977 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1978 else {
1979 /* no chance for this in Ascii */
1980 *outlen = out - outstart;
1981 *inlen = processed - instart;
1982 return(-2);
1983 }
1984
1985 if (inend - in < trailing) {
1986 break;
1987 }
1988
1989 for ( ; trailing; trailing--) {
1990 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1991 break;
1992 c <<= 6;
1993 c |= d & 0x3F;
1994 }
1995
1996 /* assertion: c is a single UTF-4 value */
1997 if (c < 0x80) {
1998 if (out + 1 >= outend)
1999 break;
2000 *out++ = c;
2001 } else {
2002 int len;
2003 const htmlEntityDesc * ent;
2004 const char *cp;
2005 char nbuf[16];
2006
2007 /*
2008 * Try to lookup a predefined HTML entity for it
2009 */
2010
2011 ent = htmlEntityValueLookup(c);
2012 if (ent == NULL) {
2013 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2014 cp = nbuf;
2015 }
2016 else
2017 cp = ent->name;
2018 len = strlen(cp);
2019 if (out + 2 + len >= outend)
2020 break;
2021 *out++ = '&';
2022 memcpy(out, cp, len);
2023 out += len;
2024 *out++ = ';';
2025 }
2026 processed = in;
2027 }
2028 *outlen = out - outstart;
2029 *inlen = processed - instart;
2030 return(0);
2031 }
2032
2033 /**
2034 * htmlEncodeEntities:
2035 * @out: a pointer to an array of bytes to store the result
2036 * @outlen: the length of @out
2037 * @in: a pointer to an array of UTF-8 chars
2038 * @inlen: the length of @in
2039 * @quoteChar: the quote character to escape (' or ") or zero.
2040 *
2041 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2042 * plus HTML entities block of chars out.
2043 *
2044 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2045 * The value of @inlen after return is the number of octets consumed
2046 * as the return value is positive, else unpredictable.
2047 * The value of @outlen after return is the number of octets consumed.
2048 */
2049 int
2050 htmlEncodeEntities(unsigned char* out, int *outlen,
2051 const unsigned char* in, int *inlen, int quoteChar) {
2052 const unsigned char* processed = in;
2053 const unsigned char* outend;
2054 const unsigned char* outstart = out;
2055 const unsigned char* instart = in;
2056 const unsigned char* inend;
2057 unsigned int c, d;
2058 int trailing;
2059
2060 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2061 return(-1);
2062 outend = out + (*outlen);
2063 inend = in + (*inlen);
2064 while (in < inend) {
2065 d = *in++;
2066 if (d < 0x80) { c= d; trailing= 0; }
2067 else if (d < 0xC0) {
2068 /* trailing byte in leading position */
2069 *outlen = out - outstart;
2070 *inlen = processed - instart;
2071 return(-2);
2072 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2073 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2074 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2075 else {
2076 /* no chance for this in Ascii */
2077 *outlen = out - outstart;
2078 *inlen = processed - instart;
2079 return(-2);
2080 }
2081
2082 if (inend - in < trailing)
2083 break;
2084
2085 while (trailing--) {
2086 if (((d= *in++) & 0xC0) != 0x80) {
2087 *outlen = out - outstart;
2088 *inlen = processed - instart;
2089 return(-2);
2090 }
2091 c <<= 6;
2092 c |= d & 0x3F;
2093 }
2094
2095 /* assertion: c is a single UTF-4 value */
2096 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2097 (c != '&') && (c != '<') && (c != '>')) {
2098 if (out >= outend)
2099 break;
2100 *out++ = c;
2101 } else {
2102 const htmlEntityDesc * ent;
2103 const char *cp;
2104 char nbuf[16];
2105 int len;
2106
2107 /*
2108 * Try to lookup a predefined HTML entity for it
2109 */
2110 ent = htmlEntityValueLookup(c);
2111 if (ent == NULL) {
2112 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2113 cp = nbuf;
2114 }
2115 else
2116 cp = ent->name;
2117 len = strlen(cp);
2118 if (out + 2 + len > outend)
2119 break;
2120 *out++ = '&';
2121 memcpy(out, cp, len);
2122 out += len;
2123 *out++ = ';';
2124 }
2125 processed = in;
2126 }
2127 *outlen = out - outstart;
2128 *inlen = processed - instart;
2129 return(0);
2130 }
2131
2132 /************************************************************************
2133 * *
2134 * Commodity functions to handle streams *
2135 * *
2136 ************************************************************************/
2137
2138 /**
2139 * htmlNewInputStream:
2140 * @ctxt: an HTML parser context
2141 *
2142 * Create a new input stream structure
2143 * Returns the new input stream or NULL
2144 */
2145 static htmlParserInputPtr
2146 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2147 htmlParserInputPtr input;
2148
2149 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2150 if (input == NULL) {
2151 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2152 return(NULL);
2153 }
2154 memset(input, 0, sizeof(htmlParserInput));
2155 input->filename = NULL;
2156 input->directory = NULL;
2157 input->base = NULL;
2158 input->cur = NULL;
2159 input->buf = NULL;
2160 input->line = 1;
2161 input->col = 1;
2162 input->buf = NULL;
2163 input->free = NULL;
2164 input->version = NULL;
2165 input->consumed = 0;
2166 input->length = 0;
2167 return(input);
2168 }
2169
2170
2171 /************************************************************************
2172 * *
2173 * Commodity functions, cleanup needed ? *
2174 * *
2175 ************************************************************************/
/*
 * All tags allowing PCDATA, taken from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array, but that was avoided so as not to
 * risk any binary incompatibility.
 * areBlanks() scans this list to decide whether whitespace must be kept.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
2190
2191 /**
2192 * areBlanks:
2193 * @ctxt: an HTML parser context
2194 * @str: a xmlChar *
2195 * @len: the size of @str
2196 *
2197 * Is this a sequence of blank chars that one can ignore ?
2198 *
2199 * Returns 1 if ignorable 0 otherwise.
2200 */
2201
2202 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2203 unsigned int i;
2204 int j;
2205 xmlNodePtr lastChild;
2206 xmlDtdPtr dtd;
2207
2208 for (j = 0;j < len;j++)
2209 if (!(IS_BLANK_CH(str[j]))) return(0);
2210
2211 if (CUR == 0) return(1);
2212 if (CUR != '<') return(0);
2213 if (ctxt->name == NULL)
2214 return(1);
2215 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2216 return(1);
2217 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2218 return(1);
2219
2220 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2221 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2222 dtd = xmlGetIntSubset(ctxt->myDoc);
2223 if (dtd != NULL && dtd->ExternalID != NULL) {
2224 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2225 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2226 return(1);
2227 }
2228 }
2229
2230 if (ctxt->node == NULL) return(0);
2231 lastChild = xmlGetLastChild(ctxt->node);
2232 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2233 lastChild = lastChild->prev;
2234 if (lastChild == NULL) {
2235 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2236 (ctxt->node->content != NULL)) return(0);
2237 /* keep ws in constructs like ...<b> </b>...
2238 for all tags "b" allowing PCDATA */
2239 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2240 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2241 return(0);
2242 }
2243 }
2244 } else if (xmlNodeIsText(lastChild)) {
2245 return(0);
2246 } else {
2247 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2248 for all tags "p" allowing PCDATA */
2249 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2250 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2251 return(0);
2252 }
2253 }
2254 }
2255 return(1);
2256 }
2257
2258 /**
2259 * htmlNewDocNoDtD:
2260 * @URI: URI for the dtd, or NULL
2261 * @ExternalID: the external ID of the DTD, or NULL
2262 *
2263 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2264 * are NULL
2265 *
2266 * Returns a new document, do not initialize the DTD if not provided
2267 */
2268 htmlDocPtr
2269 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2270 xmlDocPtr cur;
2271
2272 /*
2273 * Allocate a new document and fill the fields.
2274 */
2275 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2276 if (cur == NULL) {
2277 htmlErrMemory(NULL, "HTML document creation failed\n");
2278 return(NULL);
2279 }
2280 memset(cur, 0, sizeof(xmlDoc));
2281
2282 cur->type = XML_HTML_DOCUMENT_NODE;
2283 cur->version = NULL;
2284 cur->intSubset = NULL;
2285 cur->doc = cur;
2286 cur->name = NULL;
2287 cur->children = NULL;
2288 cur->extSubset = NULL;
2289 cur->oldNs = NULL;
2290 cur->encoding = NULL;
2291 cur->standalone = 1;
2292 cur->compression = 0;
2293 cur->ids = NULL;
2294 cur->refs = NULL;
2295 cur->_private = NULL;
2296 cur->charset = XML_CHAR_ENCODING_UTF8;
2297 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2298 if ((ExternalID != NULL) ||
2299 (URI != NULL))
2300 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2301 return(cur);
2302 }
2303
2304 /**
2305 * htmlNewDoc:
2306 * @URI: URI for the dtd, or NULL
2307 * @ExternalID: the external ID of the DTD, or NULL
2308 *
2309 * Creates a new HTML document
2310 *
2311 * Returns a new document
2312 */
2313 htmlDocPtr
2314 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2315 if ((URI == NULL) && (ExternalID == NULL))
2316 return(htmlNewDocNoDtD(
2317 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2318 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2319
2320 return(htmlNewDocNoDtD(URI, ExternalID));
2321 }
2322
2323
2324 /************************************************************************
2325 * *
2326 * The parser itself *
2327 * Relates to http://www.w3.org/TR/html40 *
2328 * *
2329 ************************************************************************/
2330
2331 /************************************************************************
2332 * *
2333 * The parser itself *
2334 * *
2335 ************************************************************************/
2336
2337 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2338
2339 /**
2340 * htmlParseHTMLName:
2341 * @ctxt: an HTML parser context
2342 *
2343 * parse an HTML tag or attribute name, note that we convert it to lowercase
2344 * since HTML names are not case-sensitive.
2345 *
2346 * Returns the Tag Name parsed or NULL
2347 */
2348
2349 static const xmlChar *
2350 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2351 int i = 0;
2352 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2353
2354 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2355 (CUR != ':') && (CUR != '.')) return(NULL);
2356
2357 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2358 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2359 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2360 (CUR == '.'))) {
2361 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2362 else loc[i] = CUR;
2363 i++;
2364
2365 NEXT;
2366 }
2367
2368 return(xmlDictLookup(ctxt->dict, loc, i));
2369 }
2370
2371
2372 /**
2373 * htmlParseHTMLName_nonInvasive:
2374 * @ctxt: an HTML parser context
2375 *
2376 * parse an HTML tag or attribute name, note that we convert it to lowercase
2377 * since HTML names are not case-sensitive, this doesn't consume the data
2378 * from the stream, it's a look-ahead
2379 *
2380 * Returns the Tag Name parsed or NULL
2381 */
2382
2383 static const xmlChar *
2384 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2385 int i = 0;
2386 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2387
2388 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2389 (NXT(1) != ':')) return(NULL);
2390
2391 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2392 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2393 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2394 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2395 else loc[i] = NXT(1+i);
2396 i++;
2397 }
2398
2399 return(xmlDictLookup(ctxt->dict, loc, i));
2400 }
2401
2402
2403 /**
2404 * htmlParseName:
2405 * @ctxt: an HTML parser context
2406 *
2407 * parse an HTML name, this routine is case sensitive.
2408 *
2409 * Returns the Name parsed or NULL
2410 */
2411
2412 static const xmlChar *
2413 htmlParseName(htmlParserCtxtPtr ctxt) {
2414 const xmlChar *in;
2415 const xmlChar *ret;
2416 int count = 0;
2417
2418 GROW;
2419
2420 /*
2421 * Accelerator for simple ASCII names
2422 */
2423 in = ctxt->input->cur;
2424 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2425 ((*in >= 0x41) && (*in <= 0x5A)) ||
2426 (*in == '_') || (*in == ':')) {
2427 in++;
2428 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2429 ((*in >= 0x41) && (*in <= 0x5A)) ||
2430 ((*in >= 0x30) && (*in <= 0x39)) ||
2431 (*in == '_') || (*in == '-') ||
2432 (*in == ':') || (*in == '.'))
2433 in++;
2434 if ((*in > 0) && (*in < 0x80)) {
2435 count = in - ctxt->input->cur;
2436 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2437 ctxt->input->cur = in;
2438 ctxt->nbChars += count;
2439 ctxt->input->col += count;
2440 return(ret);
2441 }
2442 }
2443 return(htmlParseNameComplex(ctxt));
2444 }
2445
2446 static const xmlChar *
2447 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2448 int len = 0, l;
2449 int c;
2450 int count = 0;
2451
2452 /*
2453 * Handler for more complex cases
2454 */
2455 GROW;
2456 c = CUR_CHAR(l);
2457 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2458 (!IS_LETTER(c) && (c != '_') &&
2459 (c != ':'))) {
2460 return(NULL);
2461 }
2462
2463 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2464 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2465 (c == '.') || (c == '-') ||
2466 (c == '_') || (c == ':') ||
2467 (IS_COMBINING(c)) ||
2468 (IS_EXTENDER(c)))) {
2469 if (count++ > 100) {
2470 count = 0;
2471 GROW;
2472 }
2473 len += l;
2474 NEXTL(l);
2475 c = CUR_CHAR(l);
2476 }
2477 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2478 }
2479
2480
2481 /**
2482 * htmlParseHTMLAttribute:
2483 * @ctxt: an HTML parser context
2484 * @stop: a char stop value
2485 *
2486 * parse an HTML attribute value till the stop (quote), if
2487 * stop is 0 then it stops at the first space
2488 *
2489 * Returns the attribute parsed or NULL
2490 */
2491
2492 static xmlChar *
2493 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2494 xmlChar *buffer = NULL;
2495 int buffer_size = 0;
2496 xmlChar *out = NULL;
2497 const xmlChar *name = NULL;
2498 const xmlChar *cur = NULL;
2499 const htmlEntityDesc * ent;
2500
2501 /*
2502 * allocate a translation buffer.
2503 */
2504 buffer_size = HTML_PARSER_BUFFER_SIZE;
2505 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2506 if (buffer == NULL) {
2507 htmlErrMemory(ctxt, "buffer allocation failed\n");
2508 return(NULL);
2509 }
2510 out = buffer;
2511
2512 /*
2513 * Ok loop until we reach one of the ending chars
2514 */
2515 while ((CUR != 0) && (CUR != stop)) {
2516 if ((stop == 0) && (CUR == '>')) break;
2517 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2518 if (CUR == '&') {
2519 if (NXT(1) == '#') {
2520 unsigned int c;
2521 int bits;
2522
2523 c = htmlParseCharRef(ctxt);
2524 if (c < 0x80)
2525 { *out++ = c; bits= -6; }
2526 else if (c < 0x800)
2527 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2528 else if (c < 0x10000)
2529 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2530 else
2531 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2532
2533 for ( ; bits >= 0; bits-= 6) {
2534 *out++ = ((c >> bits) & 0x3F) | 0x80;
2535 }
2536
2537 if (out - buffer > buffer_size - 100) {
2538 int indx = out - buffer;
2539
2540 growBuffer(buffer);
2541 out = &buffer[indx];
2542 }
2543 } else {
2544 ent = htmlParseEntityRef(ctxt, &name);
2545 if (name == NULL) {
2546 *out++ = '&';
2547 if (out - buffer > buffer_size - 100) {
2548 int indx = out - buffer;
2549
2550 growBuffer(buffer);
2551 out = &buffer[indx];
2552 }
2553 } else if (ent == NULL) {
2554 *out++ = '&';
2555 cur = name;
2556 while (*cur != 0) {
2557 if (out - buffer > buffer_size - 100) {
2558 int indx = out - buffer;
2559
2560 growBuffer(buffer);
2561 out = &buffer[indx];
2562 }
2563 *out++ = *cur++;
2564 }
2565 } else {
2566 unsigned int c;
2567 int bits;
2568
2569 if (out - buffer > buffer_size - 100) {
2570 int indx = out - buffer;
2571
2572 growBuffer(buffer);
2573 out = &buffer[indx];
2574 }
2575 c = ent->value;
2576 if (c < 0x80)
2577 { *out++ = c; bits= -6; }
2578 else if (c < 0x800)
2579 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2580 else if (c < 0x10000)
2581 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2582 else
2583 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2584
2585 for ( ; bits >= 0; bits-= 6) {
2586 *out++ = ((c >> bits) & 0x3F) | 0x80;
2587 }
2588 }
2589 }
2590 } else {
2591 unsigned int c;
2592 int bits, l;
2593
2594 if (out - buffer > buffer_size - 100) {
2595 int indx = out - buffer;
2596
2597 growBuffer(buffer);
2598 out = &buffer[indx];
2599 }
2600 c = CUR_CHAR(l);
2601 if (c < 0x80)
2602 { *out++ = c; bits= -6; }
2603 else if (c < 0x800)
2604 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2605 else if (c < 0x10000)
2606 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2607 else
2608 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2609
2610 for ( ; bits >= 0; bits-= 6) {
2611 *out++ = ((c >> bits) & 0x3F) | 0x80;
2612 }
2613 NEXT;
2614 }
2615 }
2616 *out = 0;
2617 return(buffer);
2618 }
2619
2620 /**
2621 * htmlParseEntityRef:
2622 * @ctxt: an HTML parser context
2623 * @str: location to store the entity name
2624 *
2625 * parse an HTML ENTITY references
2626 *
2627 * [68] EntityRef ::= '&' Name ';'
2628 *
2629 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2630 * if non-NULL *str will have to be freed by the caller.
2631 */
2632 const htmlEntityDesc *
2633 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2634 const xmlChar *name;
2635 const htmlEntityDesc * ent = NULL;
2636
2637 if (str != NULL) *str = NULL;
2638 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2639
2640 if (CUR == '&') {
2641 NEXT;
2642 name = htmlParseName(ctxt);
2643 if (name == NULL) {
2644 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2645 "htmlParseEntityRef: no name\n", NULL, NULL);
2646 } else {
2647 GROW;
2648 if (CUR == ';') {
2649 if (str != NULL)
2650 *str = name;
2651
2652 /*
2653 * Lookup the entity in the table.
2654 */
2655 ent = htmlEntityLookup(name);
2656 if (ent != NULL) /* OK that's ugly !!! */
2657 NEXT;
2658 } else {
2659 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2660 "htmlParseEntityRef: expecting ';'\n",
2661 NULL, NULL);
2662 if (str != NULL)
2663 *str = name;
2664 }
2665 }
2666 }
2667 return(ent);
2668 }
2669
2670 /**
2671 * htmlParseAttValue:
2672 * @ctxt: an HTML parser context
2673 *
2674 * parse a value for an attribute
2675 * Note: the parser won't do substitution of entities here, this
2676 * will be handled later in xmlStringGetNodeList, unless it was
2677 * asked for ctxt->replaceEntities != 0
2678 *
2679 * Returns the AttValue parsed or NULL.
2680 */
2681