- Fix KiDispatchException to unmask KI_EXCEPTION_INTERNAL when setting the exception...
[reactos.git] / irc / TechBot / CHMLibrary / CHMDecoding / HHCParser.cs
1 using System;
2 using System.Collections;
3 using System.Text;
4 using System.Text.RegularExpressions;
5
6 namespace HtmlHelp.ChmDecoding
7 {
8 /// <summary>
9 /// The class <c>HHCParser</c> implements a parser for HHC contents files.
10 /// </summary>
11 internal sealed class HHCParser
12 {
/// <summary>
/// Pattern matching an opening ul-tag of the sitemap.
/// Each match is replaced by a '(' so the tree can be parsed as nested parentheses.
/// </summary>
private const string RE_ULOpening = @"\<ul\>";
/// <summary>
/// Pattern matching a closing ul-tag of the sitemap.
/// Each match is replaced by a ')' so the tree can be parsed as nested parentheses.
/// </summary>
private const string RE_ULClosing = @"\</ul\>";

/// <summary>
/// Pattern matching an opening ul-tag, everything that follows (greedy, captured in the
/// group <c>innerText</c>) and a closing ul-tag.
/// </summary>
private const string RE_ULBoundaries = @"\<ul\>(?<innerText>.*)\</ul\>";
/// <summary>
/// Pattern matching one balanced "( ... )" group including all groups nested inside it.
/// The pattern contains layout whitespace and is therefore compiled with
/// <c>RegexOptions.IgnorePatternWhitespace</c>.
/// </summary>
private const string RE_NestedBoundaries = @"\( (?> [^()]+ | \( (?<DEPTH>) | \) (?<-DEPTH>) )* (?(DEPTH)(?!)) \)";
/// <summary>
/// Pattern matching one object-tag; its content is captured in the group <c>innerText</c>.
/// </summary>
private const string RE_ObjectBoundaries = @"\<object(?<innerText>.*?)\</object\>";
/// <summary>
/// Pattern matching one param-tag; its attribute text is captured in the group <c>innerText</c>.
/// </summary>
private const string RE_ParamBoundaries = @"\<param(?<innerText>.*?)\>";
/// <summary>
/// Pattern extracting one tag attribute. Groups: <c>attributeName</c>, <c>attributeTD</c>
/// (the optional quote character) and <c>attributeValue</c> (the value including its
/// terminating quote or delimiter).
/// </summary>
private const string RE_QuoteAttributes = @"( |\t)*(?<attributeName>[\-a-zA-Z0-9]*)( |\t)*=( |\t)*(?<attributeTD>[\""\'])?(?<attributeValue>.*?(?(attributeTD)\k<attributeTD>|([\s>]|.$)))";

/// <summary>
/// Private regular expression objects built from the patterns above.
/// They are created by <c>ParseHHC</c> (all of them) and by <c>HasGlobalObjectTag</c>
/// (<c>ulRE</c> and <c>ObjectRE</c>); the private parse helpers expect them to exist.
/// </summary>
private static Regex ulRE;
private static Regex NestedRE;
private static Regex ObjectRE;
private static Regex ParamRE;
private static Regex AttributesRE;

/// <summary>
/// Internal member storing the list of TOCItems which are holding merge links.
/// Stays <c>null</c> until the first merge link is found.
/// </summary>
private static ArrayList _mergeItems = null;

/// <summary>
/// Internal member storing the last read regular topic item.
/// This is used to handle "Merge" entries and add them as child to this instance.
/// </summary>
private static TOCItem _lastTopicItem = null;
59
/// <summary>
/// Parses a HHC file and returns an ArrayList with the table of contents (TOC) tree.
/// </summary>
/// <remarks>
/// The static parser state (merge item list and last topic item) is reset at the start of
/// every call, so <c>HasMergeLinks</c> and <c>MergeItems</c> always describe the most recent parse.
/// If the text contains no ul-block, an empty list is returned.
/// </remarks>
/// <param name="hhcFile">string content of the hhc file</param>
/// <param name="chmFile">CHMFile instance</param>
/// <returns>Returns an ArrayList with the table of contents (TOC) tree</returns>
public static ArrayList ParseHHC(string hhcFile, CHMFile chmFile)
{
	_lastTopicItem = null;
	_mergeItems = null; // clear merged item list
	ArrayList tocList = new ArrayList();

	// The patterns never change, so each compiled expression is built only once and then
	// reused; compiling a Regex with RegexOptions.Compiled on every call is expensive.
	const RegexOptions commonOptions = RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline;

	if (ulRE == null)
	{
		ulRE = new Regex(RE_ULBoundaries, commonOptions);
	}
	if (NestedRE == null)
	{
		NestedRE = new Regex(RE_NestedBoundaries, RegexOptions.IgnorePatternWhitespace | commonOptions);
	}
	if (ObjectRE == null)
	{
		ObjectRE = new Regex(RE_ObjectBoundaries, commonOptions);
	}
	if (ParamRE == null)
	{
		ParamRE = new Regex(RE_ParamBoundaries, commonOptions);
	}
	if (AttributesRE == null)
	{
		AttributesRE = new Regex(RE_QuoteAttributes, commonOptions);
	}

	// Run the expression once and test Success instead of IsMatch followed by Match.
	Match m = ulRE.Match(hhcFile, 0);

	if (!m.Success)
	{
		return tocList;
	}

	// The leftmost match of the ul-pattern starts at the first <ul> tag of the text, so the
	// match index is the position of the first <ul>; no lower-cased copy of the file is needed.
	int nFirstUL = m.Index;

	// An object block that starts before the first <ul> contains information types and categories.
	Match mO = ObjectRE.Match(hhcFile, 0);

	if (mO.Success)
	{
		Group globalGroup = mO.Groups["innerText"];

		if (globalGroup.Index <= nFirstUL)
		{
			ParseGlobalSettings(globalGroup.Value, chmFile);
		}
	}

	// parse toc tree
	string innerText = m.Groups["innerText"].Value;

	// Literal parentheses are escaped as HTML entities first, so that afterwards the only
	// '(' and ')' characters in the text are the ones standing for <ul> and </ul>.
	innerText = innerText.Replace("(", "&#040;");
	innerText = innerText.Replace(")", "&#041;");
	innerText = Regex.Replace(innerText, RE_ULOpening, "(", RegexOptions.IgnoreCase);
	innerText = Regex.Replace(innerText, RE_ULClosing, ")", RegexOptions.IgnoreCase);

	ParseTree(innerText, null, tocList, chmFile);

	return tocList;
}
116
/// <summary>
/// Checks if the hhc file contains a global object tag, i.e. an object block that starts
/// at or before the first ul-tag of the file.
/// </summary>
/// <param name="hhcFile">string content of the hhc file</param>
/// <param name="chmFile">chm file (not used by this check)</param>
/// <returns>true if the hhc content contains a global object tag</returns>
public static bool HasGlobalObjectTag(string hhcFile, CHMFile chmFile)
{
	// The patterns never change, so the compiled expressions are built only once and reused.
	const RegexOptions commonOptions = RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline;

	if (ulRE == null)
	{
		ulRE = new Regex(RE_ULBoundaries, commonOptions);
	}
	if (ObjectRE == null)
	{
		ObjectRE = new Regex(RE_ObjectBoundaries, commonOptions);
	}

	// Without a ul-block there is no TOC and therefore no global object tag to report.
	Match m = ulRE.Match(hhcFile, 0);

	if (!m.Success)
	{
		return false;
	}

	Match mO = ObjectRE.Match(hhcFile, 0);

	if (!mO.Success)
	{
		return false;
	}

	// The leftmost match of the ul-pattern starts at the first <ul> tag, so m.Index is the
	// position of the first <ul>. The first object block is global when its content starts
	// at or before that position.
	return mO.Groups["innerText"].Index <= m.Index;
}
157
/// <summary>
/// Gets a value indicating whether the merge item list exists and holds at least one item,
/// i.e. whether parsing registered any merge-links.
/// </summary>
public static bool HasMergeLinks
{
	get { return (_mergeItems != null) && (_mergeItems.Count > 0); }
}
171
/// <summary>
/// Gets the list of all TOCItem references which are holding merge-links.
/// The value is <c>null</c> when no merge-link has been registered.
/// </summary>
public static ArrayList MergeItems
{
	get
	{
		return _mergeItems;
	}
}
179
/// <summary>
/// Recursively parses a sitemap tree in which the nesting is expressed by parentheses.
/// </summary>
/// <remarks>
/// The text in front of every balanced "( ... )" group is parsed as a list of items; the
/// content of the group is then parsed as the children of the last item in
/// <paramref name="arrNodes"/>. Text after the last group is handled by a further call.
/// </remarks>
/// <param name="text">content text</param>
/// <param name="parent">Parent for all read items</param>
/// <param name="arrNodes">arraylist which receives the extracted nodes</param>
/// <param name="chmFile">CHMFile instance</param>
private static void ParseTree( string text, TOCItem parent, ArrayList arrNodes, CHMFile chmFile )
{
	int cursor = 0;

	for (Match group = NestedRE.Match(text, cursor); group.Success; group = NestedRE.Match(text, cursor))
	{
		// items located between the previous group (or the start) and this group
		string leadingItems = text.Substring(cursor, group.Index - cursor);
		ParseItems(leadingItems, parent, arrNodes, chmFile);

		// group content without the enclosing parentheses
		string groupContent = text.Substring(group.Index + 1, group.Length - 2);

		if ((groupContent.Length > 0) && (arrNodes.Count > 0))
		{
			TOCItem owner = (TOCItem)arrNodes[arrNodes.Count - 1];
			ParseTree(groupContent, owner, owner.Children, chmFile);
		}

		cursor = group.Index + group.Length;
	}

	if (cursor == 0)
	{
		// no nested group at all: the whole text is a flat list of items
		ParseItems(text, parent, arrNodes, chmFile);
		return;
	}

	if (cursor < text.Length - 1)
	{
		// remaining text behind the last nested group
		ParseTree(text.Substring(cursor), parent, arrNodes, chmFile);
	}
}
224
/// <summary>
/// Parses tree nodes from the text. Every object-tag found becomes one TOCItem.
/// </summary>
/// <remarks>
/// An item carrying a merge link is attached as a child of the last regular topic item read
/// so far (if there is one) instead of being added to <paramref name="arrNodes"/>; every
/// merge item is also registered in the static merge item list.
/// </remarks>
/// <param name="itemstext">text containing the items</param>
/// <param name="parent">Parent for all read items</param>
/// <param name="arrNodes">arraylist where the nodes should be added</param>
/// <param name="chmFile">CHMFile instance (may be null)</param>
private static void ParseItems( string itemstext, TOCItem parent, ArrayList arrNodes, CHMFile chmFile)
{
	int innerTextIdx = ObjectRE.GroupNumberFromName("innerText");
	int innerPTextIdx = ParamRE.GroupNumberFromName("innerText");

	// Each expression is evaluated once per step (Match + Success) instead of IsMatch followed by Match.
	for (Match m = ObjectRE.Match(itemstext, 0); m.Success; m = ObjectRE.Match(itemstext, m.Index + m.Length))
	{
		string innerText = m.Groups[innerTextIdx].Value;

		TOCItem tocItem = new TOCItem();
		tocItem.TocMode = DataMode.TextBased;
		tocItem.AssociatedFile = chmFile;
		tocItem.Parent = parent;

		// read parameters
		for (Match mP = ParamRE.Match(innerText, 0); mP.Success; mP = ParamRE.Match(innerText, mP.Index + mP.Length))
		{
			string paramName;
			string paramValue;

			ReadTocParam(mP.Groups[innerPTextIdx].Value, out paramName, out paramValue);

			tocItem.Params[paramName] = paramValue;

			// Invariant lower-casing: with a culture-sensitive ToLower() a name such as
			// "ImageNumber" does not become "imagenumber" under Turkish casing rules.
			switch (paramName.ToLowerInvariant())
			{
				case "name":
					tocItem.Name = paramValue;
					break;
				case "local":
					tocItem.Local = paramValue.Replace("../", "").Replace("./", "");
					break;
				case "imagenumber":
					ApplyTocImageNumber(tocItem, paramValue, chmFile);
					break;
				case "merge": // this item contains topics or a full TOC from a merged CHM
					tocItem.MergeLink = paramValue;

					// "register" this item as merge-link
					if (_mergeItems == null)
					{
						_mergeItems = new ArrayList();
					}

					_mergeItems.Add(tocItem);
					break;
				case "type": // information type assignment for item
					tocItem.InfoTypeStrings.Add(paramValue);
					break;
			}
		}

		// chmFile is treated as optional elsewhere in this method, so guard the dereference too.
		if (chmFile != null)
		{
			tocItem.ChmFile = chmFile.ChmFilePath;
		}

		if (tocItem.MergeLink.Length > 0)
		{
			if (_lastTopicItem != null)
			{
				tocItem.Parent = _lastTopicItem;
				_lastTopicItem.Children.Add(tocItem);
			}
			else
			{
				arrNodes.Add(tocItem);
			}
		}
		else
		{
			_lastTopicItem = tocItem;
			arrNodes.Add(tocItem);
		}
	}
}

/// <summary>
/// Reads the "name" and "value" attributes out of the attribute text of one param-tag.
/// </summary>
/// <param name="innerP">attribute text of the param-tag</param>
/// <param name="paramName">receives the HTML-decoded name attribute ("" if missing)</param>
/// <param name="paramValue">receives the HTML-decoded value attribute without trailing '/' characters ("" if missing)</param>
private static void ReadTocParam(string innerP, out string paramName, out string paramValue)
{
	paramName = "";
	paramValue = "";

	// get group-name indexes
	int nameIndex = AttributesRE.GroupNumberFromName("attributeName");
	int valueIndex = AttributesRE.GroupNumberFromName("attributeValue");
	int tdIndex = AttributesRE.GroupNumberFromName("attributeTD");

	for (Match mA = AttributesRE.Match(innerP, 0); mA.Success; mA = AttributesRE.Match(innerP, mA.Index + mA.Length))
	{
		string attributeName = mA.Groups[nameIndex].Value.ToLowerInvariant();
		string attributeValue = mA.Groups[valueIndex].Value;
		string attributeTD = mA.Groups[tdIndex].Value;

		// The captured value of a quoted attribute still ends with the quote character:
		// delete the trailing textqualifier.
		if ((attributeTD.Length > 0) && (attributeValue.Length > 0))
		{
			int ltqi = attributeValue.LastIndexOf(attributeTD);

			if (ltqi >= 0)
			{
				attributeValue = attributeValue.Substring(0, ltqi);
			}
		}

		if (attributeName == "name")
		{
			paramName = HttpUtility.HtmlDecode(attributeValue); // for unicode encoded values
		}

		if (attributeName == "value")
		{
			paramValue = HttpUtility.HtmlDecode(attributeValue); // for unicode encoded values

			// delete trailing /
			while ((paramValue.Length > 0) && (paramValue[paramValue.Length - 1] == '/'))
			{
				paramValue = paramValue.Substring(0, paramValue.Length - 1);
			}
		}
	}
}

/// <summary>
/// Converts the value of an "ImageNumber" param into the image index of a TOC item.
/// </summary>
/// <param name="tocItem">item receiving the image index</param>
/// <param name="paramValue">text value of the ImageNumber param</param>
/// <param name="chmFile">CHMFile instance (may be null)</param>
private static void ApplyTocImageNumber(TOCItem tocItem, string paramValue, CHMFile chmFile)
{
	int imageNumber;

	// An empty or malformed image number must not abort parsing of the whole TOC;
	// in that case the item simply keeps its current image index.
	if (!Int32.TryParse(paramValue, out imageNumber))
	{
		return;
	}

	tocItem.ImageIndex = imageNumber;
	tocItem.ImageIndex -= 1;

	int nFolderAdd = 0;

	if ((chmFile != null) && (chmFile.ImageTypeFolder))
	{
		// get the value which should be added, to display folders instead of books
		nFolderAdd = HtmlHelpSystem.UseHH2TreePics ? 8 : 4;
	}

	if (tocItem.ImageIndex == 1)
	{
		tocItem.ImageIndex = 0;
	}

	if (HtmlHelpSystem.UseHH2TreePics && (tocItem.ImageIndex == 0))
	{
		tocItem.ImageIndex = TOCItem.STD_FOLDER_HH2 + nFolderAdd;
	}
}
385
/// <summary>
/// Parses the very first &lt;OBJECT&gt; tag in the sitemap file and extracts
/// information types and categories.
/// </summary>
/// <remarks>
/// A description param ("savetypedesc", "categorydesc", "typedesc") completes the entry
/// whose name was given by the preceding name param. Information types and categories
/// that the CHM file does not know yet are added to it; an instance already known to
/// <c>chmFile.SystemInstance</c> is reused instead of creating a new one.
/// </remarks>
/// <param name="sText">text of the object tag</param>
/// <param name="chmFile">CHMFile instance; if null, nothing is parsed</param>
private static void ParseGlobalSettings(string sText, CHMFile chmFile)
{
	// All results are stored in the CHM file, so without one there is nothing to do.
	if (chmFile == null)
	{
		return;
	}

	int innerPTextIdx = ParamRE.GroupNumberFromName("innerText");

	// get group-name indexes
	int nameIndex = AttributesRE.GroupNumberFromName("attributeName");
	int valueIndex = AttributesRE.GroupNumberFromName("attributeValue");
	int tdIndex = AttributesRE.GroupNumberFromName("attributeTD");

	// kind of the previously read name param:
	// 0... unknown
	// 1... inclusive info type name
	// 2... exclusive info type name
	// 3... hidden info type name
	// 4... category name
	// 5... incl infotype name for category
	// 6... excl infotype name for category
	// 7... hidden infotype name for category
	// Only the values 1 to 3 are evaluated (by "savetypedesc").
	int prevItem = 0;

	string sName = "";
	string sDescription = "";
	string curCategory = "";

	// read parameters; each expression is evaluated once per step (Match + Success)
	for (Match mP = ParamRE.Match(sText, 0); mP.Success; mP = ParamRE.Match(sText, mP.Index + mP.Length))
	{
		string innerP = mP.Groups[innerPTextIdx].Value;

		string paramName = "";
		string paramValue = "";

		for (Match mA = AttributesRE.Match(innerP, 0); mA.Success; mA = AttributesRE.Match(innerP, mA.Index + mA.Length))
		{
			string attributeName = mA.Groups[nameIndex].Value.ToLowerInvariant();
			string attributeValue = mA.Groups[valueIndex].Value;
			string attributeTD = mA.Groups[tdIndex].Value;

			// The captured value of a quoted attribute still ends with the quote character:
			// delete the trailing textqualifier.
			if ((attributeTD.Length > 0) && (attributeValue.Length > 0))
			{
				int ltqi = attributeValue.LastIndexOf(attributeTD);

				if (ltqi >= 0)
				{
					attributeValue = attributeValue.Substring(0, ltqi);
				}
			}

			if (attributeName == "name")
			{
				paramName = HttpUtility.HtmlDecode(attributeValue); // for unicode encoded values
			}

			if (attributeName == "value")
			{
				paramValue = HttpUtility.HtmlDecode(attributeValue); // for unicode encoded values

				// delete trailing /
				while ((paramValue.Length > 0) && (paramValue[paramValue.Length - 1] == '/'))
				{
					paramValue = paramValue.Substring(0, paramValue.Length - 1);
				}
			}
		}

		// Invariant lower-casing keeps the comparison independent of the current culture
		// (a culture-sensitive ToLower() maps 'I' to a dotless i under Turkish casing rules).
		switch (paramName.ToLowerInvariant())
		{
			case "savetype": // inclusive information type name
				prevItem = 1;
				sName = paramValue;
				break;
			case "savetypedesc": // description of information type
			{
				sDescription = paramValue;

				InformationTypeMode mode = InformationTypeMode.Inclusive;

				if (prevItem == 2)
				{
					mode = InformationTypeMode.Exclusive;
				}
				else if (prevItem == 3)
				{
					mode = InformationTypeMode.Hidden;
				}

				if (chmFile.GetInformationType(sName) == null)
				{
					// check if the HtmlHelpSystem already holds such an information type
					InformationType infoTypeToAdd = chmFile.SystemInstance.GetInformationType(sName);

					if (infoTypeToAdd == null)
					{
						// info type not found yet
						infoTypeToAdd = new InformationType(sName, sDescription, mode);
					}

					chmFile.InformationTypes.Add(infoTypeToAdd);
				}

				prevItem = 0;
				break;
			}
			case "saveexclusive": // exclusive information type name
				prevItem = 2;
				sName = paramValue;
				break;
			case "savehidden": // hidden information type name
				prevItem = 3;
				sName = paramValue;
				break;
			case "category": // category name
				prevItem = 4;
				sName = paramValue;
				curCategory = sName;
				break;
			case "categorydesc": // category description
			{
				sDescription = paramValue;

				if (chmFile.GetCategory(sName) == null)
				{
					// check if the HtmlHelpSystem already holds such a category
					Category categoryToAdd = chmFile.SystemInstance.GetCategory(sName);

					if (categoryToAdd == null)
					{
						// add category
						categoryToAdd = new Category(sName, sDescription);
					}

					chmFile.Categories.Add(categoryToAdd);
				}

				prevItem = 0;
				break;
			}
			case "type": // inclusive information type which is member of the previously read category
				prevItem = 5;
				sName = paramValue;
				break;
			case "typedesc": // description of type for category
			{
				sDescription = paramValue;
				Category cat = chmFile.GetCategory(curCategory);

				if (cat != null)
				{
					// category found
					InformationType infoType = chmFile.GetInformationType(sName);

					if ((infoType != null) && !cat.ContainsInformationType(infoType))
					{
						infoType.SetCategoryFlag(true);
						cat.AddInformationType(infoType);
					}
				}

				prevItem = 0;
				break;
			}
			case "typeexclusive": // exclusive information type which is member of the previously read category
				prevItem = 6;
				sName = paramValue;
				break;
			case "typehidden": // hidden information type which is member of the previously read category
				prevItem = 7;
				sName = paramValue;
				break;
			default:
				prevItem = 0;
				sName = "";
				sDescription = "";
				break;
		}
	}
}
592 }
593 }