2 using System.Collections;
4 using System.Text.RegularExpressions;
6 namespace HtmlHelp.ChmDecoding
9 /// The class <c>HHCParser</c> implements a parser for HHC contents files.
11 internal sealed class HHCParser
14 /// regular expressions for replacing the sitemap boundary tags
/// RE_ULOpening and RE_ULClosing are handed to Regex.Replace in ParseHHC so that every
/// &lt;ul&gt; becomes '(' and every &lt;/ul&gt; becomes ')'; the resulting parenthesised text
/// is what RE_NestedBoundaries is matched against.
16 private static string RE_ULOpening = @"\<ul\>"; // will be replaced by a '(' for nested parsing
17 private static string RE_ULClosing = @"\</ul\>"; // will be replaced by a ')' for nested parsing
/// Captures, as group "innerText", everything between the first &lt;ul&gt; and the last
/// &lt;/ul&gt;: the .* is greedy and the pattern is compiled with RegexOptions.Singleline,
/// so the capture spans line breaks.
22 private static string RE_ULBoundaries = @"\<ul\>(?<innerText>.*)\</ul\>";
24 /// Matching the nested tree structure.
/// Balancing-group pattern: DEPTH is pushed for every inner '(' and popped for every inner ')',
/// and the trailing conditional fails the match while DEPTH is still non-empty, so one match
/// is one balanced parenthesised group. The blanks inside the pattern are insignificant
/// because it is compiled with RegexOptions.IgnorePatternWhitespace.
26 private static string RE_NestedBoundaries = @"\( (?> [^()]+ | \( (?<DEPTH>) | \) (?<-DEPTH>) )* (?(DEPTH)(?!)) \)";
28 /// Matching object-tags
/// Lazily captures, as "innerText", the text between "&lt;object" and the next "&lt;/object&gt;".
30 private static string RE_ObjectBoundaries = @"\<object(?<innerText>.*?)\</object\>";
32 /// Matching param tags
/// Lazily captures, as "innerText", the text between "&lt;param" and the next '&gt;'.
34 private static string RE_ParamBoundaries = @"\<param(?<innerText>.*?)\>";
36 /// Extracting tag attributes
/// Matches one name=value pair. Groups: "attributeName", "attributeTD" (the optional opening
/// quote, either " or ') and "attributeValue". When a quote was matched, the closing quote is
/// part of "attributeValue"; otherwise the terminating character is. This is why the callers
/// cut the value at the last occurrence of the delimiter.
// NOTE(review): this is the only pattern declared const; the others above are mutable static
// strings although nothing visible in this listing reassigns them.
38 private const string RE_QuoteAttributes = @"( |\t)*(?<attributeName>[\-a-zA-Z0-9]*)( |\t)*=( |\t)*(?<attributeTD>[\""\'])?(?<attributeValue>.*?(?(attributeTD)\k<attributeTD>|([\s>]|.$)))";
41 /// private regular expression objects
/// All five fields are assigned at the start of ParseHHC; HasGlobalObjectTag re-assigns only
/// ulRE and ObjectRE. ParseTree, ParseItems and ParseGlobalSettings read them without a null check.
// NOTE(review): these are shared mutable statics that are rebuilt on every call - presumably the
// parser is only ever used from one thread at a time; TODO confirm.
43 private static Regex ulRE;
44 private static Regex NestedRE;
45 private static Regex ObjectRE;
46 private static Regex ParamRE;
47 private static Regex AttributesRE;
50 /// Internal member storing the list of TOCItems which are holding merge links
/// Reset to null by ParseHHC and created lazily by ParseItems when the first "merge" param is read,
/// so it stays null when a parse found no merge links.
52 private static ArrayList _mergeItems = null;
55 /// Internal member storing the last read regular topic item.
56 /// This is used to handle "Merge" entries and add them as child to this instance.
/// Reset to null by ParseHHC and updated by ParseItems.
58 private static TOCItem _lastTopicItem = null;
61 /// Parses a HHC file and returns an ArrayList with the table of contents (TOC) tree
/// Every call resets the static parser state (_lastTopicItem, _mergeItems) and rebuilds all five
/// static Regex fields before parsing. If the first object block starts at or before the first
/// list tag, it is passed to ParseGlobalSettings; the text between the outermost ul tags is then
/// converted to a parenthesised form and handed to ParseTree.
63 /// <param name="hhcFile">string content of the hhc file</param>
64 /// <param name="chmFile">CHMFile instance</param>
65 /// <returns>Returns an ArrayList with the table of contents (TOC) tree</returns>
// NOTE(review): this listing omits short source lines (braces, short declarations, the return
// statement), so the exact nesting of the statements below must be checked against the full file.
66 public static ArrayList ParseHHC(string hhcFile, CHMFile chmFile)
68 _lastTopicItem = null;
69 _mergeItems = null; // clear merged item list
70 ArrayList tocList = new ArrayList();
// The regex objects are static but are re-created (with RegexOptions.Compiled) on each call.
72 ulRE = new Regex(RE_ULBoundaries, RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
73 NestedRE = new Regex(RE_NestedBoundaries, RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
74 ObjectRE = new Regex(RE_ObjectBoundaries, RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
75 ParamRE = new Regex(RE_ParamBoundaries, RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
76 AttributesRE = new Regex(RE_QuoteAttributes, RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
// NOTE(review): innerTextIdx is not referenced by any line visible in this listing; the group is
// looked up by name further down instead.
78 int innerTextIdx = ulRE.GroupNumberFromName("innerText");
// IsMatch followed by Match evaluates the same pattern twice on the same input.
80 if( ulRE.IsMatch(hhcFile, 0) )
82 Match m = ulRE.Match(hhcFile, 0);
// Position of the first list tag; the declaration of nFirstUL is not visible in this listing.
86 nFirstUL = hhcFile.ToLower().IndexOf("<ul>");
// NOTE(review): as listed, this assignment directly follows the previous one; a guard (e.g. a
// "not found" check) is presumably among the omitted lines - TODO confirm. "<il>" is not an HTML
// tag name; "<li>" may have been intended.
89 nFirstUL = hhcFile.ToLower().IndexOf("<il>");
91 if( ObjectRE.IsMatch(hhcFile, 0) ) // first object block contains information types and categories
93 Match mO = ObjectRE.Match(hhcFile, 0);
94 int iOTxt = ObjectRE.GroupNumberFromName("innerText");
96 string globalText = mO.Groups[iOTxt].Value;
// Only an object block whose inner text starts at or before the first list tag is treated as
// the global settings block.
98 if( mO.Groups[iOTxt].Index <= nFirstUL)
99 ParseGlobalSettings( globalText, chmFile );
103 string innerText = m.Groups["innerText"].Value;
// NOTE(review): as written, both Replace calls substitute a character with itself and change
// nothing, so literal parentheses inside the sitemap text are indistinguishable from the '(' / ')'
// markers produced below. Presumably the replacement strings were meant to be an escaped form
// (e.g. HTML numeric entities, which the later HtmlDecode calls would turn back) - TODO confirm.
105 innerText = innerText.Replace("(", "(");
106 innerText = innerText.Replace(")", ")");
// Turn the ul nesting into parentheses so that NestedRE can match balanced groups.
107 innerText = Regex.Replace(innerText, RE_ULOpening, "(", RegexOptions.IgnoreCase);
108 innerText = Regex.Replace(innerText, RE_ULClosing, ")", RegexOptions.IgnoreCase);
// Root call: no parent item, nodes are collected into tocList.
110 ParseTree( innerText, null, tocList, chmFile );
118 /// Checks if the hhc file contains a global object tag.
/// Uses the same test as ParseHHC: the inner text of the first object block must start at or
/// before the first list tag. As a side effect the static ulRE and ObjectRE fields are re-created.
120 /// <param name="hhcFile">string content of the hhc file</param>
121 /// <param name="chmFile">chm file</param>
122 /// <returns>true if the hhc content contains a global object tag</returns>
// NOTE(review): chmFile is not used by any line visible in this listing, and the return statements
// are among the lines this listing omits.
123 public static bool HasGlobalObjectTag(string hhcFile, CHMFile chmFile)
127 ulRE = new Regex(RE_ULBoundaries, RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
128 ObjectRE = new Regex(RE_ObjectBoundaries, RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
// NOTE(review): innerTextIdx, m and globalText are assigned below but not read in the visible lines.
130 int innerTextIdx = ulRE.GroupNumberFromName("innerText");
132 if( ulRE.IsMatch(hhcFile, 0) )
134 Match m = ulRE.Match(hhcFile, 0);
// Position of the first list tag; the declaration of nFirstUL is not visible in this listing.
138 nFirstUL = hhcFile.ToLower().IndexOf("<ul>");
// NOTE(review): same construct as in ParseHHC - a guard before this line is presumably omitted
// from the listing, and "<il>" is not an HTML tag name ("<li>" may have been intended).
141 nFirstUL = hhcFile.ToLower().IndexOf("<il>");
143 if( ObjectRE.IsMatch(hhcFile, 0) ) // first object block contains information types and categories
145 Match mO = ObjectRE.Match(hhcFile, 0);
146 int iOTxt = ObjectRE.GroupNumberFromName("innerText");
148 string globalText = mO.Groups[iOTxt].Value;
// The object block counts as "global" when it starts at or before the first list tag.
150 if( mO.Groups[iOTxt].Index <= nFirstUL)
159 /// Gets true if the previously done parsing found merge-links
/// _mergeItems is null until ParseItems registers the first merge link, hence the null test
/// before Count is read.
161 public static bool HasMergeLinks
// The statement executed when the list is null is not visible in this listing.
165 if(_mergeItems==null)
168 return _mergeItems.Count > 0;
173 /// Gets all TOCItem references which are holding merge-links
/// Returns the internal list itself (no copy). The value is null when the last ParseHHC call
/// found no merge links, because the list is only created on demand.
175 public static ArrayList MergeItems
177 get { return _mergeItems; }
181 /// Recursively parses a sitemap tree
/// The text is expected in the parenthesised form produced by ParseHHC. For every balanced
/// group found by NestedRE, the text in front of the group is parsed as sibling items and the
/// group's content is parsed recursively as children of the last item added to arrNodes.
183 /// <param name="text">content text</param>
184 /// <param name="parent">Parent for all read items</param>
185 /// <param name="arrNodes">arraylist which receives the extracted nodes</param>
186 /// <param name="chmFile">CHMFile instance</param>
187 private static void ParseTree( string text, TOCItem parent, ArrayList arrNodes, CHMFile chmFile )
189 string strPreItems="", strPostItems="";
190 string innerText = "";
// nIndex is the scan position inside text; its declaration is not visible in this listing.
194 while( NestedRE.IsMatch(text, nIndex) )
196 Match m = NestedRE.Match(text, nIndex);
// Drop the enclosing '(' and ')' of the matched group.
198 innerText = m.Value.Substring( 1, m.Length-2);
// Items located between the current scan position and the start of the group.
200 strPreItems = text.Substring(nIndex,m.Index-nIndex);
202 ParseItems(strPreItems, parent, arrNodes, chmFile);
// The group content becomes the children of the most recently added node; when no node
// exists yet the group content is not parsed.
204 if((arrNodes.Count>0) && (innerText.Length > 0) )
206 TOCItem p = ((TOCItem)(arrNodes[arrNodes.Count-1]));
207 ParseTree( innerText, p, p.Children, chmFile );
// Continue scanning right after the matched group.
210 nIndex = m.Index+m.Length;
// NOTE(review): the condition guarding the next two statements is among the lines omitted from
// this listing; the "else if" below is its alternative branch.
215 strPostItems = text.Substring(nIndex, text.Length-nIndex);
216 ParseItems(strPostItems, parent, arrNodes, chmFile);
// Text remaining after the last matched group is handed back to ParseTree.
218 else if( nIndex < text.Length-1)
220 strPostItems = text.Substring(nIndex, text.Length-nIndex);
221 ParseTree(strPostItems, parent, arrNodes, chmFile);
226 /// Parses tree nodes from the text
/// Creates one TOCItem per object block found in the text. Each param tag of the block
/// contributes a name/value pair which is stored in the item's Params and, for known names,
/// mapped to a dedicated property. Items carrying a merge link are registered in _mergeItems.
228 /// <param name="itemstext">text containing the items</param>
229 /// <param name="parent">Parent for all read items</param>
230 /// <param name="arrNodes">arraylist where the nodes should be added</param>
231 /// <param name="chmFile">CHMFile instance</param>
// NOTE(review): this listing omits short source lines (braces, else, break, most case labels and
// some declarations), so the branch structure described below must be checked against the full file.
232 private static void ParseItems( string itemstext, TOCItem parent, ArrayList arrNodes, CHMFile chmFile)
234 int innerTextIdx = ObjectRE.GroupNumberFromName("innerText");
235 int innerPTextIdx = ParamRE.GroupNumberFromName("innerText");
237 // get group-name indexes
238 int nameIndex = AttributesRE.GroupNumberFromName("attributeName");
239 int valueIndex = AttributesRE.GroupNumberFromName("attributeValue");
240 int tdIndex = AttributesRE.GroupNumberFromName("attributeTD");
242 int nObjStartIndex = 0;
// Outer loop: one iteration per object block.
244 while( ObjectRE.IsMatch(itemstext, nObjStartIndex) )
246 Match m = ObjectRE.Match(itemstext, nObjStartIndex);
248 string innerText = m.Groups[innerTextIdx].Value;
250 TOCItem tocItem = new TOCItem();
251 tocItem.TocMode = DataMode.TextBased;
252 tocItem.AssociatedFile = chmFile;
253 tocItem.Parent = parent;
// Middle loop: one iteration per param tag inside the object block (the declaration of
// nParamIndex is not visible in this listing).
258 while( ParamRE.IsMatch(innerText, nParamIndex) )
260 Match mP = ParamRE.Match(innerText, nParamIndex);
262 string innerP = mP.Groups[innerPTextIdx].Value;
264 string paramName = "";
265 string paramValue = "";
// Inner loop: one iteration per attribute of the param tag (the declaration of nAttrIdx is not
// visible in this listing).
269 while( AttributesRE.IsMatch( innerP, nAttrIdx ) )
271 Match mA = AttributesRE.Match(innerP, nAttrIdx);
273 string attributeName = mA.Groups[nameIndex].Value;
274 string attributeValue = mA.Groups[valueIndex].Value;
275 string attributeTD = mA.Groups[tdIndex].Value;
277 if(attributeTD.Length > 0)
279 // delete the trailing textqualifier
280 if( attributeValue.Length > 0)
282 int ltqi = attributeValue.LastIndexOf( attributeTD );
// NOTE(review): Substring throws for a negative length; a check of ltqi is presumably among the
// omitted lines - TODO confirm.
286 attributeValue = attributeValue.Substring(0,ltqi);
// NOTE(review): ToLower() is culture-sensitive; an ordinal, case-insensitive comparison would
// not depend on the current culture.
291 if( attributeName.ToLower() == "name")
293 paramName = HttpUtility.HtmlDecode(attributeValue); // for unicode encoded values
296 if( attributeName.ToLower() == "value")
298 paramValue = HttpUtility.HtmlDecode(attributeValue); // for unicode encoded values
// Strip every trailing '/' from the value.
300 while((paramValue.Length>0)&&(paramValue[paramValue.Length-1] == '/'))
301 paramValue = paramValue.Substring(0,paramValue.Length-1);
305 nAttrIdx = mA.Index+mA.Length;
// Every param is kept in the generic Params collection, whether or not it is handled below.
308 tocItem.Params[paramName] = paramValue;
// The case labels for the next three groups of statements are not visible in this listing.
309 switch(paramName.ToLower())
313 tocItem.Name = paramValue;
// All "../" and "./" sequences are removed from the local path.
317 tocItem.Local = paramValue.Replace("../", "").Replace("./", "");
// NOTE(review): Int32.Parse throws on non-numeric text - presumably the value is trusted; TODO confirm.
// The parsed number is decremented by one before it is stored as the image index.
321 tocItem.ImageIndex = Int32.Parse(paramValue);
322 tocItem.ImageIndex-=1;
// chmFile is null-checked here ...
326 if((chmFile != null) && (chmFile.ImageTypeFolder))
328 // get the value which should be added, to display folders instead of books
// The declaration of nFolderAdd and the assignments to it are not visible in this listing.
329 if(HtmlHelpSystem.UseHH2TreePics)
// An odd index equal to 1 is reset to 0.
335 if(tocItem.ImageIndex%2 != 0)
337 if(tocItem.ImageIndex==1)
338 tocItem.ImageIndex=0;
340 if(HtmlHelpSystem.UseHH2TreePics)
341 if( tocItem.ImageIndex == 0)
342 tocItem.ImageIndex = TOCItem.STD_FOLDER_HH2+nFolderAdd;
344 case "merge": // this item contains topics or a full TOC from a merged CHM
346 tocItem.MergeLink = paramValue;
348 // "register" this item as merge-link
349 if(_mergeItems==null)
350 _mergeItems=new ArrayList();
352 _mergeItems.Add(tocItem);
355 case "type": // information type assignment for item
357 tocItem.InfoTypeStrings.Add( paramValue );
361 nParamIndex = mP.Index+mP.Length;
// NOTE(review): ... but dereferenced here without a check, so a null chmFile would throw.
364 tocItem.ChmFile = chmFile.ChmFilePath;
// Assumes MergeLink is a non-null string on a freshly created TOCItem - TODO confirm.
// Merge-link items are attached below the last regular topic item when there is one; the
// remaining Add calls are presumably the else branches (the else lines are not visible here).
366 if(tocItem.MergeLink.Length > 0)
368 if(_lastTopicItem != null)
370 tocItem.Parent = _lastTopicItem;
371 _lastTopicItem.Children.Add(tocItem);
374 arrNodes.Add( tocItem );
// A regular item becomes the new "last topic" that later merge items attach to.
378 _lastTopicItem = tocItem;
379 arrNodes.Add( tocItem );
// Continue scanning after the current object block.
382 nObjStartIndex = m.Index+m.Length;
387 /// Parses the very first <OBJECT> tag in the sitemap file and extracts
388 /// information types and categories.
/// Walks the param tags of the block, decodes each name/value pair and registers information
/// types and categories on the CHMFile, re-using instances that chmFile.SystemInstance already knows.
390 /// <param name="sText">text of the object tag</param>
391 /// <param name="chmFile">CHMFile instance</param>
// NOTE(review): this listing omits short source lines (braces, else, break, assignments inside
// most case labels, and the declarations of sName, nParamIndex, nAttrIdx and the numeric mode
// variable documented below), and the end of the method is not visible.
392 private static void ParseGlobalSettings(string sText, CHMFile chmFile)
394 int innerPTextIdx = ParamRE.GroupNumberFromName("innerText");
396 // get group-name indexes
397 int nameIndex = AttributesRE.GroupNumberFromName("attributeName");
398 int valueIndex = AttributesRE.GroupNumberFromName("attributeValue");
399 int tdIndex = AttributesRE.GroupNumberFromName("attributeTD");
// Meaning of the numeric mode values (the variable holding them is not visible in this listing):
405 // 1... inclusive info type name
406 // 2... exclusive info type name
407 // 3... hidden info type name
408 // 4... category name
409 // 5... incl infotype name for category
410 // 6... excl infotype name for category
411 // 7... hidden infotype name for category
415 string sDescription = "";
416 string curCategory = "";
// Outer loop: one iteration per param tag of the global object block.
418 while( ParamRE.IsMatch(sText, nParamIndex) )
420 Match mP = ParamRE.Match(sText, nParamIndex);
422 string innerP = mP.Groups[innerPTextIdx].Value;
424 string paramName = "";
425 string paramValue = "";
// Inner loop: one iteration per attribute of the param tag; same decoding as in ParseItems.
429 while( AttributesRE.IsMatch( innerP, nAttrIdx ) )
431 Match mA = AttributesRE.Match(innerP, nAttrIdx);
433 string attributeName = mA.Groups[nameIndex].Value;
434 string attributeValue = mA.Groups[valueIndex].Value;
435 string attributeTD = mA.Groups[tdIndex].Value;
437 if(attributeTD.Length > 0)
439 // delete the trailing textqualifier
440 if( attributeValue.Length > 0)
442 int ltqi = attributeValue.LastIndexOf( attributeTD );
// NOTE(review): Substring throws for a negative length; a check of ltqi is presumably among the
// omitted lines - TODO confirm.
446 attributeValue = attributeValue.Substring(0,ltqi);
451 if( attributeName.ToLower() == "name")
453 paramName = HttpUtility.HtmlDecode(attributeValue); // for unicode encoded values
456 if( attributeName.ToLower() == "value")
458 paramValue = HttpUtility.HtmlDecode(attributeValue); // for unicode encoded values
// Strip every trailing '/' from the value.
460 while((paramValue.Length>0)&&(paramValue[paramValue.Length-1] == '/'))
461 paramValue = paramValue.Substring(0,paramValue.Length-1);
465 nAttrIdx = mA.Index+mA.Length;
468 switch(paramName.ToLower())
470 case "savetype": // inclusive information type name
475 case "savetypedesc": // description of information type
477 InformationTypeMode mode = InformationTypeMode.Inclusive;
478 sDescription = paramValue;
// The conditions selecting between the three mode assignments are not visible in this listing.
481 mode = InformationTypeMode.Inclusive;
483 mode = InformationTypeMode.Exclusive;
485 mode = InformationTypeMode.Hidden;
// Register the information type only if this CHM file does not know it yet.
487 if( chmFile.GetInformationType( sName ) == null)
489 // check if the HtmlHelpSystem already holds such an information type
490 if( chmFile.SystemInstance.GetInformationType( sName ) == null)
492 // info type not found yet
494 InformationType newType = new InformationType(sName, sDescription, mode);
495 chmFile.InformationTypes.Add(newType);
// Presumably the else branch (the else line is not visible): re-use the instance the system holds.
499 InformationType sysType = chmFile.SystemInstance.GetInformationType( sName );
500 chmFile.InformationTypes.Add( sysType );
506 case "saveexclusive": // exclusive information type name
511 case "savehidden": // hidden information type name
516 case "category": // category name
522 case "categorydesc": // category description
524 sDescription = paramValue;
// Same scheme as for information types: add the category unless the file already has it.
526 if( chmFile.GetCategory( sName ) == null)
528 // check if the HtmlHelpSystem already holds such a category
529 if( chmFile.SystemInstance.GetCategory( sName ) == null)
532 Category newCat = new Category(sName, sDescription);
533 chmFile.Categories.Add(newCat);
// Presumably the else branch (the else line is not visible): re-use the instance the system holds.
537 Category sysCat = chmFile.SystemInstance.GetCategory( sName );
538 chmFile.Categories.Add( sysCat );
544 case "type": // inclusive information type which is member of the previously read category
549 case "typedesc": // description of type for category
551 sDescription = paramValue;
// NOTE(review): cat is dereferenced further down; a null check for it is not visible in this
// listing - TODO confirm that one exists in the omitted lines.
552 Category cat = chmFile.GetCategory( curCategory );
557 InformationType infoType = chmFile.GetInformationType( sName );
559 if( infoType != null)
// Attach the information type to the category once and flag it as category member.
561 if( !cat.ContainsInformationType(infoType))
563 infoType.SetCategoryFlag(true);
564 cat.AddInformationType(infoType);
571 case "typeexclusive": // exclusive information type which is member of the previously read category
576 case "typehidden": // hidden information type which is member of the previously read category
// Continue scanning after the current param tag.
589 nParamIndex = mP.Index+mP.Length;