prevent buffer overflow, LoadString accepts the size of the buffer in TCHARs, not...
[reactos.git] / irc / TechBot / CHMLibrary / CHMDecoding / FullTextEngine.cs
1 using System;
2 using System.Data;
3 using System.Diagnostics;
4 using System.Text;
5 using System.Text.RegularExpressions;
6 using System.IO;
7 using System.Collections;
8 using System.Globalization;
9
10 namespace HtmlHelp.ChmDecoding
11 {
12 /// <summary>
13 /// The class <c>FullTextEngine</c> implements a full-text searcher for a single chm file.
14 /// </summary>
15 internal sealed class FullTextEngine : IDisposable
16 {
17 #region Internal helper classes
/// <summary>
/// Holds the decoded header fields of the full-text index.
/// </summary>
/// <remarks>
/// An instance is filled either from the raw header bytes (constructor taking a byte array)
/// or from a dump (parameterless constructor followed by <see cref="ReadDump"/>).
/// The dump does not contain the code page, the LCID or the text encoder, so a header
/// restored from a dump keeps the field defaults for those three values.
/// </remarks>
private sealed class FTHeader
{
    /// <summary>Number of indexed files</summary>
    private int _numberOfIndexFiles = 0;
    /// <summary>Offset of the root node</summary>
    private int _rootOffset = 0;
    /// <summary>Index-page count</summary>
    private int _pageCount = 0;
    /// <summary>Depth of the index tree</summary>
    private int _depth = 0;
    /// <summary>Scale parameter (s) used for document index en-/decoding</summary>
    private byte _scaleDocIdx = 0;
    /// <summary>Scale parameter (s) used for code-count en-/decoding</summary>
    private byte _scaleCodeCnt = 0;
    /// <summary>Scale parameter (s) used for location code en-/decoding</summary>
    private byte _scaleLocCodes = 0;
    /// <summary>Root parameter (r) used for document index en-/decoding</summary>
    private byte _rootDocIdx = 0;
    /// <summary>Root parameter (r) used for code-count en-/decoding</summary>
    private byte _rootCodeCnt = 0;
    /// <summary>Root parameter (r) used for location code en-/decoding</summary>
    private byte _rootLocCodes = 0;
    /// <summary>Size of a node in bytes</summary>
    private int _nodeSize = 0;
    /// <summary>Length of the longest indexed word</summary>
    private int _lengthOfLongestWord = 0;
    /// <summary>Total number of indexed words</summary>
    private int _totalNumberOfWords = 0;
    /// <summary>Number of unique indexed words</summary>
    private int _numberOfUniqueWords = 0;
    /// <summary>Code page identifier (defaults to 1252)</summary>
    private int _codePage = 1252;
    /// <summary>Language code id (defaults to 1033)</summary>
    private int _lcid = 1033;
    /// <summary>Encoding used to decode the words stored in the index</summary>
    private Encoding _textEncoder = Encoding.Default;

    /// <summary>
    /// Creates a header by decoding raw header bytes.
    /// </summary>
    /// <param name="binaryData">binary data starting with the header</param>
    public FTHeader(byte[] binaryData)
    {
        DecodeHeader(binaryData);
    }

    /// <summary>
    /// Creates an empty header; the values are expected to be supplied through <see cref="ReadDump"/>.
    /// </summary>
    internal FTHeader()
    {
    }

    /// <summary>
    /// Walks through the binary header and stores the fields this class exposes.
    /// </summary>
    /// <param name="binaryData">binary data starting with the header</param>
    /// <exception cref="InvalidOperationException">one of the three scale parameters is not 2</exception>
    private void DecodeHeader(byte[] binaryData)
    {
        BinaryReader reader = new BinaryReader(new MemoryStream(binaryData));

        reader.ReadBytes(4);                         // 4 unknown bytes
        _numberOfIndexFiles = reader.ReadInt32();
        reader.ReadInt32();                          // unknown
        reader.ReadInt32();                          // unknown
        _pageCount = reader.ReadInt32();
        _rootOffset = reader.ReadInt32();
        _depth = reader.ReadInt16();                 // stored as a 16 bit value
        reader.ReadInt32();                          // unknown

        // scale/root pairs in file order: document index, code count, location codes
        _scaleDocIdx = reader.ReadByte();
        _rootDocIdx = reader.ReadByte();
        _scaleCodeCnt = reader.ReadByte();
        _rootCodeCnt = reader.ReadByte();
        _scaleLocCodes = reader.ReadByte();
        _rootLocCodes = reader.ReadByte();

        bool scalesSupported = (_scaleDocIdx == 2) && (_scaleCodeCnt == 2) && (_scaleLocCodes == 2);
        if (!scalesSupported)
        {
            const string message = "Unsupported scale for s/r encoding !";
            Debug.WriteLine(message);
            throw new InvalidOperationException(message);
        }

        reader.ReadBytes(10);                        // unknown
        _nodeSize = reader.ReadInt32();

        // one unknown value followed by two values that are not needed
        for (int skipped = 0; skipped < 3; skipped++)
        {
            reader.ReadInt32();
        }

        _lengthOfLongestWord = reader.ReadInt32();
        _totalNumberOfWords = reader.ReadInt32();
        _numberOfUniqueWords = reader.ReadInt32();

        // six values that are not needed
        for (int skipped = 0; skipped < 6; skipped++)
        {
            reader.ReadInt32();
        }

        reader.ReadBytes(24);                        // not needed

        _codePage = reader.ReadInt32();
        _lcid = reader.ReadInt32();

        // the encoder is derived from the culture's ANSI code page, not from _codePage
        CultureInfo culture = new CultureInfo(_lcid);
        _textEncoder = Encoding.GetEncoding(culture.TextInfo.ANSICodePage);

        // the remaining header bytes are not evaluated
    }

    /// <summary>
    /// Writes the header values to a binary writer.
    /// </summary>
    /// <param name="writer">writer receiving the data</param>
    /// <remarks>The depth is written as a 32 bit integer. Code page, LCID and encoder are not written.</remarks>
    internal void Dump(ref BinaryWriter writer)
    {
        writer.Write(_numberOfIndexFiles);
        writer.Write(_rootOffset);
        writer.Write(_pageCount);
        writer.Write(_depth);

        writer.Write(_scaleDocIdx);
        writer.Write(_rootDocIdx);
        writer.Write(_scaleCodeCnt);
        writer.Write(_rootCodeCnt);
        writer.Write(_scaleLocCodes);
        writer.Write(_rootLocCodes);

        writer.Write(_nodeSize);
        writer.Write(_lengthOfLongestWord);
        writer.Write(_totalNumberOfWords);
        writer.Write(_numberOfUniqueWords);
    }

    /// <summary>
    /// Restores the header values in the order in which <see cref="Dump"/> writes them.
    /// </summary>
    /// <param name="reader">reader positioned at the dumped header</param>
    internal void ReadDump(ref BinaryReader reader)
    {
        _numberOfIndexFiles = reader.ReadInt32();
        _rootOffset = reader.ReadInt32();
        _pageCount = reader.ReadInt32();
        _depth = reader.ReadInt32();

        _scaleDocIdx = reader.ReadByte();
        _rootDocIdx = reader.ReadByte();
        _scaleCodeCnt = reader.ReadByte();
        _rootCodeCnt = reader.ReadByte();
        _scaleLocCodes = reader.ReadByte();
        _rootLocCodes = reader.ReadByte();

        _nodeSize = reader.ReadInt32();
        _lengthOfLongestWord = reader.ReadInt32();
        _totalNumberOfWords = reader.ReadInt32();
        _numberOfUniqueWords = reader.ReadInt32();
    }

    /// <summary>Gets the number of indexed files</summary>
    public int IndexedFileCount
    {
        get { return _numberOfIndexFiles; }
    }

    /// <summary>Gets the file offset of the root node</summary>
    public int RootOffset
    {
        get { return _rootOffset; }
    }

    /// <summary>Gets the page count</summary>
    public int PageCount
    {
        get { return _pageCount; }
    }

    /// <summary>Gets the depth of the index tree</summary>
    public int Depth
    {
        get { return _depth; }
    }

    /// <summary>Gets the scale parameter for document index en-/decoding</summary>
    /// <remarks>
    /// Scale and root integer encoding takes two parameters, s (scale) and r (root size).
    /// An integer is stored as a prefix p followed by the actual bits q; p determines how
    /// many bits are stored and implicitly the high-order bit of the integer.
    /// The same scheme applies to all scale/root properties of this class.
    /// </remarks>
    public byte ScaleDocumentIndex
    {
        get { return _scaleDocIdx; }
    }

    /// <summary>Gets the root parameter for document index en-/decoding</summary>
    public byte RootDocumentIndex
    {
        get { return _rootDocIdx; }
    }

    /// <summary>Gets the scale parameter for code-count en-/decoding</summary>
    public byte ScaleCodeCount
    {
        get { return _scaleCodeCnt; }
    }

    /// <summary>Gets the root parameter for code-count en-/decoding</summary>
    public byte RootCodeCount
    {
        get { return _rootCodeCnt; }
    }

    /// <summary>Gets the scale parameter for location code en-/decoding</summary>
    public byte ScaleLocationCodes
    {
        get { return _scaleLocCodes; }
    }

    /// <summary>Gets the root parameter for location code en-/decoding</summary>
    public byte RootLocationCodes
    {
        get { return _rootLocCodes; }
    }

    /// <summary>Gets the size in bytes of each index/leaf node</summary>
    public int NodeSize
    {
        get { return _nodeSize; }
    }

    /// <summary>Gets the length of the longest word in the index</summary>
    private int LengthOfLongestWord
    {
        get { return _lengthOfLongestWord; }
    }

    /// <summary>Gets the total number of indexed words (duplicates included)</summary>
    public int TotalWordCount
    {
        get { return _totalNumberOfWords; }
    }

    /// <summary>Gets the number of unique indexed words</summary>
    public int UniqueWordCount
    {
        get { return _numberOfUniqueWords; }
    }

    /// <summary>Gets the code page identifier</summary>
    public int CodePage
    {
        get { return _codePage; }
    }

    /// <summary>Gets the language code id</summary>
    public int LCID
    {
        get { return _lcid; }
    }

    /// <summary>Gets the encoding used for decoding words of the index</summary>
    public Encoding TextEncoder
    {
        get { return _textEncoder; }
    }
}
385
386
/// <summary>
/// Records a single document hit and computes its rating from the words found in it.
/// Instances compare by rating, so a list of them can be sorted by relevance.
/// </summary>
private sealed class HitHelper : IComparable
{
    /// <summary>Index of the associated document</summary>
    private int _documentIndex = 0;
    /// <summary>Title of the document</summary>
    private string _title = "";
    /// <summary>Locale of the document</summary>
    private string _locale = "";
    /// <summary>Location of the document</summary>
    private string _location = "";
    /// <summary>Url of the document</summary>
    private string _url = "";
    /// <summary>Current overall rating</summary>
    private double _rating = 0;
    /// <summary>Per-word ratings; keys are the found words, values are boxed doubles</summary>
    private Hashtable _partialRating = new Hashtable();

    /// <summary>
    /// Creates a hit record.
    /// </summary>
    /// <param name="documentIndex">document index</param>
    /// <param name="title">title</param>
    /// <param name="locale">locale parameter</param>
    /// <param name="location">location</param>
    /// <param name="url">url of document</param>
    /// <param name="rating">initial rating (replaced by the first call of <see cref="UpdateRating"/>)</param>
    public HitHelper(int documentIndex, string title, string locale, string location, string url, double rating)
    {
        _documentIndex = documentIndex;
        _title = title;
        _locale = locale;
        _location = location;
        _url = url;
        _rating = rating;
    }

    /// <summary>
    /// Registers one more occurrence of a word and recomputes the overall rating.
    /// </summary>
    /// <param name="word">word found</param>
    /// <remarks>The first occurrence of a word counts 100, every further occurrence multiplies
    /// that word's value by 1.01. The overall rating is the sum over all words.</remarks>
    public void UpdateRating(string word)
    {
        object previous = _partialRating[word];

        if (previous != null)
        {
            _partialRating[word] = ((double)previous) * 1.01;
        }
        else
        {
            _partialRating[word] = 100.0;
        }

        _rating = 0.0;
        foreach (double partial in _partialRating.Values)
        {
            _rating += partial;
        }
    }

    /// <summary>
    /// Compares this hit with another one by rating.
    /// </summary>
    /// <param name="obj">object to compare</param>
    /// <returns>result of comparing both ratings; -1 if <paramref name="obj"/> is not a HitHelper</returns>
    public int CompareTo(object obj)
    {
        HitHelper other = obj as HitHelper;

        if (other == null)
        {
            return -1;
        }

        return this.Rating.CompareTo(other.Rating);
    }

    /// <summary>Gets the hashtable holding the per-word ratings of the document</summary>
    internal Hashtable PartialRating
    {
        get { return _partialRating; }
    }

    /// <summary>Gets the document index</summary>
    public int DocumentIndex
    {
        get { return _documentIndex; }
    }

    /// <summary>Gets the title</summary>
    public string Title
    {
        get { return _title; }
    }

    /// <summary>Gets the locale</summary>
    public string Locale
    {
        get { return _locale; }
    }

    /// <summary>Gets the location</summary>
    public string Location
    {
        get { return _location; }
    }

    /// <summary>Gets the url</summary>
    public string URL
    {
        get { return _url; }
    }

    /// <summary>Gets the rating</summary>
    public double Rating
    {
        get { return _rating; }
    }
}
538
539 #endregion
540
/// <summary>
/// Regular expression pattern matching the text between two double quotes
/// (available as named group <c>innerText</c>)
/// </summary>
private string RE_Quotes = @"\""(?<innerText>.*?)\""";
/// <summary>
/// Set once Dispose has run
/// </summary>
private bool disposed;
/// <summary>
/// Binary data of the full-text index file
/// </summary>
private byte[] _binaryFileData;
/// <summary>
/// Table holding the hits of the last search
/// </summary>
private DataTable _hits;
/// <summary>
/// Hit records (HitHelper instances) collected while searching
/// </summary>
private ArrayList _hitsHelper = new ArrayList();
/// <summary>
/// Decoded header of the index file
/// </summary>
private FTHeader _header;
/// <summary>
/// The chm file this engine belongs to
/// </summary>
private CHMFile _associatedFile;
569
/// <summary>
/// Creates an engine for the given index data.
/// </summary>
/// <param name="binaryFileData">binary file data of the $FIftiMain file</param>
/// <param name="associatedFile">associated chm file</param>
/// <remarks>The header is decoded only if the system file of the chm reports full-text search support.</remarks>
public FullTextEngine(byte[] binaryFileData, CHMFile associatedFile)
{
    _binaryFileData = binaryFileData;
    _associatedFile = associatedFile;

    if (!_associatedFile.SystemFile.FullTextSearch)
    {
        return;
    }

    _header = new FTHeader(_binaryFileData);
}
585
/// <summary>
/// Creates an empty engine. No header, index data or chm file is set by this constructor.
/// </summary>
internal FullTextEngine()
{
}
592
593 #region Data dumping
/// <summary>
/// Writes the header, then the length of the index data and the index data itself.
/// </summary>
/// <param name="writer">writer receiving the data</param>
internal void Dump(ref BinaryWriter writer)
{
    _header.Dump(ref writer);

    int byteCount = _binaryFileData.Length;
    writer.Write(byteCount);
    writer.Write(_binaryFileData);
}
604
/// <summary>
/// Restores header and index data from a dump.
/// </summary>
/// <param name="reader">reader positioned at the dumped engine data</param>
/// <remarks>The associated chm file is not part of the dump.</remarks>
internal void ReadDump(ref BinaryReader reader)
{
    _header = new FTHeader();
    _header.ReadDump(ref reader);

    // the index data is stored as a length followed by that many bytes
    int byteCount = reader.ReadInt32();
    _binaryFileData = reader.ReadBytes(byteCount);
}
617
/// <summary>
/// Assigns the chm file this engine belongs to.
/// </summary>
/// <param name="associatedFile">instance to set</param>
internal void SetCHMFile(CHMFile associatedFile)
{
    this._associatedFile = associatedFile;
}
626 #endregion
627
/// <summary>
/// Gets a flag if full-text searching is available for this chm file.
/// </summary>
/// <remarks>
/// Returns false when no header has been read or no chm file has been assigned yet
/// (the parameterless constructor leaves both unset), instead of failing with a
/// NullReferenceException.
/// </remarks>
public bool CanSearch
{
    get
    {
        if (_header == null || _associatedFile == null)
        {
            return false;
        }

        return _associatedFile.SystemFile.FullTextSearch;
    }
}
635
/// <summary>
/// Performs a full-text search of a single file without limiting the number of hits.
/// </summary>
/// <param name="search">word(s) or phrase to search</param>
/// <param name="partialMatches">true if partial words should be matched as well
/// ( if this is true a search of 'support' will match 'supports', otherwise not )</param>
/// <param name="titleOnly">true if only titles should be searched</param>
/// <returns>the result of the overload taking a hit limit, called with a limit of -1</returns>
/// <remarks>Hits are available through the <see cref="Hits">Hits property</see>.</remarks>
public bool Search(string search, bool partialMatches, bool titleOnly)
{
    const int noHitLimit = -1;

    return Search(search, noHitLimit, partialMatches, titleOnly);
}
648
/// <summary>
/// Performs a full-text search of a single file.
/// </summary>
/// <param name="search">word(s) or phrase to search</param>
/// <param name="MaxHits">max hits. If this number is reached, the search will be interrupted.
/// A negative value means "no limit".</param>
/// <param name="partialMatches">true if partial words should be matched as well
/// ( if this is true a search of 'support' will match 'supports', otherwise not )</param>
/// <param name="titleOnly">true if only titles should be searched</param>
/// <returns>true if every word could be searched, false if searching is not available or a word search failed</returns>
/// <exception cref="ArgumentNullException"><paramref name="search"/> is null while searching is available</exception>
/// <remarks>Hits are available through the <see cref="Hits">Hits property</see>, best rating first.</remarks>
public bool Search(string search, int MaxHits, bool partialMatches, bool titleOnly)
{
    if (!CanSearch)
    {
        return false;
    }

    if (search == null)
    {
        throw new ArgumentNullException("search");
    }

    // quotes only mark phrases; they are removed for the word search itself
    bool isQuoted = (search.IndexOf('"') > -1);
    string searchString = isQuoted ? search.Replace("\"", "") : search;
    bool limited = (MaxHits >= 0);

    // every search starts with fresh result containers
    _hitsHelper = new ArrayList();
    _hits = null;
    CreateHitsTable();

    bool bRet = true;
    string[] words = searchString.Split(new char[] { ' ' });

    for (int i = 0; i < words.Length; i++)
    {
        // consecutive blanks produce empty entries; with partial matching an
        // empty word would match every word of the index
        if (words[i].Length == 0)
        {
            continue;
        }

        bRet &= SearchSingleWord(words[i], MaxHits, partialMatches, titleOnly);

        // A negative limit must never stop the loop. For quoted searches all words
        // have to be looked up, otherwise the phrase filter below would drop hits
        // only because later words were never searched.
        if (limited && !isQuoted && (_hitsHelper.Count >= MaxHits))
        {
            break;
        }
    }

    if (bRet && isQuoted)
    {
        FinalizeQuoted(search);
    }

    if (bRet)
    {
        // ascending by rating, so the best hits are at the end of the list
        _hitsHelper.Sort();

        int available = _hitsHelper.Count;
        int wanted = (!limited || MaxHits > available) ? available : MaxHits;

        // fill the hits table with the best rated entries, best first
        for (int i = available - 1; i >= available - wanted; i--)
        {
            HitHelper curHlp = (HitHelper)_hitsHelper[i];

            DataRow newRow = _hits.NewRow();

            newRow["Rating"] = curHlp.Rating;
            newRow["Title"] = curHlp.Title;
            newRow["Locale"] = curHlp.Locale;
            newRow["Location"] = curHlp.Location;
            newRow["URL"] = curHlp.URL;

            _hits.Rows.Add(newRow);
        }
    }

    return bRet;
}

/// <summary>
/// Removes all search hits which do not match the quoted phrase(s)
/// </summary>
/// <param name="search">full search string entered by the user</param>
/// <remarks>Phrase search is not possible using the internal full-text index. This only filters out
/// documents which don't contain all words of a phrase.</remarks>
private void FinalizeQuoted(string search)
{
    // the expression is used once per search, so it is not compiled to IL
    Regex quoteRE = new Regex(RE_Quotes, RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // one iteration per quoted phrase
    for (Match m = quoteRE.Match(search); m.Success; m = m.NextMatch())
    {
        string phrase = m.Groups["innerText"].Value;
        string[] wordsInPhrase = phrase.Split(new char[] { ' ' });

        // walk backwards so that removing an entry does not shift the ones still to check
        for (int i = _hitsHelper.Count - 1; i >= 0; i--)
        {
            if (!CheckHit((HitHelper)_hitsHelper[i], wordsInPhrase))
            {
                _hitsHelper.RemoveAt(i);
            }
        }
    }
}

/// <summary>
/// Checks whether all words of a phrase have been found in the document of a hit
/// </summary>
/// <param name="hit">hithelper instance to check</param>
/// <param name="wordsInPhrase">word list</param>
/// <returns>true if every non-empty word of the list has a rating in the hit, otherwise false</returns>
private bool CheckHit(HitHelper hit, string[] wordsInPhrase)
{
    for (int i = 0; i < wordsInPhrase.Length; i++)
    {
        string phraseWord = wordsInPhrase[i];

        // empty entries come from consecutive blanks and are never searched
        if (phraseWord.Length == 0)
        {
            continue;
        }

        object partial = hit.PartialRating[phraseWord];

        if ((partial == null) || (((double)partial) == 0.0))
        {
            return false;
        }
    }

    return true;
}
775
/// <summary>
/// Performs a search for a single word in the index
/// </summary>
/// <param name="word">word to search</param>
/// <param name="MaxHits">maximal hits to return</param>
/// <param name="partialMatches">true if partial words should be matched as well
/// ( if this is true a search of 'support' will match 'supports', otherwise not )</param>
/// <param name="titleOnly">true if only titles should be searched</param>
/// <returns>true if the index could be searched, false if the index depth is not supported (greater than 2)</returns>
private bool SearchSingleWord(string word, int MaxHits, bool partialMatches, bool titleOnly)
{
    string wordLower = word.ToLower();

    // The reader is handed on by reference, so it cannot be wrapped in a using block.
    MemoryStream memStream = new MemoryStream(_binaryFileData);
    BinaryReader binReader = new BinaryReader(memStream);

    // seek to root node
    binReader.BaseStream.Seek(_header.RootOffset, SeekOrigin.Begin);

    if (_header.Depth > 2)
    {
        // unsupported index depth
        Debug.WriteLine("FullTextSearcher.SearchSingleWord() - Failed with message: Unsupported index depth !");
        Debug.WriteLine("File: " + _associatedFile.ChmFilePath);
        Debug.WriteLine(" ");
        return false;
    }

    if (_header.Depth == 1)
    {
        // with a depth of 1 the root node is the (only) leaf node, read it directly
        ReadLeafNode(ref binReader, word, MaxHits, partialMatches, titleOnly);
        return true;
    }

    if (_header.Depth > 1)
    {
        // the root is an index node: its entries point to the leaf nodes
        binReader.ReadInt16(); // free space of the node, not needed

        for (int i = 0; i < _header.PageCount; ++i)
        {
            // extract index entry
            int nWLength = (int)binReader.ReadByte();
            binReader.ReadByte(); // position byte, not needed for index entries

            string sName = BinaryReaderHelp.ExtractString(ref binReader, nWLength - 1, 0, true, _header.TextEncoder);

            int nLeafOffset = binReader.ReadInt32();
            binReader.ReadInt16(); // unknown

            if (sName.CompareTo(wordLower) >= 0)
            {
                // remember where the next index entry starts
                long curPos = binReader.BaseStream.Position;

                binReader.BaseStream.Seek(nLeafOffset, SeekOrigin.Begin);
                ReadLeafNode(ref binReader, word, MaxHits, partialMatches, titleOnly);

                // return to the index node and continue with the next entry
                binReader.BaseStream.Seek(curPos, SeekOrigin.Begin);
            }
        }
    }

    return true;
}
839
/// <summary>
/// Reads the entries of a leaf node and records hits for every entry matching the searched word
/// </summary>
/// <param name="binReader">reader positioned at the start of the leaf node</param>
/// <param name="word">word to search</param>
/// <param name="MaxHits">maximal hits to return</param>
/// <param name="partialMatches">true if partial words should be matched as well
/// ( if this is true a search of 'support' will match 'supports', otherwise not )</param>
/// <param name="titleOnly">true if only titles should be searched</param>
/// <remarks>Reading stops at an entry with a word length of 0 or at the end of the stream.</remarks>
private void ReadLeafNode(ref BinaryReader binReader, string word, int MaxHits, bool partialMatches, bool titleOnly)
{
    // leaf node header
    binReader.ReadInt32(); // offset of the next page, not followed here
    binReader.ReadInt16(); // unknown
    binReader.ReadInt16(); // free space, not needed

    string needle = word.ToLower();
    string fullWord = "";

    while (binReader.BaseStream.Position < binReader.BaseStream.Length)
    {
        int entryLength = (int)binReader.ReadByte();

        if (entryLength == 0)
        {
            break;
        }

        int partPosition = (int)binReader.ReadByte();
        string wordPart = BinaryReaderHelp.ExtractString(ref binReader, entryLength - 1, 0, true, _header.TextEncoder);
        int context = (int)binReader.ReadByte(); // 0...body tag, 1...title tag, others unknown

        BinaryReaderHelp.ReadENCINT(ref binReader); // number of wcl entries, not needed
        int wclOffset = binReader.ReadInt32();
        binReader.ReadInt16(); // unknown
        long wclLength = BinaryReaderHelp.ReadENCINT(ref binReader);

        // a position greater than 0 means the entry only stores the part of the
        // word that is placed at that position of the previous word
        fullWord = (partPosition > 0) ? CombineStrings(fullWord, wordPart, partPosition) : wordPart;

        bool isMatch = partialMatches ? (fullWord.IndexOf(needle) >= 0) : (fullWord == needle);

        if (!isMatch)
        {
            continue;
        }

        if (titleOnly && (context != 1))
        {
            continue;
        }

        // decode the word code list of this entry, then resume after the entry
        long resumeAt = binReader.BaseStream.Position;

        binReader.BaseStream.Seek(wclOffset, SeekOrigin.Begin);
        byte[] wclBytes = binReader.ReadBytes((int)wclLength);
        DecodeWCL(wclBytes, MaxHits, word);

        binReader.BaseStream.Seek(resumeAt, SeekOrigin.Begin);
    }
}
916
/// <summary>
/// Decodes the s/r encoded WordCodeList (=wcl) and creates hit entries
/// </summary>
/// <param name="wclBytes">wcl encoded byte array</param>
/// <param name="MaxHits">maximal number of documents to record; a negative value means "no limit"</param>
/// <param name="word">the word to find</param>
/// <remarks>Decoding stops as soon as a new document would exceed the limit.</remarks>
private void DecodeWCL(byte[] wclBytes, int MaxHits, string word)
{
    // expand the bytes to one array element per bit, most significant bit first
    byte[] wclBits = new byte[wclBytes.Length * 8];

    for (int i = 0; i < wclBytes.Length; i++)
    {
        for (int j = 0; j < 8; j++)
        {
            wclBits[(i * 8) + j] = (byte)((wclBytes[i] >> (7 - j)) & 0x1);
        }
    }

    int nBitIdx = 0;
    int nDocIdx = 0; // delta encoded

    while (nBitIdx < wclBits.Length)
    {
        nDocIdx += BinaryReaderHelp.ReadSRItem(wclBits, _header.ScaleDocumentIndex, _header.RootDocumentIndex, ref nBitIdx);
        int nCodeCnt = BinaryReaderHelp.ReadSRItem(wclBits, _header.ScaleCodeCount, _header.RootCodeCount, ref nBitIdx);

        // the location codes are not used, but have to be read to advance the bit position
        for (int locidx = 0; locidx < nCodeCnt; locidx++)
        {
            BinaryReaderHelp.ReadSRItem(wclBits, _header.ScaleLocationCodes, _header.RootLocationCodes, ref nBitIdx);
        }

        // apply padding: every document entry starts on a byte boundary
        while ((nBitIdx % 8) != 0)
        {
            nBitIdx++;
        }

        // Record hit
        HitHelper hitObj = DocumentHit(nDocIdx);

        if (hitObj == null)
        {
            // the limit only applies if it is not negative; ">=" keeps the list at MaxHits entries
            if ((MaxHits >= 0) && (_hitsHelper.Count >= MaxHits))
            {
                return;
            }

            TopicEntry topic = (TopicEntry)(_associatedFile.TopicsFile.TopicTable[nDocIdx]);

            hitObj = new HitHelper(nDocIdx, topic.Title, topic.Locale, _associatedFile.CompileFile, topic.URL, 0.0);
            _hitsHelper.Add(hitObj);
        }

        // one rating update per occurrence of the word in the document
        for (int k = 0; k < nCodeCnt; k++)
        {
            hitObj.UpdateRating(word);
        }
    }
}
981
/// <summary>
/// Combines a "master" word with a partial word.
/// </summary>
/// <param name="word">the master word</param>
/// <param name="partial">the partial word</param>
/// <param name="partialPosition">position in the master word at which the partial word is placed</param>
/// <returns>the first <paramref name="partialPosition"/> characters of the master word followed by the
/// partial word; if the position lies behind the end of the master word, the partial word is appended</returns>
/// <remarks>Example: combining "support" with "ply" at position 3 yields "supply".</remarks>
private string CombineStrings(string word, string partial, int partialPosition)
{
    // Everything from partialPosition on is replaced by the partial word, so a prefix
    // plus the partial word is all that is needed (no per-character rebuilding).
    int prefixLength = Math.Min(partialPosition, word.Length);

    return word.Substring(0, prefixLength) + partial;
}
1016
/// <summary>
/// Looks up the hit record collected so far for a document index
/// </summary>
/// <param name="index">document index</param>
/// <returns>the first HitHelper instance with this document index, or null if there is none</returns>
private HitHelper DocumentHit(int index)
{
    for (int i = 0; i < _hitsHelper.Count; i++)
    {
        HitHelper candidate = (HitHelper)_hitsHelper[i];

        if (candidate.DocumentIndex == index)
        {
            return candidate;
        }
    }

    return null;
}
1032
/// <summary>
/// Creates the (empty) DataTable receiving the hits: a numeric "Rating" column
/// followed by the text columns "Title", "Locale", "Location" and "URL".
/// </summary>
private void CreateHitsTable()
{
    _hits = new DataTable("FT_Search_Hits");

    DataColumn ratingColumn = new DataColumn("Rating", typeof(double));
    ratingColumn.ReadOnly = false;
    ratingColumn.Unique = false;
    _hits.Columns.Add(ratingColumn);

    string[] textColumnNames = new string[] { "Title", "Locale", "Location", "URL" };

    for (int i = 0; i < textColumnNames.Length; i++)
    {
        DataColumn textColumn = new DataColumn(textColumnNames[i], typeof(string));
        textColumn.ReadOnly = false;
        textColumn.Unique = false;
        _hits.Columns.Add(textColumn);
    }
}
1082
/// <summary>
/// Gets the datatable containing the hits of the last search
/// (null as long as no search has created it)
/// </summary>
public DataTable Hits
{
    get
    {
        return this._hits;
    }
}
1090
/// <summary>
/// Implements IDisposable: releases the resources held by this instance.
/// </summary>
public void Dispose()
{
    // true: called from user code, managed resources may be released
    this.Dispose(true);

    // Clean-up has been done above, so the object is taken off the
    // finalization queue via GC.SuppressFinalize.
    GC.SuppressFinalize(this);
}
1104
/// <summary>
/// Does the actual clean-up. The method distinguishes two scenarios:
/// with disposing set to true it has been called (directly or indirectly) by user code
/// and managed resources are released; with disposing set to false nothing is released.
/// In both cases the instance is marked as disposed afterwards.
/// </summary>
/// <param name="disposing">disposing flag</param>
private void Dispose(bool disposing)
{
    // release only once, and only when called from user code
    if (!this.disposed && disposing)
    {
        // drop the reference to the index data
        _binaryFileData = null;
    }

    disposed = true;
}
1130 }
1131 }