Start source tree (final, I hope!) restructuration. Part 1/X
[reactos.git] / reactos / win32ss / base / kernel32 / winnls / string / sortkey.c
1 /*
2 * Unicode sort key generation
3 *
4 * Copyright 2003 Dmitry Timoshkov
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
19 */
20
21 #include <wine/unicode.h>
22
23 #define get_char_typeW(x) iswctype((x) >> 8, (x) & 0xFF)
24 extern int get_decomposition(WCHAR src, WCHAR *dst, unsigned int dstlen);
25 extern const unsigned int collation_table[];
26
27 /*
28 * flags - normalization NORM_* flags
29 *
30 * FIXME: 'variable' flag not handled
31 */
32 int wine_get_sortkey(int flags, const WCHAR *src, int srclen, char *dst, int dstlen)
33 {
34 WCHAR dummy[4]; /* no decomposition is larger than 4 chars */
35 int key_len[4];
36 char *key_ptr[4];
37 const WCHAR *src_save = src;
38 int srclen_save = srclen;
39
40 key_len[0] = key_len[1] = key_len[2] = key_len[3] = 0;
41 for (; srclen; srclen--, src++)
42 {
43 int decomposed_len = 1;/*get_decomposition(*src, dummy, 4);*/
44 dummy[0] = *src;
45 if (decomposed_len)
46 {
47 int i;
48 for (i = 0; i < decomposed_len; i++)
49 {
50 WCHAR wch = dummy[i];
51 unsigned int ce;
52
53 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
54 * and skips white space and punctuation characters for
55 * NORM_IGNORESYMBOLS.
56 */
57 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
58 continue;
59
60 if (flags & NORM_IGNORECASE) wch = tolowerW(wch);
61
62 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)];
63 if (ce != (unsigned int)-1)
64 {
65 if (ce >> 16) key_len[0] += 2;
66 if ((ce >> 8) & 0xff) key_len[1]++;
67 if ((ce >> 4) & 0x0f) key_len[2]++;
68 if (ce & 1)
69 {
70 if (wch >> 8) key_len[3]++;
71 key_len[3]++;
72 }
73 }
74 else
75 {
76 key_len[0] += 2;
77 if (wch >> 8) key_len[0]++;
78 if (wch & 0xff) key_len[0]++;
79 }
80 }
81 }
82 }
83
84 if (!dstlen) /* compute length */
85 /* 4 * '\1' + 1 * '\0' + key length */
86 return key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1;
87
88 if (dstlen < key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1)
89 return 0; /* overflow */
90
91 src = src_save;
92 srclen = srclen_save;
93
94 key_ptr[0] = dst;
95 key_ptr[1] = key_ptr[0] + key_len[0] + 1;
96 key_ptr[2] = key_ptr[1] + key_len[1] + 1;
97 key_ptr[3] = key_ptr[2] + key_len[2] + 1;
98
99 for (; srclen; srclen--, src++)
100 {
101 int decomposed_len = 1;/*get_decomposition(*src, dummy, 4);*/
102 dummy[0] = *src;
103 if (decomposed_len)
104 {
105 int i;
106 for (i = 0; i < decomposed_len; i++)
107 {
108 WCHAR wch = dummy[i];
109 unsigned int ce;
110
111 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
112 * and skips white space and punctuation characters for
113 * NORM_IGNORESYMBOLS.
114 */
115 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
116 continue;
117
118 if (flags & NORM_IGNORECASE) wch = tolowerW(wch);
119
120 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)];
121 if (ce != (unsigned int)-1)
122 {
123 WCHAR key;
124 if ((key = ce >> 16))
125 {
126 *key_ptr[0]++ = key >> 8;
127 *key_ptr[0]++ = key & 0xff;
128 }
129 /* make key 1 start from 2 */
130 if ((key = (ce >> 8) & 0xff)) *key_ptr[1]++ = key + 1;
131 /* make key 2 start from 2 */
132 if ((key = (ce >> 4) & 0x0f)) *key_ptr[2]++ = key + 1;
133 /* key 3 is always a character code */
134 if (ce & 1)
135 {
136 if (wch >> 8) *key_ptr[3]++ = wch >> 8;
137 if (wch & 0xff) *key_ptr[3]++ = wch & 0xff;
138 }
139 }
140 else
141 {
142 *key_ptr[0]++ = 0xff;
143 *key_ptr[0]++ = 0xfe;
144 if (wch >> 8) *key_ptr[0]++ = wch >> 8;
145 if (wch & 0xff) *key_ptr[0]++ = wch & 0xff;
146 }
147 }
148 }
149 }
150
151 *key_ptr[0] = '\1';
152 *key_ptr[1] = '\1';
153 *key_ptr[2] = '\1';
154 *key_ptr[3]++ = '\1';
155 *key_ptr[3] = 0;
156
157 return key_ptr[3] - dst;
158 }
159
160 static inline int compare_unicode_weights(int flags, const WCHAR *str1, int len1,
161 const WCHAR *str2, int len2)
162 {
163 unsigned int ce1, ce2;
164 int ret;
165
166 /* 32-bit collation element table format:
167 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
168 * case weight - high 4 bit of low 8 bit.
169 */
170 while (len1 > 0 && len2 > 0)
171 {
172 if (flags & NORM_IGNORESYMBOLS)
173 {
174 int skip = 0;
175 /* FIXME: not tested */
176 if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE))
177 {
178 str1++;
179 len1--;
180 skip = 1;
181 }
182 if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE))
183 {
184 str2++;
185 len2--;
186 skip = 1;
187 }
188 if (skip) continue;
189 }
190
191 /* hyphen and apostrophe are treated differently depending on
192 * whether SORT_STRINGSORT specified or not
193 */
194 if (!(flags & SORT_STRINGSORT))
195 {
196 if (*str1 == '-' || *str1 == '\'')
197 {
198 if (*str2 != '-' && *str2 != '\'')
199 {
200 str1++;
201 len1--;
202 continue;
203 }
204 }
205 else if (*str2 == '-' || *str2 == '\'')
206 {
207 str2++;
208 len2--;
209 continue;
210 }
211 }
212
213 ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
214 ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];
215
216 if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
217 ret = (ce1 >> 16) - (ce2 >> 16);
218 else
219 ret = *str1 - *str2;
220
221 if (ret) return ret;
222
223 str1++;
224 str2++;
225 len1--;
226 len2--;
227 }
228 return len1 - len2;
229 }
230
231 static inline int compare_diacritic_weights(int flags, const WCHAR *str1, int len1,
232 const WCHAR *str2, int len2)
233 {
234 unsigned int ce1, ce2;
235 int ret;
236
237 /* 32-bit collation element table format:
238 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
239 * case weight - high 4 bit of low 8 bit.
240 */
241 while (len1 > 0 && len2 > 0)
242 {
243 if (flags & NORM_IGNORESYMBOLS)
244 {
245 int skip = 0;
246 /* FIXME: not tested */
247 if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE))
248 {
249 str1++;
250 len1--;
251 skip = 1;
252 }
253 if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE))
254 {
255 str2++;
256 len2--;
257 skip = 1;
258 }
259 if (skip) continue;
260 }
261
262 ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
263 ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];
264
265 if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
266 ret = ((ce1 >> 8) & 0xff) - ((ce2 >> 8) & 0xff);
267 else
268 ret = *str1 - *str2;
269
270 if (ret) return ret;
271
272 str1++;
273 str2++;
274 len1--;
275 len2--;
276 }
277 return len1 - len2;
278 }
279
280 static inline int compare_case_weights(int flags, const WCHAR *str1, int len1,
281 const WCHAR *str2, int len2)
282 {
283 unsigned int ce1, ce2;
284 int ret;
285
286 /* 32-bit collation element table format:
287 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
288 * case weight - high 4 bit of low 8 bit.
289 */
290 while (len1 > 0 && len2 > 0)
291 {
292 if (flags & NORM_IGNORESYMBOLS)
293 {
294 int skip = 0;
295 /* FIXME: not tested */
296 if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE))
297 {
298 str1++;
299 len1--;
300 skip = 1;
301 }
302 if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE))
303 {
304 str2++;
305 len2--;
306 skip = 1;
307 }
308 if (skip) continue;
309 }
310
311 ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
312 ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];
313
314 if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
315 ret = ((ce1 >> 4) & 0x0f) - ((ce2 >> 4) & 0x0f);
316 else
317 ret = *str1 - *str2;
318
319 if (ret) return ret;
320
321 str1++;
322 str2++;
323 len1--;
324 len2--;
325 }
326 return len1 - len2;
327 }
328
329 static inline int real_length(const WCHAR *str, int len)
330 {
331 while (len && !str[len - 1]) len--;
332 return len;
333 }
334
335 int wine_compare_string(int flags, const WCHAR *str1, int len1,
336 const WCHAR *str2, int len2)
337 {
338 int ret;
339
340 len1 = real_length(str1, len1);
341 len2 = real_length(str2, len2);
342
343 ret = compare_unicode_weights(flags, str1, len1, str2, len2);
344 if (!ret)
345 {
346 if (!(flags & NORM_IGNORENONSPACE))
347 ret = compare_diacritic_weights(flags, str1, len1, str2, len2);
348 if (!ret && !(flags & NORM_IGNORECASE))
349 ret = compare_case_weights(flags, str1, len1, str2, len2);
350 }
351 return ret;
352 }