[KERNEL32/STRING] Sync sortkey.c with Wine Staging 3.3. CORE-14434
[reactos.git] / dll / win32 / kernel32 / winnls / string / sortkey.c
1 /*
2 * Unicode sort key generation
3 *
4 * Copyright 2003 Dmitry Timoshkov
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
19 */
20 #include "wine/unicode.h"
21
22 #ifdef __REACTOS__
23 #define get_char_typeW(x) iswctype((x) >> 8, (x) & 0xFF)
24 #endif
25 extern unsigned int wine_decompose( WCHAR ch, WCHAR *dst, unsigned int dstlen );
26 extern const unsigned int collation_table[];
27
28 /*
29 * flags - normalization NORM_* flags
30 *
31 * FIXME: 'variable' flag not handled
32 */
33 int wine_get_sortkey(int flags, const WCHAR *src, int srclen, char *dst, int dstlen)
34 {
35 WCHAR dummy[4]; /* no decomposition is larger than 4 chars */
36 int key_len[4];
37 char *key_ptr[4];
38 const WCHAR *src_save = src;
39 int srclen_save = srclen;
40
41 key_len[0] = key_len[1] = key_len[2] = key_len[3] = 0;
42 for (; srclen; srclen--, src++)
43 {
44 unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/
45 dummy[0] = *src;
46 if (decomposed_len)
47 {
48 for (i = 0; i < decomposed_len; i++)
49 {
50 WCHAR wch = dummy[i];
51 unsigned int ce;
52
53 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
54 * and skips white space and punctuation characters for
55 * NORM_IGNORESYMBOLS.
56 */
57 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
58 continue;
59
60 if (flags & NORM_IGNORECASE) wch = tolowerW(wch);
61
62 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)];
63 if (ce != (unsigned int)-1)
64 {
65 if (ce >> 16) key_len[0] += 2;
66 if ((ce >> 8) & 0xff) key_len[1]++;
67 if ((ce >> 4) & 0x0f) key_len[2]++;
68 if (ce & 1)
69 {
70 if (wch >> 8) key_len[3]++;
71 key_len[3]++;
72 }
73 }
74 else
75 {
76 key_len[0] += 2;
77 if (wch >> 8) key_len[0]++;
78 if (wch & 0xff) key_len[0]++;
79 }
80 }
81 }
82 }
83
84 if (!dstlen) /* compute length */
85 /* 4 * '\1' + 1 * '\0' + key length */
86 return key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1;
87
88 if (dstlen < key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1)
89 return 0; /* overflow */
90
91 src = src_save;
92 srclen = srclen_save;
93
94 key_ptr[0] = dst;
95 key_ptr[1] = key_ptr[0] + key_len[0] + 1;
96 key_ptr[2] = key_ptr[1] + key_len[1] + 1;
97 key_ptr[3] = key_ptr[2] + key_len[2] + 1;
98
99 for (; srclen; srclen--, src++)
100 {
101 unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/
102 dummy[0] = *src;
103 if (decomposed_len)
104 {
105 for (i = 0; i < decomposed_len; i++)
106 {
107 WCHAR wch = dummy[i];
108 unsigned int ce;
109
110 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
111 * and skips white space and punctuation characters for
112 * NORM_IGNORESYMBOLS.
113 */
114 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
115 continue;
116
117 if (flags & NORM_IGNORECASE) wch = tolowerW(wch);
118
119 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)];
120 if (ce != (unsigned int)-1)
121 {
122 WCHAR key;
123 if ((key = ce >> 16))
124 {
125 *key_ptr[0]++ = key >> 8;
126 *key_ptr[0]++ = key & 0xff;
127 }
128 /* make key 1 start from 2 */
129 if ((key = (ce >> 8) & 0xff)) *key_ptr[1]++ = key + 1;
130 /* make key 2 start from 2 */
131 if ((key = (ce >> 4) & 0x0f)) *key_ptr[2]++ = key + 1;
132 /* key 3 is always a character code */
133 if (ce & 1)
134 {
135 if (wch >> 8) *key_ptr[3]++ = wch >> 8;
136 if (wch & 0xff) *key_ptr[3]++ = wch & 0xff;
137 }
138 }
139 else
140 {
141 *key_ptr[0]++ = 0xff;
142 *key_ptr[0]++ = 0xfe;
143 if (wch >> 8) *key_ptr[0]++ = wch >> 8;
144 if (wch & 0xff) *key_ptr[0]++ = wch & 0xff;
145 }
146 }
147 }
148 }
149
150 *key_ptr[0] = '\1';
151 *key_ptr[1] = '\1';
152 *key_ptr[2] = '\1';
153 *key_ptr[3]++ = '\1';
154 *key_ptr[3] = 0;
155
156 return key_ptr[3] - dst;
157 }
158
159 static inline int compare_unicode_weights(int flags, const WCHAR *str1, int len1,
160 const WCHAR *str2, int len2)
161 {
162 unsigned int ce1, ce2;
163 int ret;
164
165 /* 32-bit collation element table format:
166 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
167 * case weight - high 4 bit of low 8 bit.
168 */
169 while (len1 > 0 && len2 > 0)
170 {
171 if (flags & NORM_IGNORESYMBOLS)
172 {
173 int skip = 0;
174 /* FIXME: not tested */
175 if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE))
176 {
177 str1++;
178 len1--;
179 skip = 1;
180 }
181 if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE))
182 {
183 str2++;
184 len2--;
185 skip = 1;
186 }
187 if (skip) continue;
188 }
189
190 /* hyphen and apostrophe are treated differently depending on
191 * whether SORT_STRINGSORT specified or not
192 */
193 if (!(flags & SORT_STRINGSORT))
194 {
195 if (*str1 == '-' || *str1 == '\'')
196 {
197 if (*str2 != '-' && *str2 != '\'')
198 {
199 str1++;
200 len1--;
201 continue;
202 }
203 }
204 else if (*str2 == '-' || *str2 == '\'')
205 {
206 str2++;
207 len2--;
208 continue;
209 }
210 }
211
212 ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
213 ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];
214
215 if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
216 ret = (ce1 >> 16) - (ce2 >> 16);
217 else
218 ret = *str1 - *str2;
219
220 if (ret) return ret;
221
222 str1++;
223 str2++;
224 len1--;
225 len2--;
226 }
227 while (len1 && !*str1)
228 {
229 str1++;
230 len1--;
231 }
232 while (len2 && !*str2)
233 {
234 str2++;
235 len2--;
236 }
237 return len1 - len2;
238 }
239
240 static inline int compare_diacritic_weights(int flags, const WCHAR *str1, int len1,
241 const WCHAR *str2, int len2)
242 {
243 unsigned int ce1, ce2;
244 int ret;
245
246 /* 32-bit collation element table format:
247 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
248 * case weight - high 4 bit of low 8 bit.
249 */
250 while (len1 > 0 && len2 > 0)
251 {
252 if (flags & NORM_IGNORESYMBOLS)
253 {
254 int skip = 0;
255 /* FIXME: not tested */
256 if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE))
257 {
258 str1++;
259 len1--;
260 skip = 1;
261 }
262 if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE))
263 {
264 str2++;
265 len2--;
266 skip = 1;
267 }
268 if (skip) continue;
269 }
270
271 ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
272 ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];
273
274 if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
275 ret = ((ce1 >> 8) & 0xff) - ((ce2 >> 8) & 0xff);
276 else
277 ret = *str1 - *str2;
278
279 if (ret) return ret;
280
281 str1++;
282 str2++;
283 len1--;
284 len2--;
285 }
286 while (len1 && !*str1)
287 {
288 str1++;
289 len1--;
290 }
291 while (len2 && !*str2)
292 {
293 str2++;
294 len2--;
295 }
296 return len1 - len2;
297 }
298
299 static inline int compare_case_weights(int flags, const WCHAR *str1, int len1,
300 const WCHAR *str2, int len2)
301 {
302 unsigned int ce1, ce2;
303 int ret;
304
305 /* 32-bit collation element table format:
306 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
307 * case weight - high 4 bit of low 8 bit.
308 */
309 while (len1 > 0 && len2 > 0)
310 {
311 if (flags & NORM_IGNORESYMBOLS)
312 {
313 int skip = 0;
314 /* FIXME: not tested */
315 if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE))
316 {
317 str1++;
318 len1--;
319 skip = 1;
320 }
321 if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE))
322 {
323 str2++;
324 len2--;
325 skip = 1;
326 }
327 if (skip) continue;
328 }
329
330 ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
331 ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];
332
333 if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
334 ret = ((ce1 >> 4) & 0x0f) - ((ce2 >> 4) & 0x0f);
335 else
336 ret = *str1 - *str2;
337
338 if (ret) return ret;
339
340 str1++;
341 str2++;
342 len1--;
343 len2--;
344 }
345 while (len1 && !*str1)
346 {
347 str1++;
348 len1--;
349 }
350 while (len2 && !*str2)
351 {
352 str2++;
353 len2--;
354 }
355 return len1 - len2;
356 }
357
358 int wine_compare_string(int flags, const WCHAR *str1, int len1,
359 const WCHAR *str2, int len2)
360 {
361 int ret;
362
363 ret = compare_unicode_weights(flags, str1, len1, str2, len2);
364 if (!ret)
365 {
366 if (!(flags & NORM_IGNORENONSPACE))
367 ret = compare_diacritic_weights(flags, str1, len1, str2, len2);
368 if (!ret && !(flags & NORM_IGNORECASE))
369 ret = compare_case_weights(flags, str1, len1, str2, len2);
370 }
371 return ret;
372 }