/*
 * Unicode sort key generation
 *
 * Copyright 2003 Dmitry Timoshkov
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
 */
21 #include <wine/unicode.h>
23 #define get_char_typeW(x) iswctype((x) >> 8, (x) & 0xFF)
24 extern int get_decomposition(WCHAR src
, WCHAR
*dst
, unsigned int dstlen
);
25 extern const unsigned int collation_table
[];
28 * flags - normalization NORM_* flags
30 * FIXME: 'variable' flag not handled
32 int wine_get_sortkey(int flags
, const WCHAR
*src
, int srclen
, char *dst
, int dstlen
)
34 WCHAR dummy
[4]; /* no decomposition is larger than 4 chars */
37 const WCHAR
*src_save
= src
;
38 int srclen_save
= srclen
;
40 key_len
[0] = key_len
[1] = key_len
[2] = key_len
[3] = 0;
41 for (; srclen
; srclen
--, src
++)
43 int decomposed_len
= 1;/*get_decomposition(*src, dummy, 4);*/
48 for (i
= 0; i
< decomposed_len
; i
++)
53 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
54 * and skips white space and punctuation characters for
57 if ((flags
& NORM_IGNORESYMBOLS
) && (get_char_typeW(wch
) & (C1_PUNCT
| C1_SPACE
)))
60 if (flags
& NORM_IGNORECASE
) wch
= tolowerW(wch
);
62 ce
= collation_table
[collation_table
[wch
>> 8] + (wch
& 0xff)];
63 if (ce
!= (unsigned int)-1)
65 if (ce
>> 16) key_len
[0] += 2;
66 if ((ce
>> 8) & 0xff) key_len
[1]++;
67 if ((ce
>> 4) & 0x0f) key_len
[2]++;
70 if (wch
>> 8) key_len
[3]++;
77 if (wch
>> 8) key_len
[0]++;
78 if (wch
& 0xff) key_len
[0]++;
84 if (!dstlen
) /* compute length */
85 /* 4 * '\1' + 1 * '\0' + key length */
86 return key_len
[0] + key_len
[1] + key_len
[2] + key_len
[3] + 4 + 1;
88 if (dstlen
< key_len
[0] + key_len
[1] + key_len
[2] + key_len
[3] + 4 + 1)
89 return 0; /* overflow */
95 key_ptr
[1] = key_ptr
[0] + key_len
[0] + 1;
96 key_ptr
[2] = key_ptr
[1] + key_len
[1] + 1;
97 key_ptr
[3] = key_ptr
[2] + key_len
[2] + 1;
99 for (; srclen
; srclen
--, src
++)
101 int decomposed_len
= 1;/*get_decomposition(*src, dummy, 4);*/
106 for (i
= 0; i
< decomposed_len
; i
++)
108 WCHAR wch
= dummy
[i
];
111 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
112 * and skips white space and punctuation characters for
113 * NORM_IGNORESYMBOLS.
115 if ((flags
& NORM_IGNORESYMBOLS
) && (get_char_typeW(wch
) & (C1_PUNCT
| C1_SPACE
)))
118 if (flags
& NORM_IGNORECASE
) wch
= tolowerW(wch
);
120 ce
= collation_table
[collation_table
[wch
>> 8] + (wch
& 0xff)];
121 if (ce
!= (unsigned int)-1)
124 if ((key
= ce
>> 16))
126 *key_ptr
[0]++ = key
>> 8;
127 *key_ptr
[0]++ = key
& 0xff;
129 /* make key 1 start from 2 */
130 if ((key
= (ce
>> 8) & 0xff)) *key_ptr
[1]++ = key
+ 1;
131 /* make key 2 start from 2 */
132 if ((key
= (ce
>> 4) & 0x0f)) *key_ptr
[2]++ = key
+ 1;
133 /* key 3 is always a character code */
136 if (wch
>> 8) *key_ptr
[3]++ = wch
>> 8;
137 if (wch
& 0xff) *key_ptr
[3]++ = wch
& 0xff;
142 *key_ptr
[0]++ = 0xff;
143 *key_ptr
[0]++ = 0xfe;
144 if (wch
>> 8) *key_ptr
[0]++ = wch
>> 8;
145 if (wch
& 0xff) *key_ptr
[0]++ = wch
& 0xff;
154 *key_ptr
[3]++ = '\1';
157 return key_ptr
[3] - dst
;
160 static inline int compare_unicode_weights(int flags
, const WCHAR
*str1
, int len1
,
161 const WCHAR
*str2
, int len2
)
163 unsigned int ce1
, ce2
;
166 /* 32-bit collation element table format:
167 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
168 * case weight - high 4 bit of low 8 bit.
170 while (len1
> 0 && len2
> 0)
172 if (flags
& NORM_IGNORESYMBOLS
)
175 /* FIXME: not tested */
176 if (get_char_typeW(*str1
) & (C1_PUNCT
| C1_SPACE
))
182 if (get_char_typeW(*str2
) & (C1_PUNCT
| C1_SPACE
))
191 /* hyphen and apostrophe are treated differently depending on
192 * whether SORT_STRINGSORT specified or not
194 if (!(flags
& SORT_STRINGSORT
))
196 if (*str1
== '-' || *str1
== '\'')
198 if (*str2
!= '-' && *str2
!= '\'')
205 else if (*str2
== '-' || *str2
== '\'')
213 ce1
= collation_table
[collation_table
[*str1
>> 8] + (*str1
& 0xff)];
214 ce2
= collation_table
[collation_table
[*str2
>> 8] + (*str2
& 0xff)];
216 if (ce1
!= (unsigned int)-1 && ce2
!= (unsigned int)-1)
217 ret
= (ce1
>> 16) - (ce2
>> 16);
231 static inline int compare_diacritic_weights(int flags
, const WCHAR
*str1
, int len1
,
232 const WCHAR
*str2
, int len2
)
234 unsigned int ce1
, ce2
;
237 /* 32-bit collation element table format:
238 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
239 * case weight - high 4 bit of low 8 bit.
241 while (len1
> 0 && len2
> 0)
243 if (flags
& NORM_IGNORESYMBOLS
)
246 /* FIXME: not tested */
247 if (get_char_typeW(*str1
) & (C1_PUNCT
| C1_SPACE
))
253 if (get_char_typeW(*str2
) & (C1_PUNCT
| C1_SPACE
))
262 ce1
= collation_table
[collation_table
[*str1
>> 8] + (*str1
& 0xff)];
263 ce2
= collation_table
[collation_table
[*str2
>> 8] + (*str2
& 0xff)];
265 if (ce1
!= (unsigned int)-1 && ce2
!= (unsigned int)-1)
266 ret
= ((ce1
>> 8) & 0xff) - ((ce2
>> 8) & 0xff);
280 static inline int compare_case_weights(int flags
, const WCHAR
*str1
, int len1
,
281 const WCHAR
*str2
, int len2
)
283 unsigned int ce1
, ce2
;
286 /* 32-bit collation element table format:
287 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
288 * case weight - high 4 bit of low 8 bit.
290 while (len1
> 0 && len2
> 0)
292 if (flags
& NORM_IGNORESYMBOLS
)
295 /* FIXME: not tested */
296 if (get_char_typeW(*str1
) & (C1_PUNCT
| C1_SPACE
))
302 if (get_char_typeW(*str2
) & (C1_PUNCT
| C1_SPACE
))
311 ce1
= collation_table
[collation_table
[*str1
>> 8] + (*str1
& 0xff)];
312 ce2
= collation_table
[collation_table
[*str2
>> 8] + (*str2
& 0xff)];
314 if (ce1
!= (unsigned int)-1 && ce2
!= (unsigned int)-1)
315 ret
= ((ce1
>> 4) & 0x0f) - ((ce2
>> 4) & 0x0f);
329 static inline int real_length(const WCHAR
*str
, int len
)
331 while (len
&& !str
[len
- 1]) len
--;
335 int wine_compare_string(int flags
, const WCHAR
*str1
, int len1
,
336 const WCHAR
*str2
, int len2
)
340 len1
= real_length(str1
, len1
);
341 len2
= real_length(str2
, len2
);
343 ret
= compare_unicode_weights(flags
, str1
, len1
, str2
, len2
);
346 if (!(flags
& NORM_IGNORENONSPACE
))
347 ret
= compare_diacritic_weights(flags
, str1
, len1
, str2
, len2
);
348 if (!ret
&& !(flags
& NORM_IGNORECASE
))
349 ret
= compare_case_weights(flags
, str1
, len1
, str2
, len2
);