/*
 * Unicode sort key generation
 *
 * Copyright 2003 Dmitry Timoshkov
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
 */
20 #include "wine/unicode.h"
23 #define get_char_typeW(x) iswctype((x) >> 8, (x) & 0xFF)
25 extern unsigned int wine_decompose( WCHAR ch
, WCHAR
*dst
, unsigned int dstlen
);
26 extern const unsigned int collation_table
[];
29 * flags - normalization NORM_* flags
31 * FIXME: 'variable' flag not handled
33 int wine_get_sortkey(int flags
, const WCHAR
*src
, int srclen
, char *dst
, int dstlen
)
35 WCHAR dummy
[4]; /* no decomposition is larger than 4 chars */
38 const WCHAR
*src_save
= src
;
39 int srclen_save
= srclen
;
41 key_len
[0] = key_len
[1] = key_len
[2] = key_len
[3] = 0;
42 for (; srclen
; srclen
--, src
++)
44 unsigned int i
, decomposed_len
= 1;/*wine_decompose(*src, dummy, 4);*/
48 for (i
= 0; i
< decomposed_len
; i
++)
53 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
54 * and skips white space and punctuation characters for
57 if ((flags
& NORM_IGNORESYMBOLS
) && (get_char_typeW(wch
) & (C1_PUNCT
| C1_SPACE
)))
60 if (flags
& NORM_IGNORECASE
) wch
= tolowerW(wch
);
62 ce
= collation_table
[collation_table
[wch
>> 8] + (wch
& 0xff)];
63 if (ce
!= (unsigned int)-1)
65 if (ce
>> 16) key_len
[0] += 2;
66 if ((ce
>> 8) & 0xff) key_len
[1]++;
67 if ((ce
>> 4) & 0x0f) key_len
[2]++;
70 if (wch
>> 8) key_len
[3]++;
77 if (wch
>> 8) key_len
[0]++;
78 if (wch
& 0xff) key_len
[0]++;
84 if (!dstlen
) /* compute length */
85 /* 4 * '\1' + 1 * '\0' + key length */
86 return key_len
[0] + key_len
[1] + key_len
[2] + key_len
[3] + 4 + 1;
88 if (dstlen
< key_len
[0] + key_len
[1] + key_len
[2] + key_len
[3] + 4 + 1)
89 return 0; /* overflow */
95 key_ptr
[1] = key_ptr
[0] + key_len
[0] + 1;
96 key_ptr
[2] = key_ptr
[1] + key_len
[1] + 1;
97 key_ptr
[3] = key_ptr
[2] + key_len
[2] + 1;
99 for (; srclen
; srclen
--, src
++)
101 unsigned int i
, decomposed_len
= 1;/*wine_decompose(*src, dummy, 4);*/
105 for (i
= 0; i
< decomposed_len
; i
++)
107 WCHAR wch
= dummy
[i
];
110 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
111 * and skips white space and punctuation characters for
112 * NORM_IGNORESYMBOLS.
114 if ((flags
& NORM_IGNORESYMBOLS
) && (get_char_typeW(wch
) & (C1_PUNCT
| C1_SPACE
)))
117 if (flags
& NORM_IGNORECASE
) wch
= tolowerW(wch
);
119 ce
= collation_table
[collation_table
[wch
>> 8] + (wch
& 0xff)];
120 if (ce
!= (unsigned int)-1)
123 if ((key
= ce
>> 16))
125 *key_ptr
[0]++ = key
>> 8;
126 *key_ptr
[0]++ = key
& 0xff;
128 /* make key 1 start from 2 */
129 if ((key
= (ce
>> 8) & 0xff)) *key_ptr
[1]++ = key
+ 1;
130 /* make key 2 start from 2 */
131 if ((key
= (ce
>> 4) & 0x0f)) *key_ptr
[2]++ = key
+ 1;
132 /* key 3 is always a character code */
135 if (wch
>> 8) *key_ptr
[3]++ = wch
>> 8;
136 if (wch
& 0xff) *key_ptr
[3]++ = wch
& 0xff;
141 *key_ptr
[0]++ = 0xff;
142 *key_ptr
[0]++ = 0xfe;
143 if (wch
>> 8) *key_ptr
[0]++ = wch
>> 8;
144 if (wch
& 0xff) *key_ptr
[0]++ = wch
& 0xff;
153 *key_ptr
[3]++ = '\1';
156 return key_ptr
[3] - dst
;
159 static inline int compare_unicode_weights(int flags
, const WCHAR
*str1
, int len1
,
160 const WCHAR
*str2
, int len2
)
162 unsigned int ce1
, ce2
;
165 /* 32-bit collation element table format:
166 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
167 * case weight - high 4 bit of low 8 bit.
169 while (len1
> 0 && len2
> 0)
171 if (flags
& NORM_IGNORESYMBOLS
)
174 /* FIXME: not tested */
175 if (get_char_typeW(*str1
) & (C1_PUNCT
| C1_SPACE
))
181 if (get_char_typeW(*str2
) & (C1_PUNCT
| C1_SPACE
))
190 /* hyphen and apostrophe are treated differently depending on
191 * whether SORT_STRINGSORT specified or not
193 if (!(flags
& SORT_STRINGSORT
))
195 if (*str1
== '-' || *str1
== '\'')
197 if (*str2
!= '-' && *str2
!= '\'')
204 else if (*str2
== '-' || *str2
== '\'')
212 ce1
= collation_table
[collation_table
[*str1
>> 8] + (*str1
& 0xff)];
213 ce2
= collation_table
[collation_table
[*str2
>> 8] + (*str2
& 0xff)];
215 if (ce1
!= (unsigned int)-1 && ce2
!= (unsigned int)-1)
216 ret
= (ce1
>> 16) - (ce2
>> 16);
227 while (len1
&& !*str1
)
232 while (len2
&& !*str2
)
240 static inline int compare_diacritic_weights(int flags
, const WCHAR
*str1
, int len1
,
241 const WCHAR
*str2
, int len2
)
243 unsigned int ce1
, ce2
;
246 /* 32-bit collation element table format:
247 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
248 * case weight - high 4 bit of low 8 bit.
250 while (len1
> 0 && len2
> 0)
252 if (flags
& NORM_IGNORESYMBOLS
)
255 /* FIXME: not tested */
256 if (get_char_typeW(*str1
) & (C1_PUNCT
| C1_SPACE
))
262 if (get_char_typeW(*str2
) & (C1_PUNCT
| C1_SPACE
))
271 ce1
= collation_table
[collation_table
[*str1
>> 8] + (*str1
& 0xff)];
272 ce2
= collation_table
[collation_table
[*str2
>> 8] + (*str2
& 0xff)];
274 if (ce1
!= (unsigned int)-1 && ce2
!= (unsigned int)-1)
275 ret
= ((ce1
>> 8) & 0xff) - ((ce2
>> 8) & 0xff);
286 while (len1
&& !*str1
)
291 while (len2
&& !*str2
)
299 static inline int compare_case_weights(int flags
, const WCHAR
*str1
, int len1
,
300 const WCHAR
*str2
, int len2
)
302 unsigned int ce1
, ce2
;
305 /* 32-bit collation element table format:
306 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
307 * case weight - high 4 bit of low 8 bit.
309 while (len1
> 0 && len2
> 0)
311 if (flags
& NORM_IGNORESYMBOLS
)
314 /* FIXME: not tested */
315 if (get_char_typeW(*str1
) & (C1_PUNCT
| C1_SPACE
))
321 if (get_char_typeW(*str2
) & (C1_PUNCT
| C1_SPACE
))
330 ce1
= collation_table
[collation_table
[*str1
>> 8] + (*str1
& 0xff)];
331 ce2
= collation_table
[collation_table
[*str2
>> 8] + (*str2
& 0xff)];
333 if (ce1
!= (unsigned int)-1 && ce2
!= (unsigned int)-1)
334 ret
= ((ce1
>> 4) & 0x0f) - ((ce2
>> 4) & 0x0f);
345 while (len1
&& !*str1
)
350 while (len2
&& !*str2
)
358 int wine_compare_string(int flags
, const WCHAR
*str1
, int len1
,
359 const WCHAR
*str2
, int len2
)
363 ret
= compare_unicode_weights(flags
, str1
, len1
, str2
, len2
);
366 if (!(flags
& NORM_IGNORENONSPACE
))
367 ret
= compare_diacritic_weights(flags
, str1
, len1
, str2
, len2
);
368 if (!ret
&& !(flags
& NORM_IGNORECASE
))
369 ret
= compare_case_weights(flags
, str1
, len1
, str2
, len2
);