Sync with trunk (r48414)
[reactos.git] / lib / 3rdparty / libmpg123 / dct64_x86_64.S
1 /*
2 dct64_x86_64: SSE optimized dct64 for x86-64
3
4 copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
5 see COPYING and AUTHORS files in distribution or http://mpg123.org
6 initially written by Taihei Monma
7 */
8
9 #include "mangle.h"
10
/*
 Register aliases for the three pointer arguments of dct64_x86_64().
 The function body only ever refers to ARG0/ARG1/ARG2, so the two
 calling conventions differ only in this block (plus the _WIN64-only
 prologue/epilogue inside the function).
*/
11 #ifdef _WIN64
12 /* short *out0: first argument, delivered in %rcx by the Microsoft x64 convention; the function entry copies it to %r9 because %rcx is reused there as table pointer / scratch */
13 #define ARG0 %r9
14 /* short *out1: second argument, used directly from %rdx */
15 #define ARG1 %rdx
16 /* real *samples: third argument, used directly from %r8 */
17 #define ARG2 %r8
18 #else
19 /* short *out0: first integer argument register of the System V AMD64 convention */
20 #define ARG0 %rdi
21 /* short *out1: second integer argument register */
22 #define ARG1 %rsi
23 /* real *samples: third integer argument register (%rcx stays free for use as scratch) */
24 #define ARG2 %rdx
25 #endif
26
27 /*
28 void dct64_x86_64(short *out0, short *out1, real *samples);
29 */
30
/*
 costab_x86_64: 32 dwords (128 bytes) of multiplier constants for
 dct64_x86_64(). The function loads them four at a time with movaps and
 feeds them to mulps, so every entry is the bit pattern of an IEEE-754
 single-precision float written as a decimal .long
 (e.g. 1056974725 = 0x3F002785, about 0.500603;
       1060439283 = 0x3F3504F3, about 0.707107).
 NOTE(review): the values look like the usual mpg123 dct64 cosine
 coefficients of the form 0.5/cos(...) -- confirm against the C reference
 implementation before relying on that.

 The table goes to .rodata, except on Apple targets where .data is used.
 ALIGN32 is not defined in this file (presumably it comes from mangle.h);
 the movaps loads in the function need the table at least 16-byte aligned.
*/
31 #ifndef __APPLE__
32 .section .rodata
33 #else
34 .data
35 #endif
36 ALIGN32
37 ASM_NAME(costab_x86_64):
/* entries 0..15 (byte offsets 0, 16, 32, 48): scale the first-stage differences held in xmm15, xmm14, xmm13, xmm12 */
38 .long 1056974725
39 .long 1057056395
40 .long 1057223771
41 .long 1057485416
42 .long 1057855544
43 .long 1058356026
44 .long 1059019886
45 .long 1059897405
46 .long 1061067246
47 .long 1062657950
48 .long 1064892987
49 .long 1066774581
50 .long 1069414683
51 .long 1073984175
52 .long 1079645762
53 .long 1092815430
/* entries 16..23 (byte offsets 64 and 80): second-stage multipliers */
54 .long 1057005197
55 .long 1057342072
56 .long 1058087743
57 .long 1059427869
58 .long 1061799040
59 .long 1065862217
60 .long 1071413542
61 .long 1084439708
/* entries 24..27 (byte offset 96): third-stage multipliers */
62 .long 1057128951
63 .long 1058664893
64 .long 1063675095
65 .long 1076102863
/* entries 28..31 (byte offset 112), loaded as one vector: lanes 0-1 are duplicated into a 4-lane multiplier (movlhps), lane 2 is broadcast with alternating sign (shufps $0xaa + xorps), lane 3 is never used as a multiplier and only pads the vector */
66 .long 1057655764
67 .long 1067924853
68 .long 1060439283
69 .long 0
70 .text
71 ALIGN16
72 .globl ASM_NAME(dct64_x86_64)
/*
 void dct64_x86_64(short *out0, short *out1, real *samples);

 SSE implementation of mpg123's dct64 (see the file header).

 In:   ARG2 = samples; 32 consecutive 32-bit floats are read
              (byte offsets 0..127).
       ARG0 = out0, ARG1 = out1 (register mapping: see the ARGn macros).
 Out:  17 16-bit values are stored to out0 at byte offsets 0, 32, ..., 512
       and 16 to out1 at byte offsets 0, 32, ..., 480; the bytes in
       between are not written. out0[0] and out1[0] receive the same value.
       No return value is produced.
 Uses: %eax and %rcx/%ecx as scratch, xmm0-xmm5 and xmm8-xmm15, flags.
       Under _WIN64, xmm6-xmm15 are saved to and restored from the stack
       and %r9 is overwritten with the out0 pointer.

 MOVUAPS, ALIGN16, ASM_NAME and NONEXEC_STACK are not defined in this
 file; presumably they come from mangle.h -- confirm there.

 Lane numbering in the comments below: lane 0 is the lowest 32 bits of an
 xmm register. "rev(x)" means x with its four lanes in reverse order,
 which is what shufps/pshufd with immediate 0x1b produce.
*/
73 ASM_NAME(dct64_x86_64):
74 #ifdef _WIN64 /* should save xmm6-15 */
75 movq %rcx, ARG0 /* move out0 out of %rcx, which is overwritten below */
76 subq $168, %rsp /* stack alignment + 10 xmm registers: 8 bytes bring %rsp (8 mod 16 after the call) to a 16-byte boundary for movaps, 160 bytes hold xmm6-xmm15 */
77 movaps %xmm6, (%rsp)
78 movaps %xmm7, 16(%rsp)
79 movaps %xmm8, 32(%rsp)
80 movaps %xmm9, 48(%rsp)
81 movaps %xmm10, 64(%rsp)
82 movaps %xmm11, 80(%rsp)
83 movaps %xmm12, 96(%rsp)
84 movaps %xmm13, 112(%rsp)
85 movaps %xmm14, 128(%rsp)
86 movaps %xmm15, 144(%rsp)
87 #endif
88
89 leaq ASM_NAME(costab_x86_64)(%rip), %rcx /* %rcx = coefficient table, addressed RIP-relative */
90
/*
 Stage 1a: combine samples[0..7] with the mirrored samples[24..31].
   xmm8  = s[0..3] + rev(s[28..31])     xmm15 = s[0..3] - rev(s[28..31])
   xmm9  = s[4..7] + rev(s[24..27])     xmm14 = s[4..7] - rev(s[24..27])
*/
91 MOVUAPS (ARG2), %xmm15
92 MOVUAPS 16(ARG2), %xmm14
93 MOVUAPS 112(ARG2), %xmm0
94 MOVUAPS 96(ARG2), %xmm1
95 shufps $0x1b, %xmm0, %xmm0
96 shufps $0x1b, %xmm1, %xmm1
97 movaps %xmm15, %xmm8
98 movaps %xmm14, %xmm9
99 addps %xmm0, %xmm8
100 addps %xmm1, %xmm9
101 subps %xmm0, %xmm15
102 subps %xmm1, %xmm14
103
/*
 Stage 1b: same for samples[8..15] against the mirrored samples[16..23].
   xmm10 = s[8..11]  + rev(s[20..23])   xmm13 = s[8..11]  - rev(s[20..23])
   xmm11 = s[12..15] + rev(s[16..19])   xmm12 = s[12..15] - rev(s[16..19])
*/
104 MOVUAPS 32(ARG2), %xmm13
105 MOVUAPS 48(ARG2), %xmm12
106 MOVUAPS 80(ARG2), %xmm0
107 MOVUAPS 64(ARG2), %xmm1
108 shufps $0x1b, %xmm0, %xmm0
109 shufps $0x1b, %xmm1, %xmm1
110 movaps %xmm13, %xmm10
111 movaps %xmm12, %xmm11
112 addps %xmm0, %xmm10
113 addps %xmm1, %xmm11
114 subps %xmm0, %xmm13
115 subps %xmm1, %xmm12
116
/* Scale the four difference vectors by table entries 0..15. */
117 movaps (%rcx), %xmm0
118 movaps 16(%rcx), %xmm1
119 movaps 32(%rcx), %xmm2
120 movaps 48(%rcx), %xmm3
121 mulps %xmm0, %xmm15
122 mulps %xmm1, %xmm14
123 mulps %xmm2, %xmm13
124 mulps %xmm3, %xmm12
125
/*
 Stage 2: xmm0 = table entries 16..19, xmm1 = entries 20..23.
 Sum/difference of the vector pairs (8, rev 11), (9, rev 10),
 (15, rev 12) and (14, rev 13). Sums end up in xmm8, xmm9, xmm12, xmm13;
 differences end up in xmm11, xmm10, xmm15, xmm14 and are scaled.
 xmm2-xmm5 hold temporaries (reversed copies and pre-subtraction values).
*/
126 movaps 64(%rcx), %xmm0
127 movaps 80(%rcx), %xmm1
128
129 pshufd $0x1b, %xmm11, %xmm2
130 pshufd $0x1b, %xmm10, %xmm3
131 shufps $0x1b, %xmm13, %xmm13
132 shufps $0x1b, %xmm12, %xmm12
133 movaps %xmm8, %xmm11
134 movaps %xmm9, %xmm10
135 movaps %xmm14, %xmm4
136 movaps %xmm15, %xmm5
137 subps %xmm2, %xmm11
138 subps %xmm3, %xmm10
139 subps %xmm13, %xmm14
140 subps %xmm12, %xmm15
141 addps %xmm2, %xmm8
142 addps %xmm3, %xmm9
143 addps %xmm5, %xmm12
144 addps %xmm4, %xmm13
145 mulps %xmm0, %xmm11
146 mulps %xmm1, %xmm10
147 mulps %xmm1, %xmm14
148 mulps %xmm0, %xmm15
149
/*
 Stage 3: xmm0 = table entries 24..27, shared by all four products.
 Sum/difference of the pairs (8, rev 9), (12, rev 13), (11, rev 10) and
 (15, rev 14). Sums end up in xmm8, xmm12, xmm10, xmm14; differences end
 up in xmm9, xmm13, xmm11, xmm15 and are scaled by xmm0.
*/
150 movaps 96(%rcx), %xmm0
151
152 pshufd $0x1b, %xmm9, %xmm1
153 pshufd $0x1b, %xmm13, %xmm2
154 shufps $0x1b, %xmm10, %xmm10
155 shufps $0x1b, %xmm14, %xmm14
156 movaps %xmm8, %xmm9
157 movaps %xmm12, %xmm13
158 movaps %xmm11, %xmm3
159 movaps %xmm15, %xmm4
160 subps %xmm1, %xmm9
161 subps %xmm2, %xmm13
162 subps %xmm10, %xmm11
163 subps %xmm14, %xmm15
164 addps %xmm1, %xmm8
165 addps %xmm2, %xmm12
166 addps %xmm3, %xmm10
167 addps %xmm4, %xmm14
168 mulps %xmm0, %xmm9
169 mulps %xmm0, %xmm13
170 mulps %xmm0, %xmm11
171 mulps %xmm0, %xmm15
172
/*
 Stage 4: xmm0 = table entries 28..31 (kept for stage 5);
 xmm1 = { e28, e29, e28, e29 } (movlhps copies the low half onto the high
 half). From here on the combining happens between lanes inside a
 register, so data is regrouped with shufps first.
*/
173 movaps 112(%rcx), %xmm0
174 movaps %xmm0, %xmm1
175 movlhps %xmm1, %xmm1
176
/*
 xmm8..xmm11: shufps $0x44 collects lanes 0,1 of two registers, $0xbb
 collects lanes 3,2 (i.e. the high halves, swapped). The sums go back to
 xmm8/xmm9, the differences (xmm2/xmm3) are scaled by xmm1, and the
 $0x14 / $0xbe shuffles merge sums and scaled differences back into
 xmm8..xmm11.
*/
177 movaps %xmm8, %xmm2
178 movaps %xmm9, %xmm3
179 shufps $0x44, %xmm10, %xmm2
180 shufps $0xbb, %xmm11, %xmm9
181 shufps $0xbb, %xmm10, %xmm8
182 shufps $0x44, %xmm11, %xmm3
183 movaps %xmm2, %xmm4
184 movaps %xmm3, %xmm5
185 subps %xmm8, %xmm2
186 subps %xmm9, %xmm3
187 addps %xmm4, %xmm8
188 addps %xmm5, %xmm9
189 mulps %xmm1, %xmm2
190 mulps %xmm1, %xmm3
191 movaps %xmm8, %xmm10
192 movaps %xmm9, %xmm11
193 shufps $0x14, %xmm2, %xmm8
194 shufps $0xbe, %xmm2, %xmm10
195 shufps $0x14, %xmm3, %xmm9
196 shufps $0xbe, %xmm3, %xmm11
197
/* Same instruction sequence applied to xmm12..xmm15. */
198 movaps %xmm12, %xmm2
199 movaps %xmm13, %xmm3
200 shufps $0x44, %xmm14, %xmm2
201 shufps $0xbb, %xmm15, %xmm13
202 shufps $0xbb, %xmm14, %xmm12
203 shufps $0x44, %xmm15, %xmm3
204 movaps %xmm2, %xmm4
205 movaps %xmm3, %xmm5
206 subps %xmm12, %xmm2
207 subps %xmm13, %xmm3
208 addps %xmm4, %xmm12
209 addps %xmm5, %xmm13
210 mulps %xmm1, %xmm2
211 mulps %xmm1, %xmm3
212 movaps %xmm12, %xmm14
213 movaps %xmm13, %xmm15
214 shufps $0x14, %xmm2, %xmm12
215 shufps $0xbe, %xmm2, %xmm14
216 shufps $0x14, %xmm3, %xmm13
217 shufps $0xbe, %xmm3, %xmm15
218
/*
 Stage 5 multiplier: broadcast table entry 30 to all lanes, then flip the
 sign of lanes 1 and 3. The mask is built without a memory constant:
 all-ones -> 0x80000000 in every dword (pslld 31) -> shifted up by 32
 within each qword, which leaves { 0, 0x80000000, 0, 0x80000000 }.
 Result: xmm0 = { e30, -e30, e30, -e30 }.
*/
219 shufps $0xaa, %xmm0, %xmm0
220 pcmpeqd %xmm1, %xmm1
221 pslld $31, %xmm1
222 psllq $32, %xmm1
223 xorps %xmm1, %xmm0
224
/*
 xmm8..xmm11: two rounds of unpcklps/unpckhps interleave the lanes of the
 register pairs (8,9) and (10,11); the interleaved vectors are then
 added and subtracted, the differences (xmm1, xmm2) are multiplied by
 xmm0, and a final unpack round interleaves sums with scaled differences
 back into xmm8..xmm11.
*/
225 movaps %xmm8, %xmm1
226 movaps %xmm10, %xmm2
227 unpcklps %xmm9, %xmm8
228 unpckhps %xmm9, %xmm1
229 unpcklps %xmm11, %xmm10
230 unpckhps %xmm11, %xmm2
231 movaps %xmm8, %xmm3
232 movaps %xmm10, %xmm4
233 unpcklps %xmm1, %xmm8
234 unpckhps %xmm1, %xmm3
235 unpcklps %xmm2, %xmm10
236 unpckhps %xmm2, %xmm4
237 movaps %xmm8, %xmm1
238 movaps %xmm10, %xmm2
239 subps %xmm3, %xmm1
240 subps %xmm4, %xmm2
241 addps %xmm3, %xmm8
242 addps %xmm4, %xmm10
243 mulps %xmm0, %xmm1
244 mulps %xmm0, %xmm2
245 movaps %xmm8, %xmm9
246 movaps %xmm10, %xmm11
247 unpcklps %xmm1, %xmm8
248 unpckhps %xmm1, %xmm9
249 unpcklps %xmm2, %xmm10
250 unpckhps %xmm2, %xmm11
251
/* Same instruction sequence applied to xmm12..xmm15. */
252 movaps %xmm12, %xmm1
253 movaps %xmm14, %xmm2
254 unpcklps %xmm13, %xmm12
255 unpckhps %xmm13, %xmm1
256 unpcklps %xmm15, %xmm14
257 unpckhps %xmm15, %xmm2
258 movaps %xmm12, %xmm3
259 movaps %xmm14, %xmm4
260 unpcklps %xmm1, %xmm12
261 unpckhps %xmm1, %xmm3
262 unpcklps %xmm2, %xmm14
263 unpckhps %xmm2, %xmm4
264 movaps %xmm12, %xmm1
265 movaps %xmm14, %xmm2
266 subps %xmm3, %xmm1
267 subps %xmm4, %xmm2
268 addps %xmm3, %xmm12
269 addps %xmm4, %xmm14
270 mulps %xmm0, %xmm1
271 mulps %xmm0, %xmm2
272 movaps %xmm12, %xmm13
273 movaps %xmm14, %xmm15
274 unpcklps %xmm1, %xmm12
275 unpckhps %xmm1, %xmm13
276 unpcklps %xmm2, %xmm14
277 unpckhps %xmm2, %xmm15
278
279
/*
 Post-additions, part 1: for each of xmm8..xmm15 add lane 3 into lane 2.
 The helper register is zeroed, shufpd $0x2 copies the source's high
 64 bits (lanes 2,3) into its high half, and psrlq $32 moves lane 3 down
 into lane 2 while clearing lane 3, giving { 0, 0, x[3], 0 } to add.
*/
280 xorps %xmm0, %xmm0
281 xorps %xmm1, %xmm1
282 shufpd $0x2, %xmm8, %xmm0
283 shufpd $0x2, %xmm9, %xmm1
284 psrlq $32, %xmm0
285 psrlq $32, %xmm1
286 addps %xmm0, %xmm8
287 addps %xmm1, %xmm9
288
289 xorps %xmm0, %xmm0
290 xorps %xmm1, %xmm1
291 shufpd $0x2, %xmm10, %xmm0
292 shufpd $0x2, %xmm11, %xmm1
293 psrlq $32, %xmm0
294 psrlq $32, %xmm1
295 addps %xmm0, %xmm10
296 addps %xmm1, %xmm11
297
298 xorps %xmm0, %xmm0
299 xorps %xmm1, %xmm1
300 shufpd $0x2, %xmm12, %xmm0
301 shufpd $0x2, %xmm13, %xmm1
302 psrlq $32, %xmm0
303 psrlq $32, %xmm1
304 addps %xmm0, %xmm12
305 addps %xmm1, %xmm13
306
307 xorps %xmm0, %xmm0
308 xorps %xmm1, %xmm1
309 shufpd $0x2, %xmm14, %xmm0
310 shufpd $0x2, %xmm15, %xmm1
311 psrlq $32, %xmm0
312 psrlq $32, %xmm1
313 addps %xmm0, %xmm14
314 addps %xmm1, %xmm15
315
/*
 Post-additions, part 2, on the odd registers xmm9, xmm11, xmm13, xmm15:
 pshufd $0x78 followed by a 4-byte right shift yields { x[2], x[3], x[1], 0 },
 so lane 0 += lane 2, lane 1 += lane 3, lane 2 += lane 1 (all using the
 values from before this step) and lane 3 is left unchanged.
*/
316 pshufd $0x78, %xmm9, %xmm0
317 pshufd $0x78, %xmm11, %xmm1
318 pshufd $0x78, %xmm13, %xmm2
319 pshufd $0x78, %xmm15, %xmm3
320 psrldq $4, %xmm0
321 psrldq $4, %xmm1
322 psrldq $4, %xmm2
323 psrldq $4, %xmm3
324 addps %xmm0, %xmm9
325 addps %xmm1, %xmm11
326 addps %xmm2, %xmm13
327 addps %xmm3, %xmm15
328
/*
 Post-additions, part 3: xmm0/xmm1 capture { x[2], x[3], x[1], 0 } of
 xmm10/xmm14 BEFORE those registers are modified; then
 xmm10 += xmm11, xmm14 += xmm15, xmm11 += xmm0, xmm15 += xmm1.
 The order matters: xmm10/xmm14 must take xmm11/xmm15 before the latter
 receive their own additions.
*/
329 pshufd $0x78, %xmm10, %xmm0
330 pshufd $0x78, %xmm14, %xmm1
331 psrldq $4, %xmm0
332 psrldq $4, %xmm1
333 addps %xmm11, %xmm10
334 addps %xmm15, %xmm14
335 addps %xmm0, %xmm11
336 addps %xmm1, %xmm15
337
/*
 Convert xmm8..xmm11 to 32-bit integers (cvtps2dq rounds according to the
 current MXCSR rounding mode) and narrow to 16 bits with signed
 saturation: xmm8 = words of { xmm8, xmm10 }, xmm9 = words of { xmm9, xmm11 }.
*/
338 cvtps2dq %xmm8, %xmm8
339 cvtps2dq %xmm9, %xmm9
340 cvtps2dq %xmm10, %xmm10
341 cvtps2dq %xmm11, %xmm11
342 packssdw %xmm10, %xmm8
343 packssdw %xmm11, %xmm9
344
/*
 Scatter the 16 packed words. Each movd pulls two adjacent words into
 %eax / %ecx: the low word is stored to out0, the high word (after
 shrl $16) to out1. Words 0,1 of xmm8 / xmm9 are handled here; the high
 word of xmm8's pair is written to both out0[0] and out1[0].
 %rcx no longer points at the coefficient table from here on.
*/
345 movd %xmm8, %eax
346 movd %xmm9, %ecx
347 movw %ax, 512(ARG0)
348 movw %cx, 384(ARG0)
349 shrl $16, %eax
350 shrl $16, %ecx
351 movw %ax, (ARG0)
352 movw %ax, (ARG1)
353 movw %cx, 128(ARG1)
354
/* Words 4,5: movhlps brings the high 64 bits of xmm8 / xmm9 down into xmm0 / xmm1. */
355 movhlps %xmm8, %xmm0
356 movhlps %xmm9, %xmm1
357 movd %xmm0, %eax
358 movd %xmm1, %ecx
359 movw %ax, 448(ARG0)
360 movw %cx, 320(ARG0)
361 shrl $16, %eax
362 shrl $16, %ecx
363 movw %ax, 64(ARG1)
364 movw %cx, 192(ARG1)
365
/* Words 2,3: pshuflw $0xee moves words 2,3 of the low quadword into word positions 0,1. */
366 pshuflw $0xee, %xmm8, %xmm2
367 pshuflw $0xee, %xmm9, %xmm3
368 movd %xmm2, %eax
369 movd %xmm3, %ecx
370 movw %ax, 256(ARG0)
371 movw %cx, 128(ARG0)
372 shrl $16, %eax
373 shrl $16, %ecx
374 movw %ax, 256(ARG1)
375 movw %cx, 384(ARG1)
376
/* Words 6,7: the same pshuflw applied to the copies of the high halves in xmm0 / xmm1. */
377 pshuflw $0xee, %xmm0, %xmm0
378 pshuflw $0xee, %xmm1, %xmm1
379 movd %xmm0, %eax
380 movd %xmm1, %ecx
381 movw %ax, 192(ARG0)
382 movw %cx, 64(ARG0)
383 shrl $16, %eax
384 shrl $16, %ecx
385 movw %ax, 320(ARG1)
386 movw %cx, 448(ARG1)
387
/*
 Final additions for xmm12..xmm15, using copies (xmm0..xmm3) of the
 values from before this step:
   xmm12 += xmm14, xmm13 += xmm15, xmm14 += xmm13,
   xmm15 += { x12[2], x12[3], x12[1], 0 }.
 The last operand is built by shufps $0x1e (-> { x[2], x[3], x[1], x[0] })
 followed by a 4-byte shift left and right, which clears the top lane.
 NOTE(review): at that point xmm0 holds { 0, x[2], x[3], x[1] } after the
 left shift and { x[2], x[3], x[1], 0 } after the right shift.
*/
388 movaps %xmm12, %xmm0
389 movaps %xmm13, %xmm1
390 movaps %xmm14, %xmm2
391 movaps %xmm15, %xmm3
392 shufps $0x1e, %xmm0, %xmm0
393 pslldq $4, %xmm0
394 psrldq $4, %xmm0
395 addps %xmm2, %xmm12
396 addps %xmm3, %xmm13
397 addps %xmm1, %xmm14
398 addps %xmm0, %xmm15
399
/* Convert and saturate as above: xmm12 = words of { xmm12, xmm13 }, xmm14 = words of { xmm14, xmm15 }. */
400 cvtps2dq %xmm12, %xmm12
401 cvtps2dq %xmm13, %xmm13
402 cvtps2dq %xmm14, %xmm14
403 cvtps2dq %xmm15, %xmm15
404 packssdw %xmm13, %xmm12
405 packssdw %xmm15, %xmm14
406
/* Scatter the remaining 16 words (the byte offsets that are odd multiples of 32): words 0,1 first. */
407 movd %xmm12, %eax
408 movd %xmm14, %ecx
409 movw %ax, 480(ARG0)
410 movw %cx, 416(ARG0)
411 shrl $16, %eax
412 shrl $16, %ecx
413 movw %ax, 32(ARG1)
414 movw %cx, 96(ARG1)
415
/* Words 2,3. */
416 pshuflw $0xee, %xmm12, %xmm0
417 pshuflw $0xee, %xmm14, %xmm1
418 movd %xmm0, %eax
419 movd %xmm1, %ecx
420 movw %ax, 224(ARG0)
421 movw %cx, 160(ARG0)
422 shrl $16, %eax
423 shrl $16, %ecx
424 movw %ax, 288(ARG1)
425 movw %cx, 352(ARG1)
426
/* Words 4,5 (high halves moved down with movhlps). */
427 movhlps %xmm12, %xmm0
428 movhlps %xmm14, %xmm1
429 movd %xmm0, %eax
430 movd %xmm1, %ecx
431 movw %ax, 352(ARG0)
432 movw %cx, 288(ARG0)
433 shrl $16, %eax
434 shrl $16, %ecx
435 movw %ax, 160(ARG1)
436 movw %cx, 224(ARG1)
437
/* Words 6,7. */
438 pshuflw $0xee, %xmm0, %xmm0
439 pshuflw $0xee, %xmm1, %xmm1
440 movd %xmm0, %eax
441 movd %xmm1, %ecx
442 movw %ax, 96(ARG0)
443 movw %cx, 32(ARG0)
444 shrl $16, %eax
445 shrl $16, %ecx
446 movw %ax, 416(ARG1)
447 movw %cx, 480(ARG1)
448
/* Win64 epilogue: restore xmm6-xmm15 from the slots filled in the prologue and release the 168 bytes. */
449 #ifdef _WIN64
450 movaps (%rsp), %xmm6
451 movaps 16(%rsp), %xmm7
452 movaps 32(%rsp), %xmm8
453 movaps 48(%rsp), %xmm9
454 movaps 64(%rsp), %xmm10
455 movaps 80(%rsp), %xmm11
456 movaps 96(%rsp), %xmm12
457 movaps 112(%rsp), %xmm13
458 movaps 128(%rsp), %xmm14
459 movaps 144(%rsp), %xmm15
460 addq $168, %rsp
461 #endif
462 ret
463
/* NONEXEC_STACK is not defined in this file; presumably a mangle.h macro that marks the object as not needing an executable stack -- confirm there. */
464 NONEXEC_STACK