2 dct64_x86_64: SSE optimized dct64 for x86-64
4 copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
5 see COPYING and AUTHORS files in distribution or http://mpg123.org
6 initially written by Taihei Monma
28 void dct64_x86_64(short *out0, short *out1, real *samples);
37 ASM_NAME(costab_x86_64):
72 .globl ASM_NAME(dct64_x86_64)
73 ASM_NAME(dct64_x86_64):
74 #ifdef _WIN64 /* should save xmm6-15 */
76 subq $168, %rsp /* stack alignment + 10 xmm registers */
78 movaps %xmm7, 16(%rsp)
79 movaps %xmm8, 32(%rsp)
80 movaps %xmm9, 48(%rsp)
81 movaps %xmm10, 64(%rsp)
82 movaps %xmm11, 80(%rsp)
83 movaps %xmm12, 96(%rsp)
84 movaps %xmm13, 112(%rsp)
85 movaps %xmm14, 128(%rsp)
86 movaps %xmm15, 144(%rsp)
89 leaq ASM_NAME(costab_x86_64)(%rip), %rcx
91 MOVUAPS (ARG2), %xmm15
92 MOVUAPS 16(ARG2), %xmm14
93 MOVUAPS 112(ARG2), %xmm0
94 MOVUAPS 96(ARG2), %xmm1
95 shufps $0x1b, %xmm0, %xmm0
96 shufps $0x1b, %xmm1, %xmm1
104 MOVUAPS 32(ARG2), %xmm13
105 MOVUAPS 48(ARG2), %xmm12
106 MOVUAPS 80(ARG2), %xmm0
107 MOVUAPS 64(ARG2), %xmm1
108 shufps $0x1b, %xmm0, %xmm0
109 shufps $0x1b, %xmm1, %xmm1
110 movaps %xmm13, %xmm10
111 movaps %xmm12, %xmm11
118 movaps 16(%rcx), %xmm1
119 movaps 32(%rcx), %xmm2
120 movaps 48(%rcx), %xmm3
126 movaps 64(%rcx), %xmm0
127 movaps 80(%rcx), %xmm1
129 pshufd $0x1b, %xmm11, %xmm2
130 pshufd $0x1b, %xmm10, %xmm3
131 shufps $0x1b, %xmm13, %xmm13
132 shufps $0x1b, %xmm12, %xmm12
150 movaps 96(%rcx), %xmm0
152 pshufd $0x1b, %xmm9, %xmm1
153 pshufd $0x1b, %xmm13, %xmm2
154 shufps $0x1b, %xmm10, %xmm10
155 shufps $0x1b, %xmm14, %xmm14
157 movaps %xmm12, %xmm13
173 movaps 112(%rcx), %xmm0
179 shufps $0x44, %xmm10, %xmm2
180 shufps $0xbb, %xmm11, %xmm9
181 shufps $0xbb, %xmm10, %xmm8
182 shufps $0x44, %xmm11, %xmm3
193 shufps $0x14, %xmm2, %xmm8
194 shufps $0xbe, %xmm2, %xmm10
195 shufps $0x14, %xmm3, %xmm9
196 shufps $0xbe, %xmm3, %xmm11
200 shufps $0x44, %xmm14, %xmm2
201 shufps $0xbb, %xmm15, %xmm13
202 shufps $0xbb, %xmm14, %xmm12
203 shufps $0x44, %xmm15, %xmm3
212 movaps %xmm12, %xmm14
213 movaps %xmm13, %xmm15
214 shufps $0x14, %xmm2, %xmm12
215 shufps $0xbe, %xmm2, %xmm14
216 shufps $0x14, %xmm3, %xmm13
217 shufps $0xbe, %xmm3, %xmm15
219 shufps $0xaa, %xmm0, %xmm0
227 unpcklps %xmm9, %xmm8
228 unpckhps %xmm9, %xmm1
229 unpcklps %xmm11, %xmm10
230 unpckhps %xmm11, %xmm2
233 unpcklps %xmm1, %xmm8
234 unpckhps %xmm1, %xmm3
235 unpcklps %xmm2, %xmm10
236 unpckhps %xmm2, %xmm4
246 movaps %xmm10, %xmm11
247 unpcklps %xmm1, %xmm8
248 unpckhps %xmm1, %xmm9
249 unpcklps %xmm2, %xmm10
250 unpckhps %xmm2, %xmm11
254 unpcklps %xmm13, %xmm12
255 unpckhps %xmm13, %xmm1
256 unpcklps %xmm15, %xmm14
257 unpckhps %xmm15, %xmm2
260 unpcklps %xmm1, %xmm12
261 unpckhps %xmm1, %xmm3
262 unpcklps %xmm2, %xmm14
263 unpckhps %xmm2, %xmm4
272 movaps %xmm12, %xmm13
273 movaps %xmm14, %xmm15
274 unpcklps %xmm1, %xmm12
275 unpckhps %xmm1, %xmm13
276 unpcklps %xmm2, %xmm14
277 unpckhps %xmm2, %xmm15
282 shufpd $0x2, %xmm8, %xmm0
283 shufpd $0x2, %xmm9, %xmm1
291 shufpd $0x2, %xmm10, %xmm0
292 shufpd $0x2, %xmm11, %xmm1
300 shufpd $0x2, %xmm12, %xmm0
301 shufpd $0x2, %xmm13, %xmm1
309 shufpd $0x2, %xmm14, %xmm0
310 shufpd $0x2, %xmm15, %xmm1
316 pshufd $0x78, %xmm9, %xmm0
317 pshufd $0x78, %xmm11, %xmm1
318 pshufd $0x78, %xmm13, %xmm2
319 pshufd $0x78, %xmm15, %xmm3
329 pshufd $0x78, %xmm10, %xmm0
330 pshufd $0x78, %xmm14, %xmm1
338 cvtps2dq %xmm8, %xmm8
339 cvtps2dq %xmm9, %xmm9
340 cvtps2dq %xmm10, %xmm10
341 cvtps2dq %xmm11, %xmm11
342 packssdw %xmm10, %xmm8
343 packssdw %xmm11, %xmm9
366 pshuflw $0xee, %xmm8, %xmm2
367 pshuflw $0xee, %xmm9, %xmm3
377 pshuflw $0xee, %xmm0, %xmm0
378 pshuflw $0xee, %xmm1, %xmm1
392 shufps $0x1e, %xmm0, %xmm0
400 cvtps2dq %xmm12, %xmm12
401 cvtps2dq %xmm13, %xmm13
402 cvtps2dq %xmm14, %xmm14
403 cvtps2dq %xmm15, %xmm15
404 packssdw %xmm13, %xmm12
405 packssdw %xmm15, %xmm14
416 pshuflw $0xee, %xmm12, %xmm0
417 pshuflw $0xee, %xmm14, %xmm1
427 movhlps %xmm12, %xmm0
428 movhlps %xmm14, %xmm1
438 pshuflw $0xee, %xmm0, %xmm0
439 pshuflw $0xee, %xmm1, %xmm1
451 movaps 16(%rsp), %xmm7
452 movaps 32(%rsp), %xmm8
453 movaps 48(%rsp), %xmm9
454 movaps 64(%rsp), %xmm10
455 movaps 80(%rsp), %xmm11
456 movaps 96(%rsp), %xmm12
457 movaps 112(%rsp), %xmm13
458 movaps 128(%rsp), %xmm14
459 movaps 144(%rsp), %xmm15