2 synth_stereo_x86_64_accurate: SSE optimized synth for x86-64 (stereo specific, MPEG-compliant 16bit output version)
4 copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
5 see COPYING and AUTHORS files in distribution or http://mpg123.org
6 initially written by Taihei Monma
31 #define XMMREG_MAX (%r10) /* {32767.0, 32767.0, 32767.0, 32767.0} */
32 #define XMMREG_MIN (%r11) /* {-32768.0, -32768.0, -32768.0, -32768.0} */
33 #define TEMP_CLIP (%rsp)
36 int synth_1to1_stereo_x86_64_accurate_asm(real *window, real *b0l, real *b0r, short *samples, int bo1);
37 return value: number of clipped samples
57 .globl ASM_NAME(synth_1to1_stereo_x86_64_accurate_asm) /* export the entry point */
58 ASM_NAME(synth_1to1_stereo_x86_64_accurate_asm):
59 #ifdef _WIN64 /* Win64 ABI: xmm6-15 are callee-saved, so they are spilled below */
60 movl 40(%rsp), %eax /* 5th argument; placed after 32-byte shadow space */
62 subq $176, %rsp /* 10 xmm registers (160 bytes) + 16-byte temp; the temp is TEMP_CLIP at 0(%rsp) */
63 movaps %xmm6, 16(%rsp) /* save area starts at 16(%rsp) because 0(%rsp) holds TEMP_CLIP */
64 movaps %xmm7, 32(%rsp)
65 movaps %xmm8, 48(%rsp)
66 movaps %xmm9, 64(%rsp)
67 movaps %xmm10, 80(%rsp)
68 movaps %xmm11, 96(%rsp)
69 movaps %xmm12, 112(%rsp)
70 movaps %xmm13, 128(%rsp)
71 movaps %xmm14, 144(%rsp)
72 movaps %xmm15, 160(%rsp) /* last save slot: bytes 160..175 of the 176-byte frame */
74 subq $24, %rsp /* stack alignment + temp */
77 leaq ASM_NAME(maxmin_s16)(%rip), %r10 /* r10 = address of maxmin_s16 (RIP-relative); XMMREG_MAX is defined as (%r10) */
80 movaps %xmm0, TEMP_CLIP /* initialise the clip accumulator slot from xmm0; NOTE(review): xmm0 setup is not visible here, presumably zero - confirm */
92 leaq 64(WINDOW), WINDOW /* WINDOW += 64 bytes */
99 movups (WINDOW), %xmm8 /* unaligned loads: 64 bytes of window data at WINDOW+0 ... */
100 movups 16(WINDOW), %xmm1
101 movups 32(WINDOW), %xmm2
102 movups 48(WINDOW), %xmm3
103 movups 128(WINDOW), %xmm9 /* ... and another 64 bytes at WINDOW+128 */
104 movups 144(WINDOW), %xmm5
105 movups 160(WINDOW), %xmm6
106 movups 176(WINDOW), %xmm7
122 mulps 112(B0L), %xmm7 /* packed multiply by the data at B0L+112; an SSE memory operand here must be 16-byte aligned */
125 mulps 32(B0R), %xmm10 /* packed multiplies by the data at B0R+32 .. B0R+112 */
126 mulps 48(B0R), %xmm11
127 mulps 64(B0R), %xmm12
128 mulps 80(B0R), %xmm13
129 mulps 96(B0R), %xmm14
130 mulps 112(B0R), %xmm15
145 movaps %xmm14, %xmm13
146 leaq 256(WINDOW), WINDOW /* WINDOW += 256 bytes */
150 movups (WINDOW), %xmm10 /* same load pattern as above, now into xmm10/xmm11 for the first and fifth vectors */
151 movups 16(WINDOW), %xmm1
152 movups 32(WINDOW), %xmm2
153 movups 48(WINDOW), %xmm3
154 movups 128(WINDOW), %xmm11
155 movups 144(WINDOW), %xmm5
156 movups 160(WINDOW), %xmm6
157 movups 176(WINDOW), %xmm7
168 mulps 32(B0R), %xmm14
169 mulps 48(B0R), %xmm15
178 mulps 64(B0L), %xmm11
181 mulps 112(B0L), %xmm7
185 mulps 112(B0R), %xmm15
196 leaq 256(WINDOW), WINDOW /* WINDOW += 256 bytes */
204 unpcklps %xmm9, %xmm8 /* unpcklps/unpckhps interleave the low/high 32-bit lane pairs of two registers */
205 unpcklps %xmm11, %xmm10
206 unpckhps %xmm9, %xmm0
207 unpckhps %xmm11, %xmm1
208 unpcklps %xmm13, %xmm12
209 unpcklps %xmm15, %xmm14
210 unpckhps %xmm13, %xmm4
211 unpckhps %xmm15, %xmm5
216 movlhps %xmm10, %xmm8 /* movlhps: high 64 bits of destination = low 64 bits of source */
217 movhlps %xmm2, %xmm10 /* movhlps: low 64 bits of destination = high 64 bits of source */
220 movlhps %xmm14, %xmm12
221 movhlps %xmm6, %xmm14
235 cmpnleps XMMREG_MAX, %xmm2 /* xmm2 lanes become all-ones where NOT (value <= max), else zero */
236 cmpltps XMMREG_MIN, %xmm3 /* xmm3 lanes become all-ones where value < min, else zero */
237 cmpnleps XMMREG_MAX, %xmm5
238 cmpltps XMMREG_MIN, %xmm6
239 cvtps2dq %xmm0, %xmm0 /* float -> int32, rounded according to the current MXCSR rounding mode */
240 cvtps2dq %xmm4, %xmm4
242 unpcklps %xmm4, %xmm0 /* interleave the 32-bit lanes of the two converted registers */
243 unpckhps %xmm4, %xmm1
244 packssdw %xmm1, %xmm0 /* int32 -> int16 with signed saturation */
245 movups %xmm0, (SAMPLES) /* unaligned 16-byte store of eight 16-bit values */
247 packssdw %xmm5, %xmm2 /* narrow the comparison masks to 16-bit lanes */
248 packssdw %xmm6, %xmm3
252 paddw TEMP_CLIP, %xmm2 /* add the running per-lane totals kept in TEMP_CLIP */
253 movaps %xmm2, TEMP_CLIP /* write the updated totals back */
255 leaq 16(SAMPLES), SAMPLES /* SAMPLES += 16 bytes */
263 movups (WINDOW), %xmm8 /* unaligned loads: 64 bytes of window data at WINDOW+0 ... */
264 movups 16(WINDOW), %xmm1
265 movups 32(WINDOW), %xmm2
266 movups 48(WINDOW), %xmm3
267 movups 128(WINDOW), %xmm9 /* ... and another 64 bytes at WINDOW+128 */
268 movups 144(WINDOW), %xmm5
269 movups 160(WINDOW), %xmm6
270 movups 176(WINDOW), %xmm7
283 mulps -64(B0L), %xmm9 /* this group reads B0L at negative offsets (-64 .. -16) for the second set of four vectors */
284 mulps -48(B0L), %xmm5
285 mulps -32(B0L), %xmm6
286 mulps -16(B0L), %xmm7
289 mulps 32(B0R), %xmm10
290 mulps 48(B0R), %xmm11
291 mulps -64(B0R), %xmm12 /* likewise B0R at -64 .. -16 */
292 mulps -48(B0R), %xmm13
293 mulps -32(B0R), %xmm14
294 mulps -16(B0R), %xmm15
309 movaps %xmm14, %xmm13
310 leaq 256(WINDOW), WINDOW /* WINDOW += 256 bytes */
314 movups (WINDOW), %xmm10 /* same load pattern, now into xmm10/xmm11 for the first and fifth vectors */
315 movups 16(WINDOW), %xmm1
316 movups 32(WINDOW), %xmm2
317 movups 48(WINDOW), %xmm3
318 movups 128(WINDOW), %xmm11
319 movups 144(WINDOW), %xmm5
320 movups 160(WINDOW), %xmm6
321 movups 176(WINDOW), %xmm7
332 mulps 32(B0R), %xmm14
333 mulps 48(B0R), %xmm15
342 mulps -64(B0L), %xmm11
343 mulps -48(B0L), %xmm5
344 mulps -32(B0L), %xmm6
345 mulps -16(B0L), %xmm7
346 mulps -64(B0R), %xmm1
347 mulps -48(B0R), %xmm2
348 mulps -32(B0R), %xmm4
349 mulps -16(B0R), %xmm15
360 leaq 256(WINDOW), WINDOW /* WINDOW += 256 bytes */
368 unpcklps %xmm9, %xmm8 /* unpcklps/unpckhps interleave the low/high 32-bit lane pairs of two registers */
369 unpcklps %xmm11, %xmm10
370 unpckhps %xmm9, %xmm0
371 unpckhps %xmm11, %xmm1
372 unpcklps %xmm13, %xmm12
373 unpcklps %xmm15, %xmm14
374 unpckhps %xmm13, %xmm4
375 unpckhps %xmm15, %xmm5
380 movlhps %xmm10, %xmm8 /* movlhps: high 64 bits of destination = low 64 bits of source */
381 movhlps %xmm2, %xmm10 /* movhlps: low 64 bits of destination = high 64 bits of source */
384 movlhps %xmm14, %xmm12
385 movhlps %xmm6, %xmm14
399 cmpnleps XMMREG_MAX, %xmm2 /* xmm2 lanes become all-ones where NOT (value <= max), else zero */
400 cmpltps XMMREG_MIN, %xmm3 /* xmm3 lanes become all-ones where value < min, else zero */
401 cmpnleps XMMREG_MAX, %xmm5
402 cmpltps XMMREG_MIN, %xmm6
403 cvtps2dq %xmm0, %xmm0 /* float -> int32, rounded according to the current MXCSR rounding mode */
404 cvtps2dq %xmm4, %xmm4
406 unpcklps %xmm4, %xmm0 /* interleave the 32-bit lanes of the two converted registers */
407 unpckhps %xmm4, %xmm1
408 packssdw %xmm1, %xmm0 /* int32 -> int16 with signed saturation */
409 movups %xmm0, (SAMPLES) /* unaligned 16-byte store of eight 16-bit values */
411 packssdw %xmm5, %xmm2 /* narrow the comparison masks to 16-bit lanes */
412 packssdw %xmm6, %xmm3
416 paddw TEMP_CLIP, %xmm2 /* add the running per-lane totals kept in TEMP_CLIP */
417 movaps %xmm2, TEMP_CLIP /* write the updated totals back */
419 leaq 16(SAMPLES), SAMPLES /* SAMPLES += 16 bytes */
423 movaps TEMP_CLIP, %xmm4 /* reload the accumulated 16-bit lane totals from the stack slot */
426 pshuflw $0x55, %xmm0, %xmm1 /* low four words of xmm1 = word 1 of xmm0 (high 64 bits copied from xmm0) */
427 pshuflw $0xaa, %xmm0, %xmm2 /* low four words of xmm2 = word 2 of xmm0 */
428 pshuflw $0xff, %xmm0, %xmm3 /* low four words of xmm3 = word 3 of xmm0; NOTE(review): the adds that combine these are not visible here - presumably a horizontal sum for the return value, confirm */
438 movaps 32(%rsp), %xmm7 /* Win64 epilogue: reload each callee-saved xmm register from the slot the prologue stored it in (xmm7 was saved at 32(%rsp)); NOTE(review): the xmm6 reload is outside this excerpt and must read 16(%rsp), since 0(%rsp) is TEMP_CLIP - confirm */
439 movaps 48(%rsp), %xmm8
440 movaps 64(%rsp), %xmm9
441 movaps 80(%rsp), %xmm10
442 movaps 96(%rsp), %xmm11
443 movaps 112(%rsp), %xmm12
444 movaps 128(%rsp), %xmm13
445 movaps 144(%rsp), %xmm14
446 movaps 160(%rsp), %xmm15 /* last save slot, matching the prologue's store of xmm15 at 160(%rsp) */