2 synth_stereo_sse_accurate: SSE optimized synth (stereo specific, MPEG-compliant 16bit output version)
4 copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
5 see COPYING and AUTHORS files in distribution or http://mpg123.org
6 initially written by Taihei Monma
/*
	TEMP(n): memory operand for the n-th 16-byte scratch slot, located at byte
	offset 12+16*n from %esp. The slots are read and written with movaps, which
	requires a 16-byte aligned address, so %esp+12 must be 16-byte aligned
	(NOTE(review): the stack setup is not visible in this excerpt - confirm).
	The parameter is parenthesized so that an expression argument, e.g.
	TEMP(i+1), scales the whole index by 16 instead of only its first term.
*/
20 #define TEMP(n) (12+16*(n))(%esp)
/*
	MMREG_CLIP: MMX register cleared with pxor before the synthesis passes and
	then accumulated into with paddw after each group of output samples;
	presumably the source of the returned clip count (verify against the
	epilogue, which is not visible here).
*/
21 #define MMREG_CLIP %mm7
24 int synth_1to1_stereo_sse_accurate_asm(real *window, real *b0l, real *b0r, short *samples, int bo1);
25 return value: number of clipped samples
/*
	Single-precision constants stored as raw IEEE-754 bit patterns:
	1191181824 = 0x46FFFE00 = 32767.0 and -956301312 = 0xC7000000 = -32768.0,
	i.e. the largest and smallest values of a signed 16-bit sample.
	NOTE(review): presumably entries of the maxmin_s16 table that the
	cmpnleps/cmpltps instructions compare against - the label itself is not
	visible in this excerpt, confirm.
*/
35 .long 1191181824 /* 32767.0 */
39 .long -956301312 /* -32768.0 */
45 .globl ASM_NAME(synth_1to1_stereo_sse_accurate_asm)
46 ASM_NAME(synth_1to1_stereo_sse_accurate_asm):
55 pxor MMREG_CLIP, MMREG_CLIP
60 movl 20(%ebp), SAMPLES
64 leal 64(WINDOW), WINDOW
71 movups (WINDOW), %xmm0
72 movups 16(WINDOW), %xmm1
73 movups 32(WINDOW), %xmm2
74 movups 48(WINDOW), %xmm3
96 leal 128(WINDOW), WINDOW
100 movups (WINDOW), %xmm0
101 movups 16(WINDOW), %xmm1
102 movups 32(WINDOW), %xmm2
103 movups 48(WINDOW), %xmm3
122 movaps %xmm0, TEMP(1)
123 movaps %xmm4, TEMP(5)
125 leal 128(WINDOW), WINDOW
129 movups (WINDOW), %xmm0
130 movups 16(WINDOW), %xmm1
131 movups 32(WINDOW), %xmm2
132 movups 48(WINDOW), %xmm3
151 movaps %xmm0, TEMP(2)
152 movaps %xmm4, TEMP(6)
154 leal 128(WINDOW), WINDOW
158 movups (WINDOW), %xmm0
159 movups 16(WINDOW), %xmm1
160 movups 32(WINDOW), %xmm2
161 movups 48(WINDOW), %xmm3
181 movaps %xmm4, TEMP(7)
183 leal 128(WINDOW), WINDOW
187 movaps TEMP(0), %xmm4
188 movaps TEMP(1), %xmm5
189 movaps TEMP(2), %xmm6
192 unpcklps %xmm5, %xmm4
193 unpcklps %xmm7, %xmm6
194 unpckhps %xmm5, %xmm0
195 unpckhps %xmm7, %xmm1
207 movaps TEMP(4), %xmm4
208 movaps TEMP(5), %xmm5
209 movaps TEMP(6), %xmm6
210 movaps TEMP(7), %xmm7
213 unpcklps %xmm5, %xmm4
214 unpcklps %xmm7, %xmm6
215 unpckhps %xmm5, %xmm0
216 unpckhps %xmm7, %xmm1
232 cmpnleps ASM_NAME(maxmin_s16), %xmm1
233 cmpltps ASM_NAME(maxmin_s16)+16, %xmm2
234 cmpnleps ASM_NAME(maxmin_s16), %xmm3
235 cmpltps ASM_NAME(maxmin_s16)+16, %xmm4
248 movq %mm2, 8(SAMPLES)
273 paddw %mm0, MMREG_CLIP
275 leal 16(SAMPLES), SAMPLES
283 movups (WINDOW), %xmm0
284 movups 16(WINDOW), %xmm1
285 movups 32(WINDOW), %xmm2
286 movups 48(WINDOW), %xmm3
305 movaps %xmm0, TEMP(0)
306 movaps %xmm4, TEMP(4)
308 leal 128(WINDOW), WINDOW
312 movups (WINDOW), %xmm0
313 movups 16(WINDOW), %xmm1
314 movups 32(WINDOW), %xmm2
315 movups 48(WINDOW), %xmm3
334 movaps %xmm0, TEMP(1)
335 movaps %xmm4, TEMP(5)
337 leal 128(WINDOW), WINDOW
341 movups (WINDOW), %xmm0
342 movups 16(WINDOW), %xmm1
343 movups 32(WINDOW), %xmm2
344 movups 48(WINDOW), %xmm3
363 movaps %xmm0, TEMP(2)
364 movaps %xmm4, TEMP(6)
366 leal 128(WINDOW), WINDOW
370 movups (WINDOW), %xmm0
371 movups 16(WINDOW), %xmm1
372 movups 32(WINDOW), %xmm2
373 movups 48(WINDOW), %xmm3
393 movaps %xmm4, TEMP(7)
395 leal 128(WINDOW), WINDOW
399 movaps TEMP(0), %xmm4
400 movaps TEMP(1), %xmm5
401 movaps TEMP(2), %xmm6
404 unpcklps %xmm5, %xmm4
405 unpcklps %xmm7, %xmm6
406 unpckhps %xmm5, %xmm0
407 unpckhps %xmm7, %xmm1
419 movaps TEMP(4), %xmm4
420 movaps TEMP(5), %xmm5
421 movaps TEMP(6), %xmm6
422 movaps TEMP(7), %xmm7
425 unpcklps %xmm5, %xmm4
426 unpcklps %xmm7, %xmm6
427 unpckhps %xmm5, %xmm0
428 unpckhps %xmm7, %xmm1
444 cmpnleps ASM_NAME(maxmin_s16), %xmm1
445 cmpltps ASM_NAME(maxmin_s16)+16, %xmm2
446 cmpnleps ASM_NAME(maxmin_s16), %xmm3
447 cmpltps ASM_NAME(maxmin_s16)+16, %xmm4
460 movq %mm2, 8(SAMPLES)
485 paddw %mm0, MMREG_CLIP
487 leal 16(SAMPLES), SAMPLES
491 pshufw $0xee, MMREG_CLIP, %mm0
492 paddw MMREG_CLIP, %mm0
493 pshufw $0x55, %mm0, %mm1