2 synth_stereo_x86_64_s32: SSE optimized synth for x86-64 (stereo specific, s32 output version)
4 copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
5 see COPYING and AUTHORS files in distribution or http://mpg123.org
6 initially written by Taihei Monma
/*
	Memory operands shared by the sample conversion code.

	XMMREG_SCALE, XMMREG_MAX and XMMREG_MIN each name a 16-byte vector of
	four floats that is read through a pointer register, so they can be used
	directly as the source operand of mulps/cmpnleps/cmpltps. The function
	points %r9 at scale_s32 and %r10 at maxmin_s32 before the synthesis code
	runs.

	TEMP_CLIP is the 16-byte slot at the very top of the stack frame. It is
	accessed with movaps, so %rsp has to be 16-byte aligned whenever it is
	used. The function keeps a running accumulator there that is derived
	from the clip comparison results (paddw TEMP_CLIP / movaps back to
	TEMP_CLIP after each output batch) and reads it back once at the end.
*/
31 #define XMMREG_SCALE (%r9) /* {65536.0, 65536.0, 65536.0, 65536.0} */
32 #define XMMREG_MAX (%r10) /* {32767.999, 32767.999, 32767.999, 32767.999} */
33 #define XMMREG_MIN (%r11) /* {-32768.0, -32768.0, -32768.0, -32768.0} */
34 #define TEMP_CLIP (%rsp)
37 int synth_1to1_s32_stereo_x86_64_asm(real *window, real *b0l, real *b0r, int32_t *samples, int bo1);
38 return value: number of clipped samples
64 .globl ASM_NAME(synth_1to1_s32_stereo_x86_64_asm)
65 ASM_NAME(synth_1to1_s32_stereo_x86_64_asm):
66 #ifdef _WIN64 /* should save xmm6-15 */
67 movl 40(%rsp), %eax /* 5th argument; placed after 32-byte shadow space */
70 subq $184, %rsp /* stack alignment + 10 xmm registers + temp */
71 movaps %xmm6, 16(%rsp)
72 movaps %xmm7, 32(%rsp)
73 movaps %xmm8, 48(%rsp)
74 movaps %xmm9, 64(%rsp)
75 movaps %xmm10, 80(%rsp)
76 movaps %xmm11, 96(%rsp)
77 movaps %xmm12, 112(%rsp)
78 movaps %xmm13, 128(%rsp)
79 movaps %xmm14, 144(%rsp)
80 movaps %xmm15, 160(%rsp)
82 subq $24, %rsp /* stack alignment + temp */
96 leaq 64(WINDOW), WINDOW
99 leaq ASM_NAME(scale_s32)(%rip), %r9
100 leaq ASM_NAME(maxmin_s32)(%rip), %r10
103 movaps %xmm0, TEMP_CLIP
109 movups (WINDOW), %xmm8
110 movups 16(WINDOW), %xmm1
111 movups 32(WINDOW), %xmm2
112 movups 48(WINDOW), %xmm3
113 movups 128(WINDOW), %xmm9
114 movups 144(WINDOW), %xmm5
115 movups 160(WINDOW), %xmm6
116 movups 176(WINDOW), %xmm7
132 mulps 112(B0L), %xmm7
135 mulps 32(B0R), %xmm10
136 mulps 48(B0R), %xmm11
137 mulps 64(B0R), %xmm12
138 mulps 80(B0R), %xmm13
139 mulps 96(B0R), %xmm14
140 mulps 112(B0R), %xmm15
155 movaps %xmm14, %xmm13
156 leaq 256(WINDOW), WINDOW
160 movups (WINDOW), %xmm10
161 movups 16(WINDOW), %xmm1
162 movups 32(WINDOW), %xmm2
163 movups 48(WINDOW), %xmm3
164 movups 128(WINDOW), %xmm11
165 movups 144(WINDOW), %xmm5
166 movups 160(WINDOW), %xmm6
167 movups 176(WINDOW), %xmm7
178 mulps 32(B0R), %xmm14
179 mulps 48(B0R), %xmm15
188 mulps 64(B0L), %xmm11
191 mulps 112(B0L), %xmm7
195 mulps 112(B0R), %xmm15
206 leaq 256(WINDOW), WINDOW
214 unpcklps %xmm9, %xmm8
215 unpcklps %xmm11, %xmm10
216 unpckhps %xmm9, %xmm0
217 unpckhps %xmm11, %xmm1
218 unpcklps %xmm13, %xmm12
219 unpcklps %xmm15, %xmm14
220 unpckhps %xmm13, %xmm4
221 unpckhps %xmm15, %xmm5
226 movlhps %xmm10, %xmm8
227 movhlps %xmm2, %xmm10
230 movlhps %xmm14, %xmm12
231 movhlps %xmm6, %xmm14
245 mulps XMMREG_SCALE, %xmm0
246 mulps XMMREG_SCALE, %xmm4
247 cmpnleps XMMREG_MAX, %xmm2
248 cmpltps XMMREG_MIN, %xmm3
249 cmpnleps XMMREG_MAX, %xmm5
250 cmpltps XMMREG_MIN, %xmm6
251 cvtps2dq %xmm0, %xmm0
252 cvtps2dq %xmm4, %xmm4
256 unpcklps %xmm4, %xmm0
257 unpckhps %xmm4, %xmm1
258 movups %xmm0, (SAMPLES)
259 movups %xmm1, 16(SAMPLES)
261 packssdw %xmm5, %xmm2
262 packssdw %xmm6, %xmm3
266 paddw TEMP_CLIP, %xmm2
267 movaps %xmm2, TEMP_CLIP
269 leaq 32(SAMPLES), SAMPLES
277 movups (WINDOW), %xmm8
278 movups 16(WINDOW), %xmm1
279 movups 32(WINDOW), %xmm2
280 movups 48(WINDOW), %xmm3
281 movups 128(WINDOW), %xmm9
282 movups 144(WINDOW), %xmm5
283 movups 160(WINDOW), %xmm6
284 movups 176(WINDOW), %xmm7
297 mulps -64(B0L), %xmm9
298 mulps -48(B0L), %xmm5
299 mulps -32(B0L), %xmm6
300 mulps -16(B0L), %xmm7
303 mulps 32(B0R), %xmm10
304 mulps 48(B0R), %xmm11
305 mulps -64(B0R), %xmm12
306 mulps -48(B0R), %xmm13
307 mulps -32(B0R), %xmm14
308 mulps -16(B0R), %xmm15
323 movaps %xmm14, %xmm13
324 leaq 256(WINDOW), WINDOW
328 movups (WINDOW), %xmm10
329 movups 16(WINDOW), %xmm1
330 movups 32(WINDOW), %xmm2
331 movups 48(WINDOW), %xmm3
332 movups 128(WINDOW), %xmm11
333 movups 144(WINDOW), %xmm5
334 movups 160(WINDOW), %xmm6
335 movups 176(WINDOW), %xmm7
346 mulps 32(B0R), %xmm14
347 mulps 48(B0R), %xmm15
356 mulps -64(B0L), %xmm11
357 mulps -48(B0L), %xmm5
358 mulps -32(B0L), %xmm6
359 mulps -16(B0L), %xmm7
360 mulps -64(B0R), %xmm1
361 mulps -48(B0R), %xmm2
362 mulps -32(B0R), %xmm4
363 mulps -16(B0R), %xmm15
374 leaq 256(WINDOW), WINDOW
382 unpcklps %xmm9, %xmm8
383 unpcklps %xmm11, %xmm10
384 unpckhps %xmm9, %xmm0
385 unpckhps %xmm11, %xmm1
386 unpcklps %xmm13, %xmm12
387 unpcklps %xmm15, %xmm14
388 unpckhps %xmm13, %xmm4
389 unpckhps %xmm15, %xmm5
394 movlhps %xmm10, %xmm8
395 movhlps %xmm2, %xmm10
398 movlhps %xmm14, %xmm12
399 movhlps %xmm6, %xmm14
413 mulps XMMREG_SCALE, %xmm0
414 mulps XMMREG_SCALE, %xmm4
415 cmpnleps XMMREG_MAX, %xmm2
416 cmpltps XMMREG_MIN, %xmm3
417 cmpnleps XMMREG_MAX, %xmm5
418 cmpltps XMMREG_MIN, %xmm6
419 cvtps2dq %xmm0, %xmm0
420 cvtps2dq %xmm4, %xmm4
424 unpcklps %xmm4, %xmm0
425 unpckhps %xmm4, %xmm1
426 movups %xmm0, (SAMPLES)
427 movups %xmm1, 16(SAMPLES)
429 packssdw %xmm5, %xmm2
430 packssdw %xmm6, %xmm3
434 paddw TEMP_CLIP, %xmm2
435 movaps %xmm2, TEMP_CLIP
437 leaq 32(SAMPLES), SAMPLES
441 movaps TEMP_CLIP, %xmm4
444 pshuflw $0x55, %xmm0, %xmm1
445 pshuflw $0xaa, %xmm0, %xmm2
446 pshuflw $0xff, %xmm0, %xmm3
/*
	Win64 epilogue: reload the callee-saved XMM registers that the prologue
	spilled to the stack frame.

	NOTE(review): the prologue stores %xmm6..%xmm15 at 16(%rsp)..160(%rsp),
	leaving (%rsp) for TEMP_CLIP, but the reloads below use offsets that are
	16 bytes lower: %xmm7 is read from 16(%rsp), which is the slot %xmm6 was
	saved to, and so on up to %xmm15 from 144(%rsp). Unless %rsp is adjusted
	between the last TEMP_CLIP access and this point, each register here
	receives its neighbour's saved value instead of its own. Please confirm,
	and if so shift every offset by +16 so the reloads mirror the saves
	(and check that the stack release matches the 184 bytes reserved).
*/
456 movaps 16(%rsp), %xmm7
457 movaps 32(%rsp), %xmm8
458 movaps 48(%rsp), %xmm9
459 movaps 64(%rsp), %xmm10
460 movaps 80(%rsp), %xmm11
461 movaps 96(%rsp), %xmm12
462 movaps 112(%rsp), %xmm13
463 movaps 128(%rsp), %xmm14
464 movaps 144(%rsp), %xmm15