2 synth_x86_64_s32: SSE optimized synth for x86-64 (s32 output version)
4 copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
5 see COPYING and AUTHORS files in distribution or http://mpg123.org
6 initially written by Taihei Monma
31 #define XMMREG_SCALE %xmm15 /* {65536.0, 65536.0, 65536.0, 65536.0}; loaded from scale_s32, multiplied into the samples before cvtps2dq */
32 #define XMMREG_MAX %xmm14 /* {32767.999, 32767.999, 32767.999, 32767.999}; loaded from maxmin_s32, operand of the cmpnleps clip test */
33 #define XMMREG_MIN %xmm13 /* {-32768.0, -32768.0, -32768.0, -32768.0}; loaded from maxmin_s32 + 16, operand of the cmpltps clip test */
34 #define XMMREG_CLIP %xmm12 /* clip accumulator: cleared with xorps on entry, updated with paddd per output group, then summed across lanes and moved to %eax (the documented clipped-sample count) */
37 int synth_1to1_s32_x86_64_asm(real *window, real *b0, int32_t *samples, int bo1);
38 return value: number of clipped samples
64 .globl ASM_NAME(synth_1to1_s32_x86_64_asm)
65 ASM_NAME(synth_1to1_s32_x86_64_asm):
66 #ifdef _WIN64 /* should save xmm6-15 */
68 subq $168, %rsp /* stack alignment + 10 xmm registers */
70 movaps %xmm7, 16(%rsp)
71 movaps %xmm8, 32(%rsp)
72 movaps %xmm9, 48(%rsp)
73 movaps %xmm10, 64(%rsp)
74 movaps %xmm11, 80(%rsp)
75 movaps %xmm12, 96(%rsp)
76 movaps %xmm13, 112(%rsp)
77 movaps %xmm14, 128(%rsp)
78 movaps %xmm15, 144(%rsp)
81 leaq ASM_NAME(scale_s32)(%rip), %rax
82 movaps (%rax), XMMREG_SCALE
83 leaq ASM_NAME(maxmin_s32)(%rip), %rax
84 movaps (%rax), XMMREG_MAX
85 movaps 16(%rax), XMMREG_MIN
87 xorps XMMREG_CLIP, XMMREG_CLIP
99 movups 16(ARG0), %xmm1
100 movups 32(ARG0), %xmm2
101 movups 48(ARG0), %xmm3
102 movups 128(ARG0), %xmm9
103 movups 144(ARG0), %xmm5
104 movups 160(ARG0), %xmm6
105 movups 176(ARG0), %xmm7
107 mulps 16(ARG1), %xmm1
108 mulps 32(ARG1), %xmm2
109 mulps 48(ARG1), %xmm3
110 mulps 64(ARG1), %xmm9
111 mulps 80(ARG1), %xmm5
112 mulps 96(ARG1), %xmm6
113 mulps 112(ARG1), %xmm7
124 movups (ARG0), %xmm10
125 movups 16(ARG0), %xmm1
126 movups 32(ARG0), %xmm2
127 movups 48(ARG0), %xmm3
128 movups 128(ARG0), %xmm11
129 movups 144(ARG0), %xmm5
130 movups 160(ARG0), %xmm6
131 movups 176(ARG0), %xmm7
133 mulps 16(ARG1), %xmm1
134 mulps 32(ARG1), %xmm2
135 mulps 48(ARG1), %xmm3
136 mulps 64(ARG1), %xmm11
137 mulps 80(ARG1), %xmm5
138 mulps 96(ARG1), %xmm6
139 mulps 112(ARG1), %xmm7
152 unpcklps %xmm9, %xmm8
153 unpcklps %xmm11, %xmm10
154 unpckhps %xmm9, %xmm0
155 unpckhps %xmm11, %xmm1
158 movlhps %xmm10, %xmm8
159 movhlps %xmm2, %xmm10
167 movups 16(ARG2), %xmm2
170 mulps XMMREG_SCALE, %xmm0
171 cmpnleps XMMREG_MAX, %xmm3
172 cmpltps XMMREG_MIN, %xmm4
173 cvtps2dq %xmm0, %xmm0
175 shufps $0xdd, %xmm2, %xmm1
177 unpcklps %xmm1, %xmm0
178 unpckhps %xmm1, %xmm2
180 movups %xmm2, 16(ARG2)
185 paddd %xmm3, XMMREG_CLIP
196 movups 16(ARG0), %xmm1
197 movups 32(ARG0), %xmm2
198 movups 48(ARG0), %xmm3
199 movups 128(ARG0), %xmm9
200 movups 144(ARG0), %xmm5
201 movups 160(ARG0), %xmm6
202 movups 176(ARG0), %xmm7
204 mulps 16(ARG1), %xmm1
205 mulps 32(ARG1), %xmm2
206 mulps 48(ARG1), %xmm3
207 mulps -64(ARG1), %xmm9
208 mulps -48(ARG1), %xmm5
209 mulps -32(ARG1), %xmm6
210 mulps -16(ARG1), %xmm7
219 leaq -128(ARG1), ARG1
221 movups (ARG0), %xmm10
222 movups 16(ARG0), %xmm1
223 movups 32(ARG0), %xmm2
224 movups 48(ARG0), %xmm3
225 movups 128(ARG0), %xmm11
226 movups 144(ARG0), %xmm5
227 movups 160(ARG0), %xmm6
228 movups 176(ARG0), %xmm7
230 mulps 16(ARG1), %xmm1
231 mulps 32(ARG1), %xmm2
232 mulps 48(ARG1), %xmm3
233 mulps -64(ARG1), %xmm11
234 mulps -48(ARG1), %xmm5
235 mulps -32(ARG1), %xmm6
236 mulps -16(ARG1), %xmm7
245 leaq -128(ARG1), ARG1
249 unpcklps %xmm9, %xmm8
250 unpcklps %xmm11, %xmm10
251 unpckhps %xmm9, %xmm0
252 unpckhps %xmm11, %xmm1
255 movlhps %xmm10, %xmm8
256 movhlps %xmm2, %xmm10
264 movups 16(ARG2), %xmm2
267 mulps XMMREG_SCALE, %xmm0
268 cmpnleps XMMREG_MAX, %xmm3
269 cmpltps XMMREG_MIN, %xmm4
270 cvtps2dq %xmm0, %xmm0
272 shufps $0xdd, %xmm2, %xmm1
274 unpcklps %xmm1, %xmm0
275 unpckhps %xmm1, %xmm2
277 movups %xmm2, 16(ARG2)
282 paddd %xmm3, XMMREG_CLIP
288 pshuflw $0xee, XMMREG_CLIP, %xmm0
289 movhlps XMMREG_CLIP, %xmm1
290 pshuflw $0xee, %xmm1, %xmm2
291 paddd %xmm0, XMMREG_CLIP
292 paddd %xmm1, XMMREG_CLIP
293 paddd %xmm2, XMMREG_CLIP
295 movd XMMREG_CLIP, %eax
299 movaps 16(%rsp), %xmm7
300 movaps 32(%rsp), %xmm8
301 movaps 48(%rsp), %xmm9
302 movaps 64(%rsp), %xmm10
303 movaps 80(%rsp), %xmm11
304 movaps 96(%rsp), %xmm12
305 movaps 112(%rsp), %xmm13
306 movaps 128(%rsp), %xmm14
307 movaps 144(%rsp), %xmm15