2 synth_stereo_x86_64: SSE-optimized synth for x86-64 (stereo-specific version)
4 copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
5 see COPYING and AUTHORS files in distribution or http://mpg123.org
6 initially written by Taihei Monma
31 #define XMMREG_CLIP %xmm15 /* clip accumulator: cleared with pxor before the synth code runs, updated with paddw after each clip check, and folded (movhlps + paddw) at the end -- presumably into the returned clipped-sample count */
32 #define XMMREG_MAX %xmm14 /* {32767, 32767, 32767, 32767} : upper bound, loaded from the first 16 bytes of maxmin_x86_64; pcmpgtd flags 32-bit lanes greater than this */
33 #define XMMREG_MIN %xmm13 /* {-32769, -32769, -32769, -32769} : one below -32768, loaded from the second 16 bytes of maxmin_x86_64. SSE only provides a "greater than" compare (pcmpgtd), not "less than", so the code tests "> -32769" and inverts the resulting mask with XMMREG_FULL */
34 #define XMMREG_FULL %xmm12 /* {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF} : all bits set (built with pcmpeqd against itself); XORed into the "> XMMREG_MIN" mask to invert it */
37 int synth_1to1_stereo_x86_64_asm(short *window, short *b0l, short *b0r, short *samples, int bo1);
38 return value: number of clipped samples
47 ASM_NAME(maxmin_x86_64):
58 .globl ASM_NAME(synth_1to1_stereo_x86_64_asm)
59 ASM_NAME(synth_1to1_stereo_x86_64_asm):
60 #ifdef _WIN64 /* should save xmm6-15 */
61 movl 40(%rsp), %eax /* 5th argument; placed after 32-byte shadow space */
62 subq $168, %rsp /* stack alignment + 10 xmm registers */
64 movaps %xmm7, 16(%rsp)
65 movaps %xmm8, 32(%rsp)
66 movaps %xmm9, 48(%rsp)
67 movaps %xmm10, 64(%rsp)
68 movaps %xmm11, 80(%rsp)
69 movaps %xmm12, 96(%rsp)
70 movaps %xmm13, 112(%rsp)
71 movaps %xmm14, 128(%rsp)
72 movaps %xmm15, 144(%rsp)
85 leaq 32(WINDOW), WINDOW
88 leaq ASM_NAME(maxmin_x86_64)(%rip), %rax
89 movaps (%rax), XMMREG_MAX
90 movaps 16(%rax), XMMREG_MIN
91 pxor XMMREG_CLIP, XMMREG_CLIP
92 pcmpeqd XMMREG_FULL, XMMREG_FULL
98 movups (WINDOW), %xmm0
99 movups 16(WINDOW), %xmm1
100 movups 64(WINDOW), %xmm2
101 movups 80(WINDOW), %xmm3
102 movups 128(WINDOW), %xmm4
103 movups 144(WINDOW), %xmm5
104 movups 192(WINDOW), %xmm6
105 movups 208(WINDOW), %xmm7
111 pmaddwd 16(B0L), %xmm1
112 pmaddwd 32(B0L), %xmm2
113 pmaddwd 48(B0L), %xmm3
115 pmaddwd 16(B0R), %xmm9
116 pmaddwd 32(B0R), %xmm10
117 pmaddwd 48(B0R), %xmm11
126 pmaddwd 64(B0L), %xmm4
127 pmaddwd 80(B0L), %xmm5
128 pmaddwd 96(B0L), %xmm6
129 pmaddwd 112(B0L), %xmm7
130 pmaddwd 64(B0R), %xmm1
131 pmaddwd 80(B0R), %xmm9
132 pmaddwd 96(B0R), %xmm3
133 pmaddwd 112(B0R), %xmm11
143 punpckldq %xmm2, %xmm0
144 punpckldq %xmm6, %xmm4
145 punpckhdq %xmm2, %xmm1
146 punpckhdq %xmm6, %xmm3
147 punpckldq %xmm10, %xmm8
148 punpckldq %xmm11, %xmm9
149 punpckhdq %xmm10, %xmm5
150 punpckhdq %xmm11, %xmm7
160 movhlps %xmm10, %xmm9
162 movhlps %xmm11, %xmm7
176 punpckldq %xmm8, %xmm0
177 punpckhdq %xmm8, %xmm1
178 packssdw %xmm1, %xmm0
179 movups %xmm0, (SAMPLES)
181 pcmpgtd XMMREG_MAX, %xmm2
182 pcmpgtd XMMREG_MIN, %xmm3
183 pcmpgtd XMMREG_MAX, %xmm4
184 pcmpgtd XMMREG_MIN, %xmm8
185 packssdw %xmm4, %xmm2
186 packssdw %xmm8, %xmm3
187 pxor XMMREG_FULL, %xmm3
191 paddw %xmm2, XMMREG_CLIP
193 leaq 256(WINDOW), WINDOW
196 leaq 16(SAMPLES), SAMPLES
205 movups (WINDOW), %xmm0
206 movups 16(WINDOW), %xmm1
207 movups 64(WINDOW), %xmm2
208 movups 80(WINDOW), %xmm3
209 movups 128(WINDOW), %xmm4
210 movups 144(WINDOW), %xmm5
211 movups 192(WINDOW), %xmm6
212 movups 208(WINDOW), %xmm7
218 pmaddwd 16(B0L), %xmm1
219 pmaddwd -32(B0L), %xmm2
220 pmaddwd -16(B0L), %xmm3
222 pmaddwd 16(B0R), %xmm9
223 pmaddwd -32(B0R), %xmm10
224 pmaddwd -16(B0R), %xmm11
233 pmaddwd -64(B0L), %xmm4
234 pmaddwd -48(B0L), %xmm5
235 pmaddwd -96(B0L), %xmm6
236 pmaddwd -80(B0L), %xmm7
237 pmaddwd -64(B0R), %xmm1
238 pmaddwd -48(B0R), %xmm9
239 pmaddwd -96(B0R), %xmm3
240 pmaddwd -80(B0R), %xmm11
250 punpckldq %xmm2, %xmm0
251 punpckldq %xmm6, %xmm4
252 punpckhdq %xmm2, %xmm1
253 punpckhdq %xmm6, %xmm3
254 punpckldq %xmm10, %xmm8
255 punpckldq %xmm11, %xmm9
256 punpckhdq %xmm10, %xmm5
257 punpckhdq %xmm11, %xmm7
267 movhlps %xmm10, %xmm9
269 movhlps %xmm11, %xmm7
283 punpckldq %xmm8, %xmm0
284 punpckhdq %xmm8, %xmm1
285 packssdw %xmm1, %xmm0
286 movups %xmm0, (SAMPLES)
288 pcmpgtd XMMREG_MAX, %xmm2
289 pcmpgtd XMMREG_MIN, %xmm3
290 pcmpgtd XMMREG_MAX, %xmm4
291 pcmpgtd XMMREG_MIN, %xmm8
292 packssdw %xmm4, %xmm2
293 packssdw %xmm8, %xmm3
294 pxor XMMREG_FULL, %xmm3
298 paddw %xmm2, XMMREG_CLIP
300 leaq 256(WINDOW), WINDOW
303 leaq 16(SAMPLES), SAMPLES
308 movhlps XMMREG_CLIP, %xmm0
309 paddw XMMREG_CLIP, %xmm0
310 pshuflw $0x55, %xmm0, %xmm1
311 pshuflw $0xaa, %xmm0, %xmm2
312 pshuflw $0xff, %xmm0, %xmm3
322 movaps 16(%rsp), %xmm7
323 movaps 32(%rsp), %xmm8
324 movaps 48(%rsp), %xmm9
325 movaps 64(%rsp), %xmm10
326 movaps 80(%rsp), %xmm11
327 movaps 96(%rsp), %xmm12
328 movaps 112(%rsp), %xmm13
329 movaps 128(%rsp), %xmm14
330 movaps 144(%rsp), %xmm15