Sync winemp3.acm with Wine HEAD. This one uses libmpg123 which was added in Version...
[reactos.git] / reactos / lib / 3rdparty / libmpg123 / synth_stereo_x86_64_s32.S
1 /*
2 synth_stereo_x86_64_s32: SSE optimized synth for x86-64 (stereo specific, s32 output version)
3
4 copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
5 see COPYING and AUTHORS files in distribution or http://mpg123.org
6 initially written by Taihei Monma
7 */
8
9 #include "mangle.h"
10
/*
	Working registers for the four pointer arguments, as used by the loop bodies
	once the prologue has moved the incoming arguments into place.
*/
#ifndef _WIN64
/* System V AMD64: window, b0l and b0r stay in their argument registers;
   samples is copied from %rcx into %r8 by the prologue. */
/* real *window; */
#define WINDOW %rdi
/* real *b0l; */
#define B0L %rsi
/* real *b0r; */
#define B0R %rdx
/* int32_t *samples; */
#define SAMPLES %r8
#else
/* Win64: b0l and b0r stay in their argument registers; window and samples are
   copied from %rcx and %r9 into %rsi and %rdi by the prologue. */
/* real *window; */
#define WINDOW %rsi
/* real *b0l; */
#define B0L %rdx
/* real *b0r; */
#define B0R %r8
/* int32_t *samples; */
#define SAMPLES %rdi
#endif

/* Memory operands for the constant vectors; %r9, %r10 and %r11 are pointed at
   scale_s32, maxmin_s32 and maxmin_s32+16 before the first loop. */
#define XMMREG_SCALE (%r9) /* {65536.0, 65536.0, 65536.0, 65536.0} */
#define XMMREG_MAX (%r10) /* {32767.998, ...}: largest float below 32768.0 */
#define XMMREG_MIN (%r11) /* {-32768.0, -32768.0, -32768.0, -32768.0} */
/* 16-byte scratch slot at the bottom of the frame: eight 16-bit clip counters. */
#define TEMP_CLIP (%rsp)
35
36 /*
37 int synth_1to1_s32_stereo_x86_64_asm(real *window, real *b0l, real *b0r, int32_t *samples, int bo1);
38 return value: number of clipped samples
39 */
40
/*
	Read-only constant vectors, stored as IEEE-754 single-precision bit patterns.
*/
#ifdef __APPLE__
	.data
#else
	.section	.rodata
#endif
	ALIGN32
/* Scale factor applied to every sample before conversion: 4 x 65536.0f. */
ASM_NAME(scale_s32):
	.long	0x47800000, 0x47800000, 0x47800000, 0x47800000
	ALIGN16
/* Clip thresholds compared against the unscaled sums:
   first 16 bytes = 4 x 32767.998046875f (upper bound),
   next 16 bytes  = 4 x -32768.0f (lower bound). */
ASM_NAME(maxmin_s32):
	.long	0x46ffffff, 0x46ffffff, 0x46ffffff, 0x46ffffff
	.long	0xc7000000, 0xc7000000, 0xc7000000, 0xc7000000
/*
	int synth_1to1_s32_stereo_x86_64_asm(real *window, real *b0l, real *b0r, int32_t *samples, int bo1);
	return value: number of clipped samples (low 16 bits of the accumulated count)

	ABI: Microsoft x64 when _WIN64 is defined, System V AMD64 otherwise.

	Work done per call:
	  - window is advanced to window + 16 - bo1 (in 4-byte units);
	  - Loop_start_1 runs 4 times, walking b0l/b0r forwards;
	  - Loop_start_2 runs 4 times, walking b0l/b0r backwards;
	  - every iteration writes 32 bytes to samples: four (b0l, b0r) pairs of
	    32-bit integers, the b0l-derived value first in each pair.

	The b0l/b0r rows are used as mulps memory operands, which the instruction
	requires to be 16-byte aligned; window is read with movups and samples is
	written with movups.

	Stack frame (Win64): 184 bytes below the two pushed registers
	    0(%rsp)          TEMP_CLIP
	   16(%rsp)..175     saved xmm6..xmm15 (callee-saved on Win64)
	  176(%rsp)..183     padding for 16-byte alignment
	Stack frame (SysV): 24 bytes, TEMP_CLIP at 0(%rsp).
*/
	.text
	ALIGN16
.globl ASM_NAME(synth_1to1_s32_stereo_x86_64_asm)
ASM_NAME(synth_1to1_s32_stereo_x86_64_asm):
#ifdef _WIN64 /* should save xmm6-15 */
	movl		40(%rsp), %eax /* 5th argument; placed after 32-byte shadow space */
	pushq		%rsi
	pushq		%rdi
	subq		$184, %rsp /* stack alignment + 10 xmm registers + temp */
	movaps		%xmm6, 16(%rsp)
	movaps		%xmm7, 32(%rsp)
	movaps		%xmm8, 48(%rsp)
	movaps		%xmm9, 64(%rsp)
	movaps		%xmm10, 80(%rsp)
	movaps		%xmm11, 96(%rsp)
	movaps		%xmm12, 112(%rsp)
	movaps		%xmm13, 128(%rsp)
	movaps		%xmm14, 144(%rsp)
	movaps		%xmm15, 160(%rsp)
#else
	subq		$24, %rsp /* stack alignment + temp */
#endif

	/* %rax = (unsigned 32-bit bo1) * 4: shift up by 32 drops the stale upper
	   half, shift down by 30 leaves the value multiplied by four. */
#ifdef _WIN64
	shlq		$32, %rax
	shrq		$30, %rax
	movq		%rcx, %rsi /* window  -> WINDOW */
	movq		%r9, %rdi /* samples -> SAMPLES */
#else
	movq		%r8, %rax
	shlq		$32, %rax
	shrq		$30, %rax
	movq		%rcx, %r8 /* samples -> SAMPLES */
#endif
	/* WINDOW = window + 16 - bo1 (float units) */
	leaq		64(WINDOW), WINDOW
	subq		%rax, WINDOW

	leaq		ASM_NAME(scale_s32)(%rip), %r9
	leaq		ASM_NAME(maxmin_s32)(%rip), %r10
	leaq		16(%r10), %r11
	/* clear the eight 16-bit clip counters */
	xorps		%xmm0, %xmm0
	movaps		%xmm0, TEMP_CLIP

	movl		$4, %ecx

	ALIGN16
Loop_start_1:
	/* rows 0 and 1: window rows at +0 and +128, b0 rows at +0 and +64;
	   xmm8/xmm9 collect the b0l products, xmm12/xmm13 the b0r products */
	movups		(WINDOW), %xmm8
	movups		16(WINDOW), %xmm1
	movups		32(WINDOW), %xmm2
	movups		48(WINDOW), %xmm3
	movups		128(WINDOW), %xmm9
	movups		144(WINDOW), %xmm5
	movups		160(WINDOW), %xmm6
	movups		176(WINDOW), %xmm7
	movaps		%xmm8, %xmm0
	movaps		%xmm1, %xmm4
	movaps		%xmm2, %xmm10
	movaps		%xmm3, %xmm11
	movaps		%xmm9, %xmm12
	movaps		%xmm5, %xmm13
	movaps		%xmm6, %xmm14
	movaps		%xmm7, %xmm15
	mulps		(B0L), %xmm8
	mulps		16(B0L), %xmm1
	mulps		32(B0L), %xmm2
	mulps		48(B0L), %xmm3
	mulps		64(B0L), %xmm9
	mulps		80(B0L), %xmm5
	mulps		96(B0L), %xmm6
	mulps		112(B0L), %xmm7
	mulps		(B0R), %xmm0
	mulps		16(B0R), %xmm4
	mulps		32(B0R), %xmm10
	mulps		48(B0R), %xmm11
	mulps		64(B0R), %xmm12
	mulps		80(B0R), %xmm13
	mulps		96(B0R), %xmm14
	mulps		112(B0R), %xmm15

	addps		%xmm1, %xmm8
	addps		%xmm2, %xmm3
	addps		%xmm4, %xmm0
	addps		%xmm11, %xmm10
	addps		%xmm5, %xmm9
	addps		%xmm7, %xmm6
	addps		%xmm13, %xmm12
	addps		%xmm15, %xmm14
	addps		%xmm3, %xmm8
	addps		%xmm6, %xmm9
	addps		%xmm10, %xmm0
	addps		%xmm12, %xmm14
	movaps		%xmm0, %xmm12
	movaps		%xmm14, %xmm13
	leaq		256(WINDOW), WINDOW
	leaq		128(B0L), B0L
	leaq		128(B0R), B0R

	/* rows 2 and 3: xmm10/xmm11 collect the b0l products,
	   xmm14/xmm15 the b0r products */
	movups		(WINDOW), %xmm10
	movups		16(WINDOW), %xmm1
	movups		32(WINDOW), %xmm2
	movups		48(WINDOW), %xmm3
	movups		128(WINDOW), %xmm11
	movups		144(WINDOW), %xmm5
	movups		160(WINDOW), %xmm6
	movups		176(WINDOW), %xmm7
	movaps		%xmm10, %xmm0
	movaps		%xmm1, %xmm4
	movaps		%xmm2, %xmm14
	movaps		%xmm3, %xmm15
	mulps		(B0L), %xmm10
	mulps		16(B0L), %xmm1
	mulps		32(B0L), %xmm2
	mulps		48(B0L), %xmm3
	mulps		(B0R), %xmm0
	mulps		16(B0R), %xmm4
	mulps		32(B0R), %xmm14
	mulps		48(B0R), %xmm15
	addps		%xmm1, %xmm10
	addps		%xmm2, %xmm3
	addps		%xmm4, %xmm0
	addps		%xmm15, %xmm14
	movaps		%xmm11, %xmm1
	movaps		%xmm5, %xmm2
	movaps		%xmm6, %xmm4
	movaps		%xmm7, %xmm15
	mulps		64(B0L), %xmm11
	mulps		80(B0L), %xmm5
	mulps		96(B0L), %xmm6
	mulps		112(B0L), %xmm7
	mulps		64(B0R), %xmm1
	mulps		80(B0R), %xmm2
	mulps		96(B0R), %xmm4
	mulps		112(B0R), %xmm15
	addps		%xmm5, %xmm11
	addps		%xmm7, %xmm6
	addps		%xmm2, %xmm1
	addps		%xmm15, %xmm4

	addps		%xmm3, %xmm10
	addps		%xmm6, %xmm11
	addps		%xmm0, %xmm14
	addps		%xmm4, %xmm1
	movaps		%xmm1, %xmm15
	leaq		256(WINDOW), WINDOW
	leaq		128(B0L), B0L
	leaq		128(B0R), B0R

	/* 4x4 transpose of the per-row partial sums (xmm8..11 for b0l,
	   xmm12..15 for b0r) so each register holds one column of all four rows */
	movaps		%xmm8, %xmm0
	movaps		%xmm10, %xmm1
	movaps		%xmm12, %xmm4
	movaps		%xmm14, %xmm5
	unpcklps	%xmm9, %xmm8
	unpcklps	%xmm11, %xmm10
	unpckhps	%xmm9, %xmm0
	unpckhps	%xmm11, %xmm1
	unpcklps	%xmm13, %xmm12
	unpcklps	%xmm15, %xmm14
	unpckhps	%xmm13, %xmm4
	unpckhps	%xmm15, %xmm5
	movaps		%xmm8, %xmm2
	movaps		%xmm0, %xmm3
	movaps		%xmm12, %xmm6
	movaps		%xmm4, %xmm7
	movlhps		%xmm10, %xmm8
	movhlps		%xmm2, %xmm10
	movlhps		%xmm1, %xmm0
	movhlps		%xmm3, %xmm1
	movlhps		%xmm14, %xmm12
	movhlps		%xmm6, %xmm14
	movlhps		%xmm5, %xmm4
	movhlps		%xmm7, %xmm5
	/* first loop: columns combine with alternating sign, c0 - c1 + c2 - c3 */
	subps		%xmm10, %xmm8
	subps		%xmm1, %xmm0
	subps		%xmm14, %xmm12
	subps		%xmm5, %xmm4
	addps		%xmm8, %xmm0
	addps		%xmm12, %xmm4

	/* xmm0 = four b0l sums, xmm4 = four b0r sums.
	   xmm2/xmm5 become "above max" masks, xmm3/xmm6 "below min" masks,
	   all taken from the unscaled values. */
	movaps		%xmm0, %xmm2
	movaps		%xmm0, %xmm3
	movaps		%xmm4, %xmm5
	movaps		%xmm4, %xmm6
	mulps		XMMREG_SCALE, %xmm0
	mulps		XMMREG_SCALE, %xmm4
	cmpnleps	XMMREG_MAX, %xmm2
	cmpltps		XMMREG_MIN, %xmm3
	cmpnleps	XMMREG_MAX, %xmm5
	cmpltps		XMMREG_MIN, %xmm6
	cvtps2dq	%xmm0, %xmm0
	cvtps2dq	%xmm4, %xmm4
	/* cvtps2dq yields 0x80000000 for out-of-range input; flipping all bits in
	   the lanes that were above max turns that into 0x7fffffff */
	xorps		%xmm2, %xmm0
	xorps		%xmm5, %xmm4
	/* interleave the two channels and store four pairs */
	movaps		%xmm0, %xmm1
	unpcklps	%xmm4, %xmm0
	unpckhps	%xmm4, %xmm1
	movups		%xmm0, (SAMPLES)
	movups		%xmm1, 16(SAMPLES)

	/* turn the masks into 0/1 words and add them to the running clip counters */
	packssdw	%xmm5, %xmm2
	packssdw	%xmm6, %xmm3
	psrlw		$15, %xmm2
	psrlw		$15, %xmm3
	paddw		%xmm3, %xmm2
	paddw		TEMP_CLIP, %xmm2
	movaps		%xmm2, TEMP_CLIP

	leaq		32(SAMPLES), SAMPLES
	decl		%ecx
	jnz		Loop_start_1

	movl		$4, %ecx

	ALIGN16
Loop_start_2:
	/* rows 0 and 1: window still walks forwards, the b0 rows are now taken
	   at +0 and -64 and the b0 pointers step backwards */
	movups		(WINDOW), %xmm8
	movups		16(WINDOW), %xmm1
	movups		32(WINDOW), %xmm2
	movups		48(WINDOW), %xmm3
	movups		128(WINDOW), %xmm9
	movups		144(WINDOW), %xmm5
	movups		160(WINDOW), %xmm6
	movups		176(WINDOW), %xmm7
	movaps		%xmm8, %xmm0
	movaps		%xmm1, %xmm4
	movaps		%xmm2, %xmm10
	movaps		%xmm3, %xmm11
	movaps		%xmm9, %xmm12
	movaps		%xmm5, %xmm13
	movaps		%xmm6, %xmm14
	movaps		%xmm7, %xmm15
	mulps		(B0L), %xmm8
	mulps		16(B0L), %xmm1
	mulps		32(B0L), %xmm2
	mulps		48(B0L), %xmm3
	mulps		-64(B0L), %xmm9
	mulps		-48(B0L), %xmm5
	mulps		-32(B0L), %xmm6
	mulps		-16(B0L), %xmm7
	mulps		(B0R), %xmm0
	mulps		16(B0R), %xmm4
	mulps		32(B0R), %xmm10
	mulps		48(B0R), %xmm11
	mulps		-64(B0R), %xmm12
	mulps		-48(B0R), %xmm13
	mulps		-32(B0R), %xmm14
	mulps		-16(B0R), %xmm15

	addps		%xmm1, %xmm8
	addps		%xmm2, %xmm3
	addps		%xmm4, %xmm0
	addps		%xmm11, %xmm10
	addps		%xmm5, %xmm9
	addps		%xmm7, %xmm6
	addps		%xmm13, %xmm12
	addps		%xmm15, %xmm14
	addps		%xmm3, %xmm8
	addps		%xmm6, %xmm9
	addps		%xmm10, %xmm0
	addps		%xmm12, %xmm14
	movaps		%xmm0, %xmm12
	movaps		%xmm14, %xmm13
	leaq		256(WINDOW), WINDOW
	leaq		-128(B0L), B0L
	leaq		-128(B0R), B0R

	/* rows 2 and 3 */
	movups		(WINDOW), %xmm10
	movups		16(WINDOW), %xmm1
	movups		32(WINDOW), %xmm2
	movups		48(WINDOW), %xmm3
	movups		128(WINDOW), %xmm11
	movups		144(WINDOW), %xmm5
	movups		160(WINDOW), %xmm6
	movups		176(WINDOW), %xmm7
	movaps		%xmm10, %xmm0
	movaps		%xmm1, %xmm4
	movaps		%xmm2, %xmm14
	movaps		%xmm3, %xmm15
	mulps		(B0L), %xmm10
	mulps		16(B0L), %xmm1
	mulps		32(B0L), %xmm2
	mulps		48(B0L), %xmm3
	mulps		(B0R), %xmm0
	mulps		16(B0R), %xmm4
	mulps		32(B0R), %xmm14
	mulps		48(B0R), %xmm15
	addps		%xmm1, %xmm10
	addps		%xmm2, %xmm3
	addps		%xmm4, %xmm0
	addps		%xmm15, %xmm14
	movaps		%xmm11, %xmm1
	movaps		%xmm5, %xmm2
	movaps		%xmm6, %xmm4
	movaps		%xmm7, %xmm15
	mulps		-64(B0L), %xmm11
	mulps		-48(B0L), %xmm5
	mulps		-32(B0L), %xmm6
	mulps		-16(B0L), %xmm7
	mulps		-64(B0R), %xmm1
	mulps		-48(B0R), %xmm2
	mulps		-32(B0R), %xmm4
	mulps		-16(B0R), %xmm15
	addps		%xmm5, %xmm11
	addps		%xmm7, %xmm6
	addps		%xmm2, %xmm1
	addps		%xmm15, %xmm4

	addps		%xmm3, %xmm10
	addps		%xmm6, %xmm11
	addps		%xmm0, %xmm14
	addps		%xmm4, %xmm1
	movaps		%xmm1, %xmm15
	leaq		256(WINDOW), WINDOW
	leaq		-128(B0L), B0L
	leaq		-128(B0R), B0R

	/* 4x4 transpose, as in the first loop */
	movaps		%xmm8, %xmm0
	movaps		%xmm10, %xmm1
	movaps		%xmm12, %xmm4
	movaps		%xmm14, %xmm5
	unpcklps	%xmm9, %xmm8
	unpcklps	%xmm11, %xmm10
	unpckhps	%xmm9, %xmm0
	unpckhps	%xmm11, %xmm1
	unpcklps	%xmm13, %xmm12
	unpcklps	%xmm15, %xmm14
	unpckhps	%xmm13, %xmm4
	unpckhps	%xmm15, %xmm5
	movaps		%xmm8, %xmm2
	movaps		%xmm0, %xmm3
	movaps		%xmm12, %xmm6
	movaps		%xmm4, %xmm7
	movlhps		%xmm10, %xmm8
	movhlps		%xmm2, %xmm10
	movlhps		%xmm1, %xmm0
	movhlps		%xmm3, %xmm1
	movlhps		%xmm14, %xmm12
	movhlps		%xmm6, %xmm14
	movlhps		%xmm5, %xmm4
	movhlps		%xmm7, %xmm5
	/* second loop: all four columns are added, c0 + c1 + c2 + c3 */
	addps		%xmm10, %xmm8
	addps		%xmm1, %xmm0
	addps		%xmm14, %xmm12
	addps		%xmm5, %xmm4
	addps		%xmm8, %xmm0
	addps		%xmm12, %xmm4

	/* scale, clip, convert and store: identical to the first loop */
	movaps		%xmm0, %xmm2
	movaps		%xmm0, %xmm3
	movaps		%xmm4, %xmm5
	movaps		%xmm4, %xmm6
	mulps		XMMREG_SCALE, %xmm0
	mulps		XMMREG_SCALE, %xmm4
	cmpnleps	XMMREG_MAX, %xmm2
	cmpltps		XMMREG_MIN, %xmm3
	cmpnleps	XMMREG_MAX, %xmm5
	cmpltps		XMMREG_MIN, %xmm6
	cvtps2dq	%xmm0, %xmm0
	cvtps2dq	%xmm4, %xmm4
	xorps		%xmm2, %xmm0
	xorps		%xmm5, %xmm4
	movaps		%xmm0, %xmm1
	unpcklps	%xmm4, %xmm0
	unpckhps	%xmm4, %xmm1
	movups		%xmm0, (SAMPLES)
	movups		%xmm1, 16(SAMPLES)

	packssdw	%xmm5, %xmm2
	packssdw	%xmm6, %xmm3
	psrlw		$15, %xmm2
	psrlw		$15, %xmm3
	paddw		%xmm3, %xmm2
	paddw		TEMP_CLIP, %xmm2
	movaps		%xmm2, TEMP_CLIP

	leaq		32(SAMPLES), SAMPLES
	decl		%ecx
	jnz		Loop_start_2

	/* horizontal sum of the eight 16-bit clip counters: fold the upper four
	   words onto the lower four, then add words 1..3 into word 0 */
	movaps		TEMP_CLIP, %xmm4
	movhlps		%xmm4, %xmm0
	paddw		%xmm4, %xmm0
	pshuflw		$0x55, %xmm0, %xmm1
	pshuflw		$0xaa, %xmm0, %xmm2
	pshuflw		$0xff, %xmm0, %xmm3
	paddw		%xmm1, %xmm0
	paddw		%xmm2, %xmm0
	paddw		%xmm3, %xmm0

	movd		%xmm0, %eax
	andl		$0xffff, %eax /* only word 0 holds the total */

#ifdef _WIN64
	/* Restore xmm6-15 from the slots the prologue stored them in
	   (16..160); offset 0 is TEMP_CLIP, not a save slot. */
	movaps		16(%rsp), %xmm6
	movaps		32(%rsp), %xmm7
	movaps		48(%rsp), %xmm8
	movaps		64(%rsp), %xmm9
	movaps		80(%rsp), %xmm10
	movaps		96(%rsp), %xmm11
	movaps		112(%rsp), %xmm12
	movaps		128(%rsp), %xmm13
	movaps		144(%rsp), %xmm14
	movaps		160(%rsp), %xmm15
	addq		$184, %rsp
	popq		%rdi
	popq		%rsi
#else
	addq		$24, %rsp
#endif
	ret

NONEXEC_STACK