Sync with trunk (r48414)
[reactos.git] / lib / 3rdparty / libmpg123 / synth_stereo_sse_s32.S
1 /*
2 synth_stereo_sse_s32: SSE optimized synth (stereo specific, s32 output version)
3
4 copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
5 see COPYING and AUTHORS files in distribution or http://mpg123.org
6 initially written by Taihei Monma
7 */
8
9 #include "mangle.h"
10
11 /* real *window; window coefficient pointer, loaded from 8(%ebp) and then offset by 64 - 4*bo1 bytes in the prologue */
12 #define WINDOW %ebx
13 /* real *b0l; left-channel input pointer, loaded from 12(%ebp); used as an aligned mulps memory operand */
14 #define B0L %edx
15 /* real *b0r; right-channel input pointer, loaded from 16(%ebp); used as an aligned mulps memory operand */
16 #define B0R %esi
17 /* int32_t *samples; output pointer, loaded from 20(%ebp); advanced by 32 bytes per loop iteration */
18 #define SAMPLES %edi
19 /* TEMP(n): 16-byte stack scratch slot n. The 12 skips the three registers pushed after %esp was aligned to 16, so every slot is 16-byte aligned for movaps. */
20 #define TEMP(n) (12+16*n)(%esp)
21 #define MMREG_CLIP %mm7 /* four 16-bit running counts of clipped samples; folded into the return value at the end */
22
23 /*
24 int synth_1to1_s32_stereo_sse_asm(real *window, real *b0l, real *b0r, int32_t *samples, int bo1);
25 return value: number of clipped samples
26 */
27
28 #ifndef __APPLE__ /* constants live in .rodata, except on Apple targets where .data is used */
29 .section .rodata
30 #else
31 .data
32 #endif
33 ALIGN32
34 ASM_NAME(scale_s32): /* four packed floats; both channels are multiplied by this before conversion to int32 */
35 .long 1199570944 /* 65536.0 */
36 .long 1199570944
37 .long 1199570944
38 .long 1199570944
39 ALIGN16
40 ASM_NAME(maxmin_s32): /* clip thresholds for the unscaled sums: upper bound at +0, lower bound at +16 */
41 .long 1191182335 /* 32767.999 */
42 .long 1191182335
43 .long 1191182335
44 .long 1191182335
45 .long -956301312 /* -32768.0 */
46 .long -956301312
47 .long -956301312
48 .long -956301312
49 .text
50 ALIGN16
51 .globl ASM_NAME(synth_1to1_s32_stereo_sse_asm)
52 ASM_NAME(synth_1to1_s32_stereo_sse_asm): /* stack arguments: window 8(%ebp), b0l 12(%ebp), b0r 16(%ebp), samples 20(%ebp), bo1 24(%ebp); writes 32 interleaved L/R int32 pairs (256 bytes) to samples; returns the clip count in %eax */
53 pushl %ebp
54 movl %esp, %ebp /* %ebp keeps the incoming frame so the arguments stay reachable after realignment */
55 andl $-16, %esp /* align the stack to 16 bytes so the movaps accesses to TEMP() are legal */
56 subl $128, %esp /* 128 bytes of scratch = TEMP(0)..TEMP(7) */
57 pushl %ebx /* these three pushes (12 bytes) are the 12 in the TEMP() offset */
58 pushl %esi
59 pushl %edi
60
61 pxor MMREG_CLIP, MMREG_CLIP /* clip counters = 0 */
62
63 movl 8(%ebp), WINDOW
64 movl 12(%ebp), B0L
65 movl 16(%ebp), B0R
66 movl 20(%ebp), SAMPLES
67 movl 24(%ebp), %eax /* bo1 */
68 shll $2, %eax /* bo1 * 4 bytes */
69
70 leal 64(WINDOW), WINDOW
71 subl %eax, WINDOW /* WINDOW = window + 64 - 4*bo1 bytes */
72
73 movl $4, %ecx /* first half: 4 iterations, 4 stereo samples each */
74
75 ALIGN16
76 Loop_start_1: /* per iteration: four 16-tap dot products per channel, B0L/B0R moving forward */
77 movups (WINDOW), %xmm0 /* sample 0: load 16 window floats (unaligned loads) */
78 movups 16(WINDOW), %xmm1
79 movups 32(WINDOW), %xmm2
80 movups 48(WINDOW), %xmm3
81 movaps %xmm0, %xmm4 /* duplicate the window values for the right channel */
82 movaps %xmm1, %xmm5
83 movaps %xmm2, %xmm6
84 movaps %xmm3, %xmm7
85 mulps 0(B0L), %xmm0 /* left products; SSE memory operands must be 16-byte aligned */
86 mulps 16(B0L), %xmm1
87 mulps 32(B0L), %xmm2
88 mulps 48(B0L), %xmm3
89 mulps 0(B0R), %xmm4 /* right products */
90 mulps 16(B0R), %xmm5
91 mulps 32(B0R), %xmm6
92 mulps 48(B0R), %xmm7
93 addps %xmm1, %xmm0 /* reduce 16 products to 4 lane-wise partial sums per channel */
94 addps %xmm3, %xmm2
95 addps %xmm5, %xmm4
96 addps %xmm7, %xmm6
97 addps %xmm2, %xmm0
98 addps %xmm6, %xmm4
99 movaps %xmm0, TEMP(0) /* left partial sums, sample 0 */
100 movaps %xmm4, TEMP(4) /* right partial sums, sample 0 */
101
102 leal 128(WINDOW), WINDOW /* step to the next sample: window +128 bytes, inputs +64 bytes */
103 leal 64(B0L), B0L
104 leal 64(B0R), B0R
105 /* sample 1: same computation, results to TEMP(1) / TEMP(5) */
106 movups (WINDOW), %xmm0
107 movups 16(WINDOW), %xmm1
108 movups 32(WINDOW), %xmm2
109 movups 48(WINDOW), %xmm3
110 movaps %xmm0, %xmm4
111 movaps %xmm1, %xmm5
112 movaps %xmm2, %xmm6
113 movaps %xmm3, %xmm7
114 mulps 0(B0L), %xmm0
115 mulps 16(B0L), %xmm1
116 mulps 32(B0L), %xmm2
117 mulps 48(B0L), %xmm3
118 mulps 0(B0R), %xmm4
119 mulps 16(B0R), %xmm5
120 mulps 32(B0R), %xmm6
121 mulps 48(B0R), %xmm7
122 addps %xmm1, %xmm0
123 addps %xmm3, %xmm2
124 addps %xmm5, %xmm4
125 addps %xmm7, %xmm6
126 addps %xmm2, %xmm0
127 addps %xmm6, %xmm4
128 movaps %xmm0, TEMP(1)
129 movaps %xmm4, TEMP(5)
130
131 leal 128(WINDOW), WINDOW
132 leal 64(B0L), B0L
133 leal 64(B0R), B0R
134 /* sample 2: same computation, results to TEMP(2) / TEMP(6) */
135 movups (WINDOW), %xmm0
136 movups 16(WINDOW), %xmm1
137 movups 32(WINDOW), %xmm2
138 movups 48(WINDOW), %xmm3
139 movaps %xmm0, %xmm4
140 movaps %xmm1, %xmm5
141 movaps %xmm2, %xmm6
142 movaps %xmm3, %xmm7
143 mulps 0(B0L), %xmm0
144 mulps 16(B0L), %xmm1
145 mulps 32(B0L), %xmm2
146 mulps 48(B0L), %xmm3
147 mulps 0(B0R), %xmm4
148 mulps 16(B0R), %xmm5
149 mulps 32(B0R), %xmm6
150 mulps 48(B0R), %xmm7
151 addps %xmm1, %xmm0
152 addps %xmm3, %xmm2
153 addps %xmm5, %xmm4
154 addps %xmm7, %xmm6
155 addps %xmm2, %xmm0
156 addps %xmm6, %xmm4
157 movaps %xmm0, TEMP(2)
158 movaps %xmm4, TEMP(6)
159
160 leal 128(WINDOW), WINDOW
161 leal 64(B0L), B0L
162 leal 64(B0R), B0R
163 /* sample 3: left result stays in %xmm7 (TEMP(3) is never used), right goes to TEMP(7) */
164 movups (WINDOW), %xmm0
165 movups 16(WINDOW), %xmm1
166 movups 32(WINDOW), %xmm2
167 movups 48(WINDOW), %xmm3
168 movaps %xmm0, %xmm4
169 movaps %xmm1, %xmm5
170 movaps %xmm2, %xmm6
171 movaps %xmm3, %xmm7
172 mulps 0(B0L), %xmm0
173 mulps 16(B0L), %xmm1
174 mulps 32(B0L), %xmm2
175 mulps 48(B0L), %xmm3
176 mulps 0(B0R), %xmm4
177 mulps 16(B0R), %xmm5
178 mulps 32(B0R), %xmm6
179 mulps 48(B0R), %xmm7
180 addps %xmm1, %xmm0
181 addps %xmm3, %xmm2
182 addps %xmm5, %xmm4
183 addps %xmm7, %xmm6
184 addps %xmm2, %xmm0
185 addps %xmm6, %xmm4
186 movaps %xmm0, %xmm7 /* left partial sums, sample 3, kept in a register */
187 movaps %xmm4, TEMP(7)
188
189 leal 128(WINDOW), WINDOW
190 leal 64(B0L), B0L
191 leal 64(B0R), B0R
192 /* left channel: transpose the 4x4 block of partial sums so each lane belongs to one sample */
193 movaps TEMP(0), %xmm4
194 movaps TEMP(1), %xmm5
195 movaps TEMP(2), %xmm6
196 movaps %xmm4, %xmm0
197 movaps %xmm6, %xmm1
198 unpcklps %xmm5, %xmm4
199 unpcklps %xmm7, %xmm6
200 unpckhps %xmm5, %xmm0
201 unpckhps %xmm7, %xmm1
202 movaps %xmm4, %xmm2
203 movaps %xmm0, %xmm3
204 movlhps %xmm6, %xmm4 /* %xmm4 = partial sum 0 of samples 0..3 */
205 movhlps %xmm2, %xmm6 /* %xmm6 = partial sum 1 of samples 0..3 */
206 movlhps %xmm1, %xmm0 /* %xmm0 = partial sum 2 of samples 0..3 */
207 movhlps %xmm3, %xmm1 /* %xmm1 = partial sum 3 of samples 0..3 */
208 subps %xmm6, %xmm4 /* first half combines with alternating signs: p0 - p1 */
209 subps %xmm1, %xmm0 /* p2 - p3 */
210 addps %xmm4, %xmm0 /* (p0 - p1) + (p2 - p3) */
211 movaps %xmm0, %xmm2 /* park the 4 left results in %xmm2 */
212 /* right channel: same transpose and combine on TEMP(4)..TEMP(7) */
213 movaps TEMP(4), %xmm4
214 movaps TEMP(5), %xmm5
215 movaps TEMP(6), %xmm6
216 movaps TEMP(7), %xmm7
217 movaps %xmm4, %xmm0
218 movaps %xmm6, %xmm1
219 unpcklps %xmm5, %xmm4
220 unpcklps %xmm7, %xmm6
221 unpckhps %xmm5, %xmm0
222 unpckhps %xmm7, %xmm1
223 movaps %xmm2, %xmm5 /* move the left results to %xmm5, freeing %xmm2 as scratch */
224 movaps %xmm4, %xmm2
225 movaps %xmm0, %xmm3
226 movlhps %xmm6, %xmm4
227 movhlps %xmm2, %xmm6
228 movlhps %xmm1, %xmm0
229 movhlps %xmm3, %xmm1
230 subps %xmm6, %xmm4
231 subps %xmm1, %xmm0
232 addps %xmm4, %xmm0 /* %xmm0 = 4 right results, %xmm5 = 4 left results */
233 /* clip detection on the unscaled values, then scale and convert to int32 */
234 movaps %xmm5, %xmm1
235 movaps %xmm5, %xmm2
236 movaps %xmm0, %xmm3
237 movaps %xmm0, %xmm4
238 mulps ASM_NAME(scale_s32), %xmm5 /* left * 65536.0 */
239 mulps ASM_NAME(scale_s32), %xmm0 /* right * 65536.0 */
240 cmpnleps ASM_NAME(maxmin_s32), %xmm1 /* left: all-ones lane where value is not <= upper bound */
241 cmpltps ASM_NAME(maxmin_s32)+16, %xmm2 /* left: all-ones lane where value < lower bound */
242 cmpnleps ASM_NAME(maxmin_s32), %xmm3 /* right: above-upper-bound mask */
243 cmpltps ASM_NAME(maxmin_s32)+16, %xmm4 /* right: below-lower-bound mask */
244 cvtps2pi %xmm5, %mm0 /* left samples 0,1 -> int32; out-of-range converts to 0x80000000 */
245 cvtps2pi %xmm0, %mm1 /* right samples 0,1 -> int32 */
246 cvtps2pi %xmm1, %mm2 /* an all-ones mask is a NaN bit pattern and converts to 0x80000000; a zero mask converts to 0 */
247 cvtps2pi %xmm3, %mm3
248 psrad $31, %mm2 /* spread the sign bit: -1 where clipped high, else 0 */
249 psrad $31, %mm3
250 pxor %mm2, %mm0 /* 0x80000000 ^ 0xffffffff = 0x7fffffff: positive overflow saturates to INT32_MAX */
251 pxor %mm3, %mm1
252 movq %mm0, %mm4
253 punpckldq %mm1, %mm0 /* L0, R0 */
254 punpckhdq %mm1, %mm4 /* L1, R1 */
255 movq %mm0, (SAMPLES)
256 movq %mm4, 8(SAMPLES)
257 movhlps %xmm5, %xmm5 /* bring samples 2,3 (values and high-clip masks) into the low halves */
258 movhlps %xmm0, %xmm0
259 movhlps %xmm1, %xmm1
260 movhlps %xmm3, %xmm3
261 cvtps2pi %xmm5, %mm0
262 cvtps2pi %xmm0, %mm1
263 cvtps2pi %xmm1, %mm4
264 cvtps2pi %xmm3, %mm5
265 psrad $31, %mm4
266 psrad $31, %mm5
267 pxor %mm4, %mm0
268 pxor %mm5, %mm1
269 movq %mm0, %mm6
270 punpckldq %mm1, %mm0 /* L2, R2 */
271 punpckhdq %mm1, %mm6 /* L3, R3 */
272 movq %mm0, 16(SAMPLES)
273 movq %mm6, 24(SAMPLES)
274 /* count clipped samples: turn every mask lane into a 16-bit 0/1 and accumulate */
275 packssdw %mm4, %mm2 /* left high-clip masks of samples 0..3 as four words of -1/0 */
276 packssdw %mm5, %mm3 /* right high-clip masks */
277 psrlw $15, %mm2 /* -1 -> 1 */
278 psrlw $15, %mm3
279 cvtps2pi %xmm2, %mm0 /* low-clip masks, samples 0,1: 0x80000000 or 0 */
280 cvtps2pi %xmm4, %mm1
281 movhlps %xmm2, %xmm2
282 movhlps %xmm4, %xmm4
283 cvtps2pi %xmm2, %mm4 /* low-clip masks, samples 2,3 */
284 cvtps2pi %xmm4, %mm5
285 packssdw %mm4, %mm0 /* 0x80000000 saturates to 0x8000 */
286 packssdw %mm5, %mm1
287 psrlw $15, %mm0 /* 0x8000 -> 1 */
288 psrlw $15, %mm1
289 paddw %mm3, %mm2
290 paddw %mm1, %mm0
291 paddw %mm2, %mm0
292 paddw %mm0, MMREG_CLIP /* add this iteration's clips to the running per-word counts */
293
294 leal 32(SAMPLES), SAMPLES /* 4 stereo int32 pairs written */
295 decl %ecx
296 jnz Loop_start_1
297
298 movl $4, %ecx /* second half: 4 more iterations */
299
300 ALIGN16
301 Loop_start_2: /* same as Loop_start_1, except B0L/B0R step backward and the partial sums are all added */
302 movups (WINDOW), %xmm0 /* sample 0 */
303 movups 16(WINDOW), %xmm1
304 movups 32(WINDOW), %xmm2
305 movups 48(WINDOW), %xmm3
306 movaps %xmm0, %xmm4
307 movaps %xmm1, %xmm5
308 movaps %xmm2, %xmm6
309 movaps %xmm3, %xmm7
310 mulps 0(B0L), %xmm0
311 mulps 16(B0L), %xmm1
312 mulps 32(B0L), %xmm2
313 mulps 48(B0L), %xmm3
314 mulps 0(B0R), %xmm4
315 mulps 16(B0R), %xmm5
316 mulps 32(B0R), %xmm6
317 mulps 48(B0R), %xmm7
318 addps %xmm1, %xmm0
319 addps %xmm3, %xmm2
320 addps %xmm5, %xmm4
321 addps %xmm7, %xmm6
322 addps %xmm2, %xmm0
323 addps %xmm6, %xmm4
324 movaps %xmm0, TEMP(0)
325 movaps %xmm4, TEMP(4)
326
327 leal 128(WINDOW), WINDOW /* window still advances by 128 bytes */
328 leal -64(B0L), B0L /* inputs now move backward by 64 bytes per sample */
329 leal -64(B0R), B0R
330 /* sample 1 */
331 movups (WINDOW), %xmm0
332 movups 16(WINDOW), %xmm1
333 movups 32(WINDOW), %xmm2
334 movups 48(WINDOW), %xmm3
335 movaps %xmm0, %xmm4
336 movaps %xmm1, %xmm5
337 movaps %xmm2, %xmm6
338 movaps %xmm3, %xmm7
339 mulps 0(B0L), %xmm0
340 mulps 16(B0L), %xmm1
341 mulps 32(B0L), %xmm2
342 mulps 48(B0L), %xmm3
343 mulps 0(B0R), %xmm4
344 mulps 16(B0R), %xmm5
345 mulps 32(B0R), %xmm6
346 mulps 48(B0R), %xmm7
347 addps %xmm1, %xmm0
348 addps %xmm3, %xmm2
349 addps %xmm5, %xmm4
350 addps %xmm7, %xmm6
351 addps %xmm2, %xmm0
352 addps %xmm6, %xmm4
353 movaps %xmm0, TEMP(1)
354 movaps %xmm4, TEMP(5)
355
356 leal 128(WINDOW), WINDOW
357 leal -64(B0L), B0L
358 leal -64(B0R), B0R
359 /* sample 2 */
360 movups (WINDOW), %xmm0
361 movups 16(WINDOW), %xmm1
362 movups 32(WINDOW), %xmm2
363 movups 48(WINDOW), %xmm3
364 movaps %xmm0, %xmm4
365 movaps %xmm1, %xmm5
366 movaps %xmm2, %xmm6
367 movaps %xmm3, %xmm7
368 mulps 0(B0L), %xmm0
369 mulps 16(B0L), %xmm1
370 mulps 32(B0L), %xmm2
371 mulps 48(B0L), %xmm3
372 mulps 0(B0R), %xmm4
373 mulps 16(B0R), %xmm5
374 mulps 32(B0R), %xmm6
375 mulps 48(B0R), %xmm7
376 addps %xmm1, %xmm0
377 addps %xmm3, %xmm2
378 addps %xmm5, %xmm4
379 addps %xmm7, %xmm6
380 addps %xmm2, %xmm0
381 addps %xmm6, %xmm4
382 movaps %xmm0, TEMP(2)
383 movaps %xmm4, TEMP(6)
384
385 leal 128(WINDOW), WINDOW
386 leal -64(B0L), B0L
387 leal -64(B0R), B0R
388 /* sample 3: left result stays in %xmm7, right goes to TEMP(7) */
389 movups (WINDOW), %xmm0
390 movups 16(WINDOW), %xmm1
391 movups 32(WINDOW), %xmm2
392 movups 48(WINDOW), %xmm3
393 movaps %xmm0, %xmm4
394 movaps %xmm1, %xmm5
395 movaps %xmm2, %xmm6
396 movaps %xmm3, %xmm7
397 mulps 0(B0L), %xmm0
398 mulps 16(B0L), %xmm1
399 mulps 32(B0L), %xmm2
400 mulps 48(B0L), %xmm3
401 mulps 0(B0R), %xmm4
402 mulps 16(B0R), %xmm5
403 mulps 32(B0R), %xmm6
404 mulps 48(B0R), %xmm7
405 addps %xmm1, %xmm0
406 addps %xmm3, %xmm2
407 addps %xmm5, %xmm4
408 addps %xmm7, %xmm6
409 addps %xmm2, %xmm0
410 addps %xmm6, %xmm4
411 movaps %xmm0, %xmm7
412 movaps %xmm4, TEMP(7)
413
414 leal 128(WINDOW), WINDOW
415 leal -64(B0L), B0L
416 leal -64(B0R), B0R
417 /* left channel: 4x4 transpose of the partial sums */
418 movaps TEMP(0), %xmm4
419 movaps TEMP(1), %xmm5
420 movaps TEMP(2), %xmm6
421 movaps %xmm4, %xmm0
422 movaps %xmm6, %xmm1
423 unpcklps %xmm5, %xmm4
424 unpcklps %xmm7, %xmm6
425 unpckhps %xmm5, %xmm0
426 unpckhps %xmm7, %xmm1
427 movaps %xmm4, %xmm2
428 movaps %xmm0, %xmm3
429 movlhps %xmm6, %xmm4
430 movhlps %xmm2, %xmm6
431 movlhps %xmm1, %xmm0
432 movhlps %xmm3, %xmm1
433 addps %xmm6, %xmm4 /* second half adds all four partial sums: p0 + p1 */
434 addps %xmm1, %xmm0 /* p2 + p3 */
435 addps %xmm4, %xmm0
436 movaps %xmm0, %xmm2 /* park the 4 left results in %xmm2 */
437 /* right channel: same transpose and sum */
438 movaps TEMP(4), %xmm4
439 movaps TEMP(5), %xmm5
440 movaps TEMP(6), %xmm6
441 movaps TEMP(7), %xmm7
442 movaps %xmm4, %xmm0
443 movaps %xmm6, %xmm1
444 unpcklps %xmm5, %xmm4
445 unpcklps %xmm7, %xmm6
446 unpckhps %xmm5, %xmm0
447 unpckhps %xmm7, %xmm1
448 movaps %xmm2, %xmm5 /* left results to %xmm5 */
449 movaps %xmm4, %xmm2
450 movaps %xmm0, %xmm3
451 movlhps %xmm6, %xmm4
452 movhlps %xmm2, %xmm6
453 movlhps %xmm1, %xmm0
454 movhlps %xmm3, %xmm1
455 addps %xmm6, %xmm4
456 addps %xmm1, %xmm0
457 addps %xmm4, %xmm0 /* %xmm0 = 4 right results, %xmm5 = 4 left results */
458 /* clip detection, scaling, saturating conversion and store: identical to the first loop */
459 movaps %xmm5, %xmm1
460 movaps %xmm5, %xmm2
461 movaps %xmm0, %xmm3
462 movaps %xmm0, %xmm4
463 mulps ASM_NAME(scale_s32), %xmm5
464 mulps ASM_NAME(scale_s32), %xmm0
465 cmpnleps ASM_NAME(maxmin_s32), %xmm1 /* left above-upper-bound mask */
466 cmpltps ASM_NAME(maxmin_s32)+16, %xmm2 /* left below-lower-bound mask */
467 cmpnleps ASM_NAME(maxmin_s32), %xmm3 /* right above-upper-bound mask */
468 cmpltps ASM_NAME(maxmin_s32)+16, %xmm4 /* right below-lower-bound mask */
469 cvtps2pi %xmm5, %mm0
470 cvtps2pi %xmm0, %mm1
471 cvtps2pi %xmm1, %mm2
472 cvtps2pi %xmm3, %mm3
473 psrad $31, %mm2
474 psrad $31, %mm3
475 pxor %mm2, %mm0 /* positive overflow: 0x80000000 -> 0x7fffffff */
476 pxor %mm3, %mm1
477 movq %mm0, %mm4
478 punpckldq %mm1, %mm0 /* L0, R0 */
479 punpckhdq %mm1, %mm4 /* L1, R1 */
480 movq %mm0, (SAMPLES)
481 movq %mm4, 8(SAMPLES)
482 movhlps %xmm5, %xmm5
483 movhlps %xmm0, %xmm0
484 movhlps %xmm1, %xmm1
485 movhlps %xmm3, %xmm3
486 cvtps2pi %xmm5, %mm0
487 cvtps2pi %xmm0, %mm1
488 cvtps2pi %xmm1, %mm4
489 cvtps2pi %xmm3, %mm5
490 psrad $31, %mm4
491 psrad $31, %mm5
492 pxor %mm4, %mm0
493 pxor %mm5, %mm1
494 movq %mm0, %mm6
495 punpckldq %mm1, %mm0 /* L2, R2 */
496 punpckhdq %mm1, %mm6 /* L3, R3 */
497 movq %mm0, 16(SAMPLES)
498 movq %mm6, 24(SAMPLES)
499 /* count clipped samples, as in the first loop */
500 packssdw %mm4, %mm2
501 packssdw %mm5, %mm3
502 psrlw $15, %mm2
503 psrlw $15, %mm3
504 cvtps2pi %xmm2, %mm0
505 cvtps2pi %xmm4, %mm1
506 movhlps %xmm2, %xmm2
507 movhlps %xmm4, %xmm4
508 cvtps2pi %xmm2, %mm4
509 cvtps2pi %xmm4, %mm5
510 packssdw %mm4, %mm0
511 packssdw %mm5, %mm1
512 psrlw $15, %mm0
513 psrlw $15, %mm1
514 paddw %mm3, %mm2
515 paddw %mm1, %mm0
516 paddw %mm2, %mm0
517 paddw %mm0, MMREG_CLIP
518
519 leal 32(SAMPLES), SAMPLES
520 decl %ecx
521 jnz Loop_start_2
522 /* horizontal sum of the four 16-bit clip counters into word 0 */
523 pshufw $0xee, MMREG_CLIP, %mm0 /* words 2,3 copied down to positions 0,1 */
524 paddw MMREG_CLIP, %mm0 /* word0 = w0+w2, word1 = w1+w3 */
525 pshufw $0x55, %mm0, %mm1 /* broadcast word 1 */
526 paddw %mm1, %mm0 /* word0 = w0+w1+w2+w3 */
527 movd %mm0, %eax
528 andl $0xffff, %eax /* return value: only the low 16-bit total */
529
530 popl %edi /* restore the saved registers in reverse push order */
531 popl %esi
532 popl %ebx
533 movl %ebp, %esp /* discard the aligned scratch area */
534 popl %ebp
535
536 emms /* leave MMX state so the caller can use x87 again */
537
538 ret
539
540 NONEXEC_STACK /* macro not defined in this file (presumably from mangle.h); by its name it marks the stack non-executable - confirm */