/* Sync with trunk (r48414) -- reactos.git: lib/3rdparty/libmpg123/synth_stereo_sse_accurate.S */
/*
	synth_stereo_sse_accurate: SSE optimized synth (stereo specific, MPEG-compliant 16bit output version)

	copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/
8
#include "mangle.h"

/*
	Register roles used by synth_1to1_stereo_sse_accurate_asm.
	The aliases mirror the C parameter names; the function loads each one
	from its stack argument in the prologue.
*/

/* real *window; */
#define WINDOW %ebx
/* real *b0l; */
#define B0L %edx
/* real *b0r; */
#define B0R %esi
/* real *samples; */
#define SAMPLES %edi

/*
	TEMP(n): n-th 16-byte scratch slot on the stack.
	The function reserves a 16-byte aligned area and then pushes three
	registers (12 bytes), hence the base offset of 12 from %esp.
	The argument is parenthesized so that a compound expression expands
	correctly.
*/
#define TEMP(n) (12+16*(n))(%esp)
/* MMX register that accumulates the per-lane clip counters. */
#define MMREG_CLIP %mm7
22
/*
	int synth_1to1_stereo_sse_accurate_asm(real *window, real *b0l, real *b0r, short *samples, int bo1);
	return value: number of clipped samples
*/
27
/*
	maxmin_s16: clipping limits for 16 bit output, stored as raw IEEE-754
	single precision bit patterns, four lanes each.
	  bytes  0..15: 4 x  32767.0 (0x46FFFE00)
	  bytes 16..31: 4 x -32768.0 (0xC7000000)
	The table is used directly as a memory operand of cmpnleps/cmpltps,
	which requires a 16-byte aligned address; the ALIGN32 macro (from
	mangle.h) is expected to provide that.
*/
#ifndef __APPLE__
	.section	.rodata
#else
	.data
#endif
	ALIGN32
ASM_NAME(maxmin_s16):
	.long	1191181824 /* 32767.0 */
	.long	1191181824
	.long	1191181824
	.long	1191181824
	.long	-956301312 /* -32768.0 */
	.long	-956301312
	.long	-956301312
	.long	-956301312
/*
	int synth_1to1_stereo_sse_accurate_asm(real *window, real *b0l, real *b0r, short *samples, int bo1);

	Purpose
	  Produces 32 interleaved left/right 16 bit samples (128 bytes) at
	  'samples' from dot products of 'window' with 'b0l' (left) and 'b0r'
	  (right), and returns how many of the 64 output values had to be clipped.

	ABI
	  32-bit x86, arguments on the stack (read from 8(%ebp) .. 24(%ebp)),
	  result in %eax, plain 'ret' (the caller removes the arguments).
	  Preserved: %ebx, %esi, %edi, %ebp, %esp.
	  Clobbered: %eax, %ecx, %edx, %xmm0-%xmm7, %mm0-%mm5, %mm7, flags.
	  'emms' is executed before returning, so the x87/MMX state is left empty.

	Memory access / alignment
	  window : read with movups starting at byte offset 64 - 4*bo1; no
	           alignment needed. 32 rows of 64 bytes, 128 bytes apart.
	  b0l/b0r: used as mulps memory operands, so they must be 16-byte
	           aligned. Rows of 64 bytes are read walking forward from
	           offset 0 in the first half and backward from offset 1024 in
	           the second half.
	  samples: written with movq; no alignment needed.
	  Stack  : realigned to 16 bytes, 128 bytes of scratch (see TEMP).

	Algorithm (per loop iteration = 4 output frames)
	  1. For each of 4 rows, multiply 16 window values with 16 b0l values
	     and with 16 b0r values and fold each product down to 4 partial
	     sums p0..p3 (one xmm register per row and channel).
	  2. Transpose the 4x4 partial sums per channel and combine them:
	     first half : sample = p0 - p1 + p2 - p3
	     second half: sample = p0 + p1 + p2 + p3
	  3. Convert to 32 bit integers with cvtps2pi (uses the current MXCSR
	     rounding mode) and saturate to 16 bit with packssdw.
	  4. Count clipped samples: compare against 32767.0 / -32768.0, turn
	     each true lane into 1 and accumulate in MMREG_CLIP.
*/
	.text
	ALIGN16
.globl ASM_NAME(synth_1to1_stereo_sse_accurate_asm)
ASM_NAME(synth_1to1_stereo_sse_accurate_asm):
	/* Prologue: frame pointer, 16-byte aligned 128-byte scratch area,
	   then the callee-saved registers (12 bytes, see TEMP). */
	pushl	%ebp
	movl	%esp, %ebp
	andl	$-16, %esp
	subl	$128, %esp
	pushl	%ebx
	pushl	%esi
	pushl	%edi

	/* clip counters = 0 */
	pxor	MMREG_CLIP, MMREG_CLIP

	/* Load arguments; %eax = bo1 * 4 (byte offset of bo1 floats). */
	movl	8(%ebp), WINDOW
	movl	12(%ebp), B0L
	movl	16(%ebp), B0R
	movl	20(%ebp), SAMPLES
	movl	24(%ebp), %eax
	shll	$2, %eax

	/* WINDOW = window + 64 - 4*bo1 bytes */
	leal	64(WINDOW), WINDOW
	subl	%eax, WINDOW

	/* ---- First half: 4 iterations, b0 pointers walk forward ---- */
	movl	$4, %ecx

	ALIGN16
.Lloop_first_half:
	/* row 0: left partial sums -> TEMP(0), right -> TEMP(4) */
	movups	(WINDOW), %xmm0
	movups	16(WINDOW), %xmm1
	movups	32(WINDOW), %xmm2
	movups	48(WINDOW), %xmm3
	movaps	%xmm0, %xmm4
	movaps	%xmm1, %xmm5
	movaps	%xmm2, %xmm6
	movaps	%xmm3, %xmm7
	mulps	0(B0L), %xmm0
	mulps	16(B0L), %xmm1
	mulps	32(B0L), %xmm2
	mulps	48(B0L), %xmm3
	mulps	0(B0R), %xmm4
	mulps	16(B0R), %xmm5
	mulps	32(B0R), %xmm6
	mulps	48(B0R), %xmm7
	addps	%xmm1, %xmm0
	addps	%xmm3, %xmm2
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
	addps	%xmm2, %xmm0
	addps	%xmm6, %xmm4
	movaps	%xmm0, TEMP(0)
	movaps	%xmm4, TEMP(4)

	leal	128(WINDOW), WINDOW
	leal	64(B0L), B0L
	leal	64(B0R), B0R

	/* row 1: left -> TEMP(1), right -> TEMP(5) */
	movups	(WINDOW), %xmm0
	movups	16(WINDOW), %xmm1
	movups	32(WINDOW), %xmm2
	movups	48(WINDOW), %xmm3
	movaps	%xmm0, %xmm4
	movaps	%xmm1, %xmm5
	movaps	%xmm2, %xmm6
	movaps	%xmm3, %xmm7
	mulps	0(B0L), %xmm0
	mulps	16(B0L), %xmm1
	mulps	32(B0L), %xmm2
	mulps	48(B0L), %xmm3
	mulps	0(B0R), %xmm4
	mulps	16(B0R), %xmm5
	mulps	32(B0R), %xmm6
	mulps	48(B0R), %xmm7
	addps	%xmm1, %xmm0
	addps	%xmm3, %xmm2
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
	addps	%xmm2, %xmm0
	addps	%xmm6, %xmm4
	movaps	%xmm0, TEMP(1)
	movaps	%xmm4, TEMP(5)

	leal	128(WINDOW), WINDOW
	leal	64(B0L), B0L
	leal	64(B0R), B0R

	/* row 2: left -> TEMP(2), right -> TEMP(6) */
	movups	(WINDOW), %xmm0
	movups	16(WINDOW), %xmm1
	movups	32(WINDOW), %xmm2
	movups	48(WINDOW), %xmm3
	movaps	%xmm0, %xmm4
	movaps	%xmm1, %xmm5
	movaps	%xmm2, %xmm6
	movaps	%xmm3, %xmm7
	mulps	0(B0L), %xmm0
	mulps	16(B0L), %xmm1
	mulps	32(B0L), %xmm2
	mulps	48(B0L), %xmm3
	mulps	0(B0R), %xmm4
	mulps	16(B0R), %xmm5
	mulps	32(B0R), %xmm6
	mulps	48(B0R), %xmm7
	addps	%xmm1, %xmm0
	addps	%xmm3, %xmm2
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
	addps	%xmm2, %xmm0
	addps	%xmm6, %xmm4
	movaps	%xmm0, TEMP(2)
	movaps	%xmm4, TEMP(6)

	leal	128(WINDOW), WINDOW
	leal	64(B0L), B0L
	leal	64(B0R), B0R

	/* row 3: left stays in %xmm7 (TEMP(3) is not used), right -> TEMP(7) */
	movups	(WINDOW), %xmm0
	movups	16(WINDOW), %xmm1
	movups	32(WINDOW), %xmm2
	movups	48(WINDOW), %xmm3
	movaps	%xmm0, %xmm4
	movaps	%xmm1, %xmm5
	movaps	%xmm2, %xmm6
	movaps	%xmm3, %xmm7
	mulps	0(B0L), %xmm0
	mulps	16(B0L), %xmm1
	mulps	32(B0L), %xmm2
	mulps	48(B0L), %xmm3
	mulps	0(B0R), %xmm4
	mulps	16(B0R), %xmm5
	mulps	32(B0R), %xmm6
	mulps	48(B0R), %xmm7
	addps	%xmm1, %xmm0
	addps	%xmm3, %xmm2
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
	addps	%xmm2, %xmm0
	addps	%xmm6, %xmm4
	movaps	%xmm0, %xmm7
	movaps	%xmm4, TEMP(7)

	leal	128(WINDOW), WINDOW
	leal	64(B0L), B0L
	leal	64(B0R), B0R

	/* Left channel: transpose the 4 rows of partial sums and combine
	   them as p0 - p1 + p2 - p3; result parked in %xmm2. */
	movaps	TEMP(0), %xmm4
	movaps	TEMP(1), %xmm5
	movaps	TEMP(2), %xmm6
	movaps	%xmm4, %xmm0
	movaps	%xmm6, %xmm1
	unpcklps	%xmm5, %xmm4
	unpcklps	%xmm7, %xmm6
	unpckhps	%xmm5, %xmm0
	unpckhps	%xmm7, %xmm1
	movaps	%xmm4, %xmm2
	movaps	%xmm0, %xmm3
	movlhps	%xmm6, %xmm4
	movhlps	%xmm2, %xmm6
	movlhps	%xmm1, %xmm0
	movhlps	%xmm3, %xmm1
	subps	%xmm6, %xmm4
	subps	%xmm1, %xmm0
	addps	%xmm4, %xmm0
	movaps	%xmm0, %xmm2

	/* Right channel: same transpose and combination; the left result is
	   moved to %xmm5 on the way, the right result ends up in %xmm0. */
	movaps	TEMP(4), %xmm4
	movaps	TEMP(5), %xmm5
	movaps	TEMP(6), %xmm6
	movaps	TEMP(7), %xmm7
	movaps	%xmm4, %xmm0
	movaps	%xmm6, %xmm1
	unpcklps	%xmm5, %xmm4
	unpcklps	%xmm7, %xmm6
	unpckhps	%xmm5, %xmm0
	unpckhps	%xmm7, %xmm1
	movaps	%xmm2, %xmm5
	movaps	%xmm4, %xmm2
	movaps	%xmm0, %xmm3
	movlhps	%xmm6, %xmm4
	movhlps	%xmm2, %xmm6
	movlhps	%xmm1, %xmm0
	movhlps	%xmm3, %xmm1
	subps	%xmm6, %xmm4
	subps	%xmm1, %xmm0
	addps	%xmm4, %xmm0

	/* Clip masks: %xmm1/%xmm2 = left above max / below min,
	   %xmm3/%xmm4 = right above max / below min. */
	movaps	%xmm5, %xmm1
	movaps	%xmm5, %xmm2
	movaps	%xmm0, %xmm3
	movaps	%xmm0, %xmm4
	cmpnleps	ASM_NAME(maxmin_s16), %xmm1
	cmpltps	ASM_NAME(maxmin_s16)+16, %xmm2
	cmpnleps	ASM_NAME(maxmin_s16), %xmm3
	cmpltps	ASM_NAME(maxmin_s16)+16, %xmm4

	/* Convert to int32, saturate to int16 (this performs the clipping),
	   interleave left/right and store 4 stereo frames. */
	cvtps2pi	%xmm5, %mm0
	cvtps2pi	%xmm0, %mm1
	movhlps	%xmm5, %xmm5
	movhlps	%xmm0, %xmm0
	cvtps2pi	%xmm5, %mm2
	cvtps2pi	%xmm0, %mm3
	packssdw	%mm2, %mm0
	packssdw	%mm3, %mm1
	movq	%mm0, %mm2
	punpcklwd	%mm1, %mm0
	punpckhwd	%mm1, %mm2
	movq	%mm0, (SAMPLES)
	movq	%mm2, 8(SAMPLES)

	/* Count clipped samples. A true compare lane is all ones, which
	   cvtps2pi turns into 0x80000000 (integer indefinite); packssdw
	   saturates that to 0x8000 and psrlw $15 leaves 1 per clipped
	   sample, 0 otherwise. The four mask registers are summed into
	   MMREG_CLIP. */
	cvtps2pi	%xmm1, %mm2
	cvtps2pi	%xmm3, %mm3
	movhlps	%xmm1, %xmm1
	movhlps	%xmm3, %xmm3
	cvtps2pi	%xmm1, %mm4
	cvtps2pi	%xmm3, %mm5
	packssdw	%mm4, %mm2
	packssdw	%mm5, %mm3
	psrlw	$15, %mm2
	psrlw	$15, %mm3
	cvtps2pi	%xmm2, %mm0
	cvtps2pi	%xmm4, %mm1
	movhlps	%xmm2, %xmm2
	movhlps	%xmm4, %xmm4
	cvtps2pi	%xmm2, %mm4
	cvtps2pi	%xmm4, %mm5
	packssdw	%mm4, %mm0
	packssdw	%mm5, %mm1
	psrlw	$15, %mm0
	psrlw	$15, %mm1
	paddw	%mm3, %mm2
	paddw	%mm1, %mm0
	paddw	%mm2, %mm0
	paddw	%mm0, MMREG_CLIP

	leal	16(SAMPLES), SAMPLES
	decl	%ecx
	jnz	.Lloop_first_half

	/* ---- Second half: 4 iterations, b0 pointers walk backward and
	   the partial sums are all added ---- */
	movl	$4, %ecx

	ALIGN16
.Lloop_second_half:
	/* row 0: left partial sums -> TEMP(0), right -> TEMP(4) */
	movups	(WINDOW), %xmm0
	movups	16(WINDOW), %xmm1
	movups	32(WINDOW), %xmm2
	movups	48(WINDOW), %xmm3
	movaps	%xmm0, %xmm4
	movaps	%xmm1, %xmm5
	movaps	%xmm2, %xmm6
	movaps	%xmm3, %xmm7
	mulps	0(B0L), %xmm0
	mulps	16(B0L), %xmm1
	mulps	32(B0L), %xmm2
	mulps	48(B0L), %xmm3
	mulps	0(B0R), %xmm4
	mulps	16(B0R), %xmm5
	mulps	32(B0R), %xmm6
	mulps	48(B0R), %xmm7
	addps	%xmm1, %xmm0
	addps	%xmm3, %xmm2
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
	addps	%xmm2, %xmm0
	addps	%xmm6, %xmm4
	movaps	%xmm0, TEMP(0)
	movaps	%xmm4, TEMP(4)

	leal	128(WINDOW), WINDOW
	leal	-64(B0L), B0L
	leal	-64(B0R), B0R

	/* row 1: left -> TEMP(1), right -> TEMP(5) */
	movups	(WINDOW), %xmm0
	movups	16(WINDOW), %xmm1
	movups	32(WINDOW), %xmm2
	movups	48(WINDOW), %xmm3
	movaps	%xmm0, %xmm4
	movaps	%xmm1, %xmm5
	movaps	%xmm2, %xmm6
	movaps	%xmm3, %xmm7
	mulps	0(B0L), %xmm0
	mulps	16(B0L), %xmm1
	mulps	32(B0L), %xmm2
	mulps	48(B0L), %xmm3
	mulps	0(B0R), %xmm4
	mulps	16(B0R), %xmm5
	mulps	32(B0R), %xmm6
	mulps	48(B0R), %xmm7
	addps	%xmm1, %xmm0
	addps	%xmm3, %xmm2
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
	addps	%xmm2, %xmm0
	addps	%xmm6, %xmm4
	movaps	%xmm0, TEMP(1)
	movaps	%xmm4, TEMP(5)

	leal	128(WINDOW), WINDOW
	leal	-64(B0L), B0L
	leal	-64(B0R), B0R

	/* row 2: left -> TEMP(2), right -> TEMP(6) */
	movups	(WINDOW), %xmm0
	movups	16(WINDOW), %xmm1
	movups	32(WINDOW), %xmm2
	movups	48(WINDOW), %xmm3
	movaps	%xmm0, %xmm4
	movaps	%xmm1, %xmm5
	movaps	%xmm2, %xmm6
	movaps	%xmm3, %xmm7
	mulps	0(B0L), %xmm0
	mulps	16(B0L), %xmm1
	mulps	32(B0L), %xmm2
	mulps	48(B0L), %xmm3
	mulps	0(B0R), %xmm4
	mulps	16(B0R), %xmm5
	mulps	32(B0R), %xmm6
	mulps	48(B0R), %xmm7
	addps	%xmm1, %xmm0
	addps	%xmm3, %xmm2
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
	addps	%xmm2, %xmm0
	addps	%xmm6, %xmm4
	movaps	%xmm0, TEMP(2)
	movaps	%xmm4, TEMP(6)

	leal	128(WINDOW), WINDOW
	leal	-64(B0L), B0L
	leal	-64(B0R), B0R

	/* row 3: left stays in %xmm7 (TEMP(3) is not used), right -> TEMP(7) */
	movups	(WINDOW), %xmm0
	movups	16(WINDOW), %xmm1
	movups	32(WINDOW), %xmm2
	movups	48(WINDOW), %xmm3
	movaps	%xmm0, %xmm4
	movaps	%xmm1, %xmm5
	movaps	%xmm2, %xmm6
	movaps	%xmm3, %xmm7
	mulps	0(B0L), %xmm0
	mulps	16(B0L), %xmm1
	mulps	32(B0L), %xmm2
	mulps	48(B0L), %xmm3
	mulps	0(B0R), %xmm4
	mulps	16(B0R), %xmm5
	mulps	32(B0R), %xmm6
	mulps	48(B0R), %xmm7
	addps	%xmm1, %xmm0
	addps	%xmm3, %xmm2
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
	addps	%xmm2, %xmm0
	addps	%xmm6, %xmm4
	movaps	%xmm0, %xmm7
	movaps	%xmm4, TEMP(7)

	leal	128(WINDOW), WINDOW
	leal	-64(B0L), B0L
	leal	-64(B0R), B0R

	/* Left channel: transpose and combine as p0 + p1 + p2 + p3;
	   result parked in %xmm2. */
	movaps	TEMP(0), %xmm4
	movaps	TEMP(1), %xmm5
	movaps	TEMP(2), %xmm6
	movaps	%xmm4, %xmm0
	movaps	%xmm6, %xmm1
	unpcklps	%xmm5, %xmm4
	unpcklps	%xmm7, %xmm6
	unpckhps	%xmm5, %xmm0
	unpckhps	%xmm7, %xmm1
	movaps	%xmm4, %xmm2
	movaps	%xmm0, %xmm3
	movlhps	%xmm6, %xmm4
	movhlps	%xmm2, %xmm6
	movlhps	%xmm1, %xmm0
	movhlps	%xmm3, %xmm1
	addps	%xmm6, %xmm4
	addps	%xmm1, %xmm0
	addps	%xmm4, %xmm0
	movaps	%xmm0, %xmm2

	/* Right channel: same; left result moves to %xmm5, right result
	   ends up in %xmm0. */
	movaps	TEMP(4), %xmm4
	movaps	TEMP(5), %xmm5
	movaps	TEMP(6), %xmm6
	movaps	TEMP(7), %xmm7
	movaps	%xmm4, %xmm0
	movaps	%xmm6, %xmm1
	unpcklps	%xmm5, %xmm4
	unpcklps	%xmm7, %xmm6
	unpckhps	%xmm5, %xmm0
	unpckhps	%xmm7, %xmm1
	movaps	%xmm2, %xmm5
	movaps	%xmm4, %xmm2
	movaps	%xmm0, %xmm3
	movlhps	%xmm6, %xmm4
	movhlps	%xmm2, %xmm6
	movlhps	%xmm1, %xmm0
	movhlps	%xmm3, %xmm1
	addps	%xmm6, %xmm4
	addps	%xmm1, %xmm0
	addps	%xmm4, %xmm0

	/* Clip masks: %xmm1/%xmm2 = left above max / below min,
	   %xmm3/%xmm4 = right above max / below min. */
	movaps	%xmm5, %xmm1
	movaps	%xmm5, %xmm2
	movaps	%xmm0, %xmm3
	movaps	%xmm0, %xmm4
	cmpnleps	ASM_NAME(maxmin_s16), %xmm1
	cmpltps	ASM_NAME(maxmin_s16)+16, %xmm2
	cmpnleps	ASM_NAME(maxmin_s16), %xmm3
	cmpltps	ASM_NAME(maxmin_s16)+16, %xmm4

	/* Convert, saturate to int16, interleave left/right, store. */
	cvtps2pi	%xmm5, %mm0
	cvtps2pi	%xmm0, %mm1
	movhlps	%xmm5, %xmm5
	movhlps	%xmm0, %xmm0
	cvtps2pi	%xmm5, %mm2
	cvtps2pi	%xmm0, %mm3
	packssdw	%mm2, %mm0
	packssdw	%mm3, %mm1
	movq	%mm0, %mm2
	punpcklwd	%mm1, %mm0
	punpckhwd	%mm1, %mm2
	movq	%mm0, (SAMPLES)
	movq	%mm2, 8(SAMPLES)

	/* Count clipped samples (mask lane -> 1, see first half). */
	cvtps2pi	%xmm1, %mm2
	cvtps2pi	%xmm3, %mm3
	movhlps	%xmm1, %xmm1
	movhlps	%xmm3, %xmm3
	cvtps2pi	%xmm1, %mm4
	cvtps2pi	%xmm3, %mm5
	packssdw	%mm4, %mm2
	packssdw	%mm5, %mm3
	psrlw	$15, %mm2
	psrlw	$15, %mm3
	cvtps2pi	%xmm2, %mm0
	cvtps2pi	%xmm4, %mm1
	movhlps	%xmm2, %xmm2
	movhlps	%xmm4, %xmm4
	cvtps2pi	%xmm2, %mm4
	cvtps2pi	%xmm4, %mm5
	packssdw	%mm4, %mm0
	packssdw	%mm5, %mm1
	psrlw	$15, %mm0
	psrlw	$15, %mm1
	paddw	%mm3, %mm2
	paddw	%mm1, %mm0
	paddw	%mm2, %mm0
	paddw	%mm0, MMREG_CLIP

	leal	16(SAMPLES), SAMPLES
	decl	%ecx
	jnz	.Lloop_second_half

	/* Horizontal sum of the four 16 bit clip counters:
	   0xee folds words 2,3 onto words 0,1; 0x55 broadcasts word 1.
	   Word 0 then holds the total, which is masked into %eax. */
	pshufw	$0xee, MMREG_CLIP, %mm0
	paddw	MMREG_CLIP, %mm0
	pshufw	$0x55, %mm0, %mm1
	paddw	%mm1, %mm0
	movd	%mm0, %eax
	andl	$0xffff, %eax

	/* Epilogue: restore callee-saved registers, then drop the aligned
	   scratch area by restoring %esp from the frame pointer. */
	popl	%edi
	popl	%esi
	popl	%ebx
	movl	%ebp, %esp
	popl	%ebp

	/* Leave MMX state so the caller can use x87 again. */
	emms

	ret

NONEXEC_STACK