Sync with trunk (r48414)
[reactos.git] / lib / 3rdparty / libmpg123 / synth_stereo_x86_64_accurate.S
1 /*
2 synth_stereo_x86_64_accurate: SSE optimized synth for x86-64 (stereo specific, MPEG-compliant 16bit output version)
3
4 copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
5 see COPYING and AUTHORS files in distribution or http://mpg123.org
6 initially written by Taihei Monma
7 */
8
9 #include "mangle.h"
10
/*
 * Register roles used by the synth loops.
 *
 * Pointer types follow the C prototype:
 *   real *window, real *b0l, real *b0r, short *samples
 *
 * WINDOW (Win64) and SAMPLES (System V) are deliberately not the registers
 * those arguments arrive in: the incoming register is %rcx in both cases,
 * and %ecx is reused as the loop counter.
 */
#ifdef _WIN64
#define WINDOW		%rsi	/* real *window */
#define B0L		%rdx	/* real *b0l */
#define B0R		%r8	/* real *b0r */
#define SAMPLES		%r9	/* short *samples */
#else
#define WINDOW		%rdi	/* real *window */
#define B0L		%rsi	/* real *b0l */
#define B0R		%rdx	/* real *b0r */
#define SAMPLES		%r8	/* short *samples */
#endif

/* Memory operands: clipping bounds (addressed via %r10/%r11) and scratch. */
#define XMMREG_MAX	(%r10)	/* {32767.0, 32767.0, 32767.0, 32767.0} */
#define XMMREG_MIN	(%r11)	/* {-32768.0, -32768.0, -32768.0, -32768.0} */
#define TEMP_CLIP	(%rsp)	/* 16-byte slot accumulating per-lane clip counts */
34
35 /*
36 int synth_1to1_stereo_x86_64_accurate_asm(real *window, real *b0l, real *b0r, short *samples, int bo1);
37 return value: number of clipped samples
38 */
39
/*
 * maxmin_s16: clipping bounds for 16-bit output, stored as IEEE-754
 * single-precision bit patterns.
 *   bytes  0..15 : 4 x  32767.0f (0x46FFFE00)
 *   bytes 16..31 : 4 x -32768.0f (0xC7000000)
 */
#ifdef __APPLE__
	.data
#else
	.section	.rodata
#endif
	ALIGN32
ASM_NAME(maxmin_s16):
	.long	0x46FFFE00, 0x46FFFE00, 0x46FFFE00, 0x46FFFE00
	.long	0xC7000000, 0xC7000000, 0xC7000000, 0xC7000000
.text
	ALIGN16
.globl ASM_NAME(synth_1to1_stereo_x86_64_accurate_asm)
/*
 * int synth_1to1_stereo_x86_64_accurate_asm(real *window, real *b0l,
 *                                           real *b0r, short *samples, int bo1);
 *
 * Syntax: AT&T (GAS), run through the C preprocessor.
 * ABI:    Microsoft x64 when _WIN64 is defined, System V AMD64 otherwise.
 *
 * In (Win64):  %rcx = window, %rdx = b0l, %r8 = b0r, %r9 = samples,
 *              bo1 on the stack above the 32-byte shadow space
 * In (SysV):   %rdi = window, %rsi = b0l, %rdx = b0r, %rcx = samples,
 *              %r8d = bo1
 * Out:         %eax = number of clipped samples
 *
 * Work done: two loops of 4 iterations each. Every iteration stores 16 bytes
 * to SAMPLES (4 interleaved left/right pairs of 16-bit samples), advances
 * WINDOW by 512 bytes, and moves B0L/B0R by 256 bytes (forwards in the first
 * loop, backwards in the second).
 *
 * Stack frame (Win64, after the prologue, %rsp 16-byte aligned provided the
 * caller honoured the ABI entry alignment):
 *     0(%rsp)  ..  15(%rsp)   TEMP_CLIP
 *    16(%rsp)  .. 175(%rsp)   saved xmm6..xmm15
 *   176(%rsp)                 saved %rsi
 * Stack frame (SysV): 24 bytes, TEMP_CLIP at 0(%rsp).
 *
 * Clobbers: %rax, %rcx, %r10, %r11, xmm0-xmm15 (xmm6-xmm15 and %rsi are
 * saved/restored on Win64), flags; the argument registers are advanced.
 */
ASM_NAME(synth_1to1_stereo_x86_64_accurate_asm):
#ifdef _WIN64 /* should save xmm6-15 */
	movl	40(%rsp), %eax /* 5th argument; placed after 32-byte shadow space */
	pushq	%rsi /* callee-saved on Win64; used as WINDOW */
	subq	$176, %rsp /* 10 xmm registers + temp */
	movaps	%xmm6, 16(%rsp)
	movaps	%xmm7, 32(%rsp)
	movaps	%xmm8, 48(%rsp)
	movaps	%xmm9, 64(%rsp)
	movaps	%xmm10, 80(%rsp)
	movaps	%xmm11, 96(%rsp)
	movaps	%xmm12, 112(%rsp)
	movaps	%xmm13, 128(%rsp)
	movaps	%xmm14, 144(%rsp)
	movaps	%xmm15, 160(%rsp)
#else
	subq	$24, %rsp /* stack alignment + temp */
#endif

	/* %r10 -> {32767.0 x4}, %r11 -> {-32768.0 x4}; clear the clip counters */
	leaq	ASM_NAME(maxmin_s16)(%rip), %r10
	leaq	16(%r10), %r11
	xorps	%xmm0, %xmm0
	movaps	%xmm0, TEMP_CLIP

	/* %rax = zero-extended bo1 * 4 (shift left 32, then right 30) */
#ifdef _WIN64
	shlq	$32, %rax
	shrq	$30, %rax
	/* window arrives in %rcx; move it to WINDOW so %ecx can count loops.
	   (Was copied to %rbx, which left WINDOW uninitialised and clobbered
	   a callee-saved register.) */
	movq	%rcx, WINDOW
#else
	movq	%r8, %rax
	shlq	$32, %rax
	shrq	$30, %rax
	movq	%rcx, %r8 /* samples arrives in %rcx; SAMPLES is %r8 */
#endif
	/* WINDOW = window + 64 bytes - bo1 * 4 bytes */
	leaq	64(WINDOW), WINDOW
	subq	%rax, WINDOW

	movl	$4, %ecx /* first loop: 4 iterations */

	ALIGN16
Loop_start_1:
	/* --- rows 0 and 1: window rows at +0 and +128, b0 at +0 and +64 --- */
	movups	(WINDOW), %xmm8
	movups	16(WINDOW), %xmm1
	movups	32(WINDOW), %xmm2
	movups	48(WINDOW), %xmm3
	movups	128(WINDOW), %xmm9
	movups	144(WINDOW), %xmm5
	movups	160(WINDOW), %xmm6
	movups	176(WINDOW), %xmm7
	/* duplicate the window data: one copy per channel */
	movaps	%xmm8, %xmm0
	movaps	%xmm1, %xmm4
	movaps	%xmm2, %xmm10
	movaps	%xmm3, %xmm11
	movaps	%xmm9, %xmm12
	movaps	%xmm5, %xmm13
	movaps	%xmm6, %xmm14
	movaps	%xmm7, %xmm15
	mulps	(B0L), %xmm8
	mulps	16(B0L), %xmm1
	mulps	32(B0L), %xmm2
	mulps	48(B0L), %xmm3
	mulps	64(B0L), %xmm9
	mulps	80(B0L), %xmm5
	mulps	96(B0L), %xmm6
	mulps	112(B0L), %xmm7
	mulps	(B0R), %xmm0
	mulps	16(B0R), %xmm4
	mulps	32(B0R), %xmm10
	mulps	48(B0R), %xmm11
	mulps	64(B0R), %xmm12
	mulps	80(B0R), %xmm13
	mulps	96(B0R), %xmm14
	mulps	112(B0R), %xmm15

	/* reduce 4 vectors -> 1 per row and channel:
	   xmm8 = left row 0, xmm9 = left row 1,
	   xmm12 = right row 0, xmm13 = right row 1 */
	addps	%xmm1, %xmm8
	addps	%xmm2, %xmm3
	addps	%xmm4, %xmm0
	addps	%xmm11, %xmm10
	addps	%xmm5, %xmm9
	addps	%xmm7, %xmm6
	addps	%xmm13, %xmm12
	addps	%xmm15, %xmm14
	addps	%xmm3, %xmm8
	addps	%xmm6, %xmm9
	addps	%xmm10, %xmm0
	addps	%xmm12, %xmm14
	movaps	%xmm0, %xmm12
	movaps	%xmm14, %xmm13
	leaq	256(WINDOW), WINDOW
	leaq	128(B0L), B0L
	leaq	128(B0R), B0R

	/* --- rows 2 and 3 --- */
	movups	(WINDOW), %xmm10
	movups	16(WINDOW), %xmm1
	movups	32(WINDOW), %xmm2
	movups	48(WINDOW), %xmm3
	movups	128(WINDOW), %xmm11
	movups	144(WINDOW), %xmm5
	movups	160(WINDOW), %xmm6
	movups	176(WINDOW), %xmm7
	movaps	%xmm10, %xmm0
	movaps	%xmm1, %xmm4
	movaps	%xmm2, %xmm14
	movaps	%xmm3, %xmm15
	mulps	(B0L), %xmm10
	mulps	16(B0L), %xmm1
	mulps	32(B0L), %xmm2
	mulps	48(B0L), %xmm3
	mulps	(B0R), %xmm0
	mulps	16(B0R), %xmm4
	mulps	32(B0R), %xmm14
	mulps	48(B0R), %xmm15
	addps	%xmm1, %xmm10
	addps	%xmm2, %xmm3
	addps	%xmm4, %xmm0
	addps	%xmm15, %xmm14
	movaps	%xmm11, %xmm1
	movaps	%xmm5, %xmm2
	movaps	%xmm6, %xmm4
	movaps	%xmm7, %xmm15
	mulps	64(B0L), %xmm11
	mulps	80(B0L), %xmm5
	mulps	96(B0L), %xmm6
	mulps	112(B0L), %xmm7
	mulps	64(B0R), %xmm1
	mulps	80(B0R), %xmm2
	mulps	96(B0R), %xmm4
	mulps	112(B0R), %xmm15
	addps	%xmm5, %xmm11
	addps	%xmm7, %xmm6
	addps	%xmm2, %xmm1
	addps	%xmm15, %xmm4

	/* xmm10 = left row 2, xmm11 = left row 3,
	   xmm14 = right row 2, xmm15 = right row 3 */
	addps	%xmm3, %xmm10
	addps	%xmm6, %xmm11
	addps	%xmm0, %xmm14
	addps	%xmm4, %xmm1
	movaps	%xmm1, %xmm15
	leaq	256(WINDOW), WINDOW
	leaq	128(B0L), B0L
	leaq	128(B0R), B0R

	/* 4x4 transpose of the left rows (xmm8..11) and right rows
	   (xmm12..15), so the horizontal sums become vertical ones */
	movaps	%xmm8, %xmm0
	movaps	%xmm10, %xmm1
	movaps	%xmm12, %xmm4
	movaps	%xmm14, %xmm5
	unpcklps	%xmm9, %xmm8
	unpcklps	%xmm11, %xmm10
	unpckhps	%xmm9, %xmm0
	unpckhps	%xmm11, %xmm1
	unpcklps	%xmm13, %xmm12
	unpcklps	%xmm15, %xmm14
	unpckhps	%xmm13, %xmm4
	unpckhps	%xmm15, %xmm5
	movaps	%xmm8, %xmm2
	movaps	%xmm0, %xmm3
	movaps	%xmm12, %xmm6
	movaps	%xmm4, %xmm7
	movlhps	%xmm10, %xmm8
	movhlps	%xmm2, %xmm10
	movlhps	%xmm1, %xmm0
	movhlps	%xmm3, %xmm1
	movlhps	%xmm14, %xmm12
	movhlps	%xmm6, %xmm14
	movlhps	%xmm5, %xmm4
	movhlps	%xmm7, %xmm5
	/* first loop: transposed columns are combined as
	   (c0 - c1) + (c2 - c3); xmm0 = left, xmm4 = right */
	subps	%xmm10, %xmm8
	subps	%xmm1, %xmm0
	subps	%xmm14, %xmm12
	subps	%xmm5, %xmm4
	addps	%xmm8, %xmm0
	addps	%xmm12, %xmm4

	/* build clip masks (value > max, value < min) before converting */
	movaps	%xmm0, %xmm2
	movaps	%xmm0, %xmm3
	movaps	%xmm4, %xmm5
	movaps	%xmm4, %xmm6
	cmpnleps	XMMREG_MAX, %xmm2
	cmpltps	XMMREG_MIN, %xmm3
	cmpnleps	XMMREG_MAX, %xmm5
	cmpltps	XMMREG_MIN, %xmm6
	/* float -> int32, interleave left/right, saturate to int16, store */
	cvtps2dq	%xmm0, %xmm0
	cvtps2dq	%xmm4, %xmm4
	movaps	%xmm0, %xmm1
	unpcklps	%xmm4, %xmm0
	unpckhps	%xmm4, %xmm1
	packssdw	%xmm1, %xmm0
	movups	%xmm0, (SAMPLES)

	/* turn the all-ones masks into 0/1 words and accumulate in TEMP_CLIP */
	packssdw	%xmm5, %xmm2
	packssdw	%xmm6, %xmm3
	psrlw	$15, %xmm2
	psrlw	$15, %xmm3
	paddw	%xmm3, %xmm2
	paddw	TEMP_CLIP, %xmm2
	movaps	%xmm2, TEMP_CLIP

	leaq	16(SAMPLES), SAMPLES
	decl	%ecx
	jnz	Loop_start_1

	movl	$4, %ecx /* second loop: 4 iterations */

	ALIGN16
Loop_start_2:
	/* Same structure as the first loop, but B0L/B0R are walked backwards
	   (second row at -64, pointers stepped by -128) and the transposed
	   columns are all added. */
	movups	(WINDOW), %xmm8
	movups	16(WINDOW), %xmm1
	movups	32(WINDOW), %xmm2
	movups	48(WINDOW), %xmm3
	movups	128(WINDOW), %xmm9
	movups	144(WINDOW), %xmm5
	movups	160(WINDOW), %xmm6
	movups	176(WINDOW), %xmm7
	movaps	%xmm8, %xmm0
	movaps	%xmm1, %xmm4
	movaps	%xmm2, %xmm10
	movaps	%xmm3, %xmm11
	movaps	%xmm9, %xmm12
	movaps	%xmm5, %xmm13
	movaps	%xmm6, %xmm14
	movaps	%xmm7, %xmm15
	mulps	(B0L), %xmm8
	mulps	16(B0L), %xmm1
	mulps	32(B0L), %xmm2
	mulps	48(B0L), %xmm3
	mulps	-64(B0L), %xmm9
	mulps	-48(B0L), %xmm5
	mulps	-32(B0L), %xmm6
	mulps	-16(B0L), %xmm7
	mulps	(B0R), %xmm0
	mulps	16(B0R), %xmm4
	mulps	32(B0R), %xmm10
	mulps	48(B0R), %xmm11
	mulps	-64(B0R), %xmm12
	mulps	-48(B0R), %xmm13
	mulps	-32(B0R), %xmm14
	mulps	-16(B0R), %xmm15

	addps	%xmm1, %xmm8
	addps	%xmm2, %xmm3
	addps	%xmm4, %xmm0
	addps	%xmm11, %xmm10
	addps	%xmm5, %xmm9
	addps	%xmm7, %xmm6
	addps	%xmm13, %xmm12
	addps	%xmm15, %xmm14
	addps	%xmm3, %xmm8
	addps	%xmm6, %xmm9
	addps	%xmm10, %xmm0
	addps	%xmm12, %xmm14
	movaps	%xmm0, %xmm12
	movaps	%xmm14, %xmm13
	leaq	256(WINDOW), WINDOW
	leaq	-128(B0L), B0L
	leaq	-128(B0R), B0R

	movups	(WINDOW), %xmm10
	movups	16(WINDOW), %xmm1
	movups	32(WINDOW), %xmm2
	movups	48(WINDOW), %xmm3
	movups	128(WINDOW), %xmm11
	movups	144(WINDOW), %xmm5
	movups	160(WINDOW), %xmm6
	movups	176(WINDOW), %xmm7
	movaps	%xmm10, %xmm0
	movaps	%xmm1, %xmm4
	movaps	%xmm2, %xmm14
	movaps	%xmm3, %xmm15
	mulps	(B0L), %xmm10
	mulps	16(B0L), %xmm1
	mulps	32(B0L), %xmm2
	mulps	48(B0L), %xmm3
	mulps	(B0R), %xmm0
	mulps	16(B0R), %xmm4
	mulps	32(B0R), %xmm14
	mulps	48(B0R), %xmm15
	addps	%xmm1, %xmm10
	addps	%xmm2, %xmm3
	addps	%xmm4, %xmm0
	addps	%xmm15, %xmm14
	movaps	%xmm11, %xmm1
	movaps	%xmm5, %xmm2
	movaps	%xmm6, %xmm4
	movaps	%xmm7, %xmm15
	mulps	-64(B0L), %xmm11
	mulps	-48(B0L), %xmm5
	mulps	-32(B0L), %xmm6
	mulps	-16(B0L), %xmm7
	mulps	-64(B0R), %xmm1
	mulps	-48(B0R), %xmm2
	mulps	-32(B0R), %xmm4
	mulps	-16(B0R), %xmm15
	addps	%xmm5, %xmm11
	addps	%xmm7, %xmm6
	addps	%xmm2, %xmm1
	addps	%xmm15, %xmm4

	addps	%xmm3, %xmm10
	addps	%xmm6, %xmm11
	addps	%xmm0, %xmm14
	addps	%xmm4, %xmm1
	movaps	%xmm1, %xmm15
	leaq	256(WINDOW), WINDOW
	leaq	-128(B0L), B0L
	leaq	-128(B0R), B0R

	movaps	%xmm8, %xmm0
	movaps	%xmm10, %xmm1
	movaps	%xmm12, %xmm4
	movaps	%xmm14, %xmm5
	unpcklps	%xmm9, %xmm8
	unpcklps	%xmm11, %xmm10
	unpckhps	%xmm9, %xmm0
	unpckhps	%xmm11, %xmm1
	unpcklps	%xmm13, %xmm12
	unpcklps	%xmm15, %xmm14
	unpckhps	%xmm13, %xmm4
	unpckhps	%xmm15, %xmm5
	movaps	%xmm8, %xmm2
	movaps	%xmm0, %xmm3
	movaps	%xmm12, %xmm6
	movaps	%xmm4, %xmm7
	movlhps	%xmm10, %xmm8
	movhlps	%xmm2, %xmm10
	movlhps	%xmm1, %xmm0
	movhlps	%xmm3, %xmm1
	movlhps	%xmm14, %xmm12
	movhlps	%xmm6, %xmm14
	movlhps	%xmm5, %xmm4
	movhlps	%xmm7, %xmm5
	/* second loop: all four transposed columns are summed */
	addps	%xmm10, %xmm8
	addps	%xmm1, %xmm0
	addps	%xmm14, %xmm12
	addps	%xmm5, %xmm4
	addps	%xmm8, %xmm0
	addps	%xmm12, %xmm4

	movaps	%xmm0, %xmm2
	movaps	%xmm0, %xmm3
	movaps	%xmm4, %xmm5
	movaps	%xmm4, %xmm6
	cmpnleps	XMMREG_MAX, %xmm2
	cmpltps	XMMREG_MIN, %xmm3
	cmpnleps	XMMREG_MAX, %xmm5
	cmpltps	XMMREG_MIN, %xmm6
	cvtps2dq	%xmm0, %xmm0
	cvtps2dq	%xmm4, %xmm4
	movaps	%xmm0, %xmm1
	unpcklps	%xmm4, %xmm0
	unpckhps	%xmm4, %xmm1
	packssdw	%xmm1, %xmm0
	movups	%xmm0, (SAMPLES)

	packssdw	%xmm5, %xmm2
	packssdw	%xmm6, %xmm3
	psrlw	$15, %xmm2
	psrlw	$15, %xmm3
	paddw	%xmm3, %xmm2
	paddw	TEMP_CLIP, %xmm2
	movaps	%xmm2, TEMP_CLIP

	leaq	16(SAMPLES), SAMPLES
	decl	%ecx
	jnz	Loop_start_2

	/* horizontal sum of the eight 16-bit clip counters into word 0 */
	movaps	TEMP_CLIP, %xmm4
	movhlps	%xmm4, %xmm0
	paddw	%xmm4, %xmm0
	pshuflw	$0x55, %xmm0, %xmm1
	pshuflw	$0xaa, %xmm0, %xmm2
	pshuflw	$0xff, %xmm0, %xmm3
	paddw	%xmm1, %xmm0
	paddw	%xmm2, %xmm0
	paddw	%xmm3, %xmm0

	/* return value: only the low word holds the total */
	movd	%xmm0, %eax
	andl	$0xffff, %eax

#ifdef _WIN64
	/* Restore from the same slots the prologue used (16..160).
	   Offset 0 is TEMP_CLIP, not a saved register. */
	movaps	16(%rsp), %xmm6
	movaps	32(%rsp), %xmm7
	movaps	48(%rsp), %xmm8
	movaps	64(%rsp), %xmm9
	movaps	80(%rsp), %xmm10
	movaps	96(%rsp), %xmm11
	movaps	112(%rsp), %xmm12
	movaps	128(%rsp), %xmm13
	movaps	144(%rsp), %xmm14
	movaps	160(%rsp), %xmm15
	addq	$176, %rsp
	popq	%rsi
#else
	addq	$24, %rsp
#endif
	ret
453
454 NONEXEC_STACK