/*
 * Sync winemp3.acm with Wine HEAD. This one uses libmpg123 which was added in Version...
 * [reactos.git] / reactos / lib / 3rdparty / libmpg123 / synth_x86_64_s32.S
 */
/*
	synth_x86_64_s32: SSE optimized synth for x86-64 (s32 output version)

	copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

/*
	Argument registers of synth_1to1_s32_x86_64_asm(window, b0, samples, bo1)
	for the two supported calling conventions.
*/
#ifdef _WIN64
/* real *window; arrives in %rcx and is copied to %r10 at function entry, because %ecx is used as the loop counter */
#define ARG0 %r10
/* real *b0; */
#define ARG1 %rdx
/* int32_t *samples; */
#define ARG2 %r8
/* int bo1; */
#define ARG3 %r9
#else
/* real *window; */
#define ARG0 %rdi
/* real *b0; */
#define ARG1 %rsi
/* int32_t *samples; */
#define ARG2 %rdx
/* int bo1; held in %rcx, so it must be consumed before %ecx is reused as the loop counter */
#define ARG3 %rcx
#endif

/* XMM registers that hold loop-invariant values for the whole function. */
#define XMMREG_SCALE %xmm15 /* {65536.0, 65536.0, 65536.0, 65536.0} */
#define XMMREG_MAX %xmm14 /* {32767.998, 32767.998, 32767.998, 32767.998}: bit pattern 0x46ffffff, the largest float below 32768.0 */
#define XMMREG_MIN %xmm13 /* {-32768.0, -32768.0, -32768.0, -32768.0} */
#define XMMREG_CLIP %xmm12 /* four 32-bit lanes accumulating the number of clipped samples */

/*
	int synth_1to1_s32_x86_64_asm(real *window, real *b0, int32_t *samples, int bo1);
	return value: number of clipped samples
*/

/*
	Read-only constants, stored as the raw bit patterns of single-precision floats.
	Both tables are loaded with movaps, so they must stay 16-byte aligned.
*/
#ifndef __APPLE__
	.section	.rodata
#else
	.data
#endif
	ALIGN32
/* Scale factor applied before conversion to int32: 4 x 65536.0f (0x47800000). */
ASM_NAME(scale_s32):
	.long	1199570944
	.long	1199570944
	.long	1199570944
	.long	1199570944
	ALIGN16
/*
	Clipping thresholds, compared against the unscaled sums.
	Bytes  0..15: 4 x 0x46ffffff (largest float below 32768.0), loaded into XMMREG_MAX.
	Bytes 16..31: 4 x 0xc7000000 (-32768.0f), loaded into XMMREG_MIN.
*/
ASM_NAME(maxmin_s32):
	.long	1191182335
	.long	1191182335
	.long	1191182335
	.long	1191182335
	.long	-956301312
	.long	-956301312
	.long	-956301312
	.long	-956301312
/*
	int synth_1to1_s32_x86_64_asm(real *window, real *b0, int32_t *samples, int bo1);

	Produces 32 output samples: two loops of 4 iterations, 4 samples per iteration.
	Each sample is a 16-term product sum of one window row and one b0 row,
	multiplied by 65536 and converted to int32 with saturation.
	Results are written to every second int32 slot of samples; the slots in
	between are read back and stored unchanged.

	In:    ARG0 = window, ARG1 = b0, ARG2 = samples,
	       ARG3 = bo1 (only the low 4 bits are used)
	Out:   eax = number of clipped samples
	Clobb: rax, rcx, the argument registers, flags, xmm0-xmm15
	       (Win64 build: xmm6-xmm15 are saved and restored on the stack)
	Align: b0 is read as a mulps memory operand and must be 16-byte aligned;
	       window and samples are accessed with movups.
*/
	.text
	ALIGN16
.globl ASM_NAME(synth_1to1_s32_x86_64_asm)
ASM_NAME(synth_1to1_s32_x86_64_asm):
#ifdef _WIN64 /* should save xmm6-15 */
	/* Free rcx (first Win64 argument) so that ecx can serve as the loop counter. */
	movq		%rcx, ARG0
	/* 168 = 10 * 16 bytes of save area + 8 bytes so that the movaps slots are 16-byte aligned. */
	subq		$168, %rsp /* stack alignment + 10 xmm registers */
	movaps		%xmm6, (%rsp)
	movaps		%xmm7, 16(%rsp)
	movaps		%xmm8, 32(%rsp)
	movaps		%xmm9, 48(%rsp)
	movaps		%xmm10, 64(%rsp)
	movaps		%xmm11, 80(%rsp)
	movaps		%xmm12, 96(%rsp)
	movaps		%xmm13, 112(%rsp)
	movaps		%xmm14, 128(%rsp)
	movaps		%xmm15, 144(%rsp)
#endif

	/* Load the loop-invariant constants. */
	leaq		ASM_NAME(scale_s32)(%rip), %rax
	movaps		(%rax), XMMREG_SCALE
	leaq		ASM_NAME(maxmin_s32)(%rip), %rax
	movaps		(%rax), XMMREG_MAX
	movaps		16(%rax), XMMREG_MIN

	/* Clipped-sample counters start at zero. */
	xorps		XMMREG_CLIP, XMMREG_CLIP

	/* window = (char *)window + 64 - 4 * (bo1 & 15) */
	andq		$0xf, ARG3
	shlq		$2, ARG3
	leaq		64(ARG0), ARG0
	subq		ARG3, ARG0

	/* ARG3 is no longer needed, so ecx (which is ARG3 outside Win64) becomes the counter. */
	movl		$4, %ecx

	/*
		First loop: 4 iterations. The window pointer advances by 512 bytes and the
		b0 pointer by 256 bytes per iteration. The four partial sums of each
		sample are combined as (p0 - p1) + (p2 - p3).
	*/
	ALIGN16
1:
	/* Samples 0 and 1: window rows at +0 and +128, b0 rows at +0 and +64. */
	movups		(ARG0), %xmm8
	movups		16(ARG0), %xmm1
	movups		32(ARG0), %xmm2
	movups		48(ARG0), %xmm3
	movups		128(ARG0), %xmm9
	movups		144(ARG0), %xmm5
	movups		160(ARG0), %xmm6
	movups		176(ARG0), %xmm7
	mulps		(ARG1), %xmm8
	mulps		16(ARG1), %xmm1
	mulps		32(ARG1), %xmm2
	mulps		48(ARG1), %xmm3
	mulps		64(ARG1), %xmm9
	mulps		80(ARG1), %xmm5
	mulps		96(ARG1), %xmm6
	mulps		112(ARG1), %xmm7

	/* xmm8 = four partial sums of sample 0, xmm9 = four partial sums of sample 1. */
	addps		%xmm1, %xmm8
	addps		%xmm2, %xmm3
	addps		%xmm5, %xmm9
	addps		%xmm7, %xmm6
	addps		%xmm3, %xmm8
	addps		%xmm6, %xmm9
	leaq		256(ARG0), ARG0
	leaq		128(ARG1), ARG1

	/* Samples 2 and 3, same layout relative to the advanced pointers. */
	movups		(ARG0), %xmm10
	movups		16(ARG0), %xmm1
	movups		32(ARG0), %xmm2
	movups		48(ARG0), %xmm3
	movups		128(ARG0), %xmm11
	movups		144(ARG0), %xmm5
	movups		160(ARG0), %xmm6
	movups		176(ARG0), %xmm7
	mulps		(ARG1), %xmm10
	mulps		16(ARG1), %xmm1
	mulps		32(ARG1), %xmm2
	mulps		48(ARG1), %xmm3
	mulps		64(ARG1), %xmm11
	mulps		80(ARG1), %xmm5
	mulps		96(ARG1), %xmm6
	mulps		112(ARG1), %xmm7

	/* xmm10 = four partial sums of sample 2, xmm11 = four partial sums of sample 3. */
	addps		%xmm1, %xmm10
	addps		%xmm2, %xmm3
	addps		%xmm5, %xmm11
	addps		%xmm7, %xmm6
	addps		%xmm3, %xmm10
	addps		%xmm6, %xmm11
	leaq		256(ARG0), ARG0
	leaq		128(ARG1), ARG1

	/*
		4x4 transpose of xmm8..xmm11, so that afterwards
		xmm8 = partial sum 0, xmm10 = partial sum 1, xmm0 = partial sum 2,
		xmm1 = partial sum 3, each holding one lane per sample.
	*/
	movaps		%xmm8, %xmm0
	movaps		%xmm10, %xmm1
	unpcklps	%xmm9, %xmm8
	unpcklps	%xmm11, %xmm10
	unpckhps	%xmm9, %xmm0
	unpckhps	%xmm11, %xmm1
	movaps		%xmm8, %xmm2
	movaps		%xmm0, %xmm3
	movlhps		%xmm10, %xmm8
	movhlps		%xmm2, %xmm10
	movlhps		%xmm1, %xmm0
	movhlps		%xmm3, %xmm1
	/* xmm0 = (p0 - p1) + (p2 - p3) for the four samples. */
	subps		%xmm10, %xmm8
	subps		%xmm1, %xmm0
	addps		%xmm8, %xmm0

	/* Fetch the 8 int32 slots that are about to be rewritten. */
	movups		(ARG2), %xmm1
	movups		16(ARG2), %xmm2
	movaps		%xmm0, %xmm3
	movaps		%xmm0, %xmm4
	mulps		XMMREG_SCALE, %xmm0
	/* xmm3 = all-ones where sum > MAX (or unordered), xmm4 = all-ones where sum < MIN. */
	cmpnleps	XMMREG_MAX, %xmm3
	cmpltps		XMMREG_MIN, %xmm4
	/* Out-of-range inputs convert to 0x80000000; the xor turns that into 0x7fffffff for sums above MAX. */
	cvtps2dq	%xmm0, %xmm0
	xorps		%xmm3, %xmm0
	/* xmm1 = old slots 1, 3, 5, 7; interleave them with the new values for slots 0, 2, 4, 6. */
	shufps		$0xdd, %xmm2, %xmm1
	movaps		%xmm0, %xmm2
	unpcklps	%xmm1, %xmm0
	unpckhps	%xmm1, %xmm2
	movups		%xmm0, (ARG2)
	movups		%xmm2, 16(ARG2)

	/* Turn the comparison masks into 0/1 and add them to the clip counters. */
	psrld		$31, %xmm3
	psrld		$31, %xmm4
	paddd		%xmm4, %xmm3
	paddd		%xmm3, XMMREG_CLIP

	leaq		32(ARG2), ARG2
	decl		%ecx
	jnz		1b

	movl		$4, %ecx

	/*
		Second loop: 4 iterations. The window pointer keeps advancing by 512 bytes
		per iteration while the b0 pointer moves back by 256 bytes. All four
		partial sums of a sample are added.
	*/
	ALIGN16
2:
	/* Samples 0 and 1: window rows at +0 and +128, b0 rows at +0 and -64. */
	movups		(ARG0), %xmm8
	movups		16(ARG0), %xmm1
	movups		32(ARG0), %xmm2
	movups		48(ARG0), %xmm3
	movups		128(ARG0), %xmm9
	movups		144(ARG0), %xmm5
	movups		160(ARG0), %xmm6
	movups		176(ARG0), %xmm7
	mulps		(ARG1), %xmm8
	mulps		16(ARG1), %xmm1
	mulps		32(ARG1), %xmm2
	mulps		48(ARG1), %xmm3
	mulps		-64(ARG1), %xmm9
	mulps		-48(ARG1), %xmm5
	mulps		-32(ARG1), %xmm6
	mulps		-16(ARG1), %xmm7

	/* xmm8 = four partial sums of sample 0, xmm9 = four partial sums of sample 1. */
	addps		%xmm1, %xmm8
	addps		%xmm2, %xmm3
	addps		%xmm5, %xmm9
	addps		%xmm7, %xmm6
	addps		%xmm3, %xmm8
	addps		%xmm6, %xmm9
	leaq		256(ARG0), ARG0
	leaq		-128(ARG1), ARG1

	/* Samples 2 and 3, same layout relative to the moved pointers. */
	movups		(ARG0), %xmm10
	movups		16(ARG0), %xmm1
	movups		32(ARG0), %xmm2
	movups		48(ARG0), %xmm3
	movups		128(ARG0), %xmm11
	movups		144(ARG0), %xmm5
	movups		160(ARG0), %xmm6
	movups		176(ARG0), %xmm7
	mulps		(ARG1), %xmm10
	mulps		16(ARG1), %xmm1
	mulps		32(ARG1), %xmm2
	mulps		48(ARG1), %xmm3
	mulps		-64(ARG1), %xmm11
	mulps		-48(ARG1), %xmm5
	mulps		-32(ARG1), %xmm6
	mulps		-16(ARG1), %xmm7

	/* xmm10 = four partial sums of sample 2, xmm11 = four partial sums of sample 3. */
	addps		%xmm1, %xmm10
	addps		%xmm2, %xmm3
	addps		%xmm5, %xmm11
	addps		%xmm7, %xmm6
	addps		%xmm3, %xmm10
	addps		%xmm6, %xmm11
	leaq		256(ARG0), ARG0
	leaq		-128(ARG1), ARG1

	/* Same 4x4 transpose as in the first loop. */
	movaps		%xmm8, %xmm0
	movaps		%xmm10, %xmm1
	unpcklps	%xmm9, %xmm8
	unpcklps	%xmm11, %xmm10
	unpckhps	%xmm9, %xmm0
	unpckhps	%xmm11, %xmm1
	movaps		%xmm8, %xmm2
	movaps		%xmm0, %xmm3
	movlhps		%xmm10, %xmm8
	movhlps		%xmm2, %xmm10
	movlhps		%xmm1, %xmm0
	movhlps		%xmm3, %xmm1
	/* xmm0 = (p0 + p1) + (p2 + p3) for the four samples. */
	addps		%xmm10, %xmm8
	addps		%xmm1, %xmm0
	addps		%xmm8, %xmm0

	/* Scale, saturate, interleave with the untouched slots and store, as in the first loop. */
	movups		(ARG2), %xmm1
	movups		16(ARG2), %xmm2
	movaps		%xmm0, %xmm3
	movaps		%xmm0, %xmm4
	mulps		XMMREG_SCALE, %xmm0
	cmpnleps	XMMREG_MAX, %xmm3
	cmpltps		XMMREG_MIN, %xmm4
	cvtps2dq	%xmm0, %xmm0
	xorps		%xmm3, %xmm0
	shufps		$0xdd, %xmm2, %xmm1
	movaps		%xmm0, %xmm2
	unpcklps	%xmm1, %xmm0
	unpckhps	%xmm1, %xmm2
	movups		%xmm0, (ARG2)
	movups		%xmm2, 16(ARG2)

	psrld		$31, %xmm3
	psrld		$31, %xmm4
	paddd		%xmm4, %xmm3
	paddd		%xmm3, XMMREG_CLIP

	leaq		32(ARG2), ARG2
	decl		%ecx
	jnz		2b

	/* Horizontal sum: the low lane of XMMREG_CLIP ends up as lane0 + lane1 + lane2 + lane3. */
	pshuflw		$0xee, XMMREG_CLIP, %xmm0
	movhlps		XMMREG_CLIP, %xmm1
	pshuflw		$0xee, %xmm1, %xmm2
	paddd		%xmm0, XMMREG_CLIP
	paddd		%xmm1, XMMREG_CLIP
	paddd		%xmm2, XMMREG_CLIP

	/* Return value: total number of clipped samples. */
	movd		XMMREG_CLIP, %eax

#ifdef _WIN64
	movaps		(%rsp), %xmm6
	movaps		16(%rsp), %xmm7
	movaps		32(%rsp), %xmm8
	movaps		48(%rsp), %xmm9
	movaps		64(%rsp), %xmm10
	movaps		80(%rsp), %xmm11
	movaps		96(%rsp), %xmm12
	movaps		112(%rsp), %xmm13
	movaps		128(%rsp), %xmm14
	movaps		144(%rsp), %xmm15
	addq		$168, %rsp
#endif
	ret

/* NOTE(review): NONEXEC_STACK is a macro from mangle.h; presumably it marks the stack as non-executable - confirm. */
NONEXEC_STACK