Sync winemp3.acm with Wine HEAD. This one uses libmpg123 which was added in Version...
[reactos.git] / reactos / lib / 3rdparty / libmpg123 / synth_sse_accurate.S
1 /*
2 synth_sse_accurate: SSE optimized synth (MPEG-compliant 16bit output version)
3
4 copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
5 see COPYING and AUTHORS files in distribution or http://mpg123.org
6 initially written by Taihei Monma
7 */
8
9 #include "mangle.h"
10
11 /* WINDOW: working pointer into the `real *window` argument (loaded from 8(%ebp)). */
12 #define WINDOW %ebx
13 /* B0: working pointer into the `real *b0` argument (loaded from 12(%ebp)). */
14 #define B0 %edx
15 /* SAMPLES: output pointer; the prototype declares it `short *samples` and the code stores 16-bit words through it (loaded from 16(%ebp)). */
16 #define SAMPLES %esi
17
/* MMREG_CLIP: MMX register holding four 16-bit running counts of clipped samples; summed into the return value at the end of the function. */
18 #define MMREG_CLIP %mm7
19
20 /*
21 int synth_1to1_sse_accurate_asm(real *window, real *b0, short *samples, int bo1);
22 return value: number of clipped samples
23 */
24
/*
 * Clip limits for signed 16-bit output, stored as raw IEEE-754
 * single-precision bit patterns.  The first 16 bytes hold four copies of
 * 32767.0; the next 16 bytes (maxmin_s16+16) hold four copies of -32768.0.
 */
#ifdef __APPLE__
	.data
#else
	.section	.rodata
#endif
	ALIGN32
ASM_NAME(maxmin_s16):
	.long	0x46fffe00, 0x46fffe00, 0x46fffe00, 0x46fffe00	/*  32767.0 in every lane */
	.long	0xc7000000, 0xc7000000, 0xc7000000, 0xc7000000	/* -32768.0 in every lane */
40 .text
41 ALIGN16
/*
 * int synth_1to1_sse_accurate_asm(real *window, real *b0, short *samples, int bo1);
 *
 * Arguments are read from the stack: 8(%ebp) = window, 12(%ebp) = b0,
 * 16(%ebp) = samples, 20(%ebp) = bo1.
 *
 * Produces 32 output samples (two loops x 4 iterations x 4 samples).  Each
 * sample is a 16-term sum of products of packed single-precision values taken
 * from window and b0, converted to a saturated signed 16-bit integer.  The
 * results go to every second 16-bit slot of samples; the slots in between are
 * read first and written back unchanged.
 *
 * Returns in %eax the number of sums that tested above 32767.0 or below
 * -32768.0 before conversion (only the low 16 bits of the count are kept).
 *
 * %ebp, %ebx and %esi are saved and restored.  %eax, %ecx, %edx,
 * %xmm0-%xmm7, %mm0-%mm3 and %mm7 are overwritten; emms is executed before
 * returning.
 */
42 .globl ASM_NAME(synth_1to1_sse_accurate_asm)
43 ASM_NAME(synth_1to1_sse_accurate_asm):
/* Frame setup; %ebx (WINDOW) and %esi (SAMPLES) are saved because they are used as pointers below. */
44 pushl %ebp
45 movl %esp, %ebp
46 pushl %ebx
47 pushl %esi
48
/* Clear the four 16-bit clip counters. */
49 pxor MMREG_CLIP, MMREG_CLIP
50
/* Load the arguments; %eax = bo1 * 4, i.e. bo1 scaled to a byte offset for 4-byte elements. */
51 movl 8(%ebp), WINDOW
52 movl 12(%ebp), B0
53 movl 16(%ebp), SAMPLES
54 movl 20(%ebp), %eax
55 shll $2, %eax
56
/* WINDOW = window + 64 bytes - 4*bo1 bytes, i.e. &window[16 - bo1] for 4-byte elements. */
57 leal 64(WINDOW), WINDOW
58 subl %eax, WINDOW
59
/* First loop: 4 iterations, 4 samples per iteration. */
60 movl $4, %ecx
61
62 ALIGN16
63 Loop_start_1:
/*
 * Samples A and B of this group.
 *   A: 64 bytes at WINDOW+0   times 64 bytes at B0+0   -> xmm0..xmm3
 *   B: 64 bytes at WINDOW+128 times 64 bytes at B0+64  -> xmm4..xmm7
 * movups is used for the window loads (no alignment assumed); mulps with a
 * memory operand requires the b0 addresses to be 16-byte aligned.
 */
64 movups (WINDOW), %xmm0
65 movups 16(WINDOW), %xmm1
66 movups 32(WINDOW), %xmm2
67 movups 48(WINDOW), %xmm3
68 movups 128(WINDOW), %xmm4
69 movups 144(WINDOW), %xmm5
70 movups 160(WINDOW), %xmm6
71 movups 176(WINDOW), %xmm7
72 mulps 0(B0), %xmm0
73 mulps 16(B0), %xmm1
74 mulps 32(B0), %xmm2
75 mulps 48(B0), %xmm3
76 mulps 64(B0), %xmm4
77 mulps 80(B0), %xmm5
78 mulps 96(B0), %xmm6
79 mulps 112(B0), %xmm7
/* Reduce the four product vectors of each sample to one vector of 4 lane-wise partial sums. */
80 addps %xmm1, %xmm0
81 addps %xmm3, %xmm2
82 addps %xmm5, %xmm4
83 addps %xmm7, %xmm6
84 addps %xmm2, %xmm0
85 addps %xmm6, %xmm4
/* Park the results: xmm4 = partial sums of A, xmm5 = partial sums of B. */
86 movaps %xmm4, %xmm5
87 movaps %xmm0, %xmm4
88
/* Advance to the next pair of samples: window by 256 bytes, b0 by 128 bytes. */
89 leal 256(WINDOW), WINDOW
90 leal 128(B0), B0
91
/*
 * Samples C and D, same addressing as A and B relative to the advanced
 * pointers.  Only xmm0..xmm3, xmm6 and xmm7 are free here (xmm4/xmm5 hold A/B),
 * so the last two products of D are computed in xmm1/xmm3 after those
 * registers have been folded into xmm0/xmm2.
 */
92 movups (WINDOW), %xmm0
93 movups 16(WINDOW), %xmm1
94 movups 32(WINDOW), %xmm2
95 movups 48(WINDOW), %xmm3
96 movups 128(WINDOW), %xmm6
97 movups 144(WINDOW), %xmm7
98 mulps (B0), %xmm0
99 mulps 16(B0), %xmm1
100 mulps 32(B0), %xmm2
101 mulps 48(B0), %xmm3
102 mulps 64(B0), %xmm6
103 mulps 80(B0), %xmm7
104 addps %xmm1, %xmm0
105 addps %xmm3, %xmm2
106 addps %xmm7, %xmm6
107 movups 160(WINDOW), %xmm1
108 movups 176(WINDOW), %xmm3
109 mulps 96(B0), %xmm1
110 mulps 112(B0), %xmm3
111 addps %xmm2, %xmm0
112 addps %xmm3, %xmm1
113 addps %xmm1, %xmm6
/* Park the results: xmm6 = partial sums of C, xmm7 = partial sums of D. */
114 movaps %xmm6, %xmm7
115 movaps %xmm0, %xmm6
116
117 leal 256(WINDOW), WINDOW
118 leal 128(B0), B0
119
/*
 * 4x4 transpose of {xmm4, xmm5, xmm6, xmm7} = {A, B, C, D}.  Afterwards:
 *   xmm4 = lane 0 of A,B,C,D    xmm6 = lane 1 of A,B,C,D
 *   xmm0 = lane 2 of A,B,C,D    xmm1 = lane 3 of A,B,C,D
 */
120 movaps %xmm4, %xmm0
121 movaps %xmm6, %xmm1
122 unpcklps %xmm5, %xmm4
123 unpcklps %xmm7, %xmm6
124 unpckhps %xmm5, %xmm0
125 unpckhps %xmm7, %xmm1
126 movaps %xmm4, %xmm2
127 movaps %xmm0, %xmm3
128 movlhps %xmm6, %xmm4
129 movhlps %xmm2, %xmm6
130 movlhps %xmm1, %xmm0
131 movhlps %xmm3, %xmm1
/* First loop only: sample = (lane0 - lane1) + (lane2 - lane3).  xmm0 = the four finished sums A,B,C,D. */
132 subps %xmm6, %xmm4
133 subps %xmm1, %xmm0
134 addps %xmm4, %xmm0
135
/* Keep two copies of the sums for the clip tests. */
136 movaps %xmm0, %xmm1
137 movaps %xmm0, %xmm2
/* Fetch the odd 16-bit slots (words 1 and 3) of each 8-byte output half so they can be written back unchanged. */
138 pshufw $0xdd, (SAMPLES), %mm2
139 pshufw $0xdd, 8(SAMPLES), %mm3
/* xmm1 lane = all-ones where NOT (sum <= 32767.0); xmm2 lane = all-ones where sum < -32768.0. */
140 cmpnleps ASM_NAME(maxmin_s16), %xmm1
141 cmpltps ASM_NAME(maxmin_s16)+16, %xmm2
/* Convert the four sums to 32-bit integers (two at a time) and pack them to four saturated signed 16-bit words in mm0. */
142 cvtps2pi %xmm0, %mm0
143 movhlps %xmm0, %xmm0
144 cvtps2pi %xmm0, %mm1
145 packssdw %mm1, %mm0
/* Interleave new samples (even slots) with the saved odd-slot words and store 16 bytes. */
146 movq %mm0, %mm1
147 punpcklwd %mm2, %mm0
148 punpckhwd %mm3, %mm1
149 movq %mm0, (SAMPLES)
150 movq %mm1, 8(SAMPLES)
151
/*
 * Turn the two compare masks into 0/1 words and add them to the counters.
 * NOTE(review): this relies on cvtps2pi converting the all-ones mask (a NaN
 * bit pattern) to 0x80000000, which packssdw saturates to 0x8000 and
 * psrlw $15 reduces to 1; a zero mask stays 0 throughout.  Confirm against
 * the CVTPS2PI documentation (masked invalid-operation case).
 */
152 cvtps2pi %xmm1, %mm0
153 cvtps2pi %xmm2, %mm1
154 movhlps %xmm1, %xmm1
155 movhlps %xmm2, %xmm2
156 cvtps2pi %xmm1, %mm2
157 cvtps2pi %xmm2, %mm3
158 packssdw %mm2, %mm0
159 packssdw %mm3, %mm1
160 psrlw $15, %mm0
161 psrlw $15, %mm1
162 paddw %mm1, %mm0
163 paddw %mm0, MMREG_CLIP
164
/* Advance the output by 16 bytes (4 slots written, 4 slots preserved) and loop. */
165 leal 16(SAMPLES), SAMPLES
166 decl %ecx
167 jnz Loop_start_1
168
/*
 * Second loop: 4 more iterations of 4 samples.  Same structure as the first
 * loop with two differences: B0 now steps backwards (-128 bytes per pair, the
 * second sample of each pair reads -64..-16(B0)), and the four lane sums are
 * all added instead of alternately subtracted.  WINDOW keeps advancing by
 * 256 bytes per pair.
 */
169 movl $4, %ecx
170
171 ALIGN16
172 Loop_start_2:
/* Samples A (WINDOW+0 x B0+0) and B (WINDOW+128 x B0-64). */
173 movups (WINDOW), %xmm0
174 movups 16(WINDOW), %xmm1
175 movups 32(WINDOW), %xmm2
176 movups 48(WINDOW), %xmm3
177 movups 128(WINDOW), %xmm4
178 movups 144(WINDOW), %xmm5
179 movups 160(WINDOW), %xmm6
180 movups 176(WINDOW), %xmm7
181 mulps 0(B0), %xmm0
182 mulps 16(B0), %xmm1
183 mulps 32(B0), %xmm2
184 mulps 48(B0), %xmm3
185 mulps -64(B0), %xmm4
186 mulps -48(B0), %xmm5
187 mulps -32(B0), %xmm6
188 mulps -16(B0), %xmm7
189 addps %xmm1, %xmm0
190 addps %xmm3, %xmm2
191 addps %xmm5, %xmm4
192 addps %xmm7, %xmm6
193 addps %xmm2, %xmm0
194 addps %xmm6, %xmm4
/* xmm4 = partial sums of A, xmm5 = partial sums of B. */
195 movaps %xmm4, %xmm5
196 movaps %xmm0, %xmm4
197
/* Next pair: window forward by 256 bytes, b0 backward by 128 bytes. */
198 leal 256(WINDOW), WINDOW
199 leal -128(B0), B0
200
/* Samples C and D; xmm1/xmm3 are reused for the last two products of D as in the first loop. */
201 movups (WINDOW), %xmm0
202 movups 16(WINDOW), %xmm1
203 movups 32(WINDOW), %xmm2
204 movups 48(WINDOW), %xmm3
205 movups 128(WINDOW), %xmm6
206 movups 144(WINDOW), %xmm7
207 mulps (B0), %xmm0
208 mulps 16(B0), %xmm1
209 mulps 32(B0), %xmm2
210 mulps 48(B0), %xmm3
211 mulps -64(B0), %xmm6
212 mulps -48(B0), %xmm7
213 addps %xmm1, %xmm0
214 addps %xmm3, %xmm2
215 addps %xmm7, %xmm6
216 movups 160(WINDOW), %xmm1
217 movups 176(WINDOW), %xmm3
218 mulps -32(B0), %xmm1
219 mulps -16(B0), %xmm3
220 addps %xmm2, %xmm0
221 addps %xmm3, %xmm1
222 addps %xmm1, %xmm6
/* xmm6 = partial sums of C, xmm7 = partial sums of D. */
223 movaps %xmm6, %xmm7
224 movaps %xmm0, %xmm6
225
226 leal 256(WINDOW), WINDOW
227 leal -128(B0), B0
228
/* 4x4 transpose of {A, B, C, D}: xmm4/xmm6/xmm0/xmm1 end up holding lane 0/1/2/3 of the four samples. */
229 movaps %xmm4, %xmm0
230 movaps %xmm6, %xmm1
231 unpcklps %xmm5, %xmm4
232 unpcklps %xmm7, %xmm6
233 unpckhps %xmm5, %xmm0
234 unpckhps %xmm7, %xmm1
235 movaps %xmm4, %xmm2
236 movaps %xmm0, %xmm3
237 movlhps %xmm6, %xmm4
238 movhlps %xmm2, %xmm6
239 movlhps %xmm1, %xmm0
240 movhlps %xmm3, %xmm1
/* Second loop: sample = lane0 + lane1 + lane2 + lane3.  xmm0 = the four finished sums. */
241 addps %xmm6, %xmm4
242 addps %xmm1, %xmm0
243 addps %xmm4, %xmm0
244
/* Output and clip counting: identical to the first loop. */
245 movaps %xmm0, %xmm1
246 movaps %xmm0, %xmm2
/* Save the odd 16-bit slots of the 16 output bytes. */
247 pshufw $0xdd, (SAMPLES), %mm2
248 pshufw $0xdd, 8(SAMPLES), %mm3
/* Clip masks: xmm1 = NOT (sum <= 32767.0), xmm2 = sum < -32768.0. */
249 cmpnleps ASM_NAME(maxmin_s16), %xmm1
250 cmpltps ASM_NAME(maxmin_s16)+16, %xmm2
/* Float -> int32 -> saturated int16, interleave with the saved words, store. */
251 cvtps2pi %xmm0, %mm0
252 movhlps %xmm0, %xmm0
253 cvtps2pi %xmm0, %mm1
254 packssdw %mm1, %mm0
255 movq %mm0, %mm1
256 punpcklwd %mm2, %mm0
257 punpckhwd %mm3, %mm1
258 movq %mm0, (SAMPLES)
259 movq %mm1, 8(SAMPLES)
260
/* Masks -> 0/1 words (same conversion trick as in the first loop), accumulated into MMREG_CLIP. */
261 cvtps2pi %xmm1, %mm0
262 cvtps2pi %xmm2, %mm1
263 movhlps %xmm1, %xmm1
264 movhlps %xmm2, %xmm2
265 cvtps2pi %xmm1, %mm2
266 cvtps2pi %xmm2, %mm3
267 packssdw %mm2, %mm0
268 packssdw %mm3, %mm1
269 psrlw $15, %mm0
270 psrlw $15, %mm1
271 paddw %mm1, %mm0
272 paddw %mm0, MMREG_CLIP
273
274 leal 16(SAMPLES), SAMPLES
275 decl %ecx
276 jnz Loop_start_2
277
/*
 * Horizontal sum of the four 16-bit counters in MMREG_CLIP:
 *   pshufw $0xee copies words 2,3 over words 0,1 -> after paddw, word0 = w0+w2, word1 = w1+w3
 *   pshufw $0x55 broadcasts word 1               -> after paddw, word0 = w0+w1+w2+w3
 * The low word is then moved to %eax and masked to 16 bits as the return value.
 */
278 pshufw $0xee, MMREG_CLIP, %mm0
279 paddw MMREG_CLIP, %mm0
280 pshufw $0x55, %mm0, %mm1
281 paddw %mm1, %mm0
282 movd %mm0, %eax
283 andl $0xffff, %eax
284
/* Restore the saved registers in reverse order of the prologue pushes. */
285 popl %esi
286 popl %ebx
287 movl %ebp, %esp
288 popl %ebp
289
/* Leave MMX state so that x87 code can run after the return. */
290 emms
291
292 ret
293
/* NONEXEC_STACK is not defined in this file (presumably a macro from mangle.h). NOTE(review): the name suggests it marks the stack as non-executable -- confirm. */
294 NONEXEC_STACK