Sync with trunk (r48414)
[reactos.git] / lib / 3rdparty / libmpg123 / synth_sse_float.S
1 /*
2 synth_sse_float: SSE optimized synth (float output version)
3
4 copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
5 see COPYING and AUTHORS files in distribution or http://mpg123.org
6 initially written by Taihei Monma
7 */
8
9 #include "mangle.h"
10
/* Register aliases used by synth_1to1_real_sse_asm. Each one holds a pointer argument that the function loads from its stack frame. */
11 /* real *window; loaded from 8(%ebp). Lives in %ebx, which the function pushes on entry and pops before returning. */
12 #define WINDOW %ebx
13 /* real *b0; loaded from 12(%ebp). Lives in %edx, which the function overwrites without saving. */
14 #define B0 %edx
15 /* real *samples; loaded from 16(%ebp). Lives in %esi, which the function pushes on entry and pops before returning. */
16 #define SAMPLES %esi
17
18 /*
19 int synth_1to1_real_sse_asm(real *window, real *b0, real *samples, int bo1);
20 return value: number of clipped samples (0)
21 */
22
/*
 * scale_sse: four identical 32-bit words used as a packed multiplier.
 * 939524096 is 0x38000000, which read as an IEEE-754 single-precision value
 * is 2^-15 (1/32768). Both loops of synth_1to1_real_sse_asm multiply every
 * vector of four results by it with mulps just before storing.
 * The table goes into .rodata, except when __APPLE__ is defined, where .data
 * is used instead.
 * mulps with a memory operand needs a 16-byte aligned address; ALIGN32 comes
 * from mangle.h and presumably provides at least that - confirm there.
 */
23 #ifndef __APPLE__
24 .section .rodata
25 #else
26 .data
27 #endif
28 ALIGN32
29 ASM_NAME(scale_sse):
30 .long 939524096
31 .long 939524096
32 .long 939524096
33 .long 939524096
/*
 * synth_1to1_real_sse_asm(window, b0, samples, bo1)
 *
 * Arguments are read from the stack through %ebp:
 *   8(%ebp)  window   -> WINDOW (%ebx)
 *   12(%ebp) b0       -> B0 (%edx)
 *   16(%ebp) samples  -> SAMPLES (%esi)
 *   20(%ebp) bo1      -> %eax, used as an element offset (multiplied by 4 bytes)
 * Returns 0 in %eax.
 * %ebp, %ebx and %esi are saved and restored; %eax, %ecx, %edx and
 * %xmm0-%xmm7 are overwritten without being saved.
 *
 * Work done: two loops of 4 iterations each. Every iteration produces 4
 * results (32 in total) and advances SAMPLES by 32 bytes. One result is built
 * from 16 floats read at WINDOW multiplied by 16 floats read at B0, kept as a
 * vector of four lane-wise partial sums. The four vectors of an iteration
 * (called A, B, C, D below) are transposed so that equal lanes line up and
 * are then combined:
 *   loop 1: (lane0 - lane1) + (lane2 - lane3), with B0 walking forwards
 *   loop 2: (lane0 + lane1) + (lane2 + lane3), with B0 walking backwards
 * The four results are scaled by scale_sse and written to the even-indexed
 * floats of the 8 floats at SAMPLES; the odd-indexed floats are read and
 * written back unchanged (presumably the other channel of an interleaved
 * stereo buffer - confirm with the caller).
 *
 * Alignment: every B0 operand and scale_sse are used as mulps memory
 * operands, which require 16-byte aligned addresses. WINDOW and SAMPLES are
 * only accessed with movups, so they need no particular alignment.
 */
34 .text
35 ALIGN16
36 .globl ASM_NAME(synth_1to1_real_sse_asm)
37 ASM_NAME(synth_1to1_real_sse_asm):
/* Prologue: build an %ebp frame and save the registers behind WINDOW and SAMPLES. */
38 pushl %ebp
39 movl %esp, %ebp
40 pushl %ebx
41 pushl %esi
42
/* Fetch the four stack arguments. */
43 movl 8(%ebp), WINDOW
44 movl 12(%ebp), B0
45 movl 16(%ebp), SAMPLES
46 movl 20(%ebp), %eax
47 shll $2, %eax /* bo1 * 4: element count turned into a byte offset */
48
/* WINDOW = window + 64 bytes - bo1*4 bytes, i.e. window + (16 - bo1) elements. */
49 leal 64(WINDOW), WINDOW
50 subl %eax, WINDOW
51
52 movl $4, %ecx /* loop 1 runs 4 times */
53
54 ALIGN16
55 Loop_start_1:
/* Results A and B: window bytes 0..63 with B0 bytes 0..63, and window bytes 128..191 with B0 bytes 64..127. */
56 movups (WINDOW), %xmm0
57 movups 16(WINDOW), %xmm1
58 movups 32(WINDOW), %xmm2
59 movups 48(WINDOW), %xmm3
60 movups 128(WINDOW), %xmm4
61 movups 144(WINDOW), %xmm5
62 movups 160(WINDOW), %xmm6
63 movups 176(WINDOW), %xmm7
64 mulps 0(B0), %xmm0
65 mulps 16(B0), %xmm1
66 mulps 32(B0), %xmm2
67 mulps 48(B0), %xmm3
68 mulps 64(B0), %xmm4
69 mulps 80(B0), %xmm5
70 mulps 96(B0), %xmm6
71 mulps 112(B0), %xmm7
72 addps %xmm1, %xmm0
73 addps %xmm3, %xmm2
74 addps %xmm5, %xmm4
75 addps %xmm7, %xmm6
76 addps %xmm2, %xmm0
77 addps %xmm6, %xmm4
78 movaps %xmm4, %xmm5 /* xmm5 = B partial sums */
79 movaps %xmm0, %xmm4 /* xmm4 = A partial sums */
80
81 leal 256(WINDOW), WINDOW
82 leal 128(B0), B0
83
/* Results C and D: same pattern on the next 256 window bytes and the next 128 B0 bytes. */
/* xmm4 and xmm5 are live here, so the last two products reuse xmm1 and xmm3 after their first sums are folded. */
84 movups (WINDOW), %xmm0
85 movups 16(WINDOW), %xmm1
86 movups 32(WINDOW), %xmm2
87 movups 48(WINDOW), %xmm3
88 movups 128(WINDOW), %xmm6
89 movups 144(WINDOW), %xmm7
90 mulps (B0), %xmm0
91 mulps 16(B0), %xmm1
92 mulps 32(B0), %xmm2
93 mulps 48(B0), %xmm3
94 mulps 64(B0), %xmm6
95 mulps 80(B0), %xmm7
96 addps %xmm1, %xmm0
97 addps %xmm3, %xmm2
98 addps %xmm7, %xmm6
99 movups 160(WINDOW), %xmm1
100 movups 176(WINDOW), %xmm3
101 mulps 96(B0), %xmm1
102 mulps 112(B0), %xmm3
103 addps %xmm2, %xmm0
104 addps %xmm3, %xmm1
105 addps %xmm1, %xmm6
106 movaps %xmm6, %xmm7 /* xmm7 = D partial sums */
107 movaps %xmm0, %xmm6 /* xmm6 = C partial sums */
108
109 leal 256(WINDOW), WINDOW
110 leal 128(B0), B0
111
/* 4x4 transpose of A, B, C, D (lanes written low to high), then the signed lane combination. */
112 movaps %xmm4, %xmm0
113 movaps %xmm6, %xmm1
114 unpcklps %xmm5, %xmm4 /* xmm4 = A0 B0 A1 B1 */
115 unpcklps %xmm7, %xmm6 /* xmm6 = C0 D0 C1 D1 */
116 unpckhps %xmm5, %xmm0 /* xmm0 = A2 B2 A3 B3 */
117 unpckhps %xmm7, %xmm1 /* xmm1 = C2 D2 C3 D3 */
118 movaps %xmm4, %xmm2
119 movaps %xmm0, %xmm3
120 movlhps %xmm6, %xmm4 /* xmm4 = A0 B0 C0 D0 */
121 movhlps %xmm2, %xmm6 /* xmm6 = A1 B1 C1 D1 */
122 movlhps %xmm1, %xmm0 /* xmm0 = A2 B2 C2 D2 */
123 movhlps %xmm3, %xmm1 /* xmm1 = A3 B3 C3 D3 */
124 subps %xmm6, %xmm4 /* lane0 - lane1 */
125 subps %xmm1, %xmm0 /* lane2 - lane3 */
126 addps %xmm4, %xmm0 /* xmm0 = the four results A, B, C, D */
127
/* Scale the results and merge them into the even-indexed floats at SAMPLES, keeping the odd-indexed ones. */
128 movups (SAMPLES), %xmm1
129 movups 16(SAMPLES), %xmm2
130 mulps ASM_NAME(scale_sse), %xmm0
131 shufps $0xdd, %xmm2, %xmm1 /* xmm1 = existing floats 1, 3, 5, 7 */
132 movaps %xmm0, %xmm2
133 unpcklps %xmm1, %xmm0 /* xmm0 = r0 s1 r1 s3 */
134 unpckhps %xmm1, %xmm2 /* xmm2 = r2 s5 r3 s7 */
135 movups %xmm0, (SAMPLES)
136 movups %xmm2, 16(SAMPLES)
137
138 leal 32(SAMPLES), SAMPLES
139 decl %ecx
140 jnz Loop_start_1
141
142 movl $4, %ecx /* loop 2 runs 4 times */
143
144 ALIGN16
145 Loop_start_2:
/* Same structure as loop 1, but the second result of each pair reads the 64 B0 bytes below the current pointer and B0 steps backwards. */
146 movups (WINDOW), %xmm0
147 movups 16(WINDOW), %xmm1
148 movups 32(WINDOW), %xmm2
149 movups 48(WINDOW), %xmm3
150 movups 128(WINDOW), %xmm4
151 movups 144(WINDOW), %xmm5
152 movups 160(WINDOW), %xmm6
153 movups 176(WINDOW), %xmm7
154 mulps 0(B0), %xmm0
155 mulps 16(B0), %xmm1
156 mulps 32(B0), %xmm2
157 mulps 48(B0), %xmm3
158 mulps -64(B0), %xmm4
159 mulps -48(B0), %xmm5
160 mulps -32(B0), %xmm6
161 mulps -16(B0), %xmm7
162 addps %xmm1, %xmm0
163 addps %xmm3, %xmm2
164 addps %xmm5, %xmm4
165 addps %xmm7, %xmm6
166 addps %xmm2, %xmm0
167 addps %xmm6, %xmm4
168 movaps %xmm4, %xmm5 /* xmm5 = B partial sums */
169 movaps %xmm0, %xmm4 /* xmm4 = A partial sums */
170
171 leal 256(WINDOW), WINDOW
172 leal -128(B0), B0
173
/* Results C and D of this iteration, again with B0 read at 0..63 and -64..-1. */
174 movups (WINDOW), %xmm0
175 movups 16(WINDOW), %xmm1
176 movups 32(WINDOW), %xmm2
177 movups 48(WINDOW), %xmm3
178 movups 128(WINDOW), %xmm6
179 movups 144(WINDOW), %xmm7
180 mulps (B0), %xmm0
181 mulps 16(B0), %xmm1
182 mulps 32(B0), %xmm2
183 mulps 48(B0), %xmm3
184 mulps -64(B0), %xmm6
185 mulps -48(B0), %xmm7
186 addps %xmm1, %xmm0
187 addps %xmm3, %xmm2
188 addps %xmm7, %xmm6
189 movups 160(WINDOW), %xmm1
190 movups 176(WINDOW), %xmm3
191 mulps -32(B0), %xmm1
192 mulps -16(B0), %xmm3
193 addps %xmm2, %xmm0
194 addps %xmm3, %xmm1
195 addps %xmm1, %xmm6
196 movaps %xmm6, %xmm7 /* xmm7 = D partial sums */
197 movaps %xmm0, %xmm6 /* xmm6 = C partial sums */
198
199 leal 256(WINDOW), WINDOW
200 leal -128(B0), B0
201
/* Same transpose as in loop 1; here all four lanes are added instead of alternating signs. */
202 movaps %xmm4, %xmm0
203 movaps %xmm6, %xmm1
204 unpcklps %xmm5, %xmm4 /* xmm4 = A0 B0 A1 B1 */
205 unpcklps %xmm7, %xmm6 /* xmm6 = C0 D0 C1 D1 */
206 unpckhps %xmm5, %xmm0 /* xmm0 = A2 B2 A3 B3 */
207 unpckhps %xmm7, %xmm1 /* xmm1 = C2 D2 C3 D3 */
208 movaps %xmm4, %xmm2
209 movaps %xmm0, %xmm3
210 movlhps %xmm6, %xmm4 /* xmm4 = A0 B0 C0 D0 */
211 movhlps %xmm2, %xmm6 /* xmm6 = A1 B1 C1 D1 */
212 movlhps %xmm1, %xmm0 /* xmm0 = A2 B2 C2 D2 */
213 movhlps %xmm3, %xmm1 /* xmm1 = A3 B3 C3 D3 */
214 addps %xmm6, %xmm4 /* lane0 + lane1 */
215 addps %xmm1, %xmm0 /* lane2 + lane3 */
216 addps %xmm4, %xmm0 /* xmm0 = the four results A, B, C, D */
217
/* Scale and merge into the even-indexed floats at SAMPLES, exactly as in loop 1. */
218 movups (SAMPLES), %xmm1
219 movups 16(SAMPLES), %xmm2
220 mulps ASM_NAME(scale_sse), %xmm0
221 shufps $0xdd, %xmm2, %xmm1 /* xmm1 = existing floats 1, 3, 5, 7 */
222 movaps %xmm0, %xmm2
223 unpcklps %xmm1, %xmm0 /* xmm0 = r0 s1 r1 s3 */
224 unpckhps %xmm1, %xmm2 /* xmm2 = r2 s5 r3 s7 */
225 movups %xmm0, (SAMPLES)
226 movups %xmm2, 16(SAMPLES)
227
228 leal 32(SAMPLES), SAMPLES
229 decl %ecx
230 jnz Loop_start_2
231
232 xorl %eax, %eax /* return value is always 0; this routine does no clipping check */
233
/* Epilogue: restore the saved registers in reverse order of the pushes. */
234 popl %esi
235 popl %ebx
236 movl %ebp, %esp
237 popl %ebp
238
239 ret
240
/* NONEXEC_STACK is a macro from mangle.h; presumably it emits the marker for a non-executable stack - confirm there. */
241 NONEXEC_STACK