Sync with trunk (r48414)
[reactos.git] / lib / 3rdparty / libmpg123 / synth_stereo_x86_64.S
1 /*
2 synth_stereo_x86_64: SSE optimized synth for x86-64 (stereo specific version)
3
4 copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
5 see COPYING and AUTHORS files in distribution or http://mpg123.org
6 initially written by Taihei Monma
7 */
8
9 #include "mangle.h"
10
11 #ifdef _WIN64 /* general-purpose register roles; the pointer arguments follow the prototype (window, b0l, b0r, samples, bo1) */
12 /* short *window; arrives in %rcx and is copied to %r10 by the function, which then uses %ecx as its loop counter */
13 #define WINDOW %r10
14 /* short *b0l; used in place in the register it arrives in (2nd argument) */
15 #define B0L %rdx
16 /* short *b0r; used in place in the register it arrives in (3rd argument) */
17 #define B0R %r8
18 /* short *samples; used in place in the register it arrives in (4th argument) */
19 #define SAMPLES %r9
20 #else /* non-Win64 builds: arguments arrive in %rdi, %rsi, %rdx, %rcx, %r8 */
21 /* short *window; used in place in the register it arrives in (1st argument) */
22 #define WINDOW %rdi
23 /* short *b0l; used in place in the register it arrives in (2nd argument) */
24 #define B0L %rsi
25 /* short *b0r; used in place in the register it arrives in (3rd argument) */
26 #define B0R %rdx
27 /* short *samples; arrives in %rcx and is copied to %r9 by the function, which then uses %ecx as its loop counter */
28 #define SAMPLES %r9
29 #endif
30 /* fixed xmm register roles, set up once at function entry and kept for the whole call */
31 #define XMMREG_CLIP %xmm15 /* eight 16-bit counters of clipped samples; zeroed at entry and summed into the return value */
32 #define XMMREG_MAX %xmm14 /* {32767, 32767, 32767, 32767} : a value is clipped high when value > 32767 */
33 #define XMMREG_MIN %xmm13 /* {-32769, -32769, -32769, -32769} : one below the 16-bit minimum. The code only uses the signed "greater than" compare (pcmpgtd), so "value < -32768" is computed as NOT (value > -32769); comparing against -32768 itself would also count an exact -32768 as clipped. */
34 #define XMMREG_FULL %xmm12 /* {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF} : all-ones mask, XORed with the "> XMMREG_MIN" result to invert it */
35
36 /*
37 int synth_1to1_stereo_x86_64_asm(short *window, short *b0l, short *b0r, short *samples, int bo1);
38 return value: number of clipped samples
39 */
40
41 #ifndef __APPLE__ /* read-only clip-limit table: placed in .rodata, or in .data when __APPLE__ is defined */
42 .section .rodata
43 #else
44 .data
45 #endif
46 ALIGN32 /* ALIGN32 is not defined in this file (presumably mangle.h); the table is read with movaps, which requires 16-byte alignment */
47 ASM_NAME(maxmin_x86_64): /* 32 bytes: four 32-bit upper limits followed by four 32-bit lower limits */
48 .long 32767 /* bytes 0-15: loaded into XMMREG_MAX with movaps (%rax) */
49 .long 32767
50 .long 32767
51 .long 32767
52 .long -32769 /* bytes 16-31: loaded into XMMREG_MIN with movaps 16(%rax); one below the 16-bit minimum */
53 .long -32769
54 .long -32769
55 .long -32769
56 .text /* synth_1to1_stereo_x86_64_asm: runs 8 iterations (two loops of 4); each stores 16 bytes of interleaved, saturated 16-bit output (b0l-derived, b0r-derived pairs) to samples. Returns the number of clipped values in %eax. */
57 ALIGN16
58 .globl ASM_NAME(synth_1to1_stereo_x86_64_asm)
59 ASM_NAME(synth_1to1_stereo_x86_64_asm):
60 #ifdef _WIN64 /* Win64 treats xmm6-15 as callee-saved; this function uses all of them, so they are spilled here and reloaded before ret */
61 movl 40(%rsp), %eax /* 5th argument (bo1); placed after the return address and the 32-byte shadow space */
62 subq $168, %rsp /* 160 bytes for 10 xmm registers + 8 bytes so that %rsp is 16-byte aligned for the movaps stores below */
63 movaps %xmm6, (%rsp)
64 movaps %xmm7, 16(%rsp)
65 movaps %xmm8, 32(%rsp)
66 movaps %xmm9, 48(%rsp)
67 movaps %xmm10, 64(%rsp)
68 movaps %xmm11, 80(%rsp)
69 movaps %xmm12, 96(%rsp)
70 movaps %xmm13, 112(%rsp)
71 movaps %xmm14, 128(%rsp)
72 movaps %xmm15, 144(%rsp)
73 #endif
74 /* Turn bo1 into a byte offset in %rax and move the argument that arrived in %rcx out of the way, because %ecx becomes the loop counter. */
75 #ifdef _WIN64
76 shlq $32, %rax /* discard whatever is in the upper 32 bits of %rax ... */
77 shrq $31, %rax /* ... net effect: %rax = 2 * (bo1 as unsigned 32-bit), i.e. bo1 16-bit elements expressed in bytes */
78 movq %rcx, %r10 /* WINDOW = window (1st argument) */
79 #else
80 movq %r8, %rax /* bo1 (5th argument) */
81 shlq $32, %rax /* discard the upper 32 bits of the argument register ... */
82 shrq $31, %rax /* ... net effect: %rax = 2 * (bo1 as unsigned 32-bit), i.e. bo1 16-bit elements expressed in bytes */
83 movq %rcx, %r9 /* SAMPLES = samples (4th argument) */
84 #endif
85 leaq 32(WINDOW), WINDOW
86 subq %rax, WINDOW /* WINDOW = window + 32 bytes - 2*bo1 bytes (16 - bo1 elements of the short array) */
87 /* Load the clip limits and initialise the clip counters and the all-ones mask. */
88 leaq ASM_NAME(maxmin_x86_64)(%rip), %rax
89 movaps (%rax), XMMREG_MAX
90 movaps 16(%rax), XMMREG_MIN
91 pxor XMMREG_CLIP, XMMREG_CLIP /* clip counters = 0 */
92 pcmpeqd XMMREG_FULL, XMMREG_FULL /* comparing a register with itself sets every bit */
93
94 movl $4, %ecx /* first loop: 4 iterations */
95 /* Loop 1: per iteration, four 32-byte window rows (64 bytes apart) are multiplied with four consecutive 32-byte groups of b0l and of b0r, walking b0l/b0r forwards. */
96 ALIGN16
97 Loop_start_1:
98 movups (WINDOW), %xmm0 /* row 0, first 8 words (unaligned load: WINDOW depends on bo1) */
99 movups 16(WINDOW), %xmm1 /* row 0, last 8 words */
100 movups 64(WINDOW), %xmm2 /* row 1 */
101 movups 80(WINDOW), %xmm3
102 movups 128(WINDOW), %xmm4 /* row 2 */
103 movups 144(WINDOW), %xmm5
104 movups 192(WINDOW), %xmm6 /* row 3 */
105 movups 208(WINDOW), %xmm7
106 movaps %xmm0, %xmm8 /* duplicate rows 0 and 1: one copy is multiplied with b0l, the other with b0r */
107 movaps %xmm1, %xmm9
108 movaps %xmm2, %xmm10
109 movaps %xmm3, %xmm11
110 pmaddwd (B0L), %xmm0 /* pmaddwd: multiply 16-bit pairs and add adjacent products into four 32-bit partial sums */
111 pmaddwd 16(B0L), %xmm1
112 pmaddwd 32(B0L), %xmm2
113 pmaddwd 48(B0L), %xmm3
114 pmaddwd (B0R), %xmm8
115 pmaddwd 16(B0R), %xmm9
116 pmaddwd 32(B0R), %xmm10
117 pmaddwd 48(B0R), %xmm11
118 paddd %xmm1, %xmm0 /* xmm0 = row 0 partial sums (b0l) */
119 paddd %xmm3, %xmm2 /* xmm2 = row 1 partial sums (b0l) */
120 paddd %xmm9, %xmm8 /* xmm8 = row 0 partial sums (b0r) */
121 paddd %xmm11, %xmm10 /* xmm10 = row 1 partial sums (b0r) */
122 movaps %xmm4, %xmm1 /* duplicate rows 2 and 3 into the now-free xmm1/xmm9/xmm3/xmm11 for the b0r products */
123 movaps %xmm5, %xmm9
124 movaps %xmm6, %xmm3
125 movaps %xmm7, %xmm11
126 pmaddwd 64(B0L), %xmm4
127 pmaddwd 80(B0L), %xmm5
128 pmaddwd 96(B0L), %xmm6
129 pmaddwd 112(B0L), %xmm7
130 pmaddwd 64(B0R), %xmm1
131 pmaddwd 80(B0R), %xmm9
132 pmaddwd 96(B0R), %xmm3
133 pmaddwd 112(B0R), %xmm11
134 paddd %xmm5, %xmm4 /* xmm4 = row 2 partial sums (b0l) */
135 paddd %xmm7, %xmm6 /* xmm6 = row 3 partial sums (b0l) */
136 paddd %xmm1, %xmm9 /* xmm9 = row 2 partial sums (b0r) */
137 paddd %xmm3, %xmm11 /* xmm11 = row 3 partial sums (b0r) */
138 /* Transpose the 4x4 dword matrices (b0l rows: xmm0,xmm2,xmm4,xmm6; b0r rows: xmm8,xmm10,xmm9,xmm11) so the four partial sums of each row can be added lane-wise. */
139 movaps %xmm0, %xmm1
140 movaps %xmm4, %xmm3
141 movaps %xmm8, %xmm5
142 movaps %xmm9, %xmm7
143 punpckldq %xmm2, %xmm0 /* interleave the low dwords of rows 0 and 1 */
144 punpckldq %xmm6, %xmm4 /* interleave the low dwords of rows 2 and 3 */
145 punpckhdq %xmm2, %xmm1 /* interleave the high dwords of rows 0 and 1 */
146 punpckhdq %xmm6, %xmm3 /* interleave the high dwords of rows 2 and 3 */
147 punpckldq %xmm10, %xmm8 /* same four steps for the b0r rows */
148 punpckldq %xmm11, %xmm9
149 punpckhdq %xmm10, %xmm5
150 punpckhdq %xmm11, %xmm7
151 movaps %xmm0, %xmm2 /* keep copies whose upper halves are still needed by the movhlps steps below */
152 movaps %xmm1, %xmm6
153 movaps %xmm8, %xmm10
154 movaps %xmm5, %xmm11
155 movlhps %xmm4, %xmm0 /* xmm0 = partial sum 0 of rows 0..3 */
156 movhlps %xmm2, %xmm4 /* xmm4 = partial sum 1 of rows 0..3 */
157 movlhps %xmm3, %xmm1 /* xmm1 = partial sum 2 of rows 0..3 */
158 movhlps %xmm6, %xmm3 /* xmm3 = partial sum 3 of rows 0..3 */
159 movlhps %xmm9, %xmm8 /* same for the b0r rows, ending up in xmm8, xmm9, xmm5, xmm7 */
160 movhlps %xmm10, %xmm9
161 movlhps %xmm7, %xmm5
162 movhlps %xmm11, %xmm7
163 paddd %xmm4, %xmm0
164 paddd %xmm3, %xmm1
165 paddd %xmm9, %xmm8
166 paddd %xmm7, %xmm5
167 paddd %xmm1, %xmm0 /* xmm0 = complete 32-bit sums of rows 0..3 (b0l) */
168 paddd %xmm5, %xmm8 /* xmm8 = complete 32-bit sums of rows 0..3 (b0r) */
169 psrad $13, %xmm0 /* arithmetic right shift by 13 to scale the sums before packing to 16 bits */
170 psrad $13, %xmm8
171 /* Interleave the b0l/b0r results, saturate to 16 bits and store 8 values (16 bytes). */
172 movaps %xmm0, %xmm1
173 movaps %xmm0, %xmm2 /* unsaturated copies kept for the clip tests below */
174 movaps %xmm0, %xmm3
175 movaps %xmm8, %xmm4
176 punpckldq %xmm8, %xmm0 /* {l0, r0, l1, r1} */
177 punpckhdq %xmm8, %xmm1 /* {l2, r2, l3, r3} */
178 packssdw %xmm1, %xmm0 /* signed saturation to eight 16-bit values */
179 movups %xmm0, (SAMPLES) /* unaligned store: no alignment is assumed for samples */
180 /* Count clipped values: build 0/-1 masks for "> 32767" and "> -32769", invert the latter to get "< -32768", reduce each 16-bit lane to 0 or 1 and accumulate. */
181 pcmpgtd XMMREG_MAX, %xmm2
182 pcmpgtd XMMREG_MIN, %xmm3
183 pcmpgtd XMMREG_MAX, %xmm4
184 pcmpgtd XMMREG_MIN, %xmm8
185 packssdw %xmm4, %xmm2 /* eight 16-bit "too high" masks (4 from b0l sums, 4 from b0r sums) */
186 packssdw %xmm8, %xmm3 /* eight 16-bit "not too low" masks */
187 pxor XMMREG_FULL, %xmm3 /* invert: now "too low" masks */
188 psrlw $15, %xmm2 /* mask 0xFFFF -> 1, 0 -> 0 */
189 psrlw $15, %xmm3
190 paddw %xmm3, %xmm2
191 paddw %xmm2, XMMREG_CLIP
192 /* Advance to the next four window rows, the next 128 bytes of b0l/b0r and the next 16 output bytes. */
193 leaq 256(WINDOW), WINDOW
194 leaq 128(B0L), B0L
195 leaq 128(B0R), B0R
196 leaq 16(SAMPLES), SAMPLES
197
198 decl %ecx
199 jnz Loop_start_1
200
201 movl $4, %ecx /* second loop: 4 iterations */
202 /* Loop 2: identical to loop 1 except that b0l/b0r are walked backwards: the rows use the 32-byte groups at offsets 0, -32, -64 and -96, and the pointers step by -128. */
203 ALIGN16
204 Loop_start_2:
205 movups (WINDOW), %xmm0 /* row 0 */
206 movups 16(WINDOW), %xmm1
207 movups 64(WINDOW), %xmm2 /* row 1 */
208 movups 80(WINDOW), %xmm3
209 movups 128(WINDOW), %xmm4 /* row 2 */
210 movups 144(WINDOW), %xmm5
211 movups 192(WINDOW), %xmm6 /* row 3 */
212 movups 208(WINDOW), %xmm7
213 movaps %xmm0, %xmm8 /* duplicate rows 0 and 1 for the b0r products */
214 movaps %xmm1, %xmm9
215 movaps %xmm2, %xmm10
216 movaps %xmm3, %xmm11
217 pmaddwd (B0L), %xmm0 /* row 0 uses the group at offset 0 */
218 pmaddwd 16(B0L), %xmm1
219 pmaddwd -32(B0L), %xmm2 /* row 1 uses the group 32 bytes below */
220 pmaddwd -16(B0L), %xmm3
221 pmaddwd (B0R), %xmm8
222 pmaddwd 16(B0R), %xmm9
223 pmaddwd -32(B0R), %xmm10
224 pmaddwd -16(B0R), %xmm11
225 paddd %xmm1, %xmm0 /* xmm0 = row 0 partial sums (b0l) */
226 paddd %xmm3, %xmm2 /* xmm2 = row 1 partial sums (b0l) */
227 paddd %xmm9, %xmm8 /* xmm8 = row 0 partial sums (b0r) */
228 paddd %xmm11, %xmm10 /* xmm10 = row 1 partial sums (b0r) */
229 movaps %xmm4, %xmm1 /* duplicate rows 2 and 3 for the b0r products */
230 movaps %xmm5, %xmm9
231 movaps %xmm6, %xmm3
232 movaps %xmm7, %xmm11
233 pmaddwd -64(B0L), %xmm4 /* row 2 uses the group 64 bytes below */
234 pmaddwd -48(B0L), %xmm5
235 pmaddwd -96(B0L), %xmm6 /* row 3 uses the group 96 bytes below */
236 pmaddwd -80(B0L), %xmm7
237 pmaddwd -64(B0R), %xmm1
238 pmaddwd -48(B0R), %xmm9
239 pmaddwd -96(B0R), %xmm3
240 pmaddwd -80(B0R), %xmm11
241 paddd %xmm5, %xmm4 /* xmm4 = row 2 partial sums (b0l) */
242 paddd %xmm7, %xmm6 /* xmm6 = row 3 partial sums (b0l) */
243 paddd %xmm1, %xmm9 /* xmm9 = row 2 partial sums (b0r) */
244 paddd %xmm3, %xmm11 /* xmm11 = row 3 partial sums (b0r) */
245 /* 4x4 dword transpose and lane-wise add, exactly as in loop 1. */
246 movaps %xmm0, %xmm1
247 movaps %xmm4, %xmm3
248 movaps %xmm8, %xmm5
249 movaps %xmm9, %xmm7
250 punpckldq %xmm2, %xmm0
251 punpckldq %xmm6, %xmm4
252 punpckhdq %xmm2, %xmm1
253 punpckhdq %xmm6, %xmm3
254 punpckldq %xmm10, %xmm8
255 punpckldq %xmm11, %xmm9
256 punpckhdq %xmm10, %xmm5
257 punpckhdq %xmm11, %xmm7
258 movaps %xmm0, %xmm2
259 movaps %xmm1, %xmm6
260 movaps %xmm8, %xmm10
261 movaps %xmm5, %xmm11
262 movlhps %xmm4, %xmm0
263 movhlps %xmm2, %xmm4
264 movlhps %xmm3, %xmm1
265 movhlps %xmm6, %xmm3
266 movlhps %xmm9, %xmm8
267 movhlps %xmm10, %xmm9
268 movlhps %xmm7, %xmm5
269 movhlps %xmm11, %xmm7
270 paddd %xmm4, %xmm0
271 paddd %xmm3, %xmm1
272 paddd %xmm9, %xmm8
273 paddd %xmm7, %xmm5
274 paddd %xmm1, %xmm0 /* xmm0 = complete 32-bit sums of rows 0..3 (b0l) */
275 paddd %xmm5, %xmm8 /* xmm8 = complete 32-bit sums of rows 0..3 (b0r) */
276 psrad $13, %xmm0
277 psrad $13, %xmm8
278 /* Interleave, saturate to 16 bits and store, as in loop 1. */
279 movaps %xmm0, %xmm1
280 movaps %xmm0, %xmm2
281 movaps %xmm0, %xmm3
282 movaps %xmm8, %xmm4
283 punpckldq %xmm8, %xmm0
284 punpckhdq %xmm8, %xmm1
285 packssdw %xmm1, %xmm0
286 movups %xmm0, (SAMPLES)
287 /* Clip counting, as in loop 1. */
288 pcmpgtd XMMREG_MAX, %xmm2
289 pcmpgtd XMMREG_MIN, %xmm3
290 pcmpgtd XMMREG_MAX, %xmm4
291 pcmpgtd XMMREG_MIN, %xmm8
292 packssdw %xmm4, %xmm2
293 packssdw %xmm8, %xmm3
294 pxor XMMREG_FULL, %xmm3
295 psrlw $15, %xmm2
296 psrlw $15, %xmm3
297 paddw %xmm3, %xmm2
298 paddw %xmm2, XMMREG_CLIP
299 /* Window keeps moving forwards; b0l/b0r move backwards by 128 bytes. */
300 leaq 256(WINDOW), WINDOW
301 leaq -128(B0L), B0L
302 leaq -128(B0R), B0R
303 leaq 16(SAMPLES), SAMPLES
304
305 decl %ecx
306 jnz Loop_start_2
307 /* Horizontal sum of the eight 16-bit clip counters into word 0 of xmm0. */
308 movhlps XMMREG_CLIP, %xmm0 /* low half of xmm0 = counters 4..7 (the upper half of xmm0 is stale but never read) */
309 paddw XMMREG_CLIP, %xmm0 /* words 0..3 = counter[i] + counter[i+4] */
310 pshuflw $0x55, %xmm0, %xmm1 /* broadcast word 1 across the low four words */
311 pshuflw $0xaa, %xmm0, %xmm2 /* broadcast word 2 */
312 pshuflw $0xff, %xmm0, %xmm3 /* broadcast word 3 */
313 paddw %xmm1, %xmm0
314 paddw %xmm2, %xmm0
315 paddw %xmm3, %xmm0 /* word 0 = total clip count */
316
317 movd %xmm0, %eax
318 andl $0xffff, %eax /* return value = word 0 only; bits 16-31 of the movd result hold an unrelated partial sum */
319
320 #ifdef _WIN64 /* reload the callee-saved xmm registers spilled at entry and release the stack area */
321 movaps (%rsp), %xmm6
322 movaps 16(%rsp), %xmm7
323 movaps 32(%rsp), %xmm8
324 movaps 48(%rsp), %xmm9
325 movaps 64(%rsp), %xmm10
326 movaps 80(%rsp), %xmm11
327 movaps 96(%rsp), %xmm12
328 movaps 112(%rsp), %xmm13
329 movaps 128(%rsp), %xmm14
330 movaps 144(%rsp), %xmm15
331 addq $168, %rsp
332 #endif
333 ret
334
335 NONEXEC_STACK /* macro not defined in this file (presumably from mangle.h) */