[LIBMPG123]
[reactos.git] / reactos / sdk / include / reactos / libs / libmpg123 / synth_sse3d.h
1 /*
2 decode_sse3d: Synth for SSE and extended 3DNow (yeah, the name is a relic)
3
4 copyright 2006-2007 by Zuxy Meng/the mpg123 project - free software under the terms of the LGPL 2.1
5 see COPYING and AUTHORS files in distribution or http://mpg123.org
6 initially written by the mysterious higway for MMX (apparently)
7 then developed into SSE opt by Zuxy Meng, also building on Romain Dolbeau's AltiVec
8 Both have agreed to distribution under LGPL 2.1 .
9
10 Transformed back into standalone asm, with help of
11 gcc -S -DHAVE_CONFIG_H -I. -march=pentium -O3 -Wall -pedantic -fno-strict-aliasing -DREAL_IS_FLOAT -c -o decode_mmxsse.{S,c}
12
13 The difference between SSE and 3DNowExt is the dct64 function and the synth function name.
14 This template here uses the SYNTH_NAME and MPL_DCT64 macros for this - see decode_sse.S and decode_3dnowext.S...
15 That's not memory efficient since there's doubled code, but it's easier than giving another function pointer.
16 Maybe I'll change it in future, but now I need something that works.
17
18 Original comment from MPlayer source follows. Regarding the license history, see
19 synth_mmx.S; the original comment about this being licensed under GPL relates
20 to that file.
21 */
22
23 /*
24 * This code was taken from http://www.mpg123.org
25 * See ChangeLog of mpg123-0.59s-pre.1 for detail
26 * Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
27 *
28 * Local ChangeLog:
29 * - Partial loops unrolling and removing MOVW insn from loops
30 */
31
32 #include "mangle.h"
33
34 .data
35 ALIGN8
36 one_null: /* 0xffff0000 in both dwords: ANDing with it keeps the upper 16 bits of each 32-bit lane */
37 .long 0xffff0000
38 .long 0xffff0000
39 ALIGN8
40 null_one: /* 0x0000ffff in both dwords: ANDing with it keeps the lower 16 bits of each 32-bit lane */
41 .long 0x0000ffff
42 .long 0x0000ffff
43
44 .text
45 ALIGN16
46 /* void SYNTH_NAME(real *bandPtr, int channel, short *samples, short *buffs, int *bo, float *decwins)
   All arguments are read from the stack relative to %ebp (offsets are listed below).
   The routine updates *bo (only when channel != 1), calls MPL_DCT64 once, and then
   multiplies 16-bit words taken from decwins with 16-bit words taken from buffs (pmaddwd),
   sums them, shifts right by 13 and saturates to 16 bits.
   The results are stored starting at samples + 2*channel bytes, one 16-bit value every
   4 bytes; the other 16 bits of each 4-byte slot keep their previous contents.
   With %ebx == 16 after the call (i.e. assuming MPL_DCT64 preserves %ebx) that is
   16 + 1 + 14 + 1 = 32 values, the last 15 of them negated.
   %edi, %esi and %ebx are saved on entry and restored on exit; emms is executed before returning. */
47 .globl SYNTH_NAME
48 SYNTH_NAME:
49 pushl %ebp
50 /* stack (offsets from %ebp once the frame is set up): 0=ebp 4=back 8=bandptr 12=channel 16=samples 20=buffs 24=bo 28=decwins */
51 movl %esp, %ebp
52
53 /* Now the old stack addresses are preserved via %ebp. */
54 #ifdef PIC
55 subl $8,%esp /* Two local dwords: EBXSAVE at -4(%ebp) and, below it, TEMP (what has been called temp before). */
56 #else
57 subl $4,%esp /* One local dword: TEMP (what has been called temp before). */
58 #endif
59 pushl %edi
60 pushl %esi
61 pushl %ebx
62
63 #ifdef PIC
64 #undef _EBX_
65 #define _EBX_ %eax
66 GET_GOT
67 #define EBXSAVE -4(%ebp)
68 movl _EBX_, EBXSAVE /* save PIC register (GET_GOT is defined outside this file; presumably it loads _EBX_) */
69 #endif
70
71 #define TEMP 12(%esp)
72 /* APP -- TEMP is %esp-relative: it names the local dword just above the three saved registers and is only valid while %esp is at this depth. */
73 movl 12(%ebp),%ecx /* ecx = channel */
74 movl 16(%ebp),%edi /* edi = samples */
75 movl $15,%ebx /* ebx = 15, the wrap mask for the index read from *bo */
76 movl 24(%ebp),%edx /* edx = bo (pointer) */
77 leal (%edi,%ecx,2),%edi /* edi = samples + 2*channel bytes: output pointer */
78 decl %ecx /* ecx = channel - 1 */
79 movl 20(%ebp),%esi /* esi = buffs */
80 movl (%edx),%eax /* eax = *bo */
81 jecxz 1f /* channel == 1: leave *bo and the buffs base untouched */
82 decl %eax
83 andl %ebx,%eax /* eax = (*bo - 1) & 15 */
84 leal 1088(%esi),%esi /* esi = buffs + 1088 bytes */
85 movl %eax,(%edx) /* write the new index back to *bo */
86 1:
87 leal (%esi,%eax,2),%edx /* edx = esi + 2*index */
88 movl %eax,TEMP /* TEMP = index */
89 incl %eax
90 andl %ebx,%eax /* eax = (index + 1) & 15 */
91 leal 544(%esi,%eax,2),%ecx /* ecx = esi + 544 + 2*((index + 1) & 15) */
92 incl %ebx /* ebx = 16 */
93 testl $1, %eax
94 jnz 2f /* (index + 1) & 15 is odd: keep edx, ecx, TEMP and esi as computed */
95 xchgl %edx,%ecx /* even: swap the two pointers handed to MPL_DCT64, */
96 incl TEMP /* add one to TEMP, */
97 leal 544(%esi),%esi /* and advance esi by 544 bytes */
98 2:
99 pushl 8(%ebp) /* pushed first: bandPtr (third argument under cdecl) */
100 pushl %edx /* second argument under cdecl */
101 pushl %ecx /* pushed last: first argument under cdecl */
102 call MPL_DCT64
103 addl $12, %esp /* drop the three arguments */
104 leal 1(%ebx), %ecx /* ecx = ebx + 1; ebx, esi and edi are used below, so the call is relied on to preserve them */
105 subl TEMP,%ebx /* ebx -= TEMP */
106 pushl %ecx /* keep the count on the stack; TEMP is not referenced after this push */
107 /* leal ASM_NAME(decwins)(%ebx,%ebx,1), %edx */
108 movl 28(%ebp),%ecx /* ecx = decwins */
109 leal (%ecx,%ebx,2), %edx /* edx = decwins + 2*ebx bytes */
110 movl (%esp),%ecx /* restore, but leave value on stack */
111 shrl $1, %ecx /* ecx = count / 2: iterations of the two-results-per-pass loop */
112 #ifdef PIC
113 movl EBXSAVE, _EBX_ /* reload the saved PIC register; %eax (= _EBX_) was overwritten above */
114 #endif
115 ALIGN16
116 3:
117 movq (%edx),%mm0 /* result A: 16 words at 0(%edx) times 16 words at 0(%esi) */
118 movq 64(%edx),%mm4 /* result B: 16 words at 64(%edx) times 16 words at 32(%esi) */
119 pmaddwd (%esi),%mm0 /* pmaddwd: signed 16-bit multiplies, adjacent products added into dwords */
120 pmaddwd 32(%esi),%mm4
121 movq 8(%edx),%mm1
122 movq 72(%edx),%mm5
123 pmaddwd 8(%esi),%mm1
124 pmaddwd 40(%esi),%mm5
125 movq 16(%edx),%mm2
126 movq 80(%edx),%mm6
127 pmaddwd 16(%esi),%mm2
128 pmaddwd 48(%esi),%mm6
129 movq 24(%edx),%mm3
130 movq 88(%edx),%mm7
131 pmaddwd 24(%esi),%mm3
132 pmaddwd 56(%esi),%mm7
133 paddd %mm1,%mm0 /* accumulate the partial sums for A in mm0 and for B in mm4 */
134 paddd %mm5,%mm4
135 paddd %mm2,%mm0
136 paddd %mm6,%mm4
137 paddd %mm3,%mm0
138 paddd %mm7,%mm4
139 movq %mm0,%mm1
140 movq %mm4,%mm5
141 psrlq $32,%mm1 /* bring the high dword down ... */
142 psrlq $32,%mm5
143 paddd %mm1,%mm0 /* ... so the low dword holds the full sum */
144 paddd %mm5,%mm4
145 psrad $13,%mm0 /* arithmetic shift right by 13 */
146 psrad $13,%mm4
147 packssdw %mm0,%mm0 /* saturate to signed 16 bit; the result is in the low word */
148 packssdw %mm4,%mm4
149 movq (%edi), %mm1 /* mm1 = the 8 bytes currently at the output position */
150 punpckldq %mm4, %mm0 /* mm0 = low dword of A, then low dword of B */
151 pand LOCAL_VAR(one_null), %mm1 /* keep the upper 16 bits of each existing dword */
152 pand LOCAL_VAR(null_one), %mm0 /* keep only the low word (the result) of each new dword */
153 por %mm0, %mm1
154 movq %mm1,(%edi) /* store A at 0(%edi) and B at 4(%edi) */
155 leal 64(%esi),%esi
156 leal 128(%edx),%edx
157 leal 8(%edi),%edi
158 decl %ecx
159 jnz 3b
160 popl %ecx /* recover the count pushed before the loop */
161 andl $1, %ecx
162 jecxz 4f /* even count: no single leftover result */
163 movq (%edx),%mm0 /* leftover result: 16 words at 0(%edx) times 16 words at 0(%esi) */
164 pmaddwd (%esi),%mm0
165 movq 8(%edx),%mm1
166 pmaddwd 8(%esi),%mm1
167 movq 16(%edx),%mm2
168 pmaddwd 16(%esi),%mm2
169 movq 24(%edx),%mm3
170 pmaddwd 24(%esi),%mm3
171 paddd %mm1,%mm0
172 paddd %mm2,%mm0
173 paddd %mm3,%mm0
174 movq %mm0,%mm1
175 psrlq $32,%mm1
176 paddd %mm1,%mm0
177 psrad $13,%mm0
178 packssdw %mm0,%mm0
179 movd %mm0,%eax /* overwrites %eax, hence the PIC reload below */
180 movw %ax, (%edi) /* store only the 16-bit result */
181 leal 32(%esi),%esi
182 leal 64(%edx),%edx
183 leal 4(%edi),%edi
184 4:
185 subl $64,%esi /* step esi back by 64 bytes; the next loop moves it downwards */
186 movl $7,%ecx /* 7 passes, two results each */
187
188 #ifdef PIC
189 movl EBXSAVE, _EBX_ /* reload the saved PIC register for the LOCAL_VAR accesses in the loop */
190 #endif
191 ALIGN16
192 5:
193 movq (%edx),%mm0 /* result A: 16 words at 0(%edx) times 16 words at 0(%esi) */
194 movq 64(%edx),%mm4 /* result B: 16 words at 64(%edx) times 16 words at -32(%esi) */
195 pmaddwd (%esi),%mm0
196 pmaddwd -32(%esi),%mm4
197 movq 8(%edx),%mm1
198 movq 72(%edx),%mm5
199 pmaddwd 8(%esi),%mm1
200 pmaddwd -24(%esi),%mm5
201 movq 16(%edx),%mm2
202 movq 80(%edx),%mm6
203 pmaddwd 16(%esi),%mm2
204 pmaddwd -16(%esi),%mm6
205 movq 24(%edx),%mm3
206 movq 88(%edx),%mm7
207 pmaddwd 24(%esi),%mm3
208 pmaddwd -8(%esi),%mm7
209 paddd %mm1,%mm0
210 paddd %mm5,%mm4
211 paddd %mm2,%mm0
212 paddd %mm6,%mm4
213 paddd %mm3,%mm0
214 paddd %mm7,%mm4
215 movq %mm0,%mm1
216 movq %mm4,%mm5
217 psrlq $32,%mm1
218 psrlq $32,%mm5
219 paddd %mm0,%mm1 /* here the full sums end up in mm1 and mm5 */
220 paddd %mm4,%mm5
221 psrad $13,%mm1
222 psrad $13,%mm5
223 packssdw %mm1,%mm1
224 packssdw %mm5,%mm5
225 psubd %mm0,%mm0 /* mm0 = 0 */
226 psubd %mm4,%mm4 /* mm4 = 0 */
227 psubsw %mm1,%mm0 /* mm0 = 0 - result A, with signed saturation */
228 psubsw %mm5,%mm4 /* mm4 = 0 - result B, with signed saturation */
229 movq (%edi), %mm1 /* merge into the existing output exactly as in loop 3 */
230 punpckldq %mm4, %mm0
231 pand LOCAL_VAR(one_null), %mm1
232 pand LOCAL_VAR(null_one), %mm0
233 por %mm0, %mm1
234 movq %mm1,(%edi)
235 subl $64,%esi
236 addl $128,%edx
237 leal 8(%edi),%edi
238 decl %ecx
239 jnz 5b
240 movq (%edx),%mm0 /* final single result, negated like those of loop 5 */
241 pmaddwd (%esi),%mm0
242 movq 8(%edx),%mm1
243 pmaddwd 8(%esi),%mm1
244 movq 16(%edx),%mm2
245 pmaddwd 16(%esi),%mm2
246 movq 24(%edx),%mm3
247 pmaddwd 24(%esi),%mm3
248 paddd %mm1,%mm0
249 paddd %mm2,%mm0
250 paddd %mm3,%mm0
251 movq %mm0,%mm1
252 psrlq $32,%mm1
253 paddd %mm0,%mm1
254 psrad $13,%mm1
255 packssdw %mm1,%mm1
256 psubd %mm0,%mm0
257 psubsw %mm1,%mm0
258 movd %mm0,%eax
259 movw %ax,(%edi) /* store only the 16-bit result */
260 emms /* leave MMX state so x87 code can run again */
261
262 /* NO_APP */
263 popl %ebx /* restore the registers saved in the prologue, in reverse order */
264 popl %esi
265 popl %edi
266 mov %ebp, %esp /* discard the local slot(s) */
267 popl %ebp
268 ret