36c894903fbca2022683ed35e735e8b10b55df10
[reactos.git] / reactos / sdk / include / reactos / libs / libmpg123 / synth_sse3d.h
1 /*
2 decode_sse3d: Synth for SSE and extended 3DNow (yeah, the name is a relic)
3
4 copyright 2006-2007 by Zuxy Meng/the mpg123 project - free software under the terms of the LGPL 2.1
5 see COPYING and AUTHORS files in distribution or http://mpg123.org
6 initially written by the mysterious higway for MMX (apparently)
7 then developed into SSE opt by Zuxy Meng, also building on Romain Dolbeau's AltiVec
8 Both have agreed to distribution under LGPL 2.1 .
9
10 Transformed back into standalone asm, with help of
11 gcc -S -DHAVE_CONFIG_H -I. -march=pentium -O3 -Wall -pedantic -fno-strict-aliasing -DREAL_IS_FLOAT -c -o decode_mmxsse.{S,c}
12
13 The difference between SSE and 3DNowExt is the dct64 function and the synth function name.
14 This template here uses the SYNTH_NAME and MPL_DCT64 macros for this - see decode_sse.S and decode_3dnowext.S...
15 That's not memory efficient since there's doubled code, but it's easier than giving another function pointer.
16 Maybe I'll change it in future, but now I need something that works.
17
18 Original comment from MPlayer source follows:
19 */
20
21 /*
22 * this code comes under GPL
23 * This code was taken from http://www.mpg123.org
24 * See ChangeLog of mpg123-0.59s-pre.1 for detail
25 * Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
26 *
27 * Local ChangeLog:
28 * - Partial loops unrolling and removing MOVW insn from loops
29 */
30
31 #include "mangle.h"
32
33 .data
/*
	Two constant 64-bit masks used by SYNTH_NAME below (via pand) to merge freshly
	computed 16-bit samples into an output quadword without disturbing every other
	16-bit word of it.
	ALIGN8 is not defined in this file; presumably it comes from mangle.h and
	requests 8-byte alignment - confirm there.
*/
34 ALIGN8
/* one_null: each 32-bit half is 0xFFFF0000, i.e. keep the upper 16-bit word, clear the lower one.
   Applied to the quadword read back from the output buffer. */
35 one_null:
36 .long -65536
37 .long -65536
38 ALIGN8
/* null_one: each 32-bit half is 0x0000FFFF, i.e. keep the lower 16-bit word, clear the upper one.
   Applied to the newly computed pair of samples. */
39 null_one:
40 .long 65535
41 .long 65535
42
43 .text
44 ALIGN16
45 /* void SYNTH_NAME(real *bandPtr, int channel, short *samples, short *buffs, int *bo, float *decwins) */
/*
	Synthesis window routine (MMX integer path) for one channel.

	Calling convention: all arguments on the stack, read relative to %ebp
	(8=bandPtr 12=channel 16=samples 20=buffs 24=bo 28=decwins); the caller's
	%ebp/%edi/%esi/%ebx are saved and restored here. No value is placed in %eax
	on purpose; it is left holding the last stored sample.

	What the code below does:
	  1. Picks a region inside buffs from channel and *bo. When channel != 1,
	     *bo is stepped to (*bo - 1) & 15 and written back, and the region at
	     byte offset 1088 is used; when channel == 1, *bo is only read.
	  2. Calls MPL_DCT64 with two addresses inside buffs and bandPtr.
	  3. Produces 32 output samples (8*2 + 1 + 7*2 + 1). Each sample is a
	     16-term dot product of signed 16-bit words (pmaddwd), shifted right
	     arithmetically by 13 and saturated to 16 bits. The last 15 samples
	     are additionally negated with saturation.
	  4. Stores the samples at &samples[channel] with a stride of 4 bytes,
	     leaving the 16-bit word between consecutive samples unchanged.

	Clobbers: %eax, %ecx, %edx, flags, %mm0-%mm7 (emms is executed before
	returning), plus whatever MPL_DCT64 clobbers.
	Stack: 4 bytes of local storage (TEMP) plus three saved registers.

	NOTE(review): the prototype declares decwins as float*, but the window is
	consumed with pmaddwd, i.e. as packed signed 16-bit words - confirm the
	declared type against the callers.
	NOTE(review): %ebx, %esi and %edi are used after the call to MPL_DCT64
	without being reloaded, so MPL_DCT64 must preserve them - not visible here.
*/
46 .globl SYNTH_NAME
47 SYNTH_NAME:
48 pushl %ebp
49 /* stack:0=ebp 4=back 8=bandptr 12=channel 16=samples 20=buffs 24=bo 28=decwins */
50 movl %esp, %ebp
51 /* Now the old stack addresses are preserved via %ebp. */
52 subl $4,%esp /* What has been called temp before. */
53 pushl %edi
54 pushl %esi
55 pushl %ebx
/* TEMP is the 4-byte local reserved above: three registers (12 bytes) were pushed after it.
   It is only valid while %esp is at this depth. */
56 #define TEMP 12(%esp)
57 /* APP */
58 movl 12(%ebp),%ecx /* ecx = channel */
59 movl 16(%ebp),%edi /* edi = samples */
60 movl $15,%ebx /* ebx = 15, mask for the modulo-16 arithmetic on *bo */
61 movl 24(%ebp),%edx /* edx = bo (the pointer) */
62 leal (%edi,%ecx,2),%edi /* edi = &samples[channel] (2 bytes per short) */
63 decl %ecx /* ecx = channel - 1, tested by jecxz below */
64 movl 20(%ebp),%esi /* esi = buffs */
65 movl (%edx),%eax /* eax = *bo */
66 jecxz 1f /* channel == 1: keep *bo and esi as they are */
67 decl %eax
68 andl %ebx,%eax /* eax = (*bo - 1) & 15 */
69 leal 1088(%esi),%esi /* esi = buffs + 1088 bytes */
70 movl %eax,(%edx) /* write the stepped index back to *bo */
71 1:
/* From here on "bo" means the value in eax at this point. */
72 leal (%esi,%eax,2),%edx /* edx = esi + 2*bo */
73 movl %eax,TEMP /* TEMP = bo */
74 incl %eax
75 andl %ebx,%eax /* eax = (bo + 1) & 15 */
76 leal 544(%esi,%eax,2),%ecx /* ecx = esi + 544 + 2*((bo + 1) & 15) */
77 incl %ebx /* ebx = 16 */
78 testl $1, %eax
79 jnz 2f /* (bo + 1) & 15 is odd: keep edx/ecx/esi/TEMP as computed */
/* Even case: swap the two DCT destinations, use TEMP = bo + 1 and move esi 544 bytes further. */
80 xchgl %edx,%ecx
81 incl TEMP
82 leal 544(%esi),%esi
83 2:
/* MPL_DCT64(ecx, edx, bandPtr): arguments pushed right to left, popped by the caller. */
84 pushl 8(%ebp)
85 pushl %edx
86 pushl %ecx
87 call MPL_DCT64
88 addl $12, %esp
89 leal 1(%ebx), %ecx /* ecx = 17 */
90 subl TEMP,%ebx /* ebx = 16 - TEMP; last use of TEMP */
91 pushl %ecx /* keep 17 on the stack; it is popped after loop 3 */
92 /* leal ASM_NAME(decwins)(%ebx,%ebx,1), %edx */
93 movl 28(%ebp),%ecx /* ecx = decwins */
94 leal (%ecx,%ebx,2), %edx /* edx = decwins + 2*(16 - TEMP) bytes: window read pointer */
95 movl (%esp),%ecx /* restore, but leave value on stack */
96 shrl $1, %ecx /* ecx = 8 iterations, two samples per iteration */
97 ALIGN16
98 3:
/* Loop 3: samples 0..15, two per pass.
   mm0..mm3 accumulate window[edx+0..31] * buffer[esi+0..31],
   mm4..mm7 accumulate window[edx+64..95] * buffer[esi+32..63]. */
99 movq (%edx),%mm0
100 movq 64(%edx),%mm4
101 pmaddwd (%esi),%mm0
102 pmaddwd 32(%esi),%mm4
103 movq 8(%edx),%mm1
104 movq 72(%edx),%mm5
105 pmaddwd 8(%esi),%mm1
106 pmaddwd 40(%esi),%mm5
107 movq 16(%edx),%mm2
108 movq 80(%edx),%mm6
109 pmaddwd 16(%esi),%mm2
110 pmaddwd 48(%esi),%mm6
111 movq 24(%edx),%mm3
112 movq 88(%edx),%mm7
113 pmaddwd 24(%esi),%mm3
114 pmaddwd 56(%esi),%mm7
/* Add the four partial products of each sample ... */
115 paddd %mm1,%mm0
116 paddd %mm5,%mm4
117 paddd %mm2,%mm0
118 paddd %mm6,%mm4
119 paddd %mm3,%mm0
120 paddd %mm7,%mm4
/* ... then fold the high dword onto the low dword: the low dword now holds the full 16-term sum. */
121 movq %mm0,%mm1
122 movq %mm4,%mm5
123 psrlq $32,%mm1
124 psrlq $32,%mm5
125 paddd %mm1,%mm0
126 paddd %mm5,%mm4
/* Scale by 2^-13 (arithmetic shift) and saturate to signed 16 bit; the sample ends up in word 0. */
127 psrad $13,%mm0
128 psrad $13,%mm4
129 packssdw %mm0,%mm0
130 packssdw %mm4,%mm4
/* Merge into the output: first sample -> word 0, second sample -> word 2;
   words 1 and 3 of the existing output quadword are preserved by the one_null mask. */
131 movq (%edi), %mm1
132 punpckldq %mm4, %mm0
133 pand one_null, %mm1
134 pand null_one, %mm0
135 por %mm0, %mm1
136 movq %mm1,(%edi)
137 leal 64(%esi),%esi /* buffer: 32 bytes per sample */
138 leal 128(%edx),%edx /* window: 64 bytes per sample */
139 leal 8(%edi),%edi /* output: 4 bytes per sample */
140 decl %ecx
141 jnz 3b
142 popl %ecx /* ecx = 17 again; %esp is back at the depth TEMP was defined for */
143 andl $1, %ecx
144 jecxz 4f /* 17 is odd, so with the constants set above the block below always runs */
/* Single sample 16: same dot product as in loop 3, stored with a 16-bit write. */
145 movq (%edx),%mm0
146 pmaddwd (%esi),%mm0
147 movq 8(%edx),%mm1
148 pmaddwd 8(%esi),%mm1
149 movq 16(%edx),%mm2
150 pmaddwd 16(%esi),%mm2
151 movq 24(%edx),%mm3
152 pmaddwd 24(%esi),%mm3
153 paddd %mm1,%mm0
154 paddd %mm2,%mm0
155 paddd %mm3,%mm0
156 movq %mm0,%mm1
157 psrlq $32,%mm1
158 paddd %mm1,%mm0
159 psrad $13,%mm0
160 packssdw %mm0,%mm0
161 movd %mm0,%eax
162 movw %ax, (%edi) /* only the low word is the sample */
163 leal 32(%esi),%esi
164 leal 64(%edx),%edx
165 leal 4(%edi),%edi
166 4:
/* Second half: the buffer pointer now walks backwards (32 bytes per sample) while the
   window pointer keeps walking forwards. After the +32 above, -64 leaves esi 32 bytes
   below the position used for the single sample. */
167 subl $64,%esi
168 movl $7,%ecx /* 7 iterations, two samples per iteration */
169 ALIGN16
170 5:
/* Loop 5: mm0..mm3 use buffer[esi+0..31], mm4..mm7 use buffer[esi-32..-1]. */
171 movq (%edx),%mm0
172 movq 64(%edx),%mm4
173 pmaddwd (%esi),%mm0
174 pmaddwd -32(%esi),%mm4
175 movq 8(%edx),%mm1
176 movq 72(%edx),%mm5
177 pmaddwd 8(%esi),%mm1
178 pmaddwd -24(%esi),%mm5
179 movq 16(%edx),%mm2
180 movq 80(%edx),%mm6
181 pmaddwd 16(%esi),%mm2
182 pmaddwd -16(%esi),%mm6
183 movq 24(%edx),%mm3
184 movq 88(%edx),%mm7
185 pmaddwd 24(%esi),%mm3
186 pmaddwd -8(%esi),%mm7
187 paddd %mm1,%mm0
188 paddd %mm5,%mm4
189 paddd %mm2,%mm0
190 paddd %mm6,%mm4
191 paddd %mm3,%mm0
192 paddd %mm7,%mm4
/* Horizontal add as in loop 3, but the result is left in mm1/mm5 so mm0/mm4 can be zeroed. */
193 movq %mm0,%mm1
194 movq %mm4,%mm5
195 psrlq $32,%mm1
196 psrlq $32,%mm5
197 paddd %mm0,%mm1
198 paddd %mm4,%mm5
199 psrad $13,%mm1
200 psrad $13,%mm5
201 packssdw %mm1,%mm1
202 packssdw %mm5,%mm5
/* Negate with saturation: mm0 = 0 - mm1, mm4 = 0 - mm5 (psubd reg,reg zeroes the register). */
203 psubd %mm0,%mm0
204 psubd %mm4,%mm4
205 psubsw %mm1,%mm0
206 psubsw %mm5,%mm4
/* Same masked merge into the output quadword as in loop 3. */
207 movq (%edi), %mm1
208 punpckldq %mm4, %mm0
209 pand one_null, %mm1
210 pand null_one, %mm0
211 por %mm0, %mm1
212 movq %mm1,(%edi)
213 subl $64,%esi
214 addl $128,%edx
215 leal 8(%edi),%edi
216 decl %ecx
217 jnz 5b
/* Final single sample: negated dot product, stored with a 16-bit write; pointers are not advanced. */
218 movq (%edx),%mm0
219 pmaddwd (%esi),%mm0
220 movq 8(%edx),%mm1
221 pmaddwd 8(%esi),%mm1
222 movq 16(%edx),%mm2
223 pmaddwd 16(%esi),%mm2
224 movq 24(%edx),%mm3
225 pmaddwd 24(%esi),%mm3
226 paddd %mm1,%mm0
227 paddd %mm2,%mm0
228 paddd %mm3,%mm0
229 movq %mm0,%mm1
230 psrlq $32,%mm1
231 paddd %mm0,%mm1
232 psrad $13,%mm1
233 packssdw %mm1,%mm1
234 psubd %mm0,%mm0
235 psubsw %mm1,%mm0
236 movd %mm0,%eax
237 movw %ax,(%edi)
238 emms /* leave MMX state so the x87 FPU is usable again by the caller */
239
240 /* NO_APP */
/* Epilogue: restore the saved registers in reverse push order and drop the TEMP slot. */
241 popl %ebx
242 popl %esi
243 popl %edi
244 addl $4,%esp
245 popl %ebp
246 ret