2 dct64_altivec.c: Discrete Cosine Transform (DCT) for Altivec
4 copyright ?-2006 by the mpg123 project - free software under the terms of the LGPL 2.1
5 see COPYING and AUTHORS files in distribution or http://mpg123.org
6 initially written by Michael Hipp
7 altivec optimization by tmkk
11 * Discrete Cosine Transform (DCT) for subband synthesis
13 * -funroll-loops (for gcc) will remove the loops for better performance
14 * using loops in the source-code enhances readability
17 * TODO: write an optimized version for the down-sampling modes
18 * (in these modes the bands 16-31 (2:1) or 8-31 (4:1) are zero
21 #include "mpg123lib_intern.h"
27 void dct64_altivec(real
*out0
,real
*out1
,real
*samples
)
29 ALIGNED(16) real bufs
[32];
32 register real
*b1
,*costab
;
34 vector
unsigned char vinvert
,vperm1
,vperm2
,vperm3
,vperm4
;
35 vector
float v1
,v2
,v3
,v4
,v5
,v6
,v7
,v8
;
36 vector
float vbs1
,vbs2
,vbs3
,vbs4
,vbs5
,vbs6
,vbs7
,vbs8
;
37 vector
float vbs9
,vbs10
,vbs11
,vbs12
,vbs13
,vbs14
,vbs15
,vbs16
;
42 vzero
= vec_xor(vzero
,vzero
);
44 vinvert
= (vector
unsigned char)(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3);
46 vinvert
= (vector
unsigned char){12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3};
48 vperm1
= vec_lvsl(0,b1
);
49 vperm2
= vec_perm(vperm1
,vperm1
,vinvert
);
55 v5
= vec_perm(v1
,v2
,vperm1
); /* b1[0,1,2,3] */
56 v6
= vec_perm(v3
,v4
,vperm2
); /* b1[31,30,29,28] */
58 vbs1
= vec_add(v5
,v6
);
59 vbs8
= vec_sub(v5
,v6
);
63 v5
= vec_perm(v2
,v1
,vperm1
); /* b1[4,5,6,7] */
64 v6
= vec_perm(v4
,v3
,vperm2
); /* b1[27,26,25,24] */
66 vbs2
= vec_add(v5
,v6
);
67 vbs7
= vec_sub(v5
,v6
);
71 v5
= vec_perm(v1
,v2
,vperm1
); /* b1[8,9,10,11] */
72 v6
= vec_perm(v3
,v4
,vperm2
); /* b1[23,22,21,20] */
74 vbs3
= vec_add(v5
,v6
);
75 vbs6
= vec_sub(v5
,v6
);
78 v5
= vec_perm(v2
,v1
,vperm1
); /* b1[12,13,14,15] */
79 v6
= vec_perm(v1
,v3
,vperm2
); /* b1[19,18,17,16] */
81 vbs4
= vec_add(v5
,v6
);
82 vbs5
= vec_sub(v5
,v6
);
84 v1
= vec_ld(0,costab
);
85 vbs8
= vec_madd(vbs8
,v1
,vzero
);
86 v2
= vec_ld(16,costab
);
87 vbs7
= vec_madd(vbs7
,v2
,vzero
);
88 v3
= vec_ld(32,costab
);
89 vbs6
= vec_madd(vbs6
,v3
,vzero
);
90 v4
= vec_ld(48,costab
);
91 vbs5
= vec_madd(vbs5
,v4
,vzero
);
92 vbs6
= vec_perm(vbs6
,vbs6
,vinvert
);
93 vbs5
= vec_perm(vbs5
,vbs5
,vinvert
);
98 v1
= vec_perm(vbs4
,vbs4
,vinvert
);
99 vbs9
= vec_add(vbs1
,v1
);
100 v3
= vec_sub(vbs1
,v1
);
101 v5
= vec_ld(0,costab
);
102 v2
= vec_perm(vbs3
,vbs3
,vinvert
);
103 vbs10
= vec_add(vbs2
,v2
);
104 v4
= vec_sub(vbs2
,v2
);
105 v6
= vec_ld(16,costab
);
106 vbs12
= vec_madd(v3
,v5
,vzero
);
107 vbs11
= vec_madd(v4
,v6
,vzero
);
109 v7
= vec_sub(vbs7
,vbs6
);
110 v8
= vec_sub(vbs8
,vbs5
);
111 vbs13
= vec_add(vbs5
,vbs8
);
112 vbs14
= vec_add(vbs6
,vbs7
);
113 vbs15
= vec_madd(v7
,v6
,vzero
);
114 vbs16
= vec_madd(v8
,v5
,vzero
);
119 v1
= vec_perm(vbs10
,vbs10
,vinvert
);
120 v5
= vec_perm(vbs14
,vbs14
,vinvert
);
121 vbs1
= vec_add(v1
,vbs9
);
122 vbs5
= vec_add(v5
,vbs13
);
123 v2
= vec_sub(vbs9
,v1
);
124 v6
= vec_sub(vbs13
,v5
);
125 v3
= vec_ld(0,costab
);
126 vbs11
= vec_perm(vbs11
,vbs11
,vinvert
);
127 vbs15
= vec_perm(vbs15
,vbs15
,vinvert
);
128 vbs3
= vec_add(vbs11
,vbs12
);
129 vbs7
= vec_add(vbs15
,vbs16
);
130 v4
= vec_sub(vbs12
,vbs11
);
131 v7
= vec_sub(vbs16
,vbs15
);
132 vbs2
= vec_madd(v2
,v3
,vzero
);
133 vbs4
= vec_madd(v4
,v3
,vzero
);
134 vbs6
= vec_madd(v6
,v3
,vzero
);
135 vbs8
= vec_madd(v7
,v3
,vzero
);
137 vbs2
= vec_perm(vbs2
,vbs2
,vinvert
);
138 vbs4
= vec_perm(vbs4
,vbs4
,vinvert
);
139 vbs6
= vec_perm(vbs6
,vbs6
,vinvert
);
140 vbs8
= vec_perm(vbs8
,vbs8
,vinvert
);
146 vperm1
= (vector
unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
147 vperm2
= (vector
unsigned char)(12,13,14,15,8,9,10,11,28,29,30,31,24,25,26,27);
148 vperm3
= (vector
unsigned char)(0,1,2,3,4,5,6,7,20,21,22,23,16,17,18,19);
150 vperm1
= (vector
unsigned char){0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23};
151 vperm2
= (vector
unsigned char){12,13,14,15,8,9,10,11,28,29,30,31,24,25,26,27};
152 vperm3
= (vector
unsigned char){0,1,2,3,4,5,6,7,20,21,22,23,16,17,18,19};
154 vperm4
= vec_add(vperm3
,vec_splat_u8(8));
156 v1
= vec_ld(0,costab
);
157 v2
= vec_splat(v1
,0);
158 v3
= vec_splat(v1
,1);
159 v1
= vec_mergeh(v2
,v3
);
161 v2
= vec_perm(vbs1
,vbs3
,vperm1
);
162 v3
= vec_perm(vbs2
,vbs4
,vperm1
);
163 v4
= vec_perm(vbs1
,vbs3
,vperm2
);
164 v5
= vec_perm(vbs2
,vbs4
,vperm2
);
169 v4
= vec_madd(v6
,v1
,vzero
);
170 v5
= vec_nmsub(v7
,v1
,vzero
);
171 vbs9
= vec_perm(v2
,v4
,vperm3
);
172 vbs11
= vec_perm(v2
,v4
,vperm4
);
173 vbs10
= vec_perm(v3
,v5
,vperm3
);
174 vbs12
= vec_perm(v3
,v5
,vperm4
);
176 v2
= vec_perm(vbs5
,vbs7
,vperm1
);
177 v3
= vec_perm(vbs6
,vbs8
,vperm1
);
178 v4
= vec_perm(vbs5
,vbs7
,vperm2
);
179 v5
= vec_perm(vbs6
,vbs8
,vperm2
);
184 v4
= vec_madd(v6
,v1
,vzero
);
185 v5
= vec_nmsub(v7
,v1
,vzero
);
186 vbs13
= vec_perm(v2
,v4
,vperm3
);
187 vbs15
= vec_perm(v2
,v4
,vperm4
);
188 vbs14
= vec_perm(v3
,v5
,vperm3
);
189 vbs16
= vec_perm(v3
,v5
,vperm4
);
194 v1
= vec_lde(0,costab
);
196 v2
= (vector
float)(1.0f
,-1.0f
,1.0f
,-1.0f
);
198 v2
= (vector
float){1.0f
,-1.0f
,1.0f
,-1.0f
};
200 v3
= vec_splat(v1
,0);
201 v1
= vec_madd(v2
,v3
,vzero
);
203 v2
= vec_mergeh(vbs9
,vbs10
);
204 v3
= vec_mergel(vbs9
,vbs10
);
205 v4
= vec_mergeh(vbs11
,vbs12
);
206 v5
= vec_mergel(vbs11
,vbs12
);
207 v6
= vec_mergeh(v2
,v3
);
208 v7
= vec_mergel(v2
,v3
);
209 v2
= vec_mergeh(v4
,v5
);
210 v3
= vec_mergel(v4
,v5
);
215 v2
= vec_madd(v4
,v1
,vzero
);
216 v3
= vec_madd(v5
,v1
,vzero
);
217 vbs1
= vec_mergeh(v6
,v2
);
218 vbs2
= vec_mergel(v6
,v2
);
219 vbs3
= vec_mergeh(v7
,v3
);
220 vbs4
= vec_mergel(v7
,v3
);
222 v2
= vec_mergeh(vbs13
,vbs14
);
223 v3
= vec_mergel(vbs13
,vbs14
);
224 v4
= vec_mergeh(vbs15
,vbs16
);
225 v5
= vec_mergel(vbs15
,vbs16
);
226 v6
= vec_mergeh(v2
,v3
);
227 v7
= vec_mergel(v2
,v3
);
228 v2
= vec_mergeh(v4
,v5
);
229 v3
= vec_mergel(v4
,v5
);
234 v2
= vec_madd(v4
,v1
,vzero
);
235 v3
= vec_madd(v5
,v1
,vzero
);
236 vbs5
= vec_mergeh(v6
,v2
);
237 vbs6
= vec_mergel(v6
,v2
);
238 vbs7
= vec_mergeh(v7
,v3
);
239 vbs8
= vec_mergel(v7
,v3
);
242 vec_st(vbs2
,16,bufs
);
243 vec_st(vbs3
,32,bufs
);
244 vec_st(vbs4
,48,bufs
);
245 vec_st(vbs5
,64,bufs
);
246 vec_st(vbs6
,80,bufs
);
247 vec_st(vbs7
,96,bufs
);
248 vec_st(vbs8
,112,bufs
);
255 for(b1
=bufs
,i
=8;i
;i
--,b1
+=4)
258 for(b1
=bufs
,i
=4;i
;i
--,b1
+=8)
265 for(b1
=bufs
,i
=2;i
;i
--,b1
+=16)
278 out0
[0x10*16] = bufs
[0];
279 out0
[0x10*15] = bufs
[16+0] + bufs
[16+8];
280 out0
[0x10*14] = bufs
[8];
281 out0
[0x10*13] = bufs
[16+8] + bufs
[16+4];
282 out0
[0x10*12] = bufs
[4];
283 out0
[0x10*11] = bufs
[16+4] + bufs
[16+12];
284 out0
[0x10*10] = bufs
[12];
285 out0
[0x10* 9] = bufs
[16+12] + bufs
[16+2];
286 out0
[0x10* 8] = bufs
[2];
287 out0
[0x10* 7] = bufs
[16+2] + bufs
[16+10];
288 out0
[0x10* 6] = bufs
[10];
289 out0
[0x10* 5] = bufs
[16+10] + bufs
[16+6];
290 out0
[0x10* 4] = bufs
[6];
291 out0
[0x10* 3] = bufs
[16+6] + bufs
[16+14];
292 out0
[0x10* 2] = bufs
[14];
293 out0
[0x10* 1] = bufs
[16+14] + bufs
[16+1];
294 out0
[0x10* 0] = bufs
[1];
296 out1
[0x10* 0] = bufs
[1];
297 out1
[0x10* 1] = bufs
[16+1] + bufs
[16+9];
298 out1
[0x10* 2] = bufs
[9];
299 out1
[0x10* 3] = bufs
[16+9] + bufs
[16+5];
300 out1
[0x10* 4] = bufs
[5];
301 out1
[0x10* 5] = bufs
[16+5] + bufs
[16+13];
302 out1
[0x10* 6] = bufs
[13];
303 out1
[0x10* 7] = bufs
[16+13] + bufs
[16+3];
304 out1
[0x10* 8] = bufs
[3];
305 out1
[0x10* 9] = bufs
[16+3] + bufs
[16+11];
306 out1
[0x10*10] = bufs
[11];
307 out1
[0x10*11] = bufs
[16+11] + bufs
[16+7];
308 out1
[0x10*12] = bufs
[7];
309 out1
[0x10*13] = bufs
[16+7] + bufs
[16+15];
310 out1
[0x10*14] = bufs
[15];
311 out1
[0x10*15] = bufs
[16+15];