/*
	decode.c: decoding samples...

	copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Michael Hipp
	altivec optimization by tmkk
*/

#include "mpg123lib_intern.h"

#ifndef __APPLE__
#include <altivec.h>
#endif
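/* Each synth_1to1_* function below turns 32 subband samples (bandPtr) into
   32 PCM samples for one channel (the _stereo_ variants do both channels in
   one pass).  The scheme is the classic mpg123 synthesis: dct64_altivec()
   feeds the rolling history buffers in fr->real_buffs, and each output
   sample is a 16-tap dot product of the synthesis window (fr->decwin)
   against that history.  The variants differ only in the output stage:
   16-bit shorts with clipping statistics, floats scaled to +/-1.0, or
   32-bit integers scaled by 2^16. */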
/* A macro for normal synth functions */
#define SYNTH_ALTIVEC(B0STEP) \
	v1 = vec_ld(0,window); \
	v2 = vec_ld(16,window); \
	v3 = vec_ld(32,window); \
	v4 = vec_ld(48,window); \
	v5 = vec_ld(64,window); \
	v1 = vec_perm(v1,v2,vperm1); \
	v6 = vec_ld(0,b0); \
	v2 = vec_perm(v2,v3,vperm1); \
	v7 = vec_ld(16,b0); \
	v3 = vec_perm(v3,v4,vperm1); \
	v8 = vec_ld(32,b0); \
	v4 = vec_perm(v4,v5,vperm1); \
	v9 = vec_ld(48,b0); \
	\
	vsum = vec_madd(v1,v6,vzero); \
	vsum = vec_madd(v2,v7,vsum); \
	vsum = vec_madd(v3,v8,vsum); \
	vsum = vec_madd(v4,v9,vsum); \
	\
	window += 32; \
	b0 += B0STEP; \
	\
	v1 = vec_ld(0,window); \
	v2 = vec_ld(16,window); \
	v3 = vec_ld(32,window); \
	v4 = vec_ld(48,window); \
	v5 = vec_ld(64,window); \
	v1 = vec_perm(v1,v2,vperm1); \
	v6 = vec_ld(0,b0); \
	v2 = vec_perm(v2,v3,vperm1); \
	v7 = vec_ld(16,b0); \
	v3 = vec_perm(v3,v4,vperm1); \
	v8 = vec_ld(32,b0); \
	v4 = vec_perm(v4,v5,vperm1); \
	v9 = vec_ld(48,b0); \
	\
	vsum2 = vec_madd(v1,v6,vzero); \
	vsum2 = vec_madd(v2,v7,vsum2); \
	vsum2 = vec_madd(v3,v8,vsum2); \
	vsum2 = vec_madd(v4,v9,vsum2); \
	\
	window += 32; \
	b0 += B0STEP; \
	\
	v1 = vec_ld(0,window); \
	v2 = vec_ld(16,window); \
	v3 = vec_ld(32,window); \
	v4 = vec_ld(48,window); \
	v5 = vec_ld(64,window); \
	v1 = vec_perm(v1,v2,vperm1); \
	v6 = vec_ld(0,b0); \
	v2 = vec_perm(v2,v3,vperm1); \
	v7 = vec_ld(16,b0); \
	v3 = vec_perm(v3,v4,vperm1); \
	v8 = vec_ld(32,b0); \
	v4 = vec_perm(v4,v5,vperm1); \
	v9 = vec_ld(48,b0); \
	\
	vsum3 = vec_madd(v1,v6,vzero); \
	vsum3 = vec_madd(v2,v7,vsum3); \
	vsum3 = vec_madd(v3,v8,vsum3); \
	vsum3 = vec_madd(v4,v9,vsum3); \
	\
	window += 32; \
	b0 += B0STEP; \
	\
	v1 = vec_ld(0,window); \
	v2 = vec_ld(16,window); \
	v3 = vec_ld(32,window); \
	v4 = vec_ld(48,window); \
	v5 = vec_ld(64,window); \
	v1 = vec_perm(v1,v2,vperm1); \
	v6 = vec_ld(0,b0); \
	v2 = vec_perm(v2,v3,vperm1); \
	v7 = vec_ld(16,b0); \
	v3 = vec_perm(v3,v4,vperm1); \
	v8 = vec_ld(32,b0); \
	v4 = vec_perm(v4,v5,vperm1); \
	v9 = vec_ld(48,b0); \
	\
	vsum4 = vec_madd(v1,v6,vzero); \
	vsum4 = vec_madd(v2,v7,vsum4); \
	vsum4 = vec_madd(v3,v8,vsum4); \
	vsum4 = vec_madd(v4,v9,vsum4); \
	\
	window += 32; \
	b0 += B0STEP; \
	\
	v1 = vec_mergeh(vsum,vsum3); \
	v2 = vec_mergeh(vsum2,vsum4); \
	v3 = vec_mergel(vsum,vsum3); \
	v4 = vec_mergel(vsum2,vsum4); \
	v5 = vec_mergeh(v1,v2); \
	v6 = vec_mergel(v1,v2); \
	v7 = vec_mergeh(v3,v4); \
	v8 = vec_mergel(v3,v4);
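/* The mergeh/mergel ladder above is a 4x4 transpose: vsum..vsum4 hold the
   per-lane partial sums of four output samples, and after the transpose
   v5..v8 hold, for all four samples at once, the partial sums of taps
   0,4,8,12 / 1,5,9,13 / 2,6,10,14 / 3,7,11,15 respectively.  The callers
   fold these four vectors into the final dot products. */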
/* A macro for stereo synth functions */
#define SYNTH_STEREO_ALTIVEC(B0STEP) \
	v1 = vec_ld(0,window); \
	v2 = vec_ld(16,window); \
	v3 = vec_ld(32,window); \
	v4 = vec_ld(48,window); \
	v5 = vec_ld(64,window); \
	v1 = vec_perm(v1,v2,vperm1); \
	v6 = vec_ld(0,b0l); \
	v10 = vec_ld(0,b0r); \
	v2 = vec_perm(v2,v3,vperm1); \
	v7 = vec_ld(16,b0l); \
	v11 = vec_ld(16,b0r); \
	v3 = vec_perm(v3,v4,vperm1); \
	v8 = vec_ld(32,b0l); \
	v12 = vec_ld(32,b0r); \
	v4 = vec_perm(v4,v5,vperm1); \
	v9 = vec_ld(48,b0l); \
	v13 = vec_ld(48,b0r); \
	\
	vsum = vec_madd(v1,v6,vzero); \
	vsum5 = vec_madd(v1,v10,vzero); \
	vsum = vec_madd(v2,v7,vsum); \
	vsum5 = vec_madd(v2,v11,vsum5); \
	vsum = vec_madd(v3,v8,vsum); \
	vsum5 = vec_madd(v3,v12,vsum5); \
	vsum = vec_madd(v4,v9,vsum); \
	vsum5 = vec_madd(v4,v13,vsum5); \
	\
	window += 32; \
	b0l += B0STEP; \
	b0r += B0STEP; \
	\
	v1 = vec_ld(0,window); \
	v2 = vec_ld(16,window); \
	v3 = vec_ld(32,window); \
	v4 = vec_ld(48,window); \
	v5 = vec_ld(64,window); \
	v1 = vec_perm(v1,v2,vperm1); \
	v6 = vec_ld(0,b0l); \
	v10 = vec_ld(0,b0r); \
	v2 = vec_perm(v2,v3,vperm1); \
	v7 = vec_ld(16,b0l); \
	v11 = vec_ld(16,b0r); \
	v3 = vec_perm(v3,v4,vperm1); \
	v8 = vec_ld(32,b0l); \
	v12 = vec_ld(32,b0r); \
	v4 = vec_perm(v4,v5,vperm1); \
	v9 = vec_ld(48,b0l); \
	v13 = vec_ld(48,b0r); \
	\
	vsum2 = vec_madd(v1,v6,vzero); \
	vsum6 = vec_madd(v1,v10,vzero); \
	vsum2 = vec_madd(v2,v7,vsum2); \
	vsum6 = vec_madd(v2,v11,vsum6); \
	vsum2 = vec_madd(v3,v8,vsum2); \
	vsum6 = vec_madd(v3,v12,vsum6); \
	vsum2 = vec_madd(v4,v9,vsum2); \
	vsum6 = vec_madd(v4,v13,vsum6); \
	\
	window += 32; \
	b0l += B0STEP; \
	b0r += B0STEP; \
	\
	v1 = vec_ld(0,window); \
	v2 = vec_ld(16,window); \
	v3 = vec_ld(32,window); \
	v4 = vec_ld(48,window); \
	v5 = vec_ld(64,window); \
	v1 = vec_perm(v1,v2,vperm1); \
	v6 = vec_ld(0,b0l); \
	v10 = vec_ld(0,b0r); \
	v2 = vec_perm(v2,v3,vperm1); \
	v7 = vec_ld(16,b0l); \
	v11 = vec_ld(16,b0r); \
	v3 = vec_perm(v3,v4,vperm1); \
	v8 = vec_ld(32,b0l); \
	v12 = vec_ld(32,b0r); \
	v4 = vec_perm(v4,v5,vperm1); \
	v9 = vec_ld(48,b0l); \
	v13 = vec_ld(48,b0r); \
	\
	vsum3 = vec_madd(v1,v6,vzero); \
	vsum7 = vec_madd(v1,v10,vzero); \
	vsum3 = vec_madd(v2,v7,vsum3); \
	vsum7 = vec_madd(v2,v11,vsum7); \
	vsum3 = vec_madd(v3,v8,vsum3); \
	vsum7 = vec_madd(v3,v12,vsum7); \
	vsum3 = vec_madd(v4,v9,vsum3); \
	vsum7 = vec_madd(v4,v13,vsum7); \
	\
	window += 32; \
	b0l += B0STEP; \
	b0r += B0STEP; \
	\
	v1 = vec_ld(0,window); \
	v2 = vec_ld(16,window); \
	v3 = vec_ld(32,window); \
	v4 = vec_ld(48,window); \
	v5 = vec_ld(64,window); \
	v1 = vec_perm(v1,v2,vperm1); \
	v6 = vec_ld(0,b0l); \
	v10 = vec_ld(0,b0r); \
	v2 = vec_perm(v2,v3,vperm1); \
	v7 = vec_ld(16,b0l); \
	v11 = vec_ld(16,b0r); \
	v3 = vec_perm(v3,v4,vperm1); \
	v8 = vec_ld(32,b0l); \
	v12 = vec_ld(32,b0r); \
	v4 = vec_perm(v4,v5,vperm1); \
	v9 = vec_ld(48,b0l); \
	v13 = vec_ld(48,b0r); \
	\
	vsum4 = vec_madd(v1,v6,vzero); \
	vsum8 = vec_madd(v1,v10,vzero); \
	vsum4 = vec_madd(v2,v7,vsum4); \
	vsum8 = vec_madd(v2,v11,vsum8); \
	vsum4 = vec_madd(v3,v8,vsum4); \
	vsum8 = vec_madd(v3,v12,vsum8); \
	vsum4 = vec_madd(v4,v9,vsum4); \
	vsum8 = vec_madd(v4,v13,vsum8); \
	\
	window += 32; \
	b0l += B0STEP; \
	b0r += B0STEP; \
	\
	v1 = vec_mergeh(vsum,vsum3); \
	v5 = vec_mergeh(vsum5,vsum7); \
	v2 = vec_mergeh(vsum2,vsum4); \
	v6 = vec_mergeh(vsum6,vsum8); \
	v3 = vec_mergel(vsum,vsum3); \
	v7 = vec_mergel(vsum5,vsum7); \
	v4 = vec_mergel(vsum2,vsum4); \
	v8 = vec_mergel(vsum6,vsum8); \
	vsum = vec_mergeh(v1,v2); \
	vsum5 = vec_mergeh(v5,v6); \
	vsum2 = vec_mergel(v1,v2); \
	vsum6 = vec_mergel(v5,v6); \
	vsum3 = vec_mergeh(v3,v4); \
	vsum7 = vec_mergeh(v7,v8); \
	vsum4 = vec_mergel(v3,v4); \
	vsum8 = vec_mergel(v7,v8);
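/* For reference, one output sample of the first window half is, in scalar
   form (modulo the exact decwin indexing):

	sum = w[0]*b0[0] - w[1]*b0[1] + w[2]*b0[2] - ... - w[15]*b0[15];

   and of the second half (all taps added):

	sum = w[0]*b0[0] + w[1]*b0[1] + ... + w[15]*b0[15];

   The vector code gets the alternating signs almost for free: the lanes of
   each vsum accumulate taps 0,4,8,12 / 1,5,9,13 / 2,6,10,14 / 3,7,11,15,
   so after the transpose a vec_sub of the odd-tap vectors implements the
   sign flips, while the second half folds with vec_add throughout. */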
int synth_1to1_altivec(real *bandPtr, int channel, mpg123_handle *fr, int final)
{
	short *samples = (short *) (fr->buffer.data+fr->buffer.fill);

	real *b0, **buf;
	int clip;
	int bo1;

	if(fr->have_eq_settings) do_equalizer(bandPtr,channel,fr->equalizer);

	if(!channel)
	{
		fr->bo--;
		fr->bo &= 0xf;
		buf = fr->real_buffs[0];
	}
	else
	{
		samples++;
		buf = fr->real_buffs[1];
	}

	if(fr->bo & 0x1)
	{
		b0 = buf[0];
		bo1 = fr->bo;
		dct64_altivec(buf[1]+((fr->bo+1)&0xf),buf[0]+fr->bo,bandPtr);
	}
	else
	{
		b0 = buf[1];
		bo1 = fr->bo+1;
		dct64_altivec(buf[0]+fr->bo,buf[1]+fr->bo+1,bandPtr);
	}
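	/* fr->bo is a rotating offset (0..15) into the two history buffers;
	   dct64_altivec() just wrote the newest spectral values, split between
	   buf[0] and buf[1].  The parity of fr->bo decides which buffer
	   supplies b0 (the 16-sample columns for the dot products) and how the
	   window pointer must be phased via bo1. */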
	{
		register int j;
		real *window = fr->decwin + 16 - bo1;

		ALIGNED(16) int clip_tmp[4];
		vector float v1,v2,v3,v4,v5,v6,v7,v8,v9;
		vector unsigned char vperm1,vperm2,vperm3,vperm4;
		vector float vsum,vsum2,vsum3,vsum4,vmin,vmax,vzero;
		vector signed int vclip;
		vector signed short vsample1,vsample2;
		vector unsigned int vshift;
		vclip = vec_xor(vclip,vclip);
		vzero = vec_xor(vzero,vzero);
		vshift = vec_splat_u32(-1); /* each element 0xffffffff; as a shift count that is 31 */
#ifdef __APPLE__
		vmax = (vector float)(32767.0f);
		vmin = (vector float)(-32768.0f);
		vperm4 = (vector unsigned char)(0,1,18,19,2,3,22,23,4,5,26,27,6,7,30,31);
#else
		vmax = (vector float){32767.0f,32767.0f,32767.0f,32767.0f};
		vmin = (vector float){-32768.0f,-32768.0f,-32768.0f,-32768.0f};
		vperm4 = (vector unsigned char){0,1,18,19,2,3,22,23,4,5,26,27,6,7,30,31};
#endif

		vperm1 = vec_lvsl(0,window);
		vperm2 = vec_lvsl(0,samples);
		vperm3 = vec_lvsr(0,samples);
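		/* Each of the two four-iteration loops below emits four samples per
		   pass: the first walks b0 forward (+16) through the window's
		   alternating-sign half, the second walks it backward (-16).
		   Because samples is only 2-byte aligned (interleaved stereo
		   shorts), each store round-trips through the vec_lvsl/vec_lvsr
		   permutes: load the surrounding memory, splice the four new shorts
		   in at stride 2 (vperm4), and store both halves back.  Clipping is
		   counted branch-free: vec_cmpgt/vec_cmplt yield all-ones masks,
		   vec_sr by 31 reduces each mask to 0 or 1, and vec_sums
		   accumulates the total into element 3 of vclip. */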
		for (j=4;j;j--)
		{
			SYNTH_ALTIVEC(16);

			vsum = vec_sub(v5,v6);
			v9 = vec_sub(v7,v8);
			vsum = vec_add(vsum,v9);

			v3 = vec_round(vsum);
			v3 = (vector float)vec_cts(v3,0);
			v1 = (vector float)vec_cmpgt(vsum,vmax);
			v2 = (vector float)vec_cmplt(vsum,vmin);
			vsample1 = vec_ld(0,samples);
			vsample2 = vec_ld(15,samples);
			v3 = (vector float)vec_packs((vector signed int)v3,(vector signed int)v3);
			v4 = (vector float)vec_perm(vsample1,vsample2,vperm2);
			v5 = (vector float)vec_perm(v3,v4,vperm4);
			v6 = (vector float)vec_perm(vsample2,vsample1,vperm2);
			v7 = (vector float)vec_perm(v5,v6,vperm3);
			v8 = (vector float)vec_perm(v6,v5,vperm3);
			vec_st((vector signed short)v7,15,samples);
			vec_st((vector signed short)v8,0,samples);
			samples += 8;

			v1 = (vector float)vec_sr((vector unsigned int)v1, vshift);
			v2 = (vector float)vec_sr((vector unsigned int)v2, vshift);
			v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			vclip = vec_sums((vector signed int)v1,vclip);
		}

		for (j=4;j;j--)
		{
			SYNTH_ALTIVEC(-16);

			vsum = vec_add(v5,v6);
			v9 = vec_add(v7,v8);
			vsum = vec_add(vsum,v9);

			v3 = vec_round(vsum);
			v3 = (vector float)vec_cts(v3,0);
			v1 = (vector float)vec_cmpgt(vsum,vmax);
			v2 = (vector float)vec_cmplt(vsum,vmin);
			vsample1 = vec_ld(0,samples);
			vsample2 = vec_ld(15,samples);
			v3 = (vector float)vec_packs((vector signed int)v3,(vector signed int)v3);
			v4 = (vector float)vec_perm(vsample1,vsample2,vperm2);
			v5 = (vector float)vec_perm(v3,v4,vperm4);
			v6 = (vector float)vec_perm(vsample2,vsample1,vperm2);
			v7 = (vector float)vec_perm(v5,v6,vperm3);
			v8 = (vector float)vec_perm(v6,v5,vperm3);
			vec_st((vector signed short)v7,15,samples);
			vec_st((vector signed short)v8,0,samples);
			samples += 8;

			v1 = (vector float)vec_sr((vector unsigned int)v1, vshift);
			v2 = (vector float)vec_sr((vector unsigned int)v2, vshift);
			v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			vclip = vec_sums((vector signed int)v1,vclip);
		}

		vec_st(vclip,0,clip_tmp);
		clip = clip_tmp[3];
	}
	if(final) fr->buffer.fill += 128;

	return clip;
}
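/* The stereo variant computes both channels in one pass (vsum..vsum4 for
   left, vsum5..vsum8 for right), so every iteration emits one full,
   already interleaved vector of eight shorts.  Instead of the load/merge/
   store dance above, it keeps a carry vector, vprev, holding the bytes
   that belong to the next, possibly unaligned, store; the
   if((size_t)samples & 0xf) block after the loops flushes the last
   partial vector. */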
int synth_1to1_stereo_altivec(real *bandPtr_l, real *bandPtr_r, mpg123_handle *fr)
{
	short *samples = (short *) (fr->buffer.data+fr->buffer.fill);

	real *b0l, *b0r, **bufl, **bufr;
	int clip;
	int bo1;

	if(fr->have_eq_settings)
	{
		do_equalizer(bandPtr_l,0,fr->equalizer);
		do_equalizer(bandPtr_r,1,fr->equalizer);
	}

	fr->bo--;
	fr->bo &= 0xf;
	bufl = fr->real_buffs[0];
	bufr = fr->real_buffs[1];

	if(fr->bo & 0x1)
	{
		b0l = bufl[0];
		b0r = bufr[0];
		bo1 = fr->bo;
		dct64_altivec(bufl[1]+((fr->bo+1)&0xf),bufl[0]+fr->bo,bandPtr_l);
		dct64_altivec(bufr[1]+((fr->bo+1)&0xf),bufr[0]+fr->bo,bandPtr_r);
	}
	else
	{
		b0l = bufl[1];
		b0r = bufr[1];
		bo1 = fr->bo+1;
		dct64_altivec(bufl[0]+fr->bo,bufl[1]+fr->bo+1,bandPtr_l);
		dct64_altivec(bufr[0]+fr->bo,bufr[1]+fr->bo+1,bandPtr_r);
	}

	{
		register int j;
		real *window = fr->decwin + 16 - bo1;

		ALIGNED(16) int clip_tmp[4];
		vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13;
		vector unsigned char vperm1,vperm2;
		vector float vsum,vsum2,vsum3,vsum4,vsum5,vsum6,vsum7,vsum8,vmin,vmax,vzero;
		vector signed int vclip;
		vector unsigned int vshift;
		vector signed short vprev;
		vclip = vec_xor(vclip,vclip);
		vzero = vec_xor(vzero,vzero);
		vshift = vec_splat_u32(-1); /* each element 0xffffffff; as a shift count that is 31 */
#ifdef __APPLE__
		vmax = (vector float)(32767.0f);
		vmin = (vector float)(-32768.0f);
#else
		vmax = (vector float){32767.0f,32767.0f,32767.0f,32767.0f};
		vmin = (vector float){-32768.0f,-32768.0f,-32768.0f,-32768.0f};
#endif

		vperm1 = vec_lvsl(0,window);
		vperm2 = vec_lvsr(0,samples);
		vprev = vec_perm(vec_ld(0,samples),vec_ld(0,samples),vec_lvsl(0,samples));
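		/* vprev starts as the 16 bytes currently in memory at samples,
		   rotated to the front (vec_lvsl).  Each iteration below stores
		   vec_perm(vprev,new,vperm2): the tail of the previous vector plus
		   the head of the new one, which is exactly the aligned 16-byte
		   block that is complete at this point.  When samples is already
		   16-byte aligned, the permute degenerates to the new vector
		   itself. */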
		for (j=4;j;j--)
		{
			SYNTH_STEREO_ALTIVEC(16);

			vsum = vec_sub(vsum,vsum2);
			vsum2 = vec_sub(vsum5,vsum6);
			vsum3 = vec_sub(vsum3,vsum4);
			vsum4 = vec_sub(vsum7,vsum8);
			vsum = vec_add(vsum,vsum3);
			vsum2 = vec_add(vsum2,vsum4);

			v1 = vec_round(vsum);
			v2 = vec_round(vsum2);
			v1 = (vector float)vec_cts(v1,0);
			v2 = (vector float)vec_cts(v2,0);
			v3 = vec_mergeh(v1, v2);
			v4 = vec_mergel(v1, v2);
			v5 = (vector float)vec_packs((vector signed int)v3,(vector signed int)v4);
			v6 = (vector float)vec_perm(vprev,(vector signed short)v5,vperm2);
			vprev = (vector signed short)v5;
			v1 = (vector float)vec_cmpgt(vsum,vmax);
			v2 = (vector float)vec_cmplt(vsum,vmin);
			v3 = (vector float)vec_cmpgt(vsum2,vmax);
			v4 = (vector float)vec_cmplt(vsum2,vmin);
			vec_st((vector signed short)v6,0,samples);

			v1 = (vector float)vec_sr((vector unsigned int)v1, vshift);
			v2 = (vector float)vec_sr((vector unsigned int)v2, vshift);
			v3 = (vector float)vec_sr((vector unsigned int)v3, vshift);
			v4 = (vector float)vec_sr((vector unsigned int)v4, vshift);
			v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			v2 = (vector float)vec_add((vector unsigned int)v3,(vector unsigned int)v4);
			vclip = vec_sums((vector signed int)v1,vclip);
			vclip = vec_sums((vector signed int)v2,vclip);
			samples += 8;
		}

		for (j=4;j;j--)
		{
			SYNTH_STEREO_ALTIVEC(-16);

			vsum = vec_add(vsum,vsum2);
			vsum2 = vec_add(vsum5,vsum6);
			vsum3 = vec_add(vsum3,vsum4);
			vsum4 = vec_add(vsum7,vsum8);
			vsum = vec_add(vsum,vsum3);
			vsum2 = vec_add(vsum2,vsum4);

			v1 = vec_round(vsum);
			v2 = vec_round(vsum2);
			v1 = (vector float)vec_cts(v1,0);
			v2 = (vector float)vec_cts(v2,0);
			v3 = vec_mergeh(v1, v2);
			v4 = vec_mergel(v1, v2);
			v5 = (vector float)vec_packs((vector signed int)v3,(vector signed int)v4);
			v6 = (vector float)vec_perm(vprev,(vector signed short)v5,vperm2);
			vprev = (vector signed short)v5;
			v1 = (vector float)vec_cmpgt(vsum,vmax);
			v2 = (vector float)vec_cmplt(vsum,vmin);
			v3 = (vector float)vec_cmpgt(vsum2,vmax);
			v4 = (vector float)vec_cmplt(vsum2,vmin);
			vec_st((vector signed short)v6,0,samples);

			v1 = (vector float)vec_sr((vector unsigned int)v1, vshift);
			v2 = (vector float)vec_sr((vector unsigned int)v2, vshift);
			v3 = (vector float)vec_sr((vector unsigned int)v3, vshift);
			v4 = (vector float)vec_sr((vector unsigned int)v4, vshift);
			v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			v2 = (vector float)vec_add((vector unsigned int)v3,(vector unsigned int)v4);
			vclip = vec_sums((vector signed int)v1,vclip);
			vclip = vec_sums((vector signed int)v2,vclip);
			samples += 8;
		}

		if((size_t)samples & 0xf)
		{
			v1 = (vector float)vec_perm(vec_ld(0,samples),vec_ld(0,samples),vec_lvsl(0,samples));
			v2 = (vector float)vec_perm(vprev,(vector signed short)v1,vperm2);
			vec_st((vector signed short)v2,0,samples);
		}

		vec_st(vclip,0,clip_tmp);
		clip = clip_tmp[3];
	}
	fr->buffer.fill += 128;

	return clip;
}
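/* The float (real) variants differ from the short ones only in the output
   stage: the raw dot products are scaled by 1/32768 via vec_madd, so full
   scale becomes +/-1.0, and no clipping statistics are kept (the return
   value is simply 0). */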
int synth_1to1_real_altivec(real *bandPtr, int channel, mpg123_handle *fr, int final)
{
	real *samples = (real *) (fr->buffer.data+fr->buffer.fill);

	real *b0, **buf;
	int bo1;

	if(fr->have_eq_settings) do_equalizer(bandPtr,channel,fr->equalizer);

	if(!channel)
	{
		fr->bo--;
		fr->bo &= 0xf;
		buf = fr->real_buffs[0];
	}
	else
	{
		samples++;
		buf = fr->real_buffs[1];
	}

	if(fr->bo & 0x1)
	{
		b0 = buf[0];
		bo1 = fr->bo;
		dct64_altivec(buf[1]+((fr->bo+1)&0xf),buf[0]+fr->bo,bandPtr);
	}
	else
	{
		b0 = buf[1];
		bo1 = fr->bo+1;
		dct64_altivec(buf[0]+fr->bo,buf[1]+fr->bo+1,bandPtr);
	}

	{
		register int j;
		real *window = fr->decwin + 16 - bo1;

		vector float v1,v2,v3,v4,v5,v6,v7,v8,v9;
		vector unsigned char vperm1,vperm2,vperm3,vperm4,vperm5;
		vector float vsum,vsum2,vsum3,vsum4,vscale,vzero;
		vector float vsample1,vsample2,vsample3;
		vzero = vec_xor(vzero, vzero);
#ifdef __APPLE__
		vscale = (vector float)(1.0f/32768.0f);
		vperm4 = (vector unsigned char)(0,1,2,3,20,21,22,23,4,5,6,7,28,29,30,31);
		vperm5 = (vector unsigned char)(8,9,10,11,20,21,22,23,12,13,14,15,28,29,30,31);
#else
		vscale = (vector float){1.0f/32768.0f,1.0f/32768.0f,1.0f/32768.0f,1.0f/32768.0f};
		vperm4 = (vector unsigned char){0,1,2,3,20,21,22,23,4,5,6,7,28,29,30,31};
		vperm5 = (vector unsigned char){8,9,10,11,20,21,22,23,12,13,14,15,28,29,30,31};
#endif

		vperm1 = vec_lvsl(0,window);
		vperm2 = vec_lvsl(0,samples);
		vperm3 = vec_lvsr(0,samples);
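		/* Writing four floats at stride 2 touches up to 48 bytes, so the
		   loops below load the three vectors covering samples[0..7]
		   (offsets 0, 16 and 31), splice the new values into the even
		   slots via vperm4/vperm5, and store all three back, again through
		   the lvsl/lvsr unaligned-access permutes. */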
		for (j=4;j;j--)
		{
			SYNTH_ALTIVEC(16);

			vsum = vec_sub(v5,v6);
			v9 = vec_sub(v7,v8);
			vsum = vec_add(vsum,v9);
			vsum = vec_madd(vsum, vscale, vzero);

			vsample1 = vec_ld(0,samples);
			vsample2 = vec_ld(16,samples);
			vsample3 = vec_ld(31,samples);
			v1 = vec_perm(vsample1, vsample2, vperm2);
			v2 = vec_perm(vsample2, vsample3, vperm2);
			v1 = vec_perm(vsum, v1, vperm4);
			v2 = vec_perm(vsum, v2, vperm5);
			v3 = vec_perm(vsample3, vsample2, vperm2);
			v4 = vec_perm(vsample2, vsample1, vperm2);
			v5 = vec_perm(v2, v3, vperm3);
			v6 = vec_perm(v1, v2, vperm3);
			v7 = vec_perm(v4, v1, vperm3);
			vec_st(v5,31,samples);
			vec_st(v6,16,samples);
			vec_st(v7,0,samples);
			samples += 8;
		}

		for (j=4;j;j--)
		{
			SYNTH_ALTIVEC(-16);

			vsum = vec_add(v5,v6);
			v9 = vec_add(v7,v8);
			vsum = vec_add(vsum,v9);
			vsum = vec_madd(vsum, vscale, vzero);

			vsample1 = vec_ld(0,samples);
			vsample2 = vec_ld(16,samples);
			vsample3 = vec_ld(31,samples);
			v1 = vec_perm(vsample1, vsample2, vperm2);
			v2 = vec_perm(vsample2, vsample3, vperm2);
			v1 = vec_perm(vsum, v1, vperm4);
			v2 = vec_perm(vsum, v2, vperm5);
			v3 = vec_perm(vsample3, vsample2, vperm2);
			v4 = vec_perm(vsample2, vsample1, vperm2);
			v5 = vec_perm(v2, v3, vperm3);
			v6 = vec_perm(v1, v2, vperm3);
			v7 = vec_perm(v4, v1, vperm3);
			vec_st(v5,31,samples);
			vec_st(v6,16,samples);
			vec_st(v7,0,samples);
			samples += 8;
		}
	}
	if(final) fr->buffer.fill += 256;

	return 0;
}
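/* Real stereo: as with the short stereo variant, both channels are windowed
   at once and emitted as fully interleaved vectors, with vprev carrying the
   unaligned-store overlap; two 16-byte stores per iteration cover the eight
   output floats. */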
int synth_1to1_real_stereo_altivec(real *bandPtr_l, real *bandPtr_r, mpg123_handle *fr)
{
	real *samples = (real *) (fr->buffer.data+fr->buffer.fill);

	real *b0l, *b0r, **bufl, **bufr;
	int bo1;

	if(fr->have_eq_settings)
	{
		do_equalizer(bandPtr_l,0,fr->equalizer);
		do_equalizer(bandPtr_r,1,fr->equalizer);
	}

	fr->bo--;
	fr->bo &= 0xf;
	bufl = fr->real_buffs[0];
	bufr = fr->real_buffs[1];

	if(fr->bo & 0x1)
	{
		b0l = bufl[0];
		b0r = bufr[0];
		bo1 = fr->bo;
		dct64_altivec(bufl[1]+((fr->bo+1)&0xf),bufl[0]+fr->bo,bandPtr_l);
		dct64_altivec(bufr[1]+((fr->bo+1)&0xf),bufr[0]+fr->bo,bandPtr_r);
	}
	else
	{
		b0l = bufl[1];
		b0r = bufr[1];
		bo1 = fr->bo+1;
		dct64_altivec(bufl[0]+fr->bo,bufl[1]+fr->bo+1,bandPtr_l);
		dct64_altivec(bufr[0]+fr->bo,bufr[1]+fr->bo+1,bandPtr_r);
	}

	{
		register int j;
		real *window = fr->decwin + 16 - bo1;

		vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13;
		vector unsigned char vperm1,vperm2;
		vector float vsum,vsum2,vsum3,vsum4,vsum5,vsum6,vsum7,vsum8,vscale,vzero;
		vector float vprev;
		vzero = vec_xor(vzero,vzero);
#ifdef __APPLE__
		vscale = (vector float)(1.0f/32768.0f);
#else
		vscale = (vector float){1.0f/32768.0f,1.0f/32768.0f,1.0f/32768.0f,1.0f/32768.0f};
#endif

		vperm1 = vec_lvsl(0,window);
		vperm2 = vec_lvsr(0,samples);
		vprev = vec_perm(vec_ld(0,samples),vec_ld(0,samples),vec_lvsl(0,samples));
		for (j=4;j;j--)
		{
			SYNTH_STEREO_ALTIVEC(16);

			vsum = vec_sub(vsum,vsum2);
			vsum2 = vec_sub(vsum5,vsum6);
			vsum3 = vec_sub(vsum3,vsum4);
			vsum4 = vec_sub(vsum7,vsum8);
			vsum = vec_add(vsum,vsum3);
			vsum2 = vec_add(vsum2,vsum4);
			vsum = vec_madd(vsum, vscale, vzero);
			vsum2 = vec_madd(vsum2, vscale, vzero);

			v1 = vec_mergeh(vsum, vsum2);
			v2 = vec_mergel(vsum, vsum2);
			v3 = vec_perm(vprev,v1,vperm2);
			v4 = vec_perm(v1,v2,vperm2);
			vprev = v2;
			vec_st(v3,0,samples);
			vec_st(v4,16,samples);
			samples += 8;
		}

		for (j=4;j;j--)
		{
			SYNTH_STEREO_ALTIVEC(-16);

			vsum = vec_add(vsum,vsum2);
			vsum2 = vec_add(vsum5,vsum6);
			vsum3 = vec_add(vsum3,vsum4);
			vsum4 = vec_add(vsum7,vsum8);
			vsum = vec_add(vsum,vsum3);
			vsum2 = vec_add(vsum2,vsum4);
			vsum = vec_madd(vsum, vscale, vzero);
			vsum2 = vec_madd(vsum2, vscale, vzero);

			v1 = vec_mergeh(vsum, vsum2);
			v2 = vec_mergel(vsum, vsum2);
			v3 = vec_perm(vprev,v1,vperm2);
			v4 = vec_perm(v1,v2,vperm2);
			vprev = v2;
			vec_st(v3,0,samples);
			vec_st(v4,16,samples);
			samples += 8;
		}

		if((size_t)samples & 0xf)
		{
			v1 = (vector float)vec_perm(vec_ld(0,samples),vec_ld(0,samples),vec_lvsl(0,samples));
			v2 = (vector float)vec_perm(vprev,v1,vperm2);
			vec_st(v2,0,samples);
		}
	}
	fr->buffer.fill += 256;

	return 0;
}
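/* The 32-bit integer variants convert with vec_cts(x,16), which scales by
   2^16 during the float-to-int conversion: a synth value of 32767.999 maps
   to roughly 2^31-1, so the 16-bit clip limits simply move to
   32767.999f/-32768.0f while the output occupies the full int32 range
   (vec_cts saturates on overflow, so the clip count is purely
   diagnostic). */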
int synth_1to1_s32_altivec(real *bandPtr, int channel, mpg123_handle *fr, int final)
{
	int32_t *samples = (int32_t *) (fr->buffer.data+fr->buffer.fill);

	real *b0, **buf;
	int clip;
	int bo1;

	if(fr->have_eq_settings) do_equalizer(bandPtr,channel,fr->equalizer);

	if(!channel)
	{
		fr->bo--;
		fr->bo &= 0xf;
		buf = fr->real_buffs[0];
	}
	else
	{
		samples++;
		buf = fr->real_buffs[1];
	}

	if(fr->bo & 0x1)
	{
		b0 = buf[0];
		bo1 = fr->bo;
		dct64_altivec(buf[1]+((fr->bo+1)&0xf),buf[0]+fr->bo,bandPtr);
	}
	else
	{
		b0 = buf[1];
		bo1 = fr->bo+1;
		dct64_altivec(buf[0]+fr->bo,buf[1]+fr->bo+1,bandPtr);
	}

	{
		register int j;
		real *window = fr->decwin + 16 - bo1;

		ALIGNED(16) int clip_tmp[4];
		vector float v1,v2,v3,v4,v5,v6,v7,v8,v9;
		vector unsigned char vperm1,vperm2,vperm3,vperm4,vperm5;
		vector float vsum,vsum2,vsum3,vsum4,vmax,vmin,vzero;
		vector signed int vsample1,vsample2,vsample3;
		vector unsigned int vshift;
		vector signed int vclip;
		vzero = vec_xor(vzero, vzero);
		vclip = vec_xor(vclip, vclip);
		vshift = vec_splat_u32(-1); /* each element 0xffffffff; as a shift count that is 31 */
#ifdef __APPLE__
		vmax = (vector float)(32767.999f);
		vmin = (vector float)(-32768.0f);
		vperm4 = (vector unsigned char)(0,1,2,3,20,21,22,23,4,5,6,7,28,29,30,31);
		vperm5 = (vector unsigned char)(8,9,10,11,20,21,22,23,12,13,14,15,28,29,30,31);
#else
		vmax = (vector float){32767.999f,32767.999f,32767.999f,32767.999f};
		vmin = (vector float){-32768.0f,-32768.0f,-32768.0f,-32768.0f};
		vperm4 = (vector unsigned char){0,1,2,3,20,21,22,23,4,5,6,7,28,29,30,31};
		vperm5 = (vector unsigned char){8,9,10,11,20,21,22,23,12,13,14,15,28,29,30,31};
#endif

		vperm1 = vec_lvsl(0,window);
		vperm2 = vec_lvsl(0,samples);
		vperm3 = vec_lvsr(0,samples);
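		/* Same store pattern as the real version (three loads, splice at
		   stride 2, three stores); only the conversion differs:
		   vec_cts(v1,16) replaces the vec_round/vec_cts(...,0)/vec_packs
		   chain, and clipping is counted against the pre-conversion float
		   sums in v1. */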
		for (j=4;j;j--)
		{
			SYNTH_ALTIVEC(16);

			vsum = vec_sub(v5,v6);
			v9 = vec_sub(v7,v8);
			v1 = vec_add(vsum,v9);
			vsum = (vector float)vec_cts(v1,16);
			v8 = (vector float)vec_cmpgt(v1,vmax);
			v9 = (vector float)vec_cmplt(v1,vmin);

			vsample1 = vec_ld(0,samples);
			vsample2 = vec_ld(16,samples);
			vsample3 = vec_ld(31,samples);
			v1 = (vector float)vec_perm(vsample1, vsample2, vperm2);
			v2 = (vector float)vec_perm(vsample2, vsample3, vperm2);
			v1 = vec_perm(vsum, v1, vperm4);
			v2 = vec_perm(vsum, v2, vperm5);
			v3 = (vector float)vec_perm(vsample3, vsample2, vperm2);
			v4 = (vector float)vec_perm(vsample2, vsample1, vperm2);
			v5 = vec_perm(v2, v3, vperm3);
			v6 = vec_perm(v1, v2, vperm3);
			v7 = vec_perm(v4, v1, vperm3);
			vec_st((vector signed int)v5,31,samples);
			vec_st((vector signed int)v6,16,samples);
			vec_st((vector signed int)v7,0,samples);
			samples += 8;

			v1 = (vector float)vec_sr((vector unsigned int)v8, vshift);
			v2 = (vector float)vec_sr((vector unsigned int)v9, vshift);
			v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			vclip = vec_sums((vector signed int)v1,vclip);
		}

		for (j=4;j;j--)
		{
			SYNTH_ALTIVEC(-16);

			vsum = vec_add(v5,v6);
			v9 = vec_add(v7,v8);
			v1 = vec_add(vsum,v9);
			vsum = (vector float)vec_cts(v1,16);
			v8 = (vector float)vec_cmpgt(v1,vmax);
			v9 = (vector float)vec_cmplt(v1,vmin);

			vsample1 = vec_ld(0,samples);
			vsample2 = vec_ld(16,samples);
			vsample3 = vec_ld(31,samples);
			v1 = (vector float)vec_perm(vsample1, vsample2, vperm2);
			v2 = (vector float)vec_perm(vsample2, vsample3, vperm2);
			v1 = vec_perm(vsum, v1, vperm4);
			v2 = vec_perm(vsum, v2, vperm5);
			v3 = (vector float)vec_perm(vsample3, vsample2, vperm2);
			v4 = (vector float)vec_perm(vsample2, vsample1, vperm2);
			v5 = vec_perm(v2, v3, vperm3);
			v6 = vec_perm(v1, v2, vperm3);
			v7 = vec_perm(v4, v1, vperm3);
			vec_st((vector signed int)v5,31,samples);
			vec_st((vector signed int)v6,16,samples);
			vec_st((vector signed int)v7,0,samples);
			samples += 8;

			v1 = (vector float)vec_sr((vector unsigned int)v8, vshift);
			v2 = (vector float)vec_sr((vector unsigned int)v9, vshift);
			v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			vclip = vec_sums((vector signed int)v1,vclip);
		}

		vec_st(vclip,0,clip_tmp);
		clip = clip_tmp[3];
	}
	if(final) fr->buffer.fill += 256;

	return clip;
}
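/* s32 stereo: the interleave/vprev logic of the other stereo variants
   combined with the vec_cts(.,16) conversion and the branch-free clip
   counting; eight int32 values (four stereo frames) are stored per
   iteration. */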
int synth_1to1_s32_stereo_altivec(real *bandPtr_l, real *bandPtr_r, mpg123_handle *fr)
{
	int32_t *samples = (int32_t *) (fr->buffer.data+fr->buffer.fill);

	real *b0l, *b0r, **bufl, **bufr;
	int clip;
	int bo1;

	if(fr->have_eq_settings)
	{
		do_equalizer(bandPtr_l,0,fr->equalizer);
		do_equalizer(bandPtr_r,1,fr->equalizer);
	}

	fr->bo--;
	fr->bo &= 0xf;
	bufl = fr->real_buffs[0];
	bufr = fr->real_buffs[1];

	if(fr->bo & 0x1)
	{
		b0l = bufl[0];
		b0r = bufr[0];
		bo1 = fr->bo;
		dct64_altivec(bufl[1]+((fr->bo+1)&0xf),bufl[0]+fr->bo,bandPtr_l);
		dct64_altivec(bufr[1]+((fr->bo+1)&0xf),bufr[0]+fr->bo,bandPtr_r);
	}
	else
	{
		b0l = bufl[1];
		b0r = bufr[1];
		bo1 = fr->bo+1;
		dct64_altivec(bufl[0]+fr->bo,bufl[1]+fr->bo+1,bandPtr_l);
		dct64_altivec(bufr[0]+fr->bo,bufr[1]+fr->bo+1,bandPtr_r);
	}

	{
		register int j;
		real *window = fr->decwin + 16 - bo1;

		ALIGNED(16) int clip_tmp[4];
		vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13;
		vector unsigned char vperm1,vperm2;
		vector float vsum,vsum2,vsum3,vsum4,vsum5,vsum6,vsum7,vsum8,vmax,vmin,vzero;
		vector float vprev;
		vector unsigned int vshift;
		vector signed int vclip;
		vzero = vec_xor(vzero, vzero);
		vclip = vec_xor(vclip, vclip);
		vshift = vec_splat_u32(-1); /* each element 0xffffffff; as a shift count that is 31 */
#ifdef __APPLE__
		vmax = (vector float)(32767.999f);
		vmin = (vector float)(-32768.0f);
#else
		vmax = (vector float){32767.999f,32767.999f,32767.999f,32767.999f};
		vmin = (vector float){-32768.0f,-32768.0f,-32768.0f,-32768.0f};
#endif

		vperm1 = vec_lvsl(0,window);
		vperm2 = vec_lvsr(0,samples);
		vprev = (vector float)vec_perm(vec_ld(0,samples),vec_ld(0,samples),vec_lvsl(0,samples));
		for (j=4;j;j--)
		{
			SYNTH_STEREO_ALTIVEC(16);

			vsum = vec_sub(vsum,vsum2);
			vsum2 = vec_sub(vsum5,vsum6);
			vsum3 = vec_sub(vsum3,vsum4);
			vsum4 = vec_sub(vsum7,vsum8);
			v1 = vec_add(vsum,vsum3);
			v2 = vec_add(vsum2,vsum4);
			vsum = (vector float)vec_cts(v1,16);
			vsum2 = (vector float)vec_cts(v2,16);
			v5 = (vector float)vec_cmpgt(v1,vmax);
			v6 = (vector float)vec_cmplt(v1,vmin);
			v7 = (vector float)vec_cmpgt(v2,vmax);
			v8 = (vector float)vec_cmplt(v2,vmin);

			v1 = vec_mergeh(vsum, vsum2);
			v2 = vec_mergel(vsum, vsum2);
			v3 = vec_perm(vprev,v1,vperm2);
			v4 = vec_perm(v1,v2,vperm2);
			vprev = v2;
			vec_st((vector signed int)v3,0,samples);
			vec_st((vector signed int)v4,16,samples);

			v1 = (vector float)vec_sr((vector unsigned int)v5, vshift);
			v2 = (vector float)vec_sr((vector unsigned int)v6, vshift);
			v3 = (vector float)vec_sr((vector unsigned int)v7, vshift);
			v4 = (vector float)vec_sr((vector unsigned int)v8, vshift);
			v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			v2 = (vector float)vec_add((vector unsigned int)v3,(vector unsigned int)v4);
			vclip = vec_sums((vector signed int)v1,vclip);
			vclip = vec_sums((vector signed int)v2,vclip);
			samples += 8;
		}

		for (j=4;j;j--)
		{
			SYNTH_STEREO_ALTIVEC(-16);

			vsum = vec_add(vsum,vsum2);
			vsum2 = vec_add(vsum5,vsum6);
			vsum3 = vec_add(vsum3,vsum4);
			vsum4 = vec_add(vsum7,vsum8);
			v1 = vec_add(vsum,vsum3);
			v2 = vec_add(vsum2,vsum4);
			vsum = (vector float)vec_cts(v1,16);
			vsum2 = (vector float)vec_cts(v2,16);
			v5 = (vector float)vec_cmpgt(v1,vmax);
			v6 = (vector float)vec_cmplt(v1,vmin);
			v7 = (vector float)vec_cmpgt(v2,vmax);
			v8 = (vector float)vec_cmplt(v2,vmin);

			v1 = vec_mergeh(vsum, vsum2);
			v2 = vec_mergel(vsum, vsum2);
			v3 = vec_perm(vprev,v1,vperm2);
			v4 = vec_perm(v1,v2,vperm2);
			vprev = v2;
			vec_st((vector signed int)v3,0,samples);
			vec_st((vector signed int)v4,16,samples);

			v1 = (vector float)vec_sr((vector unsigned int)v5, vshift);
			v2 = (vector float)vec_sr((vector unsigned int)v6, vshift);
			v3 = (vector float)vec_sr((vector unsigned int)v7, vshift);
			v4 = (vector float)vec_sr((vector unsigned int)v8, vshift);
			v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			v2 = (vector float)vec_add((vector unsigned int)v3,(vector unsigned int)v4);
			vclip = vec_sums((vector signed int)v1,vclip);
			vclip = vec_sums((vector signed int)v2,vclip);
			samples += 8;
		}

		if((size_t)samples & 0xf)
		{
			v1 = (vector float)vec_perm(vec_ld(0,samples),vec_ld(0,samples),vec_lvsl(0,samples));
			v2 = (vector float)vec_perm(vprev,v1,vperm2);
			vec_st((vector signed int)v2,0,samples);
		}

		vec_st(vclip,0,clip_tmp);
		clip = clip_tmp[3];
	}
	fr->buffer.fill += 256;

	return clip;
}