/*
  decode.c: decoding samples...

  copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
  see COPYING and AUTHORS files in distribution or http://mpg123.org
  initially written by Michael Hipp
  altivec optimization by tmkk
*/

#include "mpg123lib_intern.h"

#ifndef __APPLE__
#include <altivec.h>
#endif
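
/*
  Note on the #ifdef __APPLE__ blocks below: Apple's old AltiVec C dialect
  writes vector literals as (vector float)(a,b,c,d), while GCC and friends
  use the braced form (vector float){a,b,c,d}, so the constant setup is
  simply spelled out twice.
*/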

/* A macro for normal synth functions */
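/*
  Rough scalar picture of the products one SYNTH_ALTIVEC(B0STEP) expansion
  accumulates (illustration only, not part of the build; sum[], n and i are
  made-up names):

      for(n = 0; n < 4; n++)         // four consecutive output samples
          for(i = 0; i < 16; i++)    // 16-tap windowed dot product
              sum[n] += window[n*32 + i] * b0[n*B0STEP + i];

  The vec_madd chains build each sum as four per-lane partial sums; the
  mergeh/mergel block at the end transposes them so that v5..v8 each hold one
  partial for all four samples, and the caller combines v5..v8 with the signs
  required by the current pass. vperm1 (from vec_lvsl) only fixes up the
  alignment of the window loads, and vzero is the zero accumulator for
  vec_madd.
*/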
#define SYNTH_ALTIVEC(B0STEP) \
  v1 = vec_ld(0,window); \
  v2 = vec_ld(16,window); \
  v3 = vec_ld(32,window); \
  v4 = vec_ld(48,window); \
  v5 = vec_ld(64,window); \
  v1 = vec_perm(v1,v2,vperm1); \
  v6 = vec_ld(0,b0); \
  v2 = vec_perm(v2,v3,vperm1); \
  v7 = vec_ld(16,b0); \
  v3 = vec_perm(v3,v4,vperm1); \
  v8 = vec_ld(32,b0); \
  v4 = vec_perm(v4,v5,vperm1); \
  v9 = vec_ld(48,b0); \
  \
  vsum = vec_madd(v1,v6,vzero); \
  vsum = vec_madd(v2,v7,vsum); \
  vsum = vec_madd(v3,v8,vsum); \
  vsum = vec_madd(v4,v9,vsum); \
  \
  window += 32; \
  b0 += B0STEP; \
  \
  v1 = vec_ld(0,window); \
  v2 = vec_ld(16,window); \
  v3 = vec_ld(32,window); \
  v4 = vec_ld(48,window); \
  v5 = vec_ld(64,window); \
  v1 = vec_perm(v1,v2,vperm1); \
  v6 = vec_ld(0,b0); \
  v2 = vec_perm(v2,v3,vperm1); \
  v7 = vec_ld(16,b0); \
  v3 = vec_perm(v3,v4,vperm1); \
  v8 = vec_ld(32,b0); \
  v4 = vec_perm(v4,v5,vperm1); \
  v9 = vec_ld(48,b0); \
  \
  vsum2 = vec_madd(v1,v6,vzero); \
  vsum2 = vec_madd(v2,v7,vsum2); \
  vsum2 = vec_madd(v3,v8,vsum2); \
  vsum2 = vec_madd(v4,v9,vsum2); \
  \
  window += 32; \
  b0 += B0STEP; \
  \
  v1 = vec_ld(0,window); \
  v2 = vec_ld(16,window); \
  v3 = vec_ld(32,window); \
  v4 = vec_ld(48,window); \
  v5 = vec_ld(64,window); \
  v1 = vec_perm(v1,v2,vperm1); \
  v6 = vec_ld(0,b0); \
  v2 = vec_perm(v2,v3,vperm1); \
  v7 = vec_ld(16,b0); \
  v3 = vec_perm(v3,v4,vperm1); \
  v8 = vec_ld(32,b0); \
  v4 = vec_perm(v4,v5,vperm1); \
  v9 = vec_ld(48,b0); \
  \
  vsum3 = vec_madd(v1,v6,vzero); \
  vsum3 = vec_madd(v2,v7,vsum3); \
  vsum3 = vec_madd(v3,v8,vsum3); \
  vsum3 = vec_madd(v4,v9,vsum3); \
  \
  window += 32; \
  b0 += B0STEP; \
  \
  v1 = vec_ld(0,window); \
  v2 = vec_ld(16,window); \
  v3 = vec_ld(32,window); \
  v4 = vec_ld(48,window); \
  v5 = vec_ld(64,window); \
  v1 = vec_perm(v1,v2,vperm1); \
  v6 = vec_ld(0,b0); \
  v2 = vec_perm(v2,v3,vperm1); \
  v7 = vec_ld(16,b0); \
  v3 = vec_perm(v3,v4,vperm1); \
  v8 = vec_ld(32,b0); \
  v4 = vec_perm(v4,v5,vperm1); \
  v9 = vec_ld(48,b0); \
  \
  vsum4 = vec_madd(v1,v6,vzero); \
  vsum4 = vec_madd(v2,v7,vsum4); \
  vsum4 = vec_madd(v3,v8,vsum4); \
  vsum4 = vec_madd(v4,v9,vsum4); \
  \
  window += 32; \
  b0 += B0STEP; \
  \
  v1 = vec_mergeh(vsum,vsum3); \
  v2 = vec_mergeh(vsum2,vsum4); \
  v3 = vec_mergel(vsum,vsum3); \
  v4 = vec_mergel(vsum2,vsum4); \
  v5 = vec_mergeh(v1,v2); \
  v6 = vec_mergel(v1,v2); \
  v7 = vec_mergeh(v3,v4); \
  v8 = vec_mergel(v3,v4);

/* A macro for stereo synth functions */
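/*
  Same scheme as SYNTH_ALTIVEC, but one shared window walks two coefficient
  buffers (b0l and b0r); the left channel's four transposed partial-sum
  vectors end up in vsum..vsum4 and the right channel's in vsum5..vsum8.
*/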
#define SYNTH_STEREO_ALTIVEC(B0STEP) \
  v1 = vec_ld(0,window); \
  v2 = vec_ld(16,window); \
  v3 = vec_ld(32,window); \
  v4 = vec_ld(48,window); \
  v5 = vec_ld(64,window); \
  v1 = vec_perm(v1,v2,vperm1); \
  v6 = vec_ld(0,b0l); \
  v10 = vec_ld(0,b0r); \
  v2 = vec_perm(v2,v3,vperm1); \
  v7 = vec_ld(16,b0l); \
  v11 = vec_ld(16,b0r); \
  v3 = vec_perm(v3,v4,vperm1); \
  v8 = vec_ld(32,b0l); \
  v12 = vec_ld(32,b0r); \
  v4 = vec_perm(v4,v5,vperm1); \
  v9 = vec_ld(48,b0l); \
  v13 = vec_ld(48,b0r); \
  \
  vsum = vec_madd(v1,v6,vzero); \
  vsum5 = vec_madd(v1,v10,vzero); \
  vsum = vec_madd(v2,v7,vsum); \
  vsum5 = vec_madd(v2,v11,vsum5); \
  vsum = vec_madd(v3,v8,vsum); \
  vsum5 = vec_madd(v3,v12,vsum5); \
  vsum = vec_madd(v4,v9,vsum); \
  vsum5 = vec_madd(v4,v13,vsum5); \
  \
  window += 32; \
  b0l += B0STEP; \
  b0r += B0STEP; \
  \
  v1 = vec_ld(0,window); \
  v2 = vec_ld(16,window); \
  v3 = vec_ld(32,window); \
  v4 = vec_ld(48,window); \
  v5 = vec_ld(64,window); \
  v1 = vec_perm(v1,v2,vperm1); \
  v6 = vec_ld(0,b0l); \
  v10 = vec_ld(0,b0r); \
  v2 = vec_perm(v2,v3,vperm1); \
  v7 = vec_ld(16,b0l); \
  v11 = vec_ld(16,b0r); \
  v3 = vec_perm(v3,v4,vperm1); \
  v8 = vec_ld(32,b0l); \
  v12 = vec_ld(32,b0r); \
  v4 = vec_perm(v4,v5,vperm1); \
  v9 = vec_ld(48,b0l); \
  v13 = vec_ld(48,b0r); \
  \
  vsum2 = vec_madd(v1,v6,vzero); \
  vsum6 = vec_madd(v1,v10,vzero); \
  vsum2 = vec_madd(v2,v7,vsum2); \
  vsum6 = vec_madd(v2,v11,vsum6); \
  vsum2 = vec_madd(v3,v8,vsum2); \
  vsum6 = vec_madd(v3,v12,vsum6); \
  vsum2 = vec_madd(v4,v9,vsum2); \
  vsum6 = vec_madd(v4,v13,vsum6); \
  \
  window += 32; \
  b0l += B0STEP; \
  b0r += B0STEP; \
  \
  v1 = vec_ld(0,window); \
  v2 = vec_ld(16,window); \
  v3 = vec_ld(32,window); \
  v4 = vec_ld(48,window); \
  v5 = vec_ld(64,window); \
  v1 = vec_perm(v1,v2,vperm1); \
  v6 = vec_ld(0,b0l); \
  v10 = vec_ld(0,b0r); \
  v2 = vec_perm(v2,v3,vperm1); \
  v7 = vec_ld(16,b0l); \
  v11 = vec_ld(16,b0r); \
  v3 = vec_perm(v3,v4,vperm1); \
  v8 = vec_ld(32,b0l); \
  v12 = vec_ld(32,b0r); \
  v4 = vec_perm(v4,v5,vperm1); \
  v9 = vec_ld(48,b0l); \
  v13 = vec_ld(48,b0r); \
  \
  vsum3 = vec_madd(v1,v6,vzero); \
  vsum7 = vec_madd(v1,v10,vzero); \
  vsum3 = vec_madd(v2,v7,vsum3); \
  vsum7 = vec_madd(v2,v11,vsum7); \
  vsum3 = vec_madd(v3,v8,vsum3); \
  vsum7 = vec_madd(v3,v12,vsum7); \
  vsum3 = vec_madd(v4,v9,vsum3); \
  vsum7 = vec_madd(v4,v13,vsum7); \
  \
  window += 32; \
  b0l += B0STEP; \
  b0r += B0STEP; \
  \
  v1 = vec_ld(0,window); \
  v2 = vec_ld(16,window); \
  v3 = vec_ld(32,window); \
  v4 = vec_ld(48,window); \
  v5 = vec_ld(64,window); \
  v1 = vec_perm(v1,v2,vperm1); \
  v6 = vec_ld(0,b0l); \
  v10 = vec_ld(0,b0r); \
  v2 = vec_perm(v2,v3,vperm1); \
  v7 = vec_ld(16,b0l); \
  v11 = vec_ld(16,b0r); \
  v3 = vec_perm(v3,v4,vperm1); \
  v8 = vec_ld(32,b0l); \
  v12 = vec_ld(32,b0r); \
  v4 = vec_perm(v4,v5,vperm1); \
  v9 = vec_ld(48,b0l); \
  v13 = vec_ld(48,b0r); \
  \
  vsum4 = vec_madd(v1,v6,vzero); \
  vsum8 = vec_madd(v1,v10,vzero); \
  vsum4 = vec_madd(v2,v7,vsum4); \
  vsum8 = vec_madd(v2,v11,vsum8); \
  vsum4 = vec_madd(v3,v8,vsum4); \
  vsum8 = vec_madd(v3,v12,vsum8); \
  vsum4 = vec_madd(v4,v9,vsum4); \
  vsum8 = vec_madd(v4,v13,vsum8); \
  \
  window += 32; \
  b0l += B0STEP; \
  b0r += B0STEP; \
  \
  v1 = vec_mergeh(vsum,vsum3); \
  v5 = vec_mergeh(vsum5,vsum7); \
  v2 = vec_mergeh(vsum2,vsum4); \
  v6 = vec_mergeh(vsum6,vsum8); \
  v3 = vec_mergel(vsum,vsum3); \
  v7 = vec_mergel(vsum5,vsum7); \
  v4 = vec_mergel(vsum2,vsum4); \
  v8 = vec_mergel(vsum6,vsum8); \
  vsum = vec_mergeh(v1,v2); \
  vsum5 = vec_mergeh(v5,v6); \
  vsum2 = vec_mergel(v1,v2); \
  vsum6 = vec_mergel(v5,v6); \
  vsum3 = vec_mergeh(v3,v4); \
  vsum7 = vec_mergeh(v7,v8); \
  vsum4 = vec_mergel(v3,v4); \
  vsum8 = vec_mergel(v7,v8);

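/*
  1:1 synth for one channel, 16-bit output. Roughly: run dct64_altivec over
  the 32 incoming subband samples into the rotating real_buffs, apply the
  window filter (SYNTH_ALTIVEC) in two passes of four iterations each, and
  write 32 clipped 16-bit samples. The stores are read-modify-write: the
  existing vectors around the (interleaved, possibly unaligned) destination
  are loaded and merged back via vec_perm so the other channel's samples
  survive. Returns the number of samples that hit the 16-bit limits.
*/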
int synth_1to1_altivec(real *bandPtr,int channel,mpg123_handle *fr, int final)
{
  short *samples = (short *) (fr->buffer.data+fr->buffer.fill);

  real *b0, **buf;
  int clip;
  int bo1;

  if(fr->have_eq_settings) do_equalizer(bandPtr,channel,fr->equalizer);

  if(!channel)
  {
    fr->bo--;
    fr->bo &= 0xf;
    buf = fr->real_buffs[0];
  }
  else
  {
    samples++;
    buf = fr->real_buffs[1];
  }

  if(fr->bo & 0x1)
  {
    b0 = buf[0];
    bo1 = fr->bo;
    dct64_altivec(buf[1]+((fr->bo+1)&0xf),buf[0]+fr->bo,bandPtr);
  }
  else
  {
    b0 = buf[1];
    bo1 = fr->bo+1;
    dct64_altivec(buf[0]+fr->bo,buf[1]+fr->bo+1,bandPtr);
  }


  {
    register int j;
    real *window = fr->decwin + 16 - bo1;

    ALIGNED(16) int clip_tmp[4];
    vector float v1,v2,v3,v4,v5,v6,v7,v8,v9;
    vector unsigned char vperm1,vperm2,vperm3,vperm4;
    vector float vsum,vsum2,vsum3,vsum4,vmin,vmax,vzero;
    vector signed int vclip;
    vector signed short vsample1,vsample2;
    vector unsigned int vshift;
    vclip = vec_xor(vclip,vclip);
    vzero = vec_xor(vzero,vzero);
    vshift = vec_splat_u32(-1); /* 31 */
#ifdef __APPLE__
    vmax = (vector float)(32767.0f);
    vmin = (vector float)(-32768.0f);
    vperm4 = (vector unsigned char)(0,1,18,19,2,3,22,23,4,5,26,27,6,7,30,31);
#else
    vmax = (vector float){32767.0f,32767.0f,32767.0f,32767.0f};
    vmin = (vector float){-32768.0f,-32768.0f,-32768.0f,-32768.0f};
    vperm4 = (vector unsigned char){0,1,18,19,2,3,22,23,4,5,26,27,6,7,30,31};
#endif

    vperm1 = vec_lvsl(0,window);
    vperm2 = vec_lvsl(0,samples);
    vperm3 = vec_lvsr(0,samples);
    for (j=4;j;j--)
    {
      SYNTH_ALTIVEC(16);

      vsum = vec_sub(v5,v6);
      v9 = vec_sub(v7,v8);
      vsum = vec_add(vsum,v9);

      v3 = vec_round(vsum);
      v3 = (vector float)vec_cts(v3,0);
      v1 = (vector float)vec_cmpgt(vsum,vmax);
      v2 = (vector float)vec_cmplt(vsum,vmin);
      vsample1 = vec_ld(0,samples);
      vsample2 = vec_ld(15,samples);
      v3 = (vector float)vec_packs((vector signed int)v3,(vector signed int)v3);
      v4 = (vector float)vec_perm(vsample1,vsample2,vperm2);
      v5 = (vector float)vec_perm(v3,v4,vperm4);
      v6 = (vector float)vec_perm(vsample2,vsample1,vperm2);
      v7 = (vector float)vec_perm(v5,v6,vperm3);
      v8 = (vector float)vec_perm(v6,v5,vperm3);
      vec_st((vector signed short)v7,15,samples);
      vec_st((vector signed short)v8,0,samples);
      samples += 8;

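      /* Clip accounting: the cmpgt/cmplt masks are all-ones per out-of-range
         lane; shifting right by 31 leaves a 0/1 per lane, and vec_sums folds
         the lane counts into element 3 of vclip. The same trick is used in
         the other clipping loops below. */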
      v1 = (vector float)vec_sr((vector unsigned int)v1, vshift);
      v2 = (vector float)vec_sr((vector unsigned int)v2, vshift);
      v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
      vclip = vec_sums((vector signed int)v1,vclip);
    }

    for (j=4;j;j--)
    {
      SYNTH_ALTIVEC(-16);

      vsum = vec_add(v5,v6);
      v9 = vec_add(v7,v8);
      vsum = vec_add(vsum,v9);

      v3 = vec_round(vsum);
      v3 = (vector float)vec_cts(v3,0);
      v1 = (vector float)vec_cmpgt(vsum,vmax);
      v2 = (vector float)vec_cmplt(vsum,vmin);
      vsample1 = vec_ld(0,samples);
      vsample2 = vec_ld(15,samples);
      v3 = (vector float)vec_packs((vector signed int)v3,(vector signed int)v3);
      v4 = (vector float)vec_perm(vsample1,vsample2,vperm2);
      v5 = (vector float)vec_perm(v3,v4,vperm4);
      v6 = (vector float)vec_perm(vsample2,vsample1,vperm2);
      v7 = (vector float)vec_perm(v5,v6,vperm3);
      v8 = (vector float)vec_perm(v6,v5,vperm3);
      vec_st((vector signed short)v7,15,samples);
      vec_st((vector signed short)v8,0,samples);
      samples += 8;

      v1 = (vector float)vec_sr((vector unsigned int)v1, vshift);
      v2 = (vector float)vec_sr((vector unsigned int)v2, vshift);
      v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
      vclip = vec_sums((vector signed int)v1,vclip);
    }

    vec_st(vclip,0,clip_tmp);
    clip = clip_tmp[3];
  }
  if(final) fr->buffer.fill += 128;

  return clip;
}

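/*
  1:1 synth for both channels at once, 16-bit output. The stereo macro yields
  left and right partials together; the results are interleaved L/R, packed
  to 16 bit and written as aligned 16-byte stores, with vprev carrying the
  partially filled vector across iterations and a tail flush after the loops
  when the output pointer is not 16-byte aligned. Returns the combined clip
  count of both channels.
*/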
int synth_1to1_stereo_altivec(real *bandPtr_l, real *bandPtr_r, mpg123_handle *fr)
{
  short *samples = (short *) (fr->buffer.data+fr->buffer.fill);

  real *b0l, *b0r, **bufl, **bufr;
  int clip;
  int bo1;

  if(fr->have_eq_settings)
  {
    do_equalizer(bandPtr_l,0,fr->equalizer);
    do_equalizer(bandPtr_r,1,fr->equalizer);
  }

  fr->bo--;
  fr->bo &= 0xf;
  bufl = fr->real_buffs[0];
  bufr = fr->real_buffs[1];

  if(fr->bo & 0x1)
  {
    b0l = bufl[0];
    b0r = bufr[0];
    bo1 = fr->bo;
    dct64_altivec(bufl[1]+((fr->bo+1)&0xf),bufl[0]+fr->bo,bandPtr_l);
    dct64_altivec(bufr[1]+((fr->bo+1)&0xf),bufr[0]+fr->bo,bandPtr_r);
  }
  else
  {
    b0l = bufl[1];
    b0r = bufr[1];
    bo1 = fr->bo+1;
    dct64_altivec(bufl[0]+fr->bo,bufl[1]+fr->bo+1,bandPtr_l);
    dct64_altivec(bufr[0]+fr->bo,bufr[1]+fr->bo+1,bandPtr_r);
  }


  {
    register int j;
    real *window = fr->decwin + 16 - bo1;

    ALIGNED(16) int clip_tmp[4];
    vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13;
    vector unsigned char vperm1,vperm2;
    vector float vsum,vsum2,vsum3,vsum4,vsum5,vsum6,vsum7,vsum8,vmin,vmax,vzero;
    vector signed int vclip;
    vector unsigned int vshift;
    vector signed short vprev;
    vclip = vec_xor(vclip,vclip);
    vzero = vec_xor(vzero,vzero);
    vshift = vec_splat_u32(-1); /* 31 */
#ifdef __APPLE__
    vmax = (vector float)(32767.0f);
    vmin = (vector float)(-32768.0f);
#else
    vmax = (vector float){32767.0f,32767.0f,32767.0f,32767.0f};
    vmin = (vector float){-32768.0f,-32768.0f,-32768.0f,-32768.0f};
#endif

    vperm1 = vec_lvsl(0,window);
    vperm2 = vec_lvsr(0,samples);
    vprev = vec_perm(vec_ld(0,samples),vec_ld(0,samples),vec_lvsl(0,samples));
    for (j=4;j;j--)
    {
      SYNTH_STEREO_ALTIVEC(16);

      vsum = vec_sub(vsum,vsum2);
      vsum2 = vec_sub(vsum5,vsum6);
      vsum3 = vec_sub(vsum3,vsum4);
      vsum4 = vec_sub(vsum7,vsum8);
      vsum = vec_add(vsum,vsum3);
      vsum2 = vec_add(vsum2,vsum4);

      v1 = vec_round(vsum);
      v2 = vec_round(vsum2);
      v1 = (vector float)vec_cts(v1,0);
      v2 = (vector float)vec_cts(v2,0);
      v3 = vec_mergeh(v1, v2);
      v4 = vec_mergel(v1, v2);
      v5 = (vector float)vec_packs((vector signed int)v3,(vector signed int)v4);
      v6 = (vector float)vec_perm(vprev,(vector signed short)v5,vperm2);
      vprev = (vector signed short)v5;
      v1 = (vector float)vec_cmpgt(vsum,vmax);
      v2 = (vector float)vec_cmplt(vsum,vmin);
      v3 = (vector float)vec_cmpgt(vsum2,vmax);
      v4 = (vector float)vec_cmplt(vsum2,vmin);
      vec_st((vector signed short)v6,0,samples);
      samples += 8;

      v1 = (vector float)vec_sr((vector unsigned int)v1, vshift);
      v2 = (vector float)vec_sr((vector unsigned int)v2, vshift);
      v3 = (vector float)vec_sr((vector unsigned int)v3, vshift);
      v4 = (vector float)vec_sr((vector unsigned int)v4, vshift);
      v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
      v2 = (vector float)vec_add((vector unsigned int)v3,(vector unsigned int)v4);
      vclip = vec_sums((vector signed int)v1,vclip);
      vclip = vec_sums((vector signed int)v2,vclip);
    }

    for (j=4;j;j--)
    {
      SYNTH_STEREO_ALTIVEC(-16);

      vsum = vec_add(vsum,vsum2);
      vsum2 = vec_add(vsum5,vsum6);
      vsum3 = vec_add(vsum3,vsum4);
      vsum4 = vec_add(vsum7,vsum8);
      vsum = vec_add(vsum,vsum3);
      vsum2 = vec_add(vsum2,vsum4);

      v1 = vec_round(vsum);
      v2 = vec_round(vsum2);
      v1 = (vector float)vec_cts(v1,0);
      v2 = (vector float)vec_cts(v2,0);
      v3 = vec_mergeh(v1, v2);
      v4 = vec_mergel(v1, v2);
      v5 = (vector float)vec_packs((vector signed int)v3,(vector signed int)v4);
      v6 = (vector float)vec_perm(vprev,(vector signed short)v5,vperm2);
      vprev = (vector signed short)v5;
      v1 = (vector float)vec_cmpgt(vsum,vmax);
      v2 = (vector float)vec_cmplt(vsum,vmin);
      v3 = (vector float)vec_cmpgt(vsum2,vmax);
      v4 = (vector float)vec_cmplt(vsum2,vmin);
      vec_st((vector signed short)v6,0,samples);
      samples += 8;

      v1 = (vector float)vec_sr((vector unsigned int)v1, vshift);
      v2 = (vector float)vec_sr((vector unsigned int)v2, vshift);
      v3 = (vector float)vec_sr((vector unsigned int)v3, vshift);
      v4 = (vector float)vec_sr((vector unsigned int)v4, vshift);
      v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
      v2 = (vector float)vec_add((vector unsigned int)v3,(vector unsigned int)v4);
      vclip = vec_sums((vector signed int)v1,vclip);
      vclip = vec_sums((vector signed int)v2,vclip);
    }

    if((size_t)samples & 0xf)
    {
      v1 = (vector float)vec_perm(vec_ld(0,samples),vec_ld(0,samples),vec_lvsl(0,samples));
      v2 = (vector float)vec_perm(vprev,(vector signed short)v1,vperm2);
      vec_st((vector signed short)v2,0,samples);
    }

    vec_st(vclip,0,clip_tmp);
    clip = clip_tmp[3];
  }
  fr->buffer.fill += 128;

  return clip;
}

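/*
  Single-channel variant with 32-bit float output: same filtering, but the
  sums are scaled by 1/32768 and stored as floats. No clipping is detected,
  hence the constant 0 return. The three vsample loads plus the vperm shuffle
  are the read-modify-write needed to scatter four floats into an interleaved,
  possibly unaligned destination.
*/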
int synth_1to1_real_altivec(real *bandPtr,int channel,mpg123_handle *fr, int final)
{
  real *samples = (real *) (fr->buffer.data+fr->buffer.fill);

  real *b0, **buf;
  int bo1;

  if(fr->have_eq_settings) do_equalizer(bandPtr,channel,fr->equalizer);

  if(!channel)
  {
    fr->bo--;
    fr->bo &= 0xf;
    buf = fr->real_buffs[0];
  }
  else
  {
    samples++;
    buf = fr->real_buffs[1];
  }

  if(fr->bo & 0x1)
  {
    b0 = buf[0];
    bo1 = fr->bo;
    dct64_altivec(buf[1]+((fr->bo+1)&0xf),buf[0]+fr->bo,bandPtr);
  }
  else
  {
    b0 = buf[1];
    bo1 = fr->bo+1;
    dct64_altivec(buf[0]+fr->bo,buf[1]+fr->bo+1,bandPtr);
  }


  {
    register int j;
    real *window = fr->decwin + 16 - bo1;

    vector float v1,v2,v3,v4,v5,v6,v7,v8,v9;
    vector unsigned char vperm1,vperm2,vperm3,vperm4,vperm5;
    vector float vsum,vsum2,vsum3,vsum4,vscale,vzero;
    vector float vsample1,vsample2,vsample3;
    vzero = vec_xor(vzero, vzero);
#ifdef __APPLE__
    vscale = (vector float)(1.0f/32768.0f);
    vperm4 = (vector unsigned char)(0,1,2,3,20,21,22,23,4,5,6,7,28,29,30,31);
    vperm5 = (vector unsigned char)(8,9,10,11,20,21,22,23,12,13,14,15,28,29,30,31);
#else
    vscale = (vector float){1.0f/32768.0f,1.0f/32768.0f,1.0f/32768.0f,1.0f/32768.0f};
    vperm4 = (vector unsigned char){0,1,2,3,20,21,22,23,4,5,6,7,28,29,30,31};
    vperm5 = (vector unsigned char){8,9,10,11,20,21,22,23,12,13,14,15,28,29,30,31};
#endif

    vperm1 = vec_lvsl(0,window);
    vperm2 = vec_lvsl(0,samples);
    vperm3 = vec_lvsr(0,samples);
    for (j=4;j;j--)
    {
      SYNTH_ALTIVEC(16);

      vsum = vec_sub(v5,v6);
      v9 = vec_sub(v7,v8);
      vsum = vec_add(vsum,v9);
      vsum = vec_madd(vsum, vscale, vzero);

      vsample1 = vec_ld(0,samples);
      vsample2 = vec_ld(16,samples);
      vsample3 = vec_ld(31,samples);
      v1 = vec_perm(vsample1, vsample2, vperm2);
      v2 = vec_perm(vsample2, vsample3, vperm2);
      v1 = vec_perm(vsum, v1, vperm4);
      v2 = vec_perm(vsum, v2, vperm5);
      v3 = vec_perm(vsample3, vsample2, vperm2);
      v4 = vec_perm(vsample2, vsample1, vperm2);
      v5 = vec_perm(v2, v3, vperm3);
      v6 = vec_perm(v1, v2, vperm3);
      v7 = vec_perm(v4, v1, vperm3);
      vec_st(v5,31,samples);
      vec_st(v6,16,samples);
      vec_st(v7,0,samples);
      samples += 8;
    }

    for (j=4;j;j--)
    {
      SYNTH_ALTIVEC(-16);

      vsum = vec_add(v5,v6);
      v9 = vec_add(v7,v8);
      vsum = vec_add(vsum,v9);
      vsum = vec_madd(vsum, vscale, vzero);

      vsample1 = vec_ld(0,samples);
      vsample2 = vec_ld(16,samples);
      vsample3 = vec_ld(31,samples);
      v1 = vec_perm(vsample1, vsample2, vperm2);
      v2 = vec_perm(vsample2, vsample3, vperm2);
      v1 = vec_perm(vsum, v1, vperm4);
      v2 = vec_perm(vsum, v2, vperm5);
      v3 = vec_perm(vsample3, vsample2, vperm2);
      v4 = vec_perm(vsample2, vsample1, vperm2);
      v5 = vec_perm(v2, v3, vperm3);
      v6 = vec_perm(v1, v2, vperm3);
      v7 = vec_perm(v4, v1, vperm3);
      vec_st(v5,31,samples);
      vec_st(v6,16,samples);
      vec_st(v7,0,samples);
      samples += 8;
    }
  }
  if(final) fr->buffer.fill += 256;

  return 0;
}

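/*
  Stereo 32-bit float variant: left and right sums are interleaved with
  vec_mergeh/vec_mergel and written as aligned 16-byte stores, with vprev
  bridging an unaligned start and the tail flushed after the loops.
*/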
int synth_1to1_real_stereo_altivec(real *bandPtr_l, real *bandPtr_r, mpg123_handle *fr)
{
  real *samples = (real *) (fr->buffer.data+fr->buffer.fill);

  real *b0l, *b0r, **bufl, **bufr;
  int bo1;

  if(fr->have_eq_settings)
  {
    do_equalizer(bandPtr_l,0,fr->equalizer);
    do_equalizer(bandPtr_r,1,fr->equalizer);
  }

  fr->bo--;
  fr->bo &= 0xf;
  bufl = fr->real_buffs[0];
  bufr = fr->real_buffs[1];

  if(fr->bo & 0x1)
  {
    b0l = bufl[0];
    b0r = bufr[0];
    bo1 = fr->bo;
    dct64_altivec(bufl[1]+((fr->bo+1)&0xf),bufl[0]+fr->bo,bandPtr_l);
    dct64_altivec(bufr[1]+((fr->bo+1)&0xf),bufr[0]+fr->bo,bandPtr_r);
  }
  else
  {
    b0l = bufl[1];
    b0r = bufr[1];
    bo1 = fr->bo+1;
    dct64_altivec(bufl[0]+fr->bo,bufl[1]+fr->bo+1,bandPtr_l);
    dct64_altivec(bufr[0]+fr->bo,bufr[1]+fr->bo+1,bandPtr_r);
  }


  {
    register int j;
    real *window = fr->decwin + 16 - bo1;

    vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13;
    vector unsigned char vperm1,vperm2;
    vector float vsum,vsum2,vsum3,vsum4,vsum5,vsum6,vsum7,vsum8,vscale,vzero;
    vector float vprev;
    vzero = vec_xor(vzero,vzero);
#ifdef __APPLE__
    vscale = (vector float)(1.0f/32768.0f);
#else
    vscale = (vector float){1.0f/32768.0f,1.0f/32768.0f,1.0f/32768.0f,1.0f/32768.0f};
#endif

    vperm1 = vec_lvsl(0,window);
    vperm2 = vec_lvsr(0,samples);
    vprev = vec_perm(vec_ld(0,samples),vec_ld(0,samples),vec_lvsl(0,samples));
    for (j=4;j;j--)
    {
      SYNTH_STEREO_ALTIVEC(16);

      vsum = vec_sub(vsum,vsum2);
      vsum2 = vec_sub(vsum5,vsum6);
      vsum3 = vec_sub(vsum3,vsum4);
      vsum4 = vec_sub(vsum7,vsum8);
      vsum = vec_add(vsum,vsum3);
      vsum2 = vec_add(vsum2,vsum4);
      vsum = vec_madd(vsum, vscale, vzero);
      vsum2 = vec_madd(vsum2, vscale, vzero);

      v1 = vec_mergeh(vsum, vsum2);
      v2 = vec_mergel(vsum, vsum2);
      v3 = vec_perm(vprev,v1,vperm2);
      v4 = vec_perm(v1,v2,vperm2);
      vprev = v2;
      vec_st(v3,0,samples);
      vec_st(v4,16,samples);
      samples += 8;
    }

    for (j=4;j;j--)
    {
      SYNTH_STEREO_ALTIVEC(-16);

      vsum = vec_add(vsum,vsum2);
      vsum2 = vec_add(vsum5,vsum6);
      vsum3 = vec_add(vsum3,vsum4);
      vsum4 = vec_add(vsum7,vsum8);
      vsum = vec_add(vsum,vsum3);
      vsum2 = vec_add(vsum2,vsum4);
      vsum = vec_madd(vsum, vscale, vzero);
      vsum2 = vec_madd(vsum2, vscale, vzero);

      v1 = vec_mergeh(vsum, vsum2);
      v2 = vec_mergel(vsum, vsum2);
      v3 = vec_perm(vprev,v1,vperm2);
      v4 = vec_perm(v1,v2,vperm2);
      vprev = v2;
      vec_st(v3,0,samples);
      vec_st(v4,16,samples);
      samples += 8;
    }

    if((size_t)samples & 0xf)
    {
      v1 = (vector float)vec_perm(vec_ld(0,samples),vec_ld(0,samples),vec_lvsl(0,samples));
      v2 = (vector float)vec_perm(vprev,v1,vperm2);
      vec_st(v2,0,samples);
    }
  }
  fr->buffer.fill += 256;

  return 0;
}

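/*
  Single-channel 32-bit integer output: vec_cts(...,16) converts the float
  sums with a 2^16 scale factor so the full signed 32-bit range is used, and
  values outside roughly +/-32768 before scaling are counted as clipped with
  the usual mask-and-sum trick. The interleaved read-modify-write store
  mirrors the float variant above.
*/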
int synth_1to1_s32_altivec(real *bandPtr,int channel,mpg123_handle *fr, int final)
{
  int32_t *samples = (int32_t *) (fr->buffer.data+fr->buffer.fill);

  real *b0, **buf;
  int clip;
  int bo1;

  if(fr->have_eq_settings) do_equalizer(bandPtr,channel,fr->equalizer);

  if(!channel)
  {
    fr->bo--;
    fr->bo &= 0xf;
    buf = fr->real_buffs[0];
  }
  else
  {
    samples++;
    buf = fr->real_buffs[1];
  }

  if(fr->bo & 0x1)
  {
    b0 = buf[0];
    bo1 = fr->bo;
    dct64_altivec(buf[1]+((fr->bo+1)&0xf),buf[0]+fr->bo,bandPtr);
  }
  else
  {
    b0 = buf[1];
    bo1 = fr->bo+1;
    dct64_altivec(buf[0]+fr->bo,buf[1]+fr->bo+1,bandPtr);
  }


  {
    register int j;
    real *window = fr->decwin + 16 - bo1;

    ALIGNED(16) int clip_tmp[4];
    vector float v1,v2,v3,v4,v5,v6,v7,v8,v9;
    vector unsigned char vperm1,vperm2,vperm3,vperm4,vperm5;
    vector float vsum,vsum2,vsum3,vsum4,vmax,vmin,vzero;
    vector signed int vsample1,vsample2,vsample3;
    vector unsigned int vshift;
    vector signed int vclip;
    vzero = vec_xor(vzero, vzero);
    vclip = vec_xor(vclip, vclip);
    vshift = vec_splat_u32(-1); /* 31 */
#ifdef __APPLE__
    vmax = (vector float)(32767.999f);
    vmin = (vector float)(-32768.0f);
    vperm4 = (vector unsigned char)(0,1,2,3,20,21,22,23,4,5,6,7,28,29,30,31);
    vperm5 = (vector unsigned char)(8,9,10,11,20,21,22,23,12,13,14,15,28,29,30,31);
#else
    vmax = (vector float){32767.999f,32767.999f,32767.999f,32767.999f};
    vmin = (vector float){-32768.0f,-32768.0f,-32768.0f,-32768.0f};
    vperm4 = (vector unsigned char){0,1,2,3,20,21,22,23,4,5,6,7,28,29,30,31};
    vperm5 = (vector unsigned char){8,9,10,11,20,21,22,23,12,13,14,15,28,29,30,31};
#endif

    vperm1 = vec_lvsl(0,window);
    vperm2 = vec_lvsl(0,samples);
    vperm3 = vec_lvsr(0,samples);
    for (j=4;j;j--)
    {
      SYNTH_ALTIVEC(16);

      vsum = vec_sub(v5,v6);
      v9 = vec_sub(v7,v8);
      v1 = vec_add(vsum,v9);
      vsum = (vector float)vec_cts(v1,16);
      v8 = (vector float)vec_cmpgt(v1,vmax);
      v9 = (vector float)vec_cmplt(v1,vmin);

      vsample1 = vec_ld(0,samples);
      vsample2 = vec_ld(16,samples);
      vsample3 = vec_ld(31,samples);
      v1 = (vector float)vec_perm(vsample1, vsample2, vperm2);
      v2 = (vector float)vec_perm(vsample2, vsample3, vperm2);
      v1 = vec_perm(vsum, v1, vperm4);
      v2 = vec_perm(vsum, v2, vperm5);
      v3 = (vector float)vec_perm(vsample3, vsample2, vperm2);
      v4 = (vector float)vec_perm(vsample2, vsample1, vperm2);
      v5 = vec_perm(v2, v3, vperm3);
      v6 = vec_perm(v1, v2, vperm3);
      v7 = vec_perm(v4, v1, vperm3);
      vec_st((vector signed int)v5,31,samples);
      vec_st((vector signed int)v6,16,samples);
      vec_st((vector signed int)v7,0,samples);
      samples += 8;

      v1 = (vector float)vec_sr((vector unsigned int)v8, vshift);
      v2 = (vector float)vec_sr((vector unsigned int)v9, vshift);
      v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
      vclip = vec_sums((vector signed int)v1,vclip);
    }

    for (j=4;j;j--)
    {
      SYNTH_ALTIVEC(-16);

      vsum = vec_add(v5,v6);
      v9 = vec_add(v7,v8);
      v1 = vec_add(vsum,v9);
      vsum = (vector float)vec_cts(v1,16);
      v8 = (vector float)vec_cmpgt(v1,vmax);
      v9 = (vector float)vec_cmplt(v1,vmin);

      vsample1 = vec_ld(0,samples);
      vsample2 = vec_ld(16,samples);
      vsample3 = vec_ld(31,samples);
      v1 = (vector float)vec_perm(vsample1, vsample2, vperm2);
      v2 = (vector float)vec_perm(vsample2, vsample3, vperm2);
      v1 = vec_perm(vsum, v1, vperm4);
      v2 = vec_perm(vsum, v2, vperm5);
      v3 = (vector float)vec_perm(vsample3, vsample2, vperm2);
      v4 = (vector float)vec_perm(vsample2, vsample1, vperm2);
      v5 = vec_perm(v2, v3, vperm3);
      v6 = vec_perm(v1, v2, vperm3);
      v7 = vec_perm(v4, v1, vperm3);
      vec_st((vector signed int)v5,31,samples);
      vec_st((vector signed int)v6,16,samples);
      vec_st((vector signed int)v7,0,samples);
      samples += 8;

      v1 = (vector float)vec_sr((vector unsigned int)v8, vshift);
      v2 = (vector float)vec_sr((vector unsigned int)v9, vshift);
      v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
      vclip = vec_sums((vector signed int)v1,vclip);
    }

    vec_st(vclip,0,clip_tmp);
    clip = clip_tmp[3];
  }
  if(final) fr->buffer.fill += 256;

  return clip;
}

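/*
  Stereo 32-bit integer variant: the same 2^16-scaled vec_cts conversion and
  clip counting as above, combined with the interleaving and aligned-store
  scheme of the stereo float synth.
*/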
int synth_1to1_s32_stereo_altivec(real *bandPtr_l, real *bandPtr_r, mpg123_handle *fr)
{
  int32_t *samples = (int32_t *) (fr->buffer.data+fr->buffer.fill);

  real *b0l, *b0r, **bufl, **bufr;
  int clip;
  int bo1;

  if(fr->have_eq_settings)
  {
    do_equalizer(bandPtr_l,0,fr->equalizer);
    do_equalizer(bandPtr_r,1,fr->equalizer);
  }

  fr->bo--;
  fr->bo &= 0xf;
  bufl = fr->real_buffs[0];
  bufr = fr->real_buffs[1];

  if(fr->bo & 0x1)
  {
    b0l = bufl[0];
    b0r = bufr[0];
    bo1 = fr->bo;
    dct64_altivec(bufl[1]+((fr->bo+1)&0xf),bufl[0]+fr->bo,bandPtr_l);
    dct64_altivec(bufr[1]+((fr->bo+1)&0xf),bufr[0]+fr->bo,bandPtr_r);
  }
  else
  {
    b0l = bufl[1];
    b0r = bufr[1];
    bo1 = fr->bo+1;
    dct64_altivec(bufl[0]+fr->bo,bufl[1]+fr->bo+1,bandPtr_l);
    dct64_altivec(bufr[0]+fr->bo,bufr[1]+fr->bo+1,bandPtr_r);
  }


  {
    register int j;
    real *window = fr->decwin + 16 - bo1;

    ALIGNED(16) int clip_tmp[4];
    vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13;
    vector unsigned char vperm1,vperm2;
    vector float vsum,vsum2,vsum3,vsum4,vsum5,vsum6,vsum7,vsum8,vmax,vmin,vzero;
    vector float vprev;
    vector unsigned int vshift;
    vector signed int vclip;
    vzero = vec_xor(vzero, vzero);
    vclip = vec_xor(vclip, vclip);
    vshift = vec_splat_u32(-1); /* 31 */
#ifdef __APPLE__
    vmax = (vector float)(32767.999f);
    vmin = (vector float)(-32768.0f);
#else
    vmax = (vector float){32767.999f,32767.999f,32767.999f,32767.999f};
    vmin = (vector float){-32768.0f,-32768.0f,-32768.0f,-32768.0f};
#endif

    vperm1 = vec_lvsl(0,window);
    vperm2 = vec_lvsr(0,samples);
    vprev = (vector float)vec_perm(vec_ld(0,samples),vec_ld(0,samples),vec_lvsl(0,samples));
    for (j=4;j;j--)
    {
      SYNTH_STEREO_ALTIVEC(16);

      vsum = vec_sub(vsum,vsum2);
      vsum2 = vec_sub(vsum5,vsum6);
      vsum3 = vec_sub(vsum3,vsum4);
      vsum4 = vec_sub(vsum7,vsum8);
      v1 = vec_add(vsum,vsum3);
      v2 = vec_add(vsum2,vsum4);
      vsum = (vector float)vec_cts(v1,16);
      vsum2 = (vector float)vec_cts(v2,16);
      v5 = (vector float)vec_cmpgt(v1,vmax);
      v6 = (vector float)vec_cmplt(v1,vmin);
      v7 = (vector float)vec_cmpgt(v2,vmax);
      v8 = (vector float)vec_cmplt(v2,vmin);

      v1 = vec_mergeh(vsum, vsum2);
      v2 = vec_mergel(vsum, vsum2);
      v3 = vec_perm(vprev,v1,vperm2);
      v4 = vec_perm(v1,v2,vperm2);
      vprev = v2;
      vec_st((vector signed int)v3,0,samples);
      vec_st((vector signed int)v4,16,samples);
      samples += 8;

      v1 = (vector float)vec_sr((vector unsigned int)v5, vshift);
      v2 = (vector float)vec_sr((vector unsigned int)v6, vshift);
      v3 = (vector float)vec_sr((vector unsigned int)v7, vshift);
      v4 = (vector float)vec_sr((vector unsigned int)v8, vshift);
      v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
      v2 = (vector float)vec_add((vector unsigned int)v3,(vector unsigned int)v4);
      vclip = vec_sums((vector signed int)v1,vclip);
      vclip = vec_sums((vector signed int)v2,vclip);
    }

    for (j=4;j;j--)
    {
      SYNTH_STEREO_ALTIVEC(-16);

      vsum = vec_add(vsum,vsum2);
      vsum2 = vec_add(vsum5,vsum6);
      vsum3 = vec_add(vsum3,vsum4);
      vsum4 = vec_add(vsum7,vsum8);
      v1 = vec_add(vsum,vsum3);
      v2 = vec_add(vsum2,vsum4);
      vsum = (vector float)vec_cts(v1,16);
      vsum2 = (vector float)vec_cts(v2,16);
      v5 = (vector float)vec_cmpgt(v1,vmax);
      v6 = (vector float)vec_cmplt(v1,vmin);
      v7 = (vector float)vec_cmpgt(v2,vmax);
      v8 = (vector float)vec_cmplt(v2,vmin);

      v1 = vec_mergeh(vsum, vsum2);
      v2 = vec_mergel(vsum, vsum2);
      v3 = vec_perm(vprev,v1,vperm2);
      v4 = vec_perm(v1,v2,vperm2);
      vprev = v2;
      vec_st((vector signed int)v3,0,samples);
      vec_st((vector signed int)v4,16,samples);
      samples += 8;

      v1 = (vector float)vec_sr((vector unsigned int)v5, vshift);
      v2 = (vector float)vec_sr((vector unsigned int)v6, vshift);
      v3 = (vector float)vec_sr((vector unsigned int)v7, vshift);
      v4 = (vector float)vec_sr((vector unsigned int)v8, vshift);
      v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
      v2 = (vector float)vec_add((vector unsigned int)v3,(vector unsigned int)v4);
      vclip = vec_sums((vector signed int)v1,vclip);
      vclip = vec_sums((vector signed int)v2,vclip);
    }

    if((size_t)samples & 0xf)
    {
      v1 = (vector float)vec_perm(vec_ld(0,samples),vec_ld(0,samples),vec_lvsl(0,samples));
      v2 = (vector float)vec_perm(vprev,v1,vperm2);
      vec_st((vector signed int)v2,0,samples);
    }

    vec_st(vclip,0,clip_tmp);
    clip = clip_tmp[3];
  }
  fr->buffer.fill += 256;

  return clip;
}