SmartPDF - lightweight pdf viewer app for rosapps
[reactos.git] / rosapps / lib / libjpeg / jidctint.c
1 /*
2 * jidctint.c
3 *
4 * Copyright (C) 1991-1998, Thomas G. Lane.
5 * This file is part of the Independent JPEG Group's software.
6 * For conditions of distribution and use, see the accompanying README file.
7 *
8 * This file contains a slow-but-accurate integer implementation of the
9 * inverse DCT (Discrete Cosine Transform). In the IJG code, this routine
10 * must also perform dequantization of the input coefficients.
11 *
12 * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
13 * on each row (or vice versa, but it's more convenient to emit a row at
14 * a time). Direct algorithms are also available, but they are much more
15 * complex and seem not to be any faster when reduced to code.
16 *
17 * This implementation is based on an algorithm described in
18 * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
19 * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
20 * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
21 * The primary algorithm described there uses 11 multiplies and 29 adds.
22 * We use their alternate method with 12 multiplies and 32 adds.
23 * The advantage of this method is that no data path contains more than one
24 * multiplication; this allows a very simple and accurate implementation in
25 * scaled fixed-point arithmetic, with a minimal number of shifts.
26 */
27
28 #define JPEG_INTERNALS
29 #include "jinclude.h"
30 #include "jpeglib.h"
31 #include "jdct.h" /* Private declarations for DCT subsystem */
32
33 #ifdef DCT_ISLOW_SUPPORTED
34
35
36 /*
37 * This module is specialized to the case DCTSIZE = 8.
38 */
39
40 #if DCTSIZE != 8
41 Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
42 #endif
43
44
45 /*
46 * The poop on this scaling stuff is as follows:
47 *
48 * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
49 * larger than the true IDCT outputs. The final outputs are therefore
50 * a factor of N larger than desired; since N=8 this can be cured by
51 * a simple right shift at the end of the algorithm. The advantage of
52 * this arrangement is that we save two multiplications per 1-D IDCT,
53 * because the y0 and y4 inputs need not be divided by sqrt(N).
54 *
55 * We have to do addition and subtraction of the integer inputs, which
56 * is no problem, and multiplication by fractional constants, which is
57 * a problem to do in integer arithmetic. We multiply all the constants
58 * by CONST_SCALE and convert them to integer constants (thus retaining
59 * CONST_BITS bits of precision in the constants). After doing a
60 * multiplication we have to divide the product by CONST_SCALE, with proper
61 * rounding, to produce the correct output. This division can be done
62 * cheaply as a right shift of CONST_BITS bits. We postpone shifting
63 * as long as possible so that partial sums can be added together with
64 * full fractional precision.
65 *
66 * The outputs of the first pass are scaled up by PASS1_BITS bits so that
67 * they are represented to better-than-integral precision. These outputs
68 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
69 * with the recommended scaling. (To scale up 12-bit sample data further, an
70 * intermediate INT32 array would be needed.)
71 *
72 * To avoid overflow of the 32-bit intermediate results in pass 2, we must
73 * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis
74 * shows that the values given below are the most effective.
75 */
76
77 #if BITS_IN_JSAMPLE == 8
78 #define CONST_BITS 13
79 #define PASS1_BITS 2
80 #else
81 #define CONST_BITS 13
82 #define PASS1_BITS 1 /* lose a little precision to avoid overflow */
83 #endif
84
85 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
86 * causing a lot of useless floating-point operations at run time.
87 * To get around this we use the following pre-calculated constants.
88 * If you change CONST_BITS you may want to add appropriate values.
89 * (With a reasonable C compiler, you can just rely on the FIX() macro...)
90 */
91
92 #if CONST_BITS == 13
93 #define FIX_0_298631336 ((INT32) 2446) /* FIX(0.298631336) */
94 #define FIX_0_390180644 ((INT32) 3196) /* FIX(0.390180644) */
95 #define FIX_0_541196100 ((INT32) 4433) /* FIX(0.541196100) */
96 #define FIX_0_765366865 ((INT32) 6270) /* FIX(0.765366865) */
97 #define FIX_0_899976223 ((INT32) 7373) /* FIX(0.899976223) */
98 #define FIX_1_175875602 ((INT32) 9633) /* FIX(1.175875602) */
99 #define FIX_1_501321110 ((INT32) 12299) /* FIX(1.501321110) */
100 #define FIX_1_847759065 ((INT32) 15137) /* FIX(1.847759065) */
101 #define FIX_1_961570560 ((INT32) 16069) /* FIX(1.961570560) */
102 #define FIX_2_053119869 ((INT32) 16819) /* FIX(2.053119869) */
103 #define FIX_2_562915447 ((INT32) 20995) /* FIX(2.562915447) */
104 #define FIX_3_072711026 ((INT32) 25172) /* FIX(3.072711026) */
105 #else
106 #define FIX_0_298631336 FIX(0.298631336)
107 #define FIX_0_390180644 FIX(0.390180644)
108 #define FIX_0_541196100 FIX(0.541196100)
109 #define FIX_0_765366865 FIX(0.765366865)
110 #define FIX_0_899976223 FIX(0.899976223)
111 #define FIX_1_175875602 FIX(1.175875602)
112 #define FIX_1_501321110 FIX(1.501321110)
113 #define FIX_1_847759065 FIX(1.847759065)
114 #define FIX_1_961570560 FIX(1.961570560)
115 #define FIX_2_053119869 FIX(2.053119869)
116 #define FIX_2_562915447 FIX(2.562915447)
117 #define FIX_3_072711026 FIX(3.072711026)
118 #endif
119
120
121 /* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
122 * For 8-bit samples with the recommended scaling, all the variable
123 * and constant values involved are no more than 16 bits wide, so a
124 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
125 * For 12-bit samples, a full 32-bit multiplication will be needed.
126 */
127
128 #if BITS_IN_JSAMPLE == 8
129 #define MULTIPLY(var,const) MULTIPLY16C16(var,const)
130 #else
131 #define MULTIPLY(var,const) ((var) * (const))
132 #endif
133
134
135 /* Dequantize a coefficient by multiplying it by the multiplier-table
136 * entry; produce an int result. In this module, both inputs and result
137 * are 16 bits or less, so either int or short multiply will work.
138 */
139
140 #define DEQUANTIZE(coef,quantval) (((ISLOW_MULT_TYPE) (coef)) * (quantval))
141
142
143 /*
144 * Perform dequantization and inverse DCT on one block of coefficients.
145 */
146
147 GLOBAL(void)
148 jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
149 JCOEFPTR coef_block,
150 JSAMPARRAY output_buf, JDIMENSION output_col)
151 {
152 INT32 tmp0, tmp1, tmp2, tmp3;
153 INT32 tmp10, tmp11, tmp12, tmp13;
154 INT32 z1, z2, z3, z4, z5;
155 JCOEFPTR inptr;
156 ISLOW_MULT_TYPE * quantptr;
157 int * wsptr;
158 JSAMPROW outptr;
159 JSAMPLE *range_limit = IDCT_range_limit(cinfo);
160 int ctr;
161 int workspace[DCTSIZE2]; /* buffers data between passes */
162 SHIFT_TEMPS
163
164 /* Pass 1: process columns from input, store into work array. */
165 /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
166 /* furthermore, we scale the results by 2**PASS1_BITS. */
167
168 inptr = coef_block;
169 quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
170 wsptr = workspace;
171 for (ctr = DCTSIZE; ctr > 0; ctr--) {
172 /* Due to quantization, we will usually find that many of the input
173 * coefficients are zero, especially the AC terms. We can exploit this
174 * by short-circuiting the IDCT calculation for any column in which all
175 * the AC terms are zero. In that case each output is equal to the
176 * DC coefficient (with scale factor as needed).
177 * With typical images and quantization tables, half or more of the
178 * column DCT calculations can be simplified this way.
179 */
180
181 if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
182 inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
183 inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
184 inptr[DCTSIZE*7] == 0) {
185 /* AC terms all zero */
186 int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
187
188 wsptr[DCTSIZE*0] = dcval;
189 wsptr[DCTSIZE*1] = dcval;
190 wsptr[DCTSIZE*2] = dcval;
191 wsptr[DCTSIZE*3] = dcval;
192 wsptr[DCTSIZE*4] = dcval;
193 wsptr[DCTSIZE*5] = dcval;
194 wsptr[DCTSIZE*6] = dcval;
195 wsptr[DCTSIZE*7] = dcval;
196
197 inptr++; /* advance pointers to next column */
198 quantptr++;
199 wsptr++;
200 continue;
201 }
202
203 /* Even part: reverse the even part of the forward DCT. */
204 /* The rotator is sqrt(2)*c(-6). */
205
206 z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
207 z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
208
209 z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
210 tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
211 tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
212
213 z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
214 z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
215
216 tmp0 = (z2 + z3) << CONST_BITS;
217 tmp1 = (z2 - z3) << CONST_BITS;
218
219 tmp10 = tmp0 + tmp3;
220 tmp13 = tmp0 - tmp3;
221 tmp11 = tmp1 + tmp2;
222 tmp12 = tmp1 - tmp2;
223
224 /* Odd part per figure 8; the matrix is unitary and hence its
225 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
226 */
227
228 tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
229 tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
230 tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
231 tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
232
233 z1 = tmp0 + tmp3;
234 z2 = tmp1 + tmp2;
235 z3 = tmp0 + tmp2;
236 z4 = tmp1 + tmp3;
237 z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
238
239 tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
240 tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
241 tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
242 tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
243 z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
244 z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
245 z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
246 z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
247
248 z3 += z5;
249 z4 += z5;
250
251 tmp0 += z1 + z3;
252 tmp1 += z2 + z4;
253 tmp2 += z2 + z3;
254 tmp3 += z1 + z4;
255
256 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
257
258 wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
259 wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
260 wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
261 wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
262 wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
263 wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
264 wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
265 wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
266
267 inptr++; /* advance pointers to next column */
268 quantptr++;
269 wsptr++;
270 }
271
272 /* Pass 2: process rows from work array, store into output array. */
273 /* Note that we must descale the results by a factor of 8 == 2**3, */
274 /* and also undo the PASS1_BITS scaling. */
275
276 wsptr = workspace;
277 for (ctr = 0; ctr < DCTSIZE; ctr++) {
278 outptr = output_buf[ctr] + output_col;
279 /* Rows of zeroes can be exploited in the same way as we did with columns.
280 * However, the column calculation has created many nonzero AC terms, so
281 * the simplification applies less often (typically 5% to 10% of the time).
282 * On machines with very fast multiplication, it's possible that the
283 * test takes more time than it's worth. In that case this section
284 * may be commented out.
285 */
286
287 #ifndef NO_ZERO_ROW_TEST
288 if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
289 wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
290 /* AC terms all zero */
291 JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
292 & RANGE_MASK];
293
294 outptr[0] = dcval;
295 outptr[1] = dcval;
296 outptr[2] = dcval;
297 outptr[3] = dcval;
298 outptr[4] = dcval;
299 outptr[5] = dcval;
300 outptr[6] = dcval;
301 outptr[7] = dcval;
302
303 wsptr += DCTSIZE; /* advance pointer to next row */
304 continue;
305 }
306 #endif
307
308 /* Even part: reverse the even part of the forward DCT. */
309 /* The rotator is sqrt(2)*c(-6). */
310
311 z2 = (INT32) wsptr[2];
312 z3 = (INT32) wsptr[6];
313
314 z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
315 tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
316 tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
317
318 tmp0 = ((INT32) wsptr[0] + (INT32) wsptr[4]) << CONST_BITS;
319 tmp1 = ((INT32) wsptr[0] - (INT32) wsptr[4]) << CONST_BITS;
320
321 tmp10 = tmp0 + tmp3;
322 tmp13 = tmp0 - tmp3;
323 tmp11 = tmp1 + tmp2;
324 tmp12 = tmp1 - tmp2;
325
326 /* Odd part per figure 8; the matrix is unitary and hence its
327 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
328 */
329
330 tmp0 = (INT32) wsptr[7];
331 tmp1 = (INT32) wsptr[5];
332 tmp2 = (INT32) wsptr[3];
333 tmp3 = (INT32) wsptr[1];
334
335 z1 = tmp0 + tmp3;
336 z2 = tmp1 + tmp2;
337 z3 = tmp0 + tmp2;
338 z4 = tmp1 + tmp3;
339 z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
340
341 tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
342 tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
343 tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
344 tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
345 z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
346 z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
347 z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
348 z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
349
350 z3 += z5;
351 z4 += z5;
352
353 tmp0 += z1 + z3;
354 tmp1 += z2 + z4;
355 tmp2 += z2 + z3;
356 tmp3 += z1 + z4;
357
358 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
359
360 outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3,
361 CONST_BITS+PASS1_BITS+3)
362 & RANGE_MASK];
363 outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3,
364 CONST_BITS+PASS1_BITS+3)
365 & RANGE_MASK];
366 outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2,
367 CONST_BITS+PASS1_BITS+3)
368 & RANGE_MASK];
369 outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2,
370 CONST_BITS+PASS1_BITS+3)
371 & RANGE_MASK];
372 outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1,
373 CONST_BITS+PASS1_BITS+3)
374 & RANGE_MASK];
375 outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1,
376 CONST_BITS+PASS1_BITS+3)
377 & RANGE_MASK];
378 outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0,
379 CONST_BITS+PASS1_BITS+3)
380 & RANGE_MASK];
381 outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0,
382 CONST_BITS+PASS1_BITS+3)
383 & RANGE_MASK];
384
385 wsptr += DCTSIZE; /* advance pointer to next row */
386 }
387 }
388
389
390 #ifdef HAVE_SSE2_INTEL_MNEMONICS
391
392 /*
393 * Intel SSE2 optimized Inverse Discrete Cosine Transform
394 *
395 *
396 * Copyright (c) 2001-2002 Intel Corporation
397 * All Rights Reserved
398 *
399 *
400 * Authors:
401 * Danilov G.
402 *
403 *
404 *-----------------------------------------------------------------------------
405 *
406 * References:
407 * K.R. Rao and P. Yip
408 * Discrete Cosine Transform.
409 * Algorithms, Advantages, Applications.
410 * Academic Press, Inc, London, 1990.
411 * JPEG Group's software.
412 * This implementation is based on Appendix A.2 of the book (R&Y) ...
413 *
414 *-----------------------------------------------------------------------------
415 */
416
417 typedef unsigned char Ipp8u;
418 typedef unsigned short Ipp16u;
419 typedef unsigned int Ipp32u;
420
421 typedef signed char Ipp8s;
422 typedef signed short Ipp16s;
423 typedef signed int Ipp32s;
424
425 #define BITS_INV_ACC 4
426 #define SHIFT_INV_ROW 16 - BITS_INV_ACC
427 #define SHIFT_INV_COL 1 + BITS_INV_ACC
428
429 #define RND_INV_ROW 1024 * (6 - BITS_INV_ACC) /* 1 << (SHIFT_INV_ROW-1) */
430 #define RND_INV_COL = 16 * (BITS_INV_ACC - 3) /* 1 << (SHIFT_INV_COL-1) */
431 #define RND_INV_CORR = RND_INV_COL - 1 /* correction -1.0 and round */
432
433 #define c_inv_corr_0 -1024 * (6 - BITS_INV_ACC) + 65536 /* -0.5 + (16.0 or 32.0) */
434 #define c_inv_corr_1 1877 * (6 - BITS_INV_ACC) /* 0.9167 */
435 #define c_inv_corr_2 1236 * (6 - BITS_INV_ACC) /* 0.6035 */
436 #define c_inv_corr_3 680 * (6 - BITS_INV_ACC) /* 0.3322 */
437 #define c_inv_corr_4 0 * (6 - BITS_INV_ACC) /* 0.0 */
438 #define c_inv_corr_5 -569 * (6 - BITS_INV_ACC) /* -0.278 */
439 #define c_inv_corr_6 -512 * (6 - BITS_INV_ACC) /* -0.25 */
440 #define c_inv_corr_7 -651 * (6 - BITS_INV_ACC) /* -0.3176 */
441
442 #define RND_INV_ROW_0 RND_INV_ROW + c_inv_corr_0
443 #define RND_INV_ROW_1 RND_INV_ROW + c_inv_corr_1
444 #define RND_INV_ROW_2 RND_INV_ROW + c_inv_corr_2
445 #define RND_INV_ROW_3 RND_INV_ROW + c_inv_corr_3
446 #define RND_INV_ROW_4 RND_INV_ROW + c_inv_corr_4
447 #define RND_INV_ROW_5 RND_INV_ROW + c_inv_corr_5
448 #define RND_INV_ROW_6 RND_INV_ROW + c_inv_corr_6
449 #define RND_INV_ROW_7 RND_INV_ROW + c_inv_corr_7
450
451 /* Table for rows 0,4 - constants are multiplied on cos_4_16 */
452
453 __declspec(align(16)) short tab_i_04[] = {
454 16384, 21407, 16384, 8867,
455 -16384, 21407, 16384, -8867,
456 16384, -8867, 16384, -21407,
457 16384, 8867, -16384, -21407,
458 22725, 19266, 19266, -4520,
459 4520, 19266, 19266, -22725,
460 12873, -22725, 4520, -12873,
461 12873, 4520, -22725, -12873};
462
463 /* Table for rows 1,7 - constants are multiplied on cos_1_16 */
464
465 __declspec(align(16)) short tab_i_17[] = {
466 22725, 29692, 22725, 12299,
467 -22725, 29692, 22725, -12299,
468 22725, -12299, 22725, -29692,
469 22725, 12299, -22725, -29692,
470 31521, 26722, 26722, -6270,
471 6270, 26722, 26722, -31521,
472 17855, -31521, 6270, -17855,
473 17855, 6270, -31521, -17855};
474
475 /* Table for rows 2,6 - constants are multiplied on cos_2_16 */
476
477 __declspec(align(16)) short tab_i_26[] = {
478 21407, 27969, 21407, 11585,
479 -21407, 27969, 21407, -11585,
480 21407, -11585, 21407, -27969,
481 21407, 11585, -21407, -27969,
482 29692, 25172, 25172, -5906,
483 5906, 25172, 25172, -29692,
484 16819, -29692, 5906, -16819,
485 16819, 5906, -29692, -16819};
486
487 /* Table for rows 3,5 - constants are multiplied on cos_3_16 */
488
489 __declspec(align(16)) short tab_i_35[] = {
490 19266, 25172, 19266, 10426,
491 -19266, 25172, 19266, -10426,
492 19266, -10426, 19266, -25172,
493 19266, 10426, -19266, -25172,
494 26722, 22654, 22654, -5315,
495 5315, 22654, 22654, -26722,
496 15137, -26722, 5315, -15137,
497 15137, 5315, -26722, -15137};
498
499 __declspec(align(16)) long round_i_0[] = {RND_INV_ROW_0,RND_INV_ROW_0,
500 RND_INV_ROW_0,RND_INV_ROW_0};
501 __declspec(align(16)) long round_i_1[] = {RND_INV_ROW_1,RND_INV_ROW_1,
502 RND_INV_ROW_1,RND_INV_ROW_1};
503 __declspec(align(16)) long round_i_2[] = {RND_INV_ROW_2,RND_INV_ROW_2,
504 RND_INV_ROW_2,RND_INV_ROW_2};
505 __declspec(align(16)) long round_i_3[] = {RND_INV_ROW_3,RND_INV_ROW_3,
506 RND_INV_ROW_3,RND_INV_ROW_3};
507 __declspec(align(16)) long round_i_4[] = {RND_INV_ROW_4,RND_INV_ROW_4,
508 RND_INV_ROW_4,RND_INV_ROW_4};
509 __declspec(align(16)) long round_i_5[] = {RND_INV_ROW_5,RND_INV_ROW_5,
510 RND_INV_ROW_5,RND_INV_ROW_5};
511 __declspec(align(16)) long round_i_6[] = {RND_INV_ROW_6,RND_INV_ROW_6,
512 RND_INV_ROW_6,RND_INV_ROW_6};
513 __declspec(align(16)) long round_i_7[] = {RND_INV_ROW_7,RND_INV_ROW_7,
514 RND_INV_ROW_7,RND_INV_ROW_7};
515
516 __declspec(align(16)) short tg_1_16[] = {
517 13036, 13036, 13036, 13036, /* tg * (2<<16) + 0.5 */
518 13036, 13036, 13036, 13036};
519 __declspec(align(16)) short tg_2_16[] = {
520 27146, 27146, 27146, 27146, /* tg * (2<<16) + 0.5 */
521 27146, 27146, 27146, 27146};
522 __declspec(align(16)) short tg_3_16[] = {
523 -21746, -21746, -21746, -21746, /* tg * (2<<16) + 0.5 */
524 -21746, -21746, -21746, -21746};
525 __declspec(align(16)) short cos_4_16[] = {
526 -19195, -19195, -19195, -19195, /* cos * (2<<16) + 0.5 */
527 -19195, -19195, -19195, -19195};
528
529 /*
530 * In this implementation the outputs of the iDCT-1D are multiplied
531 * for rows 0,4 - on cos_4_16,
532 * for rows 1,7 - on cos_1_16,
533 * for rows 2,6 - on cos_2_16,
534 * for rows 3,5 - on cos_3_16
535 * and are shifted to the left for rise of accuracy
536 *
537 * For used constants
538 * FIX(float_const) = (short) (float_const * (1<<15) + 0.5)
539 *
540 *-----------------------------------------------------------------------------
541 *
542 * On the first stage the calculation is executed at once for two rows.
543 * The permutation for each output row is done on second stage
544 * t7 t6 t5 t4 t3 t2 t1 t0 -> t4 t5 t6 t7 t3 t2 t1 t0
545 *
546 *-----------------------------------------------------------------------------
547 */
548
549 #define DCT_8_INV_ROW_2R(TABLE, ROUND1, ROUND2) __asm { \
550 __asm pshuflw xmm1, xmm0, 10001000b \
551 __asm pshuflw xmm0, xmm0, 11011101b \
552 __asm pshufhw xmm1, xmm1, 10001000b \
553 __asm pshufhw xmm0, xmm0, 11011101b \
554 __asm movdqa xmm2, XMMWORD PTR [TABLE] \
555 __asm pmaddwd xmm2, xmm1 \
556 __asm movdqa xmm3, XMMWORD PTR [TABLE + 32] \
557 __asm pmaddwd xmm3, xmm0 \
558 __asm pmaddwd xmm1, XMMWORD PTR [TABLE + 16] \
559 __asm pmaddwd xmm0, XMMWORD PTR [TABLE + 48] \
560 __asm pshuflw xmm5, xmm4, 10001000b \
561 __asm pshuflw xmm4, xmm4, 11011101b \
562 __asm pshufhw xmm5, xmm5, 10001000b \
563 __asm pshufhw xmm4, xmm4, 11011101b \
564 __asm movdqa xmm6, XMMWORD PTR [TABLE] \
565 __asm pmaddwd xmm6, xmm5 \
566 __asm movdqa xmm7, XMMWORD PTR [TABLE + 32] \
567 __asm pmaddwd xmm7, xmm4 \
568 __asm pmaddwd xmm5, XMMWORD PTR [TABLE + 16] \
569 __asm pmaddwd xmm4, XMMWORD PTR [TABLE + 48] \
570 __asm pshufd xmm1, xmm1, 01001110b \
571 __asm pshufd xmm0, xmm0, 01001110b \
572 __asm paddd xmm2, XMMWORD PTR [ROUND1] \
573 __asm paddd xmm3, xmm0 \
574 __asm paddd xmm1, xmm2 \
575 __asm pshufd xmm5, xmm5, 01001110b \
576 __asm pshufd xmm4, xmm4, 01001110b \
577 __asm movdqa xmm2, xmm1 \
578 __asm psubd xmm2, xmm3 \
579 __asm psrad xmm2, SHIFT_INV_ROW \
580 __asm paddd xmm1, xmm3 \
581 __asm psrad xmm1, SHIFT_INV_ROW \
582 __asm packssdw xmm1, xmm2 \
583 __asm paddd xmm6, XMMWORD PTR [ROUND2] \
584 __asm paddd xmm7, xmm4 \
585 __asm paddd xmm5, xmm6 \
586 __asm movdqa xmm6, xmm5 \
587 __asm psubd xmm6, xmm7 \
588 __asm psrad xmm6, SHIFT_INV_ROW \
589 __asm paddd xmm5, xmm7 \
590 __asm psrad xmm5, SHIFT_INV_ROW \
591 __asm packssdw xmm5, xmm6 \
592 }
593
594 /*
595 *
596 * The second stage - inverse DCTs of columns
597 *
598 * The inputs are multiplied
599 * for rows 0,4 - on cos_4_16,
600 * for rows 1,7 - on cos_1_16,
601 * for rows 2,6 - on cos_2_16,
602 * for rows 3,5 - on cos_3_16
603 * and are shifted to the left for rise of accuracy
604 */
605
606 #define DCT_8_INV_COL_8R(INP, OUTP) __asm { \
607 __asm movdqa xmm0, [INP + 5*16] \
608 __asm movdqa xmm1, XMMWORD PTR tg_3_16 \
609 __asm movdqa xmm2, xmm0 \
610 __asm movdqa xmm3, [INP + 3*16] \
611 __asm pmulhw xmm0, xmm1 \
612 __asm movdqa xmm4, [INP + 7*16] \
613 __asm pmulhw xmm1, xmm3 \
614 __asm movdqa xmm5, XMMWORD PTR tg_1_16 \
615 __asm movdqa xmm6, xmm4 \
616 __asm pmulhw xmm4, xmm5 \
617 __asm paddsw xmm0, xmm2 \
618 __asm pmulhw xmm5, [INP + 1*16] \
619 __asm paddsw xmm1, xmm3 \
620 __asm movdqa xmm7, [INP + 6*16] \
621 __asm paddsw xmm0, xmm3 \
622 __asm movdqa xmm3, XMMWORD PTR tg_2_16 \
623 __asm psubsw xmm2, xmm1 \
624 __asm pmulhw xmm7, xmm3 \
625 __asm movdqa xmm1, xmm0 \
626 __asm pmulhw xmm3, [INP + 2*16] \
627 __asm psubsw xmm5, xmm6 \
628 __asm paddsw xmm4, [INP + 1*16] \
629 __asm paddsw xmm0, xmm4 \
630 __asm psubsw xmm4, xmm1 \
631 __asm pshufhw xmm0, xmm0, 00011011b \
632 __asm paddsw xmm7, [INP + 2*16] \
633 __asm movdqa xmm6, xmm5 \
634 __asm psubsw xmm3, [INP + 6*16] \
635 __asm psubsw xmm5, xmm2 \
636 __asm paddsw xmm6, xmm2 \
637 __asm movdqa [OUTP + 7*16], xmm0 \
638 __asm movdqa xmm1, xmm4 \
639 __asm movdqa xmm2, XMMWORD PTR cos_4_16 \
640 __asm paddsw xmm4, xmm5 \
641 __asm movdqa xmm0, XMMWORD PTR cos_4_16 \
642 __asm pmulhw xmm2, xmm4 \
643 __asm pshufhw xmm6, xmm6, 00011011b \
644 __asm movdqa [OUTP + 3*16], xmm6 \
645 __asm psubsw xmm1, xmm5 \
646 __asm movdqa xmm6, [INP + 0*16] \
647 __asm pmulhw xmm0, xmm1 \
648 __asm movdqa xmm5, [INP + 4*16] \
649 __asm paddsw xmm4, xmm2 \
650 __asm paddsw xmm5, xmm6 \
651 __asm psubsw xmm6, [INP + 4*16] \
652 __asm paddsw xmm0, xmm1 \
653 __asm pshufhw xmm4, xmm4, 00011011b \
654 __asm movdqa xmm2, xmm5 \
655 __asm paddsw xmm5, xmm7 \
656 __asm movdqa xmm1, xmm6 \
657 __asm psubsw xmm2, xmm7 \
658 __asm movdqa xmm7, [OUTP + 7*16] \
659 __asm paddsw xmm6, xmm3 \
660 __asm pshufhw xmm5, xmm5, 00011011b \
661 __asm paddsw xmm7, xmm5 \
662 __asm psubsw xmm1, xmm3 \
663 __asm pshufhw xmm6, xmm6, 00011011b \
664 __asm movdqa xmm3, xmm6 \
665 __asm paddsw xmm6, xmm4 \
666 __asm pshufhw xmm2, xmm2, 00011011b \
667 __asm psraw xmm7, SHIFT_INV_COL \
668 __asm movdqa [OUTP + 0*16], xmm7 \
669 __asm movdqa xmm7, xmm1 \
670 __asm paddsw xmm1, xmm0 \
671 __asm psraw xmm6, SHIFT_INV_COL \
672 __asm movdqa [OUTP + 1*16], xmm6 \
673 __asm pshufhw xmm1, xmm1, 00011011b \
674 __asm movdqa xmm6, [OUTP + 3*16] \
675 __asm psubsw xmm7, xmm0 \
676 __asm psraw xmm1, SHIFT_INV_COL \
677 __asm movdqa [OUTP + 2*16], xmm1 \
678 __asm psubsw xmm5, [OUTP + 7*16] \
679 __asm paddsw xmm6, xmm2 \
680 __asm psubsw xmm2, [OUTP + 3*16] \
681 __asm psubsw xmm3, xmm4 \
682 __asm psraw xmm7, SHIFT_INV_COL \
683 __asm pshufhw xmm7, xmm7, 00011011b \
684 __asm movdqa [OUTP + 5*16], xmm7 \
685 __asm psraw xmm5, SHIFT_INV_COL \
686 __asm movdqa [OUTP + 7*16], xmm5 \
687 __asm psraw xmm6, SHIFT_INV_COL \
688 __asm movdqa [OUTP + 3*16], xmm6 \
689 __asm psraw xmm2, SHIFT_INV_COL \
690 __asm movdqa [OUTP + 4*16], xmm2 \
691 __asm psraw xmm3, SHIFT_INV_COL \
692 __asm movdqa [OUTP + 6*16], xmm3 \
693 }
694
695 /*
696 *
697 * Name: dct_8x8_inv_16s
698 * Purpose: Inverse Discrete Cosine Transform 8x8 with
699 * 2D buffer of short int data
700 * Context:
701 * void dct_8x8_inv_16s ( short *src, short *dst )
702 * Parameters:
703 * src - Pointer to the source buffer
704 * dst - Pointer to the destination buffer
705 *
706 */
707
708 GLOBAL(void)
709 dct_8x8_inv_16s ( short *src, short *dst ) {
710
711 __asm {
712
713 mov ecx, src
714 mov edx, dst
715
716 movdqa xmm0, [ecx+0*16]
717 movdqa xmm4, [ecx+4*16]
718 DCT_8_INV_ROW_2R(tab_i_04, round_i_0, round_i_4)
719 movdqa [edx+0*16], xmm1
720 movdqa [edx+4*16], xmm5
721
722 movdqa xmm0, [ecx+1*16]
723 movdqa xmm4, [ecx+7*16]
724 DCT_8_INV_ROW_2R(tab_i_17, round_i_1, round_i_7)
725 movdqa [edx+1*16], xmm1
726 movdqa [edx+7*16], xmm5
727
728 movdqa xmm0, [ecx+3*16]
729 movdqa xmm4, [ecx+5*16]
730 DCT_8_INV_ROW_2R(tab_i_35, round_i_3, round_i_5);
731 movdqa [edx+3*16], xmm1
732 movdqa [edx+5*16], xmm5
733
734 movdqa xmm0, [ecx+2*16]
735 movdqa xmm4, [ecx+6*16]
736 DCT_8_INV_ROW_2R(tab_i_26, round_i_2, round_i_6);
737 movdqa [edx+2*16], xmm1
738 movdqa [edx+6*16], xmm5
739
740 DCT_8_INV_COL_8R(edx+0, edx+0);
741 }
742 }
743
744
745 /*
746 * Name:
747 * ownpj_QuantInv_8x8_16s
748 *
749 * Purpose:
750 * Dequantize 8x8 block of DCT coefficients
751 *
752 * Context:
753 * void ownpj_QuantInv_8x8_16s
754 * Ipp16s* pSrc,
755 * Ipp16s* pDst,
756 * const Ipp16u* pQTbl)*
757 *
758 */
759
760 GLOBAL(void)
761 ownpj_QuantInv_8x8_16s(short * pSrc, short * pDst, const unsigned short * pQTbl)
762 {
763 __asm {
764
765 push ebx
766 push ecx
767 push edx
768 push esi
769 push edi
770
771 mov esi, pSrc
772 mov edi, pDst
773 mov edx, pQTbl
774 mov ecx, 4
775 mov ebx, 32
776
777 again:
778
779 movq mm0, QWORD PTR [esi+0]
780 movq mm1, QWORD PTR [esi+8]
781 movq mm2, QWORD PTR [esi+16]
782 movq mm3, QWORD PTR [esi+24]
783
784 prefetcht0 [esi+ebx] ; fetch next cache line
785
786 pmullw mm0, QWORD PTR [edx+0]
787 pmullw mm1, QWORD PTR [edx+8]
788 pmullw mm2, QWORD PTR [edx+16]
789 pmullw mm3, QWORD PTR [edx+24]
790
791 movq QWORD PTR [edi+0], mm0
792 movq QWORD PTR [edi+8], mm1
793 movq QWORD PTR [edi+16], mm2
794 movq QWORD PTR [edi+24], mm3
795
796 add esi, ebx
797 add edi, ebx
798 add edx, ebx
799 dec ecx
800 jnz again
801
802 emms
803
804 pop edi
805 pop esi
806 pop edx
807 pop ecx
808 pop ebx
809 }
810 }
811
812
813 /*
814 * Name:
815 * ownpj_Add128_8x8_16s8u
816 *
817 * Purpose:
818 * signed to unsigned conversion (level shift)
819 * for 8x8 block of DCT coefficients
820 *
821 * Context:
822 * void ownpj_Add128_8x8_16s8u
823 * const Ipp16s* pSrc,
824 * Ipp8u* pDst,
825 * int DstStep);
826 *
827 */
828
829 __declspec(align(16)) long const_128[]= {0x00800080, 0x00800080, 0x00800080, 0x00800080};
830
831 GLOBAL(void)
832 ownpj_Add128_8x8_16s8u(const short * pSrc, unsigned char * pDst, int DstStep)
833 {
834 __asm {
835 push eax
836 push ebx
837 push ecx
838 push edx
839 push esi
840 push edi
841
842 mov esi, pSrc
843 mov edi, pDst
844 mov edx, DstStep
845 mov ecx, 2
846 mov ebx, edx
847 mov eax, edx
848 sal ebx, 1
849 add eax, ebx
850 movdqa xmm7, XMMWORD PTR const_128
851
852 again:
853
854 movdqa xmm0, XMMWORD PTR [esi+0] ; line 0
855 movdqa xmm1, XMMWORD PTR [esi+16] ; line 1
856 movdqa xmm2, XMMWORD PTR [esi+32] ; line 2
857 movdqa xmm3, XMMWORD PTR [esi+48] ; line 3
858
859 paddw xmm0, xmm7
860 paddw xmm1, xmm7
861 paddw xmm2, xmm7
862 paddw xmm3, xmm7
863
864 packuswb xmm0, xmm1
865 packuswb xmm2, xmm3
866
867 movq QWORD PTR [edi], xmm0 ;0*DstStep
868 movq QWORD PTR [edi+ebx], xmm2 ;2*DstStep
869
870 psrldq xmm0, 8
871 psrldq xmm2, 8
872
873 movq QWORD PTR [edi+edx], xmm0 ;1*DstStep
874 movq QWORD PTR [edi+eax], xmm2 ;3*DstStep
875
876 add edi, ebx
877 add esi, 64
878 add edi, ebx
879 dec ecx
880 jnz again
881
882 pop edi
883 pop esi
884 pop edx
885 pop ecx
886 pop ebx
887 pop eax
888 }
889 }
890
891
892 /*
893 * Name:
894 * ippiDCTQuantInv8x8LS_JPEG_16s8u_C1R
895 *
896 * Purpose:
897 * Inverse DCT transform, de-quantization and level shift
898 *
899 * Parameters:
900 * pSrc - pointer to source
901 * pDst - pointer to output array
902 * DstStep - line offset for output data
903 * pEncoderQuantTable - pointer to Quantization table
904 *
905 */
906
907 GLOBAL(void)
908 ippiDCTQuantInv8x8LS_JPEG_16s8u_C1R(
909 short * pSrc,
910 unsigned char * pDst,
911 int DstStep,
912 const unsigned short * pQuantInvTable)
913 {
914
915 __declspec(align(16)) Ipp8u buf[DCTSIZE2*sizeof(Ipp16s)];
916 Ipp16s * workbuf = (Ipp16s *)buf;
917
918 ownpj_QuantInv_8x8_16s(pSrc,workbuf,pQuantInvTable);
919 dct_8x8_inv_16s(workbuf,workbuf);
920 ownpj_Add128_8x8_16s8u(workbuf,pDst,DstStep);
921
922 }
923
924 GLOBAL(void)
925 jpeg_idct_islow_sse2 (
926 j_decompress_ptr cinfo,
927 jpeg_component_info * compptr,
928 JCOEFPTR coef_block,
929 JSAMPARRAY output_buf,
930 JDIMENSION output_col)
931 {
932 int ctr;
933 JCOEFPTR inptr;
934 Ipp16u* quantptr;
935 Ipp8u* wsptr;
936 __declspec(align(16)) Ipp8u workspace[DCTSIZE2];
937 JSAMPROW outptr;
938
939 inptr = coef_block;
940 quantptr = (Ipp16u*)compptr->dct_table;
941 wsptr = workspace;
942
943 ippiDCTQuantInv8x8LS_JPEG_16s8u_C1R(inptr, workspace, 8, quantptr);
944
945 for(ctr = 0; ctr < DCTSIZE; ctr++)
946 {
947 outptr = output_buf[ctr] + output_col;
948
949 outptr[0] = wsptr[0];
950 outptr[1] = wsptr[1];
951 outptr[2] = wsptr[2];
952 outptr[3] = wsptr[3];
953 outptr[4] = wsptr[4];
954 outptr[5] = wsptr[5];
955 outptr[6] = wsptr[6];
956 outptr[7] = wsptr[7];
957
958 wsptr += DCTSIZE;
959 }
960 }
961 #endif /* HAVE_SSE2_INTEL_MNEMONICS */
962
963 #endif /* DCT_ISLOW_SUPPORTED */