[LIBJPEG] Update to version 9c. CORE-14291
[reactos.git] / dll / 3rdparty / libjpeg / jidctint.c
1 /*
2 * jidctint.c
3 *
4 * Copyright (C) 1991-1998, Thomas G. Lane.
5 * Modification developed 2002-2016 by Guido Vollbeding.
6 * This file is part of the Independent JPEG Group's software.
7 * For conditions of distribution and use, see the accompanying README file.
8 *
9 * This file contains a slow-but-accurate integer implementation of the
10 * inverse DCT (Discrete Cosine Transform). In the IJG code, this routine
11 * must also perform dequantization of the input coefficients.
12 *
13 * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
14 * on each row (or vice versa, but it's more convenient to emit a row at
15 * a time). Direct algorithms are also available, but they are much more
16 * complex and seem not to be any faster when reduced to code.
17 *
18 * This implementation is based on an algorithm described in
19 * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
20 * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
21 * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
22 * The primary algorithm described there uses 11 multiplies and 29 adds.
23 * We use their alternate method with 12 multiplies and 32 adds.
24 * The advantage of this method is that no data path contains more than one
25 * multiplication; this allows a very simple and accurate implementation in
26 * scaled fixed-point arithmetic, with a minimal number of shifts.
27 *
28 * We also provide IDCT routines with various output sample block sizes for
29 * direct resolution reduction or enlargement and for directly resolving the
30 * common 2x1 and 1x2 subsampling cases without additional resampling: NxN
31 * (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 input DCT block.
32 *
33 * For N<8 we simply take the corresponding low-frequency coefficients of
34 * the 8x8 input DCT block and apply an NxN point IDCT on the sub-block
35 * to yield the downscaled outputs.
36 * This can be seen as direct low-pass downsampling from the DCT domain
37 * point of view rather than the usual spatial domain point of view,
38 * yielding significant computational savings and results at least
39 * as good as common bilinear (averaging) spatial downsampling.
40 *
41 * For N>8 we apply a partial NxN IDCT, treating the 8 input coefficients as
42 * the lower frequencies and assuming the higher frequencies to be zero.
43 * It turns out that the computational effort is similar to the 8x8 IDCT
44 * regarding the output size.
45 * Furthermore, the scaling and descaling is the same for all IDCT sizes.
46 *
47 * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
48 * since there would be too many additional constants to pre-calculate.
49 */
50
51 #define JPEG_INTERNALS
52 #include "jinclude.h"
53 #include "jpeglib.h"
54 #include "jdct.h" /* Private declarations for DCT subsystem */
55
56 #ifdef DCT_ISLOW_SUPPORTED
57
58
59 /*
60 * This module is specialized to the case DCTSIZE = 8.
61 */
62
    /* Compile-time guard: the kernels below are unrolled for exactly eight
     * coefficients per row/column (indices DCTSIZE*0 .. DCTSIZE*7), so any
     * other DCTSIZE is rejected by a deliberately invalid source line.
     */
63 #if DCTSIZE != 8
64 Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
65 #endif
66
67
68 /*
69 * The poop on this scaling stuff is as follows:
70 *
71 * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
72 * larger than the true IDCT outputs. The final outputs are therefore
73 * a factor of N larger than desired; since N=8 this can be cured by
74 * a simple right shift at the end of the algorithm. The advantage of
75 * this arrangement is that we save two multiplications per 1-D IDCT,
76 * because the y0 and y4 inputs need not be divided by sqrt(N).
77 *
78 * We have to do addition and subtraction of the integer inputs, which
79 * is no problem, and multiplication by fractional constants, which is
80 * a problem to do in integer arithmetic. We multiply all the constants
81 * by CONST_SCALE and convert them to integer constants (thus retaining
82 * CONST_BITS bits of precision in the constants). After doing a
83 * multiplication we have to divide the product by CONST_SCALE, with proper
84 * rounding, to produce the correct output. This division can be done
85 * cheaply as a right shift of CONST_BITS bits. We postpone shifting
86 * as long as possible so that partial sums can be added together with
87 * full fractional precision.
88 *
89 * The outputs of the first pass are scaled up by PASS1_BITS bits so that
90 * they are represented to better-than-integral precision. These outputs
91 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
92 * with the recommended scaling. (To scale up 12-bit sample data further, an
93 * intermediate INT32 array would be needed.)
94 *
95 * To avoid overflow of the 32-bit intermediate results in pass 2, we must
96 * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis
97 * shows that the values given below are the most effective.
98 */
99
    /* CONST_BITS is the number of fractional bits kept in the fixed-point
     * multiplier constants; PASS1_BITS is the extra up-scaling carried by the
     * pass-1 results. With 8-bit samples 8 + 13 + 2 = 23 stays below the
     * 26-bit limit quoted in the comment before this block; the other branch
     * drops PASS1_BITS to 1 (e.g. 12 + 13 + 1 = 26 for 12-bit samples).
     */
100 #if BITS_IN_JSAMPLE == 8
101 #define CONST_BITS 13
102 #define PASS1_BITS 2
103 #else
104 #define CONST_BITS 13
105 #define PASS1_BITS 1 /* lose a little precision to avoid overflow */
106 #endif
107
108 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
109 * causing a lot of useless floating-point operations at run time.
110 * To get around this we use the following pre-calculated constants.
111 * If you change CONST_BITS you may want to add appropriate values.
112 * (With a reasonable C compiler, you can just rely on the FIX() macro...)
113 */
114
    /* Pre-computed branch: each value is the decimal constant spelled out in
     * the macro name, multiplied by 2^13 and rounded to the nearest integer
     * (e.g. 0.541196100 * 8192 = 4433.48 -> 4433).
     */
115 #if CONST_BITS == 13
116 #define FIX_0_298631336 ((INT32) 2446) /* FIX(0.298631336) */
117 #define FIX_0_390180644 ((INT32) 3196) /* FIX(0.390180644) */
118 #define FIX_0_541196100 ((INT32) 4433) /* FIX(0.541196100) */
119 #define FIX_0_765366865 ((INT32) 6270) /* FIX(0.765366865) */
120 #define FIX_0_899976223 ((INT32) 7373) /* FIX(0.899976223) */
121 #define FIX_1_175875602 ((INT32) 9633) /* FIX(1.175875602) */
122 #define FIX_1_501321110 ((INT32) 12299) /* FIX(1.501321110) */
123 #define FIX_1_847759065 ((INT32) 15137) /* FIX(1.847759065) */
124 #define FIX_1_961570560 ((INT32) 16069) /* FIX(1.961570560) */
125 #define FIX_2_053119869 ((INT32) 16819) /* FIX(2.053119869) */
126 #define FIX_2_562915447 ((INT32) 20995) /* FIX(2.562915447) */
127 #define FIX_3_072711026 ((INT32) 25172) /* FIX(3.072711026) */
128 #else
    /* Any other CONST_BITS: defer to the FIX() macro (defined outside this
     * section) to scale the same twelve constants.
     */
129 #define FIX_0_298631336 FIX(0.298631336)
130 #define FIX_0_390180644 FIX(0.390180644)
131 #define FIX_0_541196100 FIX(0.541196100)
132 #define FIX_0_765366865 FIX(0.765366865)
133 #define FIX_0_899976223 FIX(0.899976223)
134 #define FIX_1_175875602 FIX(1.175875602)
135 #define FIX_1_501321110 FIX(1.501321110)
136 #define FIX_1_847759065 FIX(1.847759065)
137 #define FIX_1_961570560 FIX(1.961570560)
138 #define FIX_2_053119869 FIX(2.053119869)
139 #define FIX_2_562915447 FIX(2.562915447)
140 #define FIX_3_072711026 FIX(3.072711026)
141 #endif
142
143
144 /* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
145 * For 8-bit samples with the recommended scaling, all the variable
146 * and constant values involved are no more than 16 bits wide, so a
147 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
148 * For 12-bit samples, a full 32-bit multiplication will be needed.
149 */
150
    /* MULTIPLY(var,const): product of a variable and one of the scaled
     * constants. The 8-bit build forwards to MULTIPLY16C16 (not defined in
     * this section); every other build uses a plain C multiplication with
     * both operands parenthesized.
     */
151 #if BITS_IN_JSAMPLE == 8
152 #define MULTIPLY(var,const) MULTIPLY16C16(var,const)
153 #else
154 #define MULTIPLY(var,const) ((var) * (const))
155 #endif
156
157
158 /* Dequantize a coefficient by multiplying it by the multiplier-table
159 * entry; produce an int result. In this module, both inputs and result
160 * are 16 bits or less, so either int or short multiply will work.
161 */
162
    /* The coefficient is cast to ISLOW_MULT_TYPE before the multiply; each
     * argument is expanded exactly once, so side effects are not duplicated.
     */
163 #define DEQUANTIZE(coef,quantval) (((ISLOW_MULT_TYPE) (coef)) * (quantval))
164
165
166 /*
167 * Perform dequantization and inverse DCT on one block of coefficients.
168 *
169 * Optimized algorithm with 12 multiplications in the 1-D kernel.
170 * cK represents sqrt(2) * cos(K*pi/16).
    *
    * Parameters:
    *   cinfo      - used only to fetch the range_limit lookup table through
    *                IDCT_range_limit().
    *   compptr    - supplies dct_table, which is read here as an array of
    *                ISLOW_MULT_TYPE multipliers, one per coefficient position.
    *   coef_block - the DCTSIZE2 input coefficients; the entry for row r,
    *                column c is read at index DCTSIZE*r + c.
    *   output_buf - array of row pointers; rows 0 .. DCTSIZE-1 are written.
    *   output_col - offset added to every row pointer; DCTSIZE samples are
    *                stored starting there.
    *
    * The routine returns nothing; its only visible effect is the 8x8 block
    * of samples stored through output_buf.
    *
    * NOTE(review): several "<<" operations below act on values that can be
    * negative if a dequantized coefficient is negative. ISO C leaves a left
    * shift of a negative signed value undefined, so the code relies on the
    * compiler treating it as a two's-complement multiply - confirm that this
    * holds for every supported toolchain.
171 */
172
173 GLOBAL(void)
174 jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
175 JCOEFPTR coef_block,
176 JSAMPARRAY output_buf, JDIMENSION output_col)
177 {
178 INT32 tmp0, tmp1, tmp2, tmp3;
179 INT32 tmp10, tmp11, tmp12, tmp13;
180 INT32 z1, z2, z3;
181 JCOEFPTR inptr;
182 ISLOW_MULT_TYPE * quantptr;
183 int * wsptr;
184 JSAMPROW outptr;
185 JSAMPLE *range_limit = IDCT_range_limit(cinfo);
186 int ctr;
187 int workspace[DCTSIZE2]; /* buffers data between passes */
188 SHIFT_TEMPS
189
190 /* Pass 1: process columns from input, store into work array.
191 * Note results are scaled up by sqrt(8) compared to a true IDCT;
192 * furthermore, we scale the results by 2**PASS1_BITS.
193 */
194
195 inptr = coef_block;
196 quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
197 wsptr = workspace;
198 for (ctr = DCTSIZE; ctr > 0; ctr--) {
199 /* Due to quantization, we will usually find that many of the input
200 * coefficients are zero, especially the AC terms. We can exploit this
201 * by short-circuiting the IDCT calculation for any column in which all
202 * the AC terms are zero. In that case each output is equal to the
203 * DC coefficient (with scale factor as needed).
204 * With typical images and quantization tables, half or more of the
205 * column DCT calculations can be simplified this way.
206 */
207
208 if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
209 inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
210 inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
211 inptr[DCTSIZE*7] == 0) {
212 /* AC terms all zero */
    /* Only PASS1_BITS of scaling is applied: the general path scales the DC
     * term by CONST_BITS and later shifts right by CONST_BITS-PASS1_BITS,
     * which nets the same factor.
     */
213 int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
214
215 wsptr[DCTSIZE*0] = dcval;
216 wsptr[DCTSIZE*1] = dcval;
217 wsptr[DCTSIZE*2] = dcval;
218 wsptr[DCTSIZE*3] = dcval;
219 wsptr[DCTSIZE*4] = dcval;
220 wsptr[DCTSIZE*5] = dcval;
221 wsptr[DCTSIZE*6] = dcval;
222 wsptr[DCTSIZE*7] = dcval;
223
224 inptr++; /* advance pointers to next column */
225 quantptr++;
226 wsptr++;
227 continue;
228 }
229
230 /* Even part: reverse the even part of the forward DCT.
231 * The rotator is c(-6).
232 */
233
234 z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
235 z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
236 z2 <<= CONST_BITS;
237 z3 <<= CONST_BITS;
238 /* Add fudge factor here for final descale. */
    /* The bias is half of 1 << (CONST_BITS-PASS1_BITS), the divisor of the
     * RIGHT_SHIFT in the output stage (assuming ONE is the INT32 constant 1,
     * which is defined outside this file). It enters both tmp0 and tmp1 and
     * therefore reaches each of the eight outputs exactly once.
     */
239 z2 += ONE << (CONST_BITS-PASS1_BITS-1);
240
241 tmp0 = z2 + z3;
242 tmp1 = z2 - z3;
243
244 z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
245 z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
246
247 z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */
248 tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
249 tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
250
251 tmp10 = tmp0 + tmp2;
252 tmp13 = tmp0 - tmp2;
253 tmp11 = tmp1 + tmp3;
254 tmp12 = tmp1 - tmp3;
255
256 /* Odd part per figure 8; the matrix is unitary and hence its
257 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
258 */
259
    /* tmp0..tmp3 are reused from here on: they now hold the dequantized odd
     * coefficients, the even-part results live in tmp10..tmp13.
     */
260 tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
261 tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
262 tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
263 tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
264
265 z2 = tmp0 + tmp2;
266 z3 = tmp1 + tmp3;
267
268 z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* c3 */
269 z2 = MULTIPLY(z2, - FIX_1_961570560); /* -c3-c5 */
270 z3 = MULTIPLY(z3, - FIX_0_390180644); /* -c3+c5 */
271 z2 += z1;
272 z3 += z1;
273
274 z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* -c3+c7 */
275 tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* -c1+c3+c5-c7 */
276 tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* c1+c3-c5-c7 */
277 tmp0 += z1 + z2;
278 tmp3 += z1 + z3;
279
280 z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* -c1-c3 */
281 tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* c1+c3-c5+c7 */
282 tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* c1+c3+c5-c7 */
283 tmp1 += z1 + z3;
284 tmp2 += z1 + z2;
285
286 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
287
    /* Outputs k and 7-k are the sum and difference of one even-part term
     * and one odd-part term.
     */
288 wsptr[DCTSIZE*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
289 wsptr[DCTSIZE*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
290 wsptr[DCTSIZE*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
291 wsptr[DCTSIZE*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
292 wsptr[DCTSIZE*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
293 wsptr[DCTSIZE*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
294 wsptr[DCTSIZE*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
295 wsptr[DCTSIZE*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
296
297 inptr++; /* advance pointers to next column */
298 quantptr++;
299 wsptr++;
300 }
301
302 /* Pass 2: process rows from work array, store into output array.
303 * Note that we must descale the results by a factor of 8 == 2**3,
304 * and also undo the PASS1_BITS scaling.
305 */
306
307 wsptr = workspace;
308 for (ctr = 0; ctr < DCTSIZE; ctr++) {
309 outptr = output_buf[ctr] + output_col;
310
311 /* Add range center and fudge factor for final descale and range-limit. */
    /* z2 is scaled by CONST_BITS further down, so ONE << (PASS1_BITS+2)
     * ends up as half of the final divisor 1 << (CONST_BITS+PASS1_BITS+3).
     */
312 z2 = (INT32) wsptr[0] +
313 ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
314 (ONE << (PASS1_BITS+2)));
315
316 /* Rows of zeroes can be exploited in the same way as we did with columns.
317 * However, the column calculation has created many nonzero AC terms, so
318 * the simplification applies less often (typically 5% to 10% of the time).
319 * On machines with very fast multiplication, it's possible that the
320 * test takes more time than it's worth. In that case this section
321 * may be commented out.
322 */
323
324 #ifndef NO_ZERO_ROW_TEST
325 if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
326 wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
327 /* AC terms all zero */
    /* z2 has not been scaled by CONST_BITS on this path, hence the shorter
     * shift of PASS1_BITS+3.
     */
328 JSAMPLE dcval = range_limit[(int) RIGHT_SHIFT(z2, PASS1_BITS+3)
329 & RANGE_MASK];
330
331 outptr[0] = dcval;
332 outptr[1] = dcval;
333 outptr[2] = dcval;
334 outptr[3] = dcval;
335 outptr[4] = dcval;
336 outptr[5] = dcval;
337 outptr[6] = dcval;
338 outptr[7] = dcval;
339
340 wsptr += DCTSIZE; /* advance pointer to next row */
341 continue;
342 }
343 #endif
344
345 /* Even part: reverse the even part of the forward DCT.
346 * The rotator is c(-6).
347 */
348
349 z3 = (INT32) wsptr[4];
350
351 tmp0 = (z2 + z3) << CONST_BITS;
352 tmp1 = (z2 - z3) << CONST_BITS;
353
354 z2 = (INT32) wsptr[2];
355 z3 = (INT32) wsptr[6];
356
357 z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */
358 tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
359 tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
360
361 tmp10 = tmp0 + tmp2;
362 tmp13 = tmp0 - tmp2;
363 tmp11 = tmp1 + tmp3;
364 tmp12 = tmp1 - tmp3;
365
366 /* Odd part per figure 8; the matrix is unitary and hence its
367 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
368 */
369
370 tmp0 = (INT32) wsptr[7];
371 tmp1 = (INT32) wsptr[5];
372 tmp2 = (INT32) wsptr[3];
373 tmp3 = (INT32) wsptr[1];
374
375 z2 = tmp0 + tmp2;
376 z3 = tmp1 + tmp3;
377
378 z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* c3 */
379 z2 = MULTIPLY(z2, - FIX_1_961570560); /* -c3-c5 */
380 z3 = MULTIPLY(z3, - FIX_0_390180644); /* -c3+c5 */
381 z2 += z1;
382 z3 += z1;
383
384 z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* -c3+c7 */
385 tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* -c1+c3+c5-c7 */
386 tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* c1+c3-c5-c7 */
387 tmp0 += z1 + z2;
388 tmp3 += z1 + z3;
389
390 z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* -c1-c3 */
391 tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* c1+c3-c5+c7 */
392 tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* c1+c3+c5-c7 */
393 tmp1 += z1 + z3;
394 tmp2 += z1 + z2;
395
396 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
397
    /* Each result is descaled, masked with RANGE_MASK and used as an index
     * into range_limit[] (presumably a clamping table - its contents are
     * built elsewhere, verify against IDCT_range_limit).
     */
398 outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
399 CONST_BITS+PASS1_BITS+3)
400 & RANGE_MASK];
401 outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
402 CONST_BITS+PASS1_BITS+3)
403 & RANGE_MASK];
404 outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
405 CONST_BITS+PASS1_BITS+3)
406 & RANGE_MASK];
407 outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
408 CONST_BITS+PASS1_BITS+3)
409 & RANGE_MASK];
410 outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
411 CONST_BITS+PASS1_BITS+3)
412 & RANGE_MASK];
413 outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
414 CONST_BITS+PASS1_BITS+3)
415 & RANGE_MASK];
416 outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
417 CONST_BITS+PASS1_BITS+3)
418 & RANGE_MASK];
419 outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
420 CONST_BITS+PASS1_BITS+3)
421 & RANGE_MASK];
422
423 wsptr += DCTSIZE; /* advance pointer to next row */
424 }
425 }
426
427 #ifdef IDCT_SCALING_SUPPORTED
428
429
430 /*
431 * Perform dequantization and inverse DCT on one block of coefficients,
432 * producing a reduced-size 7x7 output block.
433 *
434 * Optimized algorithm with 12 multiplications in the 1-D kernel.
435 * cK represents sqrt(2) * cos(K*pi/14).
    *
    * Parameters:
    *   cinfo      - used only to fetch the range_limit lookup table through
    *                IDCT_range_limit().
    *   compptr    - supplies dct_table, read as ISLOW_MULT_TYPE multipliers.
    *   coef_block - input coefficients; only rows 0..6 and columns 0..6
    *                (index DCTSIZE*r + c) are read.
    *   output_buf - array of row pointers; rows 0..6 are written.
    *   output_col - offset added to every row pointer; 7 samples are stored
    *                starting there.
    *
    * Unlike jpeg_idct_islow there is no all-zero shortcut: every column and
    * every row goes through the full 7-point kernel.
    *
    * NOTE(review): "<<" is applied to values that can be negative if a
    * dequantized coefficient is negative; ISO C leaves that undefined, so
    * confirm the supported compilers treat it as a two's-complement multiply.
436 */
437
438 GLOBAL(void)
439 jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
440 JCOEFPTR coef_block,
441 JSAMPARRAY output_buf, JDIMENSION output_col)
442 {
443 INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
444 INT32 z1, z2, z3;
445 JCOEFPTR inptr;
446 ISLOW_MULT_TYPE * quantptr;
447 int * wsptr;
448 JSAMPROW outptr;
449 JSAMPLE *range_limit = IDCT_range_limit(cinfo);
450 int ctr;
451 int workspace[7*7]; /* buffers data between passes */
452 SHIFT_TEMPS
453
454 /* Pass 1: process columns from input, store into work array. */
455
456 inptr = coef_block;
457 quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
458 wsptr = workspace;
459 for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
460 /* Even part */
461
462 tmp13 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
463 tmp13 <<= CONST_BITS;
464 /* Add fudge factor here for final descale. */
    /* Half of 1 << (CONST_BITS-PASS1_BITS); tmp13 feeds tmp10..tmp13, so the
     * bias reaches every output once.
     */
465 tmp13 += ONE << (CONST_BITS-PASS1_BITS-1);
466
467 z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
468 z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
469 z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
470
471 tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734)); /* c4 */
472 tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123)); /* c6 */
473 tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
474 tmp0 = z1 + z3;
    /* z2 becomes z2 - z1 - z3; it is consumed only by the c0 term below. */
475 z2 -= tmp0;
476 tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
477 tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536)); /* c2-c4-c6 */
478 tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249)); /* c2+c4+c6 */
479 tmp13 += MULTIPLY(z2, FIX(1.414213562)); /* c0 */
480
481 /* Odd part */
482
483 z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
484 z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
485 z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
486
487 tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347)); /* (c3+c1-c5)/2 */
488 tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339)); /* (c3+c5-c1)/2 */
489 tmp0 = tmp1 - tmp2;
490 tmp1 += tmp2;
491 tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276)); /* -c1 */
492 tmp1 += tmp2;
493 z2 = MULTIPLY(z1 + z3, FIX(0.613604268)); /* c5 */
494 tmp0 += z2;
495 tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693)); /* c3+c1-c5 */
496
497 /* Final output stage */
498
    /* Rows k and 6-k pair an even term with an odd term; the middle row 3
     * has no odd contribution and is tmp13 alone.
     */
499 wsptr[7*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
500 wsptr[7*6] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
501 wsptr[7*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
502 wsptr[7*5] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
503 wsptr[7*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
504 wsptr[7*4] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
505 wsptr[7*3] = (int) RIGHT_SHIFT(tmp13, CONST_BITS-PASS1_BITS);
506 }
507
508 /* Pass 2: process 7 rows from work array, store into output array. */
509
510 wsptr = workspace;
511 for (ctr = 0; ctr < 7; ctr++) {
512 outptr = output_buf[ctr] + output_col;
513
514 /* Even part */
515
516 /* Add range center and fudge factor for final descale and range-limit. */
    /* After the shift by CONST_BITS below, ONE << (PASS1_BITS+2) is half of
     * the final divisor 1 << (CONST_BITS+PASS1_BITS+3).
     */
517 tmp13 = (INT32) wsptr[0] +
518 ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
519 (ONE << (PASS1_BITS+2)));
520 tmp13 <<= CONST_BITS;
521
522 z1 = (INT32) wsptr[2];
523 z2 = (INT32) wsptr[4];
524 z3 = (INT32) wsptr[6];
525
526 tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734)); /* c4 */
527 tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123)); /* c6 */
528 tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
529 tmp0 = z1 + z3;
530 z2 -= tmp0;
531 tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
532 tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536)); /* c2-c4-c6 */
533 tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249)); /* c2+c4+c6 */
534 tmp13 += MULTIPLY(z2, FIX(1.414213562)); /* c0 */
535
536 /* Odd part */
537
538 z1 = (INT32) wsptr[1];
539 z2 = (INT32) wsptr[3];
540 z3 = (INT32) wsptr[5];
541
542 tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347)); /* (c3+c1-c5)/2 */
543 tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339)); /* (c3+c5-c1)/2 */
544 tmp0 = tmp1 - tmp2;
545 tmp1 += tmp2;
546 tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276)); /* -c1 */
547 tmp1 += tmp2;
548 z2 = MULTIPLY(z1 + z3, FIX(0.613604268)); /* c5 */
549 tmp0 += z2;
550 tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693)); /* c3+c1-c5 */
551
552 /* Final output stage */
553
    /* Descale, mask with RANGE_MASK, then look the sample up in
     * range_limit[] (table contents are defined elsewhere).
     */
554 outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
555 CONST_BITS+PASS1_BITS+3)
556 & RANGE_MASK];
557 outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
558 CONST_BITS+PASS1_BITS+3)
559 & RANGE_MASK];
560 outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
561 CONST_BITS+PASS1_BITS+3)
562 & RANGE_MASK];
563 outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
564 CONST_BITS+PASS1_BITS+3)
565 & RANGE_MASK];
566 outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
567 CONST_BITS+PASS1_BITS+3)
568 & RANGE_MASK];
569 outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
570 CONST_BITS+PASS1_BITS+3)
571 & RANGE_MASK];
572 outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13,
573 CONST_BITS+PASS1_BITS+3)
574 & RANGE_MASK];
575
576 wsptr += 7; /* advance pointer to next row */
577 }
578 }
579
580
581 /*
582 * Perform dequantization and inverse DCT on one block of coefficients,
583 * producing a reduced-size 6x6 output block.
584 *
585 * Optimized algorithm with 3 multiplications in the 1-D kernel.
586 * cK represents sqrt(2) * cos(K*pi/12).
    *
    * Parameters:
    *   cinfo      - used only to fetch the range_limit lookup table through
    *                IDCT_range_limit().
    *   compptr    - supplies dct_table, read as ISLOW_MULT_TYPE multipliers.
    *   coef_block - input coefficients; only rows 0..5 and columns 0..5
    *                (index DCTSIZE*r + c) are read.
    *   output_buf - array of row pointers; rows 0..5 are written.
    *   output_col - offset added to every row pointer; 6 samples are stored
    *                starting there.
    *
    * NOTE(review): "<<" is applied to values that can be negative if a
    * dequantized coefficient is negative; ISO C leaves that undefined, so
    * confirm the supported compilers treat it as a two's-complement multiply.
587 */
588
589 GLOBAL(void)
590 jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
591 JCOEFPTR coef_block,
592 JSAMPARRAY output_buf, JDIMENSION output_col)
593 {
594 INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
595 INT32 z1, z2, z3;
596 JCOEFPTR inptr;
597 ISLOW_MULT_TYPE * quantptr;
598 int * wsptr;
599 JSAMPROW outptr;
600 JSAMPLE *range_limit = IDCT_range_limit(cinfo);
601 int ctr;
602 int workspace[6*6]; /* buffers data between passes */
603 SHIFT_TEMPS
604
605 /* Pass 1: process columns from input, store into work array. */
606
607 inptr = coef_block;
608 quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
609 wsptr = workspace;
610 for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
611 /* Even part */
612
613 tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
614 tmp0 <<= CONST_BITS;
615 /* Add fudge factor here for final descale. */
616 tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
617 tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
618 tmp10 = MULTIPLY(tmp2, FIX(0.707106781)); /* c4 */
619 tmp1 = tmp0 + tmp10;
    /* tmp11 is descaled immediately: its odd-part partner (tmp1, computed
     * below) carries only PASS1_BITS of scaling, so rows 1 and 4 are formed
     * without a further shift.
     */
620 tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
621 tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
622 tmp0 = MULTIPLY(tmp10, FIX(1.224744871)); /* c2 */
623 tmp10 = tmp1 + tmp0;
624 tmp12 = tmp1 - tmp0;
625
626 /* Odd part */
627
628 z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
629 z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
630 z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
631 tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
632 tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
633 tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
    /* No multiplication is involved in this term, so it is scaled straight
     * to the pass-1 output precision.
     */
634 tmp1 = (z1 - z2 - z3) << PASS1_BITS;
635
636 /* Final output stage */
637
638 wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
639 wsptr[6*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
640 wsptr[6*1] = (int) (tmp11 + tmp1);
641 wsptr[6*4] = (int) (tmp11 - tmp1);
642 wsptr[6*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
643 wsptr[6*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
644 }
645
646 /* Pass 2: process 6 rows from work array, store into output array. */
647
648 wsptr = workspace;
649 for (ctr = 0; ctr < 6; ctr++) {
650 outptr = output_buf[ctr] + output_col;
651
652 /* Even part */
653
654 /* Add range center and fudge factor for final descale and range-limit. */
    /* After the shift by CONST_BITS below, ONE << (PASS1_BITS+2) is half of
     * the final divisor 1 << (CONST_BITS+PASS1_BITS+3).
     */
655 tmp0 = (INT32) wsptr[0] +
656 ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
657 (ONE << (PASS1_BITS+2)));
658 tmp0 <<= CONST_BITS;
659 tmp2 = (INT32) wsptr[4];
660 tmp10 = MULTIPLY(tmp2, FIX(0.707106781)); /* c4 */
661 tmp1 = tmp0 + tmp10;
    /* Unlike pass 1, tmp11 stays at full CONST_BITS precision here and is
     * descaled together with the other outputs.
     */
662 tmp11 = tmp0 - tmp10 - tmp10;
663 tmp10 = (INT32) wsptr[2];
664 tmp0 = MULTIPLY(tmp10, FIX(1.224744871)); /* c2 */
665 tmp10 = tmp1 + tmp0;
666 tmp12 = tmp1 - tmp0;
667
668 /* Odd part */
669
670 z1 = (INT32) wsptr[1];
671 z2 = (INT32) wsptr[3];
672 z3 = (INT32) wsptr[5];
673 tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
674 tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
675 tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
676 tmp1 = (z1 - z2 - z3) << CONST_BITS;
677
678 /* Final output stage */
679
    /* Descale, mask with RANGE_MASK, then look the sample up in
     * range_limit[] (table contents are defined elsewhere).
     */
680 outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
681 CONST_BITS+PASS1_BITS+3)
682 & RANGE_MASK];
683 outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
684 CONST_BITS+PASS1_BITS+3)
685 & RANGE_MASK];
686 outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
687 CONST_BITS+PASS1_BITS+3)
688 & RANGE_MASK];
689 outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
690 CONST_BITS+PASS1_BITS+3)
691 & RANGE_MASK];
692 outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
693 CONST_BITS+PASS1_BITS+3)
694 & RANGE_MASK];
695 outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
696 CONST_BITS+PASS1_BITS+3)
697 & RANGE_MASK];
698
699 wsptr += 6; /* advance pointer to next row */
700 }
701 }
702
703
704 /*
705 * Perform dequantization and inverse DCT on one block of coefficients,
706 * producing a reduced-size 5x5 output block.
707 *
708 * Optimized algorithm with 5 multiplications in the 1-D kernel.
709 * cK represents sqrt(2) * cos(K*pi/10).
    *
    * Parameters:
    *   cinfo      - used only to fetch the range_limit lookup table through
    *                IDCT_range_limit().
    *   compptr    - supplies dct_table, read as ISLOW_MULT_TYPE multipliers.
    *   coef_block - input coefficients; only rows 0..4 and columns 0..4
    *                (index DCTSIZE*r + c) are read.
    *   output_buf - array of row pointers; rows 0..4 are written.
    *   output_col - offset added to every row pointer; 5 samples are stored
    *                starting there.
    *
    * NOTE(review): "<<" is applied to values that can be negative if a
    * dequantized coefficient is negative; ISO C leaves that undefined, so
    * confirm the supported compilers treat it as a two's-complement multiply.
710 */
711
712 GLOBAL(void)
713 jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
714 JCOEFPTR coef_block,
715 JSAMPARRAY output_buf, JDIMENSION output_col)
716 {
717 INT32 tmp0, tmp1, tmp10, tmp11, tmp12;
718 INT32 z1, z2, z3;
719 JCOEFPTR inptr;
720 ISLOW_MULT_TYPE * quantptr;
721 int * wsptr;
722 JSAMPROW outptr;
723 JSAMPLE *range_limit = IDCT_range_limit(cinfo);
724 int ctr;
725 int workspace[5*5]; /* buffers data between passes */
726 SHIFT_TEMPS
727
728 /* Pass 1: process columns from input, store into work array. */
729
730 inptr = coef_block;
731 quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
732 wsptr = workspace;
733 for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
734 /* Even part */
735
736 tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
737 tmp12 <<= CONST_BITS;
738 /* Add fudge factor here for final descale. */
739 tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
740 tmp0 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
741 tmp1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
742 z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
743 z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
744 z3 = tmp12 + z2;
745 tmp10 = z3 + z1;
746 tmp11 = z3 - z1;
    /* z3 already captured tmp12 + z2 above; the centre term now becomes
     * the DC value minus four times z2.
     */
747 tmp12 -= z2 << 2;
748
749 /* Odd part */
750
751 z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
752 z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
753
754 z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c3 */
755 tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c1-c3 */
756 tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c1+c3 */
757
758 /* Final output stage */
759
    /* Rows k and 4-k pair an even term with an odd term; the middle row 2
     * has no odd contribution and is tmp12 alone.
     */
760 wsptr[5*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
761 wsptr[5*4] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
762 wsptr[5*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
763 wsptr[5*3] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
764 wsptr[5*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
765 }
766
767 /* Pass 2: process 5 rows from work array, store into output array. */
768
769 wsptr = workspace;
770 for (ctr = 0; ctr < 5; ctr++) {
771 outptr = output_buf[ctr] + output_col;
772
773 /* Even part */
774
775 /* Add range center and fudge factor for final descale and range-limit. */
    /* After the shift by CONST_BITS below, ONE << (PASS1_BITS+2) is half of
     * the final divisor 1 << (CONST_BITS+PASS1_BITS+3).
     */
776 tmp12 = (INT32) wsptr[0] +
777 ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
778 (ONE << (PASS1_BITS+2)));
779 tmp12 <<= CONST_BITS;
780 tmp0 = (INT32) wsptr[2];
781 tmp1 = (INT32) wsptr[4];
782 z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
783 z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
784 z3 = tmp12 + z2;
785 tmp10 = z3 + z1;
786 tmp11 = z3 - z1;
787 tmp12 -= z2 << 2;
788
789 /* Odd part */
790
791 z2 = (INT32) wsptr[1];
792 z3 = (INT32) wsptr[3];
793
794 z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c3 */
795 tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c1-c3 */
796 tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c1+c3 */
797
798 /* Final output stage */
799
    /* Descale, mask with RANGE_MASK, then look the sample up in
     * range_limit[] (table contents are defined elsewhere).
     */
800 outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
801 CONST_BITS+PASS1_BITS+3)
802 & RANGE_MASK];
803 outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
804 CONST_BITS+PASS1_BITS+3)
805 & RANGE_MASK];
806 outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
807 CONST_BITS+PASS1_BITS+3)
808 & RANGE_MASK];
809 outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
810 CONST_BITS+PASS1_BITS+3)
811 & RANGE_MASK];
812 outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
813 CONST_BITS+PASS1_BITS+3)
814 & RANGE_MASK];
815
816 wsptr += 5; /* advance pointer to next row */
817 }
818 }
819
820
821 /*
822 * Perform dequantization and inverse DCT on one block of coefficients,
823 * producing a reduced-size 4x4 output block.
824 *
825 * Optimized algorithm with 3 multiplications in the 1-D kernel.
826 * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
    *
    * Parameters:
    *   cinfo      - used only to fetch the range_limit lookup table through
    *                IDCT_range_limit().
    *   compptr    - supplies dct_table, read as ISLOW_MULT_TYPE multipliers.
    *   coef_block - input coefficients; only rows 0..3 and columns 0..3
    *                (index DCTSIZE*r + c) are read.
    *   output_buf - array of row pointers; rows 0..3 are written.
    *   output_col - offset added to every row pointer; 4 samples are stored
    *                starting there.
    *
    * The odd part reuses the pre-computed FIX_0_541196100, FIX_0_765366865
    * and FIX_1_847759065 constants, so no FIX() call appears in this routine.
    *
    * NOTE(review): "<<" is applied to values that can be negative if a
    * dequantized coefficient is negative; ISO C leaves that undefined, so
    * confirm the supported compilers treat it as a two's-complement multiply.
827 */
828
829 GLOBAL(void)
830 jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
831 JCOEFPTR coef_block,
832 JSAMPARRAY output_buf, JDIMENSION output_col)
833 {
834 INT32 tmp0, tmp2, tmp10, tmp12;
835 INT32 z1, z2, z3;
836 JCOEFPTR inptr;
837 ISLOW_MULT_TYPE * quantptr;
838 int * wsptr;
839 JSAMPROW outptr;
840 JSAMPLE *range_limit = IDCT_range_limit(cinfo);
841 int ctr;
842 int workspace[4*4]; /* buffers data between passes */
843 SHIFT_TEMPS
844
845 /* Pass 1: process columns from input, store into work array. */
846
847 inptr = coef_block;
848 quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
849 wsptr = workspace;
850 for (ctr = 0; ctr < 4; ctr++, inptr++, quantptr++, wsptr++) {
851 /* Even part */
852
853 tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
854 tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
855
    /* The even part needs no multiplication, so it is scaled straight to
     * the pass-1 output precision (PASS1_BITS) instead of CONST_BITS.
     */
856 tmp10 = (tmp0 + tmp2) << PASS1_BITS;
857 tmp12 = (tmp0 - tmp2) << PASS1_BITS;
858
859 /* Odd part */
860 /* Same rotation as in the even part of the 8x8 LL&M IDCT */
861
862 z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
863 z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
864
865 z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */
866 /* Add fudge factor here for final descale. */
    /* z1 feeds both RIGHT_SHIFT calls below, so each odd term receives the
     * rounding bias once before being descaled to PASS1_BITS precision.
     */
867 z1 += ONE << (CONST_BITS-PASS1_BITS-1);
868 tmp0 = RIGHT_SHIFT(z1 + MULTIPLY(z2, FIX_0_765366865), /* c2-c6 */
869 CONST_BITS-PASS1_BITS);
870 tmp2 = RIGHT_SHIFT(z1 - MULTIPLY(z3, FIX_1_847759065), /* c2+c6 */
871 CONST_BITS-PASS1_BITS);
872
873 /* Final output stage */
874
    /* Both halves are already at PASS1_BITS precision: plain add/subtract. */
875 wsptr[4*0] = (int) (tmp10 + tmp0);
876 wsptr[4*3] = (int) (tmp10 - tmp0);
877 wsptr[4*1] = (int) (tmp12 + tmp2);
878 wsptr[4*2] = (int) (tmp12 - tmp2);
879 }
880
881 /* Pass 2: process 4 rows from work array, store into output array. */
882
883 wsptr = workspace;
884 for (ctr = 0; ctr < 4; ctr++) {
885 outptr = output_buf[ctr] + output_col;
886
887 /* Even part */
888
889 /* Add range center and fudge factor for final descale and range-limit. */
    /* tmp0 is scaled by CONST_BITS just below, which turns
     * ONE << (PASS1_BITS+2) into half of the final divisor
     * 1 << (CONST_BITS+PASS1_BITS+3).
     */
890 tmp0 = (INT32) wsptr[0] +
891 ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
892 (ONE << (PASS1_BITS+2)));
893 tmp2 = (INT32) wsptr[2];
894
895 tmp10 = (tmp0 + tmp2) << CONST_BITS;
896 tmp12 = (tmp0 - tmp2) << CONST_BITS;
897
898 /* Odd part */
899 /* Same rotation as in the even part of the 8x8 LL&M IDCT */
900
901 z2 = (INT32) wsptr[1];
902 z3 = (INT32) wsptr[3];
903
904 z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */
905 tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
906 tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
907
908 /* Final output stage */
909
    /* Descale, mask with RANGE_MASK, then look the sample up in
     * range_limit[] (table contents are defined elsewhere).
     */
910 outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
911 CONST_BITS+PASS1_BITS+3)
912 & RANGE_MASK];
913 outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
914 CONST_BITS+PASS1_BITS+3)
915 & RANGE_MASK];
916 outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
917 CONST_BITS+PASS1_BITS+3)
918 & RANGE_MASK];
919 outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
920 CONST_BITS+PASS1_BITS+3)
921 & RANGE_MASK];
922
923 wsptr += 4; /* advance pointer to next row */
924 }
925 }
926
927
928 /*
929 * Perform dequantization and inverse DCT on one block of coefficients,
930 * producing a reduced-size 3x3 output block.
931 *
932 * Optimized algorithm with 2 multiplications in the 1-D kernel.
933 * cK represents sqrt(2) * cos(K*pi/6).
934 */
935
936 GLOBAL(void)
937 jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
938 JCOEFPTR coef_block,
939 JSAMPARRAY output_buf, JDIMENSION output_col)
940 {
941 INT32 tmp0, tmp2, tmp10, tmp12;
942 JCOEFPTR inptr;
943 ISLOW_MULT_TYPE * quantptr;
944 int * wsptr;
945 JSAMPROW outptr;
946 JSAMPLE *range_limit = IDCT_range_limit(cinfo);
947 int ctr;
948 int workspace[3*3]; /* buffers data between passes */
949 SHIFT_TEMPS
950
951 /* Pass 1: process columns from input, store into work array. */
952
953 inptr = coef_block;
954 quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
955 wsptr = workspace;
956 for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
957 /* Even part */
958
959 tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
960 tmp0 <<= CONST_BITS;
961 /* Add fudge factor here for final descale. */
962 tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
963 tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
964 tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
965 tmp10 = tmp0 + tmp12;
966 tmp2 = tmp0 - tmp12 - tmp12;
967
968 /* Odd part */
969
970 tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
971 tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
972
973 /* Final output stage */
974
975 wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
976 wsptr[3*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
977 wsptr[3*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
978 }
979
980 /* Pass 2: process 3 rows from work array, store into output array. */
981
982 wsptr = workspace;
983 for (ctr = 0; ctr < 3; ctr++) {
984 outptr = output_buf[ctr] + output_col;
985
986 /* Even part */
987
988 /* Add range center and fudge factor for final descale and range-limit. */
989 tmp0 = (INT32) wsptr[0] +
990 ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
991 (ONE << (PASS1_BITS+2)));
992 tmp0 <<= CONST_BITS;
993 tmp2 = (INT32) wsptr[2];
994 tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
995 tmp10 = tmp0 + tmp12;
996 tmp2 = tmp0 - tmp12 - tmp12;
997
998 /* Odd part */
999
1000 tmp12 = (INT32) wsptr[1];
1001 tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
1002
1003 /* Final output stage */
1004
1005 outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
1006 CONST_BITS+PASS1_BITS+3)
1007 & RANGE_MASK];
1008 outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
1009 CONST_BITS+PASS1_BITS+3)
1010 & RANGE_MASK];
1011 outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
1012 CONST_BITS+PASS1_BITS+3)
1013 & RANGE_MASK];
1014
1015 wsptr += 3; /* advance pointer to next row */
1016 }
1017 }
1018
1019
1020 /*
1021 * Perform dequantization and inverse DCT on one block of coefficients,
1022 * producing a reduced-size 2x2 output block.
1023 *
1024 * Multiplication-less algorithm.
1025 */
1026
1027 GLOBAL(void)
1028 jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1029 JCOEFPTR coef_block,
1030 JSAMPARRAY output_buf, JDIMENSION output_col)
1031 {
1032 DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1033 ISLOW_MULT_TYPE * quantptr;
1034 JSAMPROW outptr;
1035 JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1036 ISHIFT_TEMPS
1037
1038 /* Pass 1: process columns from input. */
1039
1040 quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1041
1042 /* Column 0 */
1043 tmp4 = DEQUANTIZE(coef_block[DCTSIZE*0], quantptr[DCTSIZE*0]);
1044 tmp5 = DEQUANTIZE(coef_block[DCTSIZE*1], quantptr[DCTSIZE*1]);
1045 /* Add range center and fudge factor for final descale and range-limit. */
1046 tmp4 += (((DCTELEM) RANGE_CENTER) << 3) + (1 << 2);
1047
1048 tmp0 = tmp4 + tmp5;
1049 tmp2 = tmp4 - tmp5;
1050
1051 /* Column 1 */
1052 tmp4 = DEQUANTIZE(coef_block[DCTSIZE*0+1], quantptr[DCTSIZE*0+1]);
1053 tmp5 = DEQUANTIZE(coef_block[DCTSIZE*1+1], quantptr[DCTSIZE*1+1]);
1054
1055 tmp1 = tmp4 + tmp5;
1056 tmp3 = tmp4 - tmp5;
1057
1058 /* Pass 2: process 2 rows, store into output array. */
1059
1060 /* Row 0 */
1061 outptr = output_buf[0] + output_col;
1062
1063 outptr[0] = range_limit[(int) IRIGHT_SHIFT(tmp0 + tmp1, 3) & RANGE_MASK];
1064 outptr[1] = range_limit[(int) IRIGHT_SHIFT(tmp0 - tmp1, 3) & RANGE_MASK];
1065
1066 /* Row 1 */
1067 outptr = output_buf[1] + output_col;
1068
1069 outptr[0] = range_limit[(int) IRIGHT_SHIFT(tmp2 + tmp3, 3) & RANGE_MASK];
1070 outptr[1] = range_limit[(int) IRIGHT_SHIFT(tmp2 - tmp3, 3) & RANGE_MASK];
1071 }
1072
1073
1074 /*
1075 * Perform dequantization and inverse DCT on one block of coefficients,
1076 * producing a reduced-size 1x1 output block.
1077 *
1078 * We hardly need an inverse DCT routine for this: just take the
1079 * average pixel value, which is one-eighth of the DC coefficient.
1080 */
1081
1082 GLOBAL(void)
1083 jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1084 JCOEFPTR coef_block,
1085 JSAMPARRAY output_buf, JDIMENSION output_col)
1086 {
1087 DCTELEM dcval;
1088 ISLOW_MULT_TYPE * quantptr;
1089 JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1090 ISHIFT_TEMPS
1091
1092 /* 1x1 is trivial: just take the DC coefficient divided by 8. */
1093
1094 quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1095
1096 dcval = DEQUANTIZE(coef_block[0], quantptr[0]);
1097 /* Add range center and fudge factor for descale and range-limit. */
1098 dcval += (((DCTELEM) RANGE_CENTER) << 3) + (1 << 2);
1099
1100 output_buf[0][output_col] =
1101 range_limit[(int) IRIGHT_SHIFT(dcval, 3) & RANGE_MASK];
1102 }
1103
1104
1105 /*
1106 * Perform dequantization and inverse DCT on one block of coefficients,
1107 * producing a 9x9 output block.
1108 *
1109 * Optimized algorithm with 10 multiplications in the 1-D kernel.
1110 * cK represents sqrt(2) * cos(K*pi/18).
1111 */
1112
1113 GLOBAL(void)
1114 jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1115 JCOEFPTR coef_block,
1116 JSAMPARRAY output_buf, JDIMENSION output_col)
1117 {
1118 INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
1119 INT32 z1, z2, z3, z4;
1120 JCOEFPTR inptr;
1121 ISLOW_MULT_TYPE * quantptr;
1122 int * wsptr;
1123 JSAMPROW outptr;
1124 JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1125 int ctr;
1126 int workspace[8*9]; /* buffers data between passes */
1127 SHIFT_TEMPS
1128
1129 /* Pass 1: process columns from input, store into work array. */
1130
1131 inptr = coef_block;
1132 quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1133 wsptr = workspace;
1134 for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1135 /* Even part */
1136
1137 tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1138 tmp0 <<= CONST_BITS;
1139 /* Add fudge factor here for final descale. */
1140 tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
1141
1142 z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1143 z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1144 z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1145
1146 tmp3 = MULTIPLY(z3, FIX(0.707106781)); /* c6 */
1147 tmp1 = tmp0 + tmp3;
1148 tmp2 = tmp0 - tmp3 - tmp3;
1149
1150 tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
1151 tmp11 = tmp2 + tmp0;
1152 tmp14 = tmp2 - tmp0 - tmp0;
1153
1154 tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
1155 tmp2 = MULTIPLY(z1, FIX(1.083350441)); /* c4 */
1156 tmp3 = MULTIPLY(z2, FIX(0.245575608)); /* c8 */
1157
1158 tmp10 = tmp1 + tmp0 - tmp3;
1159 tmp12 = tmp1 - tmp0 + tmp2;
1160 tmp13 = tmp1 - tmp2 + tmp3;
1161
1162 /* Odd part */
1163
1164 z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1165 z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1166 z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1167 z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1168
1169 z2 = MULTIPLY(z2, - FIX(1.224744871)); /* -c3 */
1170
1171 tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955)); /* c5 */
1172 tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525)); /* c7 */
1173 tmp0 = tmp2 + tmp3 - z2;
1174 tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481)); /* c1 */
1175 tmp2 += z2 - tmp1;
1176 tmp3 += z2 + tmp1;
1177 tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
1178
1179 /* Final output stage */
1180
1181 wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
1182 wsptr[8*8] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
1183 wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
1184 wsptr[8*7] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
1185 wsptr[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
1186 wsptr[8*6] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
1187 wsptr[8*3] = (int) RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS-PASS1_BITS);
1188 wsptr[8*5] = (int) RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS-PASS1_BITS);
1189 wsptr[8*4] = (int) RIGHT_SHIFT(tmp14, CONST_BITS-PASS1_BITS);
1190 }
1191
1192 /* Pass 2: process 9 rows from work array, store into output array. */
1193
1194 wsptr = workspace;
1195 for (ctr = 0; ctr < 9; ctr++) {
1196 outptr = output_buf[ctr] + output_col;
1197
1198 /* Even part */
1199
1200 /* Add range center and fudge factor for final descale and range-limit. */
1201 tmp0 = (INT32) wsptr[0] +
1202 ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
1203 (ONE << (PASS1_BITS+2)));
1204 tmp0 <<= CONST_BITS;
1205
1206 z1 = (INT32) wsptr[2];
1207 z2 = (INT32) wsptr[4];
1208 z3 = (INT32) wsptr[6];
1209
1210 tmp3 = MULTIPLY(z3, FIX(0.707106781)); /* c6 */
1211 tmp1 = tmp0 + tmp3;
1212 tmp2 = tmp0 - tmp3 - tmp3;
1213
1214 tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
1215 tmp11 = tmp2 + tmp0;
1216 tmp14 = tmp2 - tmp0 - tmp0;
1217
1218 tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
1219 tmp2 = MULTIPLY(z1, FIX(1.083350441)); /* c4 */
1220 tmp3 = MULTIPLY(z2, FIX(0.245575608)); /* c8 */
1221
1222 tmp10 = tmp1 + tmp0 - tmp3;
1223 tmp12 = tmp1 - tmp0 + tmp2;
1224 tmp13 = tmp1 - tmp2 + tmp3;
1225
1226 /* Odd part */
1227
1228 z1 = (INT32) wsptr[1];
1229 z2 = (INT32) wsptr[3];
1230 z3 = (INT32) wsptr[5];
1231 z4 = (INT32) wsptr[7];
1232
1233 z2 = MULTIPLY(z2, - FIX(1.224744871)); /* -c3 */
1234
1235 tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955)); /* c5 */
1236 tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525)); /* c7 */
1237 tmp0 = tmp2 + tmp3 - z2;
1238 tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481)); /* c1 */
1239 tmp2 += z2 - tmp1;
1240 tmp3 += z2 + tmp1;
1241 tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
1242
1243 /* Final output stage */
1244
1245 outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
1246 CONST_BITS+PASS1_BITS+3)
1247 & RANGE_MASK];
1248 outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
1249 CONST_BITS+PASS1_BITS+3)
1250 & RANGE_MASK];
1251 outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
1252 CONST_BITS+PASS1_BITS+3)
1253 & RANGE_MASK];
1254 outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
1255 CONST_BITS+PASS1_BITS+3)
1256 & RANGE_MASK];
1257 outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
1258 CONST_BITS+PASS1_BITS+3)
1259 & RANGE_MASK];
1260 outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
1261 CONST_BITS+PASS1_BITS+3)
1262 & RANGE_MASK];
1263 outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp3,
1264 CONST_BITS+PASS1_BITS+3)
1265 & RANGE_MASK];
1266 outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp3,
1267 CONST_BITS+PASS1_BITS+3)
1268 & RANGE_MASK];
1269 outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp14,
1270 CONST_BITS+PASS1_BITS+3)
1271 & RANGE_MASK];
1272
1273 wsptr += 8; /* advance pointer to next row */
1274 }
1275 }
1276
1277
1278 /*
1279 * Perform dequantization and inverse DCT on one block of coefficients,
1280 * producing a 10x10 output block.
1281 *
1282 * Optimized algorithm with 12 multiplications in the 1-D kernel.
1283 * cK represents sqrt(2) * cos(K*pi/20).
1284 */
1285
1286 GLOBAL(void)
1287 jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1288 JCOEFPTR coef_block,
1289 JSAMPARRAY output_buf, JDIMENSION output_col)
1290 {
1291 INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1292 INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
1293 INT32 z1, z2, z3, z4, z5;
1294 JCOEFPTR inptr;
1295 ISLOW_MULT_TYPE * quantptr;
1296 int * wsptr;
1297 JSAMPROW outptr;
1298 JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1299 int ctr;
1300 int workspace[8*10]; /* buffers data between passes */
1301 SHIFT_TEMPS
1302
1303 /* Pass 1: process columns from input, store into work array. */
1304
1305 inptr = coef_block;
1306 quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1307 wsptr = workspace;
1308 for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1309 /* Even part */
1310
1311 z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1312 z3 <<= CONST_BITS;
1313 /* Add fudge factor here for final descale. */
1314 z3 += ONE << (CONST_BITS-PASS1_BITS-1);
1315 z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1316 z1 = MULTIPLY(z4, FIX(1.144122806)); /* c4 */
1317 z2 = MULTIPLY(z4, FIX(0.437016024)); /* c8 */
1318 tmp10 = z3 + z1;
1319 tmp11 = z3 - z2;
1320
1321 tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1), /* c0 = (c4-c8)*2 */
1322 CONST_BITS-PASS1_BITS);
1323
1324 z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1325 z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1326
1327 z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c6 */
1328 tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
1329 tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
1330
1331 tmp20 = tmp10 + tmp12;
1332 tmp24 = tmp10 - tmp12;
1333 tmp21 = tmp11 + tmp13;
1334 tmp23 = tmp11 - tmp13;
1335
1336 /* Odd part */
1337
1338 z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1339 z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1340 z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1341 z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1342
1343 tmp11 = z2 + z4;
1344 tmp13 = z2 - z4;
1345
1346 tmp12 = MULTIPLY(tmp13, FIX(0.309016994)); /* (c3-c7)/2 */
1347 z5 = z3 << CONST_BITS;
1348
1349 z2 = MULTIPLY(tmp11, FIX(0.951056516)); /* (c3+c7)/2 */
1350 z4 = z5 + tmp12;
1351
1352 tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
1353 tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
1354
1355 z2 = MULTIPLY(tmp11, FIX(0.587785252)); /* (c1-c9)/2 */
1356 z4 = z5 - tmp12 - (tmp13 << (CONST_BITS - 1));
1357
1358 tmp12 = (z1 - tmp13 - z3) << PASS1_BITS;
1359
1360 tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
1361 tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
1362
1363 /* Final output stage */
1364
1365 wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
1366 wsptr[8*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
1367 wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
1368 wsptr[8*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
1369 wsptr[8*2] = (int) (tmp22 + tmp12);
1370 wsptr[8*7] = (int) (tmp22 - tmp12);
1371 wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
1372 wsptr[8*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
1373 wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
1374 wsptr[8*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
1375 }
1376
1377 /* Pass 2: process 10 rows from work array, store into output array. */
1378
1379 wsptr = workspace;
1380 for (ctr = 0; ctr < 10; ctr++) {
1381 outptr = output_buf[ctr] + output_col;
1382
1383 /* Even part */
1384
1385 /* Add range center and fudge factor for final descale and range-limit. */
1386 z3 = (INT32) wsptr[0] +
1387 ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
1388 (ONE << (PASS1_BITS+2)));
1389 z3 <<= CONST_BITS;
1390 z4 = (INT32) wsptr[4];
1391 z1 = MULTIPLY(z4, FIX(1.144122806)); /* c4 */
1392 z2 = MULTIPLY(z4, FIX(0.437016024)); /* c8 */
1393 tmp10 = z3 + z1;
1394 tmp11 = z3 - z2;
1395
1396 tmp22 = z3 - ((z1 - z2) << 1); /* c0 = (c4-c8)*2 */
1397
1398 z2 = (INT32) wsptr[2];
1399 z3 = (INT32) wsptr[6];
1400
1401 z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c6 */
1402 tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
1403 tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
1404
1405 tmp20 = tmp10 + tmp12;
1406 tmp24 = tmp10 - tmp12;
1407 tmp21 = tmp11 + tmp13;
1408 tmp23 = tmp11 - tmp13;
1409
1410 /* Odd part */
1411
1412 z1 = (INT32) wsptr[1];
1413 z2 = (INT32) wsptr[3];
1414 z3 = (INT32) wsptr[5];
1415 z3 <<= CONST_BITS;
1416 z4 = (INT32) wsptr[7];
1417
1418 tmp11 = z2 + z4;
1419 tmp13 = z2 - z4;
1420
1421 tmp12 = MULTIPLY(tmp13, FIX(0.309016994)); /* (c3-c7)/2 */
1422
1423 z2 = MULTIPLY(tmp11, FIX(0.951056516)); /* (c3+c7)/2 */
1424 z4 = z3 + tmp12;
1425
1426 tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
1427 tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
1428
1429 z2 = MULTIPLY(tmp11, FIX(0.587785252)); /* (c1-c9)/2 */
1430 z4 = z3 - tmp12 - (tmp13 << (CONST_BITS - 1));
1431
1432 tmp12 = ((z1 - tmp13) << CONST_BITS) - z3;
1433
1434 tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
1435 tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
1436
1437 /* Final output stage */
1438
1439 outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
1440 CONST_BITS+PASS1_BITS+3)
1441 & RANGE_MASK];
1442 outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
1443 CONST_BITS+PASS1_BITS+3)
1444 & RANGE_MASK];
1445 outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
1446 CONST_BITS+PASS1_BITS+3)
1447 & RANGE_MASK];
1448 outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
1449 CONST_BITS+PASS1_BITS+3)
1450 & RANGE_MASK];
1451 outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
1452 CONST_BITS+PASS1_BITS+3)
1453 & RANGE_MASK];
1454 outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
1455 CONST_BITS+PASS1_BITS+3)
1456 & RANGE_MASK];
1457 outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
1458 CONST_BITS+PASS1_BITS+3)
1459 & RANGE_MASK];
1460 outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
1461 CONST_BITS+PASS1_BITS+3)
1462 & RANGE_MASK];
1463 outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
1464 CONST_BITS+PASS1_BITS+3)
1465 & RANGE_MASK];
1466 outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
1467 CONST_BITS+PASS1_BITS+3)
1468 & RANGE_MASK];
1469
1470 wsptr += 8; /* advance pointer to next row */
1471 }
1472 }
1473
1474
1475 /*
1476 * Perform dequantization and inverse DCT on one block of coefficients,
1477 * producing a 11x11 output block.
1478 *
1479 * Optimized algorithm with 24 multiplications in the 1-D kernel.
1480 * cK represents sqrt(2) * cos(K*pi/22).
1481 */
1482
1483 GLOBAL(void)
1484 jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1485 JCOEFPTR coef_block,
1486 JSAMPARRAY output_buf, JDIMENSION output_col)
1487 {
1488 INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1489 INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
1490 INT32 z1, z2, z3, z4;
1491 JCOEFPTR inptr;
1492 ISLOW_MULT_TYPE * quantptr;
1493 int * wsptr;
1494 JSAMPROW outptr;
1495 JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1496 int ctr;
1497 int workspace[8*11]; /* buffers data between passes */
1498 SHIFT_TEMPS
1499
1500 /* Pass 1: process columns from input, store into work array. */
1501
1502 inptr = coef_block;
1503 quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1504 wsptr = workspace;
1505 for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1506 /* Even part */
1507
1508 tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1509 tmp10 <<= CONST_BITS;
1510 /* Add fudge factor here for final descale. */
1511 tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
1512
1513 z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1514 z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1515 z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1516
1517 tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132)); /* c2+c4 */
1518 tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045)); /* c2-c6 */
1519 z4 = z1 + z3;
1520 tmp24 = MULTIPLY(z4, - FIX(1.155664402)); /* -(c2-c10) */
1521 z4 -= z2;
1522 tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976)); /* c2 */
1523 tmp21 = tmp20 + tmp23 + tmp25 -
1524 MULTIPLY(z2, FIX(1.821790775)); /* c2+c4+c10-c6 */
1525 tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
1526 tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
1527 tmp24 += tmp25;
1528 tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120)); /* c8+c10 */
1529 tmp24 += MULTIPLY(z2, FIX(1.944413522)) - /* c2+c8 */
1530 MULTIPLY(z1, FIX(1.390975730)); /* c4+c10 */
1531 tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562)); /* c0 */
1532
1533 /* Odd part */
1534
1535 z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1536 z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1537 z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1538 z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1539
1540 tmp11 = z1 + z2;
1541 tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
1542 tmp11 = MULTIPLY(tmp11, FIX(0.887983902)); /* c3-c9 */
1543 tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295)); /* c5-c9 */
1544 tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
1545 tmp10 = tmp11 + tmp12 + tmp13 -
1546 MULTIPLY(z1, FIX(0.923107866)); /* c7+c5+c3-c1-2*c9 */
1547 z1 = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
1548 tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588)); /* c1+c7+3*c9-c3 */
1549 tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623)); /* c3+c5-c7-c9 */
1550 z1 = MULTIPLY(z2 + z4, - FIX(1.798248910)); /* -(c1+c9) */
1551 tmp11 += z1;
1552 tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632)); /* c1+c5+c9-c7 */
1553 tmp14 += MULTIPLY(z2, - FIX(1.467221301)) + /* -(c5+c9) */
1554 MULTIPLY(z3, FIX(1.001388905)) - /* c1-c9 */
1555 MULTIPLY(z4, FIX(1.684843907)); /* c3+c9 */
1556
1557 /* Final output stage */
1558
1559 wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
1560 wsptr[8*10] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
1561 wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
1562 wsptr[8*9] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
1563 wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
1564 wsptr[8*8] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
1565 wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
1566 wsptr[8*7] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
1567 wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
1568 wsptr[8*6] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
1569 wsptr[8*5] = (int) RIGHT_SHIFT(tmp25, CONST_BITS-PASS1_BITS);
1570 }
1571
1572 /* Pass 2: process 11 rows from work array, store into output array. */
1573
1574 wsptr = workspace;
1575 for (ctr = 0; ctr < 11; ctr++) {
1576 outptr = output_buf[ctr] + output_col;
1577
1578 /* Even part */
1579
1580 /* Add range center and fudge factor for final descale and range-limit. */
1581 tmp10 = (INT32) wsptr[0] +
1582 ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
1583 (ONE << (PASS1_BITS+2)));
1584 tmp10 <<= CONST_BITS;
1585
1586 z1 = (INT32) wsptr[2];
1587 z2 = (INT32) wsptr[4];
1588 z3 = (INT32) wsptr[6];
1589
1590 tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132)); /* c2+c4 */
1591 tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045)); /* c2-c6 */
1592 z4 = z1 + z3;
1593 tmp24 = MULTIPLY(z4, - FIX(1.155664402)); /* -(c2-c10) */
1594 z4 -= z2;
1595 tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976)); /* c2 */
1596 tmp21 = tmp20 + tmp23 + tmp25 -
1597 MULTIPLY(z2, FIX(1.821790775)); /* c2+c4+c10-c6 */
1598 tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
1599 tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
1600 tmp24 += tmp25;
1601 tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120)); /* c8+c10 */
1602 tmp24 += MULTIPLY(z2, FIX(1.944413522)) - /* c2+c8 */
1603 MULTIPLY(z1, FIX(1.390975730)); /* c4+c10 */
1604 tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562)); /* c0 */
1605
1606 /* Odd part */
1607
1608 z1 = (INT32) wsptr[1];
1609 z2 = (INT32) wsptr[3];
1610 z3 = (INT32) wsptr[5];
1611 z4 = (INT32) wsptr[7];
1612
1613 tmp11 = z1 + z2;
1614 tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
1615 tmp11 = MULTIPLY(tmp11, FIX(0.887983902)); /* c3-c9 */
1616 tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295)); /* c5-c9 */
1617 tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
1618 tmp10 = tmp11 + tmp12 + tmp13 -
1619 MULTIPLY(z1, FIX(0.923107866)); /* c7+c5+c3-c1-2*c9 */
1620 z1 = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
1621 tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588)); /* c1+c7+3*c9-c3 */
1622 tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623)); /* c3+c5-c7-c9 */
1623 z1 = MULTIPLY(z2 + z4, - FIX(1.798248910)); /* -(c1+c9) */
1624 tmp11 += z1;
1625 tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632)); /* c1+c5+c9-c7 */
1626 tmp14 += MULTIPLY(z2, - FIX(1.467221301)) + /* -(c5+c9) */
1627 MULTIPLY(z3, FIX(1.001388905)) - /* c1-c9 */
1628 MULTIPLY(z4, FIX(1.684843907)); /* c3+c9 */
1629
1630 /* Final output stage */
1631
1632 outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
1633 CONST_BITS+PASS1_BITS+3)
1634 & RANGE_MASK];
1635 outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
1636 CONST_BITS+PASS1_BITS+3)
1637 & RANGE_MASK];
1638 outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
1639 CONST_BITS+PASS1_BITS+3)
1640 & RANGE_MASK];
1641 outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
1642 CONST_BITS+PASS1_BITS+3)
1643 & RANGE_MASK];
1644 outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
1645 CONST_BITS+PASS1_BITS+3)
1646 & RANGE_MASK];
1647 outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
1648 CONST_BITS+PASS1_BITS+3)
1649 & RANGE_MASK];
1650 outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
1651 CONST_BITS+PASS1_BITS+3)
1652 & RANGE_MASK];
1653 outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
1654 CONST_BITS+PASS1_BITS+3)
1655 & RANGE_MASK];
1656 outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
1657 CONST_BITS+PASS1_BITS+3)
1658 & RANGE_MASK];
1659 outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
1660 CONST_BITS+PASS1_BITS+3)
1661 & RANGE_MASK];
1662 outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25,
1663 CONST_BITS+PASS1_BITS+3)
1664 & RANGE_MASK];
1665
1666 wsptr += 8; /* advance pointer to next row */
1667 }
1668 }
1669
1670
1671 /*
1672 * Perform dequantization and inverse DCT on one block of coefficients,
1673 * producing a 12x12 output block.
1674 *
1675 * Optimized algorithm with 15 multiplications in the 1-D kernel.
1676 * cK represents sqrt(2) * cos(K*pi/24).
1677 */
1678
1679 GLOBAL(void)
1680 jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1681 JCOEFPTR coef_block,
1682 JSAMPARRAY output_buf, JDIMENSION output_col)
1683 {
1684 INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1685 INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
1686 INT32 z1, z2, z3, z4;
1687 JCOEFPTR inptr;
1688 ISLOW_MULT_TYPE * quantptr;
1689 int * wsptr;
1690 JSAMPROW outptr;
1691 JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1692 int ctr;
1693 int workspace[8*12]; /* buffers data between passes */
1694 SHIFT_TEMPS
1695
1696 /* Pass 1: process columns from input, store into work array. */
1697
1698 inptr = coef_block;
1699 quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1700 wsptr = workspace;
1701 for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1702 /* Even part */
1703
1704 z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1705 z3 <<= CONST_BITS;
1706 /* Add fudge factor here for final descale. */
1707 z3 += ONE << (CONST_BITS-PASS1_BITS-1);
1708
1709 z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1710 z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
1711
1712 tmp10 = z3 + z4;
1713 tmp11 = z3 - z4;
1714
1715 z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1716 z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
1717 z1 <<= CONST_BITS;
1718 z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1719 z2 <<= CONST_BITS;
1720
1721 tmp12 = z1 - z2;
1722
1723 tmp21 = z3 + tmp12;
1724 tmp24 = z3 - tmp12;
1725
1726 tmp12 = z4 + z2;
1727
1728 tmp20 = tmp10 + tmp12;
1729 tmp25 = tmp10 - tmp12;
1730
1731 tmp12 = z4 - z1 - z2;
1732
1733 tmp22 = tmp11 + tmp12;
1734 tmp23 = tmp11 - tmp12;
1735
1736 /* Odd part */
1737
1738 z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1739 z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1740 z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1741 z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1742
1743 tmp11 = MULTIPLY(z2, FIX(1.306562965)); /* c3 */
1744 tmp14 = MULTIPLY(z2, - FIX_0_541196100); /* -c9 */
1745
1746 tmp10 = z1 + z3;
1747 tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669)); /* c7 */
1748 tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384)); /* c5-c7 */
1749 tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716)); /* c1-c5 */
1750 tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580)); /* -(c7+c11) */
1751 tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
1752 tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
1753 tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) - /* c7-c11 */
1754 MULTIPLY(z4, FIX(1.982889723)); /* c5+c7 */
1755
1756 z1 -= z4;
1757 z2 -= z3;
1758 z3 = MULTIPLY(z1 + z2, FIX_0_541196100); /* c9 */
1759 tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865); /* c3-c9 */
1760 tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065); /* c3+c9 */
1761
1762 /* Final output stage */
1763
1764 wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
1765 wsptr[8*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
1766 wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
1767 wsptr[8*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
1768 wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
1769 wsptr[8*9] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
1770 wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
1771 wsptr[8*8] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
1772 wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
1773 wsptr[8*7] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
1774 wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
1775 wsptr[8*6] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
1776 }
1777
1778 /* Pass 2: process 12 rows from work array, store into output array. */
1779
1780 wsptr = workspace;
1781 for (ctr = 0; ctr < 12; ctr++) {
1782 outptr = output_buf[ctr] + output_col;
1783
1784 /* Even part */
1785
1786 /* Add range center and fudge factor for final descale and range-limit. */
1787 z3 = (INT32) wsptr[0] +
1788 ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
1789 (ONE << (PASS1_BITS+2)));
1790 z3 <<= CONST_BITS;
1791
1792 z4 = (INT32) wsptr[4];
1793 z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
1794
1795 tmp10 = z3 + z4;
1796 tmp11 = z3 - z4;
1797
1798 z1 = (INT32) wsptr[2];
1799 z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
1800 z1 <<= CONST_BITS;
1801 z2 = (INT32) wsptr[6];
1802 z2 <<= CONST_BITS;
1803
1804 tmp12 = z1 - z2;
1805
1806 tmp21 = z3 + tmp12;
1807 tmp24 = z3 - tmp12;
1808
1809 tmp12 = z4 + z2;
1810
1811 tmp20 = tmp10 + tmp12;
1812 tmp25 = tmp10 - tmp12;
1813
1814 tmp12 = z4 - z1 - z2;
1815
1816 tmp22 = tmp11 + tmp12;
1817 tmp23 = tmp11 - tmp12;
1818
1819 /* Odd part */
1820
1821 z1 = (INT32) wsptr[1];
1822 z2 = (INT32) wsptr[3];
1823 z3 = (INT32) wsptr[5];
1824 z4 = (INT32) wsptr[7];
1825
1826 tmp11 = MULTIPLY(z2, FIX(1.306562965)); /* c3 */
1827 tmp14 = MULTIPLY(z2, - FIX_0_541196100); /* -c9 */
1828
1829 tmp10 = z1 + z3;
1830 tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669)); /* c7 */
1831 tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384)); /* c5-c7 */
1832 tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716)); /* c1-c5 */
1833 tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580)); /* -(c7+c11) */
1834 tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
1835 tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
1836 tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) - /* c7-c11 */
1837 MULTIPLY(z4, FIX(1.982889723)); /* c5+c7 */
1838
1839 z1 -= z4;
1840 z2 -= z3;
1841 z3 = MULTIPLY(z1 + z2, FIX_0_541196100); /* c9 */
1842 tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865); /* c3-c9 */
1843 tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065); /* c3+c9 */
1844
1845 /* Final output stage */
1846
1847 outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
1848 CONST_BITS+PASS1_BITS+3)
1849 & RANGE_MASK];
1850 outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
1851 CONST_BITS+PASS1_BITS+3)
1852 & RANGE_MASK];
1853 outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
1854 CONST_BITS+PASS1_BITS+3)
1855 & RANGE_MASK];
1856 outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
1857 CONST_BITS+PASS1_BITS+3)
1858 & RANGE_MASK];
1859 outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
1860 CONST_BITS+PASS1_BITS+3)
1861 & RANGE_MASK];
1862 outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
1863 CONST_BITS+PASS1_BITS+3)
1864 & RANGE_MASK];
1865 outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
1866 CONST_BITS+PASS1_BITS+3)
1867 & RANGE_MASK];
1868 outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
1869 CONST_BITS+PASS1_BITS+3)
1870 & RANGE_MASK];
1871 outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
1872 CONST_BITS+PASS1_BITS+3)
1873 & RANGE_MASK];
1874 outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
1875 CONST_BITS+PASS1_BITS+3)
1876 & RANGE_MASK];
1877 outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
1878 CONST_BITS+PASS1_BITS+3)
1879 & RANGE_MASK];
1880 outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
1881 CONST_BITS+PASS1_BITS+3)
1882 & RANGE_MASK];
1883
1884 wsptr += 8; /* advance pointer to next row */
1885 }
1886 }
1887
1888
1889 /*
1890 * Perform dequantization and inverse DCT on one block of coefficients,
1891 * producing a 13x13 output block.
1892 *
1893 * Optimized algorithm with 29 multiplications in the 1-D kernel.
1894 * cK represents sqrt(2) * cos(K*pi/26).
 *
 * cinfo is used only to obtain the range-limit table via IDCT_range_limit().
 * compptr->dct_table is cast to ISLOW_MULT_TYPE * and indexed in step with
 * coef_block as the second argument of DEQUANTIZE.
 * coef_block is read at offsets DCTSIZE*0 .. DCTSIZE*7 for each of 8 columns.
 * Row ctr of the result (ctr = 0..12) is written as 13 samples starting at
 * output_buf[ctr] + output_col.
 * The column count and the workspace stride are hard-coded to 8; presumably
 * DCTSIZE is 8 as well (its definition is not visible here).
1895 */
1896
1897 GLOBAL(void)
1898 jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1899 JCOEFPTR coef_block,
1900 JSAMPARRAY output_buf, JDIMENSION output_col)
1901 {
1902 INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1903 INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
1904 INT32 z1, z2, z3, z4;
1905 JCOEFPTR inptr;
1906 ISLOW_MULT_TYPE * quantptr;
1907 int * wsptr;
1908 JSAMPROW outptr;
1909 JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1910 int ctr;
1911 int workspace[8*13]; /* buffers data between passes */
/* Workspace layout: 13 rows of 8 ints.  Pass 1 fills one column per
 * iteration (stride 8); pass 2 consumes one row of 8 per iteration.
 */
1912 SHIFT_TEMPS
1913
1914 /* Pass 1: process columns from input, store into work array. */
1915
1916 inptr = coef_block;
1917 quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1918 wsptr = workspace;
/* inptr, quantptr and wsptr all advance by one element per iteration,
 * so wsptr[8*k] below addresses row k of the current column.
 */
1919 for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1920 /* Even part */
1921
1922 z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1923 z1 <<= CONST_BITS;
1924 /* Add fudge factor here for final descale. */
/* The value added is half of 1 << (CONST_BITS-PASS1_BITS), the divisor
 * implied by the shift count used at the end of this pass; presumably
 * this makes RIGHT_SHIFT round instead of truncate.
 */
1925 z1 += ONE << (CONST_BITS-PASS1_BITS-1);
1926
1927 z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1928 z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1929 z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1930
1931 tmp10 = z3 + z4;
1932 tmp11 = z3 - z4;
1933
/* tmp12 and tmp13 are scratch values that are recomputed three times
 * below; each pair feeds the two tmp2x results that follow it.
 */
1934 tmp12 = MULTIPLY(tmp10, FIX(1.155388986)); /* (c4+c6)/2 */
1935 tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1; /* (c4-c6)/2 */
1936
1937 tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13; /* c2 */
1938 tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13; /* c10 */
1939
1940 tmp12 = MULTIPLY(tmp10, FIX(0.316450131)); /* (c8-c12)/2 */
1941 tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1; /* (c8+c12)/2 */
1942
1943 tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13; /* c6 */
1944 tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
1945
1946 tmp12 = MULTIPLY(tmp10, FIX(0.435816023)); /* (c2-c10)/2 */
1947 tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1; /* (c2+c10)/2 */
1948
1949 tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
1950 tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
1951
/* tmp26 is the even-part value for the middle output row (row 6). */
1952 tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1; /* c0 */
1953
1954 /* Odd part */
1955
1956 z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1957 z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1958 z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1959 z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1960
1961 tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651)); /* c3 */
1962 tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945)); /* c5 */
/* tmp15 temporarily holds the plain sum z1 + z4; it is multiplied by c7
 * on the next line and replaced by its c11 product further down.
 */
1963 tmp15 = z1 + z4;
1964 tmp13 = MULTIPLY(tmp15, FIX(0.937797057)); /* c7 */
1965 tmp10 = tmp11 + tmp12 + tmp13 -
1966 MULTIPLY(z1, FIX(2.020082300)); /* c7+c5+c3-c1 */
/* tmp14 is reused as a shared scratch product three times before it
 * receives its final value.
 */
1967 tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458)); /* -c11 */
1968 tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
1969 tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
1970 tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945)); /* -c5 */
1971 tmp11 += tmp14;
1972 tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
1973 tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813)); /* -c9 */
1974 tmp12 += tmp14;
1975 tmp13 += tmp14;
1976 tmp15 = MULTIPLY(tmp15, FIX(0.338443458)); /* c11 */
1977 tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
1978 MULTIPLY(z2, FIX(0.466105296)); /* c1-c7 */
/* z1 no longer holds input 1 from here on; it is reused for a product. */
1979 z1 = MULTIPLY(z3 - z2, FIX(0.937797057)); /* c7 */
1980 tmp14 += z1;
1981 tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) - /* c3-c7 */
1982 MULTIPLY(z4, FIX(1.742345811)); /* c1+c11 */
1983
1984 /* Final output stage */
/* Rows k and 12-k are the even-part value plus/minus the odd-part value;
 * row 6 uses tmp26 alone.  Only CONST_BITS-PASS1_BITS bits are shifted
 * out, whereas the DC term was shifted left by CONST_BITS above.
 */
1985
1986 wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
1987 wsptr[8*12] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
1988 wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
1989 wsptr[8*11] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
1990 wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
1991 wsptr[8*10] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
1992 wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
1993 wsptr[8*9] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
1994 wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
1995 wsptr[8*8] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
1996 wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
1997 wsptr[8*7] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
1998 wsptr[8*6] = (int) RIGHT_SHIFT(tmp26, CONST_BITS-PASS1_BITS);
1999 }
2000
2001 /* Pass 2: process 13 rows from work array, store into output array. */
/* The arithmetic mirrors pass 1, reading wsptr[0..7] instead of
 * dequantized coefficients.
 */
2002
2003 wsptr = workspace;
2004 for (ctr = 0; ctr < 13; ctr++) {
2005 outptr = output_buf[ctr] + output_col;
2006
2007 /* Even part */
2008
2009 /* Add range center and fudge factor for final descale and range-limit. */
/* Both terms are added before the CONST_BITS shift, so they end up as
 * RANGE_CENTER << (CONST_BITS+PASS1_BITS+3) and half of
 * 1 << (CONST_BITS+PASS1_BITS+3), matching the final shift count.
 */
2010 z1 = (INT32) wsptr[0] +
2011 ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
2012 (ONE << (PASS1_BITS+2)));
2013 z1 <<= CONST_BITS;
2014
2015 z2 = (INT32) wsptr[2];
2016 z3 = (INT32) wsptr[4];
2017 z4 = (INT32) wsptr[6];
2018
2019 tmp10 = z3 + z4;
2020 tmp11 = z3 - z4;
2021
2022 tmp12 = MULTIPLY(tmp10, FIX(1.155388986)); /* (c4+c6)/2 */
2023 tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1; /* (c4-c6)/2 */
2024
2025 tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13; /* c2 */
2026 tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13; /* c10 */
2027
2028 tmp12 = MULTIPLY(tmp10, FIX(0.316450131)); /* (c8-c12)/2 */
2029 tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1; /* (c8+c12)/2 */
2030
2031 tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13; /* c6 */
2032 tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
2033
2034 tmp12 = MULTIPLY(tmp10, FIX(0.435816023)); /* (c2-c10)/2 */
2035 tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1; /* (c2+c10)/2 */
2036
2037 tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
2038 tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
2039
2040 tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1; /* c0 */
2041
2042 /* Odd part */
2043
2044 z1 = (INT32) wsptr[1];
2045 z2 = (INT32) wsptr[3];
2046 z3 = (INT32) wsptr[5];
2047 z4 = (INT32) wsptr[7];
2048
2049 tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651)); /* c3 */
2050 tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945)); /* c5 */
2051 tmp15 = z1 + z4;
2052 tmp13 = MULTIPLY(tmp15, FIX(0.937797057)); /* c7 */
2053 tmp10 = tmp11 + tmp12 + tmp13 -
2054 MULTIPLY(z1, FIX(2.020082300)); /* c7+c5+c3-c1 */
2055 tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458)); /* -c11 */
2056 tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
2057 tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
2058 tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945)); /* -c5 */
2059 tmp11 += tmp14;
2060 tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
2061 tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813)); /* -c9 */
2062 tmp12 += tmp14;
2063 tmp13 += tmp14;
2064 tmp15 = MULTIPLY(tmp15, FIX(0.338443458)); /* c11 */
2065 tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
2066 MULTIPLY(z2, FIX(0.466105296)); /* c1-c7 */
2067 z1 = MULTIPLY(z3 - z2, FIX(0.937797057)); /* c7 */
2068 tmp14 += z1;
2069 tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) - /* c3-c7 */
2070 MULTIPLY(z4, FIX(1.742345811)); /* c1+c11 */
2071
2072 /* Final output stage */
/* Samples k and 12-k are even plus/minus odd; sample 6 uses tmp26 alone.
 * Each descaled value is masked with RANGE_MASK before it indexes
 * range_limit.
 */
2073
2074 outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
2075 CONST_BITS+PASS1_BITS+3)
2076 & RANGE_MASK];
2077 outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
2078 CONST_BITS+PASS1_BITS+3)
2079 & RANGE_MASK];
2080 outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
2081 CONST_BITS+PASS1_BITS+3)
2082 & RANGE_MASK];
2083 outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
2084 CONST_BITS+PASS1_BITS+3)
2085 & RANGE_MASK];
2086 outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
2087 CONST_BITS+PASS1_BITS+3)
2088 & RANGE_MASK];
2089 outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
2090 CONST_BITS+PASS1_BITS+3)
2091 & RANGE_MASK];
2092 outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
2093 CONST_BITS+PASS1_BITS+3)
2094 & RANGE_MASK];
2095 outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
2096 CONST_BITS+PASS1_BITS+3)
2097 & RANGE_MASK];
2098 outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
2099 CONST_BITS+PASS1_BITS+3)
2100 & RANGE_MASK];
2101 outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
2102 CONST_BITS+PASS1_BITS+3)
2103 & RANGE_MASK];
2104 outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
2105 CONST_BITS+PASS1_BITS+3)
2106 & RANGE_MASK];
2107 outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
2108 CONST_BITS+PASS1_BITS+3)
2109 & RANGE_MASK];
2110 outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26,
2111 CONST_BITS+PASS1_BITS+3)
2112 & RANGE_MASK];
2113
2114 wsptr += 8; /* advance pointer to next row */
2115 }
2116 }
2117
2118
2119 /*
2120 * Perform dequantization and inverse DCT on one block of coefficients,
2121 * producing a 14x14 output block.
2122 *
2123 * Optimized algorithm with 20 multiplications in the 1-D kernel.
2124 * cK represents sqrt(2) * cos(K*pi/28).
 *
 * cinfo is used only to obtain the range-limit table via IDCT_range_limit().
 * compptr->dct_table is cast to ISLOW_MULT_TYPE * and indexed in step with
 * coef_block as the second argument of DEQUANTIZE.
 * coef_block is read at offsets DCTSIZE*0 .. DCTSIZE*7 for each of 8 columns.
 * Row ctr of the result (ctr = 0..13) is written as 14 samples starting at
 * output_buf[ctr] + output_col.
 * The column count and the workspace stride are hard-coded to 8; presumably
 * DCTSIZE is 8 as well (its definition is not visible here).
2125 */
2126
2127 GLOBAL(void)
2128 jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2129 JCOEFPTR coef_block,
2130 JSAMPARRAY output_buf, JDIMENSION output_col)
2131 {
2132 INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2133 INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
2134 INT32 z1, z2, z3, z4;
2135 JCOEFPTR inptr;
2136 ISLOW_MULT_TYPE * quantptr;
2137 int * wsptr;
2138 JSAMPROW outptr;
2139 JSAMPLE *range_limit = IDCT_range_limit(cinfo);
2140 int ctr;
2141 int workspace[8*14]; /* buffers data between passes */
/* Workspace layout: 14 rows of 8 ints.  Pass 1 fills one column per
 * iteration (stride 8); pass 2 consumes one row of 8 per iteration.
 */
2142 SHIFT_TEMPS
2143
2144 /* Pass 1: process columns from input, store into work array. */
2145
2146 inptr = coef_block;
2147 quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
2148 wsptr = workspace;
/* inptr, quantptr and wsptr all advance by one element per iteration,
 * so wsptr[8*k] below addresses row k of the current column.
 */
2149 for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
2150 /* Even part */
2151
2152 z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
2153 z1 <<= CONST_BITS;
2154 /* Add fudge factor here for final descale. */
/* The value added is half of 1 << (CONST_BITS-PASS1_BITS), the divisor
 * implied by the shift count used at the end of this pass; presumably
 * this makes RIGHT_SHIFT round instead of truncate.
 */
2155 z1 += ONE << (CONST_BITS-PASS1_BITS-1);
/* z4 first holds input 4, is multiplied into z2 and z3, and is then
 * overwritten by its own c8 product.
 */
2156 z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
2157 z2 = MULTIPLY(z4, FIX(1.274162392)); /* c4 */
2158 z3 = MULTIPLY(z4, FIX(0.314692123)); /* c12 */
2159 z4 = MULTIPLY(z4, FIX(0.881747734)); /* c8 */
2160
2161 tmp10 = z1 + z2;
2162 tmp11 = z1 + z3;
2163 tmp12 = z1 - z4;
2164
/* Unlike the other tmp2x values, tmp23 is descaled right here; this is
 * why rows 3 and 10 are stored without a RIGHT_SHIFT in the output stage.
 */
2165 tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */
2166 CONST_BITS-PASS1_BITS);
2167
2168 z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
2169 z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
2170
2171 z3 = MULTIPLY(z1 + z2, FIX(1.105676686)); /* c6 */
2172
2173 tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
2174 tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
2175 tmp15 = MULTIPLY(z1, FIX(0.613604268)) - /* c10 */
2176 MULTIPLY(z2, FIX(1.378756276)); /* c2 */
2177
2178 tmp20 = tmp10 + tmp13;
2179 tmp26 = tmp10 - tmp13;
2180 tmp21 = tmp11 + tmp14;
2181 tmp25 = tmp11 - tmp14;
2182 tmp22 = tmp12 + tmp15;
2183 tmp24 = tmp12 - tmp15;
2184
2185 /* Odd part */
2186
2187 z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
2188 z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
2189 z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
2190 z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
/* tmp13 holds input 7 shifted left by CONST_BITS until it is
 * reassigned just before the output stage.
 */
2191 tmp13 = z4 << CONST_BITS;
2192
/* tmp14 temporarily holds the plain sum z1 + z3; it is multiplied by c5
 * into tmp12 and then replaced by its own c9 product.
 */
2193 tmp14 = z1 + z3;
2194 tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607)); /* c3 */
2195 tmp12 = MULTIPLY(tmp14, FIX(1.197448846)); /* c5 */
2196 tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
2197 tmp14 = MULTIPLY(tmp14, FIX(0.752406978)); /* c9 */
2198 tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426)); /* c9+c11-c13 */
/* From here z1 is modified in place: first to (input 1 - input 3). */
2199 z1 -= z2;
2200 tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13; /* c11 */
2201 tmp16 += tmp15;
/* z4 still holds input 7 here, so z1 becomes input 1 - input 3 + input 7.
 * z4 is reused as a scratch product on the following lines.
 */
2202 z1 += z4;
2203 z4 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
2204 tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948)); /* c3-c9-c13 */
2205 tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773)); /* c3+c5-c13 */
2206 z4 = MULTIPLY(z3 - z2, FIX(1.405321284)); /* c1 */
2207 tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
2208 tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567)); /* c1+c11-c5 */
2209
/* tmp13 becomes (input 1 - input 3 - input 5 + input 7) shifted left by
 * PASS1_BITS only; it pairs with the already descaled tmp23.
 */
2210 tmp13 = (z1 - z3) << PASS1_BITS;
2211
2212 /* Final output stage */
/* Rows k and 13-k are the even-part value plus/minus the odd-part value.
 * Rows 3 and 10 are stored directly because tmp23 and tmp13 were brought
 * to the output scale above.
 */
2213
2214 wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
2215 wsptr[8*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
2216 wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
2217 wsptr[8*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
2218 wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
2219 wsptr[8*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
2220 wsptr[8*3] = (int) (tmp23 + tmp13);
2221 wsptr[8*10] = (int) (tmp23 - tmp13);
2222 wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
2223 wsptr[8*9] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
2224 wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
2225 wsptr[8*8] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
2226 wsptr[8*6] = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
2227 wsptr[8*7] = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
2228 }
2229
2230 /* Pass 2: process 14 rows from work array, store into output array. */
/* The arithmetic follows pass 1 but reads wsptr[0..7]; here tmp23 and
 * tmp13 stay at full scale and go through the same RIGHT_SHIFT as the
 * other samples.
 */
2231
2232 wsptr = workspace;
2233 for (ctr = 0; ctr < 14; ctr++) {
2234 outptr = output_buf[ctr] + output_col;
2235
2236 /* Even part */
2237
2238 /* Add range center and fudge factor for final descale and range-limit. */
/* Both terms are added before the CONST_BITS shift, so they end up as
 * RANGE_CENTER << (CONST_BITS+PASS1_BITS+3) and half of
 * 1 << (CONST_BITS+PASS1_BITS+3), matching the final shift count.
 */
2239 z1 = (INT32) wsptr[0] +
2240 ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
2241 (ONE << (PASS1_BITS+2)));
2242 z1 <<= CONST_BITS;
2243 z4 = (INT32) wsptr[4];
2244 z2 = MULTIPLY(z4, FIX(1.274162392)); /* c4 */
2245 z3 = MULTIPLY(z4, FIX(0.314692123)); /* c12 */
2246 z4 = MULTIPLY(z4, FIX(0.881747734)); /* c8 */
2247
2248 tmp10 = z1 + z2;
2249 tmp11 = z1 + z3;
2250 tmp12 = z1 - z4;
2251
2252 tmp23 = z1 - ((z2 + z3 - z4) << 1); /* c0 = (c4+c12-c8)*2 */
2253
2254 z1 = (INT32) wsptr[2];
2255 z2 = (INT32) wsptr[6];
2256
2257 z3 = MULTIPLY(z1 + z2, FIX(1.105676686)); /* c6 */
2258
2259 tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
2260 tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
2261 tmp15 = MULTIPLY(z1, FIX(0.613604268)) - /* c10 */
2262 MULTIPLY(z2, FIX(1.378756276)); /* c2 */
2263
2264 tmp20 = tmp10 + tmp13;
2265 tmp26 = tmp10 - tmp13;
2266 tmp21 = tmp11 + tmp14;
2267 tmp25 = tmp11 - tmp14;
2268 tmp22 = tmp12 + tmp15;
2269 tmp24 = tmp12 - tmp15;
2270
2271 /* Odd part */
2272
2273 z1 = (INT32) wsptr[1];
2274 z2 = (INT32) wsptr[3];
2275 z3 = (INT32) wsptr[5];
2276 z4 = (INT32) wsptr[7];
/* In this pass z4 itself keeps the CONST_BITS-shifted value of
 * wsptr[7] to the end, and tmp13 serves as the scratch variable.
 */
2277 z4 <<= CONST_BITS;
2278
2279 tmp14 = z1 + z3;
2280 tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607)); /* c3 */
2281 tmp12 = MULTIPLY(tmp14, FIX(1.197448846)); /* c5 */
2282 tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
2283 tmp14 = MULTIPLY(tmp14, FIX(0.752406978)); /* c9 */
2284 tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426)); /* c9+c11-c13 */
/* z1 becomes wsptr[1] - wsptr[3] and keeps that value below. */
2285 z1 -= z2;
2286 tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4; /* c11 */
2287 tmp16 += tmp15;
2288 tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4; /* -c13 */
2289 tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948)); /* c3-c9-c13 */
2290 tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773)); /* c3+c5-c13 */
2291 tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284)); /* c1 */
2292 tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
2293 tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567)); /* c1+c11-c5 */
2294
/* (wsptr[1] - wsptr[3] - wsptr[5] + wsptr[7]) shifted left by CONST_BITS. */
2295 tmp13 = ((z1 - z3) << CONST_BITS) + z4;
2296
2297 /* Final output stage */
/* Samples k and 13-k are even plus/minus odd.  Each descaled value is
 * masked with RANGE_MASK before it indexes range_limit.
 */
2298
2299 outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
2300 CONST_BITS+PASS1_BITS+3)
2301 & RANGE_MASK];
2302 outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
2303 CONST_BITS+PASS1_BITS+3)
2304 & RANGE_MASK];
2305 outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
2306 CONST_BITS+PASS1_BITS+3)
2307 & RANGE_MASK];
2308 outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
2309 CONST_BITS+PASS1_BITS+3)
2310 & RANGE_MASK];
2311 outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
2312 CONST_BITS+PASS1_BITS+3)
2313 & RANGE_MASK];
2314 outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
2315 CONST_BITS+PASS1_BITS+3)
2316 & RANGE_MASK];
2317 outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
2318 CONST_BITS+PASS1_BITS+3)
2319 & RANGE_MASK];
2320 outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
2321 CONST_BITS+PASS1_BITS+3)
2322 & RANGE_MASK];
2323 outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
2324 CONST_BITS+PASS1_BITS+3)
2325 & RANGE_MASK];
2326 outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
2327 CONST_BITS+PASS1_BITS+3)
2328 & RANGE_MASK];
2329 outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
2330 CONST_BITS+PASS1_BITS+3)
2331 & RANGE_MASK];
2332 outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
2333 CONST_BITS+PASS1_BITS+3)
2334 & RANGE_MASK];
2335 outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
2336 CONST_BITS+PASS1_BITS+3)
2337 & RANGE_MASK];
2338 outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
2339 CONST_BITS+PASS1_BITS+3)
2340 & RANGE_MASK];
2341
2342 wsptr += 8; /* advance pointer to next row */
2343 }
2344 }
2345
2346
2347 /*
2348 * Perform dequantization and inverse DCT on one block of coefficients,
2349 * producing a 15x15 output block.
2350 *
2351 * Optimized algorithm with 22 multiplications in the 1-D kernel.
2352 * cK represents sqrt(2) * cos(K*pi/30).
 *
 * cinfo is used only to obtain the range-limit table via IDCT_range_limit().
 * compptr->dct_table is cast to ISLOW_MULT_TYPE * and indexed in step with
 * coef_block as the second argument of DEQUANTIZE.
 * coef_block is read at offsets DCTSIZE*0 .. DCTSIZE*7 for each of 8 columns.
 * Row ctr of the result (ctr = 0..14) is written as 15 samples starting at
 * output_buf[ctr] + output_col.
 * The column count and the workspace stride are hard-coded to 8; presumably
 * DCTSIZE is 8 as well (its definition is not visible here).
2353 */
2354
2355 GLOBAL(void)
2356 jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2357 JCOEFPTR coef_block,
2358 JSAMPARRAY output_buf, JDIMENSION output_col)
2359 {
2360 INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2361 INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
2362 INT32 z1, z2, z3, z4;
2363 JCOEFPTR inptr;
2364 ISLOW_MULT_TYPE * quantptr;
2365 int * wsptr;
2366 JSAMPROW outptr;
2367 JSAMPLE *range_limit = IDCT_range_limit(cinfo);
2368 int ctr;
2369 int workspace[8*15]; /* buffers data between passes */
/* Workspace layout: 15 rows of 8 ints.  Pass 1 fills one column per
 * iteration (stride 8); pass 2 consumes one row of 8 per iteration.
 */
2370 SHIFT_TEMPS
2371
2372 /* Pass 1: process columns from input, store into work array. */
2373
2374 inptr = coef_block;
2375 quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
2376 wsptr = workspace;
/* inptr, quantptr and wsptr all advance by one element per iteration,
 * so wsptr[8*k] below addresses row k of the current column.
 */
2377 for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
2378 /* Even part */
2379
2380 z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
2381 z1 <<= CONST_BITS;
2382 /* Add fudge factor here for final descale. */
/* The value added is half of 1 << (CONST_BITS-PASS1_BITS), the divisor
 * implied by the shift count used at the end of this pass; presumably
 * this makes RIGHT_SHIFT round instead of truncate.
 */
2383 z1 += ONE << (CONST_BITS-PASS1_BITS-1);
2384
2385 z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
2386 z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
2387 z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
2388
2389 tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
2390 tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
2391
2392 tmp12 = z1 - tmp10;
2393 tmp13 = z1 + tmp11;
2394 z1 -= (tmp11 - tmp10) << 1; /* c0 = (c6-c12)*2 */
2395
/* Input 6 is no longer needed: z4 becomes the difference and z3 the sum
 * of inputs 2 and 4.  tmp10 and tmp11 are recomputed three times below
 * as scratch products of that sum and difference.
 */
2396 z4 = z2 - z3;
2397 z3 += z2;
2398 tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
2399 tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
2400 z2 = MULTIPLY(z2, FIX(1.439773946)); /* c4+c14 */
2401
2402 tmp20 = tmp13 + tmp10 + tmp11;
2403 tmp23 = tmp12 - tmp10 + tmp11 + z2;
2404
2405 tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
2406 tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
2407
2408 tmp25 = tmp13 - tmp10 - tmp11;
2409 tmp26 = tmp12 + tmp10 - tmp11 - z2;
2410
2411 tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
2412 tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
2413
2414 tmp21 = tmp12 + tmp10 + tmp11;
2415 tmp24 = tmp13 - tmp10 + tmp11;
/* tmp11 is doubled in place, so tmp22 adds twice and tmp27 subtracts
 * four times the (c6-c12)/2 product computed above.
 */
2416 tmp11 += tmp11;
2417 tmp22 = z1 + tmp11; /* c10 = c6-c12 */
2418 tmp27 = z1 - tmp11 - tmp11; /* c0 = (c6-c12)*2 */
2419
2420 /* Odd part */
2421
2422 z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
2423 z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
/* z4 briefly holds input 5 so that z3 can take its c5 product; z4 is
 * then loaded with input 7.
 */
2424 z4 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
2425 z3 = MULTIPLY(z4, FIX(1.224744871)); /* c5 */
2426 z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
2427
/* tmp13 and tmp15 are used as scratch here and receive new values
 * a few lines further down.
 */
2428 tmp13 = z2 - z4;
2429 tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876)); /* c9 */
2430 tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148)); /* c3-c9 */
2431 tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899)); /* c3+c9 */
2432
2433 tmp13 = MULTIPLY(z2, - FIX(0.831253876)); /* -c9 */
2434 tmp15 = MULTIPLY(z2, - FIX(1.344997024)); /* -c3 */
/* Input 3 has been fully consumed; z2 is reused for input 1 - input 7. */
2435 z2 = z1 - z4;
2436 tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353)); /* c1 */
2437
2438 tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
2439 tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
2440 tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3; /* c5 */
2441 z2 = MULTIPLY(z1 + z4, FIX(0.575212477)); /* c11 */
2442 tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3; /* c7-c11 */
2443 tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3; /* c11+c13 */
2444
2445 /* Final output stage */
/* Rows k and 14-k are the even-part value plus/minus the odd-part value;
 * row 7 uses tmp27 alone.  Only CONST_BITS-PASS1_BITS bits are shifted
 * out, whereas the DC term was shifted left by CONST_BITS above.
 */
2446
2447 wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
2448 wsptr[8*14] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
2449 wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
2450 wsptr[8*13] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
2451 wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
2452 wsptr[8*12] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
2453 wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
2454 wsptr[8*11] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
2455 wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
2456 wsptr[8*10] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
2457 wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
2458 wsptr[8*9] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
2459 wsptr[8*6] = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
2460 wsptr[8*8] = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
2461 wsptr[8*7] = (int) RIGHT_SHIFT(tmp27, CONST_BITS-PASS1_BITS);
2462 }
2463
2464 /* Pass 2: process 15 rows from work array, store into output array. */
/* The arithmetic mirrors pass 1, reading wsptr[0..7] instead of
 * dequantized coefficients.
 */
2465
2466 wsptr = workspace;
2467 for (ctr = 0; ctr < 15; ctr++) {
2468 outptr = output_buf[ctr] + output_col;
2469
2470 /* Even part */
2471
2472 /* Add range center and fudge factor for final descale and range-limit. */
/* Both terms are added before the CONST_BITS shift, so they end up as
 * RANGE_CENTER << (CONST_BITS+PASS1_BITS+3) and half of
 * 1 << (CONST_BITS+PASS1_BITS+3), matching the final shift count.
 */
2473 z1 = (INT32) wsptr[0] +
2474 ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
2475 (ONE << (PASS1_BITS+2)));
2476 z1 <<= CONST_BITS;
2477
2478 z2 = (INT32) wsptr[2];
2479 z3 = (INT32) wsptr[4];
2480 z4 = (INT32) wsptr[6];
2481
2482 tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
2483 tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
2484
2485 tmp12 = z1 - tmp10;
2486 tmp13 = z1 + tmp11;
2487 z1 -= (tmp11 - tmp10) << 1; /* c0 = (c6-c12)*2 */
2488
2489 z4 = z2 - z3;
2490 z3 += z2;
2491 tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
2492 tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
2493 z2 = MULTIPLY(z2, FIX(1.439773946)); /* c4+c14 */
2494
2495 tmp20 = tmp13 + tmp10 + tmp11;
2496 tmp23 = tmp12 - tmp10 + tmp11 + z2;
2497
2498 tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
2499 tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
2500
2501 tmp25 = tmp13 - tmp10 - tmp11;
2502 tmp26 = tmp12 + tmp10 - tmp11 - z2;
2503
2504 tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
2505 tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
2506
2507 tmp21 = tmp12 + tmp10 + tmp11;
2508 tmp24 = tmp13 - tmp10 + tmp11;
2509 tmp11 += tmp11;
2510 tmp22 = z1 + tmp11; /* c10 = c6-c12 */
2511 tmp27 = z1 - tmp11 - tmp11; /* c0 = (c6-c12)*2 */
2512
2513 /* Odd part */
2514
2515 z1 = (INT32) wsptr[1];
2516 z2 = (INT32) wsptr[3];
2517 z4 = (INT32) wsptr[5];
2518 z3 = MULTIPLY(z4, FIX(1.224744871)); /* c5 */
2519 z4 = (INT32) wsptr[7];
2520
2521 tmp13 = z2 - z4;
2522 tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876)); /* c9 */
2523 tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148)); /* c3-c9 */
2524 tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899)); /* c3+c9 */
2525
2526 tmp13 = MULTIPLY(z2, - FIX(0.831253876)); /* -c9 */
2527 tmp15 = MULTIPLY(z2, - FIX(1.344997024)); /* -c3 */
2528 z2 = z1 - z4;
2529 tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353)); /* c1 */
2530
2531 tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
2532 tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
2533 tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3; /* c5 */
2534 z2 = MULTIPLY(z1 + z4, FIX(0.575212477)); /* c11 */
2535 tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3; /* c7-c11 */
2536 tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3; /* c11+c13 */
2537
2538 /* Final output stage */
/* Samples k and 14-k are even plus/minus odd; sample 7 uses tmp27 alone.
 * Each descaled value is masked with RANGE_MASK before it indexes
 * range_limit.
 */
2539
2540 outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
2541 CONST_BITS+PASS1_BITS+3)
2542 & RANGE_MASK];
2543 outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
2544 CONST_BITS+PASS1_BITS+3)
2545 & RANGE_MASK];
2546 outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
2547 CONST_BITS+PASS1_BITS+3)
2548 & RANGE_MASK];
2549 outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
2550 CONST_BITS+PASS1_BITS+3)
2551 & RANGE_MASK];
2552 outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
2553 CONST_BITS+PASS1_BITS+3)
2554 & RANGE_MASK];
2555 outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
2556 CONST_BITS+PASS1_BITS+3)
2557 & RANGE_MASK];
2558 outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
2559 CONST_BITS+PASS1_BITS+3)
2560 & RANGE_MASK];
2561 outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
2562 CONST_BITS+PASS1_BITS+3)
2563 & RANGE_MASK];
2564 outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
2565 CONST_BITS+PASS1_BITS+3)
2566 & RANGE_MASK];
2567 outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
2568 CONST_BITS+PASS1_BITS+3)
2569 & RANGE_MASK];
2570 outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
2571 CONST_BITS+PASS1_BITS+3)
2572 & RANGE_MASK];
2573 outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
2574 CONST_BITS+PASS1_BITS+3)
2575 & RANGE_MASK];
2576 outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
2577 CONST_BITS+PASS1_BITS+3)
2578 & RANGE_MASK];
2579 outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
2580 CONST_BITS+PASS1_BITS+3)
2581 & RANGE_MASK];
2582 outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp27,
2583 CONST_BITS+PASS1_BITS+3)
2584 & RANGE_MASK];
2585
2586 wsptr += 8; /* advance pointer to next row */
2587 }
2588 }
2589
2590
2591 /*
2592 * Perform dequantization and inverse DCT on one block of coefficients,
2593 * producing a 16x16 output block.
2594 *
2595 * Optimized algorithm with 28 multiplications in the 1-D kernel.
2596 * cK represents sqrt(2) * cos(K*pi/32).
2597 */
2598
2599 GLOBAL(void)
2600 jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2601 JCOEFPTR coef_block,
2602 JSAMPARRAY output_buf, JDIMENSION output_col)
2603 {
2604 INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
2605 INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
2606 INT32 z1, z2, z3, z4;
2607 JCOEFPTR inptr;
2608 ISLOW_MULT_TYPE * quantptr;
2609 int * wsptr;
2610 JSAMPROW outptr;
2611 JSAMPLE *range_limit = IDCT_range_limit(cinfo);
2612 int ctr;
2613 int workspace[8*16]; /* buffers data between passes */
2614 SHIFT_TEMPS
2615
2616 /* Pass 1: process columns from input, store into work array. */
2617
2618 inptr = coef_block;
2619 quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
2620 wsptr = workspace;
2621 for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
2622 /* Even part */
2623
2624 tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
2625 tmp0 <<= CONST_BITS;
2626 /* Add fudge factor here for final descale. */
2627 tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
2628
2629 z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
2630 tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */
2631 tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */
2632
2633 tmp10 = tmp0 + tmp1;
2634 tmp11 = tmp0 - tmp1;
2635 tmp12 = tmp0 + tmp2;
2636 tmp13 = tmp0 - tmp2;
2637
2638 z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
2639 z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
2640 z3 = z1 - z2;
2641 z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */
2642 z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */
2643
2644 tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447); /* (c6+c2)[16] = (c3+c1)[8] */
2645 tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223); /* (c6-c14)[16] = (c3-c7)[8] */
2646 tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
2647 tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
2648
2649 tmp20 = tmp10 + tmp0;
2650 tmp27 = tmp10 - tmp0;
2651 tmp21 = tmp12 + tmp1;
2652 tmp26 = tmp12 - tmp1;
2653 tmp22 = tmp13 + tmp2;
2654 tmp25 = tmp13 - tmp2;
2655 tmp23 = tmp11 + tmp3;
2656 tmp24 = tmp11 - tmp3;
2657
2658 /* Odd part */
2659
2660 z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
2661 z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
2662 z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
2663 z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
2664
2665 tmp11 = z1 + z3;
2666
2667 tmp1 = MULTIPLY(z1 + z2, FIX(1.353318001)); /* c3 */
2668 tmp2 = MULTIPLY(tmp11, FIX(1.247225013)); /* c5 */
2669 tmp3 = MULTIPLY(z1 + z4, FIX(1.093201867)); /* c7 */
2670 tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586)); /* c9 */
2671 tmp11 = MULTIPLY(tmp11, FIX(0.666655658)); /* c11 */
2672 tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528)); /* c13 */
2673 tmp0 = tmp1 + tmp2 + tmp3 -
2674 MULTIPLY(z1, FIX(2.286341144)); /* c7+c5+c3-c1 */
2675 tmp13 = tmp10 + tmp11 + tmp12 -
2676 MULTIPLY(z1, FIX(1.835730603)); /* c9+c11+c13-c15 */
2677 z1 = MULTIPLY(z2 + z3, FIX(0.138617169)); /* c15 */
2678 tmp1 += z1 + MULTIPLY(z2, FIX(0.071888074)); /* c9+c11-c3-c15 */
2679 tmp2 += z1 - MULTIPLY(z3, FIX(1.125726048)); /* c5+c7+c15-c3 */
2680 z1 = MULTIPLY(z3 - z2, FIX(1.407403738)); /* c1 */
2681 tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282)); /* c1+c11-c9-c13 */
2682 tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411)); /* c1+c5+c13-c7 */
2683 z2 += z4;
2684 z1 = MULTIPLY(z2, - FIX(0.666655658)); /* -c11 */
2685 tmp1 += z1;
2686 tmp3 += z1 + MULTIPLY(z4, FIX(1.065388962)); /* c3+c11+c15-c7 */
2687 z2 = MULTIPLY(z2, - FIX(1.247225013)); /* -c5 */
2688 tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809)); /* c1+c5+c9-c13 */
2689 tmp12 += z2;
2690 z2 = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
2691 tmp2 += z2;
2692 tmp3 += z2;
2693 z2 = MULTIPLY(z4 - z3, FIX(0.410524528)); /* c13 */
2694 tmp10 += z2;
2695 tmp11 += z2;
2696
2697 /* Final output stage */
2698
2699 wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp0, CONST_BITS-PASS1_BITS);
2700 wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0, CONST_BITS-PASS1_BITS);
2701 wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp1, CONST_BITS-PASS1_BITS);
2702 wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1, CONST_BITS-PASS1_BITS);
2703 wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp2, CONST_BITS-PASS1_BITS);
2704 wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2, CONST_BITS-PASS1_BITS);
2705 wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp3, CONST_BITS-PASS1_BITS);
2706 wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3, CONST_BITS-PASS1_BITS);
2707 wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
2708 wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
2709 wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
2710 wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
2711 wsptr[8*6] = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
2712 wsptr[8*9] = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
2713 wsptr[8*7] = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
2714 wsptr[8*8] = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
2715 }
2716
2717 /* Pass 2: process 16 rows from work array, store into output array. */
2718
2719 wsptr = workspace;
2720 for (ctr = 0; ctr < 16; ctr++) {
2721 outptr = output_buf[ctr] + output_col;
2722
2723 /* Even part */
2724
2725 /* Add range center and fudge factor for final descale and range-limit. */
2726 tmp0 = (INT32) wsptr[0] +
2727 ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
2728 (ONE << (PASS1_BITS+2)));
2729 tmp0 <<= CONST_BITS;
2730
2731 z1 = (INT32) wsptr[4];
2732 tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */
2733 tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */
2734
2735 tmp10 = tmp0 + tmp1;
2736 tmp11 = tmp0 - tmp1;
2737 tmp12 = tmp0 + tmp2;
2738 tmp13 = tmp0 - tmp2;
2739
2740 z1 = (INT32) wsptr[2];
2741 z2 = (INT32) wsptr[6];
2742 z3 = z1 - z2;
2743 z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */
2744 z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */
2745
2746 tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447); /* (c6+c2)[16] = (c3+c1)[8] */
2747 tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223); /* (c6-c14)[16] = (c3-c7)[8] */
2748 tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
2749 tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
2750
2751 tmp20 = tmp10 + tmp0;
2752 tmp27 = tmp10 - tmp0;
2753 tmp21 = tmp12 + tmp1;
2754 tmp26 = tmp12 - tmp1;
2755 tmp22 = tmp13 + tmp2;
2756 tmp25 = tmp13 - tmp2;
2757 tmp23 = tmp11 + tmp3;
2758 tmp24 = tmp11 - tmp3;
2759
2760 /* Odd part */
2761
2762 z1 = (INT32) wsptr[1];
2763 z2 = (INT32) wsptr[3];
2764 z3 = (INT32) wsptr[5];
2765 z4 = (INT32) wsptr[7];
2766
2767 tmp11 = z1