2 * Mesa 3-D graphics library
5 * Copyright (C) 1999-2005 Brian Paul All Rights Reserved.
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 * \file t_arb_program.c
27 * Compile vertex programs to an intermediate representation.
28 * Execute vertex programs over a buffer of vertices.
29 * \author Keith Whitwell, Brian Paul
37 #include "arbprogparse.h"
40 #include "math/m_matrix.h"
41 #include "math/m_translate.h"
42 #include "t_context.h"
43 #include "t_pipeline.h"
44 #include "t_vb_arbprogram.h"
48 /*--------------------------------------------------------------------------- */
53 void (*print
)( union instruction
, const struct opcode_info
* );
58 union instruction
*csr
;
/* Fetch the arb_vp_machine that a pipeline stage keeps in its privatePtr
 * field.  The argument is parenthesized so that non-trivial pointer
 * expressions (for example "stages + 1") expand correctly.
 */
#define ARB_VP_MACHINE(stage) ((struct arb_vp_machine *)((stage)->privatePtr))
/* Replicate component 0 of a 4-element vector into components 1, 2 and 3.
 * The argument is expanded several times, so it must not have side effects.
 */
#define PUFF(x) ((x)[1] = ((x)[2] = ((x)[3] = (x)[0])))
68 /* Lower precision functions for the EXP, LOG and LIT opcodes. The
69 * LOG2() implementation is probably not accurate enough, and the
70 * attempted optimization for Exp2 is definitely not accurate
71 * enough - it discards all of t's fractional bits!
73 static GLfloat
RoughApproxLog2(GLfloat t
)
78 static GLfloat
RoughApproxExp2(GLfloat t
)
83 fi
.i
= (fi
.i
<< 23) + 0x3f800000;
86 return (GLfloat
) _mesa_pow(2.0, t
);
90 static GLfloat
RoughApproxPower(GLfloat x
, GLfloat y
)
92 return RoughApproxExp2(y
* RoughApproxLog2(x
));
96 /* Higher precision functions for the EX2, LG2 and POW opcodes:
98 static GLfloat
ApproxLog2(GLfloat t
)
100 return (GLfloat
) (log(t
) * 1.442695F
);
103 static GLfloat
ApproxExp2(GLfloat t
)
105 return (GLfloat
) _mesa_pow(2.0, t
);
108 static GLfloat
ApproxPower(GLfloat x
, GLfloat y
)
110 return (GLfloat
) _mesa_pow(x
, y
);
113 static GLfloat
rough_approx_log2_0_1(GLfloat x
)
122 * Perform a reduced swizzle:
124 static void do_RSW( struct arb_vp_machine
*m
, union instruction op
)
126 GLfloat
*result
= m
->File
[0][op
.rsw
.dst
];
127 const GLfloat
*arg0
= m
->File
[op
.rsw
.file0
][op
.rsw
.idx0
];
128 GLuint swz
= op
.rsw
.swz
;
129 GLuint neg
= op
.rsw
.neg
;
131 result
[0] = arg0
[GET_RSW(swz
, 0)];
132 result
[1] = arg0
[GET_RSW(swz
, 1)];
133 result
[2] = arg0
[GET_RSW(swz
, 2)];
134 result
[3] = arg0
[GET_RSW(swz
, 3)];
137 if (neg
& 0x1) result
[0] = -result
[0];
138 if (neg
& 0x2) result
[1] = -result
[1];
139 if (neg
& 0x4) result
[2] = -result
[2];
140 if (neg
& 0x8) result
[3] = -result
[3];
144 /* Used to implement write masking. To make things easier for the sse
145 * generator I've gone back to a 1 argument version of this function
146 * (dst.msk = arg), rather than the semantically cleaner (dst = SEL
149 * That means this is the only instruction which doesn't write a full
150 * 4 dwords out. This would make such a program harder to analyse,
151 * but it looks like analysis is going to take place on a higher level
154 static void do_MSK( struct arb_vp_machine
*m
, union instruction op
)
156 GLfloat
*dst
= m
->File
[0][op
.msk
.dst
];
157 const GLfloat
*arg
= m
->File
[op
.msk
.file
][op
.msk
.idx
];
159 if (op
.msk
.mask
& 0x1) dst
[0] = arg
[0];
160 if (op
.msk
.mask
& 0x2) dst
[1] = arg
[1];
161 if (op
.msk
.mask
& 0x4) dst
[2] = arg
[2];
162 if (op
.msk
.mask
& 0x8) dst
[3] = arg
[3];
166 static void do_PRT( struct arb_vp_machine
*m
, union instruction op
)
168 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
170 _mesa_printf("%d: %f %f %f %f\n", m
->vtx_nr
,
171 arg0
[0], arg0
[1], arg0
[2], arg0
[3]);
176 * The traditional ALU and texturing instructions. All operate on
177 * internal registers and ignore write masks and swizzling issues.
180 static void do_ABS( struct arb_vp_machine
*m
, union instruction op
)
182 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
183 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
185 result
[0] = (arg0
[0] < 0.0) ? -arg0
[0] : arg0
[0];
186 result
[1] = (arg0
[1] < 0.0) ? -arg0
[1] : arg0
[1];
187 result
[2] = (arg0
[2] < 0.0) ? -arg0
[2] : arg0
[2];
188 result
[3] = (arg0
[3] < 0.0) ? -arg0
[3] : arg0
[3];
191 static void do_ADD( struct arb_vp_machine
*m
, union instruction op
)
193 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
194 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
195 const GLfloat
*arg1
= m
->File
[op
.alu
.file1
][op
.alu
.idx1
];
197 result
[0] = arg0
[0] + arg1
[0];
198 result
[1] = arg0
[1] + arg1
[1];
199 result
[2] = arg0
[2] + arg1
[2];
200 result
[3] = arg0
[3] + arg1
[3];
204 static void do_DP3( struct arb_vp_machine
*m
, union instruction op
)
206 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
207 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
208 const GLfloat
*arg1
= m
->File
[op
.alu
.file1
][op
.alu
.idx1
];
210 result
[0] = (arg0
[0] * arg1
[0] +
219 static void do_DP4( struct arb_vp_machine
*m
, union instruction op
)
221 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
222 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
223 const GLfloat
*arg1
= m
->File
[op
.alu
.file1
][op
.alu
.idx1
];
225 result
[0] = (arg0
[0] * arg1
[0] +
233 static void do_DPH( struct arb_vp_machine
*m
, union instruction op
)
235 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
236 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
237 const GLfloat
*arg1
= m
->File
[op
.alu
.file1
][op
.alu
.idx1
];
239 result
[0] = (arg0
[0] * arg1
[0] +
247 static void do_DST( struct arb_vp_machine
*m
, union instruction op
)
249 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
250 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
251 const GLfloat
*arg1
= m
->File
[op
.alu
.file1
][op
.alu
.idx1
];
254 result
[1] = arg0
[1] * arg1
[1];
260 /* Intended to be high precision:
262 static void do_EX2( struct arb_vp_machine
*m
, union instruction op
)
264 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
265 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
267 result
[0] = (GLfloat
)ApproxExp2(arg0
[0]);
272 /* Allowed to be lower precision:
274 static void do_EXP( struct arb_vp_machine
*m
, union instruction op
)
276 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
277 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
278 GLfloat tmp
= arg0
[0];
279 GLfloat flr_tmp
= FLOORF(tmp
);
280 GLfloat frac_tmp
= tmp
- flr_tmp
;
282 result
[0] = LDEXPF(1.0, (int)flr_tmp
);
283 result
[1] = frac_tmp
;
284 result
[2] = LDEXPF(rough_approx_log2_0_1(frac_tmp
), (int)flr_tmp
);
288 static void do_FLR( struct arb_vp_machine
*m
, union instruction op
)
290 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
291 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
293 result
[0] = FLOORF(arg0
[0]);
294 result
[1] = FLOORF(arg0
[1]);
295 result
[2] = FLOORF(arg0
[2]);
296 result
[3] = FLOORF(arg0
[3]);
299 static void do_FRC( struct arb_vp_machine
*m
, union instruction op
)
301 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
302 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
304 result
[0] = arg0
[0] - FLOORF(arg0
[0]);
305 result
[1] = arg0
[1] - FLOORF(arg0
[1]);
306 result
[2] = arg0
[2] - FLOORF(arg0
[2]);
307 result
[3] = arg0
[3] - FLOORF(arg0
[3]);
310 /* High precision log base 2:
312 static void do_LG2( struct arb_vp_machine
*m
, union instruction op
)
314 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
315 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
317 result
[0] = ApproxLog2(arg0
[0]);
323 static void do_LIT( struct arb_vp_machine
*m
, union instruction op
)
325 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
326 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
338 tmp
[2] = RoughApproxPower(arg0
[1], arg0
[3]);
342 COPY_4V(result
, tmp
);
346 /* Intended to allow a lower precision than required for LG2 above.
348 static void do_LOG( struct arb_vp_machine
*m
, union instruction op
)
350 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
351 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
352 GLfloat tmp
= FABSF(arg0
[0]);
354 GLfloat mantissa
= FREXPF(tmp
, &exponent
);
356 result
[0] = (GLfloat
) (exponent
- 1);
357 result
[1] = 2.0 * mantissa
; /* map [.5, 1) -> [1, 2) */
358 result
[2] = exponent
+ LOG2(mantissa
);
362 static void do_MAX( struct arb_vp_machine
*m
, union instruction op
)
364 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
365 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
366 const GLfloat
*arg1
= m
->File
[op
.alu
.file1
][op
.alu
.idx1
];
368 result
[0] = (arg0
[0] > arg1
[0]) ? arg0
[0] : arg1
[0];
369 result
[1] = (arg0
[1] > arg1
[1]) ? arg0
[1] : arg1
[1];
370 result
[2] = (arg0
[2] > arg1
[2]) ? arg0
[2] : arg1
[2];
371 result
[3] = (arg0
[3] > arg1
[3]) ? arg0
[3] : arg1
[3];
375 static void do_MIN( struct arb_vp_machine
*m
, union instruction op
)
377 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
378 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
379 const GLfloat
*arg1
= m
->File
[op
.alu
.file1
][op
.alu
.idx1
];
381 result
[0] = (arg0
[0] < arg1
[0]) ? arg0
[0] : arg1
[0];
382 result
[1] = (arg0
[1] < arg1
[1]) ? arg0
[1] : arg1
[1];
383 result
[2] = (arg0
[2] < arg1
[2]) ? arg0
[2] : arg1
[2];
384 result
[3] = (arg0
[3] < arg1
[3]) ? arg0
[3] : arg1
[3];
387 static void do_MOV( struct arb_vp_machine
*m
, union instruction op
)
389 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
390 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
398 static void do_MUL( struct arb_vp_machine
*m
, union instruction op
)
400 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
401 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
402 const GLfloat
*arg1
= m
->File
[op
.alu
.file1
][op
.alu
.idx1
];
404 result
[0] = arg0
[0] * arg1
[0];
405 result
[1] = arg0
[1] * arg1
[1];
406 result
[2] = arg0
[2] * arg1
[2];
407 result
[3] = arg0
[3] * arg1
[3];
411 /* Intended to be "high" precision
413 static void do_POW( struct arb_vp_machine
*m
, union instruction op
)
415 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
416 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
417 const GLfloat
*arg1
= m
->File
[op
.alu
.file1
][op
.alu
.idx1
];
419 result
[0] = (GLfloat
)ApproxPower(arg0
[0], arg1
[0]);
423 static void do_REL( struct arb_vp_machine
*m
, union instruction op
)
425 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
426 GLuint idx
= (op
.alu
.idx0
+ (GLint
)m
->File
[0][REG_ADDR
][0]) & (MAX_NV_VERTEX_PROGRAM_PARAMS
-1);
427 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][idx
];
435 static void do_RCP( struct arb_vp_machine
*m
, union instruction op
)
437 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
438 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
440 result
[0] = 1.0F
/ arg0
[0];
444 static void do_RSQ( struct arb_vp_machine
*m
, union instruction op
)
446 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
447 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
449 result
[0] = INV_SQRTF(FABSF(arg0
[0]));
454 static void do_SGE( struct arb_vp_machine
*m
, union instruction op
)
456 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
457 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
458 const GLfloat
*arg1
= m
->File
[op
.alu
.file1
][op
.alu
.idx1
];
460 result
[0] = (arg0
[0] >= arg1
[0]) ? 1.0F
: 0.0F
;
461 result
[1] = (arg0
[1] >= arg1
[1]) ? 1.0F
: 0.0F
;
462 result
[2] = (arg0
[2] >= arg1
[2]) ? 1.0F
: 0.0F
;
463 result
[3] = (arg0
[3] >= arg1
[3]) ? 1.0F
: 0.0F
;
467 static void do_SLT( struct arb_vp_machine
*m
, union instruction op
)
469 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
470 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
471 const GLfloat
*arg1
= m
->File
[op
.alu
.file1
][op
.alu
.idx1
];
473 result
[0] = (arg0
[0] < arg1
[0]) ? 1.0F
: 0.0F
;
474 result
[1] = (arg0
[1] < arg1
[1]) ? 1.0F
: 0.0F
;
475 result
[2] = (arg0
[2] < arg1
[2]) ? 1.0F
: 0.0F
;
476 result
[3] = (arg0
[3] < arg1
[3]) ? 1.0F
: 0.0F
;
479 static void do_SUB( struct arb_vp_machine
*m
, union instruction op
)
481 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
482 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
483 const GLfloat
*arg1
= m
->File
[op
.alu
.file1
][op
.alu
.idx1
];
485 result
[0] = arg0
[0] - arg1
[0];
486 result
[1] = arg0
[1] - arg1
[1];
487 result
[2] = arg0
[2] - arg1
[2];
488 result
[3] = arg0
[3] - arg1
[3];
492 static void do_XPD( struct arb_vp_machine
*m
, union instruction op
)
494 GLfloat
*result
= m
->File
[0][op
.alu
.dst
];
495 const GLfloat
*arg0
= m
->File
[op
.alu
.file0
][op
.alu
.idx0
];
496 const GLfloat
*arg1
= m
->File
[op
.alu
.file1
][op
.alu
.idx1
];
498 result
[0] = arg0
[1] * arg1
[2] - arg0
[2] * arg1
[1];
499 result
[1] = arg0
[2] * arg1
[0] - arg0
[0] * arg1
[2];
500 result
[2] = arg0
[0] * arg1
[1] - arg0
[1] * arg1
[0];
503 static void do_NOP( struct arb_vp_machine
*m
, union instruction op
)
507 /* Some useful debugging functions:
509 static void print_mask( GLuint mask
)
512 if (mask
&0x1) _mesa_printf("x");
513 if (mask
&0x2) _mesa_printf("y");
514 if (mask
&0x4) _mesa_printf("z");
515 if (mask
&0x8) _mesa_printf("w");
518 static void print_reg( GLuint file
, GLuint reg
)
520 static const char *reg_file
[] = {
530 else if (reg
>= REG_ARG0
&& reg
<= REG_ARG1
)
531 _mesa_printf("ARG%d", reg
- REG_ARG0
);
532 else if (reg
>= REG_TMP0
&& reg
<= REG_TMP11
)
533 _mesa_printf("TMP%d", reg
- REG_TMP0
);
534 else if (reg
>= REG_IN0
&& reg
<= REG_IN31
)
535 _mesa_printf("IN%d", reg
- REG_IN0
);
536 else if (reg
>= REG_OUT0
&& reg
<= REG_OUT14
)
537 _mesa_printf("OUT%d", reg
- REG_OUT0
);
538 else if (reg
== REG_ADDR
)
539 _mesa_printf("ADDR");
540 else if (reg
== REG_ID
)
543 _mesa_printf("REG%d", reg
);
546 _mesa_printf("%s:%d", reg_file
[file
], reg
);
550 static void print_RSW( union instruction op
, const struct opcode_info
*info
)
552 GLuint swz
= op
.rsw
.swz
;
553 GLuint neg
= op
.rsw
.neg
;
556 _mesa_printf("%s ", info
->string
);
557 print_reg(0, op
.rsw
.dst
);
559 print_reg(op
.rsw
.file0
, op
.rsw
.idx0
);
561 for (i
= 0; i
< 4; i
++, swz
>>= 2) {
562 const char *cswz
= "xyzw";
565 _mesa_printf("%c", cswz
[swz
&0x3]);
571 static void print_ALU( union instruction op
, const struct opcode_info
*info
)
573 _mesa_printf("%s ", info
->string
);
574 print_reg(0, op
.alu
.dst
);
576 print_reg(op
.alu
.file0
, op
.alu
.idx0
);
577 if (info
->nr_args
> 1) {
579 print_reg(op
.alu
.file1
, op
.alu
.idx1
);
584 static void print_MSK( union instruction op
, const struct opcode_info
*info
)
586 _mesa_printf("%s ", info
->string
);
587 print_reg(0, op
.msk
.dst
);
588 print_mask(op
.msk
.mask
);
590 print_reg(op
.msk
.file
, op
.msk
.idx
);
595 static void print_NOP( union instruction op
, const struct opcode_info
*info
)
603 static const struct opcode_info opcode_info
[] =
605 { 1, "ABS", print_ALU
},
606 { 2, "ADD", print_ALU
},
607 { 1, "ARL", print_NOP
},
608 { 2, "DP3", print_ALU
},
609 { 2, "DP4", print_ALU
},
610 { 2, "DPH", print_ALU
},
611 { 2, "DST", print_ALU
},
612 { 0, "END", print_NOP
},
613 { 1, "EX2", print_ALU
},
614 { 1, "EXP", print_ALU
},
615 { 1, "FLR", print_ALU
},
616 { 1, "FRC", print_ALU
},
617 { 1, "LG2", print_ALU
},
618 { 1, "LIT", print_ALU
},
619 { 1, "LOG", print_ALU
},
620 { 3, "MAD", print_NOP
},
621 { 2, "MAX", print_ALU
},
622 { 2, "MIN", print_ALU
},
623 { 1, "MOV", print_ALU
},
624 { 2, "MUL", print_ALU
},
625 { 2, "POW", print_ALU
},
626 { 1, "PRT", print_ALU
}, /* PRINT */
627 { 1, "RCC", print_NOP
},
628 { 1, "RCP", print_ALU
},
629 { 1, "RSQ", print_ALU
},
630 { 2, "SGE", print_ALU
},
631 { 2, "SLT", print_ALU
},
632 { 2, "SUB", print_ALU
},
633 { 1, "SWZ", print_NOP
},
634 { 2, "XPD", print_ALU
},
635 { 1, "RSW", print_RSW
},
636 { 2, "MSK", print_MSK
},
637 { 1, "REL", print_ALU
},
640 void _tnl_disassem_vba_insn( union instruction op
)
642 const struct opcode_info
*info
= &opcode_info
[op
.alu
.opcode
];
643 info
->print( op
, info
);
647 static void (* const opcode_func
[])(struct arb_vp_machine
*, union instruction
) =
684 static union instruction
*cvp_next_instruction( struct compilation
*cp
)
686 union instruction
*op
= cp
->csr
++;
691 static struct reg
cvp_make_reg( GLuint file
, GLuint idx
)
699 static struct reg
cvp_emit_rel( struct compilation
*cp
,
703 union instruction
*op
= cvp_next_instruction(cp
);
704 op
->alu
.opcode
= REL
;
705 op
->alu
.file0
= reg
.file
;
706 op
->alu
.idx0
= reg
.idx
;
707 op
->alu
.dst
= tmpreg
.idx
;
712 static struct reg
cvp_load_reg( struct compilation
*cp
,
718 struct reg tmpreg
= cvp_make_reg(FILE_REG
, tmpidx
);
722 case PROGRAM_TEMPORARY
:
723 return cvp_make_reg(FILE_REG
, REG_TMP0
+ index
);
726 return cvp_make_reg(FILE_REG
, REG_IN0
+ index
);
729 return cvp_make_reg(FILE_REG
, REG_OUT0
+ index
);
731 /* These two aren't populated by the parser?
733 case PROGRAM_LOCAL_PARAM
:
734 reg
= cvp_make_reg(FILE_LOCAL_PARAM
, index
);
736 return cvp_emit_rel(cp
, reg
, tmpreg
);
740 case PROGRAM_ENV_PARAM
:
741 reg
= cvp_make_reg(FILE_ENV_PARAM
, index
);
743 return cvp_emit_rel(cp
, reg
, tmpreg
);
747 case PROGRAM_STATE_VAR
:
748 reg
= cvp_make_reg(FILE_STATE_PARAM
, index
);
750 return cvp_emit_rel(cp
, reg
, tmpreg
);
756 case PROGRAM_WRITE_ONLY
:
757 case PROGRAM_ADDRESS
:
760 return tmpreg
; /* can't happen */
764 static struct reg
cvp_emit_arg( struct compilation
*cp
,
765 const struct vp_src_register
*src
,
768 struct reg reg
= cvp_load_reg( cp
, src
->File
, src
->Index
, src
->RelAddr
, arg
);
769 union instruction rsw
, noop
;
771 /* Emit any necessary swizzling.
774 rsw
.rsw
.neg
= src
->Negate
? WRITEMASK_XYZW
: 0;
775 rsw
.rsw
.swz
= ((GET_SWZ(src
->Swizzle
, 0) << 0) |
776 (GET_SWZ(src
->Swizzle
, 1) << 2) |
777 (GET_SWZ(src
->Swizzle
, 2) << 4) |
778 (GET_SWZ(src
->Swizzle
, 3) << 6));
782 noop
.rsw
.swz
= RSW_NOOP
;
784 if (rsw
.dword
!= noop
.dword
) {
785 union instruction
*op
= cvp_next_instruction(cp
);
786 struct reg rsw_reg
= cvp_make_reg(FILE_REG
, REG_ARG0
+ arg
);
787 op
->dword
= rsw
.dword
;
788 op
->rsw
.opcode
= RSW
;
789 op
->rsw
.file0
= reg
.file
;
790 op
->rsw
.idx0
= reg
.idx
;
791 op
->rsw
.dst
= rsw_reg
.idx
;
798 static GLuint
cvp_choose_result( struct compilation
*cp
,
799 const struct vp_dst_register
*dst
,
800 union instruction
*fixup
)
802 GLuint mask
= dst
->WriteMask
;
806 case PROGRAM_TEMPORARY
:
807 idx
= REG_TMP0
+ dst
->Index
;
810 idx
= REG_OUT0
+ dst
->Index
;
814 return REG_RES
; /* can't happen */
817 /* Optimization: When writing (with a writemask) to an undefined
818 * value for the first time, the writemask may be ignored.
820 if (mask
!= WRITEMASK_XYZW
&& (cp
->reg_active
& (1 << idx
))) {
821 fixup
->msk
.opcode
= MSK
;
822 fixup
->msk
.dst
= idx
;
823 fixup
->msk
.file
= FILE_REG
;
824 fixup
->msk
.idx
= REG_RES
;
825 fixup
->msk
.mask
= mask
;
826 cp
->reg_active
|= 1 << idx
;
831 cp
->reg_active
|= 1 << idx
;
836 static struct reg
cvp_emit_rsw( struct compilation
*cp
,
845 if (swz
!= RSW_NOOP
|| neg
!= 0) {
846 union instruction
*op
= cvp_next_instruction(cp
);
847 op
->rsw
.opcode
= RSW
;
849 op
->rsw
.file0
= src
.file
;
850 op
->rsw
.idx0
= src
.idx
;
854 retval
.file
= FILE_REG
;
859 /* Oops. Degenerate case:
861 union instruction
*op
= cvp_next_instruction(cp
);
862 op
->alu
.opcode
= VP_OPCODE_MOV
;
864 op
->alu
.file0
= src
.file
;
865 op
->alu
.idx0
= src
.idx
;
867 retval
.file
= FILE_REG
;
877 static void cvp_emit_inst( struct compilation
*cp
,
878 const struct vp_instruction
*inst
)
880 const struct opcode_info
*info
= &opcode_info
[inst
->Opcode
];
881 union instruction
*op
;
882 union instruction fixup
;
886 assert(sizeof(*op
) == sizeof(GLuint
));
888 /* Need to handle SWZ, ARL specially.
890 switch (inst
->Opcode
) {
891 /* Split into mul and add:
894 result
= cvp_choose_result( cp
, &inst
->DstReg
, &fixup
);
895 for (i
= 0; i
< 3; i
++)
896 reg
[i
] = cvp_emit_arg( cp
, &inst
->SrcReg
[i
], REG_ARG0
+i
);
898 op
= cvp_next_instruction(cp
);
899 op
->alu
.opcode
= VP_OPCODE_MUL
;
900 op
->alu
.file0
= reg
[0].file
;
901 op
->alu
.idx0
= reg
[0].idx
;
902 op
->alu
.file1
= reg
[1].file
;
903 op
->alu
.idx1
= reg
[1].idx
;
904 op
->alu
.dst
= REG_ARG0
;
906 op
= cvp_next_instruction(cp
);
907 op
->alu
.opcode
= VP_OPCODE_ADD
;
908 op
->alu
.file0
= FILE_REG
;
909 op
->alu
.idx0
= REG_ARG0
;
910 op
->alu
.file1
= reg
[2].file
;
911 op
->alu
.idx1
= reg
[2].idx
;
912 op
->alu
.dst
= result
;
914 if (result
== REG_RES
) {
915 op
= cvp_next_instruction(cp
);
916 op
->dword
= fixup
.dword
;
921 reg
[0] = cvp_emit_arg( cp
, &inst
->SrcReg
[0], REG_ARG0
);
923 op
= cvp_next_instruction(cp
);
924 op
->alu
.opcode
= VP_OPCODE_FLR
;
925 op
->alu
.dst
= REG_ADDR
;
926 op
->alu
.file0
= reg
[0].file
;
927 op
->alu
.idx0
= reg
[0].idx
;
930 case VP_OPCODE_SWZ
: {
931 GLuint swz0
= 0, swz1
= 0;
932 GLuint neg0
= 0, neg1
= 0;
935 /* Translate 3-bit-per-element swizzle into two 2-bit swizzles,
936 * one from the source register the other from a constant
939 for (i
= 0; i
< 4; i
++) {
940 GLuint swzelt
= GET_SWZ(inst
->SrcReg
[0].Swizzle
, i
);
941 if (swzelt
>= SWIZZLE_ZERO
) {
942 neg0
|= inst
->SrcReg
[0].Negate
& (1<<i
);
943 if (swzelt
== SWIZZLE_ONE
)
944 swz0
|= SWIZZLE_W
<< (i
*2);
945 else if (i
< SWIZZLE_W
)
950 neg1
|= inst
->SrcReg
[0].Negate
& (1<<i
);
951 swz1
|= swzelt
<< (i
*2);
955 result
= cvp_choose_result( cp
, &inst
->DstReg
, &fixup
);
956 reg
[0].file
= FILE_REG
;
958 reg
[1] = cvp_emit_arg( cp
, &inst
->SrcReg
[0], REG_ARG0
);
960 if (mask
== WRITEMASK_XYZW
) {
961 cvp_emit_rsw(cp
, result
, reg
[0], neg0
, swz0
, GL_TRUE
);
964 else if (mask
== 0) {
965 cvp_emit_rsw(cp
, result
, reg
[1], neg1
, swz1
, GL_TRUE
);
968 cvp_emit_rsw(cp
, result
, reg
[0], neg0
, swz0
, GL_TRUE
);
969 reg
[1] = cvp_emit_rsw(cp
, REG_ARG0
, reg
[1], neg1
, swz1
, GL_FALSE
);
971 op
= cvp_next_instruction(cp
);
972 op
->msk
.opcode
= MSK
;
973 op
->msk
.dst
= result
;
974 op
->msk
.file
= reg
[1].file
;
975 op
->msk
.idx
= reg
[1].idx
;
979 if (result
== REG_RES
) {
980 op
= cvp_next_instruction(cp
);
981 op
->dword
= fixup
.dword
;
990 result
= cvp_choose_result( cp
, &inst
->DstReg
, &fixup
);
991 for (i
= 0; i
< info
->nr_args
; i
++)
992 reg
[i
] = cvp_emit_arg( cp
, &inst
->SrcReg
[i
], REG_ARG0
+ i
);
994 op
= cvp_next_instruction(cp
);
995 op
->alu
.opcode
= inst
->Opcode
;
996 op
->alu
.file0
= reg
[0].file
;
997 op
->alu
.idx0
= reg
[0].idx
;
998 op
->alu
.file1
= reg
[1].file
;
999 op
->alu
.idx1
= reg
[1].idx
;
1000 op
->alu
.dst
= result
;
1002 if (result
== REG_RES
) {
1003 op
= cvp_next_instruction(cp
);
1004 op
->dword
= fixup
.dword
;
1010 static void free_tnl_data( struct vertex_program
*program
)
1012 struct tnl_compiled_program
*p
= program
->TnlData
;
1013 if (p
->compiled_func
)
1014 _mesa_free((void *)p
->compiled_func
);
1016 program
->TnlData
= NULL
;
1019 static void compile_vertex_program( struct vertex_program
*program
,
1020 GLboolean try_codegen
)
1022 struct compilation cp
;
1023 struct tnl_compiled_program
*p
= CALLOC_STRUCT(tnl_compiled_program
);
1026 if (program
->TnlData
)
1027 free_tnl_data( program
);
1029 program
->TnlData
= p
;
1031 /* Initialize cp. Note that ctx and VB aren't used in compilation
1032 * so we don't have to worry about statechanges:
1034 memset(&cp
, 0, sizeof(cp
));
1035 cp
.csr
= p
->instructions
;
1037 /* Compile instructions:
1039 for (i
= 0; i
< program
->Base
.NumInstructions
; i
++) {
1040 cvp_emit_inst(&cp
, &program
->Instructions
[i
]);
1045 p
->nr_instructions
= cp
.csr
- p
->instructions
;
1047 /* Print/disassemble:
1050 for (i
= 0; i
< p
->nr_instructions
; i
++) {
1051 _tnl_disassem_vba_insn(p
->instructions
[i
]);
1053 _mesa_printf("\n\n");
1058 _tnl_sse_codegen_vertex_program(p
);
1066 /* ----------------------------------------------------------------------
1069 static void userclip( GLcontext
*ctx
,
1072 GLubyte
*clipormask
,
1073 GLubyte
*clipandmask
)
1077 for (p
= 0; p
< ctx
->Const
.MaxClipPlanes
; p
++) {
1078 if (ctx
->Transform
.ClipPlanesEnabled
& (1 << p
)) {
1080 const GLfloat a
= ctx
->Transform
._ClipUserPlane
[p
][0];
1081 const GLfloat b
= ctx
->Transform
._ClipUserPlane
[p
][1];
1082 const GLfloat c
= ctx
->Transform
._ClipUserPlane
[p
][2];
1083 const GLfloat d
= ctx
->Transform
._ClipUserPlane
[p
][3];
1084 GLfloat
*coord
= (GLfloat
*)clip
->data
;
1085 GLuint stride
= clip
->stride
;
1086 GLuint count
= clip
->count
;
1088 for (nr
= 0, i
= 0 ; i
< count
; i
++) {
1089 GLfloat dp
= (coord
[0] * a
+
1096 clipmask
[i
] |= CLIP_USER_BIT
;
1099 STRIDE_F(coord
, stride
);
1103 *clipormask
|= CLIP_USER_BIT
;
1105 *clipandmask
|= CLIP_USER_BIT
;
1114 static GLboolean
do_ndc_cliptest( struct arb_vp_machine
*m
)
1116 GLcontext
*ctx
= m
->ctx
;
1117 TNLcontext
*tnl
= TNL_CONTEXT(ctx
);
1118 struct vertex_buffer
*VB
= m
->VB
;
1120 /* Cliptest and perspective divide. Clip functions must clear
1124 m
->andmask
= CLIP_ALL_BITS
;
1126 if (tnl
->NeedNdcCoords
) {
1128 _mesa_clip_tab
[VB
->ClipPtr
->size
]( VB
->ClipPtr
,
1136 _mesa_clip_np_tab
[VB
->ClipPtr
->size
]( VB
->ClipPtr
,
1144 /* All vertices are outside the frustum */
1148 /* Test userclip planes. This contributes to VB->ClipMask.
1150 if (ctx
->Transform
.ClipPlanesEnabled
&& !ctx
->VertexProgram
._Enabled
) {
1162 VB
->ClipAndMask
= m
->andmask
;
1163 VB
->ClipOrMask
= m
->ormask
;
1164 VB
->ClipMask
= m
->clipmask
;
1170 static INLINE
void call_func( struct tnl_compiled_program
*p
,
1171 struct arb_vp_machine
*m
)
1173 p
->compiled_func(m
);
1177 * Execute the given vertex program.
1179 * TODO: Integrate the t_vertex.c code here, to build machine vertices
1180 * directly at this point.
1182 * TODO: Eliminate the VB struct entirely and just use
1183 * struct arb_vertex_machine.
1186 run_arb_vertex_program(GLcontext
*ctx
, struct tnl_pipeline_stage
*stage
)
1188 struct vertex_program
*program
= (ctx
->VertexProgram
._Enabled
?
1189 ctx
->VertexProgram
.Current
:
1191 struct vertex_buffer
*VB
= &TNL_CONTEXT(ctx
)->vb
;
1192 struct arb_vp_machine
*m
= ARB_VP_MACHINE(stage
);
1193 struct tnl_compiled_program
*p
;
1194 GLuint i
, j
, outputs
;
1196 if (!program
|| program
->IsNVProgram
)
1199 if (program
->Parameters
) {
1200 _mesa_load_state_parameters(ctx
, program
->Parameters
);
1203 p
= (struct tnl_compiled_program
*)program
->TnlData
;
1207 m
->nr_inputs
= m
->nr_outputs
= 0;
1209 for (i
= 0; i
< _TNL_ATTRIB_MAX
; i
++) {
1210 if (program
->InputsRead
& (1<<i
)) {
1211 GLuint j
= m
->nr_inputs
++;
1212 m
->input
[j
].idx
= i
;
1213 m
->input
[j
].data
= (GLfloat
*)m
->VB
->AttribPtr
[i
]->data
;
1214 m
->input
[j
].stride
= m
->VB
->AttribPtr
[i
]->stride
;
1215 m
->input
[j
].size
= m
->VB
->AttribPtr
[i
]->size
;
1216 ASSIGN_4V(m
->File
[0][REG_IN0
+ i
], 0, 0, 0, 1);
1220 for (i
= 0; i
< 15; i
++) {
1221 if (program
->OutputsWritten
& (1<<i
)) {
1222 GLuint j
= m
->nr_outputs
++;
1223 m
->output
[j
].idx
= i
;
1224 m
->output
[j
].data
= (GLfloat
*)m
->attribs
[i
].data
;
1229 /* Run the actual program:
1231 for (m
->vtx_nr
= 0; m
->vtx_nr
< VB
->Count
; m
->vtx_nr
++) {
1232 for (j
= 0; j
< m
->nr_inputs
; j
++) {
1233 GLuint idx
= REG_IN0
+ m
->input
[j
].idx
;
1234 switch (m
->input
[j
].size
) {
1235 case 4: m
->File
[0][idx
][3] = m
->input
[j
].data
[3];
1236 case 3: m
->File
[0][idx
][2] = m
->input
[j
].data
[2];
1237 case 2: m
->File
[0][idx
][1] = m
->input
[j
].data
[1];
1238 case 1: m
->File
[0][idx
][0] = m
->input
[j
].data
[0];
1241 STRIDE_F(m
->input
[j
].data
, m
->input
[j
].stride
);
1244 if (p
->compiled_func
) {
1248 for (j
= 0; j
< p
->nr_instructions
; j
++) {
1249 union instruction inst
= p
->instructions
[j
];
1250 opcode_func
[inst
.alu
.opcode
]( m
, inst
);
1254 for (j
= 0; j
< m
->nr_outputs
; j
++) {
1255 GLuint idx
= REG_OUT0
+ m
->output
[j
].idx
;
1256 m
->output
[j
].data
[0] = m
->File
[0][idx
][0];
1257 m
->output
[j
].data
[1] = m
->File
[0][idx
][1];
1258 m
->output
[j
].data
[2] = m
->File
[0][idx
][2];
1259 m
->output
[j
].data
[3] = m
->File
[0][idx
][3];
1260 m
->output
[j
].data
+= 4;
1264 /* Setup the VB pointers so that the next pipeline stages get
1265 * their data from the right place (the program output arrays).
1267 * TODO: 1) Have tnl use these RESULT values for outputs rather
1268 * than trying to shoe-horn inputs and outputs into one set of
1271 * TODO: 2) Integrate t_vertex.c so that we just go straight ahead
1272 * and build machine vertices here.
1274 VB
->ClipPtr
= &m
->attribs
[VERT_RESULT_HPOS
];
1275 VB
->ClipPtr
->count
= VB
->Count
;
1277 outputs
= program
->OutputsWritten
;
1279 if (outputs
& (1<<VERT_RESULT_COL0
)) {
1280 VB
->ColorPtr
[0] = &m
->attribs
[VERT_RESULT_COL0
];
1281 VB
->AttribPtr
[VERT_ATTRIB_COLOR0
] = VB
->ColorPtr
[0];
1284 if (outputs
& (1<<VERT_RESULT_BFC0
)) {
1285 VB
->ColorPtr
[1] = &m
->attribs
[VERT_RESULT_BFC0
];
1288 if (outputs
& (1<<VERT_RESULT_COL1
)) {
1289 VB
->SecondaryColorPtr
[0] = &m
->attribs
[VERT_RESULT_COL1
];
1290 VB
->AttribPtr
[VERT_ATTRIB_COLOR1
] = VB
->SecondaryColorPtr
[0];
1293 if (outputs
& (1<<VERT_RESULT_BFC1
)) {
1294 VB
->SecondaryColorPtr
[1] = &m
->attribs
[VERT_RESULT_BFC1
];
1297 if (outputs
& (1<<VERT_RESULT_FOGC
)) {
1298 VB
->FogCoordPtr
= &m
->attribs
[VERT_RESULT_FOGC
];
1299 VB
->AttribPtr
[VERT_ATTRIB_FOG
] = VB
->FogCoordPtr
;
1302 if (outputs
& (1<<VERT_RESULT_PSIZ
)) {
1303 VB
->PointSizePtr
= &m
->attribs
[VERT_RESULT_PSIZ
];
1304 VB
->AttribPtr
[_TNL_ATTRIB_POINTSIZE
] = &m
->attribs
[VERT_RESULT_PSIZ
];
1307 for (i
= 0; i
< ctx
->Const
.MaxTextureUnits
; i
++) {
1308 if (outputs
& (1<<(VERT_RESULT_TEX0
+i
))) {
1309 VB
->TexCoordPtr
[i
] = &m
->attribs
[VERT_RESULT_TEX0
+ i
];
1310 VB
->AttribPtr
[VERT_ATTRIB_TEX0
+i
] = VB
->TexCoordPtr
[i
];
1315 for (i
= 0; i
< VB
->Count
; i
++) {
1316 printf("Out %d: %f %f %f %f %f %f %f %f\n", i
,
1317 VEC_ELT(VB
->ClipPtr
, GLfloat
, i
)[0],
1318 VEC_ELT(VB
->ClipPtr
, GLfloat
, i
)[1],
1319 VEC_ELT(VB
->ClipPtr
, GLfloat
, i
)[2],
1320 VEC_ELT(VB
->ClipPtr
, GLfloat
, i
)[3],
1321 VEC_ELT(VB
->TexCoordPtr
[0], GLfloat
, i
)[0],
1322 VEC_ELT(VB
->TexCoordPtr
[0], GLfloat
, i
)[1],
1323 VEC_ELT(VB
->TexCoordPtr
[0], GLfloat
, i
)[2],
1324 VEC_ELT(VB
->TexCoordPtr
[0], GLfloat
, i
)[3]);
1328 /* Perform NDC and cliptest operations:
1330 return do_ndc_cliptest(m
);
1335 validate_vertex_program( GLcontext
*ctx
, struct tnl_pipeline_stage
*stage
)
1337 struct arb_vp_machine
*m
= ARB_VP_MACHINE(stage
);
1338 struct vertex_program
*program
=
1339 (ctx
->VertexProgram
._Enabled
? ctx
->VertexProgram
.Current
: 0);
1341 if (!program
&& ctx
->_MaintainTnlProgram
) {
1342 program
= ctx
->_TnlProgram
;
1346 if (!program
->TnlData
)
1347 compile_vertex_program( program
, m
->try_codegen
);
1349 /* Grab the GL state and put it into registers:
1351 m
->File
[FILE_LOCAL_PARAM
] = program
->Base
.LocalParams
;
1352 m
->File
[FILE_ENV_PARAM
] = ctx
->VertexProgram
.Parameters
;
1353 /* GL_NV_vertex_programs can't reference GL state */
1354 if (program
->Parameters
)
1355 m
->File
[FILE_STATE_PARAM
] = program
->Parameters
->ParameterValues
;
1357 m
->File
[FILE_STATE_PARAM
] = NULL
;
1368 * Called the first time stage->run is called. In effect, don't
1369 * allocate data until the first time the stage is run.
1371 static GLboolean
init_vertex_program( GLcontext
*ctx
,
1372 struct tnl_pipeline_stage
*stage
)
1374 TNLcontext
*tnl
= TNL_CONTEXT(ctx
);
1375 struct vertex_buffer
*VB
= &(tnl
->vb
);
1376 struct arb_vp_machine
*m
;
1377 const GLuint size
= VB
->Size
;
1380 stage
->privatePtr
= _mesa_malloc(sizeof(*m
));
1381 m
= ARB_VP_MACHINE(stage
);
1385 /* arb_vertex_machine struct should subsume the VB:
1390 m
->File
[0] = ALIGN_MALLOC(REG_MAX
* sizeof(GLfloat
) * 4, 16);
1392 /* Initialize regs where necessary:
1394 ASSIGN_4V(m
->File
[0][REG_ID
], 0, 0, 0, 1);
1395 ASSIGN_4V(m
->File
[0][REG_ONES
], 1, 1, 1, 1);
1396 ASSIGN_4V(m
->File
[0][REG_SWZ
], -1, 1, 0, 0);
1397 ASSIGN_4V(m
->File
[0][REG_NEG
], -1, -1, -1, -1);
1398 ASSIGN_4V(m
->File
[0][REG_LIT
], 1, 0, 0, 1);
1399 ASSIGN_4V(m
->File
[0][REG_LIT2
], 1, .5, .2, 1); /* debug value */
1401 if (_mesa_getenv("MESA_EXPERIMENTAL"))
1404 /* Allocate arrays of vertex output values */
1405 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
1406 _mesa_vector4f_alloc( &m
->attribs
[i
], 0, size
, 32 );
1407 m
->attribs
[i
].size
= 4;
1410 /* a few other misc allocations */
1411 _mesa_vector4f_alloc( &m
->ndcCoords
, 0, size
, 32 );
1412 m
->clipmask
= (GLubyte
*) ALIGN_MALLOC(sizeof(GLubyte
)*size
, 32 );
1414 if (ctx
->_MaintainTnlProgram
)
1415 _mesa_allow_light_in_model( ctx
, GL_FALSE
);
1417 m
->fpucntl_rnd_neg
= RND_NEG_FPU
; /* const value */
1418 m
->fpucntl_restore
= RESTORE_FPU
; /* const value */
1427 * Destructor for this pipeline stage.
1429 static void dtr( struct tnl_pipeline_stage
*stage
)
1431 struct arb_vp_machine
*m
= ARB_VP_MACHINE(stage
);
1436 /* free the vertex program result arrays */
1437 for (i
= 0; i
< VERT_RESULT_MAX
; i
++)
1438 _mesa_vector4f_free( &m
->attribs
[i
] );
1440 /* free misc arrays */
1441 _mesa_vector4f_free( &m
->ndcCoords
);
1442 ALIGN_FREE( m
->clipmask
);
1443 ALIGN_FREE( m
->File
[0] );
1446 stage
->privatePtr
= NULL
;
1451 * Public description of this pipeline stage.
1453 const struct tnl_pipeline_stage _tnl_arb_vertex_program_stage
=
1456 NULL
, /* private_data */
1457 init_vertex_program
, /* create */
1459 validate_vertex_program
, /* validate */
1460 run_arb_vertex_program
/* run */