dll/opengl/mesa/asm-386.S

   1 /* $Id: asm-386.S,v 1.8 1997/12/17 00:50:51 brianp Exp $ */
   2
   3 /*
   4  * asm-386.S - special (hopefully faster) transformation functions for x86
   5  *
   6  * by Josh Vanderhoof
   7  *
   8  * This file is in the public domain.
   9  */
  10
  11 /*
  12  * $Log: asm-386.S,v $
  13  * Revision 1.8  1997/12/17 00:50:51  brianp
  14  * applied Josh's patch to fix texture coordinate transformation bugs
  15  *
  16  * Revision 1.7  1997/12/17 00:27:11  brianp
  17  * applied Josh's patch to fix bfris
  18  *
  19  * Revision 1.6  1997/12/01 01:02:41  brianp
  20  * added FreeBSD patches (Daniel J. O'Connor)
  21  *
  22  * Revision 1.5  1997/11/19 23:52:17  brianp
  23  * added missing "cld" instruction in asm_transform_points4_identity()
  24  *
  25  * Revision 1.4  1997/11/11 02:22:41  brianp
  26  * small change per Josh to ensure U/V pairing
  27  *
  28  * Revision 1.3  1997/11/07 03:37:24  brianp
  29  * added missing line from Stephane Rehel
  30  *
  31  * Revision 1.2  1997/11/07 03:30:37  brianp
  32  * added Josh's 11-5-97 patches
  33  *
  34  * Revision 1.1  1997/10/30 06:00:33  brianp
  35  * Initial revision
  36  */
  37
  38 #include <asm.inc>
  39
  40 #define S(x)    dword ptr [esi + 4*x]
  41 #define D(x)    dword ptr [edi + 4*x]
  42 #define M(x, y) dword ptr [edx + 16*x + 4*y]
  43
  44 .code
  45
  46 /*
  47  * void asm_transform_points3_general( GLuint n, GLfloat d[][4],
  48  *                                     GLfloat m[16], GLfloat s[][4] );
  49  */
  50 PUBLIC _asm_transform_points3_general
  51 _asm_transform_points3_general:
  52 .align 4
  53         push esi
  54         push edi
  55
  56         mov ecx, [esp + 12]         /* ecx = n */
  57         mov edi, [esp + 16]     /* edi = d */
  58         mov edx, [esp + 20]     /* edx = m */
  59         mov esi, [esp + 24]     /* esi = s */
  60
  61         test ecx, ecx
  62         jz _asm_transform_points3_general_end
  63
  64 .align 4
  65 _asm_transform_points3_general_loop:
  66         fld S(0)
  67         fmul M(0, 0)
  68         fld S(0)
  69         fmul M(0, 1)
  70         fld S(0)
  71         fmul M(0, 2)
  72         fld S(0)
  73         fmul M(0, 3)
  74
  75         fld S(1)
  76         fmul M(1, 0)
  77         fld S(1)
  78         fmul M(1, 1)
  79         fld S(1)
  80         fmul M(1, 2)
  81         fld S(1)
  82         fmul M(1, 3)
  83
  84         /*
  85          * The FPU stack should now look like this:
  86          *
  87          * st(7) = S(0) * M(0, 0)
  88          * st(6) = S(0) * M(0, 1)
  89          * st(5) = S(0) * M(0, 2)
  90          * st(4) = S(0) * M(0, 3)
  91          * st(3) = S(1) * M(1, 0)
  92          * st(2) = S(1) * M(1, 1)
  93          * st(1) = S(1) * M(1, 2)
  94          * st(0) = S(1) * M(1, 3)
  95          */
  96
  97         fxch st(3)              /* 3 1 2 0 4 5 6 7 */
  98         faddp st(7), st         /* 1 2 0 4 5 6 7 */
  99         fxch st(1)              /* 2 1 0 4 5 6 7 */
 100         faddp st(5), st         /* 1 0 4 5 6 7 */
 101         faddp st(3), st         /* 0 4 5 6 7 */
 102         faddp st(1), st         /* 4 5 6 7 */
 103
 104         /*
 105          * st(3) = S(0) * M(0, 0) + S(1) * M(1, 0)
 106          * st(2) = S(0) * M(0, 1) + S(1) * M(1, 1)
 107          * st(1) = S(0) * M(0, 2) + S(1) * M(1, 2)
 108          * st(0) = S(0) * M(0, 3) + S(1) * M(1, 3)
 109          */
 110
 111         fld S(2)
 112         fmul M(2, 0)
 113         fld S(2)
 114         fmul M(2, 1)
 115         fld S(2)
 116         fmul M(2, 2)
 117         fld S(2)
 118         fmul M(2, 3)
 119
 120         /*
 121          * st(7) = S(0) * M(0, 0) + S(1) * M(1, 0)
 122          * st(6) = S(0) * M(0, 1) + S(1) * M(1, 1)
 123          * st(5) = S(0) * M(0, 2) + S(1) * M(1, 2)
 124          * st(4) = S(0) * M(0, 3) + S(1) * M(1, 3)
 125          * st(3) = S(2) * M(2, 0)
 126          * st(2) = S(2) * M(2, 1)
 127          * st(1) = S(2) * M(2, 2)
 128          * st(0) = S(2) * M(2, 3)
 129          */
 130
 131         fxch st(3)          /* 3 1 2 0 4 5 6 7 */
 132         faddp st(7), st         /* 1 2 0 4 5 6 7 */
 133         fxch st(1)          /* 2 1 0 4 5 6 7 */
 134         faddp st(5), st     /* 1 0 4 5 6 7 */
 135         faddp st(3), st     /* 0 4 5 6 7 */
 136         faddp st(1), st     /* 4 5 6 7 */
 137
 138         /*
 139          * st(3) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0)
 140          * st(2) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1)
 141          * st(1) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2)
 142          * st(0) = S(0) * M(0, 3) + S(1) * M(1, 3) + S(2) * M(2, 3)
 143          */
 144
 145         fxch st(3)      /* 3 1 2 0 */
 146         fadd M(3, 0)
 147         fxch st(2)      /* 2 1 3 0 */
 148         fadd M(3, 1)
 149         fxch st(1)      /* 1 2 3 0 */
 150         fadd M(3, 2)
 151         fxch st(3)      /* 0 2 3 1 */
 152         fadd M(3, 3)
 153
 154         /*
 155          * st(3) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2) + M(3, 2)
 156          * st(2) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0) + M(3, 0)
 157          * st(1) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1) + M(3, 1)
 158          * st(0) = S(0) * M(0, 3) + S(1) * M(1, 3) + S(2) * M(2, 3) + M(3, 3)
 159          */
 160
 161         fxch st(3)      /* 3 1 2 0 */
 162         fstp D(2)       /* 1 2 0 */
 163         fxch st(1)      /* 2 1 0 */
 164         fstp D(0)       /* 1 0 */
 165         lea esi, S(4)
 166         fstp D(1)       /* 0 */
 167         dec ecx
 168         fstp D(3)       /* */
 169
 170         lea edi, D(4)
 171
 172         jnz _asm_transform_points3_general_loop
 173
 174 _asm_transform_points3_general_end:
 175         pop edi
 176         pop esi
 177         ret
 178
 179
 180 /*
 181  * void asm_transform_points3_identity( GLuint n, GLfloat d[][4],
 182  *                                      GLfloat s[][4] );
 183  */
 184 PUBLIC _asm_transform_points3_identity
 185 _asm_transform_points3_identity:
 186 .align 4
 187         push esi
 188         push edi
 189         mov ecx, [esp + 12]     /* ecx = n */
 190         mov edi, [esp + 16]     /* edi = d */
 191         mov esi, [esp + 20]     /* esi = s */
 192         push ebx
 193         push ebp
 194
 195         test ecx, ecx
 196         jz _asm_transform_points3_identity_end
 197
 198         mov ebp, HEX(3f800000)
 199
 200 .align 4
 201 _asm_transform_points3_identity_loop:
 202         mov eax, S(0)
 203         mov edx, S(1)
 204         mov ebx, S(2)
 205         lea esi, S(4)
 206         mov D(0), eax
 207         mov D(1), edx
 208         mov D(2), ebx
 209         mov D(3), ebp
 210         dec ecx
 211         lea edi, D(4)
 212         jnz _asm_transform_points3_identity_loop
 213
 214 _asm_transform_points3_identity_end:
 215         pop ebp
 216         pop ebx
 217         pop edi
 218         pop esi
 219         ret
 220
 221
 222 /*
 223  * void asm_transform_points3_2d( GLuint n, GLfloat d[][4], GLfloat m[16],
 224  *                                GLfloat s[][4] );
 225  */
 226 PUBLIC _asm_transform_points3_2d
 227 _asm_transform_points3_2d:
 228 .align 4
 229         push esi
 230         push edi
 231         mov ecx, [esp + 12]     /* ecx = n */
 232         mov edi, [esp + 16]     /* edi = d */
 233         mov edx, [esp + 20]     /* edx = m */
 234         mov esi, [esp + 24]     /* esi = s */
 235         push ebp
 236
 237         mov ebp, HEX(3f800000)
 238
 239         test cl, DEC(1)
 240         jz _asm_transform_points3_2d_step
 241
 242         dec ecx
 243
 244         fld S(0)
 245         fmul M(0, 0)
 246         fld S(0)
 247         fmul M(0, 1)
 248         fld S(1)
 249         fmul M(1, 0)
 250         fld S(1)
 251         fmul M(1, 1)
 252
 253         /*
 254          * st(3) = S(0) * M(0, 0)
 255          * st(2) = S(0) * M(0, 1)
 256          * st(1) = S(1) * M(1, 0)
 257          * st(0) = S(1) * M(1, 1)
 258          */
 259
 260         fxch st(1)      /* 1 0 2 3 */
 261         fadd M(3, 0)
 262         fxch st(1)      /* 0 1 2 3 */
 263         fadd M(3, 1)
 264         fxch st(1)      /* 1 0 2 3 */
 265         faddp st(3), st         /* 0 2 3 */
 266         faddp st(1), st         /* 2 3 */
 267         fstp D(1)       /* 3 */
 268         fstp D(0)       /* */
 269         mov eax, S(2)
 270         lea esi, S(4)
 271         mov D(3), ebp
 272         mov D(2), eax
 273         lea edi, D(4)
 274
 275 _asm_transform_points3_2d_step:
 276         test ecx, ecx
 277         jz _asm_transform_points3_2d_end
 278
 279 .align 4
 280 _asm_transform_points3_2d_loop:
 281         fld S(0)
 282         fmul M(0, 0)
 283         fld S(0)
 284         fmul M(0, 1)
 285         fld S(4)
 286         fmul M(0, 0)
 287         fld S(4)
 288         fmul M(0, 1)
 289         fld S(1)
 290         fmul M(1, 0)
 291         fld S(1)
 292         fmul M(1, 1)
 293         fld S(5)
 294         fmul M(1, 0)
 295         fld S(5)
 296         fmul M(1, 1)
 297
 298         /*
 299          * st(7) = S(0) * M(0, 0)
 300          * st(6) = S(0) * M(0, 1)
 301          * st(5) = S(4) * M(0, 0)
 302          * st(4) = S(4) * M(0, 1)
 303          * st(3) = S(1) * M(1, 0)
 304          * st(2) = S(1) * M(1, 1)
 305          * st(1) = S(5) * M(1, 0)
 306          * st(0) = S(5) * M(1, 1)
 307          */
 308
 309         fxch st(7)      /* 7 1 2 3 4 5 6 0 */
 310         fadd M(3, 0)
 311         fxch st(6)      /* 6 1 2 3 4 5 7 0 */
 312         fadd M(3, 1)
 313         fxch st(5)      /* 5 1 2 3 4 6 7 0 */
 314         fadd M(3, 0)
 315         fxch st(4)      /* 4 1 2 3 5 6 7 0 */
 316         fadd M(3, 1)
 317
 318         mov eax, S(2)
 319         mov D(3), ebp
 320         mov D(2), eax
 321         mov eax, S(6)
 322         mov D(7), ebp
 323         mov D(6), eax
 324         lea esi, S(8)
 325         sub ecx, DEC(2)
 326
 327         /*
 328          * st(7) = S(5) * M(1, 1)
 329          * st(6) = S(0) * M(0, 0) + M(3, 0)
 330          * st(5) = S(0) * M(0, 1) + M(3, 1)
 331          * st(4) = S(4) * M(0, 0) + M(3, 0)
 332          * st(3) = S(1) * M(1, 0)
 333          * st(2) = S(1) * M(1, 1)
 334          * st(1) = S(5) * M(1, 0)
 335          * st(0) = S(4) * M(0, 1) + M(3, 1)
 336          */
 337
 338         faddp st(7), st         /* 1 2 3 4 5 6 7 */
 339         faddp st(3), st         /* 2 3 4 5 6 7 */
 340         faddp st(3), st         /* 3 4 5 6 7 */
 341         faddp st(3), st         /* 4 5 6 7 */
 342         fxch st(3)      /* 7 5 6 4 */
 343         fstp D(5)       /* 5 6 4 */
 344         fstp D(1)       /* 6 4 */
 345         fstp D(0)       /* 4 */
 346         fstp D(4)       /* */
 347
 348         lea edi, D(8)
 349         jnz _asm_transform_points3_2d_loop
 350
 351 _asm_transform_points3_2d_end:
 352         pop ebp
 353         pop edi
 354         pop esi
 355         ret
 356
 357
 358 /*
 359  * void asm_transform_points3_2d_no_rot( GLuint n, GLfloat d[][4],
 360  *                                       GLfloat m[16], GLfloat s[][4] );
 361  *
 362  */
 363 PUBLIC _asm_transform_points3_2d_no_rot
 364 _asm_transform_points3_2d_no_rot:
 365 .align 4
 366         push esi
 367         push edi
 368         mov ecx, [esp + 12]     /* ecx = n */
 369         mov edi, [esp + 16]     /* edi = d */
 370         mov edx, [esp + 20]     /* edx = m */
 371         mov esi, [esp + 24]     /* esi = s */
 372         push ebp
 373
 374         test ecx, ecx
 375         jz _asm_transform_points3_2d_no_rot_end
 376
 377         mov ebp, HEX(3f800000)
 378
 379 .align 4
 380 _asm_transform_points3_2d_no_rot_loop:
 381         fld S(0)
 382         fmul M(0, 0)
 383         fld S(1)
 384         fmul M(1, 1)
 385         fxch st(1)
 386         fadd M(3, 0)
 387         fxch st(1)
 388         fadd M(3, 1)
 389         fxch st(1)
 390         fstp D(0)
 391         fstp D(1)
 392
 393         mov eax, S(2)   /* cycle 1: U pipe */
 394         mov D(3), ebp   /*          V pipe */
 395         mov D(2), eax   /* cycle 2: U pipe */
 396
 397         dec ecx
 398         lea esi, S(4)
 399         lea edi, D(4)
 400         jnz _asm_transform_points3_2d_no_rot_loop
 401
 402 _asm_transform_points3_2d_no_rot_end:
 403         pop ebp
 404         pop edi
 405         pop esi
 406         ret
 407
 408
 409
 410 /*
 411  * void asm_transform_points3_3d( GLuint n, GLfloat d[][4], GLfloat m[16],
 412  *                                GLfloat s[][4] );
 413  */
 414 PUBLIC _asm_transform_points3_3d
 415 _asm_transform_points3_3d:
 416 .align 4
 417         push esi
 418         push edi
 419         mov ecx, [esp + 12]     /* ecx = n */
 420         mov edi, [esp + 16]     /* edi = d */
 421         mov edx, [esp + 20]     /* edx = m */
 422         mov esi, [esp + 24]     /* esi = s */
 423
 424         test ecx, ecx
 425         jz _asm_transform_points3_3d_end
 426
 427         mov eax, HEX(3f800000)
 428
 429 .align 4
 430 _asm_transform_points3_3d_loop:
 431         fld S(0)
 432         fmul M(0, 0)
 433         fld S(0)
 434         fmul M(0, 1)
 435         fld S(0)
 436         fmul M(0, 2)
 437
 438         fld S(1)
 439         fmul M(1, 0)
 440         fld S(1)
 441         fmul M(1, 1)
 442         fld S(1)
 443         fmul M(1, 2)
 444
 445         /*
 446          * st(5) = S(0) * M(0, 0)
 447          * st(4) = S(0) * M(0, 1)
 448          * st(3) = S(0) * M(0, 2)
 449          * st(2) = S(1) * M(1, 0)
 450          * st(1) = S(1) * M(1, 1)
 451          * st(0) = S(1) * M(1, 2)
 452          */
 453
 454         fxch st(2)              /* 2 1 0 3 4 5 */
 455         faddp st(5), st /* 1 0 3 4 5 */
 456         faddp st(3), st /* 0 3 4 5 */
 457         faddp st(1), st /* 3 4 5 */
 458
 459         /*
 460          * st(2) = S(0) * M(0, 0) + S(1) * M(1, 0)
 461          * st(1) = S(0) * M(0, 1) + S(1) * M(1, 1)
 462          * st(0) = S(0) * M(0, 2) + S(1) * M(1, 2)
 463          */
 464
 465         fld S(2)
 466         fmul M(2, 0)
 467         fld S(2)
 468         fmul M(2, 1)
 469         fld S(2)
 470         fmul M(2, 2)
 471
 472         /*
 473          * st(5) = S(0) * M(0, 0) + S(1) * M(1, 0)
 474          * st(4) = S(0) * M(0, 1) + S(1) * M(1, 1)
 475          * st(3) = S(0) * M(0, 2) + S(1) * M(1, 2)
 476          * st(2) = S(2) * M(2, 0)
 477          * st(1) = S(2) * M(2, 1)
 478          * st(0) = S(2) * M(2, 2)
 479          */
 480
 481         fxch st(2)              /* 2 1 0 3 4 5 */
 482         faddp st(5), st /* 1 0 3 4 5 */
 483         faddp st(3), st /* 0 3 4 5 */
 484         faddp st(1), st /* 3 4 5 */
 485
 486         /*
 487          * st(2) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0)
 488          * st(1) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1)
 489          * st(0) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2)
 490          */
 491
 492         fxch st(2)      /* 2 1 0 */
 493         fadd M(3, 0)
 494         fxch st(1)      /* 1 2 0 */
 495         fadd M(3, 1)
 496         fxch st(2)      /* 0 2 1 */
 497         fadd M(3, 2)
 498
 499         fxch st(1)      /* 2 0 1 */
 500         fstp D(0)       /* 0 1 */
 501         fstp D(2)       /* 1 */
 502         fstp D(1)       /* */
 503         mov D(3), eax
 504
 505         lea esi, S(4)
 506         dec ecx
 507
 508         lea edi, D(4)
 509
 510         jnz _asm_transform_points3_3d_loop
 511
 512 _asm_transform_points3_3d_end:
 513         pop edi
 514         pop esi
 515         ret
 516
 517
 518
 519 /*
 520  * void asm_transform_points4_general( GLuint n, GLfloat d[][4],
 521  *                                     GLfloat m[16], GLfloat s[][4] );
 522  */
 523 PUBLIC _asm_transform_points4_general
 524 _asm_transform_points4_general:
 525 .align 4
 526         push esi
 527         push edi
 528         mov ecx, [esp + 12]     /* ecx = n */
 529         mov edi, [esp + 16]     /* edi = d */
 530         mov edx, [esp + 20]     /* edx = m */
 531         mov esi, [esp + 24]     /* esi = s */
 532
 533         test ecx, ecx
 534         jz _asm_transform_points4_general_end
 535
 536 .align 4
 537 _asm_transform_points4_general_loop:
 538         fld S(0)
 539         fmul M(0, 0)
 540         fld S(0)
 541         fmul M(0, 1)
 542         fld S(0)
 543         fmul M(0, 2)
 544         fld S(0)
 545         fmul M(0, 3)
 546
 547         fld S(1)
 548         fmul M(1, 0)
 549         fld S(1)
 550         fmul M(1, 1)
 551         fld S(1)
 552         fmul M(1, 2)
 553         fld S(1)
 554         fmul M(1, 3)
 555
 556         /*
 557          * st(7) = S(0) * M(0, 0)
 558          * st(6) = S(0) * M(0, 1)
 559          * st(5) = S(0) * M(0, 2)
 560          * st(4) = S(0) * M(0, 3)
 561          * st(3) = S(1) * M(1, 0)
 562          * st(2) = S(1) * M(1, 1)
 563          * st(1) = S(1) * M(1, 2)
 564          * st(0) = S(1) * M(1, 3)
 565          */
 566
 567         fxch st(3)              /* 3 1 2 0 4 5 6 7 */
 568         faddp st(7), st /* 1 2 0 4 5 6 7 */
 569         fxch st(1)              /* 2 1 0 4 5 6 7 */
 570         faddp st(5), st /* 1 0 4 5 6 7 */
 571         faddp st(3), st /* 0 4 5 6 7 */
 572         faddp st(1), st /* 4 5 6 7 */
 573
 574         /*
 575          * st(3) = S(0) * M(0, 0) + S(1) * M(1, 0)
 576          * st(2) = S(0) * M(0, 1) + S(1) * M(1, 1)
 577          * st(1) = S(0) * M(0, 2) + S(1) * M(1, 2)
 578          * st(0) = S(0) * M(0, 3) + S(1) * M(1, 3)
 579          */
 580
 581         fld S(2)
 582         fmul M(2, 0)
 583         fld S(2)
 584         fmul M(2, 1)
 585         fld S(2)
 586         fmul M(2, 2)
 587         fld S(2)
 588         fmul M(2, 3)
 589
 590         /*
 591          * st(7) = S(0) * M(0, 0) + S(1) * M(1, 0)
 592          * st(6) = S(0) * M(0, 1) + S(1) * M(1, 1)
 593          * st(5) = S(0) * M(0, 2) + S(1) * M(1, 2)
 594          * st(4) = S(0) * M(0, 3) + S(1) * M(1, 3)
 595          * st(3) = S(2) * M(2, 0)
 596          * st(2) = S(2) * M(2, 1)
 597          * st(1) = S(2) * M(2, 2)
 598          * st(0) = S(2) * M(2, 3)
 599          */
 600
 601         fxch st(3)              /* 3 1 2 0 4 5 6 7 */
 602         faddp st(7), st /* 1 2 0 4 5 6 7 */
 603         fxch st(1)              /* 2 1 0 4 5 6 7 */
 604         faddp st(5), st /* 1 0 4 5 6 7 */
 605         faddp st(3), st /* 0 4 5 6 7 */
 606         faddp st(1), st /* 4 5 6 7 */
 607
 608         /*
 609          * st(3) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0)
 610          * st(2) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1)
 611          * st(1) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2)
 612          * st(0) = S(0) * M(0, 3) + S(1) * M(1, 3) + S(2) * M(2, 3)
 613          */
 614
 615         fld S(3)
 616         fmul M(3, 0)
 617         fld S(3)
 618         fmul M(3, 1)
 619         fld S(3)
 620         fmul M(3, 2)
 621         fld S(3)
 622         fmul M(3, 3)
 623
 624         /*
 625          * st(7) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0)
 626          * st(6) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1)
 627          * st(5) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2)
 628          * st(4) = S(0) * M(0, 3) + S(1) * M(1, 3) + S(2) * M(2, 3)
 629          * st(3) = S(3) * M(3, 0)
 630          * st(2) = S(3) * M(3, 1)
 631          * st(1) = S(3) * M(3, 2)
 632          * st(0) = S(3) * M(3, 3)
 633          */
 634
 635         fxch st(3)              /* 3 1 2 0 4 5 6 7 */
 636         faddp st(7), st /* 1 2 0 4 5 6 7 */
 637         fxch st(1)              /* 2 1 0 4 5 6 7 */
 638         faddp st(5), st /* 1 0 4 5 6 7 */
 639         faddp st(3), st /* 0 4 5 6 7 */
 640
 641         lea esi, S(4)
 642         dec ecx
 643
 644         faddp st(1), st         /* 4 5 6 7 */
 645
 646         /*
 647          * st(3) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0) + S(3) * M(3, 0)
 648          * st(2) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1) + S(3) * M(3, 1)
 649          * st(1) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2) + S(3) * M(3, 2)
 650          * st(0) = S(0) * M(0, 3) + S(1) * M(1, 3) + S(2) * M(2, 3) + S(3) * M(3, 3)
 651          */
 652
 653         fxch st(3)      /* 3 1 2 0 */
 654         fstp D(0)       /* 1 2 0 */
 655         fxch st(1)      /* 2 1 0 */
 656         fstp D(1)       /* 1 0 */
 657         fstp D(2)       /* 0 */
 658         fstp D(3)       /* */
 659
 660         lea edi, D(4)
 661
 662         jnz _asm_transform_points4_general_loop
 663
 664 _asm_transform_points4_general_end:
 665         pop edi
 666         pop esi
 667         ret
 668
 669
 670
 671 /*
 672  * void asm_transform_points4_identity( GLuint n, GLfloat d[][4],
 673  *                                      GLfloat s[][4] );
 674  */
 675 PUBLIC _asm_transform_points4_identity
 676 _asm_transform_points4_identity:
 677 .align 4
 678         push esi
 679         push edi
 680         mov ecx, [esp + 12]     /* ecx = n */
 681         mov edi, [esp + 16]     /* edi = d */
 682         mov esi, [esp + 20]     /* esi = s */
 683
 684         lea ecx,  [ecx * 4]
 685
 686         cld
 687         rep movsd
 688
 689         pop edi
 690         pop esi
 691         ret
 692
 693
 694
 695 /*
 696  * void asm_transform_points4_2d( GLuint n, GLfloat d[][4], GLfloat m[16],
 697  *                                GLfloat s[][4] );
 698  */
 699 PUBLIC _asm_transform_points4_2d
 700 _asm_transform_points4_2d:
 701 .align 4
 702         push esi
 703         push edi
 704         mov ecx, [esp + 12]     /* ecx = n */
 705         mov edi, [esp + 16]     /* edi = d */
 706         mov edx, [esp + 20]     /* edx = m */
 707         mov esi, [esp + 24]     /* esi = s */
 708
 709         test ecx, ecx
 710         jz _asm_transform_points4_2d_end
 711
 712         push ebx
 713
 714 .align 4
 715 _asm_transform_points4_2d_loop:
 716         fld S(0)
 717         fmul M(0, 0)
 718         fld S(0)
 719         fmul M(0, 1)
 720         fld S(1)
 721         fmul M(1, 0)
 722         fld S(1)
 723         fmul M(1, 1)
 724         fld S(3)
 725         fmul M(3, 0)
 726         fld S(3)
 727         fmul M(3, 1)
 728
 729         /*
 730          * st(5) = S(0) * M(0, 0)
 731          * st(4) = S(0) * M(0, 1)
 732          * st(3) = S(1) * M(1, 0)
 733          * st(2) = S(1) * M(1, 1)
 734          * st(1) = S(3) * M(3, 0)
 735          * st(0) = S(3) * M(3, 1)
 736          */
 737
 738         mov eax, S(2)
 739         mov ebx, S(3)
 740         lea esi, S(4)
 741         dec ecx
 742         mov D(2), eax
 743         mov D(3), ebx
 744         faddp st(4), st
 745         faddp st(4), st
 746         faddp st(2), st
 747         faddp st(2), st
 748         fstp D(1)
 749         fstp D(0)
 750         lea edi, D(4)
 751         jnz _asm_transform_points4_2d_loop
 752
 753         pop ebx
 754
 755 _asm_transform_points4_2d_end:
 756         pop edi
 757         pop esi
 758         ret
 759
 760
 761
 762 /*
 763  * void asm_transform_points4_2d_no_rot( GLuint n, GLfloat d[][4],
 764  *                                       GLfloat m[16], GLfloat s[][4] );
 765  */
 766 PUBLIC _asm_transform_points4_2d_no_rot
 767 _asm_transform_points4_2d_no_rot:
 768 .align 4
 769         push esi
 770         push edi
 771         mov ecx, [esp + 12]     /* ecx = n */
 772         mov edi, [esp + 16]     /* edi = d */
 773         mov edx, [esp + 20]     /* edx = m */
 774         mov esi, [esp + 24]     /* esi = s */
 775
 776         test ecx, ecx
 777         jz _asm_transform_points4_2d_no_rot_end
 778         push ebx
 779
 780 .align 4
 781 _asm_transform_points4_2d_no_rot_loop:
 782         fld S(0)
 783         fmul M(0, 0)
 784         fld S(1)
 785         fmul M(1, 1)
 786         fld S(3)
 787         fmul M(3, 0)
 788         fld S(3)
 789         fmul M(3, 1)
 790         mov eax, S(2)
 791         mov ebx, S(3)
 792         lea esi, S(4)
 793         dec ecx
 794         mov D(2), eax
 795         mov D(3), ebx
 796         faddp st(2), st
 797         faddp st(2), st
 798         fstp D(1)
 799         fstp D(0)
 800         lea edi, D(4)
 801         jnz _asm_transform_points4_2d_no_rot_loop
 802
 803         pop ebx
 804
 805 _asm_transform_points4_2d_no_rot_end:
 806         pop edi
 807         pop esi
 808         ret
 809
 810
 811
 812 /*
 813  * void asm_transform_points4_3d( GLuint n, GLfloat d[][4], GLfloat m[16],
 814  *                                GLfloat s[][4] );
 815  */
 816 PUBLIC _asm_transform_points4_3d
 817 _asm_transform_points4_3d:
 818 .align 4
 819         push esi
 820         push edi
 821         mov ecx, [esp + 12]     /* ecx = n */
 822         mov edi, [esp + 16]     /* edi = d */
 823         mov edx, [esp + 20]     /* edx = m */
 824         mov esi, [esp + 24]     /* esi = s */
 825
 826         test ecx, ecx
 827         jz _asm_transform_points4_3d_end
 828
 829 .align 4
 830 _asm_transform_points4_3d_loop:
 831         fld S(3)
 832
 833         fld S(0)
 834         fmul M(0, 0)
 835         fld S(0)
 836         fmul M(0, 1)
 837         fld S(0)
 838         fmul M(0, 2)
 839
 840         fld S(1)
 841         fmul M(1, 0)
 842         fld S(1)
 843         fmul M(1, 1)
 844         fld S(1)
 845         fmul M(1, 2)
 846
 847         /*
 848          * st(5) = S(0) * M(0, 0)
 849          * st(4) = S(0) * M(0, 1)
 850          * st(3) = S(0) * M(0, 2)
 851          * st(2) = S(1) * M(1, 0)
 852          * st(1) = S(1) * M(1, 1)
 853          * st(0) = S(1) * M(1, 2)
 854          */
 855
 856         fxch st(2)              /* 2 1 0 3 4 5 */
 857         faddp st(5), st         /* 1 0 3 4 5 */
 858         faddp st(3), st         /* 0 3 4 5 */
 859         faddp st(1), st         /* 3 4 5 */
 860
 861         /*
 862          * st(2) = S(0) * M(0, 0) + S(1) * M(1, 0)
 863          * st(1) = S(0) * M(0, 1) + S(1) * M(1, 1)
 864          * st(0) = S(0) * M(0, 2) + S(1) * M(1, 2)
 865          */
 866
 867         fld S(2)
 868         fmul M(2, 0)
 869         fld S(2)
 870         fmul M(2, 1)
 871         fld S(2)
 872         fmul M(2, 2)
 873
 874         /*
 875          * st(5) = S(0) * M(0, 0) + S(1) * M(1, 0)
 876          * st(4) = S(0) * M(0, 1) + S(1) * M(1, 1)
 877          * st(3) = S(0) * M(0, 2) + S(1) * M(1, 2)
 878          * st(2) = S(2) * M(2, 0)
 879          * st(1) = S(2) * M(2, 1)
 880          * st(0) = S(2) * M(2, 2)
 881          */
 882
 883         fxch st(2)              /* 2 1 0 3 4 5 */
 884         faddp st(5), st         /* 1 0 3 4 5 */
 885         faddp st(3), st         /* 0 3 4 5 */
 886         faddp st(1), st         /* 3 4 5 */
 887
 888         /*
 889          * st(2) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0)
 890          * st(1) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1)
 891          * st(0) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2)
 892          */
 893
 894         fld S(3)
 895         fmul M(3, 0)
 896         fld S(3)
 897         fmul M(3, 1)
 898         fld S(3)
 899         fmul M(3, 2)
 900
 901         /*
 902          * st(5) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0)
 903          * st(4) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1)
 904          * st(3) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2)
 905          * st(2) = S(3) * M(3, 0)
 906          * st(1) = S(3) * M(3, 1)
 907          * st(0) = S(3) * M(3, 2)
 908          */
 909
 910         fxch st(2)      /* 2 1 0 3 4 5 */
 911         faddp st(5), st /* 1 0 3 4 5 */
 912         faddp st(3), st /* 0 3 4 5 */
 913
 914         lea esi, S(4)
 915         dec ecx
 916
 917         faddp st(1), st         /* 3 4 5 */
 918
 919         /*
 920          * st(2) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0) + S(3) * M(3, 0)
 921          * st(1) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1) + S(3) * M(3, 1)
 922          * st(0) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2) + S(3) * M(3, 2)
 923          */
 924
 925         fxch st(2)      /* 2 1 0 */
 926         fstp D(0)       /* 1 0 */
 927         fstp D(1)       /* 0 */
 928         fstp D(2)       /* */
 929         fstp D(3)
 930
 931         lea edi, D(4)
 932
 933         jnz _asm_transform_points4_3d_loop
 934
 935 _asm_transform_points4_3d_end:
 936         pop edi
 937         pop esi
 938         ret
 939
 940 /*
 941  * void asm_transform_points4_ortho( GLuint n, GLfloat d[][4],
 942  *                                   GLfloat m[16], GLfloat s[][4] );
 943  */
 944 PUBLIC _asm_transform_points4_ortho
 945 _asm_transform_points4_ortho:
 946 .align 4
 947         push esi
 948         push edi
 949         mov ecx, [esp + 12]     /* ecx = n */
 950         mov edi, [esp + 16]     /* edi = d */
 951         mov edx, [esp + 20]     /* edx = m */
 952         mov esi, [esp + 24]     /* esi = s */
 953
 954         test ecx, ecx
 955         jz _asm_transform_points4_ortho_end
 956
 957 .align 4
 958 _asm_transform_points4_ortho_loop:
 959         fld S(0)
 960         fmul M(0, 0)
 961         fld S(1)
 962         fmul M(1, 1)
 963         fld S(2)
 964         fmul M(2, 2)
 965
 966         fld S(3)
 967         fmul M(3, 0)
 968         fld S(3)
 969         fmul M(3, 1)
 970         fld S(3)
 971         fmul M(3, 2)
 972
 973         mov eax, S(3)
 974         lea esi, S(4)
 975         dec ecx
 976         mov D(3), eax
 977
 978         faddp st(3), st
 979         faddp st(3), st
 980         faddp st(3), st
 981
 982         fstp D(2)
 983         fstp D(1)
 984         fstp D(0)
 985
 986         lea edi, D(4)
 987         jnz _asm_transform_points4_ortho_loop
 988
 989 _asm_transform_points4_ortho_end:
 990         pop edi
 991         pop esi
 992         ret
 993
 994 /*
 995  * void asm_transform_points4_perspective( GLuint n, GLfloat d[][4],
 996  *                                         GLfloat m[16], GLfloat s[][4] );
 997  */
 998 PUBLIC _asm_transform_points4_perspective
 999 _asm_transform_points4_perspective:
1000 .align 4
1001         push esi
1002         push edi
1003         mov ecx, [esp + 12]     /* ecx = n */
1004         mov edi, [esp + 16]     /* edi = d */
1005         mov edx, [esp + 20]     /* edx = m */
1006         mov esi, [esp + 24]     /* esi = s */
1007
1008         test ecx, ecx
1009         jz _asm_transform_points4_perspective_end
1010
1011 .align 4
1012 _asm_transform_points4_perspective_loop:
1013         fld S(0)
1014         fmul M(0, 0)
1015         fld S(1)
1016         fmul M(1, 1)
1017         fld S(2)
1018         fmul M(2, 2)
1019
1020         fld S(2)
1021         fmul M(2, 0)
1022         fld S(2)
1023         fmul M(2, 1)
1024         fld S(3)
1025         fmul M(3, 2)
1026
1027         mov eax, S(2)
1028         lea esi, S(4)
1029         xor eax, HEX(80000000)
1030         dec ecx
1031
1032         faddp st(3), st
1033         faddp st(3), st
1034         faddp st(3), st
1035
1036         fstp D(2)
1037         fstp D(1)
1038         fstp D(0)
1039
1040         mov D(3), eax
1041         lea edi, D(4)
1042         jnz _asm_transform_points4_perspective_loop
1043
1044 _asm_transform_points4_perspective_end:
1045         pop edi
1046         pop esi
1047         ret
1048
1049
1050
1051 /*
1052  * Table for clip test.
      *
      * 128 one-byte entries of clip flags.  The 7-bit index is assembled
      * by cliptest from the four components of a point, one bit at a time
      * (most significant bit first), with the following meaning:
1053  *
1054  *      bit6 = S(3) < 0
1055  *      bit5 = S(2) < 0
1056  *      bit4 = abs(S(2)) > abs(S(3))
1057  *      bit3 = S(1) < 0
1058  *      bit2 = abs(S(1)) > abs(S(3))
1059  *      bit1 = S(0) < 0
1060  *      bit0 = abs(S(0)) > abs(S(3))
1061  */
1062
1063 /* Vertex buffer clipping flags (from vb.h) */
1064 #if 0   /* disabled: macro form that would generate the table; the #else branch is what gets assembled */
1065
1066 #define CLIP_RIGHT_BIT   0x01
1067 #define CLIP_LEFT_BIT    0x02
1068 #define CLIP_TOP_BIT     0x04
1069 #define CLIP_BOTTOM_BIT  0x08
1070 #define CLIP_NEAR_BIT    0x10
1071 #define CLIP_FAR_BIT     0x20
1072 #define CLIP_USER_BIT    0x40
1073 #define CLIP_ALL_BITS    0x3f
1074
     /* Each macro below yields all-ones when its index bit is set, otherwise zero. */
1075 #define MAGN_X(i)       (~(((i) & 1) - 1))
1076 #define SIGN_X(i)       (~((((i) >> 1) & 1) - 1))
1077 #define MAGN_Y(i)       (~((((i) >> 2) & 1) - 1))
1078 #define SIGN_Y(i)       (~((((i) >> 3) & 1) - 1))
1079 #define MAGN_Z(i)       (~((((i) >> 4) & 1) - 1))
1080 #define SIGN_Z(i)       (~((((i) >> 5) & 1) - 1))
1081 #define SIGN_W(i)       (~((((i) >> 6) & 1) - 1))
1082
1083 #define CLIP_VALUE(i)                                           \
1084          (CLIP_RIGHT_BIT                                        \
1085           & ((~SIGN_X(i) & SIGN_W(i))                           \
1086              | (~SIGN_X(i) & ~SIGN_W(i) & MAGN_X(i))            \
1087              | (SIGN_X(i) & SIGN_W(i) & ~MAGN_X(i))))           \
1088          | (CLIP_LEFT_BIT                                       \
1089             & ((SIGN_X(i) & SIGN_W(i))                          \
1090                | (~SIGN_X(i) & SIGN_W(i) & ~MAGN_X(i))          \
1091                | (SIGN_X(i) & ~SIGN_W(i) & MAGN_X(i))))         \
1092          | (CLIP_TOP_BIT                                        \
1093             & ((~SIGN_Y(i) & SIGN_W(i))                         \
1094                | (~SIGN_Y(i) & ~SIGN_W(i) & MAGN_Y(i))          \
1095                | (SIGN_Y(i) & SIGN_W(i) & ~MAGN_Y(i))))         \
1096          | (CLIP_BOTTOM_BIT                                     \
1097             & ((SIGN_Y(i) & SIGN_W(i))                          \
1098                | (~SIGN_Y(i) & SIGN_W(i) & ~MAGN_Y(i))          \
1099                | (SIGN_Y(i) & ~SIGN_W(i) & MAGN_Y(i))))         \
1100          | (CLIP_FAR_BIT                                        \
1101             & ((~SIGN_Z(i) & SIGN_W(i))                         \
1102                | (~SIGN_Z(i) & ~SIGN_W(i) & MAGN_Z(i))          \
1103                | (SIGN_Z(i) & SIGN_W(i) & ~MAGN_Z(i))))         \
1104          | (CLIP_NEAR_BIT                                       \
1105             & ((SIGN_Z(i) & SIGN_W(i))                          \
1106                | (~SIGN_Z(i) & SIGN_W(i) & ~MAGN_Z(i))          \
1107                | (SIGN_Z(i) & ~SIGN_W(i) & MAGN_Z(i))))
1108
     /* Eight consecutive table entries starting at index i. */
1109 #define CLIP_VALUE8(i) \
1110         CLIP_VALUE(i + 0), CLIP_VALUE(i + 1), CLIP_VALUE(i + 2), CLIP_VALUE(i + 3), \
1111         CLIP_VALUE(i + 4), CLIP_VALUE(i + 5), CLIP_VALUE(i + 6), CLIP_VALUE(i + 7)
1112
1113 .rodata
1114
1115 clip_table:
1116         .byte CLIP_VALUE8(0x00)
1117         .byte CLIP_VALUE8(0x08)
1118         .byte CLIP_VALUE8(0x10)
1119         .byte CLIP_VALUE8(0x18)
1120         .byte CLIP_VALUE8(0x20)
1121         .byte CLIP_VALUE8(0x28)
1122         .byte CLIP_VALUE8(0x30)
1123         .byte CLIP_VALUE8(0x38)
1124         .byte CLIP_VALUE8(0x40)
1125         .byte CLIP_VALUE8(0x48)
1126         .byte CLIP_VALUE8(0x50)
1127         .byte CLIP_VALUE8(0x58)
1128         .byte CLIP_VALUE8(0x60)
1129         .byte CLIP_VALUE8(0x68)
1130         .byte CLIP_VALUE8(0x70)
1131         .byte CLIP_VALUE8(0x78)
1132 #else
1133
1134 .const
1135 ASSUME NOTHING
1136
     /*
      * Literal table: 16 rows of 8 bytes, row r covering indices 8*r to
      * 8*r + 7.  The values use the CLIP_*_BIT encoding listed in the
      * disabled branch above.
      * NOTE(review): this looks like the pre-computed expansion of
      * CLIP_VALUE(); a few entries (0, 1, 2, 3, 0x10, 0x30, 0x40, 0x41)
      * were checked by hand and match, the rest is unverified.
      */
1137 clip_table:
1138         .byte HEX(0), HEX(1), HEX(0), HEX(2), HEX(4), HEX(5), HEX(4), HEX(6)
1139         .byte HEX(0), HEX(1), HEX(0), HEX(2), HEX(8), HEX(9), HEX(8), HEX(a)
1140         .byte HEX(20), HEX(21), HEX(20), HEX(22), HEX(24), HEX(25), HEX(24), HEX(26)
1141         .byte HEX(20), HEX(21), HEX(20), HEX(22), HEX(28), HEX(29), HEX(28), HEX(2a)
1142         .byte HEX(0), HEX(1), HEX(0), HEX(2), HEX(4), HEX(5), HEX(4), HEX(6)
1143         .byte HEX(0), HEX(1), HEX(0), HEX(2), HEX(8), HEX(9), HEX(8), HEX(a)
1144         .byte HEX(10), HEX(11), HEX(10), HEX(12), HEX(14), HEX(15), HEX(14), HEX(16)
1145         .byte HEX(10), HEX(11), HEX(10), HEX(12), HEX(18), HEX(19), HEX(18), HEX(1a)
1146         .byte HEX(3f), HEX(3d), HEX(3f), HEX(3e), HEX(37), HEX(35), HEX(37), HEX(36)
1147         .byte HEX(3f), HEX(3d), HEX(3f), HEX(3e), HEX(3b), HEX(39), HEX(3b), HEX(3a)
1148         .byte HEX(2f), HEX(2d), HEX(2f), HEX(2e), HEX(27), HEX(25), HEX(27), HEX(26)
1149         .byte HEX(2f), HEX(2d), HEX(2f), HEX(2e), HEX(2b), HEX(29), HEX(2b), HEX(2a)
1150         .byte HEX(3f), HEX(3d), HEX(3f), HEX(3e), HEX(37), HEX(35), HEX(37), HEX(36)
1151         .byte HEX(3f), HEX(3d), HEX(3f), HEX(3e), HEX(3b), HEX(39), HEX(3b), HEX(3a)
1152         .byte HEX(1f), HEX(1d), HEX(1f), HEX(1e), HEX(17), HEX(15), HEX(17), HEX(16)
1153         .byte HEX(1f), HEX(1d), HEX(1f), HEX(1e), HEX(1b), HEX(19), HEX(1b), HEX(1a)
1154
1155 #endif
1156
1157 .code   /* back to the code section for the routines that follow */
1158
1159 /*
1160  * cliptest -
      *
      * Internal helper with a register-based interface (not a C-callable
      * routine).  For every point it looks up a clip-flag byte in
      * clip_table, ORs it into the matching clipmask[] byte and into al,
      * and ANDs it into ah.
1161  *
1162  * inputs:
1163  *      ecx = # points
1164  *      esi = points
1165  *      edi = clipmask[]
1166  *
1167  * inputs/outputs:
1168  *      al = ormask
1169  *      ah = andmask
      *
      * side effects when ecx != 0 on entry:
      *      esi is advanced by 16 bytes and edi by 1 byte per point,
      *      ecx ends at 0, edx and the flags are overwritten.
      *      ebx and ebp are used as scratch but saved and restored.
      *
      * The float components are handled as raw 32-bit integers: adding a
      * register to itself moves the sign bit into carry and leaves the
      * remaining bits shifted left by one, so an unsigned compare of two
      * such values orders them by magnitude.
      * NOTE(review): that ordering assumes IEEE-754 singles; NaNs get no
      * special treatment here.
1170  */
1171
1172 cliptest:
1173         test ecx, ecx
1174         jz cliptest_end         /* no points: return before anything is pushed */
1175
1176         push ebp
1177         push ebx
1178
1179 .align 4
1180 cliptest_loop:
1181         mov ebp, S(3)
1182         mov ebx, S(2)
1183
1184         xor edx, edx            /* edx collects the 7-bit table index; each adc edx, edx shifts it left and appends carry */
1185         add ebp, ebp    /* %ebp = abs(S(3))*2 ; carry = sign of S(3) */
1186
1187         adc edx, edx            /* index bit for S(3) < 0 */
1188         add ebx, ebx    /* %ebx = abs(S(2))*2 ; carry = sign of S(2) */
1189
1190         adc edx, edx            /* index bit for S(2) < 0 */
1191         cmp ebp, ebx    /* carry = abs(S(2))*2 > abs(S(3))*2 */
1192
1193         adc edx, edx            /* index bit for abs(S(2)) > abs(S(3)) */
1194         mov ebx, S(1)
1195
1196         add ebx, ebx    /* %ebx = abs(S(1))*2 ; carry = sign of S(1) */
1197
1198         adc edx, edx            /* index bit for S(1) < 0 */
1199         cmp ebp, ebx    /* carry = abs(S(1))*2 > abs(S(3))*2 */
1200
1201         adc edx, edx            /* index bit for abs(S(1)) > abs(S(3)) */
1202         mov ebx, S(0)
1203
1204         add ebx, ebx    /* %ebx = abs(S(0))*2 ; carry = sign of S(0) */
1205
1206         adc edx, edx            /* index bit for S(0) < 0 */
1207         cmp ebp, ebx    /* carry = abs(S(0))*2 > abs(S(3))*2 */
1208
1209         adc edx, edx            /* index bit for abs(S(0)) > abs(S(3)); edx is now 0..127 */
1210
1211         lea esi, S(4)           /* next point (16 bytes) */
1212
1213         mov bl, byte ptr [edi]  /* bl = current clipmask byte */
1214         mov dl, byte ptr [clip_table + edx]     /* dl = clip flags for this point */
1215
1216         or bl, dl
1217         or al, dl               /* ormask |= flags */
1218
1219         and ah, dl              /* andmask &= flags */
1220         mov [edi], bl           /* clipmask byte |= flags */
1221
1222         inc edi
1223         dec ecx
1224
1225         jnz cliptest_loop
1226
1227         pop ebx
1228         pop ebp
1229 cliptest_end:
1230         ret
1231
1232 /*
1233  * void asm_project_and_cliptest_general( GLuint n, GLfloat d[][4], GLfloat m[16],
1234  *                                        GLfloat s[][4], GLubyte clipmask[],
1235  *                                        GLubyte *ormask, GLubyte *andmask );
      *
      * Calls _asm_transform_points4_general( n, d, m, s ) and then runs
      * cliptest over the n points of d, updating clipmask[], *ormask and
      * *andmask.  esi and edi are saved and restored.
1236  */
1237 PUBLIC _asm_project_and_cliptest_general
1238 _asm_project_and_cliptest_general:
1239 .align 4
1240         push esi
1241         push edi                /* after two pushes the arguments start at [esp + 12] */
1242         mov ecx, [esp + 12]     /* ecx = n */
1243         mov edi, [esp + 16]     /* edi = d */
1244         mov edx, [esp + 20]     /* edx = m */
1245         mov esi, [esp + 24]     /* esi = s */
1246
     /* push the four transform arguments last-to-first: s, m, d, n */
1247         push esi
1248         push edx
1249         push edi
1250         push ecx
1251         call _asm_transform_points4_general
1252         add esp, DEC(16)        /* drop the four pushed arguments; offsets below are again relative to the two saved registers */
1253
     /* every register cliptest needs is reloaded from the stack, so nothing is assumed to survive the call */
1254         mov edi, [esp + 32]     /* ormask */
1255         mov esi, [esp + 36]     /* andmask */
1256         mov al, [edi]           /* al = *ormask */
1257         mov ah, [esi]           /* ah = *andmask */
1258
1259         mov ecx, [esp + 12]     /* ecx = n */
1260         mov edi, [esp + 28]     /* edi = clipmask */
1261         mov esi, [esp + 16]     /* esi = d */
1262
1263         call cliptest           /* clip-tests the transformed points in d */
1264
1265         mov edi, [esp + 32]     /* ormask */
1266         mov esi, [esp + 36]     /* andmask */
1267         mov [edi], al           /* write the accumulated OR mask back */
1268         mov [esi], ah           /* write the accumulated AND mask back */
1269
1270         pop edi
1271         pop esi
1272         ret
1273
1274
1275 /*
1276  * void asm_project_and_cliptest_identity( GLuint n, GLfloat d[][4],
1277  *                                         GLfloat s[][4], GLubyte clipmask[],
1278  *                                         GLubyte *ormask, GLubyte *andmask );
      *
      * Calls _asm_transform_points4_identity( n, d, s ) and then runs
      * cliptest over the n points of d, updating clipmask[], *ormask and
      * *andmask.  There is no matrix argument, so every stack offset after
      * d is 4 bytes lower than in the sibling routines that take m.
      * esi and edi are saved and restored.
1279  */
1280 PUBLIC _asm_project_and_cliptest_identity
1281 _asm_project_and_cliptest_identity:
1282 .align 4
1283         push esi
1284         push edi                /* after two pushes the arguments start at [esp + 12] */
1285         mov ecx, [esp + 12]     /* ecx = n */
1286         mov edi, [esp + 16]     /* edi = d */
1287         mov esi, [esp + 20]     /* esi = s */
1288
     /* push the three transform arguments last-to-first: s, d, n */
1289         push esi
1290         push edi
1291         push ecx
1292
1293         call _asm_transform_points4_identity
1294
1295         add esp, DEC(12)        /* drop the three pushed arguments */
1296
1297         mov edi, [esp + 28]     /* ormask */
1298         mov esi, [esp + 32]     /* andmask */
1299         mov al, [edi]           /* al = *ormask */
1300         mov ah, [esi]           /* ah = *andmask */
1301
1302         mov ecx, [esp + 12]     /* ecx = n */
1303         mov edi, [esp + 24]     /* edi = clipmask */
1304         mov esi, [esp + 16]     /* esi = d */
1305
1306         call cliptest           /* clip-tests the points in d */
1307
1308         mov edi, [esp + 28]     /* ormask */
1309         mov esi, [esp + 32]     /* andmask */
1310         mov [edi], al           /* write the accumulated OR mask back */
1311         mov [esi], ah           /* write the accumulated AND mask back */
1312
1313         pop edi
1314         pop esi
1315         ret
1316
1317 /*
1318  * void asm_project_and_cliptest_ortho( GLuint n, GLfloat d[][4], GLfloat m[16],
1319  *                                      GLfloat s[][4], GLubyte clipmask[],
1320  *                                      GLubyte *ormask, GLubyte *andmask );
      *
      * Calls _asm_transform_points4_ortho( n, d, m, s ) and then runs
      * cliptest over the n points of d, updating clipmask[], *ormask and
      * *andmask.  esi and edi are saved and restored.
1321  */
1322 PUBLIC _asm_project_and_cliptest_ortho
1323 _asm_project_and_cliptest_ortho:
1324 .align 4
1325         push esi
1326         push edi                /* after two pushes the arguments start at [esp + 12] */
1327         mov ecx, [esp + 12]     /* ecx = n */
1328         mov edi, [esp + 16]     /* edi = d */
1329         mov edx, [esp + 20]     /* edx = m */
1330         mov esi, [esp + 24]     /* esi = s */
1331
     /* push the four transform arguments last-to-first: s, m, d, n */
1332         push esi
1333         push edx
1334         push edi
1335         push ecx
1336
1337         call _asm_transform_points4_ortho
1338
1339         add esp, DEC(16)        /* drop the four pushed arguments */
1340
1341         mov edi, [esp + 32]     /* ormask */
1342         mov esi, [esp + 36]     /* andmask */
1343         mov al, [edi]           /* al = *ormask */
1344         mov ah, [esi]           /* ah = *andmask */
1345
1346         mov ecx, [esp + 12]     /* ecx = n */
1347         mov edi, [esp + 28]     /* edi = clipmask */
1348         mov esi, [esp + 16]     /* esi = d */
1349
1350         call cliptest           /* clip-tests the transformed points in d */
1351
1352         mov edi, [esp + 32]     /* ormask */
1353         mov esi, [esp + 36]     /* andmask */
1354         mov [edi], al           /* write the accumulated OR mask back */
1355         mov [esi], ah           /* write the accumulated AND mask back */
1356
1357         pop edi
1358         pop esi
1359         ret
1360
1361 /*
1362  * void asm_project_and_cliptest_perspective( GLuint n, GLfloat d[][4], GLfloat m[16],
1363  *                                            GLfloat s[][4], GLubyte clipmask[],
1364  *                                            GLubyte *ormask, GLubyte *andmask );
      *
      * Calls _asm_transform_points4_perspective( n, d, m, s ) and then runs
      * cliptest over the n points of d, updating clipmask[], *ormask and
      * *andmask.  esi and edi are saved and restored.
1365  */
1366 PUBLIC _asm_project_and_cliptest_perspective
1367 _asm_project_and_cliptest_perspective:
1368 .align 4
1369         push esi
1370         push edi                /* after two pushes the arguments start at [esp + 12] */
1371         mov ecx, [esp + 12]     /* ecx = n */
1372         mov edi, [esp + 16]     /* edi = d */
1373         mov edx, [esp + 20]     /* edx = m */
1374         mov esi, [esp + 24]     /* esi = s */
1375
     /* push the four transform arguments last-to-first: s, m, d, n */
1376         push esi
1377         push edx
1378         push edi
1379         push ecx
1380
1381         call _asm_transform_points4_perspective
1382
1383         add esp, DEC(16)        /* drop the four pushed arguments */
1384
1385         mov edi, [esp + 32]     /* ormask */
1386         mov esi, [esp + 36]     /* andmask */
1387         mov al, [edi]           /* al = *ormask */
1388         mov ah, [esi]           /* ah = *andmask */
1389
1390         mov ecx, [esp + 12]     /* ecx = n */
1391         mov edi, [esp + 28]     /* edi = clipmask */
1392         mov esi, [esp + 16]     /* esi = d */
1393
1394         call cliptest           /* clip-tests the transformed points in d */
1395
1396         mov edi, [esp + 32]     /* ormask */
1397         mov esi, [esp + 36]             /* andmask */
     /* the explicit byte ptr is redundant with the 8-bit register operand; the sibling routines omit it */
1398         mov byte ptr [edi], al
1399         mov byte ptr [esi], ah
1400
1401         pop edi
1402         pop esi
1403         ret
1404
1405
1406 /*
1407  * unsigned int inverse_nofp( float f );
1408  *
1409  * Calculate the inverse of a float without using the FPU.
1410  * This function returns a float in eax, so its return
1411  * type should be 'int' when called from C (and converted
1412  * to float with pointer/union abuse).
      *
      * The argument is read from [esp + 4]; ecx and edx are overwritten.
      * The code has no branches for zero, denormal, infinite or NaN
      * inputs: the bit fields are processed as if f were a normal number.
      * The constants assume HEX() from asm.inc yields the hex literal.
1413  */
1414 .align 4
1415 inverse_nofp:
1416
1417         /* get mantissa in ecx */
1418         mov ecx, [esp + 4]
1419         and ecx, HEX(7fffff)
1420
1421         /* set implicit integer */
1422         or ecx, HEX(800000)
1423
1424         /* div 0x10000:0x00000000 by mantissa */
     /* the divisor has bit 23 set, so the quotient of 2^48 is at most 2^25 and fits in eax */
1425         xor eax, eax
1426         mov edx, HEX(10000)
1427
1428         div ecx
1429
1430         /* round result */
1431         shr eax, DEC(1)         /* the bit shifted out goes to carry ... */
1432         adc eax, DEC(0)         /* ... and is added back in: round to nearest */
1433
1434         /* get exponent in ecx */
1435         mov ecx, HEX(7f800000)
1436         mov edx, [esp + 4]
1437         and ecx, edx
1438
1439         /* negate exponent and decrement it */
     /* 7E800000h is 253 << 23, so the exponent field of edx becomes 253 - e */
1440         mov edx, HEX(7E800000)
1441         sub edx, ecx
1442
1443         /* if bit 24 is set, shift and adjust exponent */
1444         test eax, HEX(1000000)
1445         jz inverse_nofp_combine
1446
1447         shr eax, HEX(1)
1448         add edx, HEX(800000)    /* exponent field + 1 */
1449
1450         /* combine mantissa and exponent, then set sign */
1451 inverse_nofp_combine:
1452         and eax, HEX(7fffff)    /* drop the implicit leading 1 */
1453         mov ecx, [esp + 4]
1454         or eax, edx
1455         and ecx, HEX(80000000)  /* sign bit of the input */
1456         or eax, ecx
1457
1458         ret
1459
1460
1461 /*
1462  * void gl_xform_normals_3fv( GLuint n, GLfloat d[][4], GLfloat m[16],
1463  *                             GLfloat s[][4], GLboolean normalize );
      *
      * Transforms n 3-component normals from s to d:
      *
      *      D(i) = S(0) * M(i, 0) + S(1) * M(i, 1) + S(2) * M(i, 2)
      *
      * for i = 0, 1, 2, and, if normalize is non-zero, scales each result
      * by a reciprocal length obtained from inverse_nofp.
      *
      * NOTE(review): the code advances s and d by 3 floats (12 bytes) per
      * normal, which does not match the [][4] arrays in the prototype
      * above - confirm against the C declaration.
      *
      * esi and edi are saved and restored; eax, ecx and edx may be
      * overwritten.
1464  */
1465 PUBLIC _gl_xform_normals_3fv
1466 _gl_xform_normals_3fv:
1467 .align 4
1468         push esi
1469         push edi                /* after two pushes the arguments start at [esp + 12] */
1470         mov ecx, [esp + 12]     /* ecx = n */
1471         mov edi, [esp + 16]     /* edi = d */
1472         mov edx, [esp + 20]     /* edx = m */
1473         mov esi, [esp + 24]     /* esi = s */
1474
1475         test ecx, ecx
1476         jz _gl_xform_normals_3fv_end    /* n == 0: nothing to transform or normalize */
1477
1478 .align 4
1479 _gl_xform_normals_3fv_loop:
1480         fld S(0)
1481         fmul M(0, 0)
1482         fld S(0)
1483         fmul M(1, 0)
1484         fld S(0)
1485         fmul M(2, 0)
1486
1487         fld S(1)
1488         fmul M(0, 1)
1489         fld S(1)
1490         fmul M(1, 1)
1491         fld S(1)
1492         fmul M(2, 1)
1493
1494         /*
1495          * st(5) = S(0) * M(0, 0)
1496          * st(4) = S(0) * M(1, 0)
1497          * st(3) = S(0) * M(2, 0)
1498          * st(2) = S(1) * M(0, 1)
1499          * st(1) = S(1) * M(1, 1)
1500          * st(0) = S(1) * M(2, 1)
1501          */
1502
     /* the digit lists on the right give the stack top-to-bottom, using the st(n) numbers from the comment above */
1503         fxch st(2)                      /* 2 1 0 3 4 5 */
1504         faddp st(5), st         /* 1 0 3 4 5 */
1505         faddp st(3), st         /* 0 3 4 5 */
1506         faddp st(1), st         /* 3 4 5 */
1507
1508         /*
1509          * st(2) = S(0) * M(0, 0) + S(1) * M(0, 1)
1510          * st(1) = S(0) * M(1, 0) + S(1) * M(1, 1)
1511          * st(0) = S(0) * M(2, 0) + S(1) * M(2, 1)
1512          */
1513
1514         fld S(2)
1515         fmul M(0, 2)
1516         fld S(2)
1517         fmul M(1, 2)
1518         fld S(2)
1519         fmul M(2, 2)
1520
1521         /*
1522          * st(5) = S(0) * M(0, 0) + S(1) * M(0, 1)
1523          * st(4) = S(0) * M(1, 0) + S(1) * M(1, 1)
1524          * st(3) = S(0) * M(2, 0) + S(1) * M(2, 1)
1525          * st(2) = S(2) * M(0, 2)
1526          * st(1) = S(2) * M(1, 2)
1527          * st(0) = S(2) * M(2, 2)
1528          */
1529
1530         fxch st(2)                      /* 2 1 0 3 4 5 */
1531         faddp st(5), st         /* 1 0 3 4 5 */
1532         faddp st(3), st         /* 0 3 4 5 */
1533         faddp st(1), st         /* 3 4 5 */
1534
1535         /*
1536          * st(2) = S(0) * M(0, 0) + S(1) * M(0, 1) + S(2) * M(0, 2)
1537          * st(1) = S(0) * M(1, 0) + S(1) * M(1, 1) + S(2) * M(1, 2)
1538          * st(0) = S(0) * M(2, 0) + S(1) * M(2, 1) + S(2) * M(2, 2)
1539          */
1540
1541         fxch st(2)      /* 2 1 0 */
1542         fstp D(0)       /* 1 0 */
1543         fstp D(1)       /* 0 */
1544         fstp D(2)       /* */
1545
1546         lea esi, S(3)           /* next source normal: 3 floats = 12 bytes */
1547
1548         dec ecx                 /* sets ZF for the jnz below; lea does not modify flags */
1549         lea edi, D(3)           /* next destination normal: 3 floats = 12 bytes */
1550
1551         jnz _gl_xform_normals_3fv_loop
1552
1553         /*
1554          * Skip normalize if it isn't needed
1555          */
     /* the full 32-bit stack slot of the normalize argument is compared with zero */
1556         cmp dword ptr [esp + 28], DEC(0)
1557         jz _gl_xform_normals_3fv_end
1558
1559         /* Normalize required */
1560
1561         mov esi, [esp + 12]             /* esi = n */
1562         mov edi, [esp + 16]             /* edi = d */
1563
     /* no stack arguments are read after this point, so the 4-byte shift of their offsets does not matter */
1564         sub esp, DEC(4) /* temp var for 1.0 / len */
1565
1566         /*
1567          * (%esp) = length of first normal
1568          */
1569         fld D(0)
1570         fmul D(0)
1571         fld D(1)
1572         fmul D(1)
1573         fld D(2)
1574         fmul D(2)
1575         fxch st(2)
1576         faddp st(1), st
1577         faddp st(1), st
1578         fsqrt
1579         fstp dword ptr [esp]
1580
     /* enter the loop at its bottom test: with n == 1 control falls straight through to the final normal */
1581         jmp _gl_xform_normals_3fv_loop2_end
1582
     /*
      * Each pass scales the normal at D(0..2), whose length is in the
      * temp slot at [esp], and leaves the length of the following normal
      * D(3..5) in that slot for the next pass.
      */
1583 .align 4
1584 _gl_xform_normals_3fv_loop2:
1585         /* %st(0) = length of next normal */
1586         fld D(3)
1587         fmul D(3)
1588         fld D(4)
1589         fmul D(4)
1590         fld D(5)
1591         fmul D(5)
1592         fxch st(2)
1593         faddp st(1), st
1594         faddp st(1), st
1595         fsqrt
1596
1597         /*
1598          * inverse the length of the current normal, which is
1599          * already at (%esp).  This should overlap the prev
1600          * fsqrt nicely.
1601          */
     /* the call pushes a return address, so inverse_nofp finds the temp slot at its [esp + 4] */
1602         call inverse_nofp
1603         mov [esp], eax          /* temp slot = 1 / len as float bits */
1604
1605         /* multiply normal by 1/len */
1606         fld D(0)
1607         fmul dword ptr [esp]
1608         fld D(1)
1609         fmul dword ptr [esp]
1610         fld D(2)
1611         fmul dword ptr [esp]
1612         fxch st(3)              /* bring the next normal's length to st(0); scaled D(2) moves to st(3) */
1613         fstp dword ptr [esp]    /* store length of next normal */
1614         fstp D(1)               /* after the exchange the scaled values come off in the order 1, 0, 2 */
1615         fstp D(0)
1616         fstp D(2)
1617         lea edi, D(3)           /* next normal */
1618
1619 _gl_xform_normals_3fv_loop2_end:
1620         dec esi
1621         jnz _gl_xform_normals_3fv_loop2
1622
1623         /* finish up the last normal */
1624         call inverse_nofp
1625         mov [esp], eax
1626         fld D(0)
1627         fmul dword ptr [esp]
1628         fld D(1)
1629         fmul dword ptr [esp]
1630         fld D(2)
1631         fmul dword ptr [esp]
1632         fxch st(2)              /* scaled D(0) to the top so the stores run 0, 1, 2 */
1633         fstp D(0)
1634         fstp D(1)
1635         fstp D(2)
1636
1637         add esp, DEC(4)         /* release the temp slot */
1638
1639 _gl_xform_normals_3fv_end:
1640         pop edi
1641         pop esi
1642         ret
1643
1644 END