Fixed typo
[reactos.git] / dll / opengl / mesa / x86-64 / xform4.S
1 /*
2 * Mesa 3-D graphics library
3 * Version: 7.1
4 *
5 * Copyright (C) 1999-2007 Brian Paul All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 #ifdef USE_X86_64_ASM
26
27 #include "matypes.h"
28
29 .text
30
/*
 * _mesa_x86_64_cpuid
 *
 * Runs CPUID on a caller-supplied block of four 32-bit words.
 *
 *   In:   rdi -> regs[4]
 *           regs[0] is loaded into eax, regs[2] into ecx before CPUID
 *   Out:  regs[0] = eax, regs[1] = ebx, regs[2] = ecx, regs[3] = edx
 *
 * CPUID overwrites ebx, so rbx is pushed on entry and popped before
 * returning.
 */
	.p2align 4
	.globl	_mesa_x86_64_cpuid
	.hidden	_mesa_x86_64_cpuid
_mesa_x86_64_cpuid:
	pushq	%rbx			/* keep the caller's rbx */
	movl	8(%rdi), %ecx		/* ecx input <- regs[2] */
	movl	(%rdi), %eax		/* eax input <- regs[0] */

	cpuid

	movl	%eax, (%rdi)		/* regs[0] <- eax */
	movl	%ebx, 4(%rdi)		/* regs[1] <- ebx */
	movl	%ecx, 8(%rdi)		/* regs[2] <- ecx */
	movl	%edx, 12(%rdi)		/* regs[3] <- edx */
	popq	%rbx
	ret
47
/*
 * _mesa_x86_64_transform_points4_general
 *
 * Transforms every 4-component source vertex by a full 4x4 matrix:
 *
 *   dst = x * m[0..3] + y * m[4..7] + z * m[8..11] + w * m[12..15]
 *
 *   In:   rdi = dest vector descriptor
 *         rsi = matrix (16 floats, read with movaps)
 *         rdx = source vector descriptor
 *
 * The dest descriptor gets count = source count, size = 4 and the
 * VEC_SIZE_4 flag bits, even when the count is zero.  Source vertices
 * are read unaligned and stepped by the source stride; dest vertices
 * are written with movaps, 16 bytes apart.
 *
 * NOTE(review): only the low byte of the stride field is read (movzbl);
 * confirm against the field width declared for V4F_STRIDE.
 *
 * Uses rax, rcx, rdx, rdi and xmm0-xmm8.
 */
	.p2align 4
	.globl	_mesa_x86_64_transform_points4_general
	.hidden	_mesa_x86_64_transform_points4_general
_mesa_x86_64_transform_points4_general:
	movl	V4F_COUNT(%rdx), %ecx		/* ecx = number of vertices */
	movzbl	V4F_STRIDE(%rdx), %eax		/* rax = source stride (low byte) */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest count = source count */
	movl	$4, V4F_SIZE(%rdi)		/* dest vertices have 4 components */
	.byte	0x66, 0x66, 0x66, 0x90		/* 4-byte padding nop (manual alignment) */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* mark dest as size 4 */

	testl	%ecx, %ecx			/* nothing to do for an empty vector */
	prefetchnta 64(%rsi)			/* does not touch the flags */
	jz	.Lp4_general_exit

	movq	V4F_START(%rdx), %rdx		/* rdx -> first source vertex */
	movq	V4F_START(%rdi), %rdi		/* rdi -> first dest vertex */

	prefetch 16(%rdx)

	movaps	(%rsi), %xmm4			/* xmm4 = m3  | m2  | m1  | m0  */
	movaps	16(%rsi), %xmm5			/* xmm5 = m7  | m6  | m5  | m4  */
	.byte	0x66, 0x66, 0x90		/* 3-byte padding nop (manual alignment) */
	movaps	32(%rsi), %xmm6			/* xmm6 = m11 | m10 | m9  | m8  */
	movaps	48(%rsi), %xmm7			/* xmm7 = m15 | m14 | m13 | m12 */

.Lp4_general_next:

	movups	(%rdx), %xmm8			/* lanes 0..3 = ox, oy, oz, ow */
	prefetchw 16(%rdi)

	pshufd	$0x00, %xmm8, %xmm0		/* ox in every lane */
	addq	%rax, %rdx			/* step to the next source vertex */
	pshufd	$0x55, %xmm8, %xmm1		/* oy in every lane */
	mulps	%xmm4, %xmm0			/* ox * m[0..3] */
	pshufd	$0xAA, %xmm8, %xmm2		/* oz in every lane */
	mulps	%xmm5, %xmm1			/* oy * m[4..7] */
	pshufd	$0xFF, %xmm8, %xmm3		/* ow in every lane */
	mulps	%xmm6, %xmm2			/* oz * m[8..11] */
	addps	%xmm1, %xmm0			/* x term + y term */
	mulps	%xmm7, %xmm3			/* ow * m[12..15] */
	addps	%xmm2, %xmm0			/* ... + z term */
	prefetch 16(%rdx)
	addps	%xmm3, %xmm0			/* ... + w term */

	movaps	%xmm0, (%rdi)			/* store the transformed vertex */
	addq	$16, %rdi

	decl	%ecx
	jnz	.Lp4_general_next

.Lp4_general_exit:
	.byte	0xf3				/* rep prefix: together with ret forms "rep ret" */
	ret
108
109 .section .rodata
110
111 .align 16
112 p4_constants:
113 .byte 0xff, 0xff, 0xff, 0xff
114 .byte 0xff, 0xff, 0xff, 0xff
115 .byte 0xff, 0xff, 0xff, 0xff
116 .byte 0x00, 0x00, 0x00, 0x00
117
118 .byte 0x00, 0x00, 0x00, 0x00
119 .byte 0x00, 0x00, 0x00, 0x00
120 .byte 0x00, 0x00, 0x00, 0x00
121 .float 1.0
122
123 .text
124 .align 16
125 .globl _mesa_x86_64_transform_points4_3d
126 .hidden _mesa_x86_64_transform_points4_3d
127 /*
128 * this is slower than _mesa_x86_64_transform_points4_general
129 * because it ensures that the last matrix row (or is it column?) is 0,0,0,1
130 */
131 _mesa_x86_64_transform_points4_3d:
132
133 leaq p4_constants(%rip), %rax
134
135 prefetchnta 64(%rsi)
136
137 movaps (%rax), %xmm9
138 movaps 16(%rax), %xmm10
139
140 movl V4F_COUNT(%rdx), %ecx /* count */
141 movzbl V4F_STRIDE(%rdx), %eax /* stride */
142
143 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
144 movl $4, V4F_SIZE(%rdi) /* set dest size */
145 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
146
147 testl %ecx, %ecx /* verify non-zero count */
148 jz p4_3d_done
149
150 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
151 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
152
153 prefetch 16(%rdx)
154
155 movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
156 movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
157 andps %xmm9, %xmm4 /* 0.0 | m2 | m1 | m0 */
158 movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
159 andps %xmm9, %xmm5 /* 0.0 | m6 | m5 | m4 */
160 movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */
161 andps %xmm9, %xmm6 /* 0.0 | m10 | m9 | m8 */
162 andps %xmm9, %xmm7 /* 0.0 | m14 | m13 | m12 */
163 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
164 orps %xmm10, %xmm7 /* 1.0 | m14 | m13 | m12 */
165
166 p4_3d_loop:
167
168 movups (%rdx), %xmm8 /* ox | oy | oz | ow */
169 prefetchw 16(%rdi)
170
171 pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
172 addq %rax, %rdx
173 pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */
174 mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
175 pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | ox */
176 mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
177 pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */
178 mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
179 addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
180 mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
181 addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
182 prefetch 16(%rdx)
183 addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
184
185 movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
186 addq $16, %rdi
187
188 dec %ecx
189 jnz p4_3d_loop
190
191 p4_3d_done:
192 .byte 0xf3
193 ret
194
195
/*
 * _mesa_x86_64_transform_points4_identity
 *
 * Identity transform: copies every 4-float source vertex to the dest
 * vector unchanged.
 *
 *   In:   rdi = dest vector descriptor
 *         rsi = matrix (not read; reused as the source pointer)
 *         rdx = source vector descriptor
 *
 * The dest descriptor gets count = source count, size = 4 and the
 * VEC_SIZE_4 flag bits, even when the count is zero.
 *
 * The source stride is honoured.  When it is exactly 16 bytes the
 * vertices are contiguous and are copied in one go with rep movsq
 * (this relies on the direction flag being clear, as the System V
 * ABI requires at function entry).  For any other stride each vertex
 * is copied individually, advancing the source by the stride and the
 * dest by 16 bytes.
 *
 * NOTE(review): only the low byte of the stride field is read (movzbl),
 * as in the other routines of this file; confirm against the field
 * width declared for V4F_STRIDE.
 *
 * Uses rax, rcx, rsi, rdi and xmm0.
 */
	.align 16
	.globl	_mesa_x86_64_transform_points4_identity
	.hidden	_mesa_x86_64_transform_points4_identity
_mesa_x86_64_transform_points4_identity:

	movl	V4F_COUNT(%rdx), %ecx		/* count */
	movzbl	V4F_STRIDE(%rdx), %eax		/* stride */

	movl	%ecx, V4F_COUNT(%rdi)		/* set dest count */
	movl	$4, V4F_SIZE(%rdi)		/* set dest size */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	testl	%ecx, %ecx			/* verify non-zero count */
	jz	.Lp4_identity_done

	movq	V4F_START(%rdx), %rsi		/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi		/* ptr to first dest vertex */
	prefetch 64(%rsi)
	prefetchw 64(%rdi)

	cmpl	$16, %eax			/* tightly packed source? */
	jne	.Lp4_identity_strided

	addl	%ecx, %ecx			/* two quadwords per vertex; also zero-extends into rcx */

	rep movsq				/* bulk copy of count * 16 bytes */

.Lp4_identity_done:
	.byte	0xf3				/* rep prefix: together with ret forms "rep ret" */
	ret

	.align 16
.Lp4_identity_strided:
	movups	(%rsi), %xmm0			/* load one source vertex (no alignment assumed) */
	addq	%rax, %rsi			/* advance source by its stride */
	movups	%xmm0, (%rdi)			/* store it (no alignment assumed) */
	addq	$16, %rdi			/* dest vertices are packed */

	decl	%ecx
	jnz	.Lp4_identity_strided

	ret
223
224
/*
 * _mesa_3dnow_transform_points4_3d_no_rot
 *
 * 3DNow! transform that uses only the matrix diagonal (m00, m11, m22)
 * and the entries m30, m31, m32:
 *
 *   r0 = x0*m00 + x3*m30
 *   r1 = x1*m11 + x3*m31
 *   r2 = x2*m22 + x3*m32
 *   r3 = x3
 *
 *   In:   rdi = dest vector descriptor
 *         rsi = matrix (floats at byte offsets 0, 20, 40, 48, 52, 56 are read)
 *         rdx = source vector descriptor
 *
 * The dest descriptor gets count = source count, size = 4 and the
 * VEC_SIZE_4 flag bits, even when the count is zero.  femms is executed
 * on every exit path.
 *
 * NOTE(review): only the low byte of the stride field is read (movzbl);
 * confirm against the field width declared for V4F_STRIDE.
 *
 * Register comments below are written "high dword | low dword".
 * Uses rax, rcx, rdx, rdi and mm0-mm2, mm4-mm7.
 */
	.p2align 4
	.globl	_mesa_3dnow_transform_points4_3d_no_rot
	.hidden	_mesa_3dnow_transform_points4_3d_no_rot
_mesa_3dnow_transform_points4_3d_no_rot:

	movl	V4F_COUNT(%rdx), %ecx		/* ecx = number of vertices */
	movzbl	V4F_STRIDE(%rdx), %eax		/* rax = source stride (low byte) */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest count = source count */
	movl	$4, V4F_SIZE(%rdi)		/* dest vertices have 4 components */
	.byte	0x66, 0x66, 0x90		/* 3-byte padding nop (manual alignment) */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* mark dest as size 4 */

	testl	%ecx, %ecx			/* nothing to do for an empty vector */
	.byte	0x66, 0x66, 0x90		/* 3-byte padding nop (manual alignment) */
	jz	.Lp4_3d_no_rot_exit

	movq	V4F_START(%rdx), %rdx		/* rdx -> first source vertex */
	movq	V4F_START(%rdi), %rdi		/* rdi -> first dest vertex */

	prefetch (%rdx)

	movd	(%rsi), %mm0			/* mm0 = 0   | m00 */
	.byte	0x66, 0x66, 0x90		/* 3-byte padding nop (manual alignment) */
	punpckldq 20(%rsi), %mm0		/* mm0 = m11 | m00 */

	movd	40(%rsi), %mm2			/* mm2 = 0   | m22 */
	movq	48(%rsi), %mm1			/* mm1 = m31 | m30 */

	punpckldq 56(%rsi), %mm2		/* mm2 = m32 | m22 */

.Lp4_3d_no_rot_next:

	prefetchw 32(%rdi)

	movq	(%rdx), %mm4			/* mm4 = x1 | x0 */
	movq	8(%rdx), %mm5			/* mm5 = x3 | x2 */
	movd	12(%rdx), %mm7			/* mm7 = 0  | x3 */

	movq	%mm5, %mm6			/* mm6 = x3 | x2 */
	pfmul	%mm0, %mm4			/* mm4 = x1*m11 | x0*m00 */

	punpckhdq %mm6, %mm6			/* mm6 = x3 | x3 */
	pfmul	%mm2, %mm5			/* mm5 = x3*m32 | x2*m22 */

	pfmul	%mm1, %mm6			/* mm6 = x3*m31 | x3*m30 */
	pfacc	%mm7, %mm5			/* mm5 = x3 | x2*m22 + x3*m32 */

	pfadd	%mm6, %mm4			/* mm4 = x1*m11 + x3*m31 | x0*m00 + x3*m30 */

	addq	%rax, %rdx			/* step to the next source vertex */
	movq	%mm4, (%rdi)			/* store r0, r1 */
	movq	%mm5, 8(%rdi)			/* store r2, r3 */

	addq	$16, %rdi

	decl	%ecx
	prefetch 32(%rdx)			/* does not touch the flags */
	jnz	.Lp4_3d_no_rot_next

.Lp4_3d_no_rot_exit:
	femms					/* leave MMX/3DNow! state */
	ret
288
289
/*
 * _mesa_3dnow_transform_points4_perspective
 *
 * 3DNow! transform that uses only the matrix entries m00, m11, m20,
 * m21, m22 and m32:
 *
 *   r0 = x0*m00 + x2*m20
 *   r1 = x1*m11 + x2*m21
 *   r2 = x2*m22 + x3*m32
 *   r3 = -x2
 *
 *   In:   rdi = dest vector descriptor
 *         rsi = matrix (floats at byte offsets 0, 20, 32, 36, 40, 56 are read)
 *         rdx = source vector descriptor
 *
 * The dest descriptor gets count = source count, size = 4 and the
 * VEC_SIZE_4 flag bits, even when the count is zero.  femms is executed
 * on every exit path.
 *
 * NOTE(review): only the low byte of the stride field is read (movzbl);
 * confirm against the field width declared for V4F_STRIDE.
 *
 * Register comments below are written "high dword | low dword".
 * Uses rax, rcx, rdx, rdi and mm0-mm7.
 */
	.p2align 4
	.globl	_mesa_3dnow_transform_points4_perspective
	.hidden	_mesa_3dnow_transform_points4_perspective
_mesa_3dnow_transform_points4_perspective:

	movl	V4F_COUNT(%rdx), %ecx		/* ecx = number of vertices */
	movzbl	V4F_STRIDE(%rdx), %eax		/* rax = source stride (low byte) */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest count = source count */
	movl	$4, V4F_SIZE(%rdi)		/* dest vertices have 4 components */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* mark dest as size 4 */

	testl	%ecx, %ecx			/* nothing to do for an empty vector */
	.byte	0x66, 0x66, 0x90		/* 3-byte padding nop (manual alignment) */
	jz	.Lp4_perspective_exit

	movq	V4F_START(%rdx), %rdx		/* rdx -> first source vertex */
	movq	V4F_START(%rdi), %rdi		/* rdi -> first dest vertex */

	movd	(%rsi), %mm0			/* mm0 = 0   | m00 */
	pxor	%mm7, %mm7			/* mm7 = 0   | 0   (used to negate x2) */
	punpckldq 20(%rsi), %mm0		/* mm0 = m11 | m00 */

	movq	32(%rsi), %mm2			/* mm2 = m21 | m20 */
	prefetch (%rdx)

	movd	40(%rsi), %mm1			/* mm1 = 0   | m22 */

	.byte	0x66, 0x66, 0x90		/* 3-byte padding nop (manual alignment) */
	punpckldq 56(%rsi), %mm1		/* mm1 = m32 | m22 */


.Lp4_perspective_next:

	prefetchw 32(%rdi)			/* two dest vertices ahead */

	movq	(%rdx), %mm4			/* mm4 = x1 | x0 */
	movq	8(%rdx), %mm5			/* mm5 = x3 | x2 */
	movd	8(%rdx), %mm3			/* mm3 = 0  | x2 */

	movq	%mm5, %mm6			/* mm6 = x3 | x2 */
	pfmul	%mm0, %mm4			/* mm4 = x1*m11 | x0*m00 */

	punpckldq %mm5, %mm5			/* mm5 = x2 | x2 */

	pfmul	%mm2, %mm5			/* mm5 = x2*m21 | x2*m20 */
	pfsubr	%mm7, %mm3			/* mm3 = 0 - mm3 = 0 | -x2 */

	pfmul	%mm1, %mm6			/* mm6 = x3*m32 | x2*m22 */
	pfadd	%mm4, %mm5			/* mm5 = x1*m11 + x2*m21 | x0*m00 + x2*m20 */

	pfacc	%mm3, %mm6			/* mm6 = -x2 | x2*m22 + x3*m32 */

	movq	%mm5, (%rdi)			/* store r0, r1 */
	addq	%rax, %rdx			/* step to the next source vertex */
	movq	%mm6, 8(%rdi)			/* store r2, r3 */

	addq	$16, %rdi

	decl	%ecx
	prefetch 32(%rdx)			/* look ahead in the source; flags untouched */
	jnz	.Lp4_perspective_next

.Lp4_perspective_exit:
	femms					/* leave MMX/3DNow! state */
	ret
356
/*
 * _mesa_3dnow_transform_points4_2d_no_rot
 *
 * 3DNow! transform that uses only the matrix entries m00, m11, m30 and
 * m31; the third and fourth components pass through unchanged:
 *
 *   r0 = x0*m00 + x3*m30
 *   r1 = x1*m11 + x3*m31
 *   r2 = x2
 *   r3 = x3
 *
 *   In:   rdi = dest vector descriptor
 *         rsi = matrix (floats at byte offsets 0, 20, 48, 52 are read)
 *         rdx = source vector descriptor
 *
 * The dest descriptor gets count = source count, size = 4 and the
 * VEC_SIZE_4 flag bits, even when the count is zero.  femms is executed
 * on every exit path.
 *
 * NOTE(review): only the low byte of the stride field is read (movzbl);
 * confirm against the field width declared for V4F_STRIDE.
 *
 * Register comments below are written "high dword | low dword".
 * Uses rax, rcx, rdx, rdi and mm0, mm1, mm4-mm6.
 */
	.p2align 4
	.globl	_mesa_3dnow_transform_points4_2d_no_rot
	.hidden	_mesa_3dnow_transform_points4_2d_no_rot
_mesa_3dnow_transform_points4_2d_no_rot:

	movl	V4F_COUNT(%rdx), %ecx		/* ecx = number of vertices */
	movzbl	V4F_STRIDE(%rdx), %eax		/* rax = source stride (low byte) */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest count = source count */
	movl	$4, V4F_SIZE(%rdi)		/* dest vertices have 4 components */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* mark dest as size 4 */

	testl	%ecx, %ecx			/* nothing to do for an empty vector */
	.byte	0x90				/* 1-byte padding nop (manual alignment) */
	jz	.Lp4_2d_no_rot_exit

	movq	V4F_START(%rdx), %rdx		/* rdx -> first source vertex */
	movq	V4F_START(%rdi), %rdi		/* rdi -> first dest vertex */

	movd	(%rsi), %mm0			/* mm0 = 0   | m00 */
	prefetch (%rdx)
	punpckldq 20(%rsi), %mm0		/* mm0 = m11 | m00 */

	movq	48(%rsi), %mm1			/* mm1 = m31 | m30 */

.Lp4_2d_no_rot_next:

	prefetchw 32(%rdi)			/* two dest vertices ahead */

	movq	(%rdx), %mm4			/* mm4 = x1 | x0 */
	movq	8(%rdx), %mm5			/* mm5 = x3 | x2 */

	pfmul	%mm0, %mm4			/* mm4 = x1*m11 | x0*m00 */
	movq	%mm5, %mm6			/* mm6 = x3 | x2 */

	punpckhdq %mm6, %mm6			/* mm6 = x3 | x3 */

	addq	%rax, %rdx			/* step to the next source vertex */
	pfmul	%mm1, %mm6			/* mm6 = x3*m31 | x3*m30 */

	prefetch 32(%rdx)			/* look ahead in the source */
	pfadd	%mm4, %mm6			/* mm6 = x1*m11 + x3*m31 | x0*m00 + x3*m30 */

	movq	%mm6, (%rdi)			/* store r0, r1 */
	movq	%mm5, 8(%rdi)			/* store r2, r3 (copied from the source) */

	addq	$16, %rdi

	decl	%ecx
	jnz	.Lp4_2d_no_rot_next

.Lp4_2d_no_rot_exit:
	femms					/* leave MMX/3DNow! state */
	ret
411
412
/*
 * _mesa_3dnow_transform_points4_2d
 *
 * 3DNow! transform that uses only the matrix entries m00, m01, m10,
 * m11, m30 and m31; the third and fourth components pass through
 * unchanged:
 *
 *   r0 = x0*m00 + x1*m10 + x3*m30
 *   r1 = x0*m01 + x1*m11 + x3*m31
 *   r2 = x2
 *   r3 = x3
 *
 *   In:   rdi = dest vector descriptor
 *         rsi = matrix (floats at byte offsets 0, 4, 16, 20, 48, 52 are read)
 *         rdx = source vector descriptor
 *
 * The dest descriptor gets count = source count, size = 4 and the
 * VEC_SIZE_4 flag bits, even when the count is zero.  femms is executed
 * on every exit path.
 *
 * NOTE(review): only the low byte of the stride field is read (movzbl);
 * confirm against the field width declared for V4F_STRIDE.
 *
 * Register comments below are written "high dword | low dword".
 * Uses rax, rcx, rdx, rdi and mm0-mm6.
 */
	.p2align 4
	.globl	_mesa_3dnow_transform_points4_2d
	.hidden	_mesa_3dnow_transform_points4_2d
_mesa_3dnow_transform_points4_2d:

	movl	V4F_COUNT(%rdx), %ecx		/* ecx = number of vertices */
	movzbl	V4F_STRIDE(%rdx), %eax		/* rax = source stride (low byte) */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest count = source count */
	movl	$4, V4F_SIZE(%rdi)		/* dest vertices have 4 components */
	.byte	0x66, 0x66, 0x90		/* 3-byte padding nop (manual alignment) */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* mark dest as size 4 */

	testl	%ecx, %ecx			/* nothing to do for an empty vector */
	.byte	0x66, 0x66, 0x90		/* 3-byte padding nop (manual alignment) */
	jz	.Lp4_2d_exit

	movq	V4F_START(%rdx), %rdx		/* rdx -> first source vertex */
	movq	V4F_START(%rdi), %rdi		/* rdi -> first dest vertex */

	movd	(%rsi), %mm0			/* mm0 = 0   | m00 */
	movd	4(%rsi), %mm1			/* mm1 = 0   | m01 */

	prefetch (%rdx)

	punpckldq 16(%rsi), %mm0		/* mm0 = m10 | m00 */
	.byte	0x66, 0x66, 0x90		/* 3-byte padding nop (manual alignment) */
	punpckldq 20(%rsi), %mm1		/* mm1 = m11 | m01 */

	movq	48(%rsi), %mm2			/* mm2 = m31 | m30 */

.Lp4_2d_next:

	prefetchw 32(%rdi)			/* two dest vertices ahead */

	movq	(%rdx), %mm3			/* mm3 = x1 | x0 */
	movq	8(%rdx), %mm5			/* mm5 = x3 | x2 */

	movq	%mm3, %mm4			/* mm4 = x1 | x0 */
	movq	%mm5, %mm6			/* mm6 = x3 | x2 */

	pfmul	%mm1, %mm4			/* mm4 = x1*m11 | x0*m01 */
	punpckhdq %mm6, %mm6			/* mm6 = x3 | x3 */

	pfmul	%mm0, %mm3			/* mm3 = x1*m10 | x0*m00 */

	addq	%rax, %rdx			/* step to the next source vertex */
	pfacc	%mm4, %mm3			/* mm3 = x0*m01 + x1*m11 | x0*m00 + x1*m10 */

	pfmul	%mm2, %mm6			/* mm6 = x3*m31 | x3*m30 */
	prefetch 32(%rdx)			/* look ahead in the source */

	pfadd	%mm6, %mm3			/* mm3 = r1 | r0 */

	movq	%mm3, (%rdi)			/* store r0, r1 */
	movq	%mm5, 8(%rdi)			/* store r2, r3 (copied from the source) */

	addq	$16, %rdi

	decl	%ecx
	jnz	.Lp4_2d_next

.Lp4_2d_exit:
	femms					/* leave MMX/3DNow! state */
	ret
478
479 #endif
480
481 #if defined (__ELF__) && defined (__linux__)
482 .section .note.GNU-stack,"",%progbits
483 #endif