00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00032 #include "util/u_memory.h"
00033 #include "util/u_math.h"
00034 #include "pipe/p_shader_tokens.h"
00035 #include "pipe/p_debug.h"
00036 #include "tgsi/tgsi_parse.h"
00037 #include "tgsi/tgsi_util.h"
00038 #include "tgsi/tgsi_exec.h"
00039 #include "tgsi/tgsi_dump.h"
00040
00041 #include "draw_vs.h"
00042 #include "draw_vs_aos.h"
00043
00044 #include "rtasm/rtasm_x86sse.h"
00045
00046 #ifdef PIPE_ARCH_X86
00047 #define DISASSEM 0
00048 #define FAST_MATH 1
00049
/**
 * Printable names of the register files, used for debug output only.
 * Indexed by register-file number (see the debug print in spill()).
 *
 * NOTE(review): the order presumably mirrors the TGSI_FILE_* values
 * followed by AOS_FILE_INTERNAL -- confirm against the headers.
 *
 * Both the strings and the pointer array are const, so the table cannot
 * be modified at run time.
 */
static const char *const files[] =
{
   "NULL",
   "CONST",
   "IN",
   "OUT",
   "TEMP",
   "SAMP",
   "ADDR",
   "IMM",
   "INTERNAL",
};
00062
00063 static INLINE boolean eq( struct x86_reg a,
00064 struct x86_reg b )
00065 {
00066 return (a.file == b.file &&
00067 a.idx == b.idx &&
00068 a.mod == b.mod &&
00069 a.disp == b.disp);
00070 }
00071
00072 struct x86_reg aos_get_x86( struct aos_compilation *cp,
00073 unsigned which_reg,
00074 unsigned value )
00075 {
00076 struct x86_reg reg;
00077
00078 if (which_reg == 0)
00079 reg = cp->temp_EBP;
00080 else
00081 reg = cp->tmp_EAX;
00082
00083 if (cp->x86_reg[which_reg] != value) {
00084 unsigned offset;
00085
00086 switch (value) {
00087 case X86_IMMEDIATES:
00088 assert(which_reg == 0);
00089 offset = Offset(struct aos_machine, immediates);
00090 break;
00091 case X86_CONSTANTS:
00092 assert(which_reg == 1);
00093 offset = Offset(struct aos_machine, constants);
00094 break;
00095 case X86_BUFFERS:
00096 assert(which_reg == 0);
00097 offset = Offset(struct aos_machine, buffer);
00098 break;
00099 default:
00100 assert(0);
00101 offset = 0;
00102 }
00103
00104
00105 x86_mov(cp->func, reg,
00106 x86_make_disp(cp->machine_EDX, offset));
00107
00108 cp->x86_reg[which_reg] = value;
00109 }
00110
00111 return reg;
00112 }
00113
00114
00115 static struct x86_reg get_reg_ptr(struct aos_compilation *cp,
00116 unsigned file,
00117 unsigned idx )
00118 {
00119 struct x86_reg ptr = cp->machine_EDX;
00120
00121 switch (file) {
00122 case TGSI_FILE_INPUT:
00123 assert(idx < MAX_INPUTS);
00124 return x86_make_disp(ptr, Offset(struct aos_machine, input[idx]));
00125
00126 case TGSI_FILE_OUTPUT:
00127 return x86_make_disp(ptr, Offset(struct aos_machine, output[idx]));
00128
00129 case TGSI_FILE_TEMPORARY:
00130 assert(idx < MAX_TEMPS);
00131 return x86_make_disp(ptr, Offset(struct aos_machine, temp[idx]));
00132
00133 case AOS_FILE_INTERNAL:
00134 assert(idx < MAX_INTERNALS);
00135 return x86_make_disp(ptr, Offset(struct aos_machine, internal[idx]));
00136
00137 case TGSI_FILE_IMMEDIATE:
00138 assert(idx < MAX_IMMEDIATES);
00139 return x86_make_disp(aos_get_x86(cp, 0, X86_IMMEDIATES), idx * 4 * sizeof(float));
00140
00141 case TGSI_FILE_CONSTANT:
00142 assert(idx < MAX_CONSTANTS);
00143 return x86_make_disp(aos_get_x86(cp, 1, X86_CONSTANTS), idx * 4 * sizeof(float));
00144
00145 default:
00146 ERROR(cp, "unknown reg file");
00147 return x86_make_reg(0,0);
00148 }
00149 }
00150
00151
00152
/*
 * x87 FPU control word fields.
 */

/* Exception bits, one per exception type (bits 0..5). */
#define X87_CW_EXCEPTION_INV_OP       (1 << 0)
#define X87_CW_EXCEPTION_DENORM_OP    (1 << 1)
#define X87_CW_EXCEPTION_ZERO_DIVIDE  (1 << 2)
#define X87_CW_EXCEPTION_OVERFLOW     (1 << 3)
#define X87_CW_EXCEPTION_UNDERFLOW    (1 << 4)
#define X87_CW_EXCEPTION_PRECISION    (1 << 5)

/* Precision control field (bits 8..9). */
#define X87_CW_PRECISION_SINGLE       (0 << 8)
#define X87_CW_PRECISION_RESERVED     (1 << 8)
#define X87_CW_PRECISION_DOUBLE       (2 << 8)
#define X87_CW_PRECISION_DOUBLE_EXT   (3 << 8)
#define X87_CW_PRECISION_MASK         (3 << 8)

/* Rounding control field (bits 10..11). */
#define X87_CW_ROUND_NEAREST          (0 << 10)
#define X87_CW_ROUND_DOWN             (1 << 10)
#define X87_CW_ROUND_UP               (2 << 10)
#define X87_CW_ROUND_ZERO             (3 << 10)
#define X87_CW_ROUND_MASK             (3 << 10)

/* Infinity control bit (bit 12). */
#define X87_CW_INFINITY               (1 << 12)
00170
00171
00172
00173
00174 static void spill( struct aos_compilation *cp, unsigned idx )
00175 {
00176 if (!cp->xmm[idx].dirty ||
00177 (cp->xmm[idx].file != TGSI_FILE_INPUT &&
00178 cp->xmm[idx].file != TGSI_FILE_OUTPUT &&
00179 cp->xmm[idx].file != TGSI_FILE_TEMPORARY)) {
00180 ERROR(cp, "invalid spill");
00181 return;
00182 }
00183 else {
00184 struct x86_reg oldval = get_reg_ptr(cp,
00185 cp->xmm[idx].file,
00186 cp->xmm[idx].idx);
00187
00188 if (0) debug_printf("\nspill %s[%d]",
00189 files[cp->xmm[idx].file],
00190 cp->xmm[idx].idx);
00191
00192 assert(cp->xmm[idx].dirty);
00193 sse_movaps(cp->func, oldval, x86_make_reg(file_XMM, idx));
00194 cp->xmm[idx].dirty = 0;
00195 }
00196 }
00197
00198
00199 void aos_spill_all( struct aos_compilation *cp )
00200 {
00201 unsigned i;
00202
00203 for (i = 0; i < 8; i++) {
00204 if (cp->xmm[i].dirty)
00205 spill(cp, i);
00206 aos_release_xmm_reg(cp, i);
00207 }
00208 }
00209
00210
00211 static struct x86_reg get_xmm_writable( struct aos_compilation *cp,
00212 struct x86_reg reg )
00213 {
00214 if (reg.file != file_XMM ||
00215 cp->xmm[reg.idx].file != TGSI_FILE_NULL)
00216 {
00217 struct x86_reg tmp = aos_get_xmm_reg(cp);
00218 sse_movaps(cp->func, tmp, reg);
00219 reg = tmp;
00220 }
00221
00222 cp->xmm[reg.idx].last_used = cp->insn_counter;
00223 return reg;
00224 }
00225
00226 static struct x86_reg get_xmm( struct aos_compilation *cp,
00227 struct x86_reg reg )
00228 {
00229 if (reg.file != file_XMM)
00230 {
00231 struct x86_reg tmp = aos_get_xmm_reg(cp);
00232 sse_movaps(cp->func, tmp, reg);
00233 reg = tmp;
00234 }
00235
00236 cp->xmm[reg.idx].last_used = cp->insn_counter;
00237 return reg;
00238 }
00239
00240
00241
00242
00243
00244 struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp )
00245 {
00246 unsigned i;
00247 unsigned oldest = 0;
00248 boolean found = FALSE;
00249
00250 for (i = 0; i < 8; i++)
00251 if (cp->xmm[i].last_used != cp->insn_counter &&
00252 cp->xmm[i].file == TGSI_FILE_NULL) {
00253 oldest = i;
00254 found = TRUE;
00255 }
00256
00257 if (!found) {
00258 for (i = 0; i < 8; i++)
00259 if (cp->xmm[i].last_used < cp->xmm[oldest].last_used)
00260 oldest = i;
00261 }
00262
00263
00264
00265 if (cp->xmm[oldest].dirty)
00266 spill(cp, oldest);
00267
00268 assert(cp->xmm[oldest].last_used != cp->insn_counter);
00269
00270 cp->xmm[oldest].file = TGSI_FILE_NULL;
00271 cp->xmm[oldest].idx = 0;
00272 cp->xmm[oldest].dirty = 0;
00273 cp->xmm[oldest].last_used = cp->insn_counter;
00274 return x86_make_reg(file_XMM, oldest);
00275 }
00276
00277 void aos_release_xmm_reg( struct aos_compilation *cp,
00278 unsigned idx )
00279 {
00280 cp->xmm[idx].file = TGSI_FILE_NULL;
00281 cp->xmm[idx].idx = 0;
00282 cp->xmm[idx].dirty = 0;
00283 cp->xmm[idx].last_used = 0;
00284 }
00285
00286
00287 static void aos_soft_release_xmm( struct aos_compilation *cp,
00288 struct x86_reg reg )
00289 {
00290 if (reg.file == file_XMM) {
00291 assert(cp->xmm[reg.idx].last_used == cp->insn_counter);
00292 cp->xmm[reg.idx].last_used = cp->insn_counter - 1;
00293 }
00294 }
00295
00296
00297
00298
00299
00300 void aos_adopt_xmm_reg( struct aos_compilation *cp,
00301 struct x86_reg reg,
00302 unsigned file,
00303 unsigned idx,
00304 unsigned dirty )
00305 {
00306 unsigned i;
00307
00308 if (reg.file != file_XMM) {
00309 assert(0);
00310 return;
00311 }
00312
00313
00314
00315
00316
00317 for (i = 0; i < 8; i++) {
00318 if (cp->xmm[i].file == file &&
00319 cp->xmm[i].idx == idx)
00320 {
00321
00322
00323
00324 dirty |= cp->xmm[i].dirty;
00325 aos_release_xmm_reg(cp, i);
00326 }
00327 }
00328
00329 cp->xmm[reg.idx].file = file;
00330 cp->xmm[reg.idx].idx = idx;
00331 cp->xmm[reg.idx].dirty = dirty;
00332 cp->xmm[reg.idx].last_used = cp->insn_counter;
00333 }
00334
00335
00336
00337
00338 static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp,
00339 unsigned file,
00340 unsigned idx )
00341 {
00342 unsigned i;
00343
00344
00345
00346 for (i = 0; i < 8; i++) {
00347 if (cp->xmm[i].file == file &&
00348 cp->xmm[i].idx == idx &&
00349 cp->xmm[i].dirty) {
00350 spill(cp, i);
00351 }
00352 }
00353
00354 return get_reg_ptr( cp, file, idx );
00355 }
00356
00357
00358
00359
00360
00361 static struct x86_reg get_dst_ptr( struct aos_compilation *cp,
00362 const struct tgsi_full_dst_register *dst )
00363 {
00364 unsigned file = dst->DstRegister.File;
00365 unsigned idx = dst->DstRegister.Index;
00366 unsigned i;
00367
00368
00369
00370
00371
00372 for (i = 0; i < 8; i++) {
00373 if (cp->xmm[i].file == file &&
00374 cp->xmm[i].idx == idx)
00375 {
00376 if (cp->xmm[i].dirty)
00377 spill(cp, i);
00378
00379 aos_release_xmm_reg(cp, i);
00380 }
00381 }
00382
00383 return get_reg_ptr( cp, file, idx );
00384 }
00385
00386
00387
00388
00389
00390
00391
00392
00393 struct x86_reg aos_get_shader_reg( struct aos_compilation *cp,
00394 unsigned file,
00395 unsigned idx )
00396 {
00397 unsigned i;
00398
00399 for (i = 0; i < 8; i++) {
00400 if (cp->xmm[i].file == file &&
00401 cp->xmm[i].idx == idx)
00402 {
00403 cp->xmm[i].last_used = cp->insn_counter;
00404 return x86_make_reg(file_XMM, i);
00405 }
00406 }
00407
00408
00409
00410
00411 return get_reg_ptr( cp, file, idx );
00412 }
00413
00414
00415
00416 static struct x86_reg aos_get_shader_reg_xmm( struct aos_compilation *cp,
00417 unsigned file,
00418 unsigned idx )
00419 {
00420 struct x86_reg reg = get_xmm( cp,
00421 aos_get_shader_reg( cp, file, idx ) );
00422
00423 aos_adopt_xmm_reg( cp,
00424 reg,
00425 file,
00426 idx,
00427 FALSE );
00428
00429 return reg;
00430 }
00431
00432
00433
00434 struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp,
00435 unsigned imm )
00436 {
00437 return aos_get_shader_reg_xmm( cp, AOS_FILE_INTERNAL, imm );
00438 }
00439
00440
00441 struct x86_reg aos_get_internal( struct aos_compilation *cp,
00442 unsigned imm )
00443 {
00444 return aos_get_shader_reg( cp, AOS_FILE_INTERNAL, imm );
00445 }
00446
00447
00448
00449
00450
00451
00452
00453 static void emit_pshufd( struct aos_compilation *cp,
00454 struct x86_reg dst,
00455 struct x86_reg arg0,
00456 ubyte shuf )
00457 {
00458 if (cp->have_sse2) {
00459 sse2_pshufd(cp->func, dst, arg0, shuf);
00460 }
00461 else {
00462 if (!eq(dst, arg0))
00463 sse_movaps(cp->func, dst, arg0);
00464
00465 sse_shufps(cp->func, dst, dst, shuf);
00466 }
00467 }
00468
00469
00470
00471
00472
00473
00474
00475 static boolean mask_write( struct aos_compilation *cp,
00476 struct x86_reg dst,
00477 struct x86_reg result,
00478 unsigned mask )
00479 {
00480 struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
00481 struct x86_reg tmp = aos_get_xmm_reg(cp);
00482
00483 emit_pshufd(cp, tmp, imm_swz,
00484 SHUF((mask & 1) ? 2 : 3,
00485 (mask & 2) ? 2 : 3,
00486 (mask & 4) ? 2 : 3,
00487 (mask & 8) ? 2 : 3));
00488
00489 sse_andps(cp->func, dst, tmp);
00490 sse_andnps(cp->func, tmp, result);
00491 sse_orps(cp->func, dst, tmp);
00492
00493 aos_release_xmm_reg(cp, tmp.idx);
00494 return TRUE;
00495 }
00496
00497
00498
00499
00500
00501
00502 static boolean emit_shuf_copy2( struct aos_compilation *cp,
00503 struct x86_reg dst,
00504 struct x86_reg arg0,
00505 struct x86_reg arg1,
00506 ubyte shuf )
00507 {
00508 struct x86_reg tmp = aos_get_xmm_reg(cp);
00509
00510 emit_pshufd(cp, dst, arg1, shuf);
00511 emit_pshufd(cp, tmp, arg0, shuf);
00512 sse_shufps(cp->func, dst, tmp, SHUF(X, Y, Z, W));
00513 emit_pshufd(cp, dst, dst, shuf);
00514
00515 aos_release_xmm_reg(cp, tmp.idx);
00516 return TRUE;
00517 }
00518
00519
00520
/* Shuffle immediate selecting elements 0,1,2,3 in order (two bits per
 * destination channel), i.e. the identity swizzle.
 */
#define SSE_SWIZZLE_NOOP ((0 << 0) | (1 << 2) | (2 << 4) | (3 << 6))
00522
00523
00524
00525
00526
00527
00528 static struct x86_reg fetch_src( struct aos_compilation *cp,
00529 const struct tgsi_full_src_register *src )
00530 {
00531 struct x86_reg arg0 = aos_get_shader_reg(cp,
00532 src->SrcRegister.File,
00533 src->SrcRegister.Index);
00534 unsigned i;
00535 ubyte swz = 0;
00536 unsigned negs = 0;
00537 unsigned abs = 0;
00538
00539 for (i = 0; i < 4; i++) {
00540 unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, i );
00541 unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, i );
00542
00543 switch (swizzle) {
00544 case TGSI_EXTSWIZZLE_ZERO:
00545 case TGSI_EXTSWIZZLE_ONE:
00546 ERROR(cp, "not supporting full swizzles yet in tgsi_aos_sse2");
00547 break;
00548
00549 default:
00550 swz |= (swizzle & 0x3) << (i * 2);
00551 break;
00552 }
00553
00554 switch (neg) {
00555 case TGSI_UTIL_SIGN_TOGGLE:
00556 negs |= (1<<i);
00557 break;
00558
00559 case TGSI_UTIL_SIGN_KEEP:
00560 break;
00561
00562 case TGSI_UTIL_SIGN_CLEAR:
00563 abs |= (1<<i);
00564 break;
00565
00566 default:
00567 ERROR(cp, "unsupported sign-mode");
00568 break;
00569 }
00570 }
00571
00572 if (swz != SSE_SWIZZLE_NOOP || negs != 0 || abs != 0) {
00573 struct x86_reg dst = aos_get_xmm_reg(cp);
00574
00575 if (swz != SSE_SWIZZLE_NOOP)
00576 emit_pshufd(cp, dst, arg0, swz);
00577 else
00578 sse_movaps(cp->func, dst, arg0);
00579
00580 if (negs && negs != 0xf) {
00581 struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
00582 struct x86_reg tmp = aos_get_xmm_reg(cp);
00583
00584
00585
00586
00587
00588 emit_pshufd(cp, tmp, imm_swz,
00589 SHUF((negs & 1) ? 1 : 0,
00590 (negs & 2) ? 1 : 0,
00591 (negs & 4) ? 1 : 0,
00592 (negs & 8) ? 1 : 0));
00593 sse_mulps(cp->func, dst, tmp);
00594
00595 aos_release_xmm_reg(cp, tmp.idx);
00596 aos_soft_release_xmm(cp, imm_swz);
00597 }
00598 else if (negs) {
00599 struct x86_reg imm_negs = aos_get_internal_xmm(cp, IMM_NEGS);
00600 sse_mulps(cp->func, dst, imm_negs);
00601 aos_soft_release_xmm(cp, imm_negs);
00602 }
00603
00604
00605 if (abs && abs != 0xf) {
00606 ERROR(cp, "unsupported partial abs");
00607 }
00608 else if (abs) {
00609 struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
00610 struct x86_reg tmp = aos_get_xmm_reg(cp);
00611
00612 sse_movaps(cp->func, tmp, dst);
00613 sse_mulps(cp->func, tmp, neg);
00614 sse_maxps(cp->func, dst, tmp);
00615
00616 aos_release_xmm_reg(cp, tmp.idx);
00617 aos_soft_release_xmm(cp, neg);
00618 }
00619
00620 aos_soft_release_xmm(cp, arg0);
00621 return dst;
00622 }
00623
00624 return arg0;
00625 }
00626
00627 static void x87_fld_src( struct aos_compilation *cp,
00628 const struct tgsi_full_src_register *src,
00629 unsigned channel )
00630 {
00631 struct x86_reg arg0 = aos_get_shader_reg_ptr(cp,
00632 src->SrcRegister.File,
00633 src->SrcRegister.Index);
00634
00635 unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, channel );
00636 unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, channel );
00637
00638 switch (swizzle) {
00639 case TGSI_EXTSWIZZLE_ZERO:
00640 x87_fldz( cp->func );
00641 break;
00642
00643 case TGSI_EXTSWIZZLE_ONE:
00644 x87_fld1( cp->func );
00645 break;
00646
00647 default:
00648 x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) );
00649 break;
00650 }
00651
00652
00653 switch (neg) {
00654 case TGSI_UTIL_SIGN_TOGGLE:
00655
00656
00657 x87_fchs( cp->func );
00658 break;
00659
00660 case TGSI_UTIL_SIGN_KEEP:
00661 break;
00662
00663 case TGSI_UTIL_SIGN_CLEAR:
00664 x87_fabs( cp->func );
00665 break;
00666
00667 case TGSI_UTIL_SIGN_SET:
00668 x87_fabs( cp->func );
00669 x87_fchs( cp->func );
00670 break;
00671
00672 default:
00673 ERROR(cp, "unsupported sign-mode");
00674 break;
00675 }
00676 }
00677
00678
00679
00680
00681
00682
00683
00684
00685
00686
00687
00688 static void store_dest( struct aos_compilation *cp,
00689 const struct tgsi_full_dst_register *reg,
00690 struct x86_reg result )
00691 {
00692 struct x86_reg dst;
00693
00694 switch (reg->DstRegister.WriteMask) {
00695 case 0:
00696 return;
00697
00698 case TGSI_WRITEMASK_XYZW:
00699 aos_adopt_xmm_reg(cp,
00700 get_xmm_writable(cp, result),
00701 reg->DstRegister.File,
00702 reg->DstRegister.Index,
00703 TRUE);
00704 return;
00705 default:
00706 break;
00707 }
00708
00709 dst = aos_get_shader_reg_xmm(cp,
00710 reg->DstRegister.File,
00711 reg->DstRegister.Index);
00712
00713 switch (reg->DstRegister.WriteMask) {
00714 case TGSI_WRITEMASK_X:
00715 sse_movss(cp->func, dst, get_xmm(cp, result));
00716 break;
00717
00718 case TGSI_WRITEMASK_ZW:
00719 sse_shufps(cp->func, dst, get_xmm(cp, result), SHUF(X, Y, Z, W));
00720 break;
00721
00722 case TGSI_WRITEMASK_XY:
00723 result = get_xmm_writable(cp, result);
00724 sse_shufps(cp->func, result, dst, SHUF(X, Y, Z, W));
00725 dst = result;
00726 break;
00727
00728 case TGSI_WRITEMASK_YZW:
00729 result = get_xmm_writable(cp, result);
00730 sse_movss(cp->func, result, dst);
00731 dst = result;
00732 break;
00733
00734 default:
00735 mask_write(cp, dst, result, reg->DstRegister.WriteMask);
00736 break;
00737 }
00738
00739 aos_adopt_xmm_reg(cp,
00740 dst,
00741 reg->DstRegister.File,
00742 reg->DstRegister.Index,
00743 TRUE);
00744
00745 }
00746
00747 static void inject_scalar( struct aos_compilation *cp,
00748 struct x86_reg dst,
00749 struct x86_reg result,
00750 ubyte swizzle )
00751 {
00752 sse_shufps(cp->func, dst, dst, swizzle);
00753 sse_movss(cp->func, dst, result);
00754 sse_shufps(cp->func, dst, dst, swizzle);
00755 }
00756
00757
00758 static void store_scalar_dest( struct aos_compilation *cp,
00759 const struct tgsi_full_dst_register *reg,
00760 struct x86_reg result )
00761 {
00762 unsigned writemask = reg->DstRegister.WriteMask;
00763 struct x86_reg dst;
00764
00765 if (writemask != TGSI_WRITEMASK_X &&
00766 writemask != TGSI_WRITEMASK_Y &&
00767 writemask != TGSI_WRITEMASK_Z &&
00768 writemask != TGSI_WRITEMASK_W &&
00769 writemask != 0)
00770 {
00771 result = get_xmm_writable(cp, result);
00772 sse_shufps(cp->func, result, result, SHUF(X,X,X,X));
00773 store_dest(cp, reg, result);
00774 return;
00775 }
00776
00777 result = get_xmm(cp, result);
00778 dst = aos_get_shader_reg_xmm(cp,
00779 reg->DstRegister.File,
00780 reg->DstRegister.Index);
00781
00782
00783
00784 switch (reg->DstRegister.WriteMask) {
00785 case TGSI_WRITEMASK_X:
00786 sse_movss(cp->func, dst, result);
00787 break;
00788
00789 case TGSI_WRITEMASK_Y:
00790 inject_scalar(cp, dst, result, SHUF(Y, X, Z, W));
00791 break;
00792
00793 case TGSI_WRITEMASK_Z:
00794 inject_scalar(cp, dst, result, SHUF(Z, Y, X, W));
00795 break;
00796
00797 case TGSI_WRITEMASK_W:
00798 inject_scalar(cp, dst, result, SHUF(W, Y, Z, X));
00799 break;
00800
00801 default:
00802 break;
00803 }
00804
00805 aos_adopt_xmm_reg(cp,
00806 dst,
00807 reg->DstRegister.File,
00808 reg->DstRegister.Index,
00809 TRUE);
00810 }
00811
00812
00813
00814 static void x87_fst_or_nop( struct x86_function *func,
00815 unsigned writemask,
00816 unsigned channel,
00817 struct x86_reg ptr )
00818 {
00819 assert(ptr.file == file_REG32);
00820 if (writemask & (1<<channel))
00821 x87_fst( func, x86_make_disp(ptr, channel * sizeof(float)) );
00822 }
00823
00824 static void x87_fstp_or_pop( struct x86_function *func,
00825 unsigned writemask,
00826 unsigned channel,
00827 struct x86_reg ptr )
00828 {
00829 assert(ptr.file == file_REG32);
00830 if (writemask & (1<<channel))
00831 x87_fstp( func, x86_make_disp(ptr, channel * sizeof(float)) );
00832 else
00833 x87_fstp( func, x86_make_reg( file_x87, 0 ));
00834 }
00835
00836
00837
00838
00839
00840 static void x87_fstp_dest4( struct aos_compilation *cp,
00841 const struct tgsi_full_dst_register *dst )
00842 {
00843 struct x86_reg ptr = get_dst_ptr(cp, dst);
00844 unsigned writemask = dst->DstRegister.WriteMask;
00845
00846 x87_fst_or_nop(cp->func, writemask, 0, ptr);
00847 x87_fst_or_nop(cp->func, writemask, 1, ptr);
00848 x87_fst_or_nop(cp->func, writemask, 2, ptr);
00849 x87_fstp_or_pop(cp->func, writemask, 3, ptr);
00850 }
00851
00852
00853
00854 static void save_fpu_state( struct aos_compilation *cp )
00855 {
00856 x87_fnstcw( cp->func, x86_make_disp(cp->machine_EDX,
00857 Offset(struct aos_machine, fpu_restore)));
00858 }
00859
00860 static void restore_fpu_state( struct aos_compilation *cp )
00861 {
00862 x87_fnclex(cp->func);
00863 x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
00864 Offset(struct aos_machine, fpu_restore)));
00865 }
00866
00867 static void set_fpu_round_neg_inf( struct aos_compilation *cp )
00868 {
00869 if (cp->fpucntl != FPU_RND_NEG) {
00870 cp->fpucntl = FPU_RND_NEG;
00871 x87_fnclex(cp->func);
00872 x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
00873 Offset(struct aos_machine, fpu_rnd_neg_inf)));
00874 }
00875 }
00876
00877 static void set_fpu_round_nearest( struct aos_compilation *cp )
00878 {
00879 if (cp->fpucntl != FPU_RND_NEAREST) {
00880 cp->fpucntl = FPU_RND_NEAREST;
00881 x87_fnclex(cp->func);
00882 x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
00883 Offset(struct aos_machine, fpu_rnd_nearest)));
00884 }
00885 }
00886
00887
00888 static void x87_emit_ex2( struct aos_compilation *cp )
00889 {
00890 struct x86_reg st0 = x86_make_reg(file_x87, 0);
00891 struct x86_reg st1 = x86_make_reg(file_x87, 1);
00892 int stack = cp->func->x87_stack;
00893
00894
00895
00896 x87_fld(cp->func, st0);
00897 x87_fprndint( cp->func );
00898 x87_fsubr(cp->func, st1, st0);
00899 x87_fxch(cp->func, st1);
00900 x87_f2xm1(cp->func);
00901 x87_fld1(cp->func);
00902 x87_faddp(cp->func, st1);
00903 x87_fscale(cp->func);
00904
00905 x87_fstp(cp->func, st1);
00906
00907 assert( stack == cp->func->x87_stack);
00908
00909 }
00910
00911 static void PIPE_CDECL print_reg( const char *msg,
00912 const float *reg )
00913 {
00914 debug_printf("%s: %f %f %f %f\n", msg, reg[0], reg[1], reg[2], reg[3]);
00915 }
00916
00917 static void emit_print( struct aos_compilation *cp,
00918 const char *message,
00919 unsigned file,
00920 unsigned idx )
00921 {
00922 struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
00923 struct x86_reg arg = aos_get_shader_reg_ptr( cp, file, idx );
00924 unsigned i;
00925
00926
00927
00928
00929 assert(cp->func->x87_stack == 0);
00930
00931
00932
00933
00934
00935 for (i = 0; i < 8; i++) {
00936 if (cp->xmm[i].dirty)
00937 spill(cp, i);
00938
00939 aos_release_xmm_reg(cp, i);
00940 }
00941
00942
00943
00944 x86_cdecl_caller_push_regs( cp->func );
00945
00946
00947
00948
00949 x86_lea( cp->func, ecx, arg );
00950 x86_push( cp->func, ecx );
00951 x86_push_imm32( cp->func, (int)message );
00952
00953
00954
00955
00956 x86_mov_reg_imm( cp->func, ecx, (int)print_reg );
00957 x86_call( cp->func, ecx );
00958 x86_pop( cp->func, ecx );
00959 x86_pop( cp->func, ecx );
00960
00961
00962
00963 x86_cdecl_caller_pop_regs( cp->func );
00964
00965
00966
00967 }
00968
00974 static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
00975 {
00976 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
00977 struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
00978 struct x86_reg tmp = aos_get_xmm_reg(cp);
00979
00980 sse_movaps(cp->func, tmp, arg0);
00981 sse_mulps(cp->func, tmp, neg);
00982 sse_maxps(cp->func, tmp, arg0);
00983
00984 store_dest(cp, &op->FullDstRegisters[0], tmp);
00985 return TRUE;
00986 }
00987
00988 static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
00989 {
00990 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
00991 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
00992 struct x86_reg dst = get_xmm_writable(cp, arg0);
00993
00994 sse_addps(cp->func, dst, arg1);
00995
00996 store_dest(cp, &op->FullDstRegisters[0], dst);
00997 return TRUE;
00998 }
00999
01000 static boolean emit_COS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01001 {
01002 x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
01003 x87_fcos(cp->func);
01004 x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
01005 return TRUE;
01006 }
01007
01008
01009
01010
01011 static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01012 {
01013 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01014 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
01015 struct x86_reg tmp = aos_get_xmm_reg(cp);
01016 struct x86_reg dst = get_xmm_writable(cp, arg0);
01017
01018 sse_mulps(cp->func, dst, arg1);
01019
01020
01021 sse_movhlps(cp->func, tmp, dst);
01022 sse_addss(cp->func, dst, tmp);
01023 emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
01024 sse_addss(cp->func, dst, tmp);
01025
01026 aos_release_xmm_reg(cp, tmp.idx);
01027 store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
01028 return TRUE;
01029 }
01030
01031 static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01032 {
01033 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01034 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
01035 struct x86_reg tmp = aos_get_xmm_reg(cp);
01036 struct x86_reg dst = get_xmm_writable(cp, arg0);
01037
01038 sse_mulps(cp->func, dst, arg1);
01039
01040
01041
01042 sse_movhlps(cp->func, tmp, dst);
01043 sse_addps(cp->func, dst, tmp);
01044 emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
01045 sse_addss(cp->func, dst, tmp);
01046
01047 aos_release_xmm_reg(cp, tmp.idx);
01048 store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
01049 return TRUE;
01050 }
01051
01052 static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01053 {
01054 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01055 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
01056 struct x86_reg tmp = aos_get_xmm_reg(cp);
01057 struct x86_reg dst = get_xmm_writable(cp, arg0);
01058
01059 sse_mulps(cp->func, dst, arg1);
01060
01061
01062
01063 sse_movhlps(cp->func, tmp, dst);
01064 sse_addss(cp->func, dst, tmp);
01065 emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
01066 sse_addss(cp->func, dst, tmp);
01067 emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W));
01068 sse_addss(cp->func, dst, tmp);
01069
01070 aos_release_xmm_reg(cp, tmp.idx);
01071 store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
01072 return TRUE;
01073 }
01074
01075 static boolean emit_DST( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01076 {
01077 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01078 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
01079 struct x86_reg dst = aos_get_xmm_reg(cp);
01080 struct x86_reg tmp = aos_get_xmm_reg(cp);
01081 struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
01082
01083
01084
01085
01086
01087
01088 emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y));
01089 emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W));
01090 sse_mulps(cp->func, dst, tmp);
01091
01092 aos_release_xmm_reg(cp, tmp.idx);
01093 store_dest(cp, &op->FullDstRegisters[0], dst);
01094 return TRUE;
01095 }
01096
01097 static boolean emit_LG2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01098 {
01099 x87_fld1(cp->func);
01100 x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
01101 x87_fyl2x(cp->func);
01102 x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
01103 return TRUE;
01104 }
01105
01106
01107 static boolean emit_EX2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01108 {
01109 x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
01110 x87_emit_ex2(cp);
01111 x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
01112 return TRUE;
01113 }
01114
01115
01116 static boolean emit_FLR( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01117 {
01118 struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);
01119 unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
01120 int i;
01121
01122 set_fpu_round_neg_inf( cp );
01123
01124
01125
01126 for (i = 3; i >= 0; i--) {
01127 if (writemask & (1<<i)) {
01128 x87_fld_src(cp, &op->FullSrcRegisters[0], i);
01129 }
01130 }
01131
01132 for (i = 0; i < 4; i++) {
01133 if (writemask & (1<<i)) {
01134 x87_fprndint( cp->func );
01135 x87_fstp(cp->func, x86_make_disp(dst, i*4));
01136 }
01137 }
01138
01139 return TRUE;
01140 }
01141
01142
01143 static boolean emit_RND( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01144 {
01145 struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);
01146 unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
01147 int i;
01148
01149 set_fpu_round_nearest( cp );
01150
01151
01152
01153 for (i = 3; i >= 0; i--) {
01154 if (writemask & (1<<i)) {
01155 x87_fld_src(cp, &op->FullSrcRegisters[0], i);
01156 }
01157 }
01158
01159 for (i = 0; i < 4; i++) {
01160 if (writemask & (1<<i)) {
01161 x87_fprndint( cp->func );
01162 x87_fstp(cp->func, x86_make_disp(dst, i*4));
01163 }
01164 }
01165
01166 return TRUE;
01167 }
01168
01169
01170 static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01171 {
01172 struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);
01173 struct x86_reg st0 = x86_make_reg(file_x87, 0);
01174 struct x86_reg st1 = x86_make_reg(file_x87, 1);
01175 unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
01176 int i;
01177
01178 set_fpu_round_neg_inf( cp );
01179
01180
01181
01182
01183 for (i = 3; i >= 0; i--) {
01184 if (writemask & (1<<i)) {
01185 x87_fld_src(cp, &op->FullSrcRegisters[0], i);
01186 }
01187 }
01188
01189 for (i = 0; i < 4; i++) {
01190 if (writemask & (1<<i)) {
01191 x87_fld(cp->func, st0);
01192 x87_fprndint( cp->func );
01193 x87_fsubp(cp->func, st1);
01194 x87_fstp(cp->func, x86_make_disp(dst, i*4));
01195 }
01196 }
01197
01198 return TRUE;
01199 }
01200
01201
01202
01203
01204
01205
01206 static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01207 {
01208 struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
01209 unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
01210 unsigned lit_count = cp->lit_count++;
01211 struct x86_reg result, arg0;
01212 unsigned i;
01213
01214 #if 1
01215
01216
01217
01218 for (i = 0; i < 8; i++) {
01219 if (cp->xmm[i].dirty)
01220 spill(cp, i);
01221 aos_release_xmm_reg(cp, i);
01222 }
01223 #endif
01224
01225 if (writemask != TGSI_WRITEMASK_XYZW)
01226 result = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, tmp[0]));
01227 else
01228 result = get_dst_ptr(cp, &op->FullDstRegisters[0]);
01229
01230
01231 arg0 = fetch_src( cp, &op->FullSrcRegisters[0] );
01232 if (arg0.file == file_XMM) {
01233 struct x86_reg tmp = x86_make_disp(cp->machine_EDX,
01234 Offset(struct aos_machine, tmp[1]));
01235 sse_movaps( cp->func, tmp, arg0 );
01236 arg0 = tmp;
01237 }
01238
01239
01240
01241
01242
01243 x86_cdecl_caller_push_regs( cp->func );
01244
01245
01246
01247 x86_push_imm32( cp->func, lit_count );
01248
01249 x86_lea( cp->func, ecx, arg0 );
01250 x86_push( cp->func, ecx );
01251
01252 x86_lea( cp->func, ecx, result );
01253 x86_push( cp->func, ecx );
01254
01255 x86_push( cp->func, cp->machine_EDX );
01256
01257 if (lit_count < MAX_LIT_INFO) {
01258 x86_mov( cp->func, ecx, x86_make_disp( cp->machine_EDX,
01259 Offset(struct aos_machine, lit_info) +
01260 lit_count * sizeof(struct lit_info) +
01261 Offset(struct lit_info, func)));
01262 }
01263 else {
01264 x86_mov_reg_imm( cp->func, ecx, (int)aos_do_lit );
01265 }
01266
01267 x86_call( cp->func, ecx );
01268
01269 x86_pop( cp->func, ecx );
01270 x86_pop( cp->func, ecx );
01271 x86_pop( cp->func, ecx );
01272 x86_pop( cp->func, ecx );
01273
01274 x86_cdecl_caller_pop_regs( cp->func );
01275
01276 if (writemask != TGSI_WRITEMASK_XYZW) {
01277 store_dest( cp,
01278 &op->FullDstRegisters[0],
01279 get_xmm_writable( cp, result ) );
01280 }
01281
01282 return TRUE;
01283 }
01284
#if 0
/**
 * Disabled alternative to emit_LIT() that computes the result inline on
 * the x87 unit instead of calling a helper function.
 *
 * The Y and Z channels are produced by the first part, using
 * compare/conditional-move sequences plus fyl2x and x87_emit_ex2().
 * The X and W channels are simply set to 1.0.
 *
 * \return TRUE always
 */
static boolean emit_inline_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg out = get_dst_ptr(cp, &op->FullDstRegisters[0]);
   const unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;

   if (writemask & TGSI_WRITEMASK_YZ) {
      struct x86_reg st1 = x86_make_reg(file_x87, 1);
      struct x86_reg st2 = x86_make_reg(file_x87, 2);

      /* Load src.y together with 0.0 and a replacement value;
       * presumably src.y is replaced when it compares below zero.
       */
      x87_fldz(cp->func);
#if 1
      x87_fld1(cp->func);
#else
      x87_fldz(cp->func);
#endif
      x87_fld_src(cp, &op->FullSrcRegisters[0], 1);
      x87_fcomi(cp->func, st2);
      x87_fcmovb(cp->func, st1);
      x87_fstp(cp->func, st1);
      x87_fstp(cp->func, st1);

      /* Bring in src.w and order the two operands for fyl2x. */
      x87_fld_src(cp, &op->FullSrcRegisters[0], 3);
      x87_fxch(cp->func, st1);

      /* fyl2x followed by ex2: presumably a power function. */
      x87_fyl2x(cp->func);
      x87_emit_ex2( cp );

      /* Load src.x together with 0.0; presumably src.x is replaced by
       * 0.0 when it compares below zero.
       */
      x87_fldz(cp->func);
      x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
      x87_fcomi(cp->func, st1);
      x87_fcmovb(cp->func, st1);

      /* Channel Y receives the value now on top of the stack. */
      x87_fst_or_nop(cp->func, writemask, 1, out);

      x87_fcomi(cp->func, st1);
      x87_fcmovnbe(cp->func, st2);

      /* Channel Z, then discard the two remaining stack entries. */
      x87_fstp_or_pop(cp->func, writemask, 2, out);
      x87_fpop(cp->func);
      x87_fpop(cp->func);
   }

   if (writemask & TGSI_WRITEMASK_XW) {
      /* X and W are both 1.0. */
      x87_fld1(cp->func);
      x87_fst_or_nop(cp->func, writemask, 0, out);
      x87_fstp_or_pop(cp->func, writemask, 3, out);
   }

   return TRUE;
}
#endif
01347
01348
01349
01350 static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01351 {
01352 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01353 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
01354 struct x86_reg dst = get_xmm_writable(cp, arg0);
01355
01356 sse_maxps(cp->func, dst, arg1);
01357
01358 store_dest(cp, &op->FullDstRegisters[0], dst);
01359 return TRUE;
01360 }
01361
01362
01363 static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01364 {
01365 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01366 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
01367 struct x86_reg dst = get_xmm_writable(cp, arg0);
01368
01369 sse_minps(cp->func, dst, arg1);
01370
01371 store_dest(cp, &op->FullDstRegisters[0], dst);
01372 return TRUE;
01373 }
01374
01375 static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01376 {
01377 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01378 struct x86_reg dst = get_xmm_writable(cp, arg0);
01379
01380
01381
01382 store_dest(cp, &op->FullDstRegisters[0], dst);
01383 return TRUE;
01384 }
01385
01386 static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01387 {
01388 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01389 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
01390 struct x86_reg dst = get_xmm_writable(cp, arg0);
01391
01392 sse_mulps(cp->func, dst, arg1);
01393
01394 store_dest(cp, &op->FullDstRegisters[0], dst);
01395 return TRUE;
01396 }
01397
01398
01399 static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01400 {
01401 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01402 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
01403 struct x86_reg arg2 = fetch_src(cp, &op->FullSrcRegisters[2]);
01404
01405
01406
01407
01408 arg0 = get_xmm_writable(cp, arg0);
01409
01410 sse_mulps(cp->func, arg0, arg1);
01411 sse_addps(cp->func, arg0, arg2);
01412 store_dest(cp, &op->FullDstRegisters[0], arg0);
01413 return TRUE;
01414 }
01415
01416
01417
01418
01419
01420
01421 static float PIPE_CDECL _powerf( float x, float y )
01422 {
01423 #if FAST_MATH
01424 return util_fast_pow(x, y);
01425 #else
01426 return powf( x, y );
01427 #endif
01428 }
01429
#if FAST_MATH
/**
 * C helper called from generated code (see emit_EXPBASE2) to compute
 * 2^x via util_fast_exp2().  Only built when FAST_MATH is enabled.
 */
static float PIPE_CDECL _exp2(float x)
{
   float result = util_fast_exp2(x);

   return result;
}
#endif
01436
01437
01438
01439
01440
01441 static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01442 {
01443 #if 0
01444 x87_fld_src(cp, &op->FullSrcRegisters[1], 0);
01445 x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
01446 x87_fyl2x(cp->func);
01447
01448 x87_emit_ex2( cp );
01449
01450 x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
01451 #else
01452 uint i;
01453
01454
01455
01456
01457 for (i = 0; i < 8; i++) {
01458 if (cp->xmm[i].dirty)
01459 spill(cp, i);
01460 aos_release_xmm_reg(cp, i);
01461 }
01462
01463
01464
01465 x86_cdecl_caller_push_regs( cp->func );
01466
01467 x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -8) );
01468
01469 x87_fld_src( cp, &op->FullSrcRegisters[1], 0 );
01470 x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 4 ) );
01471 x87_fld_src( cp, &op->FullSrcRegisters[0], 0 );
01472 x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );
01473
01474
01475 x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _powerf );
01476 x86_call( cp->func, cp->tmp_EAX );
01477
01478 x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 8) );
01479
01480 x86_cdecl_caller_pop_regs( cp->func );
01481
01482
01483
01484 cp->func->x87_stack++;
01485
01486 x87_fstp_dest4( cp, &op->FullDstRegisters[0] );
01487 #endif
01488 return TRUE;
01489 }
01490
01491
#if FAST_MATH
/**
 * Emit code for TGSI EXPBASE2 by calling the C helper _exp2() from the
 * generated code.  Only built when FAST_MATH is enabled.
 *
 * Sequence emitted:
 *   - spill every dirty XMM register and release all eight of them,
 *   - push the cdecl caller-saved registers,
 *   - reserve 4 bytes of stack and store the float argument at [esp+0]
 *     via the x87 unit,
 *   - call _exp2 through tmp_EAX,
 *   - release the argument space and pop the saved registers,
 *   - store the x87 return value to the destination.
 *
 * \param cp  compilation state
 * \param op  the instruction being translated
 * \return TRUE always
 */
static boolean emit_EXPBASE2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   const int arg_bytes = 4;   /* one float argument */
   uint reg = 0;

   /* Flush and drop all XMM register contents before the call. */
   while (reg < 8) {
      if (cp->xmm[reg].dirty)
         spill(cp, reg);
      aos_release_xmm_reg(cp, reg);
      reg++;
   }

   x86_cdecl_caller_push_regs( cp->func );

   /* Make room for the argument. */
   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -arg_bytes) );

   x87_fld_src( cp, &op->FullSrcRegisters[0], 0 );
   x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );

   /* tmp_EAX is among the registers pushed above, so it is free to
    * hold the call target here.
    */
   x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _exp2 );
   x86_call( cp->func, cp->tmp_EAX );

   /* Drop the argument space again. */
   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, arg_bytes) );

   x86_cdecl_caller_pop_regs( cp->func );

   /* The callee leaves its float result on the x87 stack; account for
    * it in the tracked stack depth before storing it.
    */
   cp->func->x87_stack++;

   x87_fstp_dest4( cp, &op->FullDstRegisters[0] );

   return TRUE;
}
#endif
01532
01533
01534 static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01535 {
01536 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01537 struct x86_reg dst = aos_get_xmm_reg(cp);
01538
01539 if (cp->have_sse2) {
01540 sse2_rcpss(cp->func, dst, arg0);
01541
01542
01543 }
01544 else {
01545 struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
01546 sse_movss(cp->func, dst, ones);
01547 sse_divss(cp->func, dst, arg0);
01548 }
01549
01550 store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
01551 return TRUE;
01552 }
01553
01554
01555
01556
01557
01558
01559
01560
01561
01562
01563
01564
01565
01566
01567 static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01568 {
01569
01570 if (0) {
01571 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01572 struct x86_reg r = aos_get_xmm_reg(cp);
01573 sse_rsqrtss(cp->func, r, arg0);
01574 store_scalar_dest(cp, &op->FullDstRegisters[0], r);
01575 return TRUE;
01576 }
01577 else {
01578 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01579 struct x86_reg r = aos_get_xmm_reg(cp);
01580
01581 struct x86_reg neg_half = get_reg_ptr( cp, AOS_FILE_INTERNAL, IMM_RSQ );
01582 struct x86_reg one_point_five = x86_make_disp( neg_half, 4 );
01583 struct x86_reg src = get_xmm_writable( cp, arg0 );
01584
01585 sse_rsqrtss( cp->func, r, src );
01586 sse_mulss( cp->func, src, neg_half );
01587 sse_mulss( cp->func, src, r );
01588 sse_mulss( cp->func, src, r );
01589 sse_addss( cp->func, src, one_point_five );
01590 sse_mulss( cp->func, r, src );
01591
01592 store_scalar_dest(cp, &op->FullDstRegisters[0], r);
01593 return TRUE;
01594 }
01595 }
01596
01597
01598 static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01599 {
01600 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01601 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
01602 struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
01603 struct x86_reg dst = get_xmm_writable(cp, arg0);
01604
01605 sse_cmpps(cp->func, dst, arg1, cc_NotLessThan);
01606 sse_andps(cp->func, dst, ones);
01607
01608 store_dest(cp, &op->FullDstRegisters[0], dst);
01609 return TRUE;
01610 }
01611
01612 static boolean emit_SIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01613 {
01614 x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
01615 x87_fsin(cp->func);
01616 x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
01617 return TRUE;
01618 }
01619
01620
01621
01622 static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01623 {
01624 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01625 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
01626 struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
01627 struct x86_reg dst = get_xmm_writable(cp, arg0);
01628
01629 sse_cmpps(cp->func, dst, arg1, cc_LessThan);
01630 sse_andps(cp->func, dst, ones);
01631
01632 store_dest(cp, &op->FullDstRegisters[0], dst);
01633 return TRUE;
01634 }
01635
01636 static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01637 {
01638 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01639 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
01640 struct x86_reg dst = get_xmm_writable(cp, arg0);
01641
01642 sse_subps(cp->func, dst, arg1);
01643
01644 store_dest(cp, &op->FullDstRegisters[0], dst);
01645 return TRUE;
01646 }
01647
01648 static boolean emit_TRUNC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01649 {
01650 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01651 struct x86_reg tmp0 = aos_get_xmm_reg(cp);
01652
01653 sse2_cvttps2dq(cp->func, tmp0, arg0);
01654 sse2_cvtdq2ps(cp->func, tmp0, tmp0);
01655
01656 store_dest(cp, &op->FullDstRegisters[0], tmp0);
01657 return TRUE;
01658 }
01659
01660 static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01661 {
01662 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01663 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
01664 struct x86_reg tmp0 = aos_get_xmm_reg(cp);
01665 struct x86_reg tmp1 = aos_get_xmm_reg(cp);
01666
01667 emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W));
01668 sse_mulps(cp->func, tmp1, arg0);
01669 emit_pshufd(cp, tmp0, arg0, SHUF(Y, Z, X, W));
01670 sse_mulps(cp->func, tmp0, arg1);
01671 sse_subps(cp->func, tmp1, tmp0);
01672 sse_shufps(cp->func, tmp1, tmp1, SHUF(Y, Z, X, W));
01673
01674
01675
01676
01677
01678
01679
01680 aos_release_xmm_reg(cp, tmp0.idx);
01681 store_dest(cp, &op->FullDstRegisters[0], tmp1);
01682 return TRUE;
01683 }
01684
01685
01686
01687 static boolean
01688 emit_instruction( struct aos_compilation *cp,
01689 struct tgsi_full_instruction *inst )
01690 {
01691 x87_assert_stack_empty(cp->func);
01692
01693 switch( inst->Instruction.Opcode ) {
01694 case TGSI_OPCODE_MOV:
01695 return emit_MOV( cp, inst );
01696
01697 case TGSI_OPCODE_LIT:
01698 return emit_LIT(cp, inst);
01699
01700 case TGSI_OPCODE_RCP:
01701 return emit_RCP(cp, inst);
01702
01703 case TGSI_OPCODE_RSQ:
01704 return emit_RSQ(cp, inst);
01705
01706 case TGSI_OPCODE_EXP:
01707
01708 return FALSE;
01709
01710 case TGSI_OPCODE_LOG:
01711
01712 return FALSE;
01713
01714 case TGSI_OPCODE_MUL:
01715 return emit_MUL(cp, inst);
01716
01717 case TGSI_OPCODE_ADD:
01718 return emit_ADD(cp, inst);
01719
01720 case TGSI_OPCODE_DP3:
01721 return emit_DP3(cp, inst);
01722
01723 case TGSI_OPCODE_DP4:
01724 return emit_DP4(cp, inst);
01725
01726 case TGSI_OPCODE_DST:
01727 return emit_DST(cp, inst);
01728
01729 case TGSI_OPCODE_MIN:
01730 return emit_MIN(cp, inst);
01731
01732 case TGSI_OPCODE_MAX:
01733 return emit_MAX(cp, inst);
01734
01735 case TGSI_OPCODE_SLT:
01736 return emit_SLT(cp, inst);
01737
01738 case TGSI_OPCODE_SGE:
01739 return emit_SGE(cp, inst);
01740
01741 case TGSI_OPCODE_MAD:
01742 return emit_MAD(cp, inst);
01743
01744 case TGSI_OPCODE_SUB:
01745 return emit_SUB(cp, inst);
01746
01747 case TGSI_OPCODE_LERP:
01748
01749 return FALSE;
01750
01751 case TGSI_OPCODE_FRAC:
01752 return emit_FRC(cp, inst);
01753
01754 case TGSI_OPCODE_CLAMP:
01755
01756 return FALSE;
01757
01758 case TGSI_OPCODE_FLOOR:
01759 return emit_FLR(cp, inst);
01760
01761 case TGSI_OPCODE_ROUND:
01762 return emit_RND(cp, inst);
01763
01764 case TGSI_OPCODE_EXPBASE2:
01765 #if FAST_MATH
01766 return emit_EXPBASE2(cp, inst);
01767 #elif 0
01768
01769
01770
01771 return emit_EX2(cp, inst);
01772 #else
01773 return FALSE;
01774 #endif
01775
01776 case TGSI_OPCODE_LOGBASE2:
01777 return emit_LG2(cp, inst);
01778
01779 case TGSI_OPCODE_POWER:
01780 return emit_POW(cp, inst);
01781
01782 case TGSI_OPCODE_CROSSPRODUCT:
01783 return emit_XPD(cp, inst);
01784
01785 case TGSI_OPCODE_ABS:
01786 return emit_ABS(cp, inst);
01787
01788 case TGSI_OPCODE_DPH:
01789 return emit_DPH(cp, inst);
01790
01791 case TGSI_OPCODE_COS:
01792 return emit_COS(cp, inst);
01793
01794 case TGSI_OPCODE_SIN:
01795 return emit_SIN(cp, inst);
01796
01797 case TGSI_OPCODE_TRUNC:
01798 return emit_TRUNC(cp, inst);
01799
01800 case TGSI_OPCODE_END:
01801 return TRUE;
01802
01803 default:
01804 return FALSE;
01805 }
01806 }
01807
01808
01809 static boolean emit_viewport( struct aos_compilation *cp )
01810 {
01811 struct x86_reg pos = aos_get_shader_reg_xmm(cp,
01812 TGSI_FILE_OUTPUT,
01813 cp->vaos->draw->vs.position_output );
01814
01815 struct x86_reg scale = x86_make_disp(cp->machine_EDX,
01816 Offset(struct aos_machine, scale));
01817
01818 struct x86_reg translate = x86_make_disp(cp->machine_EDX,
01819 Offset(struct aos_machine, translate));
01820
01821 sse_mulps(cp->func, pos, scale);
01822 sse_addps(cp->func, pos, translate);
01823
01824 aos_adopt_xmm_reg( cp,
01825 pos,
01826 TGSI_FILE_OUTPUT,
01827 cp->vaos->draw->vs.position_output,
01828 TRUE );
01829 return TRUE;
01830 }
01831
01832
01833
01834
01835
01836
01837 static boolean emit_rhw_viewport( struct aos_compilation *cp )
01838 {
01839 struct x86_reg tmp = aos_get_xmm_reg(cp);
01840 struct x86_reg pos = aos_get_shader_reg_xmm(cp,
01841 TGSI_FILE_OUTPUT,
01842 cp->vaos->draw->vs.position_output);
01843
01844 struct x86_reg scale = x86_make_disp(cp->machine_EDX,
01845 Offset(struct aos_machine, scale));
01846
01847 struct x86_reg translate = x86_make_disp(cp->machine_EDX,
01848 Offset(struct aos_machine, translate));
01849
01850
01851
01852 emit_pshufd(cp, tmp, pos, SHUF(W, W, W, W));
01853 sse2_rcpss(cp->func, tmp, tmp);
01854 sse_shufps(cp->func, tmp, tmp, SHUF(X, X, X, X));
01855
01856 sse_mulps(cp->func, pos, scale);
01857 sse_mulps(cp->func, pos, tmp);
01858 sse_addps(cp->func, pos, translate);
01859
01860
01861
01862 mask_write(cp, pos, tmp, TGSI_WRITEMASK_W);
01863
01864 aos_adopt_xmm_reg( cp,
01865 pos,
01866 TGSI_FILE_OUTPUT,
01867 cp->vaos->draw->vs.position_output,
01868 TRUE );
01869 return TRUE;
01870 }
01871
01872
#if 0
/**
 * Disabled (compiled out): copy the float components of a TGSI
 * immediate into the next free slot of the machine's immediate table
 * and advance cp->num_immediates.
 *
 * \return TRUE always
 */
static boolean note_immediate( struct aos_compilation *cp,
                               struct tgsi_full_immediate *imm )
{
   unsigned slot = cp->num_immediates++;
   unsigned chan = 0;

   while (chan < imm->Immediate.Size) {
      cp->vaos->machine->immediate[slot][chan] = imm->u.ImmediateFloat32[chan].Float;
      chan++;
   }

   return TRUE;
}
#endif
01887
01888
01889
01890
01891 static void find_last_write_outputs( struct aos_compilation *cp )
01892 {
01893 struct tgsi_parse_context parse;
01894 unsigned this_instruction = 0;
01895 unsigned i;
01896
01897 tgsi_parse_init( &parse, cp->vaos->base.vs->state.tokens );
01898
01899 while (!tgsi_parse_end_of_tokens( &parse )) {
01900
01901 tgsi_parse_token( &parse );
01902
01903 if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION)
01904 continue;
01905
01906 for (i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++) {
01907 if (parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.File ==
01908 TGSI_FILE_OUTPUT)
01909 {
01910 unsigned idx = parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.Index;
01911 cp->output_last_write[idx] = this_instruction;
01912 }
01913 }
01914
01915 this_instruction++;
01916 }
01917
01918 tgsi_parse_free( &parse );
01919 }
01920
01921
01922 #define ARG_MACHINE 1
01923 #define ARG_START_ELTS 2
01924 #define ARG_COUNT 3
01925 #define ARG_OUTBUF 4
01926
01927
01928 static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient,
01929 boolean linear )
01930 {
01931 struct tgsi_parse_context parse;
01932 struct aos_compilation cp;
01933 unsigned fixup, label;
01934
01935 util_init_math();
01936
01937 tgsi_parse_init( &parse, varient->base.vs->state.tokens );
01938
01939 memset(&cp, 0, sizeof(cp));
01940
01941 cp.insn_counter = 1;
01942 cp.vaos = varient;
01943 cp.have_sse2 = 1;
01944 cp.func = &varient->func[ linear ? 0 : 1 ];
01945
01946 cp.tmp_EAX = x86_make_reg(file_REG32, reg_AX);
01947 cp.idx_EBX = x86_make_reg(file_REG32, reg_BX);
01948 cp.outbuf_ECX = x86_make_reg(file_REG32, reg_CX);
01949 cp.machine_EDX = x86_make_reg(file_REG32, reg_DX);
01950 cp.count_ESI = x86_make_reg(file_REG32, reg_SI);
01951 cp.temp_EBP = x86_make_reg(file_REG32, reg_BP);
01952 cp.stack_ESP = x86_make_reg( file_REG32, reg_SP );
01953
01954 x86_init_func(cp.func);
01955
01956 find_last_write_outputs(&cp);
01957
01958 x86_push(cp.func, cp.idx_EBX);
01959 x86_push(cp.func, cp.count_ESI);
01960 x86_push(cp.func, cp.temp_EBP);
01961
01962
01963
01964
01965 x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_MACHINE));
01966 x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS));
01967 x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT));
01968 x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF));
01969
01970
01971
01972
01973 x86_xor(cp.func, cp.tmp_EAX, cp.tmp_EAX);
01974 x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX);
01975 fixup = x86_jcc_forward(cp.func, cc_E);
01976
01977
01978 save_fpu_state( &cp );
01979 set_fpu_round_nearest( &cp );
01980
01981 aos_init_inputs( &cp, linear );
01982
01983 cp.x86_reg[0] = 0;
01984 cp.x86_reg[1] = 0;
01985
01986
01987
01988 label = x86_get_label(cp.func);
01989 {
01990
01991
01992 if (!aos_fetch_inputs( &cp, linear ))
01993 goto fail;
01994
01995
01996
01997 while( !tgsi_parse_end_of_tokens( &parse ) && !cp.error )
01998 {
01999 tgsi_parse_token( &parse );
02000
02001 switch (parse.FullToken.Token.Type) {
02002 case TGSI_TOKEN_TYPE_IMMEDIATE:
02003 #if 0
02004 if (!note_immediate( &cp, &parse.FullToken.FullImmediate ))
02005 goto fail;
02006 #endif
02007 break;
02008
02009 case TGSI_TOKEN_TYPE_INSTRUCTION:
02010 if (DISASSEM)
02011 tgsi_dump_instruction( &parse.FullToken.FullInstruction, cp.insn_counter );
02012
02013 if (!emit_instruction( &cp, &parse.FullToken.FullInstruction ))
02014 goto fail;
02015 break;
02016 }
02017
02018 x87_assert_stack_empty(cp.func);
02019 cp.insn_counter++;
02020
02021 if (DISASSEM)
02022 debug_printf("\n");
02023 }
02024
02025
02026 {
02027 unsigned i;
02028 for (i = 0; i < 8; i++) {
02029 if (cp.xmm[i].file != TGSI_FILE_OUTPUT) {
02030 cp.xmm[i].file = TGSI_FILE_NULL;
02031 cp.xmm[i].dirty = 0;
02032 }
02033 }
02034 }
02035
02036 if (cp.error)
02037 goto fail;
02038
02039 if (cp.vaos->base.key.clip) {
02040
02041
02042
02043 emit_rhw_viewport(&cp);
02044 }
02045 else if (cp.vaos->base.key.viewport) {
02046 emit_viewport(&cp);
02047 }
02048
02049
02050
02051
02052 if (!aos_emit_outputs( &cp ))
02053 goto fail;
02054
02055
02056
02057
02058 x86_lea(cp.func,
02059 cp.outbuf_ECX,
02060 x86_make_disp(cp.outbuf_ECX,
02061 cp.vaos->base.key.output_stride));
02062
02063
02064
02065 aos_incr_inputs( &cp, linear );
02066 }
02067
02068
02069 x86_dec(cp.func, cp.count_ESI);
02070 x86_jcc(cp.func, cc_NZ, label);
02071
02072 restore_fpu_state(&cp);
02073
02074
02075
02076 x86_fixup_fwd_jump(cp.func, fixup);
02077
02078
02079
02080 if (cp.func->need_emms)
02081 mmx_emms(cp.func);
02082
02083 x86_pop(cp.func, cp.temp_EBP);
02084 x86_pop(cp.func, cp.count_ESI);
02085 x86_pop(cp.func, cp.idx_EBX);
02086
02087 x87_assert_stack_empty(cp.func);
02088 x86_ret(cp.func);
02089
02090 tgsi_parse_free( &parse );
02091 return !cp.error;
02092
02093 fail:
02094 tgsi_parse_free( &parse );
02095 return FALSE;
02096 }
02097
02098
02099
02100 static void vaos_set_buffer( struct draw_vs_varient *varient,
02101 unsigned buf,
02102 const void *ptr,
02103 unsigned stride )
02104 {
02105 struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
02106
02107 if (buf < vaos->nr_vb) {
02108 vaos->buffer[buf].base_ptr = (char *)ptr;
02109 vaos->buffer[buf].stride = stride;
02110 }
02111
02112 if (0) debug_printf("%s %d/%d: %p %d\n", __FUNCTION__, buf, vaos->nr_vb, ptr, stride);
02113 }
02114
02115
02116
02117 static void PIPE_CDECL vaos_run_elts( struct draw_vs_varient *varient,
02118 const unsigned *elts,
02119 unsigned count,
02120 void *output_buffer )
02121 {
02122 struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
02123 struct aos_machine *machine = vaos->draw->vs.aos_machine;
02124
02125 if (0) debug_printf("%s %d\n", __FUNCTION__, count);
02126
02127 machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
02128 machine->constants = vaos->draw->vs.aligned_constants;
02129 machine->immediates = vaos->base.vs->immediates;
02130 machine->buffer = vaos->buffer;
02131
02132 vaos->gen_run_elts( machine,
02133 elts,
02134 count,
02135 output_buffer );
02136 }
02137
02138 static void PIPE_CDECL vaos_run_linear( struct draw_vs_varient *varient,
02139 unsigned start,
02140 unsigned count,
02141 void *output_buffer )
02142 {
02143 struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
02144 struct aos_machine *machine = vaos->draw->vs.aos_machine;
02145
02146 if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__, start, count,
02147 vaos->base.key.const_vbuffers);
02148
02149 machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
02150 machine->constants = vaos->draw->vs.aligned_constants;
02151 machine->immediates = vaos->base.vs->immediates;
02152 machine->buffer = vaos->buffer;
02153
02154 vaos->gen_run_linear( machine,
02155 start,
02156 count,
02157 output_buffer );
02158
02159
02160 assert(machine->internal[IMM_ONES][0] == 1.0f);
02161 assert(machine->internal[IMM_IDENTITY][0] == 0.0f);
02162 assert(machine->internal[IMM_NEGS][0] == -1.0f);
02163 }
02164
02165
02166
02167 static void vaos_destroy( struct draw_vs_varient *varient )
02168 {
02169 struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
02170
02171 FREE( vaos->buffer );
02172
02173 x86_release_func( &vaos->func[0] );
02174 x86_release_func( &vaos->func[1] );
02175
02176 FREE(vaos);
02177 }
02178
02179
02180
02181 static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
02182 const struct draw_vs_varient_key *key )
02183 {
02184 unsigned i;
02185 struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse);
02186
02187 if (!vaos)
02188 goto fail;
02189
02190 vaos->base.key = *key;
02191 vaos->base.vs = vs;
02192 vaos->base.set_buffer = vaos_set_buffer;
02193 vaos->base.destroy = vaos_destroy;
02194 vaos->base.run_linear = vaos_run_linear;
02195 vaos->base.run_elts = vaos_run_elts;
02196
02197 vaos->draw = vs->draw;
02198
02199 for (i = 0; i < key->nr_inputs; i++)
02200 vaos->nr_vb = MAX2( vaos->nr_vb, key->element[i].in.buffer + 1 );
02201
02202 vaos->buffer = MALLOC( vaos->nr_vb * sizeof(vaos->buffer[0]) );
02203 if (!vaos->buffer)
02204 goto fail;
02205
02206 if (0)
02207 debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers);
02208
02209 #if 0
02210 tgsi_dump(vs->state.tokens, 0);
02211 #endif
02212
02213 if (!build_vertex_program( vaos, TRUE ))
02214 goto fail;
02215
02216 if (!build_vertex_program( vaos, FALSE ))
02217 goto fail;
02218
02219 vaos->gen_run_linear = (vaos_run_linear_func)x86_get_func(&vaos->func[0]);
02220 if (!vaos->gen_run_linear)
02221 goto fail;
02222
02223 vaos->gen_run_elts = (vaos_run_elts_func)x86_get_func(&vaos->func[1]);
02224 if (!vaos->gen_run_elts)
02225 goto fail;
02226
02227 return &vaos->base;
02228
02229 fail:
02230 if (vaos && vaos->buffer)
02231 FREE(vaos->buffer);
02232
02233 if (vaos)
02234 x86_release_func( &vaos->func[0] );
02235
02236 if (vaos)
02237 x86_release_func( &vaos->func[1] );
02238
02239 FREE(vaos);
02240
02241 return NULL;
02242 }
02243
02244
02245 struct draw_vs_varient *draw_vs_varient_aos_sse( struct draw_vertex_shader *vs,
02246 const struct draw_vs_varient_key *key )
02247 {
02248 struct draw_vs_varient *varient = varient_aos_sse( vs, key );
02249
02250 if (varient == NULL) {
02251 varient = draw_vs_varient_generic( vs, key );
02252 }
02253
02254 return varient;
02255 }
02256
02257
02258
02259 #endif