draw_vs_aos.c

Go to the documentation of this file.
00001 /*
00002  * Mesa 3-D graphics library
00003  * Version:  6.3
00004  *
00005  * Copyright (C) 1999-2004  Brian Paul   All Rights Reserved.
00006  *
00007  * Permission is hereby granted, free of charge, to any person obtaining a
00008  * copy of this software and associated documentation files (the "Software"),
00009  * to deal in the Software without restriction, including without limitation
00010  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
00011  * and/or sell copies of the Software, and to permit persons to whom the
00012  * Software is furnished to do so, subject to the following conditions:
00013  *
00014  * The above copyright notice and this permission notice shall be included
00015  * in all copies or substantial portions of the Software.
00016  *
00017  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
00018  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00019  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
00020  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
00021  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
00022  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
00023  */
00024 
00032 #include "util/u_memory.h"
00033 #include "util/u_math.h"
00034 #include "pipe/p_shader_tokens.h"
00035 #include "pipe/p_debug.h"
00036 #include "tgsi/tgsi_parse.h"
00037 #include "tgsi/tgsi_util.h"
00038 #include "tgsi/tgsi_exec.h"
00039 #include "tgsi/tgsi_dump.h"
00040 
00041 #include "draw_vs.h"
00042 #include "draw_vs_aos.h"
00043 
00044 #include "rtasm/rtasm_x86sse.h"
00045 
00046 #ifdef PIPE_ARCH_X86
00047 #define DISASSEM 0
00048 #define FAST_MATH 1
00049 
/* Printable names for shader register files, used only in debug
 * output.  The table is indexed with the register-file value kept in
 * cp->xmm[].file (see spill()).  Presumably the order follows the
 * TGSI_FILE_* numbering with the AOS-private "INTERNAL" file appended
 * last -- TODO confirm against the TGSI headers before reordering.
 */
static const char *files[] =
{
   "NULL",
   "CONST",
   "IN",
   "OUT",
   "TEMP",
   "SAMP",
   "ADDR",
   "IMM",
   "INTERNAL",
};
00062 
00063 static INLINE boolean eq( struct x86_reg a,
00064                             struct x86_reg b )
00065 {
00066    return (a.file == b.file &&
00067            a.idx == b.idx &&
00068            a.mod == b.mod &&
00069            a.disp == b.disp);
00070 }
00071       
00072 struct x86_reg aos_get_x86( struct aos_compilation *cp,
00073                             unsigned which_reg, /* quick hack */
00074                             unsigned value )
00075 {
00076    struct x86_reg reg;
00077 
00078    if (which_reg == 0)
00079       reg = cp->temp_EBP;
00080    else
00081       reg = cp->tmp_EAX;
00082 
00083    if (cp->x86_reg[which_reg] != value) {
00084       unsigned offset;
00085 
00086       switch (value) {
00087       case X86_IMMEDIATES:
00088          assert(which_reg == 0);
00089          offset = Offset(struct aos_machine, immediates);
00090          break;
00091       case X86_CONSTANTS:
00092          assert(which_reg == 1);
00093          offset = Offset(struct aos_machine, constants);
00094          break;
00095       case X86_BUFFERS:
00096          assert(which_reg == 0);
00097          offset = Offset(struct aos_machine, buffer);
00098          break;
00099       default:
00100          assert(0);
00101          offset = 0;
00102       }
00103 
00104 
00105       x86_mov(cp->func, reg, 
00106               x86_make_disp(cp->machine_EDX, offset));
00107 
00108       cp->x86_reg[which_reg] = value;
00109    }
00110 
00111    return reg;
00112 }
00113 
00114 
00115 static struct x86_reg get_reg_ptr(struct aos_compilation *cp,
00116                                   unsigned file,
00117                                   unsigned idx )
00118 {
00119    struct x86_reg ptr = cp->machine_EDX;
00120 
00121    switch (file) {
00122    case TGSI_FILE_INPUT:
00123       assert(idx < MAX_INPUTS);
00124       return x86_make_disp(ptr, Offset(struct aos_machine, input[idx]));
00125 
00126    case TGSI_FILE_OUTPUT:
00127       return x86_make_disp(ptr, Offset(struct aos_machine, output[idx]));
00128 
00129    case TGSI_FILE_TEMPORARY:
00130       assert(idx < MAX_TEMPS);
00131       return x86_make_disp(ptr, Offset(struct aos_machine, temp[idx]));
00132 
00133    case AOS_FILE_INTERNAL:
00134       assert(idx < MAX_INTERNALS);
00135       return x86_make_disp(ptr, Offset(struct aos_machine, internal[idx]));
00136 
00137    case TGSI_FILE_IMMEDIATE: 
00138       assert(idx < MAX_IMMEDIATES);  /* just a sanity check */
00139       return x86_make_disp(aos_get_x86(cp, 0, X86_IMMEDIATES), idx * 4 * sizeof(float));
00140 
00141    case TGSI_FILE_CONSTANT: 
00142       assert(idx < MAX_CONSTANTS);  /* just a sanity check */
00143       return x86_make_disp(aos_get_x86(cp, 1, X86_CONSTANTS), idx * 4 * sizeof(float));
00144 
00145    default:
00146       ERROR(cp, "unknown reg file");
00147       return x86_make_reg(0,0);
00148    }
00149 }
00150                 
00151 
00152 
/* Bit-field values for the x87 FPU control word, grouped as exception
 * bits (bits 0-5), precision control (bits 8-9), rounding control
 * (bits 10-11) and the infinity bit (bit 12).  The field meanings are
 * taken from the macro names -- NOTE(review): confirm against the
 * Intel SDM.  None of these macros is referenced in the portion of the
 * file visible here.
 */
#define X87_CW_EXCEPTION_INV_OP       (1<<0)
#define X87_CW_EXCEPTION_DENORM_OP    (1<<1)
#define X87_CW_EXCEPTION_ZERO_DIVIDE  (1<<2)
#define X87_CW_EXCEPTION_OVERFLOW     (1<<3)
#define X87_CW_EXCEPTION_UNDERFLOW    (1<<4)
#define X87_CW_EXCEPTION_PRECISION    (1<<5)
#define X87_CW_PRECISION_SINGLE       (0<<8)
#define X87_CW_PRECISION_RESERVED     (1<<8)
#define X87_CW_PRECISION_DOUBLE       (2<<8)
#define X87_CW_PRECISION_DOUBLE_EXT   (3<<8)
#define X87_CW_PRECISION_MASK         (3<<8)
#define X87_CW_ROUND_NEAREST          (0<<10)
#define X87_CW_ROUND_DOWN             (1<<10)
#define X87_CW_ROUND_UP               (2<<10)
#define X87_CW_ROUND_ZERO             (3<<10)
#define X87_CW_ROUND_MASK             (3<<10)
#define X87_CW_INFINITY               (1<<12)
00170 
00171 
00172 
00173 
00174 static void spill( struct aos_compilation *cp, unsigned idx )
00175 {
00176    if (!cp->xmm[idx].dirty ||
00177        (cp->xmm[idx].file != TGSI_FILE_INPUT && /* inputs are fetched into xmm & set dirty */
00178         cp->xmm[idx].file != TGSI_FILE_OUTPUT &&
00179         cp->xmm[idx].file != TGSI_FILE_TEMPORARY)) {
00180       ERROR(cp, "invalid spill");
00181       return;
00182    }
00183    else {
00184       struct x86_reg oldval = get_reg_ptr(cp,
00185                                           cp->xmm[idx].file,
00186                                           cp->xmm[idx].idx);
00187      
00188       if (0) debug_printf("\nspill %s[%d]", 
00189                           files[cp->xmm[idx].file],
00190                           cp->xmm[idx].idx);
00191  
00192       assert(cp->xmm[idx].dirty);
00193       sse_movaps(cp->func, oldval, x86_make_reg(file_XMM, idx));
00194       cp->xmm[idx].dirty = 0;
00195    }
00196 }
00197 
00198 
00199 void aos_spill_all( struct aos_compilation *cp )
00200 {
00201    unsigned i;
00202 
00203    for (i = 0; i < 8; i++) {
00204       if (cp->xmm[i].dirty) 
00205          spill(cp, i);
00206       aos_release_xmm_reg(cp, i);
00207    }
00208 }
00209 
00210 
00211 static struct x86_reg get_xmm_writable( struct aos_compilation *cp,
00212                                         struct x86_reg reg )
00213 {
00214    if (reg.file != file_XMM ||
00215        cp->xmm[reg.idx].file != TGSI_FILE_NULL)
00216    {
00217       struct x86_reg tmp = aos_get_xmm_reg(cp);
00218       sse_movaps(cp->func, tmp, reg);
00219       reg = tmp;
00220    }
00221 
00222    cp->xmm[reg.idx].last_used = cp->insn_counter;
00223    return reg;
00224 }
00225 
00226 static struct x86_reg get_xmm( struct aos_compilation *cp,
00227                                struct x86_reg reg )
00228 {
00229    if (reg.file != file_XMM) 
00230    {
00231       struct x86_reg tmp = aos_get_xmm_reg(cp);
00232       sse_movaps(cp->func, tmp, reg);
00233       reg = tmp;
00234    }
00235 
00236    cp->xmm[reg.idx].last_used = cp->insn_counter;
00237    return reg;
00238 }
00239 
00240 
00241 /* Allocate an empty xmm register, either as a temporary or later to
00242  * "adopt" as a shader reg.
00243  */
00244 struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp )
00245 {
00246    unsigned i;
00247    unsigned oldest = 0;
00248    boolean found = FALSE;
00249 
00250    for (i = 0; i < 8; i++) 
00251       if (cp->xmm[i].last_used != cp->insn_counter &&
00252           cp->xmm[i].file == TGSI_FILE_NULL) {
00253          oldest = i;
00254          found = TRUE;
00255       }
00256 
00257    if (!found) {
00258       for (i = 0; i < 8; i++) 
00259          if (cp->xmm[i].last_used < cp->xmm[oldest].last_used)
00260             oldest = i;
00261    }
00262 
00263    /* Need to write out the old value?
00264     */
00265    if (cp->xmm[oldest].dirty) 
00266       spill(cp, oldest);
00267 
00268    assert(cp->xmm[oldest].last_used != cp->insn_counter);
00269 
00270    cp->xmm[oldest].file = TGSI_FILE_NULL;
00271    cp->xmm[oldest].idx = 0;
00272    cp->xmm[oldest].dirty = 0;
00273    cp->xmm[oldest].last_used = cp->insn_counter;
00274    return x86_make_reg(file_XMM, oldest);
00275 }
00276 
00277 void aos_release_xmm_reg( struct aos_compilation *cp,
00278                           unsigned idx )
00279 {
00280    cp->xmm[idx].file = TGSI_FILE_NULL;
00281    cp->xmm[idx].idx = 0;
00282    cp->xmm[idx].dirty = 0;
00283    cp->xmm[idx].last_used = 0;
00284 }
00285 
00286 
00287 static void aos_soft_release_xmm( struct aos_compilation *cp,
00288                                   struct x86_reg reg )
00289 {
00290    if (reg.file == file_XMM) {
00291       assert(cp->xmm[reg.idx].last_used == cp->insn_counter);
00292       cp->xmm[reg.idx].last_used = cp->insn_counter - 1;
00293    }
00294 }
00295 
00296 
00297      
00298 /* Mark an xmm reg as holding the current copy of a shader reg.
00299  */
00300 void aos_adopt_xmm_reg( struct aos_compilation *cp,
00301                         struct x86_reg reg,
00302                         unsigned file,
00303                         unsigned idx,
00304                         unsigned dirty )
00305 {
00306    unsigned i;
00307 
00308    if (reg.file != file_XMM) {
00309       assert(0);
00310       return;
00311    }
00312 
00313 
00314    /* If any xmm reg thinks it holds this shader reg, break the
00315     * illusion.
00316     */
00317    for (i = 0; i < 8; i++) {
00318       if (cp->xmm[i].file == file && 
00319           cp->xmm[i].idx == idx) 
00320       {
00321          /* If an xmm reg is already holding this shader reg, take into account its
00322           * dirty flag...
00323           */
00324          dirty |= cp->xmm[i].dirty;
00325          aos_release_xmm_reg(cp, i);
00326       }
00327    }
00328 
00329    cp->xmm[reg.idx].file = file;
00330    cp->xmm[reg.idx].idx = idx;
00331    cp->xmm[reg.idx].dirty = dirty;
00332    cp->xmm[reg.idx].last_used = cp->insn_counter;
00333 }
00334 
00335 
00336 /* Return a pointer to the in-memory copy of the reg, making sure it is uptodate.
00337  */
00338 static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp, 
00339                                               unsigned file,
00340                                               unsigned idx )
00341 {
00342    unsigned i;
00343 
00344    /* Ensure the in-memory copy of this reg is up-to-date
00345     */
00346    for (i = 0; i < 8; i++) {
00347       if (cp->xmm[i].file == file && 
00348           cp->xmm[i].idx == idx &&
00349           cp->xmm[i].dirty) {
00350          spill(cp, i);
00351       }
00352    }
00353 
00354    return get_reg_ptr( cp, file, idx );
00355 }
00356 
00357 
00358 /* As above, but return a pointer.  Note - this pointer may alias
00359  * those returned by get_arg_ptr().
00360  */
00361 static struct x86_reg get_dst_ptr( struct aos_compilation *cp, 
00362                                    const struct tgsi_full_dst_register *dst )
00363 {
00364    unsigned file = dst->DstRegister.File;
00365    unsigned idx = dst->DstRegister.Index;
00366    unsigned i;
00367    
00368 
00369    /* Ensure in-memory copy of this reg is up-to-date and invalidate
00370     * any xmm copies.
00371     */
00372    for (i = 0; i < 8; i++) {
00373       if (cp->xmm[i].file == file &&
00374           cp->xmm[i].idx == idx)
00375       {
00376          if (cp->xmm[i].dirty) 
00377             spill(cp, i);
00378          
00379          aos_release_xmm_reg(cp, i);
00380       }
00381    }
00382 
00383    return get_reg_ptr( cp, file, idx );
00384 }
00385 
00386 
00387 
00388 
00389 
00390 /* Return an XMM reg if the argument is resident, otherwise return a
00391  * base+offset pointer to the saved value.
00392  */
00393 struct x86_reg aos_get_shader_reg( struct aos_compilation *cp, 
00394                                    unsigned file,
00395                                    unsigned idx )
00396 {
00397    unsigned i;
00398 
00399    for (i = 0; i < 8; i++) {
00400       if (cp->xmm[i].file == file &&
00401           cp->xmm[i].idx  == idx) 
00402       {
00403          cp->xmm[i].last_used = cp->insn_counter;
00404          return x86_make_reg(file_XMM, i);
00405       }
00406    }
00407 
00408    /* If not found in the XMM register file, return an indirect
00409     * reference to the in-memory copy:
00410     */
00411    return get_reg_ptr( cp, file, idx );
00412 }
00413 
00414 
00415 
00416 static struct x86_reg aos_get_shader_reg_xmm( struct aos_compilation *cp, 
00417                                               unsigned file,
00418                                               unsigned idx )
00419 {
00420    struct x86_reg reg = get_xmm( cp,
00421                                  aos_get_shader_reg( cp, file, idx ) );
00422 
00423    aos_adopt_xmm_reg( cp,
00424                       reg,
00425                       file,
00426                       idx,
00427                       FALSE );
00428    
00429    return reg;
00430 }
00431 
00432 
00433 
00434 struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp,
00435                                      unsigned imm )
00436 {
00437    return aos_get_shader_reg_xmm( cp, AOS_FILE_INTERNAL, imm );
00438 }
00439 
00440 
00441 struct x86_reg aos_get_internal( struct aos_compilation *cp,
00442                                  unsigned imm )
00443 {
00444    return aos_get_shader_reg( cp, AOS_FILE_INTERNAL, imm );
00445 }
00446 
00447 
00448 
00449 
00450 
00451 /* Emulate pshufd insn in regular SSE, if necessary:
00452  */
00453 static void emit_pshufd( struct aos_compilation *cp,
00454                          struct x86_reg dst,
00455                          struct x86_reg arg0,
00456                          ubyte shuf )
00457 {
00458    if (cp->have_sse2) {
00459       sse2_pshufd(cp->func, dst, arg0, shuf);
00460    }
00461    else {
00462       if (!eq(dst, arg0)) 
00463          sse_movaps(cp->func, dst, arg0);
00464 
00465       sse_shufps(cp->func, dst, dst, shuf);
00466    }
00467 }
00468 
00469 /* load masks (pack into negs??)
00470  * pshufd - shuffle according to writemask
00471  * and - result, mask
00472  * nand - dest, mask
00473  * or - dest, result
00474  */
00475 static boolean mask_write( struct aos_compilation *cp,
00476                            struct x86_reg dst,
00477                            struct x86_reg result,
00478                            unsigned mask )
00479 {
00480    struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
00481    struct x86_reg tmp = aos_get_xmm_reg(cp);
00482    
00483    emit_pshufd(cp, tmp, imm_swz, 
00484                SHUF((mask & 1) ? 2 : 3,
00485                     (mask & 2) ? 2 : 3,
00486                     (mask & 4) ? 2 : 3,
00487                     (mask & 8) ? 2 : 3));
00488 
00489    sse_andps(cp->func, dst, tmp);
00490    sse_andnps(cp->func, tmp, result);
00491    sse_orps(cp->func, dst, tmp);
00492 
00493    aos_release_xmm_reg(cp, tmp.idx);
00494    return TRUE;
00495 }
00496 
00497 
00498 
00499 
00500 /* Helper for writemask:
00501  */
00502 static boolean emit_shuf_copy2( struct aos_compilation *cp,
00503                                   struct x86_reg dst,
00504                                   struct x86_reg arg0,
00505                                   struct x86_reg arg1,
00506                                   ubyte shuf )
00507 {
00508    struct x86_reg tmp = aos_get_xmm_reg(cp);
00509 
00510    emit_pshufd(cp, dst, arg1, shuf);
00511    emit_pshufd(cp, tmp, arg0, shuf);
00512    sse_shufps(cp->func, dst, tmp, SHUF(X, Y, Z, W));
00513    emit_pshufd(cp, dst, dst, shuf);
00514 
00515    aos_release_xmm_reg(cp, tmp.idx);
00516    return TRUE;
00517 }
00518 
00519 
00520 
00521 #define SSE_SWIZZLE_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6))
00522 
00523 
00524 /* Locate a source register and perform any required (simple) swizzle.  
00525  * 
00526  * Just fail on complex swizzles at this point.
00527  */
00528 static struct x86_reg fetch_src( struct aos_compilation *cp, 
00529                                  const struct tgsi_full_src_register *src ) 
00530 {
00531    struct x86_reg arg0 = aos_get_shader_reg(cp, 
00532                                             src->SrcRegister.File, 
00533                                             src->SrcRegister.Index);
00534    unsigned i;
00535    ubyte swz = 0;
00536    unsigned negs = 0;
00537    unsigned abs = 0;
00538 
00539    for (i = 0; i < 4; i++) {
00540       unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, i );
00541       unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, i );
00542 
00543       switch (swizzle) {
00544       case TGSI_EXTSWIZZLE_ZERO:
00545       case TGSI_EXTSWIZZLE_ONE:
00546          ERROR(cp, "not supporting full swizzles yet in tgsi_aos_sse2");
00547          break;
00548 
00549       default:
00550          swz |= (swizzle & 0x3) << (i * 2);
00551          break;
00552       }
00553 
00554       switch (neg) {
00555       case TGSI_UTIL_SIGN_TOGGLE:
00556          negs |= (1<<i);
00557          break;
00558          
00559       case TGSI_UTIL_SIGN_KEEP:
00560          break;
00561 
00562       case TGSI_UTIL_SIGN_CLEAR:
00563          abs |= (1<<i);
00564          break;
00565 
00566       default:
00567          ERROR(cp, "unsupported sign-mode");
00568          break;
00569       }
00570    }
00571 
00572    if (swz != SSE_SWIZZLE_NOOP || negs != 0 || abs != 0) {
00573       struct x86_reg dst = aos_get_xmm_reg(cp);
00574 
00575       if (swz != SSE_SWIZZLE_NOOP)
00576          emit_pshufd(cp, dst, arg0, swz);
00577       else
00578          sse_movaps(cp->func, dst, arg0);
00579 
00580       if (negs && negs != 0xf) {
00581          struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
00582          struct x86_reg tmp = aos_get_xmm_reg(cp);
00583 
00584          /* Load 1,-1,0,0
00585           * Use neg as arg to pshufd
00586           * Multiply
00587           */
00588          emit_pshufd(cp, tmp, imm_swz, 
00589                      SHUF((negs & 1) ? 1 : 0,
00590                           (negs & 2) ? 1 : 0,
00591                           (negs & 4) ? 1 : 0,
00592                           (negs & 8) ? 1 : 0));
00593          sse_mulps(cp->func, dst, tmp);
00594 
00595          aos_release_xmm_reg(cp, tmp.idx);
00596          aos_soft_release_xmm(cp, imm_swz);
00597       }
00598       else if (negs) {
00599          struct x86_reg imm_negs = aos_get_internal_xmm(cp, IMM_NEGS);
00600          sse_mulps(cp->func, dst, imm_negs);
00601          aos_soft_release_xmm(cp, imm_negs);
00602       }
00603 
00604 
00605       if (abs && abs != 0xf) {
00606          ERROR(cp, "unsupported partial abs");
00607       }
00608       else if (abs) {
00609          struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
00610          struct x86_reg tmp = aos_get_xmm_reg(cp);
00611 
00612          sse_movaps(cp->func, tmp, dst);
00613          sse_mulps(cp->func, tmp, neg);
00614          sse_maxps(cp->func, dst, tmp);
00615 
00616          aos_release_xmm_reg(cp, tmp.idx);
00617          aos_soft_release_xmm(cp, neg);
00618       }
00619 
00620       aos_soft_release_xmm(cp, arg0);
00621       return dst;
00622    }
00623       
00624    return arg0;
00625 }
00626 
00627 static void x87_fld_src( struct aos_compilation *cp, 
00628                          const struct tgsi_full_src_register *src,
00629                          unsigned channel ) 
00630 {
00631    struct x86_reg arg0 = aos_get_shader_reg_ptr(cp, 
00632                                                 src->SrcRegister.File, 
00633                                                 src->SrcRegister.Index);
00634 
00635    unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, channel );
00636    unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, channel );
00637 
00638    switch (swizzle) {
00639    case TGSI_EXTSWIZZLE_ZERO:
00640       x87_fldz( cp->func );
00641       break;
00642 
00643    case TGSI_EXTSWIZZLE_ONE:
00644       x87_fld1( cp->func );
00645       break;
00646 
00647    default:
00648       x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) );
00649       break;
00650    }
00651    
00652 
00653    switch (neg) {
00654    case TGSI_UTIL_SIGN_TOGGLE:
00655       /* Flip the sign:
00656        */
00657       x87_fchs( cp->func );
00658       break;
00659          
00660    case TGSI_UTIL_SIGN_KEEP:
00661       break;
00662 
00663    case TGSI_UTIL_SIGN_CLEAR:
00664       x87_fabs( cp->func );
00665       break;
00666 
00667    case TGSI_UTIL_SIGN_SET:
00668       x87_fabs( cp->func );
00669       x87_fchs( cp->func );
00670       break;
00671 
00672    default:
00673       ERROR(cp, "unsupported sign-mode");
00674       break;
00675    }
00676 }
00677 
00678 
00679 
00680 
00681 
00682 
00683 /* Used to implement write masking.  This and most of the other instructions
00684  * here would be easier to implement if there had been a translation
00685  * to a 2 argument format (dst/arg0, arg1) at the shader level before
00686  * attempting to translate to x86/sse code.
00687  */
00688 static void store_dest( struct aos_compilation *cp, 
00689                         const struct tgsi_full_dst_register *reg,
00690                         struct x86_reg result )
00691 {
00692    struct x86_reg dst;
00693 
00694    switch (reg->DstRegister.WriteMask) {
00695    case 0:
00696       return;
00697    
00698    case TGSI_WRITEMASK_XYZW:
00699       aos_adopt_xmm_reg(cp, 
00700                         get_xmm_writable(cp, result), 
00701                         reg->DstRegister.File,
00702                         reg->DstRegister.Index,
00703                         TRUE);
00704       return;
00705    default: 
00706       break;
00707    }
00708 
00709    dst = aos_get_shader_reg_xmm(cp, 
00710                                 reg->DstRegister.File,
00711                                 reg->DstRegister.Index);
00712 
00713    switch (reg->DstRegister.WriteMask) {
00714    case TGSI_WRITEMASK_X:
00715       sse_movss(cp->func, dst, get_xmm(cp, result));
00716       break;
00717       
00718    case TGSI_WRITEMASK_ZW:
00719       sse_shufps(cp->func, dst, get_xmm(cp, result), SHUF(X, Y, Z, W));
00720       break;
00721 
00722    case TGSI_WRITEMASK_XY: 
00723       result = get_xmm_writable(cp, result);
00724       sse_shufps(cp->func, result, dst, SHUF(X, Y, Z, W));
00725       dst = result;
00726       break;
00727 
00728    case TGSI_WRITEMASK_YZW: 
00729       result = get_xmm_writable(cp, result);
00730       sse_movss(cp->func, result, dst);
00731       dst = result;
00732       break;
00733 
00734    default:
00735       mask_write(cp, dst, result, reg->DstRegister.WriteMask);
00736       break;
00737    }
00738 
00739    aos_adopt_xmm_reg(cp, 
00740                      dst, 
00741                      reg->DstRegister.File,
00742                      reg->DstRegister.Index,
00743                      TRUE);
00744 
00745 }
00746 
00747 static void inject_scalar( struct aos_compilation *cp,
00748                            struct x86_reg dst,
00749                            struct x86_reg result,
00750                            ubyte swizzle )
00751 {
00752    sse_shufps(cp->func, dst, dst, swizzle);
00753    sse_movss(cp->func, dst, result);
00754    sse_shufps(cp->func, dst, dst, swizzle);
00755 }
00756 
00757 
00758 static void store_scalar_dest( struct aos_compilation *cp, 
00759                                const struct tgsi_full_dst_register *reg,
00760                                struct x86_reg result )
00761 {
00762    unsigned writemask = reg->DstRegister.WriteMask;
00763    struct x86_reg dst;
00764 
00765    if (writemask != TGSI_WRITEMASK_X &&
00766        writemask != TGSI_WRITEMASK_Y &&
00767        writemask != TGSI_WRITEMASK_Z &&
00768        writemask != TGSI_WRITEMASK_W &&
00769        writemask != 0) 
00770    {
00771       result = get_xmm_writable(cp, result); /* already true, right? */
00772       sse_shufps(cp->func, result, result, SHUF(X,X,X,X));
00773       store_dest(cp, reg, result);
00774       return;
00775    }
00776 
00777    result = get_xmm(cp, result);
00778    dst = aos_get_shader_reg_xmm(cp, 
00779                                 reg->DstRegister.File,
00780                                 reg->DstRegister.Index);
00781 
00782 
00783 
00784    switch (reg->DstRegister.WriteMask) {
00785    case TGSI_WRITEMASK_X:
00786       sse_movss(cp->func, dst, result);
00787       break;
00788 
00789    case TGSI_WRITEMASK_Y:
00790       inject_scalar(cp, dst, result, SHUF(Y, X, Z, W));
00791       break;
00792 
00793    case TGSI_WRITEMASK_Z:
00794       inject_scalar(cp, dst, result, SHUF(Z, Y, X, W));
00795       break;
00796 
00797    case TGSI_WRITEMASK_W:
00798       inject_scalar(cp, dst, result, SHUF(W, Y, Z, X));
00799       break;
00800 
00801    default:
00802       break;
00803    }
00804 
00805    aos_adopt_xmm_reg(cp, 
00806                      dst, 
00807                      reg->DstRegister.File,
00808                      reg->DstRegister.Index,
00809                      TRUE);
00810 }
00811    
00812 
00813 
00814 static void x87_fst_or_nop( struct x86_function *func,
00815                             unsigned writemask,
00816                             unsigned channel,
00817                             struct x86_reg ptr )
00818 {
00819    assert(ptr.file == file_REG32);
00820    if (writemask & (1<<channel)) 
00821       x87_fst( func, x86_make_disp(ptr, channel * sizeof(float)) );
00822 }
00823 
00824 static void x87_fstp_or_pop( struct x86_function *func,
00825                              unsigned writemask,
00826                              unsigned channel,
00827                              struct x86_reg ptr )
00828 {
00829    assert(ptr.file == file_REG32);
00830    if (writemask & (1<<channel)) 
00831       x87_fstp( func, x86_make_disp(ptr, channel * sizeof(float)) );
00832    else
00833       x87_fstp( func, x86_make_reg( file_x87, 0 ));
00834 }
00835 
00836 
00837 
00838 /* 
00839  */
00840 static void x87_fstp_dest4( struct aos_compilation *cp,
00841                             const struct tgsi_full_dst_register *dst )
00842 {
00843    struct x86_reg ptr = get_dst_ptr(cp, dst); 
00844    unsigned writemask = dst->DstRegister.WriteMask;
00845 
00846    x87_fst_or_nop(cp->func, writemask, 0, ptr);
00847    x87_fst_or_nop(cp->func, writemask, 1, ptr);
00848    x87_fst_or_nop(cp->func, writemask, 2, ptr);
00849    x87_fstp_or_pop(cp->func, writemask, 3, ptr);
00850 }
00851 
00852 /* Save current x87 state and put it into single precision mode.
00853  */
00854 static void save_fpu_state( struct aos_compilation *cp )
00855 {
00856    x87_fnstcw( cp->func, x86_make_disp(cp->machine_EDX, 
00857                                        Offset(struct aos_machine, fpu_restore)));
00858 }
00859 
00860 static void restore_fpu_state( struct aos_compilation *cp )
00861 {
00862    x87_fnclex(cp->func);
00863    x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, 
00864                                       Offset(struct aos_machine, fpu_restore)));
00865 }
00866 
00867 static void set_fpu_round_neg_inf( struct aos_compilation *cp )
00868 {
00869    if (cp->fpucntl != FPU_RND_NEG) {
00870       cp->fpucntl = FPU_RND_NEG;
00871       x87_fnclex(cp->func);
00872       x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, 
00873                                          Offset(struct aos_machine, fpu_rnd_neg_inf)));
00874    }
00875 }
00876 
00877 static void set_fpu_round_nearest( struct aos_compilation *cp )
00878 {
00879    if (cp->fpucntl != FPU_RND_NEAREST) {
00880       cp->fpucntl = FPU_RND_NEAREST;
00881       x87_fnclex(cp->func);
00882       x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, 
00883                                          Offset(struct aos_machine, fpu_rnd_nearest)));
00884    }
00885 }
00886 
00887 
/* Emit x87 code that replaces the value 'a' on top of the x87 stack
 * with 2^a.
 *
 * The value is split into a rounded part (fprndint) and the remainder;
 * 2^remainder is formed as f2xm1 + 1 and then scaled by the rounded
 * part with fscale.  The per-instruction comments show the x87 stack
 * contents, top first, after each step.
 *
 * Note that the call selecting round-towards-negative-infinity is
 * commented out, so the rounding step uses whatever rounding mode is
 * in effect when the generated code runs.
 *
 * The x87 stack depth tracked in cp->func is the same on exit as on
 * entry (asserted).
 */
static void x87_emit_ex2( struct aos_compilation *cp )
{
   struct x86_reg st0 = x86_make_reg(file_x87, 0);
   struct x86_reg st1 = x86_make_reg(file_x87, 1);
   /* Remember the tracked stack depth for the sanity check below. */
   int stack = cp->func->x87_stack;

//   set_fpu_round_neg_inf( cp );

   x87_fld(cp->func, st0);      /* a a */
   x87_fprndint( cp->func );    /* int(a) a*/
   x87_fsubr(cp->func, st1, st0);    /* int(a) frc(a) */
   x87_fxch(cp->func, st1);     /* frc(a) int(a) */
   x87_f2xm1(cp->func);         /* (2^frc(a))-1 int(a) */
   x87_fld1(cp->func);          /* 1 (2^frc(a))-1 int(a) */
   x87_faddp(cp->func, st1);    /* 2^frac(a) int(a)  */
   x87_fscale(cp->func);        /* (2^frac(a)*2^int(int(a))) int(a) */
                                /* 2^a int(a) */
   x87_fstp(cp->func, st1);     /* 2^a */

   assert( stack == cp->func->x87_stack);
      
}
00910 
00911 static void PIPE_CDECL print_reg( const char *msg,
00912                                   const float *reg )
00913 {
00914    debug_printf("%s: %f %f %f %f\n", msg, reg[0], reg[1], reg[2], reg[3]);
00915 }
00916 
00917 static void emit_print( struct aos_compilation *cp,
00918                         const char *message, /* must point to a static string! */
00919                         unsigned file,
00920                         unsigned idx )
00921 {
00922    struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
00923    struct x86_reg arg = aos_get_shader_reg_ptr( cp, file, idx );
00924    unsigned i;
00925 
00926    /* There shouldn't be anything on the x87 stack.  Can add this
00927     * capacity later if need be.
00928     */
00929    assert(cp->func->x87_stack == 0);
00930 
00931    /* For absolute correctness, need to spill/invalidate all XMM regs
00932     * too.  We're obviously not concerned about performance on this
00933     * debug path, so here goes:
00934     */
00935    for (i = 0; i < 8; i++) {
00936       if (cp->xmm[i].dirty) 
00937          spill(cp, i);
00938 
00939       aos_release_xmm_reg(cp, i);
00940    }
00941 
00942    /* Push caller-save (ie scratch) regs.  
00943     */
00944    x86_cdecl_caller_push_regs( cp->func );
00945 
00946 
00947    /* Push the arguments:
00948     */
00949    x86_lea( cp->func, ecx, arg );
00950    x86_push( cp->func, ecx );
00951    x86_push_imm32( cp->func, (int)message );
00952 
00953    /* Call the helper.  Could call debug_printf directly, but
00954     * print_reg is a nice place to put a breakpoint if need be.
00955     */
00956    x86_mov_reg_imm( cp->func, ecx, (int)print_reg );
00957    x86_call( cp->func, ecx );
00958    x86_pop( cp->func, ecx );
00959    x86_pop( cp->func, ecx );
00960 
00961    /* Pop caller-save regs 
00962     */
00963    x86_cdecl_caller_pop_regs( cp->func );
00964 
00965    /* Done... 
00966     */
00967 }
00968 
00974 static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
00975 {
00976    struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
00977    struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
00978    struct x86_reg tmp = aos_get_xmm_reg(cp);
00979 
00980    sse_movaps(cp->func, tmp, arg0);
00981    sse_mulps(cp->func, tmp, neg);
00982    sse_maxps(cp->func, tmp, arg0);
00983    
00984    store_dest(cp, &op->FullDstRegisters[0], tmp);
00985    return TRUE;
00986 }
00987 
00988 static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
00989 {
00990    struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
00991    struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
00992    struct x86_reg dst = get_xmm_writable(cp, arg0);
00993 
00994    sse_addps(cp->func, dst, arg1);
00995 
00996    store_dest(cp, &op->FullDstRegisters[0], dst);
00997    return TRUE;
00998 }
00999 
01000 static boolean emit_COS( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
01001 {
01002    x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
01003    x87_fcos(cp->func);
01004    x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
01005    return TRUE;
01006 }
01007 
01008 /* The dotproduct instructions don't really do that well in sse:
01009  * XXX: produces wrong results -- disabled.
01010  */
01011 static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01012 {
01013    struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01014    struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
01015    struct x86_reg tmp = aos_get_xmm_reg(cp); 
01016    struct x86_reg dst = get_xmm_writable(cp, arg0);
01017 
01018    sse_mulps(cp->func, dst, arg1);
01019    /* Now the hard bit: sum the first 3 values:
01020     */ 
01021    sse_movhlps(cp->func, tmp, dst);
01022    sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
01023    emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
01024    sse_addss(cp->func, dst, tmp);
01025    
01026    aos_release_xmm_reg(cp, tmp.idx);
01027    store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
01028    return TRUE;
01029 }
01030 
01031 static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01032 {
01033    struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01034    struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
01035    struct x86_reg tmp = aos_get_xmm_reg(cp);      
01036    struct x86_reg dst = get_xmm_writable(cp, arg0);
01037 
01038    sse_mulps(cp->func, dst, arg1);
01039    
01040    /* Now the hard bit: sum the values:
01041     */ 
01042    sse_movhlps(cp->func, tmp, dst);
01043    sse_addps(cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
01044    emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
01045    sse_addss(cp->func, dst, tmp);
01046 
01047    aos_release_xmm_reg(cp, tmp.idx);
01048    store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
01049    return TRUE;
01050 }
01051 
01052 static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01053 {
01054    struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01055    struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
01056    struct x86_reg tmp = aos_get_xmm_reg(cp);
01057    struct x86_reg dst = get_xmm_writable(cp, arg0);
01058 
01059    sse_mulps(cp->func, dst, arg1);
01060 
01061    /* Now the hard bit: sum the values (from DP3):
01062     */ 
01063    sse_movhlps(cp->func, tmp, dst);
01064    sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
01065    emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
01066    sse_addss(cp->func, dst, tmp);
01067    emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W));
01068    sse_addss(cp->func, dst, tmp);
01069 
01070    aos_release_xmm_reg(cp, tmp.idx);
01071    store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
01072    return TRUE;
01073 }
01074 
01075 static boolean emit_DST( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01076 {
01077     struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01078     struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
01079     struct x86_reg dst = aos_get_xmm_reg(cp);
01080     struct x86_reg tmp = aos_get_xmm_reg(cp);
01081     struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
01082 
01083 /*    dst[0] = 1.0     * 1.0F; */
01084 /*    dst[1] = arg0[1] * arg1[1]; */
01085 /*    dst[2] = arg0[2] * 1.0; */
01086 /*    dst[3] = 1.0     * arg1[3]; */
01087 
01088     emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y));
01089     emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W));
01090     sse_mulps(cp->func, dst, tmp);
01091 
01092     aos_release_xmm_reg(cp, tmp.idx);
01093     store_dest(cp, &op->FullDstRegisters[0], dst);
01094     return TRUE;
01095 }
01096 
01097 static boolean emit_LG2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
01098 {
01099    x87_fld1(cp->func);          /* 1 */
01100    x87_fld_src(cp, &op->FullSrcRegisters[0], 0);        /* a0 1 */
01101    x87_fyl2x(cp->func); /* log2(a0) */
01102    x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
01103    return TRUE;
01104 }
01105 
01106 
01107 static boolean emit_EX2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
01108 {
01109    x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
01110    x87_emit_ex2(cp);
01111    x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
01112    return TRUE;
01113 }
01114 
01115 
01116 static boolean emit_FLR( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
01117 {
01118    struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); 
01119    unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
01120    int i;
01121 
01122    set_fpu_round_neg_inf( cp );
01123 
01124    /* Load all sources first to avoid aliasing
01125     */
01126    for (i = 3; i >= 0; i--) {
01127       if (writemask & (1<<i)) {
01128          x87_fld_src(cp, &op->FullSrcRegisters[0], i);   
01129       }
01130    }
01131 
01132    for (i = 0; i < 4; i++) {
01133       if (writemask & (1<<i)) {
01134          x87_fprndint( cp->func );   
01135          x87_fstp(cp->func, x86_make_disp(dst, i*4));
01136       }
01137    }
01138 
01139    return TRUE;
01140 }
01141 
01142 
01143 static boolean emit_RND( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
01144 {
01145    struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); 
01146    unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
01147    int i;
01148 
01149    set_fpu_round_nearest( cp );
01150 
01151    /* Load all sources first to avoid aliasing
01152     */
01153    for (i = 3; i >= 0; i--) {
01154       if (writemask & (1<<i)) {
01155          x87_fld_src(cp, &op->FullSrcRegisters[0], i);   
01156       }
01157    }
01158 
01159    for (i = 0; i < 4; i++) {
01160       if (writemask & (1<<i)) {
01161          x87_fprndint( cp->func );   
01162          x87_fstp(cp->func, x86_make_disp(dst, i*4));
01163       }
01164    }
01165 
01166    return TRUE;
01167 }
01168 
01169 
01170 static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
01171 {
01172    struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); 
01173    struct x86_reg st0 = x86_make_reg(file_x87, 0);
01174    struct x86_reg st1 = x86_make_reg(file_x87, 1);
01175    unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
01176    int i;
01177 
01178    set_fpu_round_neg_inf( cp );
01179 
01180    /* suck all the source values onto the stack before writing out any
01181     * dst, which may alias...
01182     */
01183    for (i = 3; i >= 0; i--) {
01184       if (writemask & (1<<i)) {
01185          x87_fld_src(cp, &op->FullSrcRegisters[0], i);   
01186       }
01187    }
01188 
01189    for (i = 0; i < 4; i++) {
01190       if (writemask & (1<<i)) {
01191          x87_fld(cp->func, st0);     /* a a */
01192          x87_fprndint( cp->func );   /* flr(a) a */
01193          x87_fsubp(cp->func, st1);  /* frc(a) */
01194          x87_fstp(cp->func, x86_make_disp(dst, i*4));
01195       }
01196    }
01197 
01198    return TRUE;
01199 }
01200 
01201 
01202 
01203 
01204 
01205 
01206 static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01207 {
01208    struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
01209    unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
01210    unsigned lit_count = cp->lit_count++;
01211    struct x86_reg result, arg0;
01212    unsigned i;
01213 
01214 #if 1
01215    /* For absolute correctness, need to spill/invalidate all XMM regs
01216     * too.  
01217     */
01218    for (i = 0; i < 8; i++) {
01219       if (cp->xmm[i].dirty) 
01220          spill(cp, i);
01221       aos_release_xmm_reg(cp, i);
01222    }
01223 #endif
01224 
01225    if (writemask != TGSI_WRITEMASK_XYZW) 
01226       result = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, tmp[0]));
01227    else 
01228       result = get_dst_ptr(cp, &op->FullDstRegisters[0]);    
01229 
01230    
01231    arg0 = fetch_src( cp, &op->FullSrcRegisters[0] );
01232    if (arg0.file == file_XMM) {
01233       struct x86_reg tmp = x86_make_disp(cp->machine_EDX, 
01234                                          Offset(struct aos_machine, tmp[1]));
01235       sse_movaps( cp->func, tmp, arg0 );
01236       arg0 = tmp;
01237    }
01238                   
01239       
01240 
01241    /* Push caller-save (ie scratch) regs.  
01242     */
01243    x86_cdecl_caller_push_regs( cp->func );
01244 
01245    /* Push the arguments:
01246     */
01247    x86_push_imm32( cp->func, lit_count );
01248 
01249    x86_lea( cp->func, ecx, arg0 );
01250    x86_push( cp->func, ecx );
01251 
01252    x86_lea( cp->func, ecx, result );
01253    x86_push( cp->func, ecx );
01254 
01255    x86_push( cp->func, cp->machine_EDX );
01256 
01257    if (lit_count < MAX_LIT_INFO) {
01258       x86_mov( cp->func, ecx, x86_make_disp( cp->machine_EDX, 
01259                                              Offset(struct aos_machine, lit_info) + 
01260                                              lit_count * sizeof(struct lit_info) + 
01261                                              Offset(struct lit_info, func)));
01262    }
01263    else {
01264       x86_mov_reg_imm( cp->func, ecx, (int)aos_do_lit );
01265    }
01266 
01267    x86_call( cp->func, ecx );
01268             
01269    x86_pop( cp->func, ecx );    /* fixme... */
01270    x86_pop( cp->func, ecx );
01271    x86_pop( cp->func, ecx );
01272    x86_pop( cp->func, ecx );
01273 
01274    x86_cdecl_caller_pop_regs( cp->func );
01275 
01276    if (writemask != TGSI_WRITEMASK_XYZW) {
01277       store_dest( cp, 
01278                   &op->FullDstRegisters[0],
01279                   get_xmm_writable( cp, result ) );
01280    }
01281 
01282    return TRUE;
01283 }
01284 
#if 0   
/* Disabled inline x87 implementation of TGSI LIT.  The comment after each
 * instruction shows the expected x87 stack contents, top of stack first.
 */
static boolean emit_inline_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); 
   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;

   if (writemask & TGSI_WRITEMASK_YZ) {
      struct x86_reg st1 = x86_make_reg(file_x87, 1);
      struct x86_reg st2 = x86_make_reg(file_x87, 2);

      /* a1' = a1 <= 0 ? 1 : a1;  
       */
      x87_fldz(cp->func);                           /* 0  */
#if 1
      x87_fld1(cp->func);                           /* 1 0  */
#else
      /* Correct but slow due to fp exceptions generated in fyl2x - fix me.
       */
      x87_fldz(cp->func);                           /* 0 0  */
#endif
      x87_fld_src(cp, &op->FullSrcRegisters[0], 1); /* a1 1 0  */
      x87_fcomi(cp->func, st2);                     /* a1 1 0  */
      x87_fcmovb(cp->func, st1);                    /* a1' 1 0  */
      x87_fstp(cp->func, st1);                      /* a1' 0  */
      x87_fstp(cp->func, st1);                      /* a1'  */

      x87_fld_src(cp, &op->FullSrcRegisters[0], 3); /* a3 a1'  */
      x87_fxch(cp->func, st1);                      /* a1' a3  */
      

      /* Compute pow(a1, a3)
       */
      x87_fyl2x(cp->func);      /* a3*log2(a1)      */
      x87_emit_ex2( cp );       /* 2^(a3*log2(a1))   */


      /* a0' = max2(a0, 0):
       */
      x87_fldz(cp->func);                           /* 0 r2 */
      x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* a0 0 r2 */
      x87_fcomi(cp->func, st1); 
      x87_fcmovb(cp->func, st1);                    /* a0' 0 r2 */

      x87_fst_or_nop(cp->func, writemask, 1, dst); /* result[1] = a0' */

      x87_fcomi(cp->func, st1);  /* a0' 0 r2 */
      x87_fcmovnbe(cp->func, st2); /* r2' 0' r2 */

      x87_fstp_or_pop(cp->func, writemask, 2, dst); /* 0 r2 */
      x87_fpop(cp->func);       /* r2 */
      x87_fpop(cp->func);
   }

   /* Channels 0 and 3 both receive the constant 1.0 (when enabled). */
   if (writemask & TGSI_WRITEMASK_XW) {
      x87_fld1(cp->func);
      x87_fst_or_nop(cp->func, writemask, 0, dst);
      x87_fstp_or_pop(cp->func, writemask, 3, dst);
   }

   return TRUE;
}
#endif
01347 
01348 
01349 
01350 static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01351 {
01352    struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01353    struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
01354    struct x86_reg dst = get_xmm_writable(cp, arg0);
01355 
01356    sse_maxps(cp->func, dst, arg1);
01357 
01358    store_dest(cp, &op->FullDstRegisters[0], dst);
01359    return TRUE;
01360 }
01361 
01362 
01363 static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01364 {
01365    struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01366    struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
01367    struct x86_reg dst = get_xmm_writable(cp, arg0);
01368 
01369    sse_minps(cp->func, dst, arg1);
01370 
01371    store_dest(cp, &op->FullDstRegisters[0], dst);
01372    return TRUE;
01373 }
01374 
01375 static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01376 {
01377    struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01378    struct x86_reg dst = get_xmm_writable(cp, arg0);
01379 
01380    /* potentially nothing to do */
01381 
01382    store_dest(cp, &op->FullDstRegisters[0], dst);
01383    return TRUE;
01384 }
01385 
01386 static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01387 {
01388    struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01389    struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
01390    struct x86_reg dst = get_xmm_writable(cp, arg0);
01391 
01392    sse_mulps(cp->func, dst, arg1);
01393 
01394    store_dest(cp, &op->FullDstRegisters[0], dst);
01395    return TRUE;
01396 }
01397 
01398 
01399 static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01400 {
01401    struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01402    struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
01403    struct x86_reg arg2 = fetch_src(cp, &op->FullSrcRegisters[2]);
01404 
01405    /* If we can't clobber old contents of arg0, get a temporary & copy
01406     * it there, then clobber it...
01407     */
01408    arg0 = get_xmm_writable(cp, arg0);
01409 
01410    sse_mulps(cp->func, arg0, arg1);
01411    sse_addps(cp->func, arg0, arg2);
01412    store_dest(cp, &op->FullDstRegisters[0], arg0);
01413    return TRUE;
01414 }
01415 
01416 
01417 
01418 /* A wrapper for powf().
01419  * Makes sure it is cdecl and operates on floats.
01420  */
01421 static float PIPE_CDECL _powerf( float x, float y )
01422 {
01423 #if FAST_MATH
01424    return util_fast_pow(x, y);
01425 #else
01426    return powf( x, y );
01427 #endif
01428 }
01429 
#if FAST_MATH
/* cdecl wrapper around util_fast_exp2() that generated code can call by
 * address.  Only built when FAST_MATH is enabled.
 */
static float PIPE_CDECL
_exp2( float x )
{
   return util_fast_exp2(x);
}
#endif
01436 
01437 
01438 /* Really not sufficient -- need to check for conditions that could
01439  * generate inf/nan values, which will slow things down hugely.
01440  */
01441 static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
01442 {
01443 #if 0
01444    x87_fld_src(cp, &op->FullSrcRegisters[1], 0);  /* a1.x */
01445    x87_fld_src(cp, &op->FullSrcRegisters[0], 0);        /* a0.x a1.x */
01446    x87_fyl2x(cp->func);                                 /* a1*log2(a0) */
01447 
01448    x87_emit_ex2( cp );          /* 2^(a1*log2(a0)) */
01449 
01450    x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
01451 #else
01452    uint i;
01453 
01454    /* For absolute correctness, need to spill/invalidate all XMM regs
01455     * too.  
01456     */
01457    for (i = 0; i < 8; i++) {
01458       if (cp->xmm[i].dirty) 
01459          spill(cp, i);
01460       aos_release_xmm_reg(cp, i);
01461    }
01462 
01463    /* Push caller-save (ie scratch) regs.  
01464     */
01465    x86_cdecl_caller_push_regs( cp->func );
01466 
01467    x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -8) );
01468 
01469    x87_fld_src( cp, &op->FullSrcRegisters[1], 0 );
01470    x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 4 ) );
01471    x87_fld_src( cp, &op->FullSrcRegisters[0], 0 );
01472    x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );
01473 
01474    /* tmp_EAX has been pushed & will be restored below */
01475    x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _powerf );
01476    x86_call( cp->func, cp->tmp_EAX );
01477 
01478    x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 8) );
01479 
01480    x86_cdecl_caller_pop_regs( cp->func );
01481 
01482    /* Note retval on x87 stack:
01483     */
01484    cp->func->x87_stack++;
01485 
01486    x87_fstp_dest4( cp, &op->FullDstRegisters[0] );
01487 #endif
01488    return TRUE;
01489 }
01490 
01491 
#if FAST_MATH
/**
 * Emit code for TGSI EXPBASE2 by calling _exp2() at runtime with source
 * channel 0 as its single float argument.
 */
static boolean
emit_EXPBASE2( struct aos_compilation *cp,
               const struct tgsi_full_instruction *op )
{
   uint slot;

   /* Strictly speaking every XMM register has to be written back and
    * forgotten before calling out.
    */
   for (slot = 0; slot < 8; slot++) {
      if (cp->xmm[slot].dirty) {
         spill(cp, slot);
      }
      aos_release_xmm_reg(cp, slot);
   }

   /* Preserve the caller-save (scratch) registers around the call. */
   x86_cdecl_caller_push_regs( cp->func );

   /* Make room for the float argument and fill it in. */
   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -4) );

   x87_fld_src( cp, &op->FullSrcRegisters[0], 0 );
   x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );

   /* tmp_EAX was saved above and is restored below, so it is free to
    * hold the call target.
    */
   x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _exp2 );
   x86_call( cp->func, cp->tmp_EAX );

   /* Release the argument area. */
   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 4) );

   x86_cdecl_caller_pop_regs( cp->func );

   /* The return value now sits on the x87 stack; account for it. */
   cp->func->x87_stack++;

   x87_fstp_dest4( cp, &op->FullDstRegisters[0] );

   return TRUE;
}
#endif
01532 
01533 
01534 static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01535 {
01536    struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01537    struct x86_reg dst = aos_get_xmm_reg(cp);
01538 
01539    if (cp->have_sse2) {
01540       sse2_rcpss(cp->func, dst, arg0);
01541       /* extend precision here...
01542        */
01543    }
01544    else {
01545       struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
01546       sse_movss(cp->func, dst, ones);
01547       sse_divss(cp->func, dst, arg0);
01548    }
01549 
01550    store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
01551    return TRUE;
01552 }
01553 
01554 
01555 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
01556  * implementations, it is possible to improve its precision at
01557  * fairly low cost, using a newton/raphson step, as below:
01558  * 
01559  * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
01560  * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
01561  * or:
01562  *   x1 = rsqrtps(a) * [1.5 - .5 * a * rsqrtps(a) * rsqrtps(a)]
01563  * 
01564  *
01565  * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
01566  */
01567 static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01568 {
01569 
01570    if (0) {
01571       struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01572       struct x86_reg r = aos_get_xmm_reg(cp);
01573       sse_rsqrtss(cp->func, r, arg0);
01574       store_scalar_dest(cp, &op->FullDstRegisters[0], r);
01575       return TRUE;
01576    }
01577    else {
01578       struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01579       struct x86_reg r = aos_get_xmm_reg(cp);
01580 
01581       struct x86_reg neg_half       = get_reg_ptr( cp, AOS_FILE_INTERNAL, IMM_RSQ );
01582       struct x86_reg one_point_five = x86_make_disp( neg_half, 4 );
01583       struct x86_reg src            = get_xmm_writable( cp, arg0 );
01584       
01585       sse_rsqrtss( cp->func, r, src  );             /* rsqrtss(a) */
01586       sse_mulss(   cp->func, src, neg_half  );      /* -.5 * a */
01587       sse_mulss(   cp->func, src,  r );             /* -.5 * a * r */
01588       sse_mulss(   cp->func, src,  r );             /* -.5 * a * r * r */
01589       sse_addss(   cp->func, src, one_point_five ); /* 1.5 - .5 * a * r * r */
01590       sse_mulss(   cp->func, r,  src );             /* r * (1.5 - .5 * a * r * r) */
01591 
01592       store_scalar_dest(cp, &op->FullDstRegisters[0], r);
01593       return TRUE;
01594    }
01595 }
01596 
01597 
01598 static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01599 {
01600    struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01601    struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
01602    struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
01603    struct x86_reg dst = get_xmm_writable(cp, arg0);
01604 
01605    sse_cmpps(cp->func, dst, arg1, cc_NotLessThan);
01606    sse_andps(cp->func, dst, ones);
01607 
01608    store_dest(cp, &op->FullDstRegisters[0], dst);
01609    return TRUE;
01610 }
01611 
01612 static boolean emit_SIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
01613 {
01614    x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
01615    x87_fsin(cp->func);
01616    x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
01617    return TRUE;
01618 }
01619 
01620 
01621 
01622 static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01623 {
01624    struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01625    struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
01626    struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
01627    struct x86_reg dst = get_xmm_writable(cp, arg0);
01628    
01629    sse_cmpps(cp->func, dst, arg1, cc_LessThan);
01630    sse_andps(cp->func, dst, ones);
01631 
01632    store_dest(cp, &op->FullDstRegisters[0], dst);
01633    return TRUE;
01634 }
01635 
01636 static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
01637 {
01638    struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01639    struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
01640    struct x86_reg dst = get_xmm_writable(cp, arg0);
01641 
01642    sse_subps(cp->func, dst, arg1);
01643 
01644    store_dest(cp, &op->FullDstRegisters[0], dst);
01645    return TRUE;
01646 }
01647 
01648 static boolean emit_TRUNC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
01649 {
01650    struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01651    struct x86_reg tmp0 = aos_get_xmm_reg(cp);
01652 
01653    sse2_cvttps2dq(cp->func, tmp0, arg0);
01654    sse2_cvtdq2ps(cp->func, tmp0, tmp0);
01655 
01656    store_dest(cp, &op->FullDstRegisters[0], tmp0);
01657    return TRUE;
01658 }
01659 
01660 static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
01661 {
01662    struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
01663    struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
01664    struct x86_reg tmp0 = aos_get_xmm_reg(cp);
01665    struct x86_reg tmp1 = aos_get_xmm_reg(cp);
01666 
01667    emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W));
01668    sse_mulps(cp->func, tmp1, arg0);
01669    emit_pshufd(cp, tmp0, arg0, SHUF(Y, Z, X, W));
01670    sse_mulps(cp->func, tmp0, arg1);
01671    sse_subps(cp->func, tmp1, tmp0);
01672    sse_shufps(cp->func, tmp1, tmp1, SHUF(Y, Z, X, W));
01673 
01674 /*    dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */
01675 /*    dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */
01676 /*    dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */
01677 /*    dst[3] is undef */
01678 
01679 
01680    aos_release_xmm_reg(cp, tmp0.idx);
01681    store_dest(cp, &op->FullDstRegisters[0], tmp1);
01682    return TRUE;
01683 }
01684 
01685 
01686 
01687 static boolean
01688 emit_instruction( struct aos_compilation *cp,
01689                   struct tgsi_full_instruction *inst )
01690 {
01691    x87_assert_stack_empty(cp->func);
01692 
01693    switch( inst->Instruction.Opcode ) {
01694    case TGSI_OPCODE_MOV:
01695       return emit_MOV( cp, inst );
01696 
01697    case TGSI_OPCODE_LIT:
01698       return emit_LIT(cp, inst);
01699 
01700    case TGSI_OPCODE_RCP:
01701       return emit_RCP(cp, inst);
01702 
01703    case TGSI_OPCODE_RSQ:
01704       return emit_RSQ(cp, inst);
01705 
01706    case TGSI_OPCODE_EXP:
01707       /*return emit_EXP(cp, inst);*/
01708       return FALSE;
01709 
01710    case TGSI_OPCODE_LOG:
01711       /*return emit_LOG(cp, inst);*/
01712       return FALSE;
01713 
01714    case TGSI_OPCODE_MUL:
01715       return emit_MUL(cp, inst);
01716 
01717    case TGSI_OPCODE_ADD:
01718       return emit_ADD(cp, inst);
01719 
01720    case TGSI_OPCODE_DP3:
01721       return emit_DP3(cp, inst);
01722 
01723    case TGSI_OPCODE_DP4:
01724       return emit_DP4(cp, inst);
01725 
01726    case TGSI_OPCODE_DST:
01727       return emit_DST(cp, inst);
01728 
01729    case TGSI_OPCODE_MIN:
01730       return emit_MIN(cp, inst);
01731 
01732    case TGSI_OPCODE_MAX:
01733       return emit_MAX(cp, inst);
01734 
01735    case TGSI_OPCODE_SLT:
01736       return emit_SLT(cp, inst);
01737 
01738    case TGSI_OPCODE_SGE:
01739       return emit_SGE(cp, inst);
01740 
01741    case TGSI_OPCODE_MAD:
01742       return emit_MAD(cp, inst);
01743 
01744    case TGSI_OPCODE_SUB:
01745       return emit_SUB(cp, inst);
01746  
01747    case TGSI_OPCODE_LERP:
01748 //      return emit_LERP(cp, inst);
01749       return FALSE;
01750 
01751    case TGSI_OPCODE_FRAC:
01752       return emit_FRC(cp, inst);
01753 
01754    case TGSI_OPCODE_CLAMP:
01755 //      return emit_CLAMP(cp, inst);
01756       return FALSE;
01757 
01758    case TGSI_OPCODE_FLOOR:
01759       return emit_FLR(cp, inst);
01760 
01761    case TGSI_OPCODE_ROUND:
01762       return emit_RND(cp, inst);
01763 
01764    case TGSI_OPCODE_EXPBASE2:
01765 #if FAST_MATH
01766       return emit_EXPBASE2(cp, inst);
01767 #elif 0
01768       /* this seems to fail for "larger" exponents.
01769        * See glean tvertProg1's EX2 test.
01770        */
01771       return emit_EX2(cp, inst);
01772 #else
01773       return FALSE;
01774 #endif
01775 
01776    case TGSI_OPCODE_LOGBASE2:
01777       return emit_LG2(cp, inst);
01778 
01779    case TGSI_OPCODE_POWER:
01780       return emit_POW(cp, inst);
01781 
01782    case TGSI_OPCODE_CROSSPRODUCT:
01783       return emit_XPD(cp, inst);
01784 
01785    case TGSI_OPCODE_ABS:
01786       return emit_ABS(cp, inst);
01787 
01788    case TGSI_OPCODE_DPH:
01789       return emit_DPH(cp, inst);
01790 
01791    case TGSI_OPCODE_COS:
01792       return emit_COS(cp, inst);
01793 
01794    case TGSI_OPCODE_SIN:
01795       return emit_SIN(cp, inst);
01796 
01797    case TGSI_OPCODE_TRUNC:
01798       return emit_TRUNC(cp, inst);
01799 
01800    case TGSI_OPCODE_END:
01801       return TRUE;
01802 
01803    default:
01804       return FALSE;
01805    }
01806 }
01807 
01808 
01809 static boolean emit_viewport( struct aos_compilation *cp )
01810 {
01811    struct x86_reg pos = aos_get_shader_reg_xmm(cp, 
01812                                                TGSI_FILE_OUTPUT, 
01813                                                cp->vaos->draw->vs.position_output );
01814 
01815    struct x86_reg scale = x86_make_disp(cp->machine_EDX, 
01816                                         Offset(struct aos_machine, scale));
01817 
01818    struct x86_reg translate = x86_make_disp(cp->machine_EDX, 
01819                                         Offset(struct aos_machine, translate));
01820 
01821    sse_mulps(cp->func, pos, scale);
01822    sse_addps(cp->func, pos, translate);
01823 
01824    aos_adopt_xmm_reg( cp,
01825                       pos,
01826                       TGSI_FILE_OUTPUT,
01827                       cp->vaos->draw->vs.position_output,
01828                       TRUE );
01829    return TRUE;
01830 }
01831 
01832 
01833 /* This is useful to be able to see the results on softpipe.  Doesn't
01834  * do proper clipping, just assumes the backend can do it during
01835  * rasterization -- for debug only...
01836  */
01837 static boolean emit_rhw_viewport( struct aos_compilation *cp )
01838 {
01839    struct x86_reg tmp = aos_get_xmm_reg(cp);
01840    struct x86_reg pos = aos_get_shader_reg_xmm(cp, 
01841                                                TGSI_FILE_OUTPUT, 
01842                                                cp->vaos->draw->vs.position_output);
01843 
01844    struct x86_reg scale = x86_make_disp(cp->machine_EDX, 
01845                                         Offset(struct aos_machine, scale));
01846 
01847    struct x86_reg translate = x86_make_disp(cp->machine_EDX, 
01848                                         Offset(struct aos_machine, translate));
01849 
01850 
01851 
01852    emit_pshufd(cp, tmp, pos, SHUF(W, W, W, W));
01853    sse2_rcpss(cp->func, tmp, tmp);
01854    sse_shufps(cp->func, tmp, tmp, SHUF(X, X, X, X));
01855    
01856    sse_mulps(cp->func, pos, scale);
01857    sse_mulps(cp->func, pos, tmp);
01858    sse_addps(cp->func, pos, translate);
01859 
01860    /* Set pos[3] = w 
01861     */
01862    mask_write(cp, pos, tmp, TGSI_WRITEMASK_W);
01863 
01864    aos_adopt_xmm_reg( cp,
01865                       pos,
01866                       TGSI_FILE_OUTPUT,
01867                       cp->vaos->draw->vs.position_output,
01868                       TRUE );
01869    return TRUE;
01870 }
01871 
01872 
#if 0
/* Disabled: copy the float values of one TGSI immediate into the next
 * free slot of the machine's immediate table and bump the immediate
 * count.  No bounds check is made on the slot index.
 */
static boolean note_immediate( struct aos_compilation *cp,
                               struct tgsi_full_immediate *imm )
{
   unsigned pos = cp->num_immediates++;
   unsigned j;

   for (j = 0; j < imm->Immediate.Size; j++) {
      cp->vaos->machine->immediate[pos][j] = imm->u.ImmediateFloat32[j].Float;
   }

   return TRUE;
}
#endif
01887 
01888 
01889 
01890 
01891 static void find_last_write_outputs( struct aos_compilation *cp )
01892 {
01893    struct tgsi_parse_context parse;
01894    unsigned this_instruction = 0;
01895    unsigned i;
01896 
01897    tgsi_parse_init( &parse, cp->vaos->base.vs->state.tokens );
01898 
01899    while (!tgsi_parse_end_of_tokens( &parse )) {
01900       
01901       tgsi_parse_token( &parse );
01902 
01903       if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION) 
01904          continue;
01905 
01906       for (i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++) {
01907          if (parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.File ==
01908              TGSI_FILE_OUTPUT) 
01909          {
01910             unsigned idx = parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.Index;
01911             cp->output_last_write[idx] = this_instruction;
01912          }
01913       }
01914 
01915       this_instruction++;
01916    }
01917 
01918    tgsi_parse_free( &parse );
01919 }
01920 
01921 
01922 #define ARG_MACHINE    1
01923 #define ARG_START_ELTS 2
01924 #define ARG_COUNT      3
01925 #define ARG_OUTBUF     4
01926 
01927 
01928 static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient,
01929                                      boolean linear )
01930 { 
01931    struct tgsi_parse_context parse;
01932    struct aos_compilation cp;
01933    unsigned fixup, label;
01934 
01935    util_init_math();
01936 
01937    tgsi_parse_init( &parse, varient->base.vs->state.tokens );
01938 
01939    memset(&cp, 0, sizeof(cp));
01940 
01941    cp.insn_counter = 1;
01942    cp.vaos = varient;
01943    cp.have_sse2 = 1;
01944    cp.func = &varient->func[ linear ? 0 : 1 ];
01945 
01946    cp.tmp_EAX       = x86_make_reg(file_REG32, reg_AX);
01947    cp.idx_EBX      = x86_make_reg(file_REG32, reg_BX);
01948    cp.outbuf_ECX    = x86_make_reg(file_REG32, reg_CX);
01949    cp.machine_EDX   = x86_make_reg(file_REG32, reg_DX);
01950    cp.count_ESI     = x86_make_reg(file_REG32, reg_SI);
01951    cp.temp_EBP     = x86_make_reg(file_REG32, reg_BP);
01952    cp.stack_ESP = x86_make_reg( file_REG32, reg_SP );
01953 
01954    x86_init_func(cp.func);
01955 
01956    find_last_write_outputs(&cp);
01957 
01958    x86_push(cp.func, cp.idx_EBX);
01959    x86_push(cp.func, cp.count_ESI);
01960    x86_push(cp.func, cp.temp_EBP);
01961 
01962 
01963    /* Load arguments into regs:
01964     */
01965    x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_MACHINE));
01966    x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS));
01967    x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT));
01968    x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF));
01969 
01970 
01971    /* Compare count to zero and possibly bail.
01972     */
01973    x86_xor(cp.func, cp.tmp_EAX, cp.tmp_EAX);
01974    x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX);
01975    fixup = x86_jcc_forward(cp.func, cc_E);
01976 
01977 
01978    save_fpu_state( &cp );
01979    set_fpu_round_nearest( &cp );
01980 
01981    aos_init_inputs( &cp, linear );
01982 
01983    cp.x86_reg[0] = 0;
01984    cp.x86_reg[1] = 0;
01985    
01986    /* Note address for loop jump 
01987     */
01988    label = x86_get_label(cp.func);
01989    {
01990       /* Fetch inputs...  TODO:  fetch lazily...
01991        */
01992       if (!aos_fetch_inputs( &cp, linear ))
01993          goto fail;
01994 
01995       /* Emit the shader:
01996        */
01997       while( !tgsi_parse_end_of_tokens( &parse ) && !cp.error ) 
01998       {
01999          tgsi_parse_token( &parse );
02000 
02001          switch (parse.FullToken.Token.Type) {
02002          case TGSI_TOKEN_TYPE_IMMEDIATE:
02003 #if 0
02004             if (!note_immediate( &cp, &parse.FullToken.FullImmediate ))
02005                goto fail;
02006 #endif
02007             break;
02008 
02009          case TGSI_TOKEN_TYPE_INSTRUCTION:
02010             if (DISASSEM)
02011                tgsi_dump_instruction( &parse.FullToken.FullInstruction, cp.insn_counter );
02012 
02013             if (!emit_instruction( &cp, &parse.FullToken.FullInstruction ))
02014                goto fail;
02015             break;
02016          }
02017 
02018          x87_assert_stack_empty(cp.func);
02019          cp.insn_counter++;
02020 
02021          if (DISASSEM)
02022             debug_printf("\n");
02023       }
02024 
02025    
02026       {
02027          unsigned i;
02028          for (i = 0; i < 8; i++) {
02029             if (cp.xmm[i].file != TGSI_FILE_OUTPUT) {
02030                cp.xmm[i].file = TGSI_FILE_NULL;
02031                cp.xmm[i].dirty = 0;
02032             }
02033          }
02034       }
02035 
02036       if (cp.error)
02037          goto fail;
02038 
02039       if (cp.vaos->base.key.clip) {
02040          /* not really handling clipping, just do the rhw so we can
02041           * see the results...
02042           */
02043          emit_rhw_viewport(&cp); 
02044       }
02045       else if (cp.vaos->base.key.viewport) {
02046          emit_viewport(&cp);
02047       }
02048 
02049       /* Emit output...  TODO: do this eagerly after the last write to a
02050        * given output.
02051        */
02052       if (!aos_emit_outputs( &cp ))
02053          goto fail;
02054 
02055 
02056       /* Next vertex:
02057        */
02058       x86_lea(cp.func, 
02059               cp.outbuf_ECX, 
02060               x86_make_disp(cp.outbuf_ECX, 
02061                             cp.vaos->base.key.output_stride));
02062 
02063       /* Incr index
02064        */   
02065       aos_incr_inputs( &cp, linear );
02066    }
02067    /* decr count, loop if not zero
02068     */
02069    x86_dec(cp.func, cp.count_ESI);
02070    x86_jcc(cp.func, cc_NZ, label);
02071 
02072    restore_fpu_state(&cp);
02073 
02074    /* Land forward jump here:
02075     */
02076    x86_fixup_fwd_jump(cp.func, fixup);
02077 
02078    /* Exit mmx state?
02079     */
02080    if (cp.func->need_emms)
02081       mmx_emms(cp.func);
02082 
02083    x86_pop(cp.func, cp.temp_EBP);
02084    x86_pop(cp.func, cp.count_ESI);
02085    x86_pop(cp.func, cp.idx_EBX);
02086 
02087    x87_assert_stack_empty(cp.func);
02088    x86_ret(cp.func);
02089 
02090    tgsi_parse_free( &parse );
02091    return !cp.error;
02092 
02093  fail:
02094    tgsi_parse_free( &parse );
02095    return FALSE;
02096 }
02097 
02098 
02099 
02100 static void vaos_set_buffer( struct draw_vs_varient *varient,
02101                              unsigned buf,
02102                              const void *ptr,
02103                              unsigned stride )
02104 {
02105    struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
02106 
02107    if (buf < vaos->nr_vb) {
02108       vaos->buffer[buf].base_ptr = (char *)ptr;
02109       vaos->buffer[buf].stride = stride;
02110    }
02111 
02112    if (0) debug_printf("%s %d/%d: %p %d\n", __FUNCTION__, buf, vaos->nr_vb, ptr, stride);
02113 }
02114 
02115 
02116 
02117 static void PIPE_CDECL vaos_run_elts( struct draw_vs_varient *varient,
02118                                       const unsigned *elts,
02119                                       unsigned count,
02120                                       void *output_buffer )
02121 {
02122    struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
02123    struct aos_machine *machine = vaos->draw->vs.aos_machine;
02124 
02125    if (0) debug_printf("%s %d\n", __FUNCTION__, count);
02126 
02127    machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
02128    machine->constants = vaos->draw->vs.aligned_constants;
02129    machine->immediates = vaos->base.vs->immediates;
02130    machine->buffer = vaos->buffer;
02131 
02132    vaos->gen_run_elts( machine,
02133                        elts,
02134                        count,
02135                        output_buffer );
02136 }
02137 
02138 static void PIPE_CDECL vaos_run_linear( struct draw_vs_varient *varient,
02139                                         unsigned start,
02140                                         unsigned count,
02141                                         void *output_buffer )
02142 {
02143    struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
02144    struct aos_machine *machine = vaos->draw->vs.aos_machine;
02145 
02146    if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__, start, count, 
02147                        vaos->base.key.const_vbuffers);
02148 
02149    machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
02150    machine->constants = vaos->draw->vs.aligned_constants;
02151    machine->immediates = vaos->base.vs->immediates;
02152    machine->buffer = vaos->buffer;
02153 
02154    vaos->gen_run_linear( machine,
02155                          start,
02156                          count,
02157                          output_buffer );
02158 
02159    /* Sanity spot checks to make sure we didn't trash our constants */
02160    assert(machine->internal[IMM_ONES][0] == 1.0f);
02161    assert(machine->internal[IMM_IDENTITY][0] == 0.0f);
02162    assert(machine->internal[IMM_NEGS][0] == -1.0f);
02163 }
02164 
02165 
02166 
02167 static void vaos_destroy( struct draw_vs_varient *varient )
02168 {
02169    struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
02170 
02171    FREE( vaos->buffer );
02172 
02173    x86_release_func( &vaos->func[0] );
02174    x86_release_func( &vaos->func[1] );
02175 
02176    FREE(vaos);
02177 }
02178 
02179 
02180 
02181 static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
02182                                                  const struct draw_vs_varient_key *key )
02183 {
02184    unsigned i;
02185    struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse);
02186 
02187    if (!vaos)
02188       goto fail;
02189    
02190    vaos->base.key = *key;
02191    vaos->base.vs = vs;
02192    vaos->base.set_buffer = vaos_set_buffer;
02193    vaos->base.destroy = vaos_destroy;
02194    vaos->base.run_linear = vaos_run_linear;
02195    vaos->base.run_elts = vaos_run_elts;
02196 
02197    vaos->draw = vs->draw;
02198 
02199    for (i = 0; i < key->nr_inputs; i++) 
02200       vaos->nr_vb = MAX2( vaos->nr_vb, key->element[i].in.buffer + 1 );
02201 
02202    vaos->buffer = MALLOC( vaos->nr_vb * sizeof(vaos->buffer[0]) );
02203    if (!vaos->buffer)
02204       goto fail;
02205 
02206    if (0)
02207       debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers);
02208 
02209 #if 0
02210    tgsi_dump(vs->state.tokens, 0);
02211 #endif
02212 
02213    if (!build_vertex_program( vaos, TRUE ))
02214       goto fail;
02215 
02216    if (!build_vertex_program( vaos, FALSE ))
02217       goto fail;
02218 
02219    vaos->gen_run_linear = (vaos_run_linear_func)x86_get_func(&vaos->func[0]);
02220    if (!vaos->gen_run_linear)
02221       goto fail;
02222 
02223    vaos->gen_run_elts = (vaos_run_elts_func)x86_get_func(&vaos->func[1]);
02224    if (!vaos->gen_run_elts)
02225       goto fail;
02226 
02227    return &vaos->base;
02228 
02229  fail:
02230    if (vaos && vaos->buffer)
02231       FREE(vaos->buffer);
02232 
02233    if (vaos)
02234       x86_release_func( &vaos->func[0] );
02235 
02236    if (vaos)
02237       x86_release_func( &vaos->func[1] );
02238 
02239    FREE(vaos);
02240    
02241    return NULL;
02242 }
02243 
02244 
02245 struct draw_vs_varient *draw_vs_varient_aos_sse( struct draw_vertex_shader *vs,
02246                                                  const struct draw_vs_varient_key *key )
02247 {
02248    struct draw_vs_varient *varient = varient_aos_sse( vs, key );
02249 
02250    if (varient == NULL) {
02251       varient = draw_vs_varient_generic( vs, key );
02252    }
02253 
02254    return varient;
02255 }
02256 
02257 
02258 
02259 #endif /* PIPE_ARCH_X86 */

Generated on Tue Sep 29 06:25:14 2009 for Gallium3D by  doxygen 1.5.4