tgsi_sse2.c

Go to the documentation of this file.
00001 /**************************************************************************
00002  * 
00003  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
00004  * All Rights Reserved.
00005  * 
00006  * Permission is hereby granted, free of charge, to any person obtaining a
00007  * copy of this software and associated documentation files (the
00008  * "Software"), to deal in the Software without restriction, including
00009  * without limitation the rights to use, copy, modify, merge, publish,
00010  * distribute, sub license, and/or sell copies of the Software, and to
00011  * permit persons to whom the Software is furnished to do so, subject to
00012  * the following conditions:
00013  * 
00014  * The above copyright notice and this permission notice (including the
00015  * next paragraph) shall be included in all copies or substantial portions
00016  * of the Software.
00017  * 
00018  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
00019  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
00020  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
00021  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
00022  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
00023  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
00024  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
00025  * 
00026  **************************************************************************/
00027 
00028 #include "pipe/p_debug.h"
00029 #include "pipe/p_shader_tokens.h"
00030 #include "util/u_math.h"
00031 #include "tgsi/tgsi_parse.h"
00032 #include "tgsi/tgsi_util.h"
00033 #include "tgsi_exec.h"
00034 #include "tgsi_sse2.h"
00035 
00036 #include "rtasm/rtasm_x86sse.h"
00037 
00038 #ifdef PIPE_ARCH_X86
00039 
/*
 * Selects the refined (Newton-Raphson) 1/sqrt() sequence in emit_rsqrt().
 *
 * This costs about 100fps (close to 10%) in gears.
 */
#define HIGH_PRECISION 1

/* Non-zero: math helpers that test this flag use the util_fast_*() routines. */
#define FAST_MATH 1


/* Step CHAN through every channel index, 0 .. NUM_CHANNELS - 1. */
#define FOR_EACH_CHANNEL( CHAN ) \
   for ((CHAN) = 0; (CHAN) < NUM_CHANNELS; (CHAN)++)

/* Non-zero when channel CHAN is set in the write mask of destination 0. */
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN ) \
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

/*
 * Opens an `if` on the test above; the statement that follows the macro
 * becomes its body (mind a trailing `else`).
 */
#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN ) \
   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

/* Loop over only those channels that destination 0 writes. */
#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN ) \
   FOR_EACH_CHANNEL( CHAN ) \
      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )

/* Channel indices. */
#define CHAN_X 0
#define CHAN_Y 1
#define CHAN_Z 2
#define CHAN_W 3

/* Short aliases for (vector, channel) slots declared by tgsi_exec.h. */
#define TEMP_ONE_I         TGSI_EXEC_TEMP_ONE_I
#define TEMP_ONE_C         TGSI_EXEC_TEMP_ONE_C

#define TEMP_R0            TGSI_EXEC_TEMP_R0
#define TEMP_ADDR          TGSI_EXEC_TEMP_ADDR
#define TEMP_EXEC_MASK_I   TGSI_EXEC_MASK_I
#define TEMP_EXEC_MASK_C   TGSI_EXEC_MASK_C
00074 
00075 
00080 static struct x86_reg
00081 make_xmm(
00082    unsigned xmm )
00083 {
00084    return x86_make_reg(
00085       file_XMM,
00086       (enum x86_reg_name) xmm );
00087 }
00088 
00093 static struct x86_reg
00094 get_const_base( void )
00095 {
00096    return x86_make_reg(
00097       file_REG32,
00098       reg_CX );
00099 }
00100 
00101 static struct x86_reg
00102 get_input_base( void )
00103 {
00104    return x86_make_reg(
00105       file_REG32,
00106       reg_AX );
00107 }
00108 
00109 static struct x86_reg
00110 get_output_base( void )
00111 {
00112    return x86_make_reg(
00113       file_REG32,
00114       reg_DX );
00115 }
00116 
00117 static struct x86_reg
00118 get_temp_base( void )
00119 {
00120    return x86_make_reg(
00121       file_REG32,
00122       reg_BX );
00123 }
00124 
00125 static struct x86_reg
00126 get_coef_base( void )
00127 {
00128    return get_output_base();
00129 }
00130 
00131 static struct x86_reg
00132 get_immediate_base( void )
00133 {
00134    return x86_make_reg(
00135       file_REG32,
00136       reg_DI );
00137 }
00138 
00139 
00145 static struct x86_reg
00146 get_immediate(
00147    unsigned vec,
00148    unsigned chan )
00149 {
00150    return x86_make_disp(
00151       get_immediate_base(),
00152       (vec * 4 + chan) * 4 );
00153 }
00154 
00155 static struct x86_reg
00156 get_const(
00157    unsigned vec,
00158    unsigned chan )
00159 {
00160    return x86_make_disp(
00161       get_const_base(),
00162       (vec * 4 + chan) * 4 );
00163 }
00164 
00165 static struct x86_reg
00166 get_input(
00167    unsigned vec,
00168    unsigned chan )
00169 {
00170    return x86_make_disp(
00171       get_input_base(),
00172       (vec * 4 + chan) * 16 );
00173 }
00174 
00175 static struct x86_reg
00176 get_output(
00177    unsigned vec,
00178    unsigned chan )
00179 {
00180    return x86_make_disp(
00181       get_output_base(),
00182       (vec * 4 + chan) * 16 );
00183 }
00184 
00185 static struct x86_reg
00186 get_temp(
00187    unsigned vec,
00188    unsigned chan )
00189 {
00190    return x86_make_disp(
00191       get_temp_base(),
00192       (vec * 4 + chan) * 16 );
00193 }
00194 
00195 static struct x86_reg
00196 get_coef(
00197    unsigned vec,
00198    unsigned chan,
00199    unsigned member )
00200 {
00201    return x86_make_disp(
00202       get_coef_base(),
00203       ((vec * 3 + member) * 4 + chan) * 4 );
00204 }
00205 
00206 
/**
 * Append a RET instruction to the function being assembled.
 */
static void
emit_ret( struct x86_function *func )
{
   x86_ret( func );
}
00213 
00214 
00225 static void
00226 emit_const(
00227    struct x86_function *func,
00228    uint xmm,
00229    int vec,
00230    uint chan,
00231    uint indirect,
00232    uint indirectFile,
00233    int indirectIndex )
00234 {
00235    if (indirect) {
00236       /* 'vec' is the offset from the address register's value.
00237        * We're loading CONST[ADDR+vec] into an xmm register.
00238        */
00239       struct x86_reg r0 = get_input_base();
00240       struct x86_reg r1 = get_output_base();
00241       uint i;
00242 
00243       assert( indirectFile == TGSI_FILE_ADDRESS );
00244       assert( indirectIndex == 0 );
00245 
00246       x86_push( func, r0 );
00247       x86_push( func, r1 );
00248 
00249       /*
00250        * Loop over the four pixels or vertices in the quad.
00251        * Get the value of the address (offset) register for pixel/vertex[i],
00252        * add it to the src offset and index into the constant buffer.
00253        * Note that we're working on SOA data.
00254        * If any of the pixel/vertex execution channels are unused their
00255        * values will be garbage.  It's very important that we don't use
00256        * those garbage values as indexes into the constant buffer since
00257        * that'll cause segfaults.
00258        * The solution is to bitwise-AND the offset with the execution mask
00259        * register whose values are either 0 or ~0.
00260        * The caller must setup the execution mask register to indicate
00261        * which channels are valid/alive before running the shader.
00262        * The execution mask will also figure into loops and conditionals
00263        * someday.
00264        */
00265       for (i = 0; i < QUAD_SIZE; i++) {
00266          /* r1 = address register[i] */
00267          x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
00268          /* r0 = execution mask[i] */
00269          x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
00270          /* r1 = r1 & r0 */
00271          x86_and( func, r1, r0 );
00272          /* r0 = 'vec', the offset */
00273          x86_lea( func, r0, get_const( vec, chan ) );
00274 
00275          /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
00276           */
00277          x86_add( func, r1, r1 );
00278          x86_add( func, r1, r1 );
00279          x86_add( func, r1, r1 );
00280          x86_add( func, r1, r1 );
00281 
00282          x86_add( func, r0, r1 );  /* r0 = r0 + r1 */
00283          x86_mov( func, r1, x86_deref( r0 ) );
00284          x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
00285       }
00286 
00287       x86_pop( func, r1 );
00288       x86_pop( func, r0 );
00289 
00290       sse_movaps(
00291          func,
00292          make_xmm( xmm ),
00293          get_temp( TEMP_R0, CHAN_X ) );
00294    }
00295    else {
00296       /* 'vec' is the index into the src register file, such as TEMP[vec] */
00297       assert( vec >= 0 );
00298 
00299       sse_movss(
00300          func,
00301          make_xmm( xmm ),
00302          get_const( vec, chan ) );
00303       sse_shufps(
00304          func,
00305          make_xmm( xmm ),
00306          make_xmm( xmm ),
00307          SHUF( 0, 0, 0, 0 ) );
00308    }
00309 }
00310 
00311 static void
00312 emit_immediate(
00313    struct x86_function *func,
00314    unsigned xmm,
00315    unsigned vec,
00316    unsigned chan )
00317 {
00318    sse_movss(
00319       func,
00320       make_xmm( xmm ),
00321       get_immediate( vec, chan ) );
00322    sse_shufps(
00323       func,
00324       make_xmm( xmm ),
00325       make_xmm( xmm ),
00326       SHUF( 0, 0, 0, 0 ) );
00327 }
00328 
00329 
00336 static void
00337 emit_inputf(
00338    struct x86_function *func,
00339    unsigned xmm,
00340    unsigned vec,
00341    unsigned chan )
00342 {
00343    sse_movups(
00344       func,
00345       make_xmm( xmm ),
00346       get_input( vec, chan ) );
00347 }
00348 
00355 static void
00356 emit_output(
00357    struct x86_function *func,
00358    unsigned xmm,
00359    unsigned vec,
00360    unsigned chan )
00361 {
00362    sse_movups(
00363       func,
00364       get_output( vec, chan ),
00365       make_xmm( xmm ) );
00366 }
00367 
00374 static void
00375 emit_tempf(
00376    struct x86_function *func,
00377    unsigned xmm,
00378    unsigned vec,
00379    unsigned chan )
00380 {
00381    sse_movaps(
00382       func,
00383       make_xmm( xmm ),
00384       get_temp( vec, chan ) );
00385 }
00386 
00394 static void
00395 emit_coef(
00396    struct x86_function *func,
00397    unsigned xmm,
00398    unsigned vec,
00399    unsigned chan,
00400    unsigned member )
00401 {
00402    sse_movss(
00403       func,
00404       make_xmm( xmm ),
00405       get_coef( vec, chan, member ) );
00406    sse_shufps(
00407       func,
00408       make_xmm( xmm ),
00409       make_xmm( xmm ),
00410       SHUF( 0, 0, 0, 0 ) );
00411 }
00412 
00417 static void
00418 emit_inputs(
00419    struct x86_function *func,
00420    unsigned xmm,
00421    unsigned vec,
00422    unsigned chan )
00423 {
00424    sse_movups(
00425       func,
00426       get_input( vec, chan ),
00427       make_xmm( xmm ) );
00428 }
00429 
00430 static void
00431 emit_temps(
00432    struct x86_function *func,
00433    unsigned xmm,
00434    unsigned vec,
00435    unsigned chan )
00436 {
00437    sse_movaps(
00438       func,
00439       get_temp( vec, chan ),
00440       make_xmm( xmm ) );
00441 }
00442 
00443 static void
00444 emit_addrs(
00445    struct x86_function *func,
00446    unsigned xmm,
00447    unsigned vec,
00448    unsigned chan )
00449 {
00450    assert( vec == 0 );
00451 
00452    emit_temps(
00453       func,
00454       xmm,
00455       vec + TGSI_EXEC_TEMP_ADDR,
00456       chan );
00457 }
00458 
/**
 * Load coefficient member 0 of attribute `vec`, channel `chan`, into SSE
 * register `xmm` (see emit_coef()).
 */
static void
emit_coef_a0(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   const unsigned member = 0;

   emit_coef( func, xmm, vec, chan, member );
}
00477 
/**
 * Load coefficient member 1 of attribute `vec`, channel `chan`, into SSE
 * register `xmm` (see emit_coef()).
 */
static void
emit_coef_dadx(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   const unsigned member = 1;

   emit_coef( func, xmm, vec, chan, member );
}
00492 
/**
 * Load coefficient member 2 of attribute `vec`, channel `chan`, into SSE
 * register `xmm` (see emit_coef()).
 */
static void
emit_coef_dady(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   const unsigned member = 2;

   emit_coef( func, xmm, vec, chan, member );
}
00507 
00512 static void
00513 emit_push_gp(
00514    struct x86_function *func )
00515 {
00516    x86_push(
00517       func,
00518       x86_make_reg( file_REG32, reg_AX) );
00519    x86_push(
00520       func,
00521       x86_make_reg( file_REG32, reg_CX) );
00522    x86_push(
00523       func,
00524       x86_make_reg( file_REG32, reg_DX) );
00525 }
00526 
00527 static void
00528 x86_pop_gp(
00529    struct x86_function *func )
00530 {
00531    /* Restore GP registers in a reverse order.
00532     */
00533    x86_pop(
00534       func,
00535       x86_make_reg( file_REG32, reg_DX) );
00536    x86_pop(
00537       func,
00538       x86_make_reg( file_REG32, reg_CX) );
00539    x86_pop(
00540       func,
00541       x86_make_reg( file_REG32, reg_AX) );
00542 }
00543 
00544 static void
00545 emit_func_call_dst(
00546    struct x86_function *func,
00547    unsigned xmm_dst,
00548    void (PIPE_CDECL *code)() )
00549 {
00550    sse_movaps(
00551       func,
00552       get_temp( TEMP_R0, 0 ),
00553       make_xmm( xmm_dst ) );
00554 
00555    emit_push_gp(
00556       func );
00557 
00558    {
00559       struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
00560 
00561       x86_lea(
00562          func,
00563          ecx,
00564          get_temp( TEMP_R0, 0 ) );
00565 
00566       x86_push( func, ecx );
00567       x86_mov_reg_imm( func, ecx, (unsigned long) code );
00568       x86_call( func, ecx );
00569       x86_pop(func, ecx ); 
00570    }
00571 
00572 
00573    x86_pop_gp(
00574       func );
00575 
00576    sse_movaps(
00577       func,
00578       make_xmm( xmm_dst ),
00579       get_temp( TEMP_R0, 0 ) );
00580 }
00581 
00582 static void
00583 emit_func_call_dst_src(
00584    struct x86_function *func,
00585    unsigned xmm_dst,
00586    unsigned xmm_src,
00587    void (PIPE_CDECL *code)() )
00588 {
00589    sse_movaps(
00590       func,
00591       get_temp( TEMP_R0, 1 ),
00592       make_xmm( xmm_src ) );
00593 
00594    emit_func_call_dst(
00595       func,
00596       xmm_dst,
00597       code );
00598 }
00599 
00604 static void
00605 emit_abs(
00606    struct x86_function *func,
00607    unsigned xmm )
00608 {
00609    sse_andps(
00610       func,
00611       make_xmm( xmm ),
00612       get_temp(
00613          TGSI_EXEC_TEMP_7FFFFFFF_I,
00614          TGSI_EXEC_TEMP_7FFFFFFF_C ) );
00615 }
00616 
00617 static void
00618 emit_add(
00619    struct x86_function *func,
00620    unsigned xmm_dst,
00621    unsigned xmm_src )
00622 {
00623    sse_addps(
00624       func,
00625       make_xmm( xmm_dst ),
00626       make_xmm( xmm_src ) );
00627 }
00628 
00629 static void PIPE_CDECL
00630 cos4f(
00631    float *store )
00632 {
00633    store[0] = cosf( store[0] );
00634    store[1] = cosf( store[1] );
00635    store[2] = cosf( store[2] );
00636    store[3] = cosf( store[3] );
00637 }
00638 
00639 static void
00640 emit_cos(
00641    struct x86_function *func,
00642    unsigned xmm_dst )
00643 {
00644    emit_func_call_dst(
00645       func,
00646       xmm_dst,
00647       cos4f );
00648 }
00649 
00650 static void PIPE_CDECL
00651 ex24f(
00652    float *store )
00653 {
00654 #if FAST_MATH
00655    store[0] = util_fast_exp2( store[0] );
00656    store[1] = util_fast_exp2( store[1] );
00657    store[2] = util_fast_exp2( store[2] );
00658    store[3] = util_fast_exp2( store[3] );
00659 #else
00660    store[0] = powf( 2.0f, store[0] );
00661    store[1] = powf( 2.0f, store[1] );
00662    store[2] = powf( 2.0f, store[2] );
00663    store[3] = powf( 2.0f, store[3] );
00664 #endif
00665 }
00666 
00667 static void
00668 emit_ex2(
00669    struct x86_function *func,
00670    unsigned xmm_dst )
00671 {
00672    emit_func_call_dst(
00673       func,
00674       xmm_dst,
00675       ex24f );
00676 }
00677 
00678 static void
00679 emit_f2it(
00680    struct x86_function *func,
00681    unsigned xmm )
00682 {
00683    sse2_cvttps2dq(
00684       func,
00685       make_xmm( xmm ),
00686       make_xmm( xmm ) );
00687 }
00688 
00689 static void
00690 emit_i2f(
00691    struct x86_function *func,
00692    unsigned xmm )
00693 {
00694    sse2_cvtdq2ps(
00695       func,
00696       make_xmm( xmm ),
00697       make_xmm( xmm ) );
00698 }
00699 
00700 static void PIPE_CDECL
00701 flr4f(
00702    float *store )
00703 {
00704    store[0] = floorf( store[0] );
00705    store[1] = floorf( store[1] );
00706    store[2] = floorf( store[2] );
00707    store[3] = floorf( store[3] );
00708 }
00709 
00710 static void
00711 emit_flr(
00712    struct x86_function *func,
00713    unsigned xmm_dst )
00714 {
00715    emit_func_call_dst(
00716       func,
00717       xmm_dst,
00718       flr4f );
00719 }
00720 
00721 static void PIPE_CDECL
00722 frc4f(
00723    float *store )
00724 {
00725    store[0] -= floorf( store[0] );
00726    store[1] -= floorf( store[1] );
00727    store[2] -= floorf( store[2] );
00728    store[3] -= floorf( store[3] );
00729 }
00730 
00731 static void
00732 emit_frc(
00733    struct x86_function *func,
00734    unsigned xmm_dst )
00735 {
00736    emit_func_call_dst(
00737       func,
00738       xmm_dst,
00739       frc4f );
00740 }
00741 
00742 static void PIPE_CDECL
00743 lg24f(
00744    float *store )
00745 {
00746    store[0] = util_fast_log2( store[0] );
00747    store[1] = util_fast_log2( store[1] );
00748    store[2] = util_fast_log2( store[2] );
00749    store[3] = util_fast_log2( store[3] );
00750 }
00751 
00752 static void
00753 emit_lg2(
00754    struct x86_function *func,
00755    unsigned xmm_dst )
00756 {
00757    emit_func_call_dst(
00758       func,
00759       xmm_dst,
00760       lg24f );
00761 }
00762 
00763 static void
00764 emit_MOV(
00765    struct x86_function *func,
00766    unsigned xmm_dst,
00767    unsigned xmm_src )
00768 {
00769    sse_movups(
00770       func,
00771       make_xmm( xmm_dst ),
00772       make_xmm( xmm_src ) );
00773 }
00774 
00775 static void
00776 emit_mul (struct x86_function *func,
00777           unsigned xmm_dst,
00778           unsigned xmm_src)
00779 {
00780    sse_mulps(
00781       func,
00782       make_xmm( xmm_dst ),
00783       make_xmm( xmm_src ) );
00784 }
00785 
00786 static void
00787 emit_neg(
00788    struct x86_function *func,
00789    unsigned xmm )
00790 {
00791    sse_xorps(
00792       func,
00793       make_xmm( xmm ),
00794       get_temp(
00795          TGSI_EXEC_TEMP_80000000_I,
00796          TGSI_EXEC_TEMP_80000000_C ) );
00797 }
00798 
00799 static void PIPE_CDECL
00800 pow4f(
00801    float *store )
00802 {
00803 #if FAST_MATH
00804    store[0] = util_fast_pow( store[0], store[4] );
00805    store[1] = util_fast_pow( store[1], store[5] );
00806    store[2] = util_fast_pow( store[2], store[6] );
00807    store[3] = util_fast_pow( store[3], store[7] );
00808 #else
00809    store[0] = powf( store[0], store[4] );
00810    store[1] = powf( store[1], store[5] );
00811    store[2] = powf( store[2], store[6] );
00812    store[3] = powf( store[3], store[7] );
00813 #endif
00814 }
00815 
00816 static void
00817 emit_pow(
00818    struct x86_function *func,
00819    unsigned xmm_dst,
00820    unsigned xmm_src )
00821 {
00822    emit_func_call_dst_src(
00823       func,
00824       xmm_dst,
00825       xmm_src,
00826       pow4f );
00827 }
00828 
00829 static void
00830 emit_rcp (
00831    struct x86_function *func,
00832    unsigned xmm_dst,
00833    unsigned xmm_src )
00834 {
00835    /* On Intel CPUs at least, this is only accurate to 12 bits -- not
00836     * good enough.  Need to either emit a proper divide or use the
00837     * iterative technique described below in emit_rsqrt().
00838     */
00839    sse2_rcpps(
00840       func,
00841       make_xmm( xmm_dst ),
00842       make_xmm( xmm_src ) );
00843 }
00844 
00845 static void PIPE_CDECL
00846 rnd4f(
00847    float *store )
00848 {
00849    store[0] = floorf( store[0] + 0.5f );
00850    store[1] = floorf( store[1] + 0.5f );
00851    store[2] = floorf( store[2] + 0.5f );
00852    store[3] = floorf( store[3] + 0.5f );
00853 }
00854 
00855 static void
00856 emit_rnd(
00857    struct x86_function *func,
00858    unsigned xmm_dst )
00859 {
00860    emit_func_call_dst(
00861       func,
00862       xmm_dst,
00863       rnd4f );
00864 }
00865 
/**
 * Emit code computing xmm_dst = 1 / sqrt( xmm_src ).
 *
 * With HIGH_PRECISION set, the RSQRTPS estimate is refined by one
 * Newton-Raphson step.  That sequence overwrites xmm_src and uses xmm2 and
 * xmm3 as scratch, so xmm_dst and xmm_src must differ from each other and
 * from registers 2 and 3 (asserted).  Without HIGH_PRECISION a bare
 * RSQRTPS is emitted.
 */
static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    * implementations, it is possible to improve its precision at
    * fairly low cost, using a newton/raphson step, as below:
    *
    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
    *
    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
    */
   const struct x86_reg result   = make_xmm( xmm_dst );
   const struct x86_reg operand  = make_xmm( xmm_src );
   const struct x86_reg three    = make_xmm( 2 );
   const struct x86_reg estimate = make_xmm( 3 );

   assert( xmm_dst != xmm_src );
   assert( xmm_dst != 2 && xmm_dst != 3 );
   assert( xmm_src != 2 && xmm_src != 3 );

   /* result = 0.5, three = 3.0 */
   sse_movaps(  func, result, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
   sse_movaps(  func, three,  get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
   /* estimate = rsqrtps(a) */
   sse_rsqrtps( func, estimate, operand );
   /* operand = a * estimate */
   sse_mulps(   func, operand, estimate );
   /* result = 0.5 * estimate */
   sse_mulps(   func, result, estimate );
   /* operand = a * estimate * estimate */
   sse_mulps(   func, operand, estimate );
   /* three = 3.0 - a * estimate * estimate */
   sse_subps(   func, three, operand );
   /* result = 0.5 * estimate * (3.0 - a * estimate * estimate) */
   sse_mulps(   func, result, three );
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
#endif
}
00911 
00912 static void
00913 emit_setsign(
00914    struct x86_function *func,
00915    unsigned xmm )
00916 {
00917    sse_orps(
00918       func,
00919       make_xmm( xmm ),
00920       get_temp(
00921          TGSI_EXEC_TEMP_80000000_I,
00922          TGSI_EXEC_TEMP_80000000_C ) );
00923 }
00924 
/**
 * Replace each of the four floats at `store` with its sign: -1.0 for
 * values below zero, 1.0 for values above zero and 0.0 for anything else.
 */
static void PIPE_CDECL
sgn4f(
   float *store )
{
   unsigned i;

   for (i = 0; i < 4; i++) {
      const float value = store[i];

      if (value < 0.0f) {
         store[i] = -1.0f;
      }
      else if (value > 0.0f) {
         store[i] = 1.0f;
      }
      else {
         store[i] = 0.0f;
      }
   }
}
00934 
00935 static void
00936 emit_sgn(
00937    struct x86_function *func,
00938    unsigned xmm_dst )
00939 {
00940    emit_func_call_dst(
00941       func,
00942       xmm_dst,
00943       sgn4f );
00944 }
00945 
00946 static void PIPE_CDECL
00947 sin4f(
00948    float *store )
00949 {
00950    store[0] = sinf( store[0] );
00951    store[1] = sinf( store[1] );
00952    store[2] = sinf( store[2] );
00953    store[3] = sinf( store[3] );
00954 }
00955 
00956 static void
00957 emit_sin (struct x86_function *func,
00958           unsigned xmm_dst)
00959 {
00960    emit_func_call_dst(
00961       func,
00962       xmm_dst,
00963       sin4f );
00964 }
00965 
00966 static void
00967 emit_sub(
00968    struct x86_function *func,
00969    unsigned xmm_dst,
00970    unsigned xmm_src )
00971 {
00972    sse_subps(
00973       func,
00974       make_xmm( xmm_dst ),
00975       make_xmm( xmm_src ) );
00976 }
00977 
00982 static void
00983 emit_fetch(
00984    struct x86_function *func,
00985    unsigned xmm,
00986    const struct tgsi_full_src_register *reg,
00987    const unsigned chan_index )
00988 {
00989    unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
00990 
00991    switch (swizzle) {
00992    case TGSI_EXTSWIZZLE_X:
00993    case TGSI_EXTSWIZZLE_Y:
00994    case TGSI_EXTSWIZZLE_Z:
00995    case TGSI_EXTSWIZZLE_W:
00996       switch (reg->SrcRegister.File) {
00997       case TGSI_FILE_CONSTANT:
00998          emit_const(
00999             func,
01000             xmm,
01001             reg->SrcRegister.Index,
01002             swizzle,
01003             reg->SrcRegister.Indirect,
01004             reg->SrcRegisterInd.File,
01005             reg->SrcRegisterInd.Index );
01006          break;
01007 
01008       case TGSI_FILE_IMMEDIATE:
01009          emit_immediate(
01010             func,
01011             xmm,
01012             reg->SrcRegister.Index,
01013             swizzle );
01014          break;
01015 
01016       case TGSI_FILE_INPUT:
01017          emit_inputf(
01018             func,
01019             xmm,
01020             reg->SrcRegister.Index,
01021             swizzle );
01022          break;
01023 
01024       case TGSI_FILE_TEMPORARY:
01025          emit_tempf(
01026             func,
01027             xmm,
01028             reg->SrcRegister.Index,
01029             swizzle );
01030          break;
01031 
01032       default:
01033          assert( 0 );
01034       }
01035       break;
01036 
01037    case TGSI_EXTSWIZZLE_ZERO:
01038       emit_tempf(
01039          func,
01040          xmm,
01041          TGSI_EXEC_TEMP_00000000_I,
01042          TGSI_EXEC_TEMP_00000000_C );
01043       break;
01044 
01045    case TGSI_EXTSWIZZLE_ONE:
01046       emit_tempf(
01047          func,
01048          xmm,
01049          TEMP_ONE_I,
01050          TEMP_ONE_C );
01051       break;
01052 
01053    default:
01054       assert( 0 );
01055    }
01056 
01057    switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
01058    case TGSI_UTIL_SIGN_CLEAR:
01059       emit_abs( func, xmm );
01060       break;
01061 
01062    case TGSI_UTIL_SIGN_SET:
01063       emit_setsign( func, xmm );
01064       break;
01065 
01066    case TGSI_UTIL_SIGN_TOGGLE:
01067       emit_neg( func, xmm );
01068       break;
01069 
01070    case TGSI_UTIL_SIGN_KEEP:
01071       break;
01072    }
01073 }
01074 
01075 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
01076    emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
01077 
01082 static void
01083 emit_store(
01084    struct x86_function *func,
01085    unsigned xmm,
01086    const struct tgsi_full_dst_register *reg,
01087    const struct tgsi_full_instruction *inst,
01088    unsigned chan_index )
01089 {
01090    switch( reg->DstRegister.File ) {
01091    case TGSI_FILE_OUTPUT:
01092       emit_output(
01093          func,
01094          xmm,
01095          reg->DstRegister.Index,
01096          chan_index );
01097       break;
01098 
01099    case TGSI_FILE_TEMPORARY:
01100       emit_temps(
01101          func,
01102          xmm,
01103          reg->DstRegister.Index,
01104          chan_index );
01105       break;
01106 
01107    case TGSI_FILE_ADDRESS:
01108       emit_addrs(
01109          func,
01110          xmm,
01111          reg->DstRegister.Index,
01112          chan_index );
01113       break;
01114 
01115    default:
01116       assert( 0 );
01117    }
01118 
01119    switch( inst->Instruction.Saturate ) {
01120    case TGSI_SAT_NONE:
01121       break;
01122 
01123    case TGSI_SAT_ZERO_ONE:
01124       /* assert( 0 ); */
01125       break;
01126 
01127    case TGSI_SAT_MINUS_PLUS_ONE:
01128       assert( 0 );
01129       break;
01130    }
01131 }
01132 
01133 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
01134    emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
01135 
01140 static void
01141 emit_kil(
01142    struct x86_function *func,
01143    const struct tgsi_full_src_register *reg )
01144 {
01145    unsigned uniquemask;
01146    unsigned registers[4];
01147    unsigned nextregister = 0;
01148    unsigned firstchan = ~0;
01149    unsigned chan_index;
01150 
01151    /* This mask stores component bits that were already tested. Note that
01152     * we test if the value is less than zero, so 1.0 and 0.0 need not to be
01153     * tested. */
01154    uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
01155 
01156    FOR_EACH_CHANNEL( chan_index ) {
01157       unsigned swizzle;
01158 
01159       /* unswizzle channel */
01160       swizzle = tgsi_util_get_full_src_register_extswizzle(
01161          reg,
01162          chan_index );
01163 
01164       /* check if the component has not been already tested */
01165       if( !(uniquemask & (1 << swizzle)) ) {
01166          uniquemask |= 1 << swizzle;
01167 
01168          /* allocate register */
01169          registers[chan_index] = nextregister;
01170          emit_fetch(
01171             func,
01172             nextregister,
01173             reg,
01174             chan_index );
01175          nextregister++;
01176 
01177          /* mark the first channel used */
01178          if( firstchan == ~0 ) {
01179             firstchan = chan_index;
01180          }
01181       }
01182    }
01183 
01184    x86_push(
01185       func,
01186       x86_make_reg( file_REG32, reg_AX ) );
01187    x86_push(
01188       func,
01189       x86_make_reg( file_REG32, reg_DX ) );
01190 
01191    FOR_EACH_CHANNEL( chan_index ) {
01192       if( uniquemask & (1 << chan_index) ) {
01193          sse_cmpps(
01194             func,
01195             make_xmm( registers[chan_index] ),
01196             get_temp(
01197                TGSI_EXEC_TEMP_00000000_I,
01198                TGSI_EXEC_TEMP_00000000_C ),
01199             cc_LessThan );
01200 
01201          if( chan_index == firstchan ) {
01202             sse_pmovmskb(
01203                func,
01204                x86_make_reg( file_REG32, reg_AX ),
01205                make_xmm( registers[chan_index] ) );
01206          }
01207          else {
01208             sse_pmovmskb(
01209                func,
01210                x86_make_reg( file_REG32, reg_DX ),
01211                make_xmm( registers[chan_index] ) );
01212             x86_or(
01213                func,
01214                x86_make_reg( file_REG32, reg_AX ),
01215                x86_make_reg( file_REG32, reg_DX ) );
01216          }
01217       }
01218    }
01219 
01220    x86_or(
01221       func,
01222       get_temp(
01223          TGSI_EXEC_TEMP_KILMASK_I,
01224          TGSI_EXEC_TEMP_KILMASK_C ),
01225       x86_make_reg( file_REG32, reg_AX ) );
01226 
01227    x86_pop(
01228       func,
01229       x86_make_reg( file_REG32, reg_DX ) );
01230    x86_pop(
01231       func,
01232       x86_make_reg( file_REG32, reg_AX ) );
01233 }
01234 
01235 
/**
 * Placeholder for the KILP opcode: no code is emitted.
 */
static void
emit_kilp(
   struct x86_function *func )
{
   /* XXX todo / fix me */
   (void) func;
}
01242 
01243 
01244 static void
01245 emit_setcc(
01246    struct x86_function *func,
01247    struct tgsi_full_instruction *inst,
01248    enum sse_cc cc )
01249 {
01250    unsigned chan_index;
01251 
01252    FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01253       FETCH( func, *inst, 0, 0, chan_index );
01254       FETCH( func, *inst, 1, 1, chan_index );
01255       sse_cmpps(
01256          func,
01257          make_xmm( 0 ),
01258          make_xmm( 1 ),
01259          cc );
01260       sse_andps(
01261          func,
01262          make_xmm( 0 ),
01263          get_temp(
01264             TEMP_ONE_I,
01265             TEMP_ONE_C ) );
01266       STORE( func, *inst, 0, 0, chan_index );
01267    }
01268 }
01269 
01270 static void
01271 emit_cmp(
01272    struct x86_function *func,
01273    struct tgsi_full_instruction *inst )
01274 {
01275    unsigned chan_index;
01276 
01277    FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01278       FETCH( func, *inst, 0, 0, chan_index );
01279       FETCH( func, *inst, 1, 1, chan_index );
01280       FETCH( func, *inst, 2, 2, chan_index );
01281       sse_cmpps(
01282          func,
01283          make_xmm( 0 ),
01284          get_temp(
01285             TGSI_EXEC_TEMP_00000000_I,
01286             TGSI_EXEC_TEMP_00000000_C ),
01287          cc_LessThan );
01288       sse_andps(
01289          func,
01290          make_xmm( 1 ),
01291          make_xmm( 0 ) );
01292       sse_andnps(
01293          func,
01294          make_xmm( 0 ),
01295          make_xmm( 2 ) );
01296       sse_orps(
01297          func,
01298          make_xmm( 0 ),
01299          make_xmm( 1 ) );
01300       STORE( func, *inst, 0, 0, chan_index );
01301    }
01302 }
01303 
01304 
01309 static boolean
01310 indirect_temp_reference(const struct tgsi_full_instruction *inst)
01311 {
01312    uint i;
01313    for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
01314       const struct tgsi_full_src_register *reg = &inst->FullSrcRegisters[i];
01315       if (reg->SrcRegister.File == TGSI_FILE_TEMPORARY &&
01316           reg->SrcRegister.Indirect)
01317          return TRUE;
01318    }
01319    for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
01320       const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[i];
01321       if (reg->DstRegister.File == TGSI_FILE_TEMPORARY &&
01322           reg->DstRegister.Indirect)
01323          return TRUE;
01324    }
01325    return FALSE;
01326 }
01327 
01328 
01329 static int
01330 emit_instruction(
01331    struct x86_function *func,
01332    struct tgsi_full_instruction *inst )
01333 {
01334    unsigned chan_index;
01335 
01336    /* we can't handle indirect addressing into temp register file yet */
01337    if (indirect_temp_reference(inst))
01338       return FALSE;
01339 
01340    switch (inst->Instruction.Opcode) {
01341    case TGSI_OPCODE_ARL:
01342       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01343          FETCH( func, *inst, 0, 0, chan_index );
01344          emit_flr(func, 0);
01345          emit_f2it( func, 0 );
01346          STORE( func, *inst, 0, 0, chan_index );
01347       }
01348       break;
01349 
01350    case TGSI_OPCODE_MOV:
01351    case TGSI_OPCODE_SWZ:
01352       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01353          FETCH( func, *inst, 0, 0, chan_index );
01354          STORE( func, *inst, 0, 0, chan_index );
01355       }
01356       break;
01357 
01358    case TGSI_OPCODE_LIT:
01359       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
01360           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
01361          emit_tempf(
01362             func,
01363             0,
01364             TEMP_ONE_I,
01365             TEMP_ONE_C);
01366          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
01367             STORE( func, *inst, 0, 0, CHAN_X );
01368          }
01369          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
01370             STORE( func, *inst, 0, 0, CHAN_W );
01371          }
01372       }
01373       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
01374           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
01375          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
01376             FETCH( func, *inst, 0, 0, CHAN_X );
01377             sse_maxps(
01378                func,
01379                make_xmm( 0 ),
01380                get_temp(
01381                   TGSI_EXEC_TEMP_00000000_I,
01382                   TGSI_EXEC_TEMP_00000000_C ) );
01383             STORE( func, *inst, 0, 0, CHAN_Y );
01384          }
01385          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
01386             /* XMM[1] = SrcReg[0].yyyy */
01387             FETCH( func, *inst, 1, 0, CHAN_Y );
01388             /* XMM[1] = max(XMM[1], 0) */
01389             sse_maxps(
01390                func,
01391                make_xmm( 1 ),
01392                get_temp(
01393                   TGSI_EXEC_TEMP_00000000_I,
01394                   TGSI_EXEC_TEMP_00000000_C ) );
01395             /* XMM[2] = SrcReg[0].wwww */
01396             FETCH( func, *inst, 2, 0, CHAN_W );
01397             /* XMM[2] = min(XMM[2], 128.0) */
01398             sse_minps(
01399                func,
01400                make_xmm( 2 ),
01401                get_temp(
01402                   TGSI_EXEC_TEMP_128_I,
01403                   TGSI_EXEC_TEMP_128_C ) );
01404             /* XMM[2] = max(XMM[2], -128.0) */
01405             sse_maxps(
01406                func,
01407                make_xmm( 2 ),
01408                get_temp(
01409                   TGSI_EXEC_TEMP_MINUS_128_I,
01410                   TGSI_EXEC_TEMP_MINUS_128_C ) );
01411             emit_pow( func, 1, 2 );
01412             FETCH( func, *inst, 0, 0, CHAN_X );
01413             sse_xorps(
01414                func,
01415                make_xmm( 2 ),
01416                make_xmm( 2 ) );
01417             sse_cmpps(
01418                func,
01419                make_xmm( 2 ),
01420                make_xmm( 0 ),
01421                cc_LessThan );
01422             sse_andps(
01423                func,
01424                make_xmm( 2 ),
01425                make_xmm( 1 ) );
01426             STORE( func, *inst, 2, 0, CHAN_Z );
01427          }
01428       }
01429       break;
01430 
01431    case TGSI_OPCODE_RCP:
01432    /* TGSI_OPCODE_RECIP */
01433       FETCH( func, *inst, 0, 0, CHAN_X );
01434       emit_rcp( func, 0, 0 );
01435       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01436          STORE( func, *inst, 0, 0, chan_index );
01437       }
01438       break;
01439 
01440    case TGSI_OPCODE_RSQ:
01441    /* TGSI_OPCODE_RECIPSQRT */
01442       FETCH( func, *inst, 0, 0, CHAN_X );
01443       emit_rsqrt( func, 1, 0 );
01444       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01445          STORE( func, *inst, 1, 0, chan_index );
01446       }
01447       break;
01448 
01449    case TGSI_OPCODE_EXP:
01450       if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
01451           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
01452           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
01453          FETCH( func, *inst, 0, 0, CHAN_X );
01454          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
01455              IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
01456             emit_MOV( func, 1, 0 );
01457             emit_flr( func, 1 );
01458             /* dst.x = ex2(floor(src.x)) */
01459             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
01460                emit_MOV( func, 2, 1 );
01461                emit_ex2( func, 2 );
01462                STORE( func, *inst, 2, 0, CHAN_X );
01463             }
01464             /* dst.y = src.x - floor(src.x) */
01465             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
01466                emit_MOV( func, 2, 0 );
01467                emit_sub( func, 2, 1 );
01468                STORE( func, *inst, 2, 0, CHAN_Y );
01469             }
01470          }
01471          /* dst.z = ex2(src.x) */
01472          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
01473             emit_ex2( func, 0 );
01474             STORE( func, *inst, 0, 0, CHAN_Z );
01475          }
01476       }
01477       /* dst.w = 1.0 */
01478       if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
01479          emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
01480          STORE( func, *inst, 0, 0, CHAN_W );
01481       }
01482       break;
01483 
01484    case TGSI_OPCODE_LOG:
01485       if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
01486           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
01487           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
01488          FETCH( func, *inst, 0, 0, CHAN_X );
01489          emit_abs( func, 0 );
01490          emit_MOV( func, 1, 0 );
01491          emit_lg2( func, 1 );
01492          /* dst.z = lg2(abs(src.x)) */
01493          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
01494             STORE( func, *inst, 1, 0, CHAN_Z );
01495          }
01496          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
01497              IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
01498             emit_flr( func, 1 );
01499             /* dst.x = floor(lg2(abs(src.x))) */
01500             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
01501                STORE( func, *inst, 1, 0, CHAN_X );
01502             }
01503             /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
01504             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
01505                emit_ex2( func, 1 );
01506                emit_rcp( func, 1, 1 );
01507                emit_mul( func, 0, 1 );
01508                STORE( func, *inst, 0, 0, CHAN_Y );
01509             }
01510          }
01511       }
01512       /* dst.w = 1.0 */
01513       if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
01514          emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
01515          STORE( func, *inst, 0, 0, CHAN_W );
01516       }
01517       break;
01518 
01519    case TGSI_OPCODE_MUL:
01520       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01521          FETCH( func, *inst, 0, 0, chan_index );
01522          FETCH( func, *inst, 1, 1, chan_index );
01523          emit_mul( func, 0, 1 );
01524          STORE( func, *inst, 0, 0, chan_index );
01525       }
01526       break;
01527 
01528    case TGSI_OPCODE_ADD:
01529       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01530          FETCH( func, *inst, 0, 0, chan_index );
01531          FETCH( func, *inst, 1, 1, chan_index );
01532          emit_add( func, 0, 1 );
01533          STORE( func, *inst, 0, 0, chan_index );
01534       }
01535       break;
01536 
01537    case TGSI_OPCODE_DP3:
01538    /* TGSI_OPCODE_DOT3 */
01539       FETCH( func, *inst, 0, 0, CHAN_X );
01540       FETCH( func, *inst, 1, 1, CHAN_X );
01541       emit_mul( func, 0, 1 );
01542       FETCH( func, *inst, 1, 0, CHAN_Y );
01543       FETCH( func, *inst, 2, 1, CHAN_Y );
01544       emit_mul( func, 1, 2 );
01545       emit_add( func, 0, 1 );
01546       FETCH( func, *inst, 1, 0, CHAN_Z );
01547       FETCH( func, *inst, 2, 1, CHAN_Z );
01548       emit_mul( func, 1, 2 );
01549       emit_add( func, 0, 1 );
01550       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01551          STORE( func, *inst, 0, 0, chan_index );
01552       }
01553       break;
01554 
01555    case TGSI_OPCODE_DP4:
01556    /* TGSI_OPCODE_DOT4 */
01557       FETCH( func, *inst, 0, 0, CHAN_X );
01558       FETCH( func, *inst, 1, 1, CHAN_X );
01559       emit_mul( func, 0, 1 );
01560       FETCH( func, *inst, 1, 0, CHAN_Y );
01561       FETCH( func, *inst, 2, 1, CHAN_Y );
01562       emit_mul( func, 1, 2 );
01563       emit_add( func, 0, 1 );
01564       FETCH( func, *inst, 1, 0, CHAN_Z );
01565       FETCH( func, *inst, 2, 1, CHAN_Z );
01566       emit_mul(func, 1, 2 );
01567       emit_add(func, 0, 1 );
01568       FETCH( func, *inst, 1, 0, CHAN_W );
01569       FETCH( func, *inst, 2, 1, CHAN_W );
01570       emit_mul( func, 1, 2 );
01571       emit_add( func, 0, 1 );
01572       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01573          STORE( func, *inst, 0, 0, chan_index );
01574       }
01575       break;
01576 
01577    case TGSI_OPCODE_DST:
01578       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
01579          emit_tempf(
01580             func,
01581             0,
01582             TEMP_ONE_I,
01583             TEMP_ONE_C );
01584          STORE( func, *inst, 0, 0, CHAN_X );
01585       }
01586       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
01587          FETCH( func, *inst, 0, 0, CHAN_Y );
01588          FETCH( func, *inst, 1, 1, CHAN_Y );
01589          emit_mul( func, 0, 1 );
01590          STORE( func, *inst, 0, 0, CHAN_Y );
01591       }
01592       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
01593          FETCH( func, *inst, 0, 0, CHAN_Z );
01594          STORE( func, *inst, 0, 0, CHAN_Z );
01595       }
01596       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
01597          FETCH( func, *inst, 0, 1, CHAN_W );
01598          STORE( func, *inst, 0, 0, CHAN_W );
01599       }
01600       break;
01601 
01602    case TGSI_OPCODE_MIN:
01603       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01604          FETCH( func, *inst, 0, 0, chan_index );
01605          FETCH( func, *inst, 1, 1, chan_index );
01606          sse_minps(
01607             func,
01608             make_xmm( 0 ),
01609             make_xmm( 1 ) );
01610          STORE( func, *inst, 0, 0, chan_index );
01611       }
01612       break;
01613 
01614    case TGSI_OPCODE_MAX:
01615       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01616          FETCH( func, *inst, 0, 0, chan_index );
01617          FETCH( func, *inst, 1, 1, chan_index );
01618          sse_maxps(
01619             func,
01620             make_xmm( 0 ),
01621             make_xmm( 1 ) );
01622          STORE( func, *inst, 0, 0, chan_index );
01623       }
01624       break;
01625 
01626    case TGSI_OPCODE_SLT:
01627    /* TGSI_OPCODE_SETLT */
01628       emit_setcc( func, inst, cc_LessThan );
01629       break;
01630 
01631    case TGSI_OPCODE_SGE:
01632    /* TGSI_OPCODE_SETGE */
01633       emit_setcc( func, inst, cc_NotLessThan );
01634       break;
01635 
01636    case TGSI_OPCODE_MAD:
01637    /* TGSI_OPCODE_MADD */
01638       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01639          FETCH( func, *inst, 0, 0, chan_index );
01640          FETCH( func, *inst, 1, 1, chan_index );
01641          FETCH( func, *inst, 2, 2, chan_index );
01642          emit_mul( func, 0, 1 );
01643          emit_add( func, 0, 2 );
01644          STORE( func, *inst, 0, 0, chan_index );
01645       }
01646       break;
01647 
01648    case TGSI_OPCODE_SUB:
01649       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01650          FETCH( func, *inst, 0, 0, chan_index );
01651          FETCH( func, *inst, 1, 1, chan_index );
01652          emit_sub( func, 0, 1 );
01653          STORE( func, *inst, 0, 0, chan_index );
01654       }
01655       break;
01656 
01657    case TGSI_OPCODE_LERP:
01658    /* TGSI_OPCODE_LRP */
01659       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01660          FETCH( func, *inst, 0, 0, chan_index );
01661          FETCH( func, *inst, 1, 1, chan_index );
01662          FETCH( func, *inst, 2, 2, chan_index );
01663          emit_sub( func, 1, 2 );
01664          emit_mul( func, 0, 1 );
01665          emit_add( func, 0, 2 );
01666          STORE( func, *inst, 0, 0, chan_index );
01667       }
01668       break;
01669 
01670    case TGSI_OPCODE_CND:
01671       return 0;
01672       break;
01673 
01674    case TGSI_OPCODE_CND0:
01675       return 0;
01676       break;
01677 
01678    case TGSI_OPCODE_DOT2ADD:
01679    /* TGSI_OPCODE_DP2A */
01680       return 0;
01681       break;
01682 
01683    case TGSI_OPCODE_INDEX:
01684       return 0;
01685       break;
01686 
01687    case TGSI_OPCODE_NEGATE:
01688       return 0;
01689       break;
01690 
01691    case TGSI_OPCODE_FRAC:
01692    /* TGSI_OPCODE_FRC */
01693       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01694          FETCH( func, *inst, 0, 0, chan_index );
01695          emit_frc( func, 0 );
01696          STORE( func, *inst, 0, 0, chan_index );
01697       }
01698       break;
01699 
01700    case TGSI_OPCODE_CLAMP:
01701       return 0;
01702       break;
01703 
01704    case TGSI_OPCODE_FLOOR:
01705    /* TGSI_OPCODE_FLR */
01706       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01707          FETCH( func, *inst, 0, 0, chan_index );
01708          emit_flr( func, 0 );
01709          STORE( func, *inst, 0, 0, chan_index );
01710       }
01711       break;
01712 
01713    case TGSI_OPCODE_ROUND:
01714       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01715          FETCH( func, *inst, 0, 0, chan_index );
01716          emit_rnd( func, 0 );
01717          STORE( func, *inst, 0, 0, chan_index );
01718       }
01719       break;
01720 
01721    case TGSI_OPCODE_EXPBASE2:
01722    /* TGSI_OPCODE_EX2 */
01723       FETCH( func, *inst, 0, 0, CHAN_X );
01724       emit_ex2( func, 0 );
01725       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01726          STORE( func, *inst, 0, 0, chan_index );
01727       }
01728       break;
01729 
01730    case TGSI_OPCODE_LOGBASE2:
01731    /* TGSI_OPCODE_LG2 */
01732       FETCH( func, *inst, 0, 0, CHAN_X );
01733       emit_lg2( func, 0 );
01734       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01735          STORE( func, *inst, 0, 0, chan_index );
01736       }
01737       break;
01738 
01739    case TGSI_OPCODE_POWER:
01740    /* TGSI_OPCODE_POW */
01741       FETCH( func, *inst, 0, 0, CHAN_X );
01742       FETCH( func, *inst, 1, 1, CHAN_X );
01743       emit_pow( func, 0, 1 );
01744       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01745          STORE( func, *inst, 0, 0, chan_index );
01746       }
01747       break;
01748 
01749    case TGSI_OPCODE_CROSSPRODUCT:
01750    /* TGSI_OPCODE_XPD */
01751       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
01752           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
01753          FETCH( func, *inst, 1, 1, CHAN_Z );
01754          FETCH( func, *inst, 3, 0, CHAN_Z );
01755       }
01756       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
01757           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
01758          FETCH( func, *inst, 0, 0, CHAN_Y );
01759          FETCH( func, *inst, 4, 1, CHAN_Y );
01760       }
01761       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
01762          emit_MOV( func, 2, 0 );
01763          emit_mul( func, 2, 1 );
01764          emit_MOV( func, 5, 3 );
01765          emit_mul( func, 5, 4 );
01766          emit_sub( func, 2, 5 );
01767          STORE( func, *inst, 2, 0, CHAN_X );
01768       }
01769       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
01770           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
01771          FETCH( func, *inst, 2, 1, CHAN_X );
01772          FETCH( func, *inst, 5, 0, CHAN_X );
01773       }
01774       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
01775          emit_mul( func, 3, 2 );
01776          emit_mul( func, 1, 5 );
01777          emit_sub( func, 3, 1 );
01778          STORE( func, *inst, 3, 0, CHAN_Y );
01779       }
01780       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
01781          emit_mul( func, 5, 4 );
01782          emit_mul( func, 0, 2 );
01783          emit_sub( func, 5, 0 );
01784          STORE( func, *inst, 5, 0, CHAN_Z );
01785       }
01786       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
01787          emit_tempf(
01788             func,
01789             0,
01790             TEMP_ONE_I,
01791             TEMP_ONE_C );
01792          STORE( func, *inst, 0, 0, CHAN_W );
01793       }
01794       break;
01795 
01796    case TGSI_OPCODE_MULTIPLYMATRIX:
01797       return 0;
01798       break;
01799 
01800    case TGSI_OPCODE_ABS:
01801       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01802          FETCH( func, *inst, 0, 0, chan_index );
01803          emit_abs( func, 0) ;
01804 
01805          STORE( func, *inst, 0, 0, chan_index );
01806       }
01807       break;
01808 
01809    case TGSI_OPCODE_RCC:
01810       return 0;
01811       break;
01812 
01813    case TGSI_OPCODE_DPH:
01814       FETCH( func, *inst, 0, 0, CHAN_X );
01815       FETCH( func, *inst, 1, 1, CHAN_X );
01816       emit_mul( func, 0, 1 );
01817       FETCH( func, *inst, 1, 0, CHAN_Y );
01818       FETCH( func, *inst, 2, 1, CHAN_Y );
01819       emit_mul( func, 1, 2 );
01820       emit_add( func, 0, 1 );
01821       FETCH( func, *inst, 1, 0, CHAN_Z );
01822       FETCH( func, *inst, 2, 1, CHAN_Z );
01823       emit_mul( func, 1, 2 );
01824       emit_add( func, 0, 1 );
01825       FETCH( func, *inst, 1, 1, CHAN_W );
01826       emit_add( func, 0, 1 );
01827       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01828          STORE( func, *inst, 0, 0, chan_index );
01829       }
01830       break;
01831 
01832    case TGSI_OPCODE_COS:
01833       FETCH( func, *inst, 0, 0, CHAN_X );
01834       emit_cos( func, 0 );
01835       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01836          STORE( func, *inst, 0, 0, chan_index );
01837       }
01838       break;
01839 
01840    case TGSI_OPCODE_DDX:
01841       return 0;
01842       break;
01843 
01844    case TGSI_OPCODE_DDY:
01845       return 0;
01846       break;
01847 
01848    case TGSI_OPCODE_KILP:
01849       /* predicated kill */
01850       emit_kilp( func );
01851       return 0; /* XXX fix me */
01852       break;
01853 
01854    case TGSI_OPCODE_KIL:
01855       /* conditional kill */
01856       emit_kil( func, &inst->FullSrcRegisters[0] );
01857       break;
01858 
01859    case TGSI_OPCODE_PK2H:
01860       return 0;
01861       break;
01862 
01863    case TGSI_OPCODE_PK2US:
01864       return 0;
01865       break;
01866 
01867    case TGSI_OPCODE_PK4B:
01868       return 0;
01869       break;
01870 
01871    case TGSI_OPCODE_PK4UB:
01872       return 0;
01873       break;
01874 
01875    case TGSI_OPCODE_RFL:
01876       return 0;
01877       break;
01878 
01879    case TGSI_OPCODE_SEQ:
01880       emit_setcc( func, inst, cc_Equal );
01881       break;
01882 
01883    case TGSI_OPCODE_SFL:
01884       return 0;
01885       break;
01886 
01887    case TGSI_OPCODE_SGT:
01888       emit_setcc( func, inst, cc_NotLessThanEqual );
01889       break;
01890 
01891    case TGSI_OPCODE_SIN:
01892       FETCH( func, *inst, 0, 0, CHAN_X );
01893       emit_sin( func, 0 );
01894       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01895          STORE( func, *inst, 0, 0, chan_index );
01896       }
01897       break;
01898 
01899    case TGSI_OPCODE_SLE:
01900       emit_setcc( func, inst, cc_LessThanEqual );
01901       break;
01902 
01903    case TGSI_OPCODE_SNE:
01904       emit_setcc( func, inst, cc_NotEqual );
01905       break;
01906 
01907    case TGSI_OPCODE_STR:
01908       return 0;
01909       break;
01910 
01911    case TGSI_OPCODE_TEX:
01912       if (0) {
01913          /* Disable dummy texture code: 
01914           */
01915          emit_tempf(
01916             func,
01917             0,
01918             TEMP_ONE_I,
01919             TEMP_ONE_C );
01920          FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01921             STORE( func, *inst, 0, 0, chan_index );
01922          }
01923       }
01924       else {
01925          return 0;
01926       }
01927       break;
01928 
01929    case TGSI_OPCODE_TXD:
01930       return 0;
01931       break;
01932 
01933    case TGSI_OPCODE_UP2H:
01934       return 0;
01935       break;
01936 
01937    case TGSI_OPCODE_UP2US:
01938       return 0;
01939       break;
01940 
01941    case TGSI_OPCODE_UP4B:
01942       return 0;
01943       break;
01944 
01945    case TGSI_OPCODE_UP4UB:
01946       return 0;
01947       break;
01948 
01949    case TGSI_OPCODE_X2D:
01950       return 0;
01951       break;
01952 
01953    case TGSI_OPCODE_ARA:
01954       return 0;
01955       break;
01956 
01957    case TGSI_OPCODE_ARR:
01958       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01959          FETCH( func, *inst, 0, 0, chan_index );
01960          emit_rnd( func, 0 );
01961          emit_f2it( func, 0 );
01962          STORE( func, *inst, 0, 0, chan_index );
01963       }
01964       break;
01965 
01966    case TGSI_OPCODE_BRA:
01967       return 0;
01968       break;
01969 
01970    case TGSI_OPCODE_CAL:
01971       return 0;
01972       break;
01973 
01974    case TGSI_OPCODE_RET:
01975       emit_ret( func );
01976       break;
01977 
01978    case TGSI_OPCODE_END:
01979       break;
01980 
01981    case TGSI_OPCODE_SSG:
01982    /* TGSI_OPCODE_SGN */
01983       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
01984          FETCH( func, *inst, 0, 0, chan_index );
01985          emit_sgn( func, 0 );
01986          STORE( func, *inst, 0, 0, chan_index );
01987       }
01988       break;
01989 
01990    case TGSI_OPCODE_CMP:
01991       emit_cmp (func, inst);
01992       break;
01993 
01994    case TGSI_OPCODE_SCS:
01995       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
01996          FETCH( func, *inst, 0, 0, CHAN_X );
01997          emit_cos( func, 0 );
01998          STORE( func, *inst, 0, 0, CHAN_X );
01999       }
02000       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
02001          FETCH( func, *inst, 0, 0, CHAN_X );
02002          emit_sin( func, 0 );
02003          STORE( func, *inst, 0, 0, CHAN_Y );
02004       }
02005       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
02006          emit_tempf(
02007             func,
02008             0,
02009             TGSI_EXEC_TEMP_00000000_I,
02010             TGSI_EXEC_TEMP_00000000_C );
02011          STORE( func, *inst, 0, 0, CHAN_Z );
02012       }
02013       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
02014          emit_tempf(
02015             func,
02016             0,
02017             TEMP_ONE_I,
02018             TEMP_ONE_C );
02019          STORE( func, *inst, 0, 0, CHAN_W );
02020       }
02021       break;
02022 
02023    case TGSI_OPCODE_TXB:
02024       return 0;
02025       break;
02026 
02027    case TGSI_OPCODE_NRM:
02028       /* fall-through */
02029    case TGSI_OPCODE_NRM4:
02030       /* 3 or 4-component normalization */
02031       {
02032          uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
02033 
02034          if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
02035              IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
02036              IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
02037              (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
02038 
02039             /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
02040 
02041             /* xmm4 = src.x */
02042             /* xmm0 = src.x * src.x */
02043             FETCH(func, *inst, 0, 0, CHAN_X);
02044             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
02045                emit_MOV(func, 4, 0);
02046             }
02047             emit_mul(func, 0, 0);
02048 
02049             /* xmm5 = src.y */
02050             /* xmm0 = xmm0 + src.y * src.y */
02051             FETCH(func, *inst, 1, 0, CHAN_Y);
02052             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
02053                emit_MOV(func, 5, 1);
02054             }
02055             emit_mul(func, 1, 1);
02056             emit_add(func, 0, 1);
02057 
02058             /* xmm6 = src.z */
02059             /* xmm0 = xmm0 + src.z * src.z */
02060             FETCH(func, *inst, 1, 0, CHAN_Z);
02061             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
02062                emit_MOV(func, 6, 1);
02063             }
02064             emit_mul(func, 1, 1);
02065             emit_add(func, 0, 1);
02066 
02067             if (dims == 4) {
02068                /* xmm7 = src.w */
02069                /* xmm0 = xmm0 + src.w * src.w */
02070                FETCH(func, *inst, 1, 0, CHAN_W);
02071                if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
02072                   emit_MOV(func, 7, 1);
02073                }
02074                emit_mul(func, 1, 1);
02075                emit_add(func, 0, 1);
02076             }
02077 
02078             /* xmm1 = 1 / sqrt(xmm0) */
02079             emit_rsqrt(func, 1, 0);
02080 
02081             /* dst.x = xmm1 * src.x */
02082             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
02083                emit_mul(func, 4, 1);
02084                STORE(func, *inst, 4, 0, CHAN_X);
02085             }
02086 
02087             /* dst.y = xmm1 * src.y */
02088             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
02089                emit_mul(func, 5, 1);
02090                STORE(func, *inst, 5, 0, CHAN_Y);
02091             }
02092 
02093             /* dst.z = xmm1 * src.z */
02094             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
02095                emit_mul(func, 6, 1);
02096                STORE(func, *inst, 6, 0, CHAN_Z);
02097             }
02098 
02099             /* dst.w = xmm1 * src.w */
02100             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && dims == 4) {
02101                emit_mul(func, 7, 1);
02102                STORE(func, *inst, 7, 0, CHAN_W);
02103             }
02104          }
02105 
02106          /* dst0.w = 1.0 */
02107          if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
02108             emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
02109             STORE(func, *inst, 0, 0, CHAN_W);
02110          }
02111       }
02112       break;
02113 
02114    case TGSI_OPCODE_DIV:
02115       return 0;
02116       break;
02117 
02118    case TGSI_OPCODE_DP2:
02119       return 0;
02120       break;
02121 
02122    case TGSI_OPCODE_TXL:
02123       return 0;
02124       break;
02125 
02126    case TGSI_OPCODE_BRK:
02127       return 0;
02128       break;
02129 
02130    case TGSI_OPCODE_IF:
02131       return 0;
02132       break;
02133 
02134    case TGSI_OPCODE_LOOP:
02135       return 0;
02136       break;
02137 
02138    case TGSI_OPCODE_REP:
02139       return 0;
02140       break;
02141 
02142    case TGSI_OPCODE_ELSE:
02143       return 0;
02144       break;
02145 
02146    case TGSI_OPCODE_ENDIF:
02147       return 0;
02148       break;
02149 
02150    case TGSI_OPCODE_ENDLOOP:
02151       return 0;
02152       break;
02153 
02154    case TGSI_OPCODE_ENDREP:
02155       return 0;
02156       break;
02157 
02158    case TGSI_OPCODE_PUSHA:
02159       return 0;
02160       break;
02161 
02162    case TGSI_OPCODE_POPA:
02163       return 0;
02164       break;
02165 
02166    case TGSI_OPCODE_CEIL:
02167       return 0;
02168       break;
02169 
02170    case TGSI_OPCODE_I2F:
02171       return 0;
02172       break;
02173 
02174    case TGSI_OPCODE_NOT:
02175       return 0;
02176       break;
02177 
02178    case TGSI_OPCODE_TRUNC:
02179       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
02180          FETCH( func, *inst, 0, 0, chan_index );
02181          emit_f2it( func, 0 );
02182          emit_i2f( func, 0 );
02183          STORE( func, *inst, 0, 0, chan_index );
02184       }
02185       break;
02186 
02187    case TGSI_OPCODE_SHL:
02188       return 0;
02189       break;
02190 
02191    case TGSI_OPCODE_SHR:
02192       return 0;
02193       break;
02194 
02195    case TGSI_OPCODE_AND:
02196       return 0;
02197       break;
02198 
02199    case TGSI_OPCODE_OR:
02200       return 0;
02201       break;
02202 
02203    case TGSI_OPCODE_MOD:
02204       return 0;
02205       break;
02206 
02207    case TGSI_OPCODE_XOR:
02208       return 0;
02209       break;
02210 
02211    case TGSI_OPCODE_SAD:
02212       return 0;
02213       break;
02214 
02215    case TGSI_OPCODE_TXF:
02216       return 0;
02217       break;
02218 
02219    case TGSI_OPCODE_TXQ:
02220       return 0;
02221       break;
02222 
02223    case TGSI_OPCODE_CONT:
02224       return 0;
02225       break;
02226 
02227    case TGSI_OPCODE_EMIT:
02228       return 0;
02229       break;
02230 
02231    case TGSI_OPCODE_ENDPRIM:
02232       return 0;
02233       break;
02234 
02235    default:
02236       return 0;
02237    }
02238    
02239    return 1;
02240 }
02241 
02242 static void
02243 emit_declaration(
02244    struct x86_function *func,
02245    struct tgsi_full_declaration *decl )
02246 {
02247    if( decl->Declaration.File == TGSI_FILE_INPUT ) {
02248       unsigned first, last, mask;
02249       unsigned i, j;
02250 
02251       first = decl->DeclarationRange.First;
02252       last = decl->DeclarationRange.Last;
02253       mask = decl->Declaration.UsageMask;
02254 
02255       for( i = first; i <= last; i++ ) {
02256          for( j = 0; j < NUM_CHANNELS; j++ ) {
02257             if( mask & (1 << j) ) {
02258                switch( decl->Declaration.Interpolate ) {
02259                case TGSI_INTERPOLATE_CONSTANT:
02260                   emit_coef_a0( func, 0, i, j );
02261                   emit_inputs( func, 0, i, j );
02262                   break;
02263 
02264                case TGSI_INTERPOLATE_LINEAR:
02265                   emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
02266                   emit_coef_dadx( func, 1, i, j );
02267                   emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
02268                   emit_coef_dady( func, 3, i, j );
02269                   emit_mul( func, 0, 1 );    /* x * dadx */
02270                   emit_coef_a0( func, 4, i, j );
02271                   emit_mul( func, 2, 3 );    /* y * dady */
02272                   emit_add( func, 0, 4 );    /* x * dadx + a0 */
02273                   emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
02274                   emit_inputs( func, 0, i, j );
02275                   break;
02276 
02277                case TGSI_INTERPOLATE_PERSPECTIVE:
02278                   emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
02279                   emit_coef_dadx( func, 1, i, j );
02280                   emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
02281                   emit_coef_dady( func, 3, i, j );
02282                   emit_mul( func, 0, 1 );    /* x * dadx */
02283                   emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
02284                   emit_coef_a0( func, 5, i, j );
02285                   emit_rcp( func, 4, 4 );    /* 1.0 / w */
02286                   emit_mul( func, 2, 3 );    /* y * dady */
02287                   emit_add( func, 0, 5 );    /* x * dadx + a0 */
02288                   emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
02289                   emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
02290                   emit_inputs( func, 0, i, j );
02291                   break;
02292 
02293                default:
02294                   assert( 0 );
02295                   break;
02296                }
02297             }
02298          }
02299       }
02300    }
02301 }
02302 
02303 static void aos_to_soa( struct x86_function *func, 
02304                         uint arg_aos,
02305                         uint arg_soa, 
02306                         uint arg_num, 
02307                         uint arg_stride )
02308 {
02309    struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
02310    struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
02311    struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
02312    struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
02313    int inner_loop;
02314 
02315 
02316    /* Save EBX */
02317    x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
02318 
02319    x86_mov( func, aos_input,  x86_fn_arg( func, arg_aos ) );
02320    x86_mov( func, soa_input,  x86_fn_arg( func, arg_soa ) );
02321    x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
02322    x86_mov( func, stride,     x86_fn_arg( func, arg_stride ) );
02323 
02324    /* do */
02325    inner_loop = x86_get_label( func );
02326    {
02327       x86_push( func, aos_input );
02328       sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
02329       sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
02330       x86_add( func, aos_input, stride );
02331       sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
02332       sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
02333       x86_add( func, aos_input, stride );
02334       sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
02335       sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
02336       x86_add( func, aos_input, stride );
02337       sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
02338       sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
02339       x86_pop( func, aos_input );
02340 
02341       sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
02342       sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
02343       sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
02344       sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
02345       sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
02346       sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
02347 
02348       sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
02349       sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
02350       sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
02351       sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
02352 
02353       /* Advance to next input */
02354       x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
02355       x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
02356    }
02357    /* while --num_inputs */
02358    x86_dec( func, num_inputs );
02359    x86_jcc( func, cc_NE, inner_loop );
02360 
02361    /* Restore EBX */
02362    x86_pop( func, aos_input );
02363 }
02364 
02365 static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
02366 {
02367    struct x86_reg soa_output;
02368    struct x86_reg aos_output;
02369    struct x86_reg num_outputs;
02370    struct x86_reg temp;
02371    int inner_loop;
02372 
02373    soa_output = x86_make_reg( file_REG32, reg_AX );
02374    aos_output = x86_make_reg( file_REG32, reg_BX );
02375    num_outputs = x86_make_reg( file_REG32, reg_CX );
02376    temp = x86_make_reg( file_REG32, reg_DX );
02377 
02378    /* Save EBX */
02379    x86_push( func, aos_output );
02380 
02381    x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
02382    x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
02383    x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
02384 
02385    /* do */
02386    inner_loop = x86_get_label( func );
02387    {
02388       sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
02389       sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
02390       sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
02391       sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
02392 
02393       sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
02394       sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
02395       sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
02396       sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
02397       sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
02398       sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
02399 
02400       x86_mov( func, temp, x86_fn_arg( func, stride ) );
02401       x86_push( func, aos_output );
02402       sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
02403       sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
02404       x86_add( func, aos_output, temp );
02405       sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
02406       sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
02407       x86_add( func, aos_output, temp );
02408       sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
02409       sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
02410       x86_add( func, aos_output, temp );
02411       sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
02412       sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
02413       x86_pop( func, aos_output );
02414 
02415       /* Advance to next output */
02416       x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
02417       x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
02418    }
02419    /* while --num_outputs */
02420    x86_dec( func, num_outputs );
02421    x86_jcc( func, cc_NE, inner_loop );
02422 
02423    /* Restore EBX */
02424    x86_pop( func, aos_output );
02425 }
02426 
02443 unsigned
02444 tgsi_emit_sse2(
02445    const struct tgsi_token *tokens,
02446    struct x86_function *func,
02447    float (*immediates)[4],
02448    boolean do_swizzles )
02449 {
02450    struct tgsi_parse_context parse;
02451    boolean instruction_phase = FALSE;
02452    unsigned ok = 1;
02453    uint num_immediates = 0;
02454 
02455    util_init_math();
02456 
02457    func->csr = func->store;
02458 
02459    tgsi_parse_init( &parse, tokens );
02460 
02461    /* Can't just use EDI, EBX without save/restoring them:
02462     */
02463    x86_push(
02464       func,
02465       get_immediate_base() );
02466 
02467    x86_push(
02468       func,
02469       get_temp_base() );
02470 
02471 
02472    /*
02473     * Different function args for vertex/fragment shaders:
02474     */
02475    if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
02476       /* DECLARATION phase, do not load output argument. */
02477       x86_mov(
02478          func,
02479          get_input_base(),
02480          x86_fn_arg( func, 1 ) );
02481       /* skipping outputs argument here */
02482       x86_mov(
02483          func,
02484          get_const_base(),
02485          x86_fn_arg( func, 3 ) );
02486       x86_mov(
02487          func,
02488          get_temp_base(),
02489          x86_fn_arg( func, 4 ) );
02490       x86_mov(
02491          func,
02492          get_coef_base(),
02493          x86_fn_arg( func, 5 ) );
02494       x86_mov(
02495          func,
02496          get_immediate_base(),
02497          x86_fn_arg( func, 6 ) );
02498    }
02499    else {
02500       assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
02501 
02502       if (do_swizzles)
02503          aos_to_soa( func, 
02504                      6,         /* aos_input */
02505                      1,         /* machine->input */
02506                      7,         /* num_inputs */
02507                      8 );       /* input_stride */
02508 
02509       x86_mov(
02510          func,
02511          get_input_base(),
02512          x86_fn_arg( func, 1 ) );
02513       x86_mov(
02514          func,
02515          get_output_base(),
02516          x86_fn_arg( func, 2 ) );
02517       x86_mov(
02518          func,
02519          get_const_base(),
02520          x86_fn_arg( func, 3 ) );
02521       x86_mov(
02522          func,
02523          get_temp_base(),
02524          x86_fn_arg( func, 4 ) );
02525       x86_mov(
02526          func,
02527          get_immediate_base(),
02528          x86_fn_arg( func, 5 ) );
02529    }
02530 
02531    while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
02532       tgsi_parse_token( &parse );
02533 
02534       switch( parse.FullToken.Token.Type ) {
02535       case TGSI_TOKEN_TYPE_DECLARATION:
02536          if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
02537             emit_declaration(
02538                func,
02539                &parse.FullToken.FullDeclaration );
02540          }
02541          break;
02542 
02543       case TGSI_TOKEN_TYPE_INSTRUCTION:
02544          if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
02545             if( !instruction_phase ) {
02546                /* INSTRUCTION phase, overwrite coeff with output. */
02547                instruction_phase = TRUE;
02548                x86_mov(
02549                   func,
02550                   get_output_base(),
02551                   x86_fn_arg( func, 2 ) );
02552             }
02553          }
02554 
02555          ok = emit_instruction(
02556             func,
02557             &parse.FullToken.FullInstruction );
02558 
02559          if (!ok) {
02560             debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n", 
02561                          parse.FullToken.FullInstruction.Instruction.Opcode,
02562                          parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
02563                          "vertex shader" : "fragment shader");
02564          }
02565          break;
02566 
02567       case TGSI_TOKEN_TYPE_IMMEDIATE:
02568          /* simply copy the immediate values into the next immediates[] slot */
02569          {
02570             const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
02571             uint i;
02572             assert(size <= 4);
02573             assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
02574             for( i = 0; i < size; i++ ) {
02575                immediates[num_immediates][i] =
02576                   parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
02577             }
02578 #if 0
02579             debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
02580                    num_immediates,
02581                    immediates[num_immediates][0],
02582                    immediates[num_immediates][1],
02583                    immediates[num_immediates][2],
02584                    immediates[num_immediates][3]);
02585 #endif
02586             num_immediates++;
02587          }
02588          break;
02589 
02590       default:
02591          ok = 0;
02592          assert( 0 );
02593       }
02594    }
02595 
02596    if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
02597       if (do_swizzles)
02598          soa_to_aos( func, 9, 2, 10, 11 );
02599    }
02600 
02601    /* Can't just use EBX, EDI without save/restoring them:
02602     */
02603    x86_pop(
02604       func,
02605       get_temp_base() );
02606 
02607    x86_pop(
02608       func,
02609       get_immediate_base() );
02610 
02611    emit_ret( func );
02612 
02613    tgsi_parse_free( &parse );
02614 
02615    return ok;
02616 }
02617 
02618 #endif /* PIPE_ARCH_X86 */
02619 

Generated on Tue Sep 29 06:25:15 2009 for Gallium3D by  doxygen 1.5.4