Gallium3D: gallium/drivers/i965simple/brw_wm

00001 
00002 #include "brw_context.h"
00003 #include "brw_eu.h"
00004 #include "brw_wm.h"
00005 #include "util/u_math.h"
00006 #include "util/u_memory.h"
00007 #include "pipe/p_shader_tokens.h"
00008 #include "tgsi/tgsi_parse.h"
00009 
00010 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
00011 {
00012    c->tmp_index++;
00013    c->reg_index = MAX2(c->reg_index, c->tmp_start + c->tmp_index);
00014    return brw_vec8_grf(c->tmp_start + c->tmp_index, 0);
00015 }
00016 
00017 static void release_tmps(struct brw_wm_compile *c)
00018 {
00019    c->tmp_index = 0;
00020 }
00021 
00022 
00023 
00024 static int is_null( struct brw_reg reg )
00025 {
00026    return (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
00027            reg.nr == BRW_ARF_NULL);
00028 }
00029 
00030 static void emit_pixel_xy( struct brw_wm_compile *c )
00031 {
00032    if (is_null(c->pixel_xy[0])) {
00033 
00034       struct brw_compile *p = &c->func;
00035       struct brw_reg r1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
00036 
00037       c->pixel_xy[0] = vec8(retype(alloc_tmp(c), BRW_REGISTER_TYPE_UW));
00038       c->pixel_xy[1] = vec8(retype(alloc_tmp(c), BRW_REGISTER_TYPE_UW));
00039 
00040       /* Calculate pixel centers by adding 1 or 0 to each of the
00041        * micro-tile coordinates passed in r1.
00042        */
00043       brw_ADD(p,
00044               c->pixel_xy[0],
00045               stride(suboffset(r1_uw, 4), 2, 4, 0),
00046               brw_imm_v(0x10101010));
00047 
00048       brw_ADD(p,
00049               c->pixel_xy[1],
00050               stride(suboffset(r1_uw, 5), 2, 4, 0),
00051               brw_imm_v(0x11001100));
00052    }
00053 }
00054 
00055 
00056 
00057 
00058 
00059 
00060 static void emit_delta_xy( struct brw_wm_compile *c )
00061 {
00062    if (is_null(c->delta_xy[0])) {
00063       struct brw_compile *p = &c->func;
00064       struct brw_reg r1 = brw_vec1_grf(1, 0);
00065 
00066       emit_pixel_xy(c);
00067 
00068       c->delta_xy[0] = alloc_tmp(c);
00069       c->delta_xy[1] = alloc_tmp(c);
00070 
00071       /* Calc delta X,Y by subtracting origin in r1 from the pixel
00072        * centers.
00073        */
00074       brw_ADD(p,
00075               c->delta_xy[0],
00076               retype(c->pixel_xy[0], BRW_REGISTER_TYPE_UW),
00077               negate(r1));
00078 
00079       brw_ADD(p,
00080               c->delta_xy[1],
00081               retype(c->pixel_xy[1], BRW_REGISTER_TYPE_UW),
00082               negate(suboffset(r1,1)));
00083    }
00084 }
00085 
00086 
00087 
#if 0
/* Disabled.  Lazily emits the code that computes c->pixel_w.
 *
 * NOTE(review): relies on c->coef_wpos and c->pixel_w, which are not
 * set up anywhere in the visible code - confirm they exist before
 * re-enabling.
 */
static void emit_pixel_w( struct brw_wm_compile *c )
{
   struct brw_compile *p = &c->func;
   struct brw_reg wpos_coef;
   struct brw_reg interp3;

   if (!is_null(c->pixel_w))
      return;

   wpos_coef = c->coef_wpos;

   c->pixel_w = alloc_tmp(c);

   emit_delta_xy(c);

   /* Calc 1/w - just linterp wpos[3] optimized by putting the
    * result straight into a message reg.
    */
   interp3 = brw_vec1_grf(wpos_coef.nr+1, 4);
   brw_LINE(p, brw_null_reg(), interp3, c->delta_xy[0]);
   brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), c->delta_xy[1]);

   /* Calc w */
   brw_math_16( p,
                c->pixel_w,
                BRW_MATH_FUNCTION_INV,
                BRW_MATH_SATURATE_NONE,
                2,
                brw_null_reg(),
                BRW_MATH_PRECISION_FULL);
}
#endif
00118 
00119 
00120 static void emit_cinterp(struct brw_wm_compile *c,
00121                          int idx,
00122                          int mask )
00123 {
00124    struct brw_compile *p = &c->func;
00125    struct brw_reg interp[4];
00126    struct brw_reg coef = c->payload_coef[idx];
00127    int i;
00128 
00129    interp[0] = brw_vec1_grf(coef.nr, 0);
00130    interp[1] = brw_vec1_grf(coef.nr, 4);
00131    interp[2] = brw_vec1_grf(coef.nr+1, 0);
00132    interp[3] = brw_vec1_grf(coef.nr+1, 4);
00133 
00134    for(i = 0; i < 4; i++ ) {
00135       if (mask & (1<<i)) {
00136          struct brw_reg dst = c->wm_regs[TGSI_FILE_INPUT][idx][i];
00137          brw_MOV(p, dst, suboffset(interp[i],3));
00138       }
00139    }
00140 }
00141 
00142 static void emit_linterp(struct brw_wm_compile *c,
00143                          int idx,
00144                          int mask )
00145 {
00146    struct brw_compile *p = &c->func;
00147    struct brw_reg interp[4];
00148    struct brw_reg coef = c->payload_coef[idx];
00149    int i;
00150 
00151    emit_delta_xy(c);
00152 
00153    interp[0] = brw_vec1_grf(coef.nr, 0);
00154    interp[1] = brw_vec1_grf(coef.nr, 4);
00155    interp[2] = brw_vec1_grf(coef.nr+1, 0);
00156    interp[3] = brw_vec1_grf(coef.nr+1, 4);
00157 
00158    for(i = 0; i < 4; i++ ) {
00159       if (mask & (1<<i)) {
00160          struct brw_reg dst = c->wm_regs[TGSI_FILE_INPUT][idx][i];
00161          brw_LINE(p, brw_null_reg(), interp[i], c->delta_xy[0]);
00162          brw_MAC(p, dst, suboffset(interp[i],1), c->delta_xy[1]);
00163       }
00164    }
00165 }
00166 
#if 0
/* Disabled.  Perspective interpolation for input 'idx': the same
 * LINE/MAC sequence as emit_linterp(), followed by a multiply of the
 * result with c->pixel_w.
 *
 * The helper names have been brought in line with the rest of this
 * file (emit_delta_xy(), emit_pixel_w(), and the static
 * c->wm_regs[TGSI_FILE_INPUT] assignment used by emit_linterp()).
 *
 * NOTE(review): emit_pixel_w() is itself still under #if 0; both must
 * be enabled together, and the caller in brw_wm_emit_decls() currently
 * falls back to emit_linterp().
 */
static void emit_pinterp(struct brw_wm_compile *c,
                         int idx,
                         int mask )
{
   struct brw_compile *p = &c->func;
   struct brw_reg interp[4];
   struct brw_reg coef = c->payload_coef[idx];
   int i;

   emit_delta_xy(c);
   emit_pixel_w(c);

   interp[0] = brw_vec1_grf(coef.nr, 0);
   interp[1] = brw_vec1_grf(coef.nr, 4);
   interp[2] = brw_vec1_grf(coef.nr+1, 0);
   interp[3] = brw_vec1_grf(coef.nr+1, 4);

   for(i = 0; i < 4; i++ ) {
      if (mask & (1<<i)) {
         struct brw_reg dst = c->wm_regs[TGSI_FILE_INPUT][idx][i];
         brw_LINE(p, brw_null_reg(), interp[i], c->delta_xy[0]);
         brw_MAC(p, dst, suboffset(interp[i],1), c->delta_xy[1]);
         brw_MUL(p, dst, dst, c->pixel_w);
      }
   }
}
#endif
00195 
00196 
00197 
#if 0
/* Disabled.  Sketch of window-position handling: wpos.xy through a
 * dedicated WM_WPOSXY op, wpos.zw through WM_LINTERP.
 *
 * NOTE(review): this cannot be enabled as written - it takes no
 * parameters yet uses 'c' and 'idx', and it calls helpers (dst_reg,
 * src_reg, emit_op, get_delta_xy, get_pixel_xy, ...) that are not
 * defined in the visible code.  'opcode' is assigned but never read.
 */
static void emit_wpos( )
{
   struct prog_dst_register dst = dst_reg(PROGRAM_INPUT, idx);
   struct tgsi_full_src_register interp = src_reg(PROGRAM_PAYLOAD, idx);
   struct tgsi_full_src_register deltas = get_delta_xy(c);
   unsigned opcode = WM_LINTERP;
   struct tgsi_full_src_register arg2 = src_undef();

   /* Have to treat wpos.xy specially:
    */
   emit_op(c,
           WM_WPOSXY,
           dst_mask(dst, WRITEMASK_XY),
           0, 0, 0,
           get_pixel_xy(c),
           src_undef(),
           src_undef());

   /* The remaining channels go through the ordinary linear path. */
   dst = dst_mask(dst, WRITEMASK_ZW);

   /* PROGRAM_INPUT.attr.xyzw = INTERP payload.interp[attr].x, deltas.xyw
    */
   emit_op(c,
           WM_LINTERP,
           dst,
           0, 0, 0,
           interp,
           deltas,
           arg2);
}
#endif
00233 
00234 
00235 
00236 
00237 /* Perform register allocation:
00238  * 
00239  *  -- r0???
00240  *  -- passthrough depth regs (and stencil/aa??)
00241  *  -- curbe ??
00242  *  -- inputs (coefficients)
00243  *
00244  * Use a totally static register allocation.  This will perform poorly
00245  * but is an easy way to get started (again).
00246  */
00247 static void prealloc_reg(struct brw_wm_compile *c)
00248 {
00249    int i, j;
00250    int nr_curbe_regs = 0;
00251 
00252    /* R0, then some depth related regs:
00253     */
00254    for (i = 0; i < c->key.nr_depth_regs; i++) {
00255       c->payload_depth[i] =  brw_vec8_grf(i*2, 0);
00256       c->reg_index += 2;
00257    }
00258 
00259 
00260    /* Then a copy of our part of the CURBE entry:
00261     */
00262    {
00263       int nr_constants = c->fp->info.file_max[TGSI_FILE_CONSTANT] + 1;
00264       int index = 0;
00265 
00266       /* XXX number of constants, or highest numbered constant? */
00267       assert(nr_constants == c->fp->info.file_count[TGSI_FILE_CONSTANT]);
00268 
00269       c->prog_data.max_const = 4*nr_constants;
00270       for (i = 0; i < nr_constants; i++) {
00271          for (j = 0; j < 4; j++, index++) 
00272             c->wm_regs[TGSI_FILE_CONSTANT][i][j] = brw_vec1_grf(c->reg_index + index/8,
00273                                                                 index%8);
00274       }
00275 
00276       nr_curbe_regs = 2*((4*nr_constants+15)/16);
00277       c->reg_index += nr_curbe_regs;
00278    }
00279 
00280    /* Adjust for parameter coefficients for position, which are
00281     * currently always provided.
00282     */
00283 //   c->position_coef[i] = brw_vec8_grf(c->reg_index, 0);
00284    c->reg_index += 2;
00285 
00286    /* Next we receive the plane coefficients for parameter
00287     * interpolation:
00288     */
00289    assert(c->fp->info.file_max[TGSI_FILE_INPUT] == c->fp->info.num_inputs);
00290    for (i = 0; i < c->fp->info.file_max[TGSI_FILE_INPUT] + 1; i++) {
00291       c->payload_coef[i] = brw_vec8_grf(c->reg_index, 0);
00292       c->reg_index += 2;
00293    }
00294 
00295    c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
00296    c->prog_data.urb_read_length = (c->fp->info.num_inputs + 1) * 2;
00297    c->prog_data.curb_read_length = nr_curbe_regs;
00298 
00299    /* That's the end of the payload, now we can start allocating registers.
00300     */
00301    c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
00302    c->reg_index++;
00303 
00304    c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
00305    c->reg_index += 2;
00306 
00307    /* Now allocate room for the interpolated inputs and staging
00308     * registers for the outputs:
00309     */
00310    /* XXX do we want to loop over the _number_ of inputs/outputs or loop
00311     * to the highest input/output index that's used?
00312     *  Probably the same, actually.
00313     */
00314    assert(c->fp->info.file_max[TGSI_FILE_INPUT] + 1 == c->fp->info.num_inputs);
00315    assert(c->fp->info.file_max[TGSI_FILE_OUTPUT] + 1 == c->fp->info.num_outputs);
00316    for (i = 0; i < c->fp->info.file_max[TGSI_FILE_INPUT] + 1; i++) 
00317       for (j = 0; j < 4; j++)
00318          c->wm_regs[TGSI_FILE_INPUT][i][j] = brw_vec8_grf( c->reg_index++, 0 );
00319 
00320    for (i = 0; i < c->fp->info.file_max[TGSI_FILE_OUTPUT] + 1; i++) 
00321       for (j = 0; j < 4; j++)
00322          c->wm_regs[TGSI_FILE_OUTPUT][i][j] = brw_vec8_grf( c->reg_index++, 0 );
00323 
00324    /* Beyond this we should only need registers for internal temporaries:
00325     */
00326    c->tmp_start = c->reg_index;
00327 }
00328 
00329 
00330 
00331 
00332 
00333 /* Need to interpolate fragment program inputs in as a preamble to the
00334  * shader.  A more sophisticated compiler would do this on demand, but
00335  * we'll do it up front:
00336  */
00337 void brw_wm_emit_decls(struct brw_wm_compile *c)
00338 {
00339    struct tgsi_parse_context parse;
00340    int done = 0;
00341 
00342    prealloc_reg(c);
00343 
00344    tgsi_parse_init( &parse, c->fp->program.tokens );
00345 
00346    while( !done &&
00347           !tgsi_parse_end_of_tokens( &parse ) ) 
00348    {
00349       tgsi_parse_token( &parse );
00350 
00351       switch( parse.FullToken.Token.Type ) {
00352       case TGSI_TOKEN_TYPE_DECLARATION:
00353       {
00354          const struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration;
00355          unsigned first = decl->DeclarationRange.First;
00356          unsigned last = decl->DeclarationRange.Last;
00357          unsigned mask = decl->Declaration.UsageMask; /* ? */
00358          unsigned i;
00359 
00360          if (decl->Declaration.File != TGSI_FILE_INPUT)
00361             break;
00362 
00363          for( i = first; i <= last; i++ ) {
00364             switch (decl->Declaration.Interpolate) {
00365             case TGSI_INTERPOLATE_CONSTANT:
00366                emit_cinterp(c, i, mask);
00367                break;
00368 
00369             case TGSI_INTERPOLATE_LINEAR:
00370                emit_linterp(c, i, mask);
00371                break;
00372 
00373             case TGSI_INTERPOLATE_PERSPECTIVE:
00374                //emit_pinterp(c, i, mask);
00375                emit_linterp(c, i, mask);
00376                break;
00377             }
00378          }
00379          break;
00380       }
00381       case TGSI_TOKEN_TYPE_IMMEDIATE:
00382       case TGSI_TOKEN_TYPE_INSTRUCTION:
00383       default:
00384          done = 1;
00385          break;
00386       }
00387    }
00388 
00389    tgsi_parse_free (&parse);
00390    
00391    release_tmps(c);
00392 }
brw_wm_decl.c