Gallium3D: gallium/drivers/cell/spu/spu

00001 /**************************************************************************
00002  * 
00003  * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
00004  * All Rights Reserved.
00005  *
00006  * Permission is hereby granted, free of charge, to any person obtaining a
00007  * copy of this software and associated documentation files (the
00008  * "Software"), to deal in the Software without restriction, including
00009  * without limitation the rights to use, copy, modify, merge, publish,
00010  * distribute, sub license, and/or sell copies of the Software, and to
00011  * permit persons to whom the Software is furnished to do so, subject to
00012  * the following conditions:
00013  * 
00014  * The above copyright notice and this permission notice (including the
00015  * next paragraph) shall be included in all copies or substantial portions
00016  * of the Software.
00017  * 
00018  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
00019  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
00020  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
00021  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
00022  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
00023  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
00024  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
00025  * 
00026  **************************************************************************/
00027 
00032 #include <transpose_matrix4x4.h>
00033 #include "pipe/p_compiler.h"
00034 #include "pipe/p_format.h"
00035 #include "util/u_math.h"
00036 #include "spu_colorpack.h"
00037 #include "spu_main.h"
00038 #include "spu_texture.h"
00039 #include "spu_tile.h"
00040 #include "spu_tri.h"
00041 
00042 
/* Per-quad coverage mask: calculate_mask() fills lanes 0..3 with ~0 or 0,
 * one lane per pixel of a 2x2 quad, in QUAD_* order.
 */
00044 typedef vector unsigned int mask_t;
00045 
/* Four floats that can be accessed either as one SPU vector (v) or as
 * individual components (f[0..3]).
 */
00046 typedef union
00047 {
00048    vector float v;     /* whole-vector view */
00049    float f[4];         /* per-component view */
00050 } float4;
00051 
00052 
/* A vertex as seen by the rasterizer: an array of 4-float attributes.
 * Declared with one element, but this file indexes data[slot] for every
 * attribute slot, so the real storage is expected to extend past data[0].
 * Slot 0 is read as the window position (x, y, z in components 0..2).
 */
00056 struct vertex_header {
00057    vector float data[1];
00058 };
00059 
00060 
00061 
/* Cheap ceiling approximation: add just under 1 and truncate through int.
 * Known inaccuracies: the int cast truncates toward zero, so negative
 * non-integers come out too high (e.g. -1.5 gives 0, not -1), and inputs
 * whose fractional part is below 0.00001 are rounded down instead of up.
 * The addition is done in double because 0.99999 has no 'f' suffix.
 */
00062 /* XXX fix this */
00063 #undef CEILF
00064 #define CEILF(X) ((float) (int) ((X) + 0.99999))
00065 
00066 
/* Index of each pixel within a 2x2 quad.  The same indices select the
 * lane of a mask_t and the element of the per-quad result arrays.
 */
00067 #define QUAD_TOP_LEFT     0
00068 #define QUAD_TOP_RIGHT    1
00069 #define QUAD_BOTTOM_LEFT  2
00070 #define QUAD_BOTTOM_RIGHT 3
/* Bit-per-pixel form of the indices above; only the #if 0 quad_clip()
 * code in this file uses these.
 */
00071 #define MASK_TOP_LEFT     (1 << QUAD_TOP_LEFT)
00072 #define MASK_TOP_RIGHT    (1 << QUAD_TOP_RIGHT)
00073 #define MASK_BOTTOM_LEFT  (1 << QUAD_BOTTOM_LEFT)
00074 #define MASK_BOTTOM_RIGHT (1 << QUAD_BOTTOM_RIGHT)
00075 #define MASK_ALL          0xf
00076 
00077 
/* Set to 1 to compile print_vertex() and dump every triangle's vertices
 * to stderr from setup_sort_vertices().
 */
00078 #define DEBUG_VERTS 0
00079 
/* One triangle edge, as prepared by setup_sort_vertices() (dx, dy) and
 * setup_tri_edges() (everything else) and consumed by subtriangle().
 */
00083 struct edge {
00084    float dx;            /* x of the edge's upper vertex minus x of its lower vertex */
00085    float dy;            /* same difference in y */
00086    float dxdy;          /* dx / dy: change in x per scanline */
00087    float sx, sy;        /* first scanline of the edge (sy) and the edge's x there (sx) */
00088    int lines;           /* number of scanlines the edge spans from sy */
00089 };
00090 
00091 
/* Plane equation for one 4-component attribute; eval_coeff() evaluates it
 * as a0 + x * dadx + y * dady.
 */
00092 struct interp_coef
00093 {
00094    float4 a0;      /* attribute value at x = 0, y = 0 */
00095    float4 dadx;    /* change in the attribute per unit step in x */
00096    float4 dady;    /* change in the attribute per unit step in y */
00097 };
00098 
00099 
/* All state for rasterizing the current triangle into the current tile.
 * A single file-scope instance ("setup") is used by every function here.
 */
00104 struct setup_stage {
00105 
00106    /* Vertices are just an array of floats making up each attribute in
00107     * turn.  Currently fixed at 4 floats, but should change in time.
00108     * Codegen will help cope with this.
00109     */
00110    const struct vertex_header *vmax;       /* vertex with the largest y */
00111    const struct vertex_header *vmid;       /* vertex with the middle y */
00112    const struct vertex_header *vmin;       /* vertex with the smallest y */
00113    const struct vertex_header *vprovoke;   /* source of constant (flat) attributes; always v2 */
00114 
00115    struct edge ebot;   /* vmin -> vmid */
00116    struct edge etop;   /* vmid -> vmax */
00117    struct edge emaj;   /* vmin -> vmax */
00118 
00119    float oneoverarea;  /* 1 / (emaj.dx * ebot.dy - ebot.dx * emaj.dy) */
00120 
00121    uint tx, ty;        /* tile coordinates passed to tri_draw() */
00122 
   /* clip bounds in window pixels; tri_draw() sets them to the tile bounds,
    * max values being one past the last pixel
    */
00123    int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy;
00124 
00125 #if 0
00126    struct tgsi_interp_coef coef[PIPE_MAX_SHADER_INPUTS];
00127 #else
00128    struct interp_coef coef[PIPE_MAX_SHADER_INPUTS];   /* one plane equation per attribute slot */
00129 #endif
00130 
00131 #if 0
00132    struct quad_header quad; 
00133 #endif
00134 
   /* The pair of scanlines (one row of 2x2 quads) currently being
    * accumulated by subtriangle() and emitted by flush_spans().
    */
00135    struct {
00136       int left[2];   /* first covered x; [0] = even scanline, [1] = odd scanline */
00137       int right[2];      /* one past the last covered x, same indexing */
00138       int y;             /* even scanline at the top of the quad row */
00139       unsigned y_flags;  /* bit 0 / bit 1 set when the even / odd scanline has a span */
00140       unsigned mask;     /* not referenced by the code visible in this file */
00141    } span;
00142 };
00143 
00144 
00145 
/* The one and only setup context; every function in this file reads and
 * writes it directly rather than taking it as a parameter.
 */
00146 static struct setup_stage setup;
00147 
00148 
00149 
00150 
/* Compiled out.  Downcast helper kept from the code this file was derived
 * from; struct draw_stage is not used anywhere else in this file.
 */
00151 #if 0
00152 
00155 static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
00156 {
00157    return (struct setup_stage *)stage;
00158 }
00159 #endif
00160 
/* Compiled out and stale: it applies '.' to the pointer parameter 'setup'
 * and reads .softpipe and .quad, neither of which struct setup_stage
 * declares in this file, so it cannot be re-enabled as written.
 * Intent: clear the quad's mask bits for pixels outside the cliprect.
 */
00161 #if 0
00162 
00165 static INLINE void
00166 quad_clip(struct setup_stage *setup)
00167 {
00168    const struct pipe_scissor_state *cliprect = &setup.softpipe->cliprect;
00169    const int minx = (int) cliprect->minx;
00170    const int maxx = (int) cliprect->maxx;
00171    const int miny = (int) cliprect->miny;
00172    const int maxy = (int) cliprect->maxy;
00173 
00174    if (setup.quad.x0 >= maxx ||
00175        setup.quad.y0 >= maxy ||
00176        setup.quad.x0 + 1 < minx ||
00177        setup.quad.y0 + 1 < miny) {
00178       /* totally clipped */
00179       setup.quad.mask = 0x0;
00180       return;
00181    }
   /* quad straddles an edge: keep only the column / row that is inside */
00182    if (setup.quad.x0 < minx)
00183       setup.quad.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
00184    if (setup.quad.y0 < miny)
00185       setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
00186    if (setup.quad.x0 == maxx - 1)
00187       setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
00188    if (setup.quad.y0 == maxy - 1)
00189       setup.quad.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
00190 }
00191 #endif
00192 
/* Compiled out and stale for the same reason as quad_clip(): it uses
 * setup.quad and setup.softpipe, which do not exist in this file.
 * Intent: clip the quad, then run it through the quad pipeline if any
 * pixel survives.
 */
00193 #if 0
00194 
00197 static INLINE void
00198 clip_emit_quad(struct setup_stage *setup)
00199 {
00200    quad_clip(setup);
00201    if (setup.quad.mask) {
00202       struct softpipe_context *sp = setup.softpipe;
00203       sp->quad.first->run(sp->quad.first, &setup.quad);
00204    }
00205 }
00206 #endif
00207 
00213 static INLINE void
00214 eval_coeff(uint slot, float x, float y, vector float result[4])
00215 {
00216    switch (spu.vertex_info.interp_mode[slot]) {
00217    case INTERP_CONSTANT:
00218       result[QUAD_TOP_LEFT] =
00219       result[QUAD_TOP_RIGHT] =
00220       result[QUAD_BOTTOM_LEFT] =
00221       result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0.v;
00222       break;
00223 
00224    case INTERP_LINEAR:
00225       /* fall-through, for now */
00226    default:
00227       {
00228          register vector float dadx = setup.coef[slot].dadx.v;
00229          register vector float dady = setup.coef[slot].dady.v;
00230          register vector float topLeft
00231             = spu_add(setup.coef[slot].a0.v,
00232                       spu_add(spu_mul(spu_splats(x), dadx),
00233                               spu_mul(spu_splats(y), dady)));
00234 
00235          result[QUAD_TOP_LEFT] = topLeft;
00236          result[QUAD_TOP_RIGHT] = spu_add(topLeft, dadx);
00237          result[QUAD_BOTTOM_LEFT] = spu_add(topLeft, dady);
00238          result[QUAD_BOTTOM_RIGHT] = spu_add(spu_add(topLeft, dadx), dady);
00239       }
00240    }
00241 }
00242 
00243 
00244 static INLINE vector float
00245 eval_z(float x, float y)
00246 {
00247    const uint slot = 0;
00248    const float dzdx = setup.coef[slot].dadx.f[2];
00249    const float dzdy = setup.coef[slot].dady.f[2];
00250    const float topLeft = setup.coef[slot].a0.f[2] + x * dzdx + y * dzdy;
00251    const vector float topLeftv = spu_splats(topLeft);
00252    const vector float derivs = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy };
00253    return spu_add(topLeftv, derivs);
00254 }
00255 
00256 
00263 static INLINE void
00264 emit_quad( int x, int y, mask_t mask )
00265 {
00266    /* If any bits in mask are set... */
00267    if (spu_extract(spu_orx(mask), 0)) {
00268       const int ix = x - setup.cliprect_minx;
00269       const int iy = y - setup.cliprect_miny;
00270       vector float colors[4];
00271 
00272       spu.cur_ctile_status = TILE_STATUS_DIRTY;
00273       spu.cur_ztile_status = TILE_STATUS_DIRTY;
00274 
00275       if (spu.texture[0].start) {
00276          /* texture mapping */
00277          const uint unit = 0;
00278          vector float texcoords[4];
00279          eval_coeff(2, (float) x, (float) y, texcoords);
00280 
00281          if (spu_extract(mask, 0))
00282             colors[0] = spu.sample_texture[unit](unit, texcoords[0]);
00283          if (spu_extract(mask, 1))
00284             colors[1] = spu.sample_texture[unit](unit, texcoords[1]);
00285          if (spu_extract(mask, 2))
00286             colors[2] = spu.sample_texture[unit](unit, texcoords[2]);
00287          if (spu_extract(mask, 3))
00288             colors[3] = spu.sample_texture[unit](unit, texcoords[3]);
00289 
00290 
00291          if (spu.texture[1].start) {
00292             /* multi-texture mapping */
00293             const uint unit = 1;
00294             vector float colors1[4];
00295 
00296             eval_coeff(2, (float) x, (float) y, texcoords);
00297 
00298             if (spu_extract(mask, 0))
00299                colors1[0] = spu.sample_texture[unit](unit, texcoords[0]);
00300             if (spu_extract(mask, 1))
00301                colors1[1] = spu.sample_texture[unit](unit, texcoords[1]);
00302             if (spu_extract(mask, 2))
00303                colors1[2] = spu.sample_texture[unit](unit, texcoords[2]);
00304             if (spu_extract(mask, 3))
00305                colors1[3] = spu.sample_texture[unit](unit, texcoords[3]);
00306 
00307             /* hack: modulate first texture by second */
00308             colors[0] = spu_mul(colors[0], colors1[0]);
00309             colors[1] = spu_mul(colors[1], colors1[1]);
00310             colors[2] = spu_mul(colors[2], colors1[2]);
00311             colors[3] = spu_mul(colors[3], colors1[3]);
00312          }
00313 
00314       }
00315       else {
00316          /* simple shading */
00317 #if 0
00318          eval_coeff(1, (float) x, (float) y, colors);
00319 
00320 #else
00321          /* XXX new fragment program code */
00322 
00323          if (spu.fragment_program) {
00324             vector float inputs[4*4], outputs[2*4];
00325 
00326             /* setup inputs */
00327             eval_coeff(1, (float) x, (float) y, inputs);
00328 
00329             /* Execute the current fragment program */
00330             spu.fragment_program(inputs, outputs, spu.constants);
00331 
00332             /* Copy outputs */
00333             colors[0] = outputs[0*4+0];
00334             colors[1] = outputs[0*4+1];
00335             colors[2] = outputs[0*4+2];
00336             colors[3] = outputs[0*4+3];
00337 
00338             if (0 && spu.init.id==0 && y == 48) {
00339                printf("colors[0] = %f %f %f %f\n",
00340                       spu_extract(colors[0], 0),
00341                       spu_extract(colors[0], 1),
00342                       spu_extract(colors[0], 2),
00343                       spu_extract(colors[0], 3));
00344                printf("colors[1] = %f %f %f %f\n",
00345                       spu_extract(colors[1], 0),
00346                       spu_extract(colors[1], 1),
00347                       spu_extract(colors[1], 2),
00348                       spu_extract(colors[1], 3));
00349             }
00350 
00351          }
00352 #endif
00353       }
00354 
00355 
00356       {
00357          /* Convert fragment data from AoS to SoA format.
00358           * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
00359           * This is temporary!
00360           */
00361          vector float soa_frag[4];
00362          _transpose_matrix4x4(soa_frag, colors);
00363 
00364          float4 fragZ;
00365 
00366          fragZ.v = eval_z((float) x, (float) y);
00367 
00368          /* Do all per-fragment/quad operations here, including:
00369           *  alpha test, z test, stencil test, blend and framebuffer writing.
00370           */
00371          spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
00372                           fragZ.v,
00373                           soa_frag[0], soa_frag[1],
00374                           soa_frag[2], soa_frag[3],
00375                           mask);
00376       }
00377 
00378    }
00379 }
00380 
00381 
/**
 * Round \p x down to the nearest even value, i.e. the coordinate of the
 * 2x2 quad that contains it.
 */
static INLINE int block( int x )
{
   /* dropping the low bit is the same as masking it off */
   return x - (x & 1);
}
00390 
00391 
00397 static INLINE mask_t calculate_mask( int x )
00398 {
00399    /* This is a little tricky.
00400     * Use & instead of && to avoid branches.
00401     * Use negation to convert true/false to ~0/0 values.
00402     */
00403    mask_t mask;
00404    mask = spu_insert(-((x   >= setup.span.left[0]) & (x   < setup.span.right[0])), mask, 0);
00405    mask = spu_insert(-((x+1 >= setup.span.left[0]) & (x+1 < setup.span.right[0])), mask, 1);
00406    mask = spu_insert(-((x   >= setup.span.left[1]) & (x   < setup.span.right[1])), mask, 2);
00407    mask = spu_insert(-((x+1 >= setup.span.left[1]) & (x+1 < setup.span.right[1])), mask, 3);
00408    return mask;
00409 }
00410 
00411 
00415 static void flush_spans( void )
00416 {
00417    int minleft, maxright;
00418    int x;
00419 
00420    switch (setup.span.y_flags) {
00421    case 0x3:
00422       /* both odd and even lines written (both quad rows) */
00423       minleft = MIN2(setup.span.left[0], setup.span.left[1]);
00424       maxright = MAX2(setup.span.right[0], setup.span.right[1]);
00425       break;
00426 
00427    case 0x1:
00428       /* only even line written (quad top row) */
00429       minleft = setup.span.left[0];
00430       maxright = setup.span.right[0];
00431       break;
00432 
00433    case 0x2:
00434       /* only odd line written (quad bottom row) */
00435       minleft = setup.span.left[1];
00436       maxright = setup.span.right[1];
00437       break;
00438 
00439    default:
00440       return;
00441    }
00442 
00443 
00444    /* OK, we're very likely to need the tile data now.
00445     * clear or finish waiting if needed.
00446     */
00447    if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
00448       /* wait for mfc_get() to complete */
00449       //printf("SPU: %u: waiting for ctile\n", spu.init.id);
00450       wait_on_mask(1 << TAG_READ_TILE_COLOR);
00451       spu.cur_ctile_status = TILE_STATUS_CLEAN;
00452    }
00453    else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
00454       //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
00455       clear_c_tile(&spu.ctile);
00456       spu.cur_ctile_status = TILE_STATUS_DIRTY;
00457    }
00458    ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
00459 
00460    if (spu.read_depth) {
00461       if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
00462          /* wait for mfc_get() to complete */
00463          //printf("SPU: %u: waiting for ztile\n", spu.init.id);
00464          wait_on_mask(1 << TAG_READ_TILE_Z);
00465          spu.cur_ztile_status = TILE_STATUS_CLEAN;
00466       }
00467       else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) {
00468          //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
00469          clear_z_tile(&spu.ztile);
00470          spu.cur_ztile_status = TILE_STATUS_DIRTY;
00471       }
00472       ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
00473    }
00474 
00475    /* XXX this loop could be moved into the above switch cases and
00476     * calculate_mask() could be simplified a bit...
00477     */
00478    for (x = block(minleft); x <= block(maxright); x += 2) {
00479 #if 1
00480       emit_quad( x, setup.span.y, calculate_mask( x ) );
00481 #endif
00482    }
00483 
00484    setup.span.y = 0;
00485    setup.span.y_flags = 0;
00486    setup.span.right[0] = 0;
00487    setup.span.right[1] = 0;
00488 }
00489 
#if DEBUG_VERTS
/**
 * Debug helper: dump every attribute of a vertex to stderr.
 * Only compiled when DEBUG_VERTS is non-zero.
 */
static void print_vertex(const struct vertex_header *v)
{
   uint i;

   fprintf(stderr, "Vertex: (%p)\n", (const void *) v);
   /* the attribute count comes from the current vertex layout */
   for (i = 0; i < spu.vertex_info.num_attribs; i++) {
      fprintf(stderr, "  %u: %f %f %f %f\n", i,
              spu_extract(v->data[i], 0),
              spu_extract(v->data[i], 1),
              spu_extract(v->data[i], 2),
              spu_extract(v->data[i], 3));
   }
}
#endif
00501 
00502 
00503 static boolean setup_sort_vertices(const struct vertex_header *v0,
00504                                    const struct vertex_header *v1,
00505                                    const struct vertex_header *v2)
00506 {
00507 
00508 #if DEBUG_VERTS
00509    fprintf(stderr, "Triangle:\n");
00510    print_vertex(v0);
00511    print_vertex(v1);
00512    print_vertex(v2);
00513 #endif
00514 
00515    setup.vprovoke = v2;
00516 
00517    /* determine bottom to top order of vertices */
00518    {
00519       float y0 = spu_extract(v0->data[0], 1);
00520       float y1 = spu_extract(v1->data[0], 1);
00521       float y2 = spu_extract(v2->data[0], 1);
00522       if (y0 <= y1) {
00523          if (y1 <= y2) {
00524             /* y0<=y1<=y2 */
00525             setup.vmin = v0;   
00526             setup.vmid = v1;   
00527             setup.vmax = v2;
00528          }
00529          else if (y2 <= y0) {
00530             /* y2<=y0<=y1 */
00531             setup.vmin = v2;   
00532             setup.vmid = v0;   
00533             setup.vmax = v1;   
00534          }
00535          else {
00536             /* y0<=y2<=y1 */
00537             setup.vmin = v0;   
00538             setup.vmid = v2;   
00539             setup.vmax = v1;  
00540          }
00541       }
00542       else {
00543          if (y0 <= y2) {
00544             /* y1<=y0<=y2 */
00545             setup.vmin = v1;   
00546             setup.vmid = v0;   
00547             setup.vmax = v2;  
00548          }
00549          else if (y2 <= y1) {
00550             /* y2<=y1<=y0 */
00551             setup.vmin = v2;   
00552             setup.vmid = v1;   
00553             setup.vmax = v0;  
00554          }
00555          else {
00556             /* y1<=y2<=y0 */
00557             setup.vmin = v1;   
00558             setup.vmid = v2;   
00559             setup.vmax = v0;
00560          }
00561       }
00562    }
00563 
00564    /* Check if triangle is completely outside the tile bounds */
00565    if (spu_extract(setup.vmin->data[0], 1) > setup.cliprect_maxy)
00566       return FALSE;
00567    if (spu_extract(setup.vmax->data[0], 1) < setup.cliprect_miny)
00568       return FALSE;
00569    if (spu_extract(setup.vmin->data[0], 0) < setup.cliprect_minx &&
00570        spu_extract(setup.vmid->data[0], 0) < setup.cliprect_minx &&
00571        spu_extract(setup.vmax->data[0], 0) < setup.cliprect_minx)
00572       return FALSE;
00573    if (spu_extract(setup.vmin->data[0], 0) > setup.cliprect_maxx &&
00574        spu_extract(setup.vmid->data[0], 0) > setup.cliprect_maxx &&
00575        spu_extract(setup.vmax->data[0], 0) > setup.cliprect_maxx)
00576       return FALSE;
00577 
00578    setup.ebot.dx = spu_extract(setup.vmid->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
00579    setup.ebot.dy = spu_extract(setup.vmid->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
00580    setup.emaj.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
00581    setup.emaj.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
00582    setup.etop.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmid->data[0], 0);
00583    setup.etop.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmid->data[0], 1);
00584 
00585    /*
00586     * Compute triangle's area.  Use 1/area to compute partial
00587     * derivatives of attributes later.
00588     *
00589     * The area will be the same as prim->det, but the sign may be
00590     * different depending on how the vertices get sorted above.
00591     *
00592     * To determine whether the primitive is front or back facing we
00593     * use the prim->det value because its sign is correct.
00594     */
00595    {
00596       const float area = (setup.emaj.dx * setup.ebot.dy - 
00597                             setup.ebot.dx * setup.emaj.dy);
00598 
00599       setup.oneoverarea = 1.0f / area;
00600       /*
00601       _mesa_printf("%s one-over-area %f  area %f  det %f\n",
00602                    __FUNCTION__, setup.oneoverarea, area, prim->det );
00603       */
00604    }
00605 
00606 #if 0
00607    /* We need to know if this is a front or back-facing triangle for:
00608     *  - the GLSL gl_FrontFacing fragment attribute (bool)
00609     *  - two-sided stencil test
00610     */
00611    setup.quad.facing = (prim->det > 0.0) ^ (setup.softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
00612 #endif
00613 
00614    return TRUE;
00615 }
00616 
00617 
00624 static INLINE void
00625 const_coeff(uint slot)
00626 {
00627    setup.coef[slot].dadx.v = (vector float) {0.0, 0.0, 0.0, 0.0};
00628    setup.coef[slot].dady.v = (vector float) {0.0, 0.0, 0.0, 0.0};
00629    setup.coef[slot].a0.v = setup.vprovoke->data[slot];
00630 }
00631 
00632 
00637 static INLINE void
00638 tri_linear_coeff(uint slot, uint firstComp, uint lastComp)
00639 {
00640    uint i;
00641    const float *vmin_d = (float *) &setup.vmin->data[slot];
00642    const float *vmid_d = (float *) &setup.vmid->data[slot];
00643    const float *vmax_d = (float *) &setup.vmax->data[slot];
00644    const float x = spu_extract(setup.vmin->data[0], 0) - 0.5f;
00645    const float y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
00646 
00647    for (i = firstComp; i < lastComp; i++) {
00648       float botda = vmid_d[i] - vmin_d[i];
00649       float majda = vmax_d[i] - vmin_d[i];
00650       float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
00651       float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
00652    
00653       ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
00654 
00655       setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
00656       setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
00657 
00658       /* calculate a0 as the value which would be sampled for the
00659        * fragment at (0,0), taking into account that we want to sample at
00660        * pixel centers, in other words (0.5, 0.5).
00661        *
00662        * this is neat but unfortunately not a good way to do things for
00663        * triangles with very large values of dadx or dady as it will
00664        * result in the subtraction and re-addition from a0 of a very
00665        * large number, which means we'll end up loosing a lot of the
00666        * fractional bits and precision from a0.  the way to fix this is
00667        * to define a0 as the sample at a pixel center somewhere near vmin
00668        * instead - i'll switch to this later.
00669        */
00670       setup.coef[slot].a0.f[i] = (vmin_d[i] - 
00671                                  (setup.coef[slot].dadx.f[i] * x + 
00672                                   setup.coef[slot].dady.f[i] * y));
00673    }
00674 
00675    /*
00676    _mesa_printf("attr[%d].%c: %f dx:%f dy:%f\n",
00677                 slot, "xyzw"[i], 
00678                 setup.coef[slot].a0[i],
00679                 setup.coef[slot].dadx.f[i],
00680                 setup.coef[slot].dady.f[i]);
00681    */
00682 }
00683 
00684 
00688 static INLINE void
00689 tri_linear_coeff4(uint slot)
00690 {
00691    const vector float vmin_d = setup.vmin->data[slot];
00692    const vector float vmid_d = setup.vmid->data[slot];
00693    const vector float vmax_d = setup.vmax->data[slot];
00694    const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
00695    const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
00696 
00697    vector float botda = vmid_d - vmin_d;
00698    vector float majda = vmax_d - vmin_d;
00699 
00700    vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
00701                             spu_mul(botda, spu_splats(setup.emaj.dy)));
00702    vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
00703                             spu_mul(majda, spu_splats(setup.ebot.dx)));
00704 
00705    setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneoverarea));
00706    setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneoverarea));
00707 
00708    vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
00709    vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
00710                          
00711    setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy));
00712 }
00713 
00714 
00715 
/* Compiled out.  Perspective-corrected variant of the coefficient setup:
 * each vertex value is multiplied by component 3 of that vertex's slot-0
 * attribute before the same a/b/a0 computation as tri_linear_coeff().
 * It subscripts the SPU vectors directly (data[slot][i]) and uses
 * lower-case assert(), unlike the live code in this file.
 */
00716 #if 0
00717 
00725 static void tri_persp_coeff( unsigned slot,
00726                              unsigned i )
00727 {
00728    /* premultiply by 1/w:
00729     */
00730    float mina = setup.vmin->data[slot][i] * setup.vmin->data[0][3];
00731    float mida = setup.vmid->data[slot][i] * setup.vmid->data[0][3];
00732    float maxa = setup.vmax->data[slot][i] * setup.vmax->data[0][3];
00733 
00734    float botda = mida - mina;
00735    float majda = maxa - mina;
00736    float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
00737    float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
00738       
00739    /*
00740    printf("tri persp %d,%d: %f %f %f\n", slot, i,
00741           setup.vmin->data[slot][i],
00742           setup.vmid->data[slot][i],
00743           setup.vmax->data[slot][i]
00744           );
00745    */
00746 
00747    assert(slot < PIPE_MAX_SHADER_INPUTS);
00748    assert(i <= 3);
00749 
00750    setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
00751    setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
   /* a0: value at vmin extrapolated back to the origin, with the same
    * half-pixel offset as the linear case
    */
00752    setup.coef[slot].a0.f[i] = (mina - 
00753                             (setup.coef[slot].dadx.f[i] * (setup.vmin->data[0][0] - 0.5f) + 
00754                              setup.coef[slot].dady.f[i] * (setup.vmin->data[0][1] - 0.5f)));
00755 }
00756 #endif
00757 
00758 
00763 static void setup_tri_coefficients(void)
00764 {
00765 #if 1
00766    uint i;
00767 
00768    for (i = 0; i < spu.vertex_info.num_attribs; i++) {
00769       switch (spu.vertex_info.interp_mode[i]) {
00770       case INTERP_NONE:
00771          break;
00772       case INTERP_POS:
00773          /*tri_linear_coeff(i, 2, 3);*/
00774          /* XXX interp W if PERSPECTIVE... */
00775          tri_linear_coeff4(i);
00776          break;
00777       case INTERP_CONSTANT:
00778          const_coeff(i);
00779          break;
00780       case INTERP_LINEAR:
00781          tri_linear_coeff4(i);
00782          break;
00783       case INTERP_PERSPECTIVE:
00784          tri_linear_coeff4(i);  /* temporary */
00785          break;
00786       default:
00787          ASSERT(0);
00788       }
00789    }
00790 #else
00791    ASSERT(spu.vertex_info.interp_mode[0] == INTERP_POS);
00792    ASSERT(spu.vertex_info.interp_mode[1] == INTERP_LINEAR ||
00793           spu.vertex_info.interp_mode[1] == INTERP_CONSTANT);
00794    tri_linear_coeff(0, 2, 3);  /* slot 0, z */
00795    tri_linear_coeff(1, 0, 4);  /* slot 1, color */
00796 #endif
00797 }
00798 
00799 
00800 static void setup_tri_edges(void)
00801 {
00802    float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f;
00803    float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f;
00804 
00805    float vmin_y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
00806    float vmid_y = spu_extract(setup.vmid->data[0], 1) - 0.5f;
00807    float vmax_y = spu_extract(setup.vmax->data[0], 1) - 0.5f;
00808 
00809    setup.emaj.sy = CEILF(vmin_y);
00810    setup.emaj.lines = (int) CEILF(vmax_y - setup.emaj.sy);
00811    setup.emaj.dxdy = setup.emaj.dx / setup.emaj.dy;
00812    setup.emaj.sx = vmin_x + (setup.emaj.sy - vmin_y) * setup.emaj.dxdy;
00813 
00814    setup.etop.sy = CEILF(vmid_y);
00815    setup.etop.lines = (int) CEILF(vmax_y - setup.etop.sy);
00816    setup.etop.dxdy = setup.etop.dx / setup.etop.dy;
00817    setup.etop.sx = vmid_x + (setup.etop.sy - vmid_y) * setup.etop.dxdy;
00818 
00819    setup.ebot.sy = CEILF(vmin_y);
00820    setup.ebot.lines = (int) CEILF(vmid_y - setup.ebot.sy);
00821    setup.ebot.dxdy = setup.ebot.dx / setup.ebot.dy;
00822    setup.ebot.sx = vmin_x + (setup.ebot.sy - vmin_y) * setup.ebot.dxdy;
00823 }
00824 
00825 
00830 static void subtriangle( struct edge *eleft,
00831                          struct edge *eright,
00832                          unsigned lines )
00833 {
00834    const int minx = setup.cliprect_minx;
00835    const int maxx = setup.cliprect_maxx;
00836    const int miny = setup.cliprect_miny;
00837    const int maxy = setup.cliprect_maxy;
00838    int y, start_y, finish_y;
00839    int sy = (int)eleft->sy;
00840 
00841    ASSERT((int)eleft->sy == (int) eright->sy);
00842 
00843    /* clip top/bottom */
00844    start_y = sy;
00845    finish_y = sy + lines;
00846 
00847    if (start_y < miny)
00848       start_y = miny;
00849 
00850    if (finish_y > maxy)
00851       finish_y = maxy;
00852 
00853    start_y -= sy;
00854    finish_y -= sy;
00855 
00856    /*
00857    _mesa_printf("%s %d %d\n", __FUNCTION__, start_y, finish_y);  
00858    */
00859 
00860    for (y = start_y; y < finish_y; y++) {
00861 
00862       /* avoid accumulating adds as floats don't have the precision to
00863        * accurately iterate large triangle edges that way.  luckily we
00864        * can just multiply these days.
00865        *
00866        * this is all drowned out by the attribute interpolation anyway.
00867        */
00868       int left = (int)(eleft->sx + y * eleft->dxdy);
00869       int right = (int)(eright->sx + y * eright->dxdy);
00870 
00871       /* clip left/right */
00872       if (left < minx)
00873          left = minx;
00874       if (right > maxx)
00875          right = maxx;
00876 
00877       if (left < right) {
00878          int _y = sy + y;
00879          if (block(_y) != setup.span.y) {
00880             flush_spans();
00881             setup.span.y = block(_y);
00882          }
00883 
00884          setup.span.left[_y&1] = left;
00885          setup.span.right[_y&1] = right;
00886          setup.span.y_flags |= 1<<(_y&1);
00887       }
00888    }
00889 
00890 
00891    /* save the values so that emaj can be restarted:
00892     */
00893    eleft->sx += lines * eleft->dxdy;
00894    eright->sx += lines * eright->dxdy;
00895    eleft->sy += lines;
00896    eright->sy += lines;
00897 }
00898 
00899 
00904 boolean
00905 tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
00906 {
00907    setup.tx = tx;
00908    setup.ty = ty;
00909 
00910    /* set clipping bounds to tile bounds */
00911    setup.cliprect_minx = tx * TILE_SIZE;
00912    setup.cliprect_miny = ty * TILE_SIZE;
00913    setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
00914    setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
00915 
00916    if (!setup_sort_vertices((struct vertex_header *) v0,
00917                             (struct vertex_header *) v1,
00918                             (struct vertex_header *) v2)) {
00919       return FALSE; /* totally clipped */
00920    }
00921 
00922    setup_tri_coefficients();
00923    setup_tri_edges();
00924 
00925    setup.span.y = 0;
00926    setup.span.y_flags = 0;
00927    setup.span.right[0] = 0;
00928    setup.span.right[1] = 0;
00929    /*   setup.span.z_mode = tri_z_mode( setup.ctx ); */
00930 
00931    /*   init_constant_attribs( setup ); */
00932       
00933    if (setup.oneoverarea < 0.0) {
00934       /* emaj on left:
00935        */
00936       subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines );
00937       subtriangle( &setup.emaj, &setup.etop, setup.etop.lines );
00938    }
00939    else {
00940       /* emaj on right:
00941        */
00942       subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines );
00943       subtriangle( &setup.etop, &setup.emaj, setup.etop.lines );
00944    }
00945 
00946    flush_spans();
00947 
00948    return TRUE;
00949 }
spu_tri.c