00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029 #include "pipe/p_config.h"
00030 #include "pipe/p_compiler.h"
00031 #include "util/u_memory.h"
00032 #include "util/u_math.h"
00033
00034 #include "translate.h"
00035
00036
00037 #if defined(PIPE_ARCH_X86)
00038
00039 #include "rtasm/rtasm_cpu.h"
00040 #include "rtasm/rtasm_x86sse.h"
00041
00042
/* Vector component indices; used in this file as arguments to SHUF(). */
#define X 0
#define Y 1
#define Z 2
#define W 3
00047
00048
/**
 * Signature of the generated routine stored in translate_sse::gen_run.
 * It is called with the translate object, a start value, a count and
 * the output buffer, in that order.
 */
typedef void (PIPE_CDECL *run_func)( struct translate *translate,
                                     unsigned start,
                                     unsigned count,
                                     void *output_buffer );
00053
/**
 * Signature of the generated routine stored in translate_sse::gen_run_elts.
 * Same as run_func, except that the second argument is a pointer to an
 * array of element indices instead of a start value.
 */
typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate,
                                          const unsigned *elts,
                                          unsigned count,
                                          void *output_buffer );
00058
/**
 * Per-input-buffer state.  The generated code reads and writes these
 * fields at run time through displacements from the translate_sse object.
 */
struct translate_buffer {
   const void *base_ptr;   /* buffer start, stored by translate_sse_set_buffer() */
   unsigned stride;        /* stride, stored by translate_sse_set_buffer() */
   void *ptr;              /* running pointer; written by the generated code for
                            * linear runs when there is more than one buffer */
};
00064
00065
/**
 * SSE code-generating translate object.
 *
 * The generated code addresses the constants and buffer[] entries below
 * through displacements from the object's own address (see get_offset()),
 * so the object must stay at a fixed address once code has been built.
 */
struct translate_sse {
   /* Base object.  Must remain the first member: the hooks in this file
    * cast struct translate * directly to struct translate_sse *.
    */
   struct translate translate;

   struct x86_function linear_func;   /* code built with linear == TRUE */
   struct x86_function elt_func;      /* code built with linear == FALSE */
   struct x86_function *func;         /* function currently being emitted into */

   /* Whether the load of each constant register has already been emitted
    * into the function currently being built.
    */
   boolean loaded_identity;
   boolean loaded_255;
   boolean loaded_inv_255;

   /* Constant vectors loaded by the generated code. */
   float identity[4];    /* { 0, 0, 0, 1 } */
   float float_255[4];   /* 255.0 in every lane */
   float inv_255[4];     /* 1/255 in every lane */

   struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
   unsigned nr_buffers;  /* highest referenced input_buffer index + 1 */

   /* Entry points of the two generated functions. */
   run_func      gen_run;
   run_elts_func gen_run_elts;

   /* Fixed register assignment, set up by build_vertex_emit(). */
   struct x86_reg tmp_EAX;       /* scratch / computed buffer pointer */
   struct x86_reg idx_EBX;       /* start index or source pointer (linear), or pointer into elts[] */
   struct x86_reg outbuf_ECX;    /* current output vertex */
   struct x86_reg machine_EDX;   /* pointer to this translate_sse object */
   struct x86_reg count_ESI;     /* remaining vertex count */
};
00096
/**
 * Byte distance from address \p a to address \p b.
 *
 * The result is positive when \p b lies above \p a and negative when it
 * lies below.  The pointer difference is narrowed to int on return.
 */
static int get_offset( const void *a, const void *b )
{
   const char *from = (const char *)a;
   const char *to = (const char *)b;

   return to - from;
}
00101
00102
00103
00104 static struct x86_reg get_identity( struct translate_sse *p )
00105 {
00106 struct x86_reg reg = x86_make_reg(file_XMM, 6);
00107
00108 if (!p->loaded_identity) {
00109 p->loaded_identity = TRUE;
00110 p->identity[0] = 0;
00111 p->identity[1] = 0;
00112 p->identity[2] = 0;
00113 p->identity[3] = 1;
00114
00115 sse_movups(p->func, reg,
00116 x86_make_disp(p->machine_EDX,
00117 get_offset(p, &p->identity[0])));
00118 }
00119
00120 return reg;
00121 }
00122
00123 static struct x86_reg get_255( struct translate_sse *p )
00124 {
00125 struct x86_reg reg = x86_make_reg(file_XMM, 7);
00126
00127 if (!p->loaded_255) {
00128 p->loaded_255 = TRUE;
00129 p->float_255[0] =
00130 p->float_255[1] =
00131 p->float_255[2] =
00132 p->float_255[3] = 255.0f;
00133
00134 sse_movups(p->func, reg,
00135 x86_make_disp(p->machine_EDX,
00136 get_offset(p, &p->float_255[0])));
00137 }
00138
00139 return reg;
00140 }
00141
00142 static struct x86_reg get_inv_255( struct translate_sse *p )
00143 {
00144 struct x86_reg reg = x86_make_reg(file_XMM, 5);
00145
00146 if (!p->loaded_inv_255) {
00147 p->loaded_inv_255 = TRUE;
00148 p->inv_255[0] =
00149 p->inv_255[1] =
00150 p->inv_255[2] =
00151 p->inv_255[3] = 1.0f / 255.0f;
00152
00153 sse_movups(p->func, reg,
00154 x86_make_disp(p->machine_EDX,
00155 get_offset(p, &p->inv_255[0])));
00156 }
00157
00158 return reg;
00159 }
00160
00161
/**
 * Emit a load of a four-float attribute: a single movups from \p arg0
 * into \p data.
 */
static void emit_load_R32G32B32A32( struct translate_sse *p,
                                    struct x86_reg data,
                                    struct x86_reg arg0 )
{
   sse_movups(p->func, data, arg0);
}
00168
00169 static void emit_load_R32G32B32( struct translate_sse *p,
00170 struct x86_reg data,
00171 struct x86_reg arg0 )
00172 {
00173
00174
00175
00176
00177
00178
00179
00180 sse_movss(p->func, data, x86_make_disp(arg0, 8));
00181 sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) );
00182 sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
00183 sse_movlps(p->func, data, arg0);
00184 }
00185
00186 static void emit_load_R32G32( struct translate_sse *p,
00187 struct x86_reg data,
00188 struct x86_reg arg0 )
00189 {
00190
00191
00192
00193 sse_movups(p->func, data, get_identity(p) );
00194 sse_movlps(p->func, data, arg0);
00195 }
00196
00197
00198 static void emit_load_R32( struct translate_sse *p,
00199 struct x86_reg data,
00200 struct x86_reg arg0 )
00201 {
00202
00203
00204
00205 sse_movss(p->func, data, arg0);
00206 sse_orps(p->func, data, get_identity(p) );
00207 }
00208
00209
00210 static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p,
00211 struct x86_reg data,
00212 struct x86_reg src )
00213 {
00214
00215
00216
00217 sse_movss(p->func, data, src);
00218 sse2_punpcklbw(p->func, data, get_identity(p));
00219 sse2_punpcklbw(p->func, data, get_identity(p));
00220
00221
00222
00223 sse2_cvtdq2ps(p->func, data, data);
00224
00225
00226
00227
00228 sse_mulps(p->func, data, get_inv_255(p));
00229 }
00230
00231
00232
00233
/**
 * Emit a store of all four floats of \p dataXMM to \p dest with a single
 * movups.
 */
static void emit_store_R32G32B32A32( struct translate_sse *p,
                                     struct x86_reg dest,
                                     struct x86_reg dataXMM )
{
   sse_movups(p->func, dest, dataXMM);
}
00240
00241 static void emit_store_R32G32B32( struct translate_sse *p,
00242 struct x86_reg dest,
00243 struct x86_reg dataXMM )
00244 {
00245
00246
00247 sse_movlps(p->func, dest, dataXMM);
00248 sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) );
00249 sse_movss(p->func, x86_make_disp(dest,8), dataXMM);
00250 }
00251
/**
 * Emit a store of the low two floats of \p dataXMM to \p dest (movlps).
 */
static void emit_store_R32G32( struct translate_sse *p,
                               struct x86_reg dest,
                               struct x86_reg dataXMM )
{
   sse_movlps(p->func, dest, dataXMM);
}
00258
/**
 * Emit a store of the lowest float of \p dataXMM to \p dest (movss).
 */
static void emit_store_R32( struct translate_sse *p,
                            struct x86_reg dest,
                            struct x86_reg dataXMM )
{
   sse_movss(p->func, dest, dataXMM);
}
00265
00266
00267
00268 static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p,
00269 struct x86_reg dest,
00270 struct x86_reg dataXMM )
00271 {
00272
00273
00274 sse_mulps(p->func, dataXMM, get_255(p));
00275
00276
00277
00278 sse2_cvtps2dq(p->func, dataXMM, dataXMM);
00279 sse2_packssdw(p->func, dataXMM, dataXMM);
00280 sse2_packuswb(p->func, dataXMM, dataXMM);
00281 sse_movss(p->func, dest, dataXMM);
00282 }
00283
00284
00285
00286
00287
00288
00289
/**
 * Emit a shufps of \p src into \p dest using the selector \p shuffle
 * (callers in this file build it with SHUF()).
 */
static void emit_swizzle( struct translate_sse *p,
                          struct x86_reg dest,
                          struct x86_reg src,
                          unsigned char shuffle )
{
   sse_shufps(p->func, dest, src, shuffle);
}
00297
00298
00299 static boolean translate_attr( struct translate_sse *p,
00300 const struct translate_element *a,
00301 struct x86_reg srcECX,
00302 struct x86_reg dstEAX)
00303 {
00304 struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
00305
00306 switch (a->input_format) {
00307 case PIPE_FORMAT_R32_FLOAT:
00308 emit_load_R32(p, dataXMM, srcECX);
00309 break;
00310 case PIPE_FORMAT_R32G32_FLOAT:
00311 emit_load_R32G32(p, dataXMM, srcECX);
00312 break;
00313 case PIPE_FORMAT_R32G32B32_FLOAT:
00314 emit_load_R32G32B32(p, dataXMM, srcECX);
00315 break;
00316 case PIPE_FORMAT_R32G32B32A32_FLOAT:
00317 emit_load_R32G32B32A32(p, dataXMM, srcECX);
00318 break;
00319 case PIPE_FORMAT_B8G8R8A8_UNORM:
00320 emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
00321 emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
00322 break;
00323 case PIPE_FORMAT_R8G8B8A8_UNORM:
00324 emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
00325 break;
00326 default:
00327 return FALSE;
00328 }
00329
00330 switch (a->output_format) {
00331 case PIPE_FORMAT_R32_FLOAT:
00332 emit_store_R32(p, dstEAX, dataXMM);
00333 break;
00334 case PIPE_FORMAT_R32G32_FLOAT:
00335 emit_store_R32G32(p, dstEAX, dataXMM);
00336 break;
00337 case PIPE_FORMAT_R32G32B32_FLOAT:
00338 emit_store_R32G32B32(p, dstEAX, dataXMM);
00339 break;
00340 case PIPE_FORMAT_R32G32B32A32_FLOAT:
00341 emit_store_R32G32B32A32(p, dstEAX, dataXMM);
00342 break;
00343 case PIPE_FORMAT_B8G8R8A8_UNORM:
00344 emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
00345 emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
00346 break;
00347 case PIPE_FORMAT_R8G8B8A8_UNORM:
00348 emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
00349 break;
00350 default:
00351 return FALSE;
00352 }
00353
00354 return TRUE;
00355 }
00356
00357
00358 static boolean init_inputs( struct translate_sse *p,
00359 boolean linear )
00360 {
00361 unsigned i;
00362 if (linear) {
00363 for (i = 0; i < p->nr_buffers; i++) {
00364 struct x86_reg buf_stride = x86_make_disp(p->machine_EDX,
00365 get_offset(p, &p->buffer[i].stride));
00366 struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX,
00367 get_offset(p, &p->buffer[i].ptr));
00368 struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX,
00369 get_offset(p, &p->buffer[i].base_ptr));
00370 struct x86_reg elt = p->idx_EBX;
00371 struct x86_reg tmp = p->tmp_EAX;
00372
00373
00374
00375
00376 x86_mov(p->func, tmp, buf_stride);
00377 x86_imul(p->func, tmp, elt);
00378 x86_add(p->func, tmp, buf_base_ptr);
00379
00380
00381
00382
00383
00384 if (p->nr_buffers == 1)
00385 x86_mov( p->func, elt, tmp );
00386 else
00387 x86_mov( p->func, buf_ptr, tmp );
00388 }
00389 }
00390
00391 return TRUE;
00392 }
00393
00394
00395 static struct x86_reg get_buffer_ptr( struct translate_sse *p,
00396 boolean linear,
00397 unsigned buf_idx,
00398 struct x86_reg elt )
00399 {
00400 if (linear && p->nr_buffers == 1) {
00401 return p->idx_EBX;
00402 }
00403 else if (linear) {
00404 struct x86_reg ptr = p->tmp_EAX;
00405 struct x86_reg buf_ptr =
00406 x86_make_disp(p->machine_EDX,
00407 get_offset(p, &p->buffer[buf_idx].ptr));
00408
00409 x86_mov(p->func, ptr, buf_ptr);
00410 return ptr;
00411 }
00412 else {
00413 struct x86_reg ptr = p->tmp_EAX;
00414
00415 struct x86_reg buf_stride =
00416 x86_make_disp(p->machine_EDX,
00417 get_offset(p, &p->buffer[buf_idx].stride));
00418
00419 struct x86_reg buf_base_ptr =
00420 x86_make_disp(p->machine_EDX,
00421 get_offset(p, &p->buffer[buf_idx].base_ptr));
00422
00423
00424
00425
00426
00427 x86_mov(p->func, ptr, buf_stride);
00428 x86_imul(p->func, ptr, elt);
00429 x86_add(p->func, ptr, buf_base_ptr);
00430 return ptr;
00431 }
00432 }
00433
00434
00435
00436 static boolean incr_inputs( struct translate_sse *p,
00437 boolean linear )
00438 {
00439 if (linear && p->nr_buffers == 1) {
00440 struct x86_reg stride = x86_make_disp(p->machine_EDX,
00441 get_offset(p, &p->buffer[0].stride));
00442
00443 x86_add(p->func, p->idx_EBX, stride);
00444 sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192));
00445 }
00446 else if (linear) {
00447 unsigned i;
00448
00449
00450
00451 for (i = 0; i < p->nr_buffers; i++) {
00452 struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX,
00453 get_offset(p, &p->buffer[i].ptr));
00454 struct x86_reg buf_stride = x86_make_disp(p->machine_EDX,
00455 get_offset(p, &p->buffer[i].stride));
00456
00457 x86_mov(p->func, p->tmp_EAX, buf_ptr);
00458 x86_add(p->func, p->tmp_EAX, buf_stride);
00459 if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
00460 x86_mov(p->func, buf_ptr, p->tmp_EAX);
00461 }
00462 }
00463 else {
00464 x86_lea(p->func, p->idx_EBX, x86_make_disp(p->idx_EBX, 4));
00465 }
00466
00467 return TRUE;
00468 }
00469
00470
00471
00472
00473
00474
00475
00476
00477
00478
00479
00480
00481
00482
00483
00484
00485
00486
00487 static boolean build_vertex_emit( struct translate_sse *p,
00488 struct x86_function *func,
00489 boolean linear )
00490 {
00491 int fixup, label;
00492 unsigned j;
00493
00494 p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
00495 p->idx_EBX = x86_make_reg(file_REG32, reg_BX);
00496 p->outbuf_ECX = x86_make_reg(file_REG32, reg_CX);
00497 p->machine_EDX = x86_make_reg(file_REG32, reg_DX);
00498 p->count_ESI = x86_make_reg(file_REG32, reg_SI);
00499
00500 p->func = func;
00501 p->loaded_inv_255 = FALSE;
00502 p->loaded_255 = FALSE;
00503 p->loaded_identity = FALSE;
00504
00505 x86_init_func(p->func);
00506
00507
00508
00509 x86_push(p->func, p->idx_EBX);
00510 x86_push(p->func, p->count_ESI);
00511
00512
00513
00514 x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1));
00515 x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2));
00516 x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3));
00517 x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 4));
00518
00519
00520
00521 x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
00522 x86_cmp(p->func, p->count_ESI, p->tmp_EAX);
00523 fixup = x86_jcc_forward(p->func, cc_E);
00524
00525
00526
00527 init_inputs(p, linear);
00528
00529
00530
00531 label = x86_get_label(p->func);
00532 {
00533 struct x86_reg elt = linear ? p->idx_EBX : x86_deref(p->idx_EBX);
00534 int last_vb = -1;
00535 struct x86_reg vb;
00536
00537 for (j = 0; j < p->translate.key.nr_elements; j++) {
00538 const struct translate_element *a = &p->translate.key.element[j];
00539
00540
00541
00542 if (a->input_buffer != last_vb) {
00543 last_vb = a->input_buffer;
00544 vb = get_buffer_ptr(p, linear, a->input_buffer, elt);
00545 }
00546
00547 if (!translate_attr( p, a,
00548 x86_make_disp(vb, a->input_offset),
00549 x86_make_disp(p->outbuf_ECX, a->output_offset)))
00550 return FALSE;
00551 }
00552
00553
00554
00555 x86_lea(p->func,
00556 p->outbuf_ECX,
00557 x86_make_disp(p->outbuf_ECX,
00558 p->translate.key.output_stride));
00559
00560
00561
00562 incr_inputs( p, linear );
00563 }
00564
00565
00566
00567 x86_dec(p->func, p->count_ESI);
00568 x86_jcc(p->func, cc_NZ, label);
00569
00570
00571
00572 if (p->func->need_emms)
00573 mmx_emms(p->func);
00574
00575
00576
00577 x86_fixup_fwd_jump(p->func, fixup);
00578
00579
00580
00581
00582 x86_pop(p->func, p->count_ESI);
00583 x86_pop(p->func, p->idx_EBX);
00584 x86_ret(p->func);
00585
00586 return TRUE;
00587 }
00588
00589
00590
00591
00592
00593
00594
00595 static void translate_sse_set_buffer( struct translate *translate,
00596 unsigned buf,
00597 const void *ptr,
00598 unsigned stride )
00599 {
00600 struct translate_sse *p = (struct translate_sse *)translate;
00601
00602 if (buf < p->nr_buffers) {
00603 p->buffer[buf].base_ptr = (char *)ptr;
00604 p->buffer[buf].stride = stride;
00605 }
00606
00607 if (0) debug_printf("%s %d/%d: %p %d\n",
00608 __FUNCTION__, buf,
00609 p->nr_buffers,
00610 ptr, stride);
00611 }
00612
00613
/**
 * translate.release hook: release both generated functions, then free
 * the translate object itself.  \p translate must not be used afterwards.
 */
static void translate_sse_release( struct translate *translate )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   x86_release_func( &p->linear_func );
   x86_release_func( &p->elt_func );

   FREE(p);
}
00623
/**
 * translate.run_elts hook: forwards the call unchanged to the generated
 * indexed routine (gen_run_elts).
 */
static void PIPE_CDECL translate_sse_run_elts( struct translate *translate,
                                               const unsigned *elts,
                                               unsigned count,
                                               void *output_buffer )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   p->gen_run_elts( translate,
                    elts,
                    count,
                    output_buffer );
}
00636
/**
 * translate.run hook: forwards the call unchanged to the generated
 * linear routine (gen_run).
 */
static void PIPE_CDECL translate_sse_run( struct translate *translate,
                                          unsigned start,
                                          unsigned count,
                                          void *output_buffer )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   p->gen_run( translate,
               start,
               count,
               output_buffer );
}
00649
00650
00651 struct translate *translate_sse2_create( const struct translate_key *key )
00652 {
00653 struct translate_sse *p = NULL;
00654 unsigned i;
00655
00656 if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2())
00657 goto fail;
00658
00659 p = CALLOC_STRUCT( translate_sse );
00660 if (p == NULL)
00661 goto fail;
00662
00663 p->translate.key = *key;
00664 p->translate.release = translate_sse_release;
00665 p->translate.set_buffer = translate_sse_set_buffer;
00666 p->translate.run_elts = translate_sse_run_elts;
00667 p->translate.run = translate_sse_run;
00668
00669 for (i = 0; i < key->nr_elements; i++)
00670 p->nr_buffers = MAX2( p->nr_buffers, key->element[i].input_buffer + 1 );
00671
00672 if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);
00673
00674 if (!build_vertex_emit(p, &p->linear_func, TRUE))
00675 goto fail;
00676
00677 if (!build_vertex_emit(p, &p->elt_func, FALSE))
00678 goto fail;
00679
00680 p->gen_run = (run_func)x86_get_func(&p->linear_func);
00681 if (p->gen_run == NULL)
00682 goto fail;
00683
00684 p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func);
00685 if (p->gen_run_elts == NULL)
00686 goto fail;
00687
00688 return &p->translate;
00689
00690 fail:
00691 if (p)
00692 translate_sse_release( &p->translate );
00693
00694 return NULL;
00695 }
00696
00697
00698
00699 #else
00700
/**
 * Stub used when PIPE_ARCH_X86 is not defined: no SSE2 translate is
 * available, so creation always returns NULL and \p key is ignored.
 */
struct translate *translate_sse2_create( const struct translate_key *key )
{
   return NULL;
}
00705
00706 #endif