Several fixes for nv4 output. trunk
author Ryan C. Gordon <icculus@icculus.org>
Wed, 09 Jul 2008 19:52:47 -0400
branch trunk
changeset 433 f6d785d7982e
parent 432 6c59f6c0456a
child 434 9d36ff50e897
Several fixes for nv4 output.
mojoshader.c
--- a/mojoshader.c	Wed Jul 09 13:27:54 2008 -0400
+++ b/mojoshader.c	Wed Jul 09 19:52:47 2008 -0400
@@ -3636,6 +3636,19 @@
 
     char swizzle_str[6];
     int i = 0;
+
+    if (ctx->support_nv4)  // vFace must be output as "vFace.x" in nv4.
+    {
+        if (arg->regtype == REG_TYPE_MISCTYPE)
+        {
+            if ( ((const MiscTypeType) arg->regnum) == MISCTYPE_TYPE_FACE )
+            {
+                swizzle_str[i++] = '.';
+                swizzle_str[i++] = 'x';
+            } // if
+        } // if
+    } // if
+
     const int scalar = scalar_register(arg->regtype, arg->regnum);
     if (!scalar && !no_swizzle(arg->swizzle))
     {
@@ -3943,14 +3956,21 @@
     // no-op in arb1.
 } // emit_ARB1_phase
 
+static inline const char *arb1_float_temp(const Context *ctx)
+{
+    // nv4 lets you specify data type.
+    return (ctx->support_nv4) ? "FLOAT TEMP" : "TEMP";
+} // arb1_float_temp
+
 static void emit_ARB1_finalize(Context *ctx)
 {
     // !!! FIXME: if we never wrote the position register, add the
     // !!! FIXME:  position_invariant program option here.
+    const char *tmpstr = arb1_float_temp(ctx);
     int i;
     push_output(ctx, &ctx->globals);
     for (i = 0; i < ctx->max_scratch_registers; i++)
-        output_line(ctx, "TEMP %s;", allocate_ARB1_scratch_reg_name(ctx));
+        output_line(ctx, "%s %s;", tmpstr, allocate_ARB1_scratch_reg_name(ctx));
 
     // nv2 fragment programs (and anything nv4) have a real REP/ENDREP.
     if ( (ctx->support_nv2) && (!shader_is_pixel(ctx)) && (!ctx->support_nv4) )
@@ -3973,15 +3993,25 @@
     switch (regtype)
     {
         case REG_TYPE_ADDRESS:
-            output_line(ctx, "ADDRESS %s;", varname);
-            if (!ctx->support_nv2)  // nv2 has four-component address already.
-                output_line(ctx, "TEMP addr%d;", regnum);
+            // nv4 replaced address registers with generic int registers.
+            if (ctx->support_nv4)
+                output_line(ctx, "INT TEMP %s;", varname);
+            else
+            {
+                // nv2 has four-component address already, but stock arb1 has
+                //  to emulate it in a temporary, and move components to the
+                //  scalar ADDRESS register on demand.
+                output_line(ctx, "ADDRESS %s;", varname);
+                if (!ctx->support_nv2)
+                    output_line(ctx, "TEMP addr%d;", regnum);
+            } // else
             break;
+
         //case REG_TYPE_PREDICATE:
         //    output_line(ctx, "bvec4 %s;", varname);
         //    break;
         case REG_TYPE_TEMP:
-            output_line(ctx, "TEMP %s;", varname);
+            output_line(ctx, "%s %s;", arb1_float_temp(ctx), varname);
             break;
         //case REG_TYPE_LOOP:
         //    break; // no-op. We declare these in for loops at the moment.
@@ -4173,7 +4203,7 @@
             push_output(ctx, &ctx->globals);
             // no mapping to built-in var? Just make it a regular global, pray.
             if (usage_str == NULL)
-                output_line(ctx, "TEMP %s;", varname);
+                output_line(ctx, "%s %s;", arb1_float_temp(ctx), varname);
             else
             {
                 output_line(ctx, "OUTPUT %s = %s%s%s%s;", varname, usage_str,
@@ -4250,7 +4280,7 @@
                 if (ctx->support_nv4)  // FINALLY, a vFace equivalent in nv4!
                 {
                     index_str[0] = '\0';  // no explicit number.
-                    usage_str = "fragment.facing.x";
+                    usage_str = "fragment.facing";
                 } // if
                 else
                 {
@@ -4302,7 +4332,38 @@
 EMIT_ARB1_OPCODE_DSSS_FUNC(MAD)
 EMIT_ARB1_OPCODE_DSS_FUNC(MUL)
 EMIT_ARB1_OPCODE_DS_FUNC(RCP)
-EMIT_ARB1_OPCODE_DS_FUNC(RSQ)
+
+static void emit_ARB1_RSQ(Context *ctx)
+{
+    // nv4 doesn't force abs() on this, so negative values will generate NaN.
+    // The spec says you should force the abs() yourself.
+    if (!ctx->support_nv4)
+    {
+        emit_ARB1_opcode_ds(ctx, "RSQ");  // pre-nv4 implies ABS.
+        return;
+    } // if
+
+    // we can optimize this to use nv2's |abs| construct in some cases.
+    if ( (ctx->source_args[0].src_mod == SRCMOD_NONE) ||
+         (ctx->source_args[0].src_mod == SRCMOD_NEGATE) ||
+         (ctx->source_args[0].src_mod == SRCMOD_ABSNEGATE) )
+        ctx->source_args[0].src_mod = SRCMOD_ABS;
+
+    const char *dst = make_ARB1_destarg_string(ctx);
+    const char *src0 = make_ARB1_srcarg_string(ctx, 0);
+
+    if (ctx->source_args[0].src_mod == SRCMOD_ABS)
+        output_line(ctx, "RSQ%s, %s;", dst, src0);
+    else
+    {
+        const char *scratch = allocate_ARB1_scratch_reg_name(ctx);
+        output_line(ctx, "ABS %s, %s;", scratch, src0);
+        output_line(ctx, "RSQ%s, %s.x;", dst, scratch);
+    } // else
+
+    emit_ARB1_dest_modifiers(ctx);
+} // emit_ARB1_RSQ
+
 EMIT_ARB1_OPCODE_DSS_FUNC(DP3)
 EMIT_ARB1_OPCODE_DSS_FUNC(DP4)
 EMIT_ARB1_OPCODE_DSS_FUNC(MIN)
@@ -4312,8 +4373,9 @@
 
 static void emit_ARB1_EXP(Context *ctx) { emit_ARB1_opcode_ds(ctx, "EX2"); }
 
-static void emit_ARB1_LOG(Context *ctx)
-{
+static void arb1_log(Context *ctx, const char *opcode)
+{
+    // !!! FIXME: SRCMOD_NEGATE can be made into SRCMOD_ABS here, too
     // we can optimize this to use nv2's |abs| construct in some cases.
     if ( (ctx->source_args[0].src_mod == SRCMOD_NONE) ||
          (ctx->source_args[0].src_mod == SRCMOD_ABSNEGATE) )
@@ -4323,17 +4385,24 @@
     const char *src0 = make_ARB1_srcarg_string(ctx, 0);
 
     if (ctx->source_args[0].src_mod == SRCMOD_ABS)
-        output_line(ctx, "LG2%s, %s;", dst, src0);
+        output_line(ctx, "%s%s, %s;", opcode, dst, src0);
     else
     {
         const char *scratch = allocate_ARB1_scratch_reg_name(ctx);
         output_line(ctx, "ABS %s, %s;", scratch, src0);
-        output_line(ctx, "LG2%s, %s.x;", dst, scratch);
+        output_line(ctx, "%s%s, %s.x;", opcode, dst, scratch);
     } // else
 
     emit_ARB1_dest_modifiers(ctx);
+} // arb1_log
+
+
+static void emit_ARB1_LOG(Context *ctx)
+{
+    arb1_log(ctx, "LG2");
 } // emit_ARB1_LOG
 
+
 EMIT_ARB1_OPCODE_DS_FUNC(LIT)
 EMIT_ARB1_OPCODE_DSS_FUNC(DST)
 
@@ -4398,7 +4467,7 @@
     } // else
 } // emit_ARB1_CALLNZ
 
-
+// !!! FIXME: needs BRA in nv2, LOOP in nv2 fragment progs, and REP in nv4.
 EMIT_ARB1_OPCODE_UNIMPLEMENTED_FUNC(LOOP)
 
 static void emit_ARB1_RET(Context *ctx)
@@ -4767,8 +4836,11 @@
 
 static void emit_ARB1_MOVA(Context *ctx)
 {
-    // NV_vertex_program2_option and later can use the ARR opcode.
-    if (ctx->support_nv2)
+    // nv2 and nv3 can use the ARR opcode.
+    // But nv4 removed ARR (and ADDRESS registers!). Just ROUND to an INT.
+    if (ctx->support_nv4)
+        emit_ARB1_opcode_ds(ctx, "ROUND.S");  // !!! FIXME: don't use a modifier here.
+    else if ((ctx->support_nv2) || (ctx->support_nv3))
         emit_ARB1_opcode_ds(ctx, "ARR");
     else
     {
@@ -4804,6 +4876,7 @@
     } // else
 } // emit_ARB1_MOVA
 
+
 static void emit_ARB1_TEXKILL(Context *ctx)
 {
     // !!! FIXME: d3d kills on xyz, arb1 kills on xyzw. Fix the swizzle!
@@ -4822,28 +4895,19 @@
 EMIT_ARB1_OPCODE_UNIMPLEMENTED_FUNC(TEXM3X3SPEC)
 EMIT_ARB1_OPCODE_UNIMPLEMENTED_FUNC(TEXM3X3VSPEC)
 
-static void emit_ARB1_EXPP(Context *ctx) { emit_ARB1_opcode_ds(ctx, "EXP"); }
+static void emit_ARB1_EXPP(Context *ctx)
+{
+    // EXP was removed in nv4: no benefit over full-precision EX2.
+    if (ctx->support_nv4)
+        emit_ARB1_opcode_ds(ctx, "EX2");
+    else
+        emit_ARB1_opcode_ds(ctx, "EXP");
+} // emit_ARB1_EXPP
 
 static void emit_ARB1_LOGP(Context *ctx)
 {
-    // we can optimize this to use nv2's |abs| construct in some cases.
-    if ( (ctx->source_args[0].src_mod == SRCMOD_NONE) ||
-         (ctx->source_args[0].src_mod == SRCMOD_ABSNEGATE) )
-        ctx->source_args[0].src_mod = SRCMOD_ABS;
-
-    const char *dst = make_ARB1_destarg_string(ctx);
-    const char *src0 = make_ARB1_srcarg_string(ctx, 0);
-
-    if (ctx->source_args[0].src_mod == SRCMOD_ABS)
-        output_line(ctx, "LOG%s, %s;", dst, src0);
-    else
-    {
-        const char *scratch = allocate_ARB1_scratch_reg_name(ctx);
-        output_line(ctx, "ABS %s, %s;", scratch, src0);
-        output_line(ctx, "LOG%s, %s.x;", dst, scratch);
-    } // else
-
-    emit_ARB1_dest_modifiers(ctx);
+    // LOG was removed in nv4: no benefit over full-precision LG2.
+    arb1_log(ctx, (ctx->support_nv4) ? "LG2" : "LOG");
 } // emit_ARB1_LOGP
 
 EMIT_ARB1_OPCODE_UNIMPLEMENTED_FUNC(CND)
@@ -4913,6 +4977,10 @@
 
 static void arb1_texld(Context *ctx, const char *opcode)
 {
+    // !!! FIXME: Hack: "TEXH" is invalid in nv4. Fix this more cleanly.
+    if ((ctx->dest_arg.result_mod & MOD_PP) && (ctx->support_nv4))
+        ctx->dest_arg.result_mod &= ~MOD_PP;
+
     // !!! FIXME: do non-RGBA textures map to same default values as D3D?
     const char *dst = make_ARB1_destarg_string(ctx);
     const SourceArgInfo *samp_arg = &ctx->source_args[1];